Skip to content

Commit

Permalink
wip: use new 小韻 data for 反切 & 音韻地位
Browse files Browse the repository at this point in the history
- 反切 now includes annotations that represent both the original and the
  corrected forms
- 音韻地位 updated accordingly
  - Every 小韻 now has a 音韻地位

TBD:
- Fix 反切 in 釋義 and 釋義補充
- Add missing checks back
  • Loading branch information
syimyuzya committed Dec 26, 2024
1 parent 858d35b commit 270bd76
Show file tree
Hide file tree
Showing 6 changed files with 29,271 additions and 29,290 deletions.
89 changes: 36 additions & 53 deletions build.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,7 @@
import csv
from dataclasses import dataclass


# 「通俗地位」
音韻地位_patches = {
'892': ('幫二庚平', '幫二耕平'),
'1016': ('明一侯平', '明三C尤平'),
'3059': ('明一侯去', '明三C尤去'),
}
# 補全缺失釋義補充
釋義補充_patch_from = {
('949', '蔆'): None,
Expand All @@ -17,18 +12,6 @@
}


def process_音韻地位(row: list[str]) -> str:
, , 等類, , = row[10:15]
if not :
return ''
if (pos := .find('→')) != -1:
= [pos + 1 :]
# NOTE 原資料莊組真殷韻依原貌。由於資料中已列「韻目原貌」,故地位不需再分
if in ('真', '殷') and == '開' and in tuple('莊初崇生俟'):
= '臻'
return + + 等類 + +


def fix_pua(s: str) -> str:
fixed = s.replace('\uee42', '𧞬').replace('\uece0', '勳')
for ch in fixed:
Expand All @@ -38,18 +21,27 @@ def fix_pua(s: str) -> str:
return fixed


@dataclass
class 小韻Row:
    """One data row of ``src/小韻表.tsv``, with one string field per column.

    The field order matches the header that ``main`` asserts for that file,
    because rows are built positionally with ``小韻Row(*row)``.
    """

    # Identifier of the 小韻; ``main`` uses it as the key of ``小韻_data``.
    小韻號: str
    # Second column (「首字」) of the table; not read in the code visible here.
    首字: str
    # 反切 text of the 小韻; ``main`` turns the value '-' into an empty string.
    反切: str
    # 音韻地位 of the 小韻; ``main`` reads it directly for the output row.
    音韻地位: str


def main():
小韻_data: dict[str, list[str]] = {}
with open('src/rime-table-0b69606.tsv') as fin:
next(fin)
小韻_data: dict[str, 小韻Row] = {}
with open('src/小韻表.tsv') as fin:
header = next(fin)
assert header.rstrip('\n').split('\t') == [
'小韻號',
'首字',
'反切',
'音韻地位',
], repr(header)
for line in fin:
row = line.rstrip('\n').split('\t')
小韻號 = row[0]
小韻_data[小韻號] = row

音韻地位_data: dict[str, str] = {
key: process_音韻地位(row) for key, row in 小韻_data.items()
}
小韻_data[row[0]] = 小韻Row(*row)

has_細分: dict[str, list[str]] = {}
小韻細分_data: dict[str, list[str]] = {}
Expand All @@ -60,7 +52,7 @@ def main():
assert 小韻號[-1].isalpha()
反切 = row[1]
# Cross-check: the 反切 recorded in the 小韻細分 table must agree with the one
# in 小韻_data. 小韻_data maps 小韻號 to 小韻Row instances, which are not
# subscriptable, so the failure message must read the `.反切` attribute too;
# indexing with `[2]` would raise TypeError while building the message and
# hide the actual mismatch.
assert (
    小韻_data[小韻號].反切 == 反切
), f'反切 mismatch in 小韻 #{小韻號}, 小韻_data: {小韻_data[小韻號].反切}, 小韻細分_data: {反切}'
has_細分.setdefault(小韻號[:-1], []).append(小韻號[-1])
小韻細分_data[小韻號] = row
Expand All @@ -70,23 +62,22 @@ def main():
with open('src/廣韻(20170209).csv') as fin:
for row in csv.DictReader(fin):
# Formerly used fields (field number is 1-based, same as awk & MS Excel):
# '廣韻反切原貌(覈校前)', # 20
# '廣韻反切(覈校後)', # 21
# '廣韻字頭原貌(覈校前)', # 24
# '廣韻頁序', # 57
(
增刪說明,
反切原貌,
字頭,
釋義,
釋義補充,
韻目原貌,
小韻號原貌,
原書小韻號,
小韻內字序,
) = (
row[key]
for key in (
'字頭-補', # 19
'廣韻反切原貌(覈校前)', # 20
'廣韻字頭(覈校後)', # 25
'廣韻釋義', # 26
'釋義補充', # 27
Expand All @@ -99,36 +90,29 @@ def main():
if 增刪說明 == '應刪':
continue

order_key = (int(小韻號原貌), float(小韻內字序))
order_key = (int(原書小韻號), float(小韻內字序))

# 小韻號
if 小韻號原貌 in has_細分:
for 細分 in has_細分[小韻號原貌]:
小韻號 = 小韻號原貌 + 細分
if 原書小韻號 in has_細分:
for 細分 in has_細分[原書小韻號]:
小韻號 = 原書小韻號 + 細分
if 字頭 in 小韻細分_data[小韻號][2]:
小韻細分_coverage.setdefault(小韻號, set()).add(字頭)
break
else:
raise ValueError(
f'cannot determine 小韻細分 for {字頭} (小韻 #{小韻號原貌})'
f'cannot determine 小韻細分 for {字頭} (小韻 #{原書小韻號})'
)
else:
小韻號 = 小韻號原貌
小韻號 = 原書小韻號

音韻地位 = 音韻地位_data[小韻號]
patch = 音韻地位_patches.get(小韻號)
if patch is not None:
assert (
音韻地位 == patch[0]
), f'invalid patch: expect {patch[0]} -> {patch[1]}, got {音韻地位}'
音韻地位 = patch[1]
音韻地位 = 小韻_data[小韻號].音韻地位

反切 = 小韻_data[小韻號][2]
if 反切 == '':
反切 = 小韻_data[小韻號].反切
if 反切 == '-':
反切 = ''

if len(反切原貌) != 2 or 反切原貌 == 反切:
反切原貌 = ''
# TODO patch 反切 in 釋義 (and in 釋義補充)

釋義_key = (小韻號, 字頭)
if 釋義_key in 釋義補充_patch_from:
Expand All @@ -146,7 +130,6 @@ def main():
韻目原貌,
音韻地位,
反切,
反切原貌,
字頭,
釋義,
釋義補充,
Expand All @@ -160,18 +143,18 @@ def main():
assert not diff, f'字頭 listed in 小韻細分_data but not seen: {"".join(sorted(diff))} (小韻 #{小韻號})'

for 條目 in 廣韻_data:
key = 條目[1][0], 條目[1][6]
key = 條目[1][0], 條目[1][5]
if (patch := 釋義補充_patch_to.get(key)) is not None:
assert not 條目[1][8], f'條目 already containing 釋義補充: {條目[1]}'
條目[1][8] = 釋義補充_patch_from[(patch[0], patch[1])][patch[2]]
assert not 條目[1][7], f'條目 already containing 釋義補充: {條目[1]}'
條目[1][7] = 釋義補充_patch_from[(patch[0], patch[1])][patch[2]]

廣韻_data.sort(key=lambda x: x[0])

last_原小韻號 = 0
小韻內字序 = 0
with open('韻書/廣韻.csv', 'w', newline='') as fout:
print(
'小韻號,小韻內字序,韻目原貌,音韻地位,反切,反切原貌,字頭,釋義,釋義補充',
'小韻號,小韻內字序,韻目原貌,音韻地位,反切,字頭,釋義,釋義補充',
file=fout,
)
for (原小韻號, _), row in 廣韻_data:
Expand Down
19 changes: 6 additions & 13 deletions check.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def contains_ascii(s: str):
with open('韻書/廣韻.csv') as f:
assert (
next(f).rstrip('\n')
== '小韻號,小韻內字序,韻目原貌,音韻地位,反切,反切原貌,字頭,釋義,釋義補充'
== '小韻號,小韻內字序,韻目原貌,音韻地位,反切,字頭,釋義,釋義補充'
)
for line in f:
(
Expand All @@ -31,23 +31,16 @@ def contains_ascii(s: str):
韻目原貌,
音韻地位描述,
反切,
反切原貌,
字頭,
釋義,
釋義補充,
) = line.rstrip('\n').split(',')
if 音韻地位描述 != '':
assert (
PATTERN_描述.fullmatch(音韻地位描述) is not None
), f'invalid 音韻地位: {音韻地位描述}'
assert len(反切) in (
2,
0,
), 'The length of 反切 should be 2, otherwise it should be an empty string'
assert 反切原貌 == '' or len(反切原貌) == len(
反切
), '反切原貌 should either be empty or have the same length with 反切'
assert (
PATTERN_描述.fullmatch(音韻地位描述) is not None
), f'invalid 音韻地位: {音韻地位描述}'
# TODO 反切
assert len(字頭) == 1, 'The length of 字頭 should be 1'
assert not contains_ascii(
釋義
), '釋義 should not contain any ASCII characters'
# TODO 釋義 should not be empty
Loading

0 comments on commit 270bd76

Please sign in to comment.