wip: use new 小韻 data for 反切 & 音韻地位

- 反切 now includes annotations that represent both the original and the corrected forms
- 音韻地位 updated accordingly
- Every 小韻 now has a 音韻地位

TBD:
- Fix 反切 in 釋義 and 釋義補充
- Add the missing checks back
nk2028 · Dec 26, 2024 · 270bd76 · 270bd76
1 parent 858d35b
commit 270bd76
Show file tree

Hide file tree

Showing 6 changed files with 29,271 additions and 29,290 deletions.
diff --git a/build.py b/build.py
@@ -1,12 +1,7 @@
 import csv
+from dataclasses import dataclass
 
 
-# 「通俗地位」
-音韻地位_patches = {
-    '892': ('幫二庚平', '幫二耕平'),
-    '1016': ('明一侯平', '明三C尤平'),
-    '3059': ('明一侯去', '明三C尤去'),
-}
 # 補全缺失釋義補充
 釋義補充_patch_from = {
     ('949', '蔆'): None,
@@ -17,18 +12,6 @@
 }
 
 
-def process_音韻地位(row: list[str]) -> str:
-    母, 呼, 等類, 韻, 聲 = row[10:15]
-    if not 母:
-        return ''
-    if (pos := 韻.find('→')) != -1:
-        韻 = 韻[pos + 1 :]
-    # NOTE 原資料莊組真殷韻依原貌。由於資料中已列「韻目原貌」，故地位不需再分
-    if 韻 in ('真', '殷') and 呼 == '開' and 母 in tuple('莊初崇生俟'):
-        韻 = '臻'
-    return 母 + 呼 + 等類 + 韻 + 聲
-
-
 def fix_pua(s: str) -> str:
     fixed = s.replace('\uee42', '𧞬').replace('\uece0', '勳')
     for ch in fixed:
@@ -38,18 +21,27 @@ def fix_pua(s: str) -> str:
     return fixed
 
 
+@dataclass
+class 小韻Row:
+    小韻號: str
+    首字: str
+    反切: str
+    音韻地位: str
+
+
 def main():
-    小韻_data: dict[str, list[str]] = {}
-    with open('src/rime-table-0b69606.tsv') as fin:
-        next(fin)
+    小韻_data: dict[str, 小韻Row] = {}
+    with open('src/小韻表.tsv') as fin:
+        header = next(fin)
+        assert header.rstrip('\n').split('\t') == [
+            '小韻號',
+            '首字',
+            '反切',
+            '音韻地位',
+        ], repr(header)
         for line in fin:
             row = line.rstrip('\n').split('\t')
-            小韻號 = row[0]
-            小韻_data[小韻號] = row
-
-    音韻地位_data: dict[str, str] = {
-        key: process_音韻地位(row) for key, row in 小韻_data.items()
-    }
+            小韻_data[row[0]] = 小韻Row(*row)
 
     has_細分: dict[str, list[str]] = {}
     小韻細分_data: dict[str, list[str]] = {}
@@ -60,7 +52,7 @@ def main():
             assert 小韻號[-1].isalpha()
             反切 = row[1]
             assert (
-                小韻_data[小韻號][2] == 反切
+                小韻_data[小韻號].反切 == 反切
             ), f'反切 mismatch in 小韻 #{小韻號}, 小韻_data: {小韻_data[小韻號][2]}, 小韻細分_data: {反切}'
             has_細分.setdefault(小韻號[:-1], []).append(小韻號[-1])
             小韻細分_data[小韻號] = row
@@ -70,23 +62,22 @@ def main():
     with open('src/廣韻(20170209).csv') as fin:
         for row in csv.DictReader(fin):
             # Formerly used fields (field number is 1-based, same as awk & MS Excel):
+            # '廣韻反切原貌(覈校前)',  # 20
             # '廣韻反切(覈校後)',  # 21
             # '廣韻字頭原貌(覈校前)',  # 24
             # '廣韻頁序',  # 57
             (
                 增刪說明,
-                反切原貌,
                 字頭,
                 釋義,
                 釋義補充,
                 韻目原貌,
-                小韻號原貌,
+                原書小韻號,
                 小韻內字序,
             ) = (
                 row[key]
                 for key in (
                     '字頭-補',  # 19
-                    '廣韻反切原貌(覈校前)',  # 20
                     '廣韻字頭(覈校後)',  # 25
                     '廣韻釋義',  # 26
                     '釋義補充',  # 27
@@ -99,36 +90,29 @@ def main():
             if 增刪說明 == '應刪':
                 continue
 
-            order_key = (int(小韻號原貌), float(小韻內字序))
+            order_key = (int(原書小韻號), float(小韻內字序))
 
             # 小韻號
-            if 小韻號原貌 in has_細分:
-                for 細分 in has_細分[小韻號原貌]:
-                    小韻號 = 小韻號原貌 + 細分
+            if 原書小韻號 in has_細分:
+                for 細分 in has_細分[原書小韻號]:
+                    小韻號 = 原書小韻號 + 細分
                     if 字頭 in 小韻細分_data[小韻號][2]:
                         小韻細分_coverage.setdefault(小韻號, set()).add(字頭)
                         break
                 else:
                     raise ValueError(
-                        f'cannot determine 小韻細分 for {字頭} (小韻 #{小韻號原貌})'
+                        f'cannot determine 小韻細分 for {字頭} (小韻 #{原書小韻號})'
                     )
             else:
-                小韻號 = 小韻號原貌
+                小韻號 = 原書小韻號
 
-            音韻地位 = 音韻地位_data[小韻號]
-            patch = 音韻地位_patches.get(小韻號)
-            if patch is not None:
-                assert (
-                    音韻地位 == patch[0]
-                ), f'invalid patch: expect {patch[0]} -> {patch[1]}, got {音韻地位}'
-                音韻地位 = patch[1]
+            音韻地位 = 小韻_data[小韻號].音韻地位
 
-            反切 = 小韻_data[小韻號][2]
-            if 反切 == '無':
+            反切 = 小韻_data[小韻號].反切
+            if 反切 == '-':
                 反切 = ''
 
-            if len(反切原貌) != 2 or 反切原貌 == 反切:
-                反切原貌 = ''
+            # TODO patch 反切 in 釋義 (and in 釋義補充)
 
             釋義_key = (小韻號, 字頭)
             if 釋義_key in 釋義補充_patch_from:
@@ -146,7 +130,6 @@ def main():
                         韻目原貌,
                         音韻地位,
                         反切,
-                        反切原貌,
                         字頭,
                         釋義,
                         釋義補充,
@@ -160,18 +143,18 @@ def main():
         assert not diff, f'字頭 listed in 小韻細分_data but not seen: {"".join(sorted(diff))} (小韻 #{小韻號})'
 
     for 條目 in 廣韻_data:
-        key = 條目[1][0], 條目[1][6]
+        key = 條目[1][0], 條目[1][5]
         if (patch := 釋義補充_patch_to.get(key)) is not None:
-            assert not 條目[1][8], f'條目 already containing 釋義補充: {條目[1]}'
-            條目[1][8] = 釋義補充_patch_from[(patch[0], patch[1])][patch[2]]
+            assert not 條目[1][7], f'條目 already containing 釋義補充: {條目[1]}'
+            條目[1][7] = 釋義補充_patch_from[(patch[0], patch[1])][patch[2]]
 
     廣韻_data.sort(key=lambda x: x[0])
 
     last_原小韻號 = 0
     小韻內字序 = 0
     with open('韻書/廣韻.csv', 'w', newline='') as fout:
         print(
-            '小韻號,小韻內字序,韻目原貌,音韻地位,反切,反切原貌,字頭,釋義,釋義補充',
+            '小韻號,小韻內字序,韻目原貌,音韻地位,反切,字頭,釋義,釋義補充',
             file=fout,
         )
         for (原小韻號, _), row in 廣韻_data:

diff --git a/check.py b/check.py
@@ -22,7 +22,7 @@ def contains_ascii(s: str):
     with open('韻書/廣韻.csv') as f:
         assert (
             next(f).rstrip('\n')
-            == '小韻號,小韻內字序,韻目原貌,音韻地位,反切,反切原貌,字頭,釋義,釋義補充'
+            == '小韻號,小韻內字序,韻目原貌,音韻地位,反切,字頭,釋義,釋義補充'
         )
         for line in f:
             (
@@ -31,23 +31,16 @@ def contains_ascii(s: str):
                 韻目原貌,
                 音韻地位描述,
                 反切,
-                反切原貌,
                 字頭,
                 釋義,
                 釋義補充,
             ) = line.rstrip('\n').split(',')
-            if 音韻地位描述 != '':
-                assert (
-                    PATTERN_描述.fullmatch(音韻地位描述) is not None
-                ), f'invalid 音韻地位: {音韻地位描述}'
-            assert len(反切) in (
-                2,
-                0,
-            ), 'The length of 反切 should be 2, otherwise it should be an empty string'
-            assert 反切原貌 == '' or len(反切原貌) == len(
-                反切
-            ), '反切原貌 should either be empty or have the same length with 反切'
+            assert (
+                PATTERN_描述.fullmatch(音韻地位描述) is not None
+            ), f'invalid 音韻地位: {音韻地位描述}'
+            # TODO 反切
             assert len(字頭) == 1, 'The length of 字頭 should be 1'
             assert not contains_ascii(
                 釋義
             ), '釋義 should not contain any ASCII characters'
+            # TODO 釋義 should not be empty