Merge pull request #896 from xxyzz/ko

[ko] improve extract gloss list and example list code
tatuylonen · Nov 1, 2024 · d6bf104
2 parents d49d402 + 6bfa818
commit d6bf104
Show file tree

Hide file tree

Showing 6 changed files with 131 additions and 21 deletions.
diff --git a/src/wiktextract/extractor/ko/example.py b/src/wiktextract/extractor/ko/example.py
@@ -3,7 +3,8 @@
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
 from ..ruby import extract_ruby
-from .models import Example, Sense
+from ..share import set_sound_file_url_fields
+from .models import Example, Sense, Sound
 
 
 def extract_example_list_item(
@@ -14,6 +15,8 @@ def extract_example_list_item(
     parent_example: Example | None = None,
 ) -> None:
     example = Example() if parent_example is None else parent_example
+    e_text_nodes = []
+    e_tr_nodes = []
     after_lang_template = False
     for node in list_item.children:
         if isinstance(node, TemplateNode) and node.template_name == "lang":
@@ -33,11 +36,31 @@ def extract_example_list_item(
             extract_ux_template(wxr, sense, example, node)
             break
         elif after_lang_template:
-            example.translation += clean_node(wxr, None, node)
+            e_tr_nodes.append(node)
         elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
             break
+        elif (
+            isinstance(node, WikiNode)
+            and node.kind == NodeKind.LINK
+            and len(node.largs) > 0
+            and len(node.largs[0]) > 0
+            and isinstance(node.largs[0][0], str)
+            and node.largs[0][0].startswith("File:")
+        ):
+            sound = Sound()
+            sound_file = node.largs[0][0].removeprefix("File:").strip()
+            set_sound_file_url_fields(wxr, sound_file, sound)
+            if sound.audio != "":
+                example.sounds.append(sound)
         else:
-            example.text += clean_node(wxr, None, node)
+            e_text_nodes.append(node)
+
+    e_text = clean_node(wxr, sense, e_text_nodes)
+    if e_text != "":
+        example.text = e_text
+    e_tr = clean_node(wxr, sense, e_tr_nodes)
+    if e_tr != "":
+        example.translation = e_tr
 
     if len(example.text) > 0:
         if lang_code == "zh" and "/" in example.text:
@@ -56,7 +79,11 @@ def extract_example_list_item(
     for nested_list in list_item.find_child(NodeKind.LIST):
         for nested_list_item in nested_list.find_child(NodeKind.LIST_ITEM):
             extract_example_list_item(
-                wxr, sense, nested_list_item, lang_code, example
+                wxr,
+                sense,
+                nested_list_item,
+                lang_code,
+                example if example.text == "" else Example(),
             )
 
 

diff --git a/src/wiktextract/extractor/ko/models.py b/src/wiktextract/extractor/ko/models.py
@@ -10,6 +10,22 @@ class KoreanBaseModel(BaseModel):
     )
 
 
+class Sound(KoreanBaseModel):
+    ipa: str = Field(default="", description="International Phonetic Alphabet")
+    audio: str = Field(default="", description="Audio file name")
+    wav_url: str = ""
+    oga_url: str = ""
+    ogg_url: str = ""
+    mp3_url: str = ""
+    opus_url: str = ""
+    flac_url: str = ""
+    tags: list[str] = []
+    raw_tags: list[str] = []
+    hangul: str = ""
+    roman: str = ""
+    other: str = ""
+
+
 class Example(KoreanBaseModel):
     text: str = ""
     translation: str = ""
@@ -21,6 +37,7 @@ class Example(KoreanBaseModel):
     tags: list[str] = []
     literal_meaning: str = ""
     note: str = ""
+    sounds: list[Sound] = []
 
 
 class AltForm(KoreanBaseModel):
@@ -36,22 +53,7 @@ class Sense(KoreanBaseModel):
     examples: list[Example] = []
     note: str = ""
     form_of: list[AltForm] = []
-
-
-class Sound(KoreanBaseModel):
-    ipa: str = Field(default="", description="International Phonetic Alphabet")
-    audio: str = Field(default="", description="Audio file name")
-    wav_url: str = ""
-    oga_url: str = ""
-    ogg_url: str = ""
-    mp3_url: str = ""
-    opus_url: str = ""
-    flac_url: str = ""
-    tags: list[str] = []
-    raw_tags: list[str] = []
-    hangul: str = ""
-    roman: str = ""
-    other: str = ""
+    pattern: str = Field(default="", description="Sentence structure, 문형")
 
 
 class Linkage(KoreanBaseModel):
@@ -100,3 +102,6 @@ class WordEntry(KoreanBaseModel):
     etymology_texts: list[str] = []
     note: str = ""
     forms: list[Form] = []
+    pattern: str = Field(
+        default="", description="Sentence structure, 문형", exclude=True
+    )
diff --git a/src/wiktextract/extractor/ko/pos.py b/src/wiktextract/extractor/ko/pos.py
@@ -65,7 +65,10 @@ def extract_pos_section(
             for list_item in node.find_child(NodeKind.LIST_ITEM):
                 if node.sarg.startswith("#") and node.sarg.endswith("#"):
                     extract_gloss_list_item(
-                        wxr, page_data[-1], list_item, Sense()
+                        wxr,
+                        page_data[-1],
+                        list_item,
+                        Sense(pattern=page_data[-1].pattern),
                     )
                 else:
                     extract_unorderd_list_item(wxr, page_data[-1], list_item)
@@ -174,6 +177,12 @@ def extract_unorderd_list_item(
         ):
             extract_linkage_list_item(wxr, word_entry, list_item, "")
             break
+        elif isinstance(node, str) and "문형:" in node:
+            word_entry.pattern = node[node.index(":") + 1 :].strip()
+            word_entry.pattern += clean_node(
+                wxr, None, list_item.children[index + 1 :]
+            )
+            break
     else:
         if len(word_entry.senses) > 0:
             extract_example_list_item(

diff --git a/src/wiktextract/extractor/ko/section_titles.py b/src/wiktextract/extractor/ko/section_titles.py
@@ -32,6 +32,7 @@
     "연어": {"pos": "phrase", "tags": ["idiomatic"]},
     "동사 활용형": {"pos": "verb", "tags": ["form-of"]},
     "재귀동사": {"pos": "verb", "tags": ["reflexive"]},
+    "보조형용사": {"pos": "adj", "tags": ["auxiliary"]},
 }
 
 LINKAGE_SECTIONS = {

diff --git a/tests/test_ko_example.py b/tests/test_ko_example.py
@@ -177,3 +177,45 @@ def test_jibong_yuseol_template(self):
                 "ref": "1614년, 이수광, 《지봉유설》, 〈2권 外國 條〉",
             },
         )
+
+    def test_sound_file(self):
+        data = parse_page(
+            self.wxr,
+            "사람",
+            """== 중국어 ==
+=== 명사 ===
+==== 명사 1 ====
+# 어떤 지역이나 시기에 태어나거나 살고 있거나 살았던 자.
+:* 한국 '''사람''' [[File:Ko-한국 사람.oga]]""",
+        )
+        self.assertEqual(
+            data[0]["senses"][0]["examples"][0]["text"], "한국 사람"
+        )
+        self.assertEqual(
+            data[0]["senses"][0]["examples"][0]["sounds"][0]["audio"],
+            "Ko-한국 사람.oga",
+        )
+
+    def test_wrong_nested_list(self):
+        data = parse_page(
+            self.wxr,
+            "들다",
+            """== 중국어 ==
+=== 명사 ===
+==== 명사 1 ====
+#  한 곳에서 다른 어디로 또는 밖에서 속이나 안으로 향해 가거나, 오거나 또는 어디에 자리하다.
+: 안으로 드시지요.
+:* 물이 어디에 '''들어''' 있어요？ [[File:물이 어디에 들어 있어요？.ogg]]""",
+        )
+        self.assertEqual(
+            data[0]["senses"][0]["examples"][0]["text"], "안으로 드시지요."
+        )
+        self.assertTrue("sounds" not in data[0]["senses"][0]["examples"][0])
+        self.assertEqual(
+            data[0]["senses"][0]["examples"][1]["text"],
+            "물이 어디에 들어 있어요？",
+        )
+        self.assertEqual(
+            data[0]["senses"][0]["examples"][1]["sounds"][0]["audio"],
+            "물이 어디에 들어 있어요？.ogg",
+        )
diff --git a/tests/test_ko_gloss.py b/tests/test_ko_gloss.py
@@ -203,3 +203,29 @@ def test_nested_gloss_lists(self):
                 },
             ],
         )
+
+    def test_pattern_list(self):
+        data = parse_page(
+            self.wxr,
+            "대하다",
+            """== 한국어 ==
+=== 동사 ===
+==== 동사 2 ====
+*문형: […을] [(…과) …을]
+# 마주 향하여 있다.
+*문형: […에/에게 -게] […을 …으로] […을 -게]
+# 어떤 태도로 상대하다.""",
+        )
+        self.assertEqual(
+            data[0]["senses"],
+            [
+                {
+                    "glosses": ["마주 향하여 있다."],
+                    "pattern": "[…을] [(…과) …을]",
+                },
+                {
+                    "glosses": ["어떤 태도로 상대하다."],
+                    "pattern": "[…에/에게 -게] […을 …으로] […을 -게]",
+                },
+            ],
+        )