Skip to content

Commit

Permalink
Merge pull request #993 from xxyzz/th
Browse files Browse the repository at this point in the history
[th] extract thesaurus pages, linkage and form-of templates
  • Loading branch information
xxyzz authored Jan 17, 2025
2 parents d9ba573 + 42b3fdb commit ee63ee9
Show file tree
Hide file tree
Showing 8 changed files with 236 additions and 11 deletions.
8 changes: 8 additions & 0 deletions src/wiktextract/data/th/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"save_ns_names": [
"Main",
"Template",
"Module",
"Thesaurus"
]
}
13 changes: 11 additions & 2 deletions src/wiktextract/extractor/th/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,19 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .models import Example, Sense
from .models import Example, Sense, WordEntry
from .tags import translate_raw_tags


def extract_example_list_item(
wxr: WiktextractContext,
word_entry: WordEntry,
sense: Sense,
list_item: WikiNode,
ref: str = "",
) -> None:
from .linkage import LINKAGE_TEMPLATES, extract_syn_template

for node in list_item.children:
if isinstance(node, TemplateNode):
if node.template_name in ["ux", "usex", "ko-usex"]:
Expand All @@ -25,9 +28,15 @@ def extract_example_list_item(
extract_template_ja_usex(wxr, sense, node, ref)
elif node.template_name.startswith("quote-"):
ref = extract_quote_template(wxr, sense, node)
elif node.template_name in LINKAGE_TEMPLATES:
extract_syn_template(
wxr, word_entry, node, LINKAGE_TEMPLATES[node.template_name]
)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
for child_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(wxr, sense, child_list_item, ref)
extract_example_list_item(
wxr, word_entry, sense, child_list_item, ref
)


def extract_ux_template(
Expand Down
111 changes: 106 additions & 5 deletions src/wiktextract/extractor/th/linkage.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,37 @@
from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
from itertools import count

from wikitextprocessor.parser import (
LEVEL_KIND_FLAGS,
LevelNode,
NodeKind,
TemplateNode,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Linkage, WordEntry
from .section_titles import LINKAGE_SECTIONS


def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    source: str = "",
) -> None:
    """Dispatch each child node of a linkage section to its extractor.

    ``linkage_type`` names the ``WordEntry`` list field to fill (e.g.
    ``"synonyms"``); ``source`` is non-empty when the data comes from a
    Thesaurus page rather than the entry itself.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            t_name = child.template_name
            if t_name.startswith("col"):
                extract_col_template(
                    wxr, word_entry, child, linkage_type, source
                )
            elif t_name == "ws":
                extract_ws_template(
                    wxr, word_entry, child, linkage_type, source
                )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_lite_item(
                    wxr, word_entry, item, linkage_type, source
                )


Expand All @@ -28,12 +40,13 @@ def extract_col_template(
word_entry: WordEntry,
t_node: TemplateNode,
linkage_type: str,
source: str,
) -> None:
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
for li_tag in expanded_node.find_html_recursively("li"):
l_data = Linkage(word="")
l_data = Linkage(word="", source=source)
for span_tag in li_tag.find_html("span"):
span_class = span_tag.attrs.get("class", "")
if "Latn" in span_class:
def extract_linkage_lite_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    source: str,
) -> None:
    """Extract linkages from one list item.

    Handles ``{{l|...}}`` link templates directly; an italicized wiki link
    pointing at a Thesaurus page ("อรรถาภิธาน:...") is followed and mined via
    ``extract_thesaurus_page``.
    """
    collected: list[Linkage] = []
    for child in list_item.children:
        if isinstance(child, TemplateNode) and child.template_name == "l":
            # Second positional parameter of {{l}} is the linked word.
            word = clean_node(wxr, None, child.template_parameters.get(2, ""))
            if word != "":
                collected.append(Linkage(word=word, source=source))
        elif isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
            for link in child.find_child(NodeKind.LINK):
                target = clean_node(wxr, None, link)
                if target.startswith("อรรถาภิธาน:"):
                    extract_thesaurus_page(
                        wxr, word_entry, linkage_type, target
                    )

    getattr(word_entry, linkage_type).extend(collected)


def extract_thesaurus_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    linkage_type: str,
    page_title: str,
) -> None:
    """Harvest linkage data from a Thesaurus ("อรรถาภิธาน") page.

    Only sections matching the current entry's language and part of speech
    are used, and only sub-sections whose title maps to ``linkage_type``.
    ``page_title`` is recorded as the ``source`` of each extracted linkage.
    """
    # 110 is the Thesaurus namespace id (see data/th/config.json).
    page = wxr.wtp.get_page(page_title, 110)
    if page is None or page.body is None:
        return
    parsed = wxr.wtp.parse(page.body)
    for lang_level in parsed.find_child(NodeKind.LEVEL2):
        # Level-2 headings are "ภาษา<language>"; strip the prefix to compare.
        lang = clean_node(wxr, None, lang_level.largs).removeprefix("ภาษา")
        if lang != word_entry.lang:
            continue
        for pos_level in lang_level.find_child(NodeKind.LEVEL3):
            if clean_node(wxr, None, pos_level.largs) != word_entry.pos_title:
                continue
            for section in pos_level.find_child_recursively(LEVEL_KIND_FLAGS):
                section_title = clean_node(wxr, None, section.largs)
                if LINKAGE_SECTIONS.get(section_title) == linkage_type:
                    extract_linkage_section(
                        wxr,
                        word_entry,
                        section,
                        linkage_type,
                        page_title,
                    )


def extract_ws_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    source: str,
) -> None:
    """Extract one ``{{ws|...}}`` (thesaurus word) template entry.

    The second positional parameter holds the word; empty values are ignored.
    """
    term = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if term:
        getattr(word_entry, linkage_type).append(
            Linkage(word=term, source=source)
        )


# Maps inline linkage template names (used under a gloss, e.g. "{{syn|...}}")
# to the WordEntry list field that receives the extracted terms.
LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "synonyms": "synonyms",
    "synsee": "synonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "cot": "coordinate_terms",
    "coordinate terms": "coordinate_terms",
    "hyper": "hypernyms",
    "hypernyms": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
}


def extract_syn_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract an inline linkage template such as ``{{syn|th|a|b}}``.

    Positional parameters from the second onward are read until the first
    missing index.  A value naming a Thesaurus page ("อรรถาภิธาน:...") is
    expanded via ``extract_thesaurus_page``; other non-empty values become
    plain ``Linkage`` entries on ``word_entry``.
    """
    index = 2
    while index in t_node.template_parameters:
        value = clean_node(wxr, None, t_node.template_parameters[index])
        if value.startswith("อรรถาภิธาน:"):
            extract_thesaurus_page(wxr, word_entry, linkage_type, value)
        elif value != "":
            getattr(word_entry, linkage_type).append(Linkage(word=value))
        index += 1
8 changes: 8 additions & 0 deletions src/wiktextract/extractor/th/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,19 @@ class Example(ThaiBaseModel):
raw_tags: list[str] = []


class AltForm(ThaiBaseModel):
    """Word referenced by a form-of / alternative-form gloss template."""

    word: str
    roman: str = ""  # romanization, empty when the template provides none


class Sense(ThaiBaseModel):
    """One sense (definition) of a word entry."""

    glosses: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []  # tags not yet translated to canonical tags
    categories: list[str] = []
    examples: list[Example] = []
    classifiers: list[str] = []
    form_of: list[AltForm] = []  # filled by form-of/altform gloss templates


class Form(ThaiBaseModel):
Expand Down Expand Up @@ -62,6 +68,7 @@ class Linkage(ThaiBaseModel):
tags: list[str] = []
raw_tags: list[str] = []
roman: str = ""
source: str = ""


class Descendant(ThaiBaseModel):
Expand Down Expand Up @@ -101,3 +108,4 @@ class WordEntry(ThaiBaseModel):
hyponyms: list[Linkage] = []
hypernyms: list[Linkage] = []
idioms: list[Linkage] = []
coordinate_terms: list[Linkage] = []
39 changes: 36 additions & 3 deletions src/wiktextract/extractor/th/pos.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import itertools

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
from wikitextprocessor import (
LevelNode,
NodeKind,
TemplateNode,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .models import Form, Sense, WordEntry
from .models import AltForm, Form, Sense, WordEntry
from .section_titles import POS_DATA
from .tags import translate_raw_tags

Expand Down Expand Up @@ -57,6 +62,15 @@ def extract_gloss_list_item(
extract_label_template(wxr, sense, node)
elif isinstance(node, TemplateNode) and node.template_name == "cls":
extract_cls_template(wxr, sense, node)
elif isinstance(node, TemplateNode) and (
node.template_name.endswith(" of")
or node.template_name == "altform"
):
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(node), expand_all=True
)
extract_form_of_template(wxr, sense, expanded_node)
gloss_nodes.append(expanded_node)
elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
gloss_nodes.append(node)

Expand All @@ -66,7 +80,7 @@ def extract_gloss_list_item(
(":", "*")
):
for e_list_item in child_list.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(wxr, sense, e_list_item)
extract_example_list_item(wxr, word_entry, sense, e_list_item)

if gloss_str != "":
sense.glosses.append(gloss_str)
Expand Down Expand Up @@ -167,3 +181,22 @@ def extract_note_section(
)
if note_str != "":
word_entry.notes.append(note_str)


def extract_form_of_template(
    wxr: WiktextractContext,
    sense: Sense,
    expanded_node: WikiNode,
) -> None:
    """Record form-of data from an expanded "... of"/"altform" template.

    The target word comes from the first ``<i>`` element and the
    romanization from the first ``<span>`` whose class contains
    ``mention-tr``.
    """
    word = ""
    for italic in expanded_node.find_html_recursively("i"):
        word = clean_node(wxr, None, italic)
        break
    roman = ""
    for span in expanded_node.find_html_recursively("span"):
        if "mention-tr" in span.attrs.get("class", ""):
            roman = clean_node(wxr, None, span)
            break
    if word != "":
        sense.form_of.append(AltForm(word=word, roman=roman))
    # The sense is tagged "form-of" even when no target word was parsed.
    if "form-of" not in sense.tags:
        sense.tags.append("form-of")
4 changes: 3 additions & 1 deletion tests/test_th_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def test_ja_x(self):
self.wxr.wtp.add_page(
"แม่แบบ:syn of",
10,
"คำพ้องความของ 北極熊 (ฮกเกียวกุงุมะ, “หมีขั้วโลก”)",
"""<span class='form-of-definition use-with-mention'>คำพ้องความของ <span class='form-of-definition-link'><i class="Jpan mention" lang="ja">[[北極熊#ภาษาญี่ปุ่น|北極熊]]</i> <span class="mention-gloss-paren annotation-paren">(</span><span class="mention-tr tr">ฮกเกียวกุงุมะ</span>, <span class="mention-gloss-double-quote">“</span><span class="mention-gloss">หมีขั้วโลก</span><span class="mention-gloss-double-quote">”</span><span class="mention-gloss-paren annotation-paren">)</span></span></span>""",
)
page_data = parse_page(
self.wxr,
Expand Down Expand Up @@ -236,5 +236,7 @@ def test_ja_x(self):
"ref": "1990 มิถุนายน 15, Takahashi, Rumiko, “:แม่แบบ:jaru [PART.5 Snatching the Scroll of Secrets]”, in :แม่แบบ:wj [Ranma ½], volume 11 (fiction), Tokyo: Shogakukan, →ISBN, page 72:",
}
],
"form_of": [{"word": "北極熊", "roman": "ฮกเกียวกุงุมะ"}],
"tags": ["form-of"],
},
)
24 changes: 24 additions & 0 deletions tests/test_th_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,27 @@ def test_lo_alt(self):
{"form": "ທຸຣຽນ", "raw_tags": ["ล้าสมัย"], "roman": "ทุรย̂น"},
],
)

def test_alt_form_template(self):
    # {{altform}} expands to form-of HTML; the extractor should record the
    # target word and its romanization in sense["form_of"], tag the sense
    # "form-of", and keep the cleaned expansion as the gloss text.
    self.wxr.wtp.add_page(
        "แม่แบบ:altform",
        10,
        """<span class='form-of-definition use-with-mention'>อีกรูปหนึ่งของ <span class='form-of-definition-link'><i class="Lana mention" lang="nod">[[ᨸᩣ᩠ᨠ#ภาษาคำเมือง|ᨸᩣ᩠ᨠ]]</i> <span class="mention-gloss-paren annotation-paren">(</span><span lang="nod-Latn" class="mention-tr tr Latn">ปาก</span><span class="mention-gloss-paren annotation-paren">)</span></span></span>""",
    )
    page_data = parse_page(
        self.wxr,
        "ปาก",
        """== ภาษาคำเมือง ==
=== คำนาม ===
{{nod-noun}}
# {{altform|nod|ᨸᩣ᩠ᨠ}}""",
    )
    self.assertEqual(
        page_data[0]["senses"][0],
        {
            "glosses": ["อีกรูปหนึ่งของ ᨸᩣ᩠ᨠ (ปาก)"],
            "form_of": [{"word": "ᨸᩣ᩠ᨠ", "roman": "ปาก"}],
            "tags": ["form-of"],
        },
    )
40 changes: 40 additions & 0 deletions tests/test_th_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,43 @@ def test_list(self):
page_data[0]["synonyms"],
[{"word": "มณฑก"}],
)

# NOTE(review): "theasurus" in the method name looks like a typo for
# "thesaurus" — confirm nothing references this name before renaming.
def test_theasurus_page(self):
    # A linkage section that only links to a Thesaurus (อรรถาภิธาน) page
    # should pull the terms from that page and record the page title as
    # the linkage's "source".
    self.wxr.wtp.add_page(
        "อรรถาภิธาน:ระดู",
        110,
        """== ภาษาไทย ==
=== คำนาม ===
==== {{ws sense|th|เลือดที่ถูกขับถ่ายจากมดลูกออกมาทางช่องคลอดทุก ๆ ประมาณ 1 เดือน}} ====
===== คำพ้องความ =====
{{ws beginlist}}
{{ws|th|ต่อมโลหิต}}
{{ws endlist}}""",
    )
    page_data = parse_page(
        self.wxr,
        "ระดู",
        """== ภาษาไทย ==
=== คำนาม ===
# [[เลือด]]
==== คำพ้องความ ====
:''ดูที่ [[อรรถาภิธาน:ระดู]]''""",
    )
    self.assertEqual(
        page_data[0]["synonyms"],
        [{"word": "ต่อมโลหิต", "source": "อรรถาภิธาน:ระดู"}],
    )

def test_syn_template(self):
    # An inline {{syn|th|...}} under a gloss adds each positional argument
    # from the second onward to word_entry["synonyms"].
    page_data = parse_page(
        self.wxr,
        "โทรทัศน์",
        """== ภาษาไทย ==
=== คำนาม ===
# กระบวนการถ่ายทอด
#: {{syn|th|ทีวี|โทรภาพ}}""",
    )
    self.assertEqual(
        page_data[0]["synonyms"],
        [{"word": "ทีวี"}, {"word": "โทรภาพ"}],
    )

0 comments on commit ee63ee9

Please sign in to comment.