Skip to content

Commit

Permalink
Merge pull request #993 from xxyzz/th
Browse files Browse the repository at this point in the history
[th] extract thesaurus pages, linkage and form-of templates
  • Loading branch information
xxyzz authored Jan 17, 2025
2 parents d9ba573 + 42b3fdb commit ee63ee9
Show file tree
Hide file tree
Showing 8 changed files with 236 additions and 11 deletions.
8 changes: 8 additions & 0 deletions src/wiktextract/data/th/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"save_ns_names": [
"Main",
"Template",
"Module",
"Thesaurus"
]
}
13 changes: 11 additions & 2 deletions src/wiktextract/extractor/th/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,19 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .models import Example, Sense
from .models import Example, Sense, WordEntry
from .tags import translate_raw_tags


def extract_example_list_item(
wxr: WiktextractContext,
word_entry: WordEntry,
sense: Sense,
list_item: WikiNode,
ref: str = "",
) -> None:
from .linkage import LINKAGE_TEMPLATES, extract_syn_template

for node in list_item.children:
if isinstance(node, TemplateNode):
if node.template_name in ["ux", "usex", "ko-usex"]:
Expand All @@ -25,9 +28,15 @@ def extract_example_list_item(
extract_template_ja_usex(wxr, sense, node, ref)
elif node.template_name.startswith("quote-"):
ref = extract_quote_template(wxr, sense, node)
elif node.template_name in LINKAGE_TEMPLATES:
extract_syn_template(
wxr, word_entry, node, LINKAGE_TEMPLATES[node.template_name]
)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
for child_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(wxr, sense, child_list_item, ref)
extract_example_list_item(
wxr, word_entry, sense, child_list_item, ref
)


def extract_ux_template(
Expand Down
111 changes: 106 additions & 5 deletions src/wiktextract/extractor/th/linkage.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,37 @@
from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
from itertools import count

from wikitextprocessor.parser import (
LEVEL_KIND_FLAGS,
LevelNode,
NodeKind,
TemplateNode,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Linkage, WordEntry
from .section_titles import LINKAGE_SECTIONS


def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    source: str = "",
) -> None:
    """Dispatch each child node of a linkage section to its extractor.

    ``linkage_type`` names the ``WordEntry`` list field to fill (e.g.
    ``"synonyms"``); ``source`` is non-empty when the data comes from a
    Thesaurus page rather than the entry itself.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            t_name = child.template_name
            if t_name.startswith("col"):
                extract_col_template(
                    wxr, word_entry, child, linkage_type, source
                )
            elif t_name == "ws":
                extract_ws_template(
                    wxr, word_entry, child, linkage_type, source
                )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_lite_item(
                    wxr, word_entry, item, linkage_type, source
                )


Expand All @@ -28,12 +40,13 @@ def extract_col_template(
word_entry: WordEntry,
t_node: TemplateNode,
linkage_type: str,
source: str,
) -> None:
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
for li_tag in expanded_node.find_html_recursively("li"):
l_data = Linkage(word="")
l_data = Linkage(word="", source=source)
for span_tag in li_tag.find_html("span"):
span_class = span_tag.attrs.get("class", "")
if "Latn" in span_class:
def extract_linkage_lite_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    source: str,
) -> None:
    """Extract linkages from one list item.

    Handles ``{{l|...}}`` link templates directly; an italicized wiki link
    pointing at a Thesaurus page ("อรรถาภิธาน:...") is followed and mined via
    ``extract_thesaurus_page``.
    """
    collected: list[Linkage] = []
    for child in list_item.children:
        if isinstance(child, TemplateNode) and child.template_name == "l":
            # Second positional parameter of {{l}} is the linked word.
            word = clean_node(wxr, None, child.template_parameters.get(2, ""))
            if word != "":
                collected.append(Linkage(word=word, source=source))
        elif isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
            for link in child.find_child(NodeKind.LINK):
                target = clean_node(wxr, None, link)
                if target.startswith("อรรถาภิธาน:"):
                    extract_thesaurus_page(
                        wxr, word_entry, linkage_type, target
                    )

    getattr(word_entry, linkage_type).extend(collected)


def extract_thesaurus_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    linkage_type: str,
    page_title: str,
) -> None:
    """Harvest linkage data from a Thesaurus ("อรรถาภิธาน") page.

    Only sections matching the current entry's language and part of speech
    are used, and only sub-sections whose title maps to ``linkage_type``.
    ``page_title`` is recorded as the ``source`` of each extracted linkage.
    """
    # 110 is the Thesaurus namespace id (see data/th/config.json).
    page = wxr.wtp.get_page(page_title, 110)
    if page is None or page.body is None:
        return
    parsed = wxr.wtp.parse(page.body)
    for lang_level in parsed.find_child(NodeKind.LEVEL2):
        # Level-2 headings are "ภาษา<language>"; strip the prefix to compare.
        lang = clean_node(wxr, None, lang_level.largs).removeprefix("ภาษา")
        if lang != word_entry.lang:
            continue
        for pos_level in lang_level.find_child(NodeKind.LEVEL3):
            if clean_node(wxr, None, pos_level.largs) != word_entry.pos_title:
                continue
            for section in pos_level.find_child_recursively(LEVEL_KIND_FLAGS):
                section_title = clean_node(wxr, None, section.largs)
                if LINKAGE_SECTIONS.get(section_title) == linkage_type:
                    extract_linkage_section(
                        wxr,
                        word_entry,
                        section,
                        linkage_type,
                        page_title,
                    )


def extract_ws_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    source: str,
) -> None:
    """Extract one ``{{ws|...}}`` (thesaurus word) template entry.

    The second positional parameter holds the word; empty values are ignored.
    """
    term = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if term:
        getattr(word_entry, linkage_type).append(
            Linkage(word=term, source=source)
        )


# Maps inline linkage template names (used under a gloss, e.g. "{{syn|...}}")
# to the WordEntry list field that receives the extracted terms.
LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "synonyms": "synonyms",
    "synsee": "synonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "cot": "coordinate_terms",
    "coordinate terms": "coordinate_terms",
    "hyper": "hypernyms",
    "hypernyms": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
}


def extract_syn_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract an inline linkage template such as ``{{syn|th|a|b}}``.

    Positional parameters from the second onward are read until the first
    missing index.  A value naming a Thesaurus page ("อรรถาภิธาน:...") is
    expanded via ``extract_thesaurus_page``; other non-empty values become
    plain ``Linkage`` entries on ``word_entry``.
    """
    index = 2
    while index in t_node.template_parameters:
        value = clean_node(wxr, None, t_node.template_parameters[index])
        if value.startswith("อรรถาภิธาน:"):
            extract_thesaurus_page(wxr, word_entry, linkage_type, value)
        elif value != "":
            getattr(word_entry, linkage_type).append(Linkage(word=value))
        index += 1
8 changes: 8 additions & 0 deletions src/wiktextract/extractor/th/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,19 @@ class Example(ThaiBaseModel):
raw_tags: list[str] = []


class AltForm(ThaiBaseModel):
    """Word referenced by a form-of / alternative-form gloss template."""

    word: str
    roman: str = ""  # romanization, empty when the template provides none


class Sense(ThaiBaseModel):
    """One sense (definition) of a word entry."""

    glosses: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []  # tags not yet translated to canonical tags
    categories: list[str] = []
    examples: list[Example] = []
    classifiers: list[str] = []
    form_of: list[AltForm] = []  # filled by form-of/altform gloss templates


class Form(ThaiBaseModel):
Expand Down Expand Up @@ -62,6 +68,7 @@ class Linkage(ThaiBaseModel):
tags: list[str] = []
raw_tags: list[str] = []
roman: str = ""
source: str = ""


class Descendant(ThaiBaseModel):
Expand Down Expand Up @@ -101,3 +108,4 @@ class WordEntry(ThaiBaseModel):
hyponyms: list[Linkage] = []
hypernyms: list[Linkage] = []
idioms: list[Linkage] = []
coordinate_terms: list[Linkage] = []
39 changes: 36 additions & 3 deletions src/wiktextract/extractor/th/pos.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import itertools

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
from wikitextprocessor import (
LevelNode,
NodeKind,
TemplateNode,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .models import Form, Sense, WordEntry
from .models import AltForm, Form, Sense, WordEntry
from .section_titles import POS_DATA
from .tags import translate_raw_tags

Expand Down Expand Up @@ -57,6 +62,15 @@ def extract_gloss_list_item(
extract_label_template(wxr, sense, node)
elif isinstance(node, TemplateNode) and node.template_name == "cls":
extract_cls_template(wxr, sense, node)
elif isinstance(node, TemplateNode) and (
node.template_name.endswith(" of")
or node.template_name == "altform"
):
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(node), expand_all=True
)
extract_form_of_template(wxr, sense, expanded_node)
gloss_nodes.append(expanded_node)
elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
gloss_nodes.append(node)

Expand All @@ -66,7 +80,7 @@ def extract_gloss_list_item(
(":", "*")
):
for e_list_item in child_list.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(wxr, sense, e_list_item)
extract_example_list_item(wxr, word_entry, sense, e_list_item)

if gloss_str != "":
sense.glosses.append(gloss_str)
Expand Down Expand Up @@ -167,3 +181,22 @@ def extract_note_section(
)
if note_str != "":
word_entry.notes.append(note_str)


def extract_form_of_template(
    wxr: WiktextractContext,
    sense: Sense,
    expanded_node: WikiNode,
) -> None:
    """Record form-of data from an expanded "... of"/"altform" template.

    The target word comes from the first ``<i>`` element and the
    romanization from the first ``<span>`` whose class contains
    ``mention-tr``.
    """
    word = ""
    for italic in expanded_node.find_html_recursively("i"):
        word = clean_node(wxr, None, italic)
        break
    roman = ""
    for span in expanded_node.find_html_recursively("span"):
        if "mention-tr" in span.attrs.get("class", ""):
            roman = clean_node(wxr, None, span)
            break
    if word != "":
        sense.form_of.append(AltForm(word=word, roman=roman))
    # The sense is tagged "form-of" even when no target word was parsed.
    if "form-of" not in sense.tags:
        sense.tags.append("form-of")
4 changes: 3 additions & 1 deletion tests/test_th_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def test_ja_x(self):
self.wxr.wtp.add_page(
"แม่แบบ:syn of",
10,
"คำพ้องความของ 北極熊 (ฮกเกียวกุงุมะ, “หมีขั้วโลก”)",
"""<span class='form-of-definition use-with-mention'>คำพ้องความของ <span class='form-of-definition-link'><i class="Jpan mention" lang="ja">[[北極熊#ภาษาญี่ปุ่น|北極熊]]</i> <span class="mention-gloss-paren annotation-paren">(</span><span class="mention-tr tr">ฮกเกียวกุงุมะ</span>, <span class="mention-gloss-double-quote">“</span><span class="mention-gloss">หมีขั้วโลก</span><span class="mention-gloss-double-quote">”</span><span class="mention-gloss-paren annotation-paren">)</span></span></span>""",
)
page_data = parse_page(
self.wxr,
Expand Down Expand Up @@ -236,5 +236,7 @@ def test_ja_x(self):
"ref": "1990 มิถุนายน 15, Takahashi, Rumiko, “:แม่แบบ:jaru [PART.5 Snatching the Scroll of Secrets]”, in :แม่แบบ:wj [Ranma ½], volume 11 (fiction), Tokyo: Shogakukan, →ISBN, page 72:",
}
],
"form_of": [{"word": "北極熊", "roman": "ฮกเกียวกุงุมะ"}],
"tags": ["form-of"],
},
)
24 changes: 24 additions & 0 deletions tests/test_th_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,27 @@ def test_lo_alt(self):
{"form": "ທຸຣຽນ", "raw_tags": ["ล้าสมัย"], "roman": "ทุรย̂น"},
],
)

def test_alt_form_template(self):
    # {{altform}} expands to form-of HTML; the extractor should record the
    # target word and its romanization in sense["form_of"], tag the sense
    # "form-of", and keep the cleaned expansion as the gloss text.
    self.wxr.wtp.add_page(
        "แม่แบบ:altform",
        10,
        """<span class='form-of-definition use-with-mention'>อีกรูปหนึ่งของ <span class='form-of-definition-link'><i class="Lana mention" lang="nod">[[ᨸᩣ᩠ᨠ#ภาษาคำเมือง|ᨸᩣ᩠ᨠ]]</i> <span class="mention-gloss-paren annotation-paren">(</span><span lang="nod-Latn" class="mention-tr tr Latn">ปาก</span><span class="mention-gloss-paren annotation-paren">)</span></span></span>""",
    )
    page_data = parse_page(
        self.wxr,
        "ปาก",
        """== ภาษาคำเมือง ==
=== คำนาม ===
{{nod-noun}}
# {{altform|nod|ᨸᩣ᩠ᨠ}}""",
    )
    self.assertEqual(
        page_data[0]["senses"][0],
        {
            "glosses": ["อีกรูปหนึ่งของ ᨸᩣ᩠ᨠ (ปาก)"],
            "form_of": [{"word": "ᨸᩣ᩠ᨠ", "roman": "ปาก"}],
            "tags": ["form-of"],
        },
    )
40 changes: 40 additions & 0 deletions tests/test_th_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,43 @@ def test_list(self):
page_data[0]["synonyms"],
[{"word": "มณฑก"}],
)

# NOTE(review): "theasurus" in the method name looks like a typo for
# "thesaurus" — confirm nothing references this name before renaming.
def test_theasurus_page(self):
    # A linkage section that only links to a Thesaurus (อรรถาภิธาน) page
    # should pull the terms from that page and record the page title as
    # the linkage's "source".
    self.wxr.wtp.add_page(
        "อรรถาภิธาน:ระดู",
        110,
        """== ภาษาไทย ==
=== คำนาม ===
==== {{ws sense|th|เลือดที่ถูกขับถ่ายจากมดลูกออกมาทางช่องคลอดทุก ๆ ประมาณ 1 เดือน}} ====
===== คำพ้องความ =====
{{ws beginlist}}
{{ws|th|ต่อมโลหิต}}
{{ws endlist}}""",
    )
    page_data = parse_page(
        self.wxr,
        "ระดู",
        """== ภาษาไทย ==
=== คำนาม ===
# [[เลือด]]
==== คำพ้องความ ====
:''ดูที่ [[อรรถาภิธาน:ระดู]]''""",
    )
    self.assertEqual(
        page_data[0]["synonyms"],
        [{"word": "ต่อมโลหิต", "source": "อรรถาภิธาน:ระดู"}],
    )

def test_syn_template(self):
    # An inline {{syn|th|...}} under a gloss adds each positional argument
    # from the second onward to word_entry["synonyms"].
    page_data = parse_page(
        self.wxr,
        "โทรทัศน์",
        """== ภาษาไทย ==
=== คำนาม ===
# กระบวนการถ่ายทอด
#: {{syn|th|ทีวี|โทรภาพ}}""",
    )
    self.assertEqual(
        page_data[0]["synonyms"],
        [{"word": "ทีวี"}, {"word": "โทรภาพ"}],
    )

0 comments on commit ee63ee9

Please sign in to comment.