Skip to content

Commit

Permalink
Merge pull request #896 from xxyzz/ko
Browse files Browse the repository at this point in the history
[ko] improve extract gloss list and example list code
  • Loading branch information
xxyzz authored Nov 1, 2024
2 parents d49d402 + 6bfa818 commit d6bf104
Show file tree
Hide file tree
Showing 6 changed files with 131 additions and 21 deletions.
35 changes: 31 additions & 4 deletions src/wiktextract/extractor/ko/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .models import Example, Sense
from ..share import set_sound_file_url_fields
from .models import Example, Sense, Sound


def extract_example_list_item(
Expand All @@ -14,6 +15,8 @@ def extract_example_list_item(
parent_example: Example | None = None,
) -> None:
example = Example() if parent_example is None else parent_example
e_text_nodes = []
e_tr_nodes = []
after_lang_template = False
for node in list_item.children:
if isinstance(node, TemplateNode) and node.template_name == "lang":
Expand All @@ -33,11 +36,31 @@ def extract_example_list_item(
extract_ux_template(wxr, sense, example, node)
break
elif after_lang_template:
example.translation += clean_node(wxr, None, node)
e_tr_nodes.append(node)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
break
elif (
isinstance(node, WikiNode)
and node.kind == NodeKind.LINK
and len(node.largs) > 0
and len(node.largs[0]) > 0
and isinstance(node.largs[0][0], str)
and node.largs[0][0].startswith("File:")
):
sound = Sound()
sound_file = node.largs[0][0].removeprefix("File:").strip()
set_sound_file_url_fields(wxr, sound_file, sound)
if sound.audio != "":
example.sounds.append(sound)
else:
example.text += clean_node(wxr, None, node)
e_text_nodes.append(node)

e_text = clean_node(wxr, sense, e_text_nodes)
if e_text != "":
example.text = e_text
e_tr = clean_node(wxr, sense, e_tr_nodes)
if e_tr != "":
example.translation = e_tr

if len(example.text) > 0:
if lang_code == "zh" and "/" in example.text:
Expand All @@ -56,7 +79,11 @@ def extract_example_list_item(
for nested_list in list_item.find_child(NodeKind.LIST):
for nested_list_item in nested_list.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(
wxr, sense, nested_list_item, lang_code, example
wxr,
sense,
nested_list_item,
lang_code,
example if example.text == "" else Example(),
)


Expand Down
37 changes: 21 additions & 16 deletions src/wiktextract/extractor/ko/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,22 @@ class KoreanBaseModel(BaseModel):
)


class Sound(KoreanBaseModel):
    """A pronunciation record: IPA text, an audio file name and audio URLs.

    Every field defaults to an empty string or empty list.
    """

    ipa: str = Field(default="", description="International Phonetic Alphabet")
    audio: str = Field(default="", description="Audio file name")
    # One URL field per audio format. Presumably filled in from the file name
    # by `set_sound_file_url_fields` -- TODO confirm against that helper.
    wav_url: str = ""
    oga_url: str = ""
    ogg_url: str = ""
    mp3_url: str = ""
    opus_url: str = ""
    flac_url: str = ""
    tags: list[str] = []
    raw_tags: list[str] = []
    # NOTE(review): these look like Korean-specific renderings of the
    # pronunciation (Hangul spelling, romanization, anything else) -- confirm
    # against the code that populates them.
    hangul: str = ""
    roman: str = ""
    other: str = ""


class Example(KoreanBaseModel):
text: str = ""
translation: str = ""
Expand All @@ -21,6 +37,7 @@ class Example(KoreanBaseModel):
tags: list[str] = []
literal_meaning: str = ""
note: str = ""
sounds: list[Sound] = []


class AltForm(KoreanBaseModel):
Expand All @@ -36,22 +53,7 @@ class Sense(KoreanBaseModel):
examples: list[Example] = []
note: str = ""
form_of: list[AltForm] = []


class Sound(KoreanBaseModel):
    """A pronunciation record: IPA text, an audio file name and audio URLs.

    Every field defaults to an empty string or empty list.
    """

    ipa: str = Field(default="", description="International Phonetic Alphabet")
    audio: str = Field(default="", description="Audio file name")
    # One URL field per audio format. Presumably filled in from the file name
    # by `set_sound_file_url_fields` -- TODO confirm against that helper.
    wav_url: str = ""
    oga_url: str = ""
    ogg_url: str = ""
    mp3_url: str = ""
    opus_url: str = ""
    flac_url: str = ""
    tags: list[str] = []
    raw_tags: list[str] = []
    # NOTE(review): these look like Korean-specific renderings of the
    # pronunciation (Hangul spelling, romanization, anything else) -- confirm
    # against the code that populates them.
    hangul: str = ""
    roman: str = ""
    other: str = ""
pattern: str = Field(default="", description="Sentence structure, 문형")


class Linkage(KoreanBaseModel):
Expand Down Expand Up @@ -100,3 +102,6 @@ class WordEntry(KoreanBaseModel):
etymology_texts: list[str] = []
note: str = ""
forms: list[Form] = []
pattern: str = Field(
default="", description="Sentence structure, 문형", exclude=True
)
11 changes: 10 additions & 1 deletion src/wiktextract/extractor/ko/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@ def extract_pos_section(
for list_item in node.find_child(NodeKind.LIST_ITEM):
if node.sarg.startswith("#") and node.sarg.endswith("#"):
extract_gloss_list_item(
wxr, page_data[-1], list_item, Sense()
wxr,
page_data[-1],
list_item,
Sense(pattern=page_data[-1].pattern),
)
else:
extract_unorderd_list_item(wxr, page_data[-1], list_item)
Expand Down Expand Up @@ -174,6 +177,12 @@ def extract_unorderd_list_item(
):
extract_linkage_list_item(wxr, word_entry, list_item, "")
break
elif isinstance(node, str) and "문형:" in node:
word_entry.pattern = node[node.index(":") + 1 :].strip()
word_entry.pattern += clean_node(
wxr, None, list_item.children[index + 1 :]
)
break
else:
if len(word_entry.senses) > 0:
extract_example_list_item(
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/ko/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
"연어": {"pos": "phrase", "tags": ["idiomatic"]},
"동사 활용형": {"pos": "verb", "tags": ["form-of"]},
"재귀동사": {"pos": "verb", "tags": ["reflexive"]},
"보조형용사": {"pos": "adj", "tags": ["auxiliary"]},
}

LINKAGE_SECTIONS = {
Expand Down
42 changes: 42 additions & 0 deletions tests/test_ko_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,3 +177,45 @@ def test_jibong_yuseol_template(self):
"ref": "1614년, 이수광, 《지봉유설》, 〈2권 外國 條〉",
},
)

def test_sound_file(self):
    """A `[[File:...]]` link in an example item is reported as a sound.

    The example text must contain only the sentence, and the linked file
    name must show up as the `audio` value of the example's first sound.
    """
    wikitext = """== 중국어 ==
=== 명사 ===
==== 명사 1 ====
# 어떤 지역이나 시기에 태어나거나 살고 있거나 살았던 자.
:* 한국 '''사람''' [[File:Ko-한국 사람.oga]]"""
    page_data = parse_page(self.wxr, "사람", wikitext)
    first_example = page_data[0]["senses"][0]["examples"][0]
    # The file link itself must not leak into the example text.
    self.assertEqual(first_example["text"], "한국 사람")
    self.assertEqual(
        first_example["sounds"][0]["audio"],
        "Ko-한국 사람.oga",
    )

def test_wrong_nested_list(self):
    """A nested item under an example that already has text is a new example.

    The first example line (`: ...`) carries no audio. The nested `:*` line
    has its own text and sound file and must be emitted as a second
    example instead of being merged into the first one.
    """
    data = parse_page(
        self.wxr,
        "들다",
        """== 중국어 ==
=== 명사 ===
==== 명사 1 ====
# 한 곳에서 다른 어디로 또는 밖에서 속이나 안으로 향해 가거나, 오거나 또는 어디에 자리하다.
: 안으로 드시지요.
:* 물이 어디에 '''들어''' 있어요? [[File:물이 어디에 들어 있어요?.ogg]]""",
    )
    examples = data[0]["senses"][0]["examples"]
    self.assertEqual(examples[0]["text"], "안으로 드시지요.")
    # assertNotIn reports the offending dict on failure, unlike
    # assertTrue("sounds" not in ...), which only says "False is not true".
    self.assertNotIn("sounds", examples[0])
    self.assertEqual(
        examples[1]["text"],
        "물이 어디에 들어 있어요?",
    )
    self.assertEqual(
        examples[1]["sounds"][0]["audio"],
        "물이 어디에 들어 있어요?.ogg",
    )
26 changes: 26 additions & 0 deletions tests/test_ko_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,3 +203,29 @@ def test_nested_gloss_lists(self):
},
],
)

def test_pattern_list(self):
    """Each `*문형:` line sets the `pattern` of the gloss that follows it."""
    wikitext = """== 한국어 ==
=== 동사 ===
==== 동사 2 ====
*문형: […을] [(…과) …을]
# 마주 향하여 있다.
*문형: […에/에게 -게] […을 …으로] […을 -게]
# 어떤 태도로 상대하다."""
    expected_senses = [
        {
            "glosses": ["마주 향하여 있다."],
            "pattern": "[…을] [(…과) …을]",
        },
        {
            "glosses": ["어떤 태도로 상대하다."],
            "pattern": "[…에/에게 -게] […을 …으로] […을 -게]",
        },
    ]
    page_data = parse_page(self.wxr, "대하다", wikitext)
    self.assertEqual(page_data[0]["senses"], expected_senses)

0 comments on commit d6bf104

Please sign in to comment.