Skip to content

Commit

Permalink
[ko] extract nested gloss lists
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Oct 30, 2024
1 parent 598a4ee commit e012012
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 6 deletions.
25 changes: 19 additions & 6 deletions src/wiktextract/extractor/ko/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ def extract_pos_section(
extract_header_template(wxr, page_data[-1], node)
elif node.kind == NodeKind.LIST:
for list_item in node.find_child(NodeKind.LIST_ITEM):
if node.sarg.startswith("#"):
extract_gloss_list_item(wxr, page_data[-1], list_item)
if node.sarg.startswith("#") and node.sarg.endswith("#"):
extract_gloss_list_item(
wxr, page_data[-1], list_item, Sense()
)
else:
extract_unorderd_list_item(wxr, page_data[-1], list_item)

Expand All @@ -73,19 +75,30 @@ def extract_pos_section(


def extract_gloss_list_item(
wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
wxr: WiktextractContext,
word_entry: WordEntry,
list_item: WikiNode,
parent_sense: Sense,
) -> None:
gloss_nodes = []
sense = Sense()
sense = parent_sense.model_copy(deep=True)
for node in list_item.children:
if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
gloss_text = clean_node(wxr, sense, gloss_nodes)
if len(gloss_text) > 0:
sense.glosses.append(gloss_text)
translate_raw_tags(sense)
word_entry.senses.append(sense)
gloss_nodes.clear()
for nested_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_unorderd_list_item(wxr, word_entry, nested_list_item)
if node.sarg.startswith("#") and node.sarg.endswith("#"):
extract_gloss_list_item(
wxr, word_entry, nested_list_item, sense
)
else:
extract_unorderd_list_item(
wxr, word_entry, nested_list_item
)
continue
elif isinstance(node, TemplateNode) and node.template_name.endswith(
" of"
Expand Down Expand Up @@ -127,7 +140,7 @@ def extract_unorderd_list_item(
if re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text):
new_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
new_list_item.children = list_item.children[index + 1 :]
extract_gloss_list_item(wxr, word_entry, new_list_item)
extract_gloss_list_item(wxr, word_entry, new_list_item, Sense())
break
elif isinstance(node, str) and "어원:" in node:
etymology_nodes = []
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/ko/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,5 @@
"같이 보기": "related",
"복합어": "derived",
"관련 단어": "related",
"동의어": "synonyms",
}
5 changes: 5 additions & 0 deletions src/wiktextract/extractor/ko/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@
"자동사": "intransitive",
"직역": "literally",
"타동사": "transitive",
"드물게": "rare",
"원래의 의미": "naturally",
"문학적": "literary",
"해학적": "humorous",
"완곡적": "euphemistic",
}

SOUND_TAGS = {
Expand Down
27 changes: 27 additions & 0 deletions tests/test_ko_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,30 @@ def test_ko_verb(self):
data[0]["categories"],
["한국어 비표준 문자가 포함된 낱말 (링크)", "한국어 동사"],
)

def test_nested_gloss_lists(self):
    """Check the senses produced for a gloss list with a nested ``##`` item.

    The expected result has two senses: one holding only the top-level
    gloss, and one holding the top-level gloss followed by the nested gloss.
    """
    # Wikitext fixture: one top-level gloss line with a single nested gloss.
    wikitext = """== 한국어 ==
=== 어원 1 ===
==== 명사 ====
# 하는 짓이나 생각이 변변치 못한 사람을 낮잡아 이르는 말.
## 남에게 [[당하다|당하거나]] [[헌신하다|헌신하기만]] 하는 대상을 동정하거나, 혹은 그런 사람이 자신의 [[처지]]를 [[하소연하다|하소연할]] 때 사용하는 표현."""
    parent_gloss = "하는 짓이나 생각이 변변치 못한 사람을 낮잡아 이르는 말."
    # Wiki links in the nested line are expected to be reduced to their
    # display text in the extracted gloss.
    child_gloss = "남에게 당하거나 헌신하기만 하는 대상을 동정하거나, 혹은 그런 사람이 자신의 처지를 하소연할 때 사용하는 표현."
    expected_senses = [
        {"glosses": [parent_gloss]},
        {"glosses": [parent_gloss, child_gloss]},
    ]

    page_data = parse_page(self.wxr, "병신", wikitext)

    self.assertEqual(page_data[0]["senses"], expected_senses)

0 comments on commit e012012

Please sign in to comment.