Skip to content

Commit

Permalink
Merge pull request #913 from xxyzz/nl
Browse files Browse the repository at this point in the history
[nl] extract more forms table templates
  • Loading branch information
xxyzz authored Nov 19, 2024
2 parents 70c2e05 + fb4c28a commit 8972197
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 8 deletions.
47 changes: 45 additions & 2 deletions src/wiktextract/extractor/nl/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ def extract_inflection_template(
extract_noun_adj_table(wxr, word_entry, t_node)
elif t_node.template_name == "-nlstam-":
extract_nlstam_template(wxr, word_entry, t_node)
elif t_node.template_name.startswith("-csadjc-comp-"):
extract_csadjc_comp_template(wxr, word_entry, t_node)


def extract_noun_adj_table(
Expand Down Expand Up @@ -94,7 +96,7 @@ def extract_vervoeging_page(
return
root = wxr.wtp.parse(page.body)
for t_node in root.find_child(NodeKind.TEMPLATE):
if t_node.template_name == "-nlverb-":
if t_node.template_name in ["-nlverb-", "-nlverb-reflex-"]:
extract_nlverb_template(wxr, word_entry, t_node)


Expand All @@ -111,13 +113,15 @@ class TableHeader:
"vervoeging van de bedrijvende vorm van": ["active"],
"onpersoonlijke lijdende vorm": ["impersonal", "passive"],
"lijdende vorm": ["passive"],
"vervoeging van het Nederlandse werkwoord": [],
}


def extract_nlverb_template(
wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
# https://nl.wiktionary.org/wiki/Sjabloon:-nlverb-
# Sjabloon:-nlverb-reflex-
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
Expand Down Expand Up @@ -152,6 +156,7 @@ def extract_nlverb_template(
col_headers.clear()
row_headers.clear()

small_tag = ""
is_row_first_node = True
for cell_node in row_node.find_child(
NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
Expand Down Expand Up @@ -179,7 +184,10 @@ def extract_nlverb_template(
break
else:
if current_row_all_header:
if is_row_first_node:
if (
is_row_first_node
and t_node.template_name == "-nlverb-"
):
shared_raw_tags.append(cell_str)
else:
col_headers.append(
Expand All @@ -206,12 +214,22 @@ def extract_nlverb_template(
)
)
else:
has_small_tag = False
for small_node in cell_node.find_html("small"):
has_small_tag = True
if has_small_tag:
small_tag = cell_str
col_index += cell_colspan
continue
form = Form(
form=cell_str,
tags=shared_tags,
raw_tags=shared_raw_tags,
source=f"{wxr.wtp.title}/vervoeging",
)
if small_tag != "":
form.raw_tags.append(small_tag)
small_tag = ""
for row_header in row_headers:
if (
row_index >= row_header.row_index
Expand Down Expand Up @@ -241,3 +259,28 @@ def nlverb_table_cell_is_header(node: WikiNode) -> bool:
node.kind == NodeKind.TABLE_HEADER_CELL
or node.attrs.get("class", "") == "infoboxrijhoofding"
)


def extract_csadjc_comp_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect forms from an expanded "-csadjc-comp-*" template table.

    The template node is converted back to wikitext and re-parsed with
    full expansion. Every table row is then scanned cell by cell: the
    text of a header cell is remembered as the raw tag for the cells
    that follow it in the same row, and each data cell whose cleaned
    text is neither empty nor equal to the page title is appended to
    ``word_entry.forms`` as a new ``Form``. ``translate_raw_tags`` is
    called on each form before it is appended.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-csadjc-comp-ý3-
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    tree = wxr.wtp.parse(template_wikitext, expand_all=True)
    for table_node in tree.find_child(NodeKind.TABLE):
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            # The header text is reset for every row, so it never leaks
            # into the following row.
            raw_tag = ""
            for cell in row_node.find_child(cell_kinds):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    raw_tag = clean_node(wxr, None, cell)
                    continue
                if cell.kind != NodeKind.TABLE_CELL:
                    continue
                text = clean_node(wxr, None, cell)
                # Skip blank cells and cells that only repeat the page
                # title.
                if text in ("", wxr.wtp.title):
                    continue
                form = Form(form=text)
                if raw_tag:
                    form.raw_tags.append(raw_tag)
                translate_raw_tags(form)
                word_entry.forms.append(form)
15 changes: 9 additions & 6 deletions src/wiktextract/extractor/nl/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,18 +92,14 @@ def parse_section(
extract_fixed_preposition_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text == "Vervoeging":
pass # conjugation
elif title_text == "Verbuiging":
pass # inflection
elif title_text in [
"Gangbaarheid",
"Meer informatie",
"Verwijzingen",
"Citaten",
]:
pass # ignore
else:
elif not title_text.startswith(("Vervoeging", "Verbuiging")):
wxr.wtp.debug(f"unknown title: {title_text}", sortid="nl/page/60")

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
Expand All @@ -112,7 +108,14 @@ def parse_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
for t_node in level_node.find_child(NodeKind.TEMPLATE):
extract_inflection_template(wxr, forms_data, t_node)
extract_inflection_template(
wxr,
page_data[-1]
if title_text.startswith(("Vervoeging", "Verbuiging"))
and len(page_data) > 0
else forms_data,
t_node,
)
return etymology_data


Expand Down
8 changes: 8 additions & 0 deletions src/wiktextract/extractor/nl/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,14 @@
# Sjabloon:-denoun-
"datief": "dative",
"accusatief": "accusative",
# Sjabloon:-nlverb-reflex-
"tegenwoordige tijd": "present",
"verleden tijd": "past",
"toekomende tijd": "future",
"1": "first-person",
"2": "second-person",
"3": "third-person",
"voltooide tijd": "past",
}


Expand Down

0 comments on commit 8972197

Please sign in to comment.