From c84f72a9c30262657f75645b00fd30c34641ddd9 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 4 Nov 2024 14:17:50 +0800 Subject: [PATCH 1/3] [nl] fix `AttributeError` in page "Waal" "-nlnoun-" template doc doesn't say the fifth param is used in table header if provided --- src/wiktextract/extractor/nl/inflection.py | 2 +- src/wiktextract/extractor/nl/tags.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/wiktextract/extractor/nl/inflection.py b/src/wiktextract/extractor/nl/inflection.py index 053d6678..e7deda69 100644 --- a/src/wiktextract/extractor/nl/inflection.py +++ b/src/wiktextract/extractor/nl/inflection.py @@ -45,7 +45,7 @@ def extract_noun_adj_table( ).splitlines(): if form_str not in ["", "-", wxr.wtp.title]: form = Form(form=form_str) - if row_header not in ["", "naamwoord"]: + if row_header not in ["", "naamwoord", "demoniem"]: form.raw_tags.append(row_header) if col_index - 1 < len(column_headers): form.raw_tags.append( diff --git a/src/wiktextract/extractor/nl/tags.py b/src/wiktextract/extractor/nl/tags.py index fbc1f118..e46eea13 100644 --- a/src/wiktextract/extractor/nl/tags.py +++ b/src/wiktextract/extractor/nl/tags.py @@ -375,7 +375,7 @@ def translate_raw_tags(data: WordEntry) -> None: data.tags.append(tr_tag) elif isinstance(tr_tag, list): data.tags.extend(tr_tag) - elif raw_tag in TOPICS: + elif raw_tag in TOPICS and hasattr(data, "topics"): tr_topic = TOPICS[raw_tag] if isinstance(tr_topic, str): data.topics.append(tr_topic) From 40e30de4703b1a5c6f8b9aa1679ffe2bfb167413 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 4 Nov 2024 16:11:01 +0800 Subject: [PATCH 2/3] [nl] remove table wikitext before table cell text The table expanded from "-nlverb-" template has 3 "|" before some cell texts, and `clean_node()` returns `|text` for these cells. --- src/wiktextract/extractor/nl/inflection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wiktextract/extractor/nl/inflection.py b/src/wiktextract/extractor/nl/inflection.py index e7deda69..00e6b435 100644 --- a/src/wiktextract/extractor/nl/inflection.py +++ b/src/wiktextract/extractor/nl/inflection.py @@ -158,7 +158,7 @@ def extract_nlverb_template( cell_rowspan_str = cell_node.attrs.get("rowspan", "1") if re.fullmatch(r"\d+", cell_rowspan_str): cell_rowspan = int(cell_rowspan_str) - cell_str = clean_node(wxr, None, cell_node) + cell_str = clean_node(wxr, None, cell_node).strip("| ") if cell_str in ["", wxr.wtp.title]: col_index += cell_colspan is_row_first_node = False From 6265fe1765ca97ffff3ea1249e3162ba885ebc02 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 4 Nov 2024 16:42:10 +0800 Subject: [PATCH 3/3] [nl] remove number at the end of section title some pages like "baken" use title `Zelfstandig naamwoord #1` --- src/wiktextract/extractor/nl/page.py | 2 ++ src/wiktextract/extractor/nl/section_titles.py | 1 + 2 files changed, 3 insertions(+) diff --git a/src/wiktextract/extractor/nl/page.py b/src/wiktextract/extractor/nl/page.py index 56e1f531..ec816d12 100644 --- a/src/wiktextract/extractor/nl/page.py +++ b/src/wiktextract/extractor/nl/page.py @@ -1,3 +1,4 @@ +import re from typing import Any from mediawiki_langcodes import name_to_code @@ -39,6 +40,7 @@ def parse_section( # title templates # https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen title_text = clean_node(wxr, None, level_node.largs) + title_text = re.sub(r"\s+#?\d+:?$", "", title_text) wxr.wtp.start_subsection(title_text) etymology_data = [] if title_text in POS_DATA: diff --git a/src/wiktextract/extractor/nl/section_titles.py b/src/wiktextract/extractor/nl/section_titles.py index d29d0aed..7a4cdcc1 100644 --- a/src/wiktextract/extractor/nl/section_titles.py +++ b/src/wiktextract/extractor/nl/section_titles.py @@ -46,6 +46,7 @@ "Achtervoegsel": {"pos": "suffix", "tags": ["morpheme"]}, "Symbool": {"pos": "symbol"}, "Werkwoord": {"pos": "verb"}, + "Betrekkelijk naamwoord": {"pos": "noun", "tags": ["relative"]}, }