Skip to content

Commit

Permalink
[en] don't extract converted topic data as form
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Nov 21, 2024
1 parent f92d800 commit 6802294
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 1 deletion.
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/en/form_descriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2441,6 +2441,8 @@ def strokes_repl(m: re.Match) -> str:
):
if alt_related is not None:
break
if len(topics) > 0 and len(tagsets) > 0:
break
continue
if (
i > 1
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/en/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@

# Matches head tag
HEAD_TAG_RE = re.compile(
r"^(head|Han char|arabic-noun|arabic-noun-form|"
r"^(head|Han char|arabic-noun|arabic-noun-form|term-label|tlb|"
r"hangul-symbol|syllable-hangul)$|"
+ r"^(latin|"
+ "|".join(lang_code for lang_code, *_ in get_all_names("en"))
Expand Down
26 changes: 26 additions & 0 deletions tests/test_en_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,3 +777,29 @@ def test_two_head_lines(self):
)[0]["senses"][0]["tags"],
["feminine", "dialectal", "masculine"],
)

def test_converted_topic_is_not_form(self):
    """Check that a converted topic label does not become a form.

    Registers canned expansions for the ``term-label`` and ``en-noun``
    templates, parses a head line that uses both, and asserts that the
    only extracted form is the plural emitted by ``en-noun``.
    """
    # GH issue #906
    # "fandom slang" -> "slang lifestyle" in "tags.py"
    template_pages = (
        (
            "Template:term-label",
            """<span class="usage-label-term"><span class="ib-brac">(</span><span class="ib-content">[[fandom]] [[slang]]<span class="ib-comma">,</span>&#32;sometimes&#32;[[derogatory]]</span><span class="ib-brac">)</span></span>""",
        ),
        (
            "Template:en-noun",
            """<span class="headword-line"><strong class="Latn headword" lang="en">chuunibyou</strong> (<i>[[Appendix:Glossary#countable|countable]] and [[Appendix:Glossary#uncountable|uncountable]]</i>, <i>plural</i> <b class="Latn form-of lang-en p-form-of" lang="en">[[chuunibyou#English|chuunibyou]]</b>)</span>""",
        ),
    )
    # Register the fixtures in the same order as they are listed above.
    for page_title, expanded_html in template_pages:
        self.wxr.wtp.add_page(page_title, 10, expanded_html)

    wikitext = """==English==
===Noun===
{{en-noun|~|chuunibyou}} {{term-label|en|fandom slang|sometimes|derogatory}}
# gloss"""
    page_data = parse_page(self.wxr, "chuunibyou", wikitext)
    extracted_forms = page_data[0]["forms"]

    expected_forms = [{"form": "chuunibyou", "tags": ["plural"]}]
    self.assertEqual(extracted_forms, expected_forms)

0 comments on commit 6802294

Please sign in to comment.