Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[en] Retry decoding tags with errors if " and " in tag #893

Merged
merged 1 commit into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 17 additions & 11 deletions src/wiktextract/extractor/en/form_descriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,13 +441,6 @@
r"(" + "|".join(re.escape(x) for x in head_end_map.keys()) + r")$"
)

# Words that can be part of form description
valid_words: set[str] = set(["or", "and"])
for x in valid_tags:
valid_words.update(x.split(" "))
for x in xlat_tags_map.keys():
valid_words.update(x.split(" "))


# Dictionary of language-specific parenthesized head part starts that
# either introduce new tags or modify previous tags. The value for each
Expand Down Expand Up @@ -929,6 +922,7 @@ def check_unknown(
words = wordlst[from_i:to_i]
tag = " ".join(words)
assert tag
# print(f"{tag=}")
if re.match(ignored_unknown_starts_re, tag):
# Tags with this start are to be ignored
return [(from_i, ["UNKNOWN"], [])]
Expand Down Expand Up @@ -1010,11 +1004,14 @@ def decode_tags(
# I hate Python's *nested* list comprehension syntax ^
or any(s.startswith("error-") for s in topics)
):
# slashes_re contains valid key entries with slashes; we're going to
# skip them by splitting the string and skipping handling every
# second entry, which contains the splitting group like "masculine/
# feminine" style keys.
new_tagsets: list[tuple[str, ...]] = []
new_topics: list[str] = []

if "/" in src:
# slashes_re matches valid keys that themselves contain a slash, such as
# "masculine/feminine". Splitting the string on it leaves those matched
# keys in every second entry of the result, so we skip handling every
# second entry and keep such keys intact.
split_parts = re.split(slashes_re, src)
new_parts: list[str] = []
if len(split_parts) > 1:
Expand All @@ -1029,7 +1026,16 @@ def decode_tags(
new_tagsets, new_topics = decode_tags1(
new_src, allow_any, no_unknown_starts
)
elif " or " in src or " and " in src:
# Annoying kludge: retry the decoding with every " and " / " or "
# replaced by a single space.
new_src = src.replace(" and ", " ")
new_src = new_src.replace(" or ", " ")
new_tagsets, new_topics = decode_tags1(
new_src, allow_any, no_unknown_starts
)
# print(f"{new_tagsets=}")

if new_tagsets or new_topics:
old_errors = sum(
1 for tagset in tagsets for s in tagset if s.startswith("error")
)
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -4508,7 +4508,7 @@
"syncopated": "syncope",
"reduplication with syncope": "reduplication syncope",
"introducing subjunctive hortative": "subjunctive hortative",
"nominative and vocative plural animate": "nominative vocative",
"nominative and vocative plural animate": "nominative vocative plural animate",
"with diaeresis to indicate disyllabilicity": "",
"aphaeretic variant": "variant",
"mediopassive voice": "mediopassive",
Expand Down
213 changes: 155 additions & 58 deletions tests/test_en_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@


class EnTagTests(unittest.TestCase):

def test_empty(self):
    # An empty description must decode to a single empty tagset and an
    # empty topic list.
    expected = ([()], [])
    self.assertEqual(decode_tags(""), expected)
Expand Down Expand Up @@ -67,7 +66,15 @@ def test_tags12(self):

def test_tags13(self):
ret, topics = decode_tags("class 2a stress pattern xyz")
self.assertEqual(ret, [("class-2a", "error-unknown-tag",)])
self.assertEqual(
ret,
[
(
"class-2a",
"error-unknown-tag",
)
],
)

def test_tags14(self):
ret, topics = decode_tags("Cockney rhyming slang")
Expand All @@ -84,46 +91,55 @@ def test_tags16(self):

def test_tags17(self):
ret, topics = decode_tags("colloquial Cockney Test rhyming slang")
self.assertEqual(ret, [("Cockney", "colloquial",
"error-unknown-tag", "slang")])
self.assertEqual(
ret, [("Cockney", "colloquial", "error-unknown-tag", "slang")]
)

def test_tags18(self):
ret, topics = decode_tags("colloquial Cockney Test unknown1 "
"rhyming slang")
self.assertEqual(ret, [("Cockney", "colloquial",
"error-unknown-tag", "slang")])
ret, topics = decode_tags(
"colloquial Cockney Test unknown1 " "rhyming slang"
)
self.assertEqual(
ret, [("Cockney", "colloquial", "error-unknown-tag", "slang")]
)

def test_tags19(self):
ret, topics = decode_tags("colloquial Cockney Test unknown1 "
"rhyming slang",
allow_any=True)
self.assertEqual(ret, [("Cockney", "Test unknown1", "colloquial",
"slang")])
ret, topics = decode_tags(
"colloquial Cockney Test unknown1 " "rhyming slang", allow_any=True
)
self.assertEqual(
ret, [("Cockney", "Test unknown1", "colloquial", "slang")]
)

def test_tags20(self):
ret, topics = decode_tags("colloquial Cockney rhyming slang "
"Test unknown1",
allow_any=True)
self.assertEqual(ret, [("Cockney", "Test unknown1", "colloquial",
"slang")])
ret, topics = decode_tags(
"colloquial Cockney rhyming slang " "Test unknown1", allow_any=True
)
self.assertEqual(
ret, [("Cockney", "Test unknown1", "colloquial", "slang")]
)

def test_tags21(self):
    # "simple past and past participle" must decode into two tagsets
    # and produce no topics.
    tagsets, topic_list = decode_tags("simple past and past participle")
    self.assertEqual(topic_list, [])
    expected_tagsets = [("participle", "past"), ("past",)]
    self.assertEqual(tagsets, expected_tagsets)

def test_tags22(self):
ret, topics = decode_tags("colloquial Cockney Test, unknown1; "
"rhyming slang",
allow_any=True)
self.assertEqual(ret, [("Cockney", "Test", "colloquial",
"slang", "unknown1")])
ret, topics = decode_tags(
"colloquial Cockney Test, unknown1; " "rhyming slang",
allow_any=True,
)
self.assertEqual(
ret, [("Cockney", "Test", "colloquial", "slang", "unknown1")]
)

def test_tags23(self):
ret, topics = decode_tags("intransitive, in perfect tenses, "
"without predicate")
self.assertEqual(ret, [("in perfect tenses",
"intransitive",
"without predicate")])
ret, topics = decode_tags(
"intransitive, in perfect tenses, " "without predicate"
)
self.assertEqual(
ret, [("in perfect tenses", "intransitive", "without predicate")]
)

def test_tags24(self):
ret, topics = decode_tags("as a modifier in compound words")
Expand Down Expand Up @@ -187,7 +203,7 @@ def test_tags38(self):

def test_tags39(self):
ret, topics = decode_tags("with inf., obsolescent")
self.assertEqual(ret, [("obsolete", "possibly","with-infinitive")])
self.assertEqual(ret, [("obsolete", "possibly", "with-infinitive")])

def test_tags40(self):
ret, topics = decode_tags("transitive of people")
Expand All @@ -198,46 +214,88 @@ def test_tags41(self):
self.assertEqual(ret, [("error-unknown-tag", "transitive")])

def test_tags42(self):
ret, topics = decode_tags("first/third-person singular present "
"subjunctive")
self.assertEqual(ret, [("first-person", "present",
"singular", "subjunctive", "third-person")])
ret, topics = decode_tags(
"first/third-person singular present " "subjunctive"
)
self.assertEqual(
ret,
[
(
"first-person",
"present",
"singular",
"subjunctive",
"third-person",
)
],
)

def test_tags43(self):
    # "inflection of" must decode to the single tag "form-of".
    tagsets, _ = decode_tags("inflection of")
    expected_tagsets = [("form-of",)]
    self.assertEqual(tagsets, expected_tagsets)

def test_tags44(self):
ret, topics = decode_tags("third-person singular present indicative")
self.assertEqual(ret, [("indicative", "present", "singular",
"third-person",)])
self.assertEqual(
ret,
[
(
"indicative",
"present",
"singular",
"third-person",
)
],
)

def test_tags45(self):
    # "ordinal form of" must decode to one tagset holding "form-of" and
    # "ordinal".
    tagsets, _ = decode_tags("ordinal form of")
    expected_tagsets = [("form-of", "ordinal")]
    self.assertEqual(tagsets, expected_tagsets)

def test_tags46(self):
ret, topics = decode_tags("first-person singular (eu) present "
"subjunctive")
self.assertEqual(ret, [("first-person", "present", "singular",
"subjunctive", "with-eu")])
ret, topics = decode_tags(
"first-person singular (eu) present " "subjunctive"
)
self.assertEqual(
ret,
[("first-person", "present", "singular", "subjunctive", "with-eu")],
)

def test_tags47(self):
ret, topics = decode_tags("third-person singular (él, ella, also "
"used with usted) present subjunctive "
"form of")
self.assertEqual(ret, [("form-of", "present", "singular", "subjunctive",
"third-person",
"with-ella", "with-usted", "with-él")])
ret, topics = decode_tags(
"third-person singular (él, ella, also "
"used with usted) present subjunctive "
"form of"
)
self.assertEqual(
ret,
[
(
"form-of",
"present",
"singular",
"subjunctive",
"third-person",
"with-ella",
"with-usted",
"with-él",
)
],
)

def test_tags48(self):
    # "instant messaging" must decode to the single tag "Internet".
    tagsets, _ = decode_tags("instant messaging")
    expected_tagsets = [("Internet",)]
    self.assertEqual(tagsets, expected_tagsets)

def test_tags49(self):
ret, topics = decode_tags("plural and definite singular attributive")
self.assertEqual(ret, [("attributive", "definite", "singular"),
("attributive", "plural")])
self.assertEqual(
ret,
[
("attributive", "definite", "singular"),
("attributive", "plural"),
],
)

def test_tags50(self):
ret, topics = decode_tags("alternative spelling of")
Expand All @@ -261,8 +319,16 @@ def test_tags54(self):

def test_tags55(self):
ret, topics = decode_tags("plural and definite singular attributive")
self.assertEqual(ret, [("attributive", "definite", "singular"),
("attributive", "plural",)])
self.assertEqual(
ret,
[
("attributive", "definite", "singular"),
(
"attributive",
"plural",
),
],
)

def test_tags56(self):
ret, topics = decode_tags("comparative")
Expand Down Expand Up @@ -293,10 +359,13 @@ def test_tags62(self):
self.assertEqual(ret, [("definite", "plural", "singular")])

def test_tags63(self):
ret, topics = decode_tags("first-person plural "
"reflexive/dative/accusative form")
self.assertEqual(ret, [("accusative", "dative", "first-person",
"plural", "reflexive")])
ret, topics = decode_tags(
"first-person plural " "reflexive/dative/accusative form"
)
self.assertEqual(
ret,
[("accusative", "dative", "first-person", "plural", "reflexive")],
)

self.assertEqual(topics, [])

Expand All @@ -314,15 +383,43 @@ def test_tags66(self):
# during the first run, except for keys with slashes in them.
ret, topics = decode_tags("nominative/plural masculine/feminine")
# -> "nominative plural masculine/feminine"
self.assertEqual(ret, [("feminine", "masculine",
"nominative", "plural",)])
self.assertEqual(
ret,
[
(
"feminine",
"masculine",
"nominative",
"plural",
)
],
)

def test_topics1(self):
    # Decoding "nautical" must report the topics "nautical" and
    # "transport", in that order.
    _, topic_list = decode_tags("nautical")
    expected_topics = ["nautical", "transport"]
    self.assertEqual(topic_list, expected_topics)

def test_topics2(self):
ret, topics = decode_tags("ropemaking")
self.assertEqual(topics, ["ropemaking", "crafts",
"nautical", "transport",
"arts", "hobbies", "lifestyle"])
self.assertEqual(
topics,
[
"ropemaking",
"crafts",
"nautical",
"transport",
"arts",
"hobbies",
"lifestyle",
],
)

def test_and(self):
    # Two tags joined by " and " must end up together in one tagset.
    tagsets, _ = decode_tags("nominative and accusative")
    expected_tagsets = [("accusative", "nominative")]
    self.assertEqual(tagsets, expected_tagsets)