From 33acb4012c3425c044dd9cb751b1f7eb6946d63f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Tue, 29 Oct 2024 11:38:00 +0200 Subject: [PATCH] [en] Retry decoding tags with errors if " and " in tag Removed unused code with `valid_words` from 2020. Didn't find an elegant way to use it where it was meant to be used. I've been banging my head trying to figure out what to do in decode_tags1 to get " and " to parse correctly (that is, to be ignored), and finally gave up. Instead, I've implemented a new kludge in the vein of the "/"-kludge preceding this, except thankfully simpler. --- .../extractor/en/form_descriptions.py | 28 ++- src/wiktextract/tags.py | 2 +- tests/test_en_tags.py | 213 +++++++++++++----- 3 files changed, 173 insertions(+), 70 deletions(-) diff --git a/src/wiktextract/extractor/en/form_descriptions.py b/src/wiktextract/extractor/en/form_descriptions.py index e4a802039..e39575928 100644 --- a/src/wiktextract/extractor/en/form_descriptions.py +++ b/src/wiktextract/extractor/en/form_descriptions.py @@ -441,13 +441,6 @@ r"(" + "|".join(re.escape(x) for x in head_end_map.keys()) + r")$" ) -# Words that can be part of form description -valid_words: set[str] = set(["or", "and"]) -for x in valid_tags: - valid_words.update(x.split(" ")) -for x in xlat_tags_map.keys(): - valid_words.update(x.split(" ")) - # Dictionary of language-specific parenthesized head part starts that # either introduce new tags or modify previous tags. 
The value for each @@ -929,6 +922,7 @@ def check_unknown( words = wordlst[from_i:to_i] tag = " ".join(words) assert tag + # print(f"{tag=}") if re.match(ignored_unknown_starts_re, tag): # Tags with this start are to be ignored return [(from_i, ["UNKNOWN"], [])] @@ -1010,11 +1004,14 @@ def decode_tags( # I hate Python's *nested* list comprehension syntax ^ or any(s.startswith("error-") for s in topics) ): - # slashes_re contains valid key entries with slashes; we're going to - # skip them by splitting the string and skipping handling every - # second entry, which contains the splitting group like "masculine/ - # feminine" style keys. + new_tagsets: list[tuple[str, ...]] = [] + new_topics: list[str] = [] + if "/" in src: + # slashes_re contains valid key entries with slashes; we're going + # to skip them by splitting the string and skipping handling every + # second entry, which contains the splitting group like "masculine/ + # feminine" style keys. split_parts = re.split(slashes_re, src) new_parts: list[str] = [] if len(split_parts) > 1: @@ -1029,7 +1026,16 @@ def decode_tags( new_tagsets, new_topics = decode_tags1( new_src, allow_any, no_unknown_starts ) + elif " or " in src or " and " in src: + # Annoying kludge. 
+ new_src = src.replace(" and ", " ") + new_src = new_src.replace(" or ", " ") + new_tagsets, new_topics = decode_tags1( + new_src, allow_any, no_unknown_starts + ) + # print(f"{new_tagsets=}") + if new_tagsets or new_topics: old_errors = sum( 1 for tagset in tagsets for s in tagset if s.startswith("error") ) diff --git a/src/wiktextract/tags.py b/src/wiktextract/tags.py index 19a5d80d3..ab35ea9a1 100644 --- a/src/wiktextract/tags.py +++ b/src/wiktextract/tags.py @@ -4508,7 +4508,7 @@ "syncopated": "syncope", "reduplication with syncope": "reduplication syncope", "introducing subjunctive hortative": "subjunctive hortative", - "nominative and vocative plural animate": "nominative vocative", + "nominative and vocative plural animate": "nominative vocative plural animate", "with diaeresis to indicate disyllabilicity": "", "aphaeretic variant": "variant", "mediopassive voice": "mediopassive", diff --git a/tests/test_en_tags.py b/tests/test_en_tags.py index ae03d3737..3bf25b098 100644 --- a/tests/test_en_tags.py +++ b/tests/test_en_tags.py @@ -8,7 +8,6 @@ class EnTagTests(unittest.TestCase): - def test_empty(self): ret = decode_tags("") self.assertEqual(ret, ([()], [])) @@ -67,7 +66,15 @@ def test_tags12(self): def test_tags13(self): ret, topics = decode_tags("class 2a stress pattern xyz") - self.assertEqual(ret, [("class-2a", "error-unknown-tag",)]) + self.assertEqual( + ret, + [ + ( + "class-2a", + "error-unknown-tag", + ) + ], + ) def test_tags14(self): ret, topics = decode_tags("Cockney rhyming slang") @@ -84,28 +91,33 @@ def test_tags16(self): def test_tags17(self): ret, topics = decode_tags("colloquial Cockney Test rhyming slang") - self.assertEqual(ret, [("Cockney", "colloquial", - "error-unknown-tag", "slang")]) + self.assertEqual( + ret, [("Cockney", "colloquial", "error-unknown-tag", "slang")] + ) def test_tags18(self): - ret, topics = decode_tags("colloquial Cockney Test unknown1 " - "rhyming slang") - self.assertEqual(ret, [("Cockney", "colloquial", - 
"error-unknown-tag", "slang")]) + ret, topics = decode_tags( + "colloquial Cockney Test unknown1 " "rhyming slang" + ) + self.assertEqual( + ret, [("Cockney", "colloquial", "error-unknown-tag", "slang")] + ) def test_tags19(self): - ret, topics = decode_tags("colloquial Cockney Test unknown1 " - "rhyming slang", - allow_any=True) - self.assertEqual(ret, [("Cockney", "Test unknown1", "colloquial", - "slang")]) + ret, topics = decode_tags( + "colloquial Cockney Test unknown1 " "rhyming slang", allow_any=True + ) + self.assertEqual( + ret, [("Cockney", "Test unknown1", "colloquial", "slang")] + ) def test_tags20(self): - ret, topics = decode_tags("colloquial Cockney rhyming slang " - "Test unknown1", - allow_any=True) - self.assertEqual(ret, [("Cockney", "Test unknown1", "colloquial", - "slang")]) + ret, topics = decode_tags( + "colloquial Cockney rhyming slang " "Test unknown1", allow_any=True + ) + self.assertEqual( + ret, [("Cockney", "Test unknown1", "colloquial", "slang")] + ) def test_tags21(self): ret, topics = decode_tags("simple past and past participle") @@ -113,17 +125,21 @@ def test_tags21(self): self.assertEqual(ret, [("participle", "past"), ("past",)]) def test_tags22(self): - ret, topics = decode_tags("colloquial Cockney Test, unknown1; " - "rhyming slang", - allow_any=True) - self.assertEqual(ret, [("Cockney", "Test", "colloquial", - "slang", "unknown1")]) + ret, topics = decode_tags( + "colloquial Cockney Test, unknown1; " "rhyming slang", + allow_any=True, + ) + self.assertEqual( + ret, [("Cockney", "Test", "colloquial", "slang", "unknown1")] + ) + def test_tags23(self): - ret, topics = decode_tags("intransitive, in perfect tenses, " - "without predicate") - self.assertEqual(ret, [("in perfect tenses", - "intransitive", - "without predicate")]) + ret, topics = decode_tags( + "intransitive, in perfect tenses, " "without predicate" + ) + self.assertEqual( + ret, [("in perfect tenses", "intransitive", "without predicate")] + ) def test_tags24(self): 
ret, topics = decode_tags("as a modifier in compound words") @@ -187,7 +203,7 @@ def test_tags38(self): def test_tags39(self): ret, topics = decode_tags("with inf., obsolescent") - self.assertEqual(ret, [("obsolete", "possibly","with-infinitive")]) + self.assertEqual(ret, [("obsolete", "possibly", "with-infinitive")]) def test_tags40(self): ret, topics = decode_tags("transitive of people") @@ -198,10 +214,21 @@ def test_tags41(self): self.assertEqual(ret, [("error-unknown-tag", "transitive")]) def test_tags42(self): - ret, topics = decode_tags("first/third-person singular present " - "subjunctive") - self.assertEqual(ret, [("first-person", "present", - "singular", "subjunctive", "third-person")]) + ret, topics = decode_tags( + "first/third-person singular present " "subjunctive" + ) + self.assertEqual( + ret, + [ + ( + "first-person", + "present", + "singular", + "subjunctive", + "third-person", + ) + ], + ) def test_tags43(self): ret, topics = decode_tags("inflection of") @@ -209,26 +236,52 @@ def test_tags43(self): def test_tags44(self): ret, topics = decode_tags("third-person singular present indicative") - self.assertEqual(ret, [("indicative", "present", "singular", - "third-person",)]) + self.assertEqual( + ret, + [ + ( + "indicative", + "present", + "singular", + "third-person", + ) + ], + ) def test_tags45(self): ret, topics = decode_tags("ordinal form of") self.assertEqual(ret, [("form-of", "ordinal")]) def test_tags46(self): - ret, topics = decode_tags("first-person singular (eu) present " - "subjunctive") - self.assertEqual(ret, [("first-person", "present", "singular", - "subjunctive", "with-eu")]) + ret, topics = decode_tags( + "first-person singular (eu) present " "subjunctive" + ) + self.assertEqual( + ret, + [("first-person", "present", "singular", "subjunctive", "with-eu")], + ) def test_tags47(self): - ret, topics = decode_tags("third-person singular (él, ella, also " - "used with usted) present subjunctive " - "form of") - self.assertEqual(ret, 
[("form-of", "present", "singular", "subjunctive", - "third-person", - "with-ella", "with-usted", "with-él")]) + ret, topics = decode_tags( + "third-person singular (él, ella, also " + "used with usted) present subjunctive " + "form of" + ) + self.assertEqual( + ret, + [ + ( + "form-of", + "present", + "singular", + "subjunctive", + "third-person", + "with-ella", + "with-usted", + "with-él", + ) + ], + ) def test_tags48(self): ret, topics = decode_tags("instant messaging") @@ -236,8 +289,13 @@ def test_tags48(self): def test_tags49(self): ret, topics = decode_tags("plural and definite singular attributive") - self.assertEqual(ret, [("attributive", "definite", "singular"), - ("attributive", "plural")]) + self.assertEqual( + ret, + [ + ("attributive", "definite", "singular"), + ("attributive", "plural"), + ], + ) def test_tags50(self): ret, topics = decode_tags("alternative spelling of") @@ -261,8 +319,16 @@ def test_tags54(self): def test_tags55(self): ret, topics = decode_tags("plural and definite singular attributive") - self.assertEqual(ret, [("attributive", "definite", "singular"), - ("attributive", "plural",)]) + self.assertEqual( + ret, + [ + ("attributive", "definite", "singular"), + ( + "attributive", + "plural", + ), + ], + ) def test_tags56(self): ret, topics = decode_tags("comparative") @@ -293,10 +359,13 @@ def test_tags62(self): self.assertEqual(ret, [("definite", "plural", "singular")]) def test_tags63(self): - ret, topics = decode_tags("first-person plural " - "reflexive/dative/accusative form") - self.assertEqual(ret, [("accusative", "dative", "first-person", - "plural", "reflexive")]) + ret, topics = decode_tags( + "first-person plural " "reflexive/dative/accusative form" + ) + self.assertEqual( + ret, + [("accusative", "dative", "first-person", "plural", "reflexive")], + ) self.assertEqual(topics, []) @@ -314,8 +383,17 @@ def test_tags66(self): # during the first run, except for keys with slashes in them. 
ret, topics = decode_tags("nominative/plural masculine/feminine") # -> "nominative plural masculine/feminine" - self.assertEqual(ret, [("feminine", "masculine", - "nominative", "plural",)]) + self.assertEqual( + ret, + [ + ( + "feminine", + "masculine", + "nominative", + "plural", + ) + ], + ) def test_topics1(self): ret, topics = decode_tags("nautical") @@ -323,6 +401,25 @@ def test_topics1(self): def test_topics2(self): ret, topics = decode_tags("ropemaking") - self.assertEqual(topics, ["ropemaking", "crafts", - "nautical", "transport", - "arts", "hobbies", "lifestyle"]) + self.assertEqual( + topics, + [ + "ropemaking", + "crafts", + "nautical", + "transport", + "arts", + "hobbies", + "lifestyle", + ], + ) + + def test_and(self): + ret, topics = decode_tags("nominative and accusative") + self.assertEqual( + ret, + [( + "accusative", + "nominative", + )], + )