From 1b4515743dc0342a88f24a56ba0c69624c6f50cc Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 15 Apr 2024 17:22:13 +0900 Subject: [PATCH] Add flag for tokens based on foreign lemmas (#39) * Remove isascii compat for <=3.7 3.7 is EOL so support will be removed in the next version. * Add flag for whether word comes from foreign lemma * Formatting * Update Streamlit links, docs * Make the CLI compatible with Windows OS (#43) * Make compatible with Windows OS * Upload wheels * chore(actions): upgrade GitHub actions (#49) --------- Co-authored-by: Hizuru <106918920+Hizuru3@users.noreply.github.com> Co-authored-by: Stevie Gayet <87695919+stegayet@users.noreply.github.com> --- cutlet/cutlet.py | 5 +- cutlet/test/test_basic.py | 298 ++++++++++++++++++++------------------ 2 files changed, 165 insertions(+), 138 deletions(-) diff --git a/cutlet/cutlet.py b/cutlet/cutlet.py index 7342f31..c6b6410 100644 --- a/cutlet/cutlet.py +++ b/cutlet/cutlet.py @@ -87,6 +87,8 @@ def load_exceptions(): class Token: surface: str space: bool # if a space should follow + # whether this comes from a foreign lemma + foreign: bool = False def __str__(self): sp = " " if self.space else "" @@ -229,7 +231,8 @@ def romaji_tokens(self, words, capitalize=True, title=False): not (pw and pw.feature.pos1 == '接頭辞')): roma = roma.title() - tok = Token(roma, False) + foreign = self.use_foreign_spelling and has_foreign_lemma(word) + tok = Token(roma, False, foreign) # handle punctuation with atypical spacing if word.surface in '「『': if po: diff --git a/cutlet/test/test_basic.py b/cutlet/test/test_basic.py index 2727929..b3d4678 100644 --- a/cutlet/test/test_basic.py +++ b/cutlet/test/test_basic.py @@ -4,204 +4,219 @@ # Note that if there are multiple words, only the first is used WORDS = [ - ('新橋', 'shinbashi'), - ('学校', 'gakkou'), - ('パンダ', 'panda'), - # without curry, カツ is registered as 人名 (?) - ('カツカレー', 'cutlet'), - ('カレー', 'curry'), - ('繊維', "sen'i"), - ('専用', "sen'you"), - ('抹茶', 'matcha'), - ('重量', 'juuryou'), - ('ポール', 'Paul'), - ('ジル', 'jiru'), # test of ジル-外国 style lemmas - ('1', '1'), - ] + ("新橋", "shinbashi"), + ("学校", "gakkou"), + ("パンダ", "panda"), + # without curry, カツ is registered as 人名 (?) + ("カツカレー", "cutlet"), + ("カレー", "curry"), + ("繊維", "sen'i"), + ("専用", "sen'you"), + ("抹茶", "matcha"), + ("重量", "juuryou"), + ("ポール", "Paul"), + ("ジル", "jiru"), # test of ジル-外国 style lemmas + ("1", "1"), +] WORDS_KUNREI = [ - ('新橋', 'sinbasi'), - ('学校', 'gakkou'), - ('パンダ', 'panda'), - # without curry, カツ is registered as 人名 - ('カツカレー', 'cutlet'), - ('カレー', 'curry'), - ('繊維', "sen'i"), - ('専用', "sen'you"), - ('抹茶', 'mattya'), - ('重量', 'zyuuryou'), - ('ポール', 'Paul'), - ('1', '1'), - ] + ("新橋", "sinbasi"), + ("学校", "gakkou"), + ("パンダ", "panda"), + # without curry, カツ is registered as 人名 + ("カツカレー", "cutlet"), + ("カレー", "curry"), + ("繊維", "sen'i"), + ("専用", "sen'you"), + ("抹茶", "mattya"), + ("重量", "zyuuryou"), + ("ポール", "Paul"), + ("1", "1"), +] SENTENCES = [ - ("あっ", "A"), - ("括弧は「こう」でなくちゃ", "Kakko wa \"kou\" de nakucha"), - ("富士見坂", "Fujimi saka"), - ("本を読みました。", "Hon wo yomimashita."), - ("新橋行きの電車に乗った。", "Shinbashiiki no densha ni notta."), - ("カツカレーは美味しい", "Cutlet curry wa oishii"), - ("酵素とは、生体で起こる化学反応に対して触媒として機能する分子である。", - "Kouso to wa, seitai de okoru kagaku hannou ni taishite shokubai to shite kinou suru bunshi de aru."), - ("ホッピーは元祖ビアテイスト清涼飲料水です", - "Hoppy wa ganso beer taste seiryou inryousui desu"), - ("東京タワーの高さは333mです", - "Tokyo tower no takasa wa 333 m desu"), - ("国立国語研究所(NINJAL)は,日本語学・言語学・日本語教育研究を中心とした研究機関です。", - "Kokuritsu kokugo kenkyuusho (NINJAL) wa, Nippon gogaku/gengogaku/Nippon go kyouiku kenkyuu wo chuushin to shita kenkyuu kikan desu."), - ("やっちゃった!", "Yacchatta!"), - ("暖かかった", "Atatakakatta"), - ("私はテストです", "Watakushi wa test desu"), # issue #4, 私 -> 代名詞 - ("《月》", "(gatsu)"), # issue #7, unfamiliar punctuation - ("2 【電子版特典付】", "2 [denshi ban tokutentsuke]"), # issue #7 - # This looks weird but MeCab tokenizes at alpha-num barriers - ("cutlet23", "Cutlet 23"), - # Test some kana unks - issue #8 - ("アマガミ Sincerely Your S シンシアリーユアーズ", - "Amagami Sincerely Your S shinshiariiyuaazu"), - ("ケメコデラックス", "Kemekoderakkusu"), - ("プププランド", "Pupupurando"), - # Add some non-Japanese tests - ("панда", "?????"), - ("팬더", "??"), - ("「彁」は幽霊文字のひとつ", '"?" wa yuurei moji no hitotsu'), - # Do half-width katakana - ("ポール", "Paul"), - ("ウスイホン", "Usuihon"), - # Test adjective + desu - ("赤いです", "Akai desu"), - ("美味しいです", "Oishii desu"), - # Test repeated 長音符 - ("プラトーーーン", "Puratoo--n"), - # Ainu kana - ("イタㇰ", "Itak"), - # Whatever this is - ("エィッリィククワッドゥロッウ", "Irrikukuwadduro-u"), - # small か - ("夕陽ヵ丘三号館", "Yuuhi kakyuu san goukan"), - # deal with combining dakuten - ("青い春よさらば!", "Aoi haru yo saraba!"), - # small ケ - ("齋藤タヶオ", "Saitou ta ke o"), - # っー - ("ずっーと", "Zu--to"), - - # don't add spaces around apostrophe if it wasn't there - ("McDonald's", "McDonald's"), - ("Text McDonald's text", "Text McDonald's text"), - - # Following are quote weirdness. Not good but hard to fix. - # An issue is that ," or .' is a single token. - ("It's 'delicious.'", "It's ' delicious .'"), - ('"Hello," he said.', '" Hello ," he said.'), - - # this is a very strange typo - ("アトランテッィク", "Atoranteku"), - - # odoriji. Note at this point these rarely work properly, they mainly - # don't blow up. - ('くゞる', 'Kuguru'), # note this is actually in unidic-lite - ('くヽる', 'Ku ru'), - ('今度クヾペへ行こう', 'Kondo kugupe e ikou'), # made up word - ('彁々', '?'), - ] + ("あっ", "A"), + ("括弧は「こう」でなくちゃ", 'Kakko wa "kou" de nakucha'), + ("富士見坂", "Fujimi saka"), + ("本を読みました。", "Hon wo yomimashita."), + ("新橋行きの電車に乗った。", "Shinbashiiki no densha ni notta."), + ("カツカレーは美味しい", "Cutlet curry wa oishii"), + ( + "酵素とは、生体で起こる化学反応に対して触媒として機能する分子である。", + "Kouso to wa, seitai de okoru kagaku hannou ni taishite shokubai to shite kinou suru bunshi de aru.", + ), + ("ホッピーは元祖ビアテイスト清涼飲料水です", "Hoppy wa ganso beer taste seiryou inryousui desu"), + ("東京タワーの高さは333mです", "Tokyo tower no takasa wa 333 m desu"), + ( + "国立国語研究所(NINJAL)は,日本語学・言語学・日本語教育研究を中心とした研究機関です。", + "Kokuritsu kokugo kenkyuusho (NINJAL) wa, Nippon gogaku/gengogaku/Nippon go kyouiku kenkyuu wo chuushin to shita kenkyuu kikan desu.", + ), + ("やっちゃった!", "Yacchatta!"), + ("暖かかった", "Atatakakatta"), + ("私はテストです", "Watakushi wa test desu"), # issue #4, 私 -> 代名詞 + ("《月》", "(gatsu)"), # issue #7, unfamiliar punctuation + ("2 【電子版特典付】", "2 [denshi ban tokutentsuke]"), # issue #7 + # This looks weird but MeCab tokenizes at alpha-num barriers + ("cutlet23", "Cutlet 23"), + # Test some kana unks - issue #8 + ("アマガミ Sincerely Your S シンシアリーユアーズ", "Amagami Sincerely Your S shinshiariiyuaazu"), + ("ケメコデラックス", "Kemekoderakkusu"), + ("プププランド", "Pupupurando"), + # Add some non-Japanese tests + ("панда", "?????"), + ("팬더", "??"), + ("「彁」は幽霊文字のひとつ", '"?" wa yuurei moji no hitotsu'), + # Do half-width katakana + ("ポール", "Paul"), + ("ウスイホン", "Usuihon"), + # Test adjective + desu + ("赤いです", "Akai desu"), + ("美味しいです", "Oishii desu"), + # Test repeated 長音符 + ("プラトーーーン", "Puratoo--n"), + # Ainu kana + ("イタㇰ", "Itak"), + # Whatever this is + ("エィッリィククワッドゥロッウ", "Irrikukuwadduro-u"), + # small か + ("夕陽ヵ丘三号館", "Yuuhi kakyuu san goukan"), + # deal with combining dakuten + ("青い春よさらば!", "Aoi haru yo saraba!"), + # small ケ + ("齋藤タヶオ", "Saitou ta ke o"), + # っー + ("ずっーと", "Zu--to"), + # don't add spaces around apostrophe if it wasn't there + ("McDonald's", "McDonald's"), + ("Text McDonald's text", "Text McDonald's text"), + # Following are quote weirdness. Not good but hard to fix. + # An issue is that ," or .' is a single token. + ("It's 'delicious.'", "It's ' delicious .'"), + ('"Hello," he said.', '" Hello ," he said.'), + # this is a very strange typo + ("アトランテッィク", "Atoranteku"), + # odoriji. Note at this point these rarely work properly, they mainly + # don't blow up. + ("くゞる", "Kuguru"), # note this is actually in unidic-lite + ("くヽる", "Ku ru"), + ("今度クヾペへ行こう", "Kondo kugupe e ikou"), # made up word + ("彁々", "?"), +] SENTENCES_KUNREI = [ - ("富士見坂", "Huzimi saka"), - ] + ("富士見坂", "Huzimi saka"), +] SLUGS = [ - ("東京タワーの高さは?", "tokyo-tower-no-takasa-wa"), - ("ゲームマーケットとは", "game-market-to-wa"), - ("香川ゲーム条例、「(パブコメは)賛成多数だから採決しては」と発言したのは誰だったのか", - "kagawa-game-jourei-pabukome-wa-sansei-tasuu-dakara-saiketsu-shite-wa-to-hatsugen-shita-no-wa-dare-datta-no-ka"), - ("コトヤマ「よふかしのうた」3巻発売記念のPV公開、期間限定で1巻の無料配信も", - "koto-yama-yo-fukashi-no-uta-3-maki-hatsubai-kinen-no-pv-koukai-kikan-gentei-de-1-maki-no-muryou-haishin-mo"), - # Include some unks - ("彁は幽霊文字", "wa-yuurei-moji"), - ("パンダはロシア語でпанда", "panda-wa-rossiya-go-de"), - ] + ("東京タワーの高さは?", "tokyo-tower-no-takasa-wa"), + ("ゲームマーケットとは", "game-market-to-wa"), + ( + "香川ゲーム条例、「(パブコメは)賛成多数だから採決しては」と発言したのは誰だったのか", + "kagawa-game-jourei-pabukome-wa-sansei-tasuu-dakara-saiketsu-shite-wa-to-hatsugen-shita-no-wa-dare-datta-no-ka", + ), + ( + "コトヤマ「よふかしのうた」3巻発売記念のPV公開、期間限定で1巻の無料配信も", + "koto-yama-yo-fukashi-no-uta-3-maki-hatsubai-kinen-no-pv-koukai-kikan-gentei-de-1-maki-no-muryou-haishin-mo", + ), + # Include some unks + ("彁は幽霊文字", "wa-yuurei-moji"), + ("パンダはロシア語でпанда", "panda-wa-rossiya-go-de"), +] NON_FOREIGN = [ - ("カツカレーは美味しい", "Katsu karee wa oishii"), - ] + ("カツカレーは美味しい", "Katsu karee wa oishii"), +] TITLE = [ - ('吾輩は猫である', 'Wagahai wa Neko de Aru'), - ('お話があります', 'Ohanashi ga Arimasu'), - ('図書館戦争', 'Toshokan Sensou'), - ('頑張って', 'Ganbatte'), - ('さらば愛しき人よ', 'Saraba Itoshiki Hito yo'), - ('愛せよ乙女', 'Aiseyo Otome'), - ('巴里は燃えているか', 'Paris wa Moete Iru ka'), - ] + ("吾輩は猫である", "Wagahai wa Neko de Aru"), + ("お話があります", "Ohanashi ga Arimasu"), + ("図書館戦争", "Toshokan Sensou"), + ("頑張って", "Ganbatte"), + ("さらば愛しき人よ", "Saraba Itoshiki Hito yo"), + ("愛せよ乙女", "Aiseyo Otome"), + ("巴里は燃えているか", "Paris wa Moete Iru ka"), +] + +FOREIGN_TOKEN_FEATURE = [ + ("カツカレーは美味しい", [True, True, False, False]), +] import pathlib + here = pathlib.Path(__file__).parent.absolute() import json -with open(here / 'blns.json') as infile: + +with open(here / "blns.json") as infile: NAUGHTY = json.load(infile) -@pytest.mark.parametrize('ja, roma', WORDS) + +@pytest.mark.parametrize("ja, roma", WORDS) def test_words(ja, roma): cut = Cutlet() word = cut.tagger.parseToNodeList(ja)[0] assert cut.romaji_word(word) == roma -@pytest.mark.parametrize('ja, roma', WORDS_KUNREI) + +@pytest.mark.parametrize("ja, roma", WORDS_KUNREI) def test_words_kunrei(ja, roma): - cut = Cutlet('kunrei') + cut = Cutlet("kunrei") word = cut.tagger.parseToNodeList(ja)[0] assert cut.romaji_word(word) == roma -@pytest.mark.parametrize('ja, roma', SENTENCES) + +@pytest.mark.parametrize("ja, roma", SENTENCES) def test_romaji(ja, roma): cut = Cutlet() assert cut.romaji(ja) == roma -@pytest.mark.parametrize('ja, roma', SENTENCES_KUNREI) + +@pytest.mark.parametrize("ja, roma", SENTENCES_KUNREI) def test_romaji_kunrei(ja, roma): - cut = Cutlet('kunrei') + cut = Cutlet("kunrei") assert cut.romaji(ja) == roma -@pytest.mark.parametrize('ja, roma', SLUGS) + +@pytest.mark.parametrize("ja, roma", SLUGS) def test_romaji_slugs(ja, roma): cut = Cutlet() assert cut.slug(ja) == roma -@pytest.mark.parametrize('ja, roma', NON_FOREIGN) + +@pytest.mark.parametrize("ja, roma", NON_FOREIGN) def test_romaji_non_foreign(ja, roma): cut = Cutlet() cut.use_foreign_spelling = False assert cut.romaji(ja) == roma -@pytest.mark.parametrize('ja, roma', TITLE) + +@pytest.mark.parametrize("ja, roma", TITLE) def test_romaji_title(ja, roma): cut = Cutlet() assert cut.romaji(ja, title=True) == roma -@pytest.mark.parametrize('ja, roma', [ - (None, ''), - ('', '') -]) + +@pytest.mark.parametrize("ja, roma", [(None, ""), ("", "")]) def test_empty_string(ja, roma): cut = Cutlet() assert cut.romaji(ja) == roma -@pytest.mark.parametrize('text', NAUGHTY) + +@pytest.mark.parametrize("text", NAUGHTY) def test_naughty(text): # Goal here is just to not have an exception cut = Cutlet() cut.romaji(text) + def test_update_mapping(): cut = Cutlet() assert cut.romaji("お茶漬け") == "Ochazuke" cut.update_mapping("づ", "du") assert cut.romaji("お茶漬け") == "Ochaduke" -@pytest.mark.parametrize('text, roma', SENTENCES) + +@pytest.mark.parametrize("text, roma", SENTENCES) def test_romaji_tokens(text, roma): cut = Cutlet() toks = cut.tagger(normalize_text(text)) @@ -209,11 +224,20 @@ def test_romaji_tokens(text, roma): assert len(toks) == len(res), "Output length doesn't match input length" - rendered = '' + rendered = "" for tt in res: rendered += tt.surface if tt.space: - rendered += ' ' + rendered += " " rendered = rendered.strip() assert rendered == cut.romaji(text), "Token input diverged" + + +@pytest.mark.parametrize("text, is_foreign", FOREIGN_TOKEN_FEATURE) +def test_foreign_token_feature(text, is_foreign): + cut = Cutlet() + toks = cut.tagger(normalize_text(text)) + res = cut.romaji_tokens(toks) + for tok, gold in zip(res, is_foreign): + assert tok.foreign == gold, "Token's `foreign` feature is wrong"