From 5e73dc1d2798ad35edeb2de92b042bd12ff64d6c Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 26 Jul 2020 18:35:17 +0900 Subject: [PATCH] Add ensure_ascii option This option lets you configure what to do with unknown characters. They can be converted to ? (the default) or passed through. This also adds tests, and fixes a bug where slug tests were not run. --- cutlet/cutlet.py | 11 ++++++++--- cutlet/test/test_basic.py | 11 +++++++---- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/cutlet/cutlet.py b/cutlet/cutlet.py index 0783687..ab48c38 100644 --- a/cutlet/cutlet.py +++ b/cutlet/cutlet.py @@ -85,6 +85,7 @@ def __init__(self, system='hepburn'): self.use_wo = (self.system in ('hepburn', 'nihon')) self.use_foreign_spelling = True + self.ensure_ascii = True def add_exception(self, key, val): self.exceptions[key] = val @@ -189,11 +190,15 @@ def romaji_word(self, word): if word.char_type == 6 or word.char_type == 7: # hiragana/katakana kana = jaconv.kata2hira(word.surface) return self.map_kana(kana) - elif word.char_type == 2: # kanji we don't know, like 彁 + + # At this point this is an unknown word and not kana. Could be + # unknown kanji, could be hangul, cyrillic, something else. + # By default ensure ascii by replacing with ?, but allow pass-through. + if self.ensure_ascii: out = '?' * len(word.surface) return out - # At this point it could be hangul or cyrillic or something - return word.surface + else: + return word.surface if word.feature.pos1 == '補助記号': # If it's punctuation we don't recognize, just discard it diff --git a/cutlet/test/test_basic.py b/cutlet/test/test_basic.py index 5d4da5f..d351da6 100644 --- a/cutlet/test/test_basic.py +++ b/cutlet/test/test_basic.py @@ -62,8 +62,8 @@ ("ケメコデラックス", "Kemekoderakkusu"), ("プププランド", "Pupupurando"), # Add some non-Japanese tests - ("панда", "Панда"), - ("팬더", "팬더"), + ("панда", "?????"), + ("팬더", "??"), ("「彁」は幽霊文字のひとつ", '"?" wa yuurei moji no hitotsu'), ] @@ -78,10 +78,13 @@ "kagawa-game-jourei-pabukome-wa-sansei-tasuu-dakara-saiketsu-shite-wa-to-hatsugen-shita-no-wa-dare-datta-no-ka"), ("コトヤマ「よふかしのうた」3巻発売記念のPV公開、期間限定で1巻の無料配信も", "koto-yama-yo-fukashi-no-uta-3-kan-hatsubai-kinen-no-p-v-koukai-kikan-gentei-de-1-kan-no-muryou-haishin-mo"), + # Include some unks + ("彁は幽霊文字", "wa-yuurei-moji"), + ("パンダはロシア語でпанда", "panda-wa-rossiya-go-de"), ] NON_FOREIGN = [ - ("カツカレーは美味しい", "Katsu karee wa oishii") + ("カツカレーは美味しい", "Katsu karee wa oishii"), ] @pytest.mark.parametrize('ja, roma', WORDS) @@ -112,7 +115,7 @@ def test_romaji_slugs(ja, roma): assert cut.slug(ja) == roma @pytest.mark.parametrize('ja, roma', NON_FOREIGN) -def test_romaji_slugs(ja, roma): +def test_romaji_non_foreign(ja, roma): cut = Cutlet() cut.use_foreign_spelling = False assert cut.romaji(ja) == roma