Skip to content

Commit

Permalink
Add ensure_ascii option
Browse files Browse the repository at this point in the history
This option lets you configure what to do with characters in unknown,
non-kana words (unknown kanji, hangul, Cyrillic, and so on). Each such
character can be replaced with ? (the default) or passed through unchanged.

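This also adds tests, and fixes a bug where the slug tests were not run: a later test function was also named test_romaji_slugs and replaced the first definition, so it is renamed to test_romaji_non_foreign.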
  • Loading branch information
polm committed Jul 26, 2020
1 parent 90830f7 commit 5e73dc1
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 7 deletions.
11 changes: 8 additions & 3 deletions cutlet/cutlet.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def __init__(self, system='hepburn'):
self.use_wo = (self.system in ('hepburn', 'nihon'))

self.use_foreign_spelling = True
self.ensure_ascii = True

def add_exception(self, key, val):
self.exceptions[key] = val
Expand Down Expand Up @@ -189,11 +190,15 @@ def romaji_word(self, word):
if word.char_type == 6 or word.char_type == 7: # hiragana/katakana
kana = jaconv.kata2hira(word.surface)
return self.map_kana(kana)
elif word.char_type == 2: # kanji we don't know, like 彁

# At this point the word is unknown and is not kana. It could be an
# unknown kanji, hangul, Cyrillic, or something else.
# By default (ensure_ascii is True) each character is replaced with ? so
# the output stays ASCII; otherwise the surface form is passed through.
if self.ensure_ascii:
out = '?' * len(word.surface)
return out
# At this point it could be hangul or cyrillic or something
return word.surface
else:
return word.surface

if word.feature.pos1 == '補助記号':
# If it's punctuation we don't recognize, just discard it
Expand Down
11 changes: 7 additions & 4 deletions cutlet/test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@
("ケメコデラックス", "Kemekoderakkusu"),
("プププランド", "Pupupurando"),
# Add some non-Japanese tests
("панда", "Панда"),
("팬더", "팬더"),
("панда", "?????"),
("팬더", "??"),
("「彁」は幽霊文字のひとつ", '"?" wa yuurei moji no hitotsu'),
]

Expand All @@ -78,10 +78,13 @@
"kagawa-game-jourei-pabukome-wa-sansei-tasuu-dakara-saiketsu-shite-wa-to-hatsugen-shita-no-wa-dare-datta-no-ka"),
("コトヤマ「よふかしのうた」3巻発売記念のPV公開、期間限定で1巻の無料配信も",
"koto-yama-yo-fukashi-no-uta-3-kan-hatsubai-kinen-no-p-v-koukai-kikan-gentei-de-1-kan-no-muryou-haishin-mo"),
# Include some unks
("彁は幽霊文字", "wa-yuurei-moji"),
("パンダはロシア語でпанда", "panda-wa-rossiya-go-de"),
]

NON_FOREIGN = [
("カツカレーは美味しい", "Katsu karee wa oishii")
("カツカレーは美味しい", "Katsu karee wa oishii"),
]

@pytest.mark.parametrize('ja, roma', WORDS)
Expand Down Expand Up @@ -112,7 +115,7 @@ def test_romaji_slugs(ja, roma):
assert cut.slug(ja) == roma

@pytest.mark.parametrize('ja, roma', NON_FOREIGN)
def test_romaji_slugs(ja, roma):
def test_romaji_non_foreign(ja, roma):
cut = Cutlet()
cut.use_foreign_spelling = False
assert cut.romaji(ja) == roma
Expand Down

0 comments on commit 5e73dc1

Please sign in to comment.