From 5e73dc1d2798ad35edeb2de92b042bd12ff64d6c Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Sun, 26 Jul 2020 18:35:17 +0900
Subject: [PATCH] Add ensure_ascii option

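This option lets you configure what happens to unknown words that are not
kana (unknown kanji, hangul, Cyrillic, and so on). With ensure_ascii set to
True (the default) each character is replaced with ?; with it set to False
the original text is passed through unchanged.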

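This also adds tests, and fixes a bug where the slug tests were not run: a
later test function reused the name test_romaji_slugs and shadowed them, so
it is renamed to test_romaji_non_foreign.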
---
 cutlet/cutlet.py          | 11 ++++++++---
 cutlet/test/test_basic.py | 11 +++++++----
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/cutlet/cutlet.py b/cutlet/cutlet.py
index 0783687..ab48c38 100644
--- a/cutlet/cutlet.py
+++ b/cutlet/cutlet.py
@@ -85,6 +85,7 @@ def __init__(self, system='hepburn'):
         self.use_wo  = (self.system in ('hepburn', 'nihon'))
 
         self.use_foreign_spelling = True
+        self.ensure_ascii = True
 
     def add_exception(self, key, val):
         self.exceptions[key] = val
@@ -189,11 +190,15 @@ def romaji_word(self, word):
             if word.char_type == 6 or word.char_type == 7: # hiragana/katakana
                 kana = jaconv.kata2hira(word.surface)
                 return self.map_kana(kana)
-            elif word.char_type == 2: # kanji we don't know, like 彁
+
+            # At this point this is an unknown word and not kana. Could be
+            # unknown kanji, could be hangul, cyrillic, something else.
+            # By default ensure ascii by replacing with ?, but allow pass-through.
+            if self.ensure_ascii:
                 out = '?' * len(word.surface)
                 return out
-            # At this point it could be hangul or cyrillic or something
-            return word.surface
+            else:
+                return word.surface
 
         if word.feature.pos1 == '補助記号':
             # If it's punctuation we don't recognize, just discard it
diff --git a/cutlet/test/test_basic.py b/cutlet/test/test_basic.py
index 5d4da5f..d351da6 100644
--- a/cutlet/test/test_basic.py
+++ b/cutlet/test/test_basic.py
@@ -62,8 +62,8 @@
         ("ケメコデラックス", "Kemekoderakkusu"),
         ("プププランド", "Pupupurando"),
         # Add some non-Japanese tests
-        ("панда", "Панда"),
-        ("팬더", "팬더"),
+        ("панда", "?????"),
+        ("팬더", "??"),
         ("「彁」は幽霊文字のひとつ", '"?" wa yuurei moji no hitotsu'),
         ]
 
@@ -78,10 +78,13 @@
             "kagawa-game-jourei-pabukome-wa-sansei-tasuu-dakara-saiketsu-shite-wa-to-hatsugen-shita-no-wa-dare-datta-no-ka"),
         ("コトヤマ「よふかしのうた」3巻発売記念のPV公開、期間限定で1巻の無料配信も", 
             "koto-yama-yo-fukashi-no-uta-3-kan-hatsubai-kinen-no-p-v-koukai-kikan-gentei-de-1-kan-no-muryou-haishin-mo"),
+        # Include some unks
+        ("彁は幽霊文字", "wa-yuurei-moji"),
+        ("パンダはロシア語でпанда", "panda-wa-rossiya-go-de"),
         ]
 
 NON_FOREIGN = [
-        ("カツカレーは美味しい", "Katsu karee wa oishii")
+        ("カツカレーは美味しい", "Katsu karee wa oishii"),
         ]
 
 @pytest.mark.parametrize('ja, roma', WORDS)
@@ -112,7 +115,7 @@ def test_romaji_slugs(ja, roma):
     assert cut.slug(ja) == roma
 
 @pytest.mark.parametrize('ja, roma', NON_FOREIGN)
-def test_romaji_slugs(ja, roma):
+def test_romaji_non_foreign(ja, roma):
     cut = Cutlet()
     cut.use_foreign_spelling = False
     assert cut.romaji(ja) == roma