Better handling of spaces in ASCII text (fixes #65) #66

Merged · 4 commits · Jan 4, 2025
7 changes: 7 additions & 0 deletions cutlet/cutlet.py
@@ -248,6 +248,13 @@ def romaji_tokens(self, words, capitalize=True, title=False):
                 out.append(tok)
                 continue

+            # preserve spaces between ascii tokens
+            if (word.surface.isascii() and
+                    nw and nw.surface.isascii()):
+                use_space = bool(nw.white_space)
+                out.append(Token(word.surface, use_space))
+                continue
+
             out.append(tok)

             # no space sometimes
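For context, here is a minimal usage sketch of the behavior the added branch is meant to produce, using the `cutlet.Cutlet()` API from the project README with its default settings; the expected outputs in the comments are taken from the test cases added below in this PR, and the variable name `katsu` is just illustrative:

```python
import cutlet

katsu = cutlet.Cutlet()  # default (hepburn) romanizer

# ASCII runs keep their original spacing instead of gaining or losing spaces.
print(katsu.romaji("[04:30.748]"))    # expected: "[04:30.748]"
print(katsu.romaji(".big,bad bog"))   # expected: ".big,bad bog"
print(katsu.romaji("cutlet23"))       # expected: "Cutlet23"
print(katsu.romaji("東京タワーの高さは333mです"))  # expected: "Tokyo tower no takasa wa 333m desu"
```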
18 changes: 9 additions & 9 deletions cutlet/test/test_basic.py
@@ -46,7 +46,7 @@
         "Kouso to wa, seitai de okoru kagaku hannou ni taishite shokubai to shite kinou suru bunshi de aru.",
     ),
     ("ホッピーは元祖ビアテイスト清涼飲料水です", "Hoppy wa ganso beer taste seiryou inryousui desu"),
-    ("東京タワーの高さは333mです", "Tokyo tower no takasa wa 333 m desu"),
+    ("東京タワーの高さは333mです", "Tokyo tower no takasa wa 333m desu"),
     (
         "国立国語研究所(NINJAL)は,日本語学・言語学・日本語教育研究を中心とした研究機関です。",
         "Kokuritsu kokugo kenkyuusho (NINJAL) wa, Nippon gogaku/gengogaku/Nippon go kyouiku kenkyuu wo chuushin to shita kenkyuu kikan desu.",
@@ -56,8 +56,7 @@
     ("私はテストです", "Watakushi wa test desu"), # issue #4, 私 -> 代名詞
     ("《月》", "(gatsu)"), # issue #7, unfamiliar punctuation
     ("2 【電子版特典付】", "2 [denshi ban tokutentsuke]"), # issue #7
-    # This looks weird but MeCab tokenizes at alpha-num barriers
-    ("cutlet23", "Cutlet 23"),
+    ("cutlet23", "Cutlet23"),
     # Test some kana unks - issue #8
     ("アマガミ Sincerely Your S シンシアリーユアーズ", "Amagami Sincerely Your S shinshiariiyuaazu"),
     ("ケメコデラックス", "Kemekoderakkusu"),
@@ -89,21 +88,22 @@
     # don't add spaces around apostrophe if it wasn't there
     ("McDonald's", "McDonald's"),
     ("Text McDonald's text", "Text McDonald's text"),
-    # Following are quote weirdness. Not good but hard to fix.
-    # An issue is that ," or .' is a single token.
-    ("It's 'delicious.'", "It's ' delicious .'"),
-    ('"Hello," he said.', '" Hello ," he said.'),
+    ("It's 'delicious.'", "It's 'delicious.'"),
+    ('"Hello," he said.', '"Hello," he said.'),
     # this is a very strange typo
     ("アトランテッィク", "Atoranteku"),
-    # odoriji. Note at this point these rarely work properly, they mainly
-    # don't blow up.
+    # odoriji. Note at this point these rarely work properly, these mainly test
+    # that they don't blow up.
     ("くゞる", "Kuguru"), # note this is actually in unidic-lite
     ("くヽる", "Ku ru"),
     ("今度クヾペへ行こう", "Kondo kugupe e ikou"), # made up word
     ("彁々", "?"),
     # prefixes, see #56
     ("ビオハザード", "Bio-hazard"),
     ("イントラワード", "Intra-word"),
+    # ascii whitespace, see #65
+    ("[04:30.748]", "[04:30.748]"),
+    (".big,bad bog", ".big,bad bog"),
 ]

 SENTENCES_KUNREI = [
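The (Japanese, expected romaji) pairs above are table-driven test data. Below is a hedged sketch of how such pairs are typically consumed with pytest's parametrize; the test name, parameter names, and case subset are illustrative and may not match the actual functions in cutlet's test_basic.py:

```python
import cutlet
import pytest

# Illustrative subset of the pairs added in this PR.
ASCII_SPACING_CASES = [
    ("[04:30.748]", "[04:30.748]"),
    (".big,bad bog", ".big,bad bog"),
]

@pytest.mark.parametrize("ja, expected", ASCII_SPACING_CASES)
def test_ascii_spacing(ja, expected):
    # Hypothetical test; uses the default Cutlet configuration.
    katsu = cutlet.Cutlet()
    assert katsu.romaji(ja) == expected
```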