From d962cda7c25ba03ca0824c00344d42d247ec8268 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Wed, 20 Oct 2021 23:15:09 +0900
Subject: [PATCH] Deal with odoriji

This doesn't handle them all that well but it shouldn't blow up any more
---
 cutlet/cutlet.py          | 23 +++++++++++++++++++++++
 cutlet/mapping.py         | 20 ++++++++++++++++++++
 cutlet/test/test_basic.py | 12 ++++++++++++
 3 files changed, 55 insertions(+)

diff --git a/cutlet/cutlet.py b/cutlet/cutlet.py
index bf3eb14..7085d3c 100644
--- a/cutlet/cutlet.py
+++ b/cutlet/cutlet.py
@@ -10,6 +10,7 @@
 
 SUTEGANA = 'ゃゅょぁぃぅぇぉ'
 PUNCT = '\'".!?(),;:-'
+ODORI = '々〃ゝゞヽヾ'
 
 SYSTEMS = {
         'hepburn': HEPBURN,
@@ -255,6 +256,27 @@ def map_kana(self, kana):
         return out
 
     def get_single_mapping(self, pk, kk, nk):
+        # Handle odoriji (iteration marks).
+        # NOTE: This is very rarely useful at present because odoriji are not
+        # left in readings for dictionary words, and we can't follow kana
+        # across word boundaries.
+        if kk in ODORI:
+            if kk in 'ゝヽ':  # plain repeat of the previous kana
+                # Emit the romaji of the previous kana, not the kana itself.
+                # A missing or unmapped predecessor is invalid input; be nice
+                # and emit nothing.
+                if pk and pk in self.table: return self.table[pk]
+                else: return ''
+            if kk in 'ゞヾ':  # repeat with voicing
+                if not pk: return ''
+                vv = add_dakuten(pk)
+                if vv and vv in self.table: return self.table[vv]
+                else: return ''
+            # remaining are 々 for kanji and 〃 for symbols, but we can't
+            # infer their span reliably (or handle rendaku)
+            return ''
+
+
         # handle digraphs
         if pk and (pk + kk) in self.table:
             return self.table[pk + kk]
@@ -262,6 +284,7 @@ def get_single_mapping(self, pk, kk, nk):
             return ''
 
         if nk and nk in SUTEGANA:
+            if kk == 'っ': return '' # never valid, just ignore
             return self.table[kk][:-1] + self.table[nk]
         if kk in SUTEGANA:
             return ''
diff --git a/cutlet/mapping.py b/cutlet/mapping.py
index 3bd91ba..a4b706e 100644
--- a/cutlet/mapping.py
+++ b/cutlet/mapping.py
@@ -188,3 +188,23 @@
 NIHONSHIKI['ぢ'] = 'di'
 NIHONSHIKI['づ'] = 'du'
 
+# Kana that can take a dakuten, index-aligned with their voiced forms.
+UNVOICED = 'かきくけこさしすせそたちつてとはひふへほ'
+VOICED = 'がぎぐげござじずぜぞだぢづでどばびぶべぼ'
+
+
+def add_dakuten(kk):
+    """Return the voiced (dakuten) form of a single hiragana.
+
+    Returns None when `kk` is not exactly one kana from UNVOICED, so
+    callers can treat nonsense input as "no voiced form".
+    """
+    # str.index raises ValueError on a miss (it never returns None), and
+    # both index and find report position 0 for the empty string, so test
+    # the input explicitly before looking it up.
+    if not kk or len(kk) != 1:
+        return None
+    ii = UNVOICED.find(kk)
+    if ii < 0:
+        return None
+    return VOICED[ii]
diff --git a/cutlet/test/test_basic.py b/cutlet/test/test_basic.py
index d77d1f6..e16460d 100644
--- a/cutlet/test/test_basic.py
+++ b/cutlet/test/test_basic.py
@@ -85,13 +85,25 @@
     ("齋藤タヶオ", "Saitou ta ke o"),
     # っー
     ("ずっーと", "Zu--to"),
+
     # don't add spaces around apostrophe if it wasn't there
     ("McDonald's", "McDonald's"),
     ("Text McDonald's text", "Text McDonald's text"),
+
     # Following are quote weirdness. Not good but hard to fix.
     # An issue is that ," or .' is a single token.
     ("It's 'delicious.'", "It's ' delicious .'"),
     ('"Hello," he said.', '" Hello ," he said.'),
+
+    # this is a very strange typo
+    ("アトランテッィク", "Atoranteku"),
+
+    # odoriji. Note at this point these rarely work properly, they mainly
+    # don't blow up.
+    ('くゞる', 'Kuguru'), # note this is actually in unidic-lite
+    ('くヽる', 'Ku ru'),
+    ('今度クヾペへ行こう', 'Kondo kugupe e ikou'), # made up word
+    ('彁々', '?'),
 ]
 
 SENTENCES_KUNREI = [