Skip to content

Commit

Permalink
Deal with odoriji
Browse files Browse the repository at this point in the history
This doesn't handle them all that well, but it shouldn't blow up any more.
  • Loading branch information
polm committed Oct 20, 2021
1 parent 262e92f commit d962cda
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 0 deletions.
20 changes: 20 additions & 0 deletions cutlet/cutlet.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

# Small kana; get_single_mapping treats these specially when they follow
# another kana.
SUTEGANA = 'ゃゅょぁぃぅぇぉ'
PUNCT = '\'".!?(),;:-'
# Iteration marks (odoriji): 々 (kanji), 〃 (symbols), ゝ/ヽ (repeat the
# previous kana) and ゞ/ヾ (repeat the previous kana with voicing).
# Each mark must appear exactly once; the katakana voiced mark ヾ was
# previously missing (ゞ was listed twice), so it was never recognised.
ODORI = '々〃ゝゞヽヾ'

SYSTEMS = {
'hepburn': HEPBURN,
Expand Down Expand Up @@ -255,13 +256,32 @@ def map_kana(self, kana):
return out

def get_single_mapping(self, pk, kk, nk):
# handle odoriji
# NOTE: This is very rarely useful at present because odoriji are not
# left in readings for dictionary words, and we can't follow kana
# across word boundaries.
if kk in ODORI:
if kk in 'ゝヽ':
if pk: return pk
else: return '' # invalid but be nice
if kk in 'ゞヾ': # repeat with voicing
if not pk: return ''
vv = add_dakuten(pk)
if vv: return self.table[vv]
else: return ''
# remaining are 々 for kanji and 〃 for symbols, but we can't
# infer their span reliably (or handle rendaku)
return ''


# handle digraphs
if pk and (pk + kk) in self.table:
return self.table[pk + kk]
if nk and (kk + nk) in self.table:
return ''

if nk and nk in SUTEGANA:
if kk == 'っ': return '' # never valid, just ignore
return self.table[kk][:-1] + self.table[nk]
if kk in SUTEGANA:
return ''
Expand Down
9 changes: 9 additions & 0 deletions cutlet/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,3 +188,12 @@
# Set the NIHONSHIKI romanizations for ぢ and づ to 'di' and 'du'.
# NOTE(review): presumably these override entries inherited from another
# table; the definition of NIHONSHIKI is not visible here, so confirm.
NIHONSHIKI['ぢ'] = 'di'
NIHONSHIKI['づ'] = 'du'

# Parallel strings: VOICED[i] is the dakuten (voiced) form of UNVOICED[i].
UNVOICED = 'かきくけこさしすせそたちつてとはひふへほ'
VOICED = 'がぎぐげござじずぜぞだぢづでどばびぶべぼ'


def add_dakuten(kk):
    """Return the voiced (dakuten) counterpart of a single hiragana.

    Args:
        kk: a single character to voice, e.g. 'か'.

    Returns:
        The character at the same position in VOICED (e.g. 'が'), or None
        when `kk` is empty, is not exactly one character, or has no entry
        in UNVOICED.
    """
    # Guard against '' (which str.find locates at index 0 of any string)
    # and against multi-character substrings of UNVOICED such as 'かき'.
    if not kk or len(kk) != 1:
        return None
    # str.find returns -1 on a miss; str.index would raise ValueError and
    # never return None, so callers could not test for "no voiced form".
    ii = UNVOICED.find(kk)
    if ii < 0:
        return None
    return VOICED[ii]

12 changes: 12 additions & 0 deletions cutlet/test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,25 @@
("齋藤タヶオ", "Saitou ta ke o"),
# っー
("ずっーと", "Zu--to"),

# don't add spaces around apostrophe if it wasn't there
("McDonald's", "McDonald's"),
("Text McDonald's text", "Text McDonald's text"),

# Following are quote weirdness. Not good but hard to fix.
# An issue is that ," or .' is a single token.
("It's 'delicious.'", "It's ' delicious .'"),
('"Hello," he said.', '" Hello ," he said.'),

# this is a very strange typo
("アトランテッィク", "Atoranteku"),

# odoriji. Note at this point these rarely work properly, they mainly
# don't blow up.
('くゞる', 'Kuguru'), # note this is actually in unidic-lite
('くヽる', 'Ku ru'),
('今度クヾペへ行こう', 'Kondo kugupe e ikou'), # made up word
('彁々', '?'),
]

SENTENCES_KUNREI = [
Expand Down

0 comments on commit d962cda

Please sign in to comment.