Skip to content

Commit

Permalink
Fix handling of foreign lemmas that are prefixes
Browse files Browse the repository at this point in the history
  • Loading branch information
polm committed Jul 3, 2024
1 parent 309ddeb commit 0fab1bb
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions cutlet/cutlet.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def has_foreign_lemma(word):
if not '-' in lemma:
return False

- cand = lemma.split('-')[-1]
+ cand = lemma.split('-', 1)[-1]
# NOTE: some words have 外国 instead of a foreign spelling. ジル
# (Jill?) is an example. Unclear why this is the case.
# There are other hyphenated lemmas, like 私-代名詞.
Expand Down Expand Up @@ -257,6 +257,8 @@ def romaji_tokens(self, words, capitalize=True, title=False):
if nw and nw.feature.pos1 in ('補助記号', '接尾辞'): continue
# special case for half-width commas
if nw and nw.surface == ',': continue
+ # special case for prefixes
+ if foreign and roma[-1] == "-": continue
# 思えば -> omoeba
if nw and nw.feature.pos2 in ('接続助詞'): continue
# 333 -> 333 ; this should probably be handled in mecab
Expand Down Expand Up @@ -348,7 +350,7 @@ def romaji_word(self, word):
elif (self.use_foreign_spelling and
has_foreign_lemma(word)):
# this is a foreign word with known spelling
- return word.feature.lemma.split('-')[-1]
+ return word.feature.lemma.split('-', 1)[-1]
elif word.feature.kana:
# for known words
kana = jaconv.kata2hira(word.feature.kana)
Expand Down

0 comments on commit 0fab1bb

Please sign in to comment.