Skip to content

Commit

Permalink
Fix Foreign Lemma Prefixes (fix #56) (#57)
Browse files (browse the repository at this point in the history)
* Add tests for prefix issue

* Fix handling of foreign lemmas that are prefixes
  • Loading branch information
polm authored Dec 20, 2024
1 parent 9574fad commit e89b907
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
6 changes: 4 additions & 2 deletions cutlet/cutlet.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def has_foreign_lemma(word):
if not '-' in lemma:
return False

cand = lemma.split('-')[-1]
cand = lemma.split('-', 1)[-1]
# NOTE: some words have 外国 instead of a foreign spelling. ジル
# (Jill?) is an example. Unclear why this is the case.
# There are other hyphenated lemmas, like 私-代名詞.
Expand Down Expand Up @@ -257,6 +257,8 @@ def romaji_tokens(self, words, capitalize=True, title=False):
if nw and nw.feature.pos1 in ('補助記号', '接尾辞'): continue
# special case for half-width commas
if nw and nw.surface == ',': continue
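# special case for prefixes: a foreign spelling that ends in "-"
# (e.g. "bio-") attaches directly to the following word, see #56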
if foreign and roma[-1] == "-": continue
# 思えば -> omoeba
if nw and nw.feature.pos2 in ('接続助詞'): continue
# 333 -> 333 ; this should probably be handled in mecab
Expand Down Expand Up @@ -348,7 +350,7 @@ def romaji_word(self, word):
elif (self.use_foreign_spelling and
has_foreign_lemma(word)):
# this is a foreign word with known spelling
return word.feature.lemma.split('-')[-1]
return word.feature.lemma.split('-', 1)[-1]
elif word.feature.kana:
# for known words
kana = jaconv.kata2hira(word.feature.kana)
Expand Down
3 changes: 3 additions & 0 deletions cutlet/test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@
("くヽる", "Ku ru"),
("今度クヾペへ行こう", "Kondo kugupe e ikou"), # made up word
("彁々", "?"),
# prefixes, see #56
("ビオハザード", "Bio-hazard"),
("イントラワード", "Intra-word"),
]

SENTENCES_KUNREI = [
Expand Down

0 comments on commit e89b907

Please sign in to comment.