Fix separation of KANA and HIRAGANA words (#143)
fixed issue #142

Signed-off-by: Hiroshi Miura <[email protected]>
miurahr committed May 16, 2021
1 parent 052bc2d commit c6106a8
Showing 6 changed files with 217 additions and 19 deletions.
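For context, the new splitting behavior can be summarized with a short usage sketch; it simply mirrors the expectations encoded in the tests added to tests/test_pykakasi_structured.py below, so treat it as illustrative rather than authoritative.

import pykakasi

kks = pykakasi.kakasi()
# Per the new test case below, the expected split is:
# "バニー", "ちゃんちの", "シャワーノズル", "の", "先端"
for item in kks.convert("バニーちゃんちのシャワーノズルの先端"):
    print(item["orig"], item["hira"], item["hepburn"])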
55 changes: 51 additions & 4 deletions src/pykakasi/kakasi.py
@@ -3,13 +3,14 @@
#
# Copyright 2011-2021 Hiroshi Miura <[email protected]>
#
import enum
from typing import Dict, List

import jaconv

from .kanji import Itaiji, JConv
from .properties import Ch
from .scripts import IConv
from .scripts import A2, H2, IConv, K2


class PyKakasiException(Exception):
@@ -20,6 +21,14 @@ class UnknownCharacterException(PyKakasiException):
pass


class _TYPE(enum.Enum):
KANJI = 1
KANA = 2
HIRAGANA = 3
SYMBOL = 4
ALPHA = 5


class Kakasi:
"""Kakasi is a conversion class for Japanese text."""

@@ -32,6 +41,19 @@ def __init__(self):
def normalize(cls, text):
return jaconv.normalize(text)

def _type(self, c: str):
if K2.isRegion(c):
return _TYPE.KANA
elif A2.isRegion(c):
return _TYPE.ALPHA
elif H2.isRegion(c):
return _TYPE.HIRAGANA
# always not Kanji
# elif self._isKanji(c):
# return _TYPE.KANJI
else:
return _TYPE.SYMBOL

def _isKanji(self, c: str):
return 0x3400 <= ord(c[0]) < 0xE000 or self._itaiji.haskey(ord(c[0]))

@@ -54,13 +76,16 @@ def convert(self, text: str) -> List[Dict[str, str]]:
otext = ""
_result = []
i = 0
prev_type = _TYPE.KANJI

while i < len(text):
if self._isKanji(text[i]):
t, ln = self._jconv.convert(text[i:])
if ln <= 0: # When JConv successfully convert text
if ln <= 0:
# When JConv does not convert text
# FIXME: maybe a bug
_state = False
otext = otext + text[i]
otext = otext + text[i] # pass through
i += 1
else:
if _state:
@@ -71,12 +96,34 @@ def convert(self, text: str) -> List[Dict[str, str]]:
_state = True
otext = ""
i += ln
elif self._type(text[i]) != prev_type:
if text[i] in Ch.endmark:
otext += text[i]
_result.append(self._iconv.convert(otext, otext))
otext = ""
i += 1
_state = True
elif text[i] in self._iconv.LONG_SYMBOLS:
otext += text[i]
i += 1
_state = False
else:
prev_type = self._type(text[i])
if len(otext) > 0:
_result.append(self._iconv.convert(otext, otext))
otext = text[i]
_state = False
i += 1
else:
_state = False
otext = otext + text[i]
i += 1
else:
_state = False
otext = otext + text[i]
i += 1

if ord(otext[-1]) in Ch.endmark:
if otext[-1] in Ch.endmark:
_result.append(self._iconv.convert(otext, otext))
otext = ""
_state = True
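The branch on self._iconv.LONG_SYMBOLS above exists so that a prolonged sound mark ("ー", which K2.isRegion classifies as katakana) does not force a word break when it follows a run of another script. A hedged illustration with an input that is not taken from the tests, printing rather than asserting the exact romanization:

import pykakasi

# "ー" follows hiragana here; the LONG_SYMBOLS branch keeps it in the current
# run, so this should come back as a single result entry.
print(pykakasi.kakasi().convert("らーめん"))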
2 changes: 1 addition & 1 deletion src/pykakasi/legacy.py
@@ -213,7 +213,7 @@ def do(self, text: str) -> str:
self._flag["s"]
and otext[-len(self._separator) :] != self._separator
and i < len(text)
and not (ord(text[i]) in Ch.endmark)
and text[i] not in Ch.endmark
):
otext += self._separator

2 changes: 1 addition & 1 deletion src/pykakasi/properties.py
@@ -60,7 +60,7 @@ class Ch:
zenkaku_number_nine = 0xFF1A
zenkaku_A = 0xFF21
zenkaku_a = 0xFF41
endmark = [ord(a) for a in [")", "]", "!", ",", ".", u"\u3001", u"\u3002"]]
endmark = ")]!,.,\u3001\u3002"


Ch = Ch()
2 changes: 1 addition & 1 deletion src/pykakasi/properties.pyi
@@ -45,7 +45,7 @@ class Ch:
zenkaku_number_nine: int = ...
zenkaku_A: int = ...
zenkaku_a: int = ...
endmark: Any = ...
endmark: str = ...

class Convert_Tables:
symbol_table_1: Any = ...
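A side note on the Ch.endmark change above: it is now a plain string of end-mark characters instead of a list of code points, so callers test characters directly with the in operator, as the legacy.py hunk does. A minimal check, assuming pykakasi.properties imports as shown in the diff:

from pykakasi.properties import Ch

# endmark is a string now, matching the updated check
# `text[i] not in Ch.endmark` in legacy.py.
assert "\u3002" in Ch.endmark  # "。"
assert ")" in Ch.endmark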
23 changes: 14 additions & 9 deletions src/pykakasi/scripts.py
@@ -12,7 +12,7 @@
class IConv:

_MAXLEN: int = 32
_LONG_SYMBOLS: str = "\u30FC\u2015\u2212\uFF70" # "ー ― − ー "
LONG_SYMBOLS: str = "\u30FC\u2015\u2212\uFF70" # "ー ― − ー "
# _UNCHECKED_LONG_SYMBOLS: str = "\u002D\u2010\u2011\u2013\u2014" # "- ‐ ‑ – —"

def __init__(self):
@@ -47,7 +47,7 @@ def _s2a(self, text: str) -> str:
if l1 > 0:
result += t
i += l1
elif text[i] in self._LONG_SYMBOLS: # handle chōonpu sound marks
elif text[i] in self.LONG_SYMBOLS: # handle chōonpu sound marks
# use previous char as a transliteration for kana-dash
if len(result) > 0:
result += result[-1]
@@ -152,7 +152,8 @@ def __init__(self, mode, method="Hepburn"):
else:
self.convert = self.convert_noop

def isRegion(self, char):
@classmethod
def isRegion(cls, char):
return 0x3040 < ord(char[0]) < 0x3097 or 0x1B150 <= ord(char[0]) <= 0x1B152

def convert_a(self, text):
@@ -209,18 +210,21 @@ def __init__(self, mode, method="Hepburn"):
else:
self.convert = self.convert_noop

def isRegion(self, char):
@classmethod
def isRegion(cls, char):
ch = ord(char[0])
return (
self._is_katakana(ch)
or self._is_half_width_kana(ch)
cls._is_katakana(ch)
or cls._is_half_width_kana(ch)
or 0x1B164 <= ch <= 0x1B167
)

def _is_katakana(self, ch):
@classmethod
def _is_katakana(cls, ch):
return 0x30A0 < ch < 0x30FD

def _is_half_width_kana(self, ch):
@classmethod
def _is_half_width_kana(cls, ch):
return 0xFF65 < ch < 0xFF9F

def _convert_half_kana(self, text):
@@ -369,7 +373,8 @@ def __init__(self, mode):
else:
self.convert = self.convert_noop

def isRegion(self, char):
@classmethod
def isRegion(cls, char):
return Ch.space <= ord(char[0]) < Ch.delete

def _convert(self, text):
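The isRegion helpers in scripts.py become classmethods so that the new Kakasi._type() can call them on the classes A2, H2, and K2 without constructing converter instances. A quick sketch of that class-level usage, assuming the classes import from pykakasi.scripts exactly as the updated kakasi.py does:

from pykakasi.scripts import A2, H2, K2

# These calls now work directly on the classes, mirroring Kakasi._type().
assert K2.isRegion("ア")  # katakana
assert H2.isRegion("あ")  # hiragana
assert A2.isRegion("a")   # ASCII range (ALPHA in _TYPE)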
152 changes: 149 additions & 3 deletions tests/test_pykakasi_structured.py
@@ -387,6 +387,152 @@ def test_issue114():
def test_issue115():
kks = pykakasi.kakasi()
result = kks.convert("゙っ、") # \uFF9E
assert result[0]["hira"] == "\u309Bっ、"
assert result[0]["kana"] == "\uFF9Eッ、"
assert result[0]["hepburn"] == '"tsu,'
expected = [
{"hira": "\u309B", "kana": "\uFF9E", "hepburn": '"'},
{"hira": "っ、", "kana": "ッ、", "hepburn": "tsu,"},
]
for i in range(len(expected)):
assert result[i]["hira"] == expected[i]["hira"]
assert result[i]["kana"] == expected[i]["kana"]
assert result[i]["hepburn"] == expected[i]["hepburn"]


@pytest.mark.parametrize("case, expected", [("藍之介", "あいのすけ"), ("藍水", "らんすい")])
def test_kakasi_unidic_noun(case, expected):
kakasi = pykakasi.Kakasi()
result = kakasi.convert(case)
key = kakasi._jconv._kanwa._jisyo_table.get("85cd", None)
assert result[0]["orig"] == case
assert result[0]["hira"] == expected


@pytest.mark.parametrize(
"case, expected",
[
(
"バニーちゃんちのシャワーノズルの先端",
[
{
"orig": "バニー",
"hira": "ばにー",
"kana": "バニー",
"hepburn": "banii",
"kunrei": "banii",
"passport": "banii",
},
{
"orig": "ちゃんちの",
"hira": "ちゃんちの",
"kana": "チャンチノ",
"hepburn": "chanchino",
"kunrei": "tyantino",
"passport": "chanchino",
},
{
"orig": "シャワーノズル",
"hira": "しゃわーのずる",
"kana": "シャワーノズル",
"hepburn": "shawaanozuru",
"kunrei": "syawaanozuru",
"passport": "shawaanozuru",
},
{
"orig": "の",
"hira": "の",
"kana": "ノ",
"hepburn": "no",
"kunrei": "no",
"passport": "no",
},
{
"orig": "先端",
"hira": "せんたん",
"kana": "センタン",
"hepburn": "sentan",
"kunrei": "sentan",
"passport": "sentan",
},
],
),
(
"明日は明日の風が吹く",
[
{
"orig": "明日",
"hira": "あした",
"kana": "アシタ",
"hepburn": "ashita",
"kunrei": "asita",
"passport": "ashita",
},
{
"orig": "は",
"hira": "は",
"kana": "ハ",
"hepburn": "ha",
"kunrei": "ha",
"passport": "ha",
},
{
"orig": "明日",
"hira": "あした",
"kana": "アシタ",
"hepburn": "ashita",
"kunrei": "asita",
"passport": "ashita",
},
{
"orig": "の",
"hira": "の",
"kana": "ノ",
"hepburn": "no",
"kunrei": "no",
"passport": "no",
},
{
"orig": "風",
"hira": "かぜ",
"kana": "カゼ",
"hepburn": "kaze",
"kunrei": "kaze",
"passport": "kaze",
},
{
"orig": "が",
"hira": "が",
"kana": "ガ",
"hepburn": "ga",
"kunrei": "ga",
"passport": "ga",
},
{
"orig": "吹く",
"hira": "ふく",
"kana": "フク",
"hepburn": "fuku",
"kunrei": "fuku",
"passport": "fuku",
},
],
),
],
)
def test_kakasi_unihandecode(case, expected):
kakasi = pykakasi.Kakasi()
result = kakasi.convert(case)
if len(result) < len(expected):
for i, r in enumerate(result):
assert r["orig"] == expected[i]["orig"]
assert r["hira"] == expected[i]["hira"]
assert r["kana"] == expected[i]["kana"]
assert r["hepburn"] == expected[i]["hepburn"]
assert r["kunrei"] == expected[i]["kunrei"]
assert r["passport"] == expected[i]["passport"]
else:
for i, e in enumerate(expected):
assert result[i]["orig"] == e["orig"]
assert result[i]["hira"] == e["hira"]
assert result[i]["kana"] == e["kana"]
assert result[i]["hepburn"] == e["hepburn"]
assert result[i]["kunrei"] == e["kunrei"]
assert result[i]["passport"] == e["passport"]
