Fix separation of KANA and HIRAGANA words (#143)
fixed issue #142

Signed-off-by: Hiroshi Miura <[email protected]>
miurahr committed May 16, 2021
1 parent 052bc2d commit c6106a8
Showing 6 changed files with 217 additions and 19 deletions.
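For context, the new splitting behavior can be summarized with a short usage sketch; it simply mirrors the expectations encoded in the tests added to tests/test_pykakasi_structured.py below, so treat it as illustrative rather than authoritative.

import pykakasi

kks = pykakasi.kakasi()
# Per the new test case below, the expected split is:
# "バニー", "ちゃんちの", "シャワーノズル", "の", "先端"
for item in kks.convert("バニーちゃんちのシャワーノズルの先端"):
    print(item["orig"], item["hira"], item["hepburn"])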
55 changes: 51 additions & 4 deletions src/pykakasi/kakasi.py
@@ -3,13 +3,14 @@
#
# Copyright 2011-2021 Hiroshi Miura <[email protected]>
#
import enum
from typing import Dict, List

import jaconv

from .kanji import Itaiji, JConv
from .properties import Ch
from .scripts import IConv
from .scripts import A2, H2, IConv, K2


class PyKakasiException(Exception):
@@ -20,6 +21,14 @@ class UnknownCharacterException(PyKakasiException):
pass


class _TYPE(enum.Enum):
KANJI = 1
KANA = 2
HIRAGANA = 3
SYMBOL = 4
ALPHA = 5


class Kakasi:
"""Kakasi is a conversion class for Japanese text."""

@@ -32,6 +41,19 @@ def __init__(self):
def normalize(cls, text):
return jaconv.normalize(text)

def _type(self, c: str):
if K2.isRegion(c):
return _TYPE.KANA
elif A2.isRegion(c):
return _TYPE.ALPHA
elif H2.isRegion(c):
return _TYPE.HIRAGANA
# always not Kanji
# elif self._isKanji(c):
# return _TYPE.KANJI
else:
return _TYPE.SYMBOL

def _isKanji(self, c: str):
return 0x3400 <= ord(c[0]) < 0xE000 or self._itaiji.haskey(ord(c[0]))

@@ -54,13 +76,16 @@ def convert(self, text: str) -> List[Dict[str, str]]:
otext = ""
_result = []
i = 0
prev_type = _TYPE.KANJI

while i < len(text):
if self._isKanji(text[i]):
t, ln = self._jconv.convert(text[i:])
if ln <= 0: # When JConv successfully convert text
if ln <= 0:
# When JConv does not convert text
# FIXME: maybe a bug
_state = False
otext = otext + text[i]
otext = otext + text[i] # pass through
i += 1
else:
if _state:
@@ -71,12 +96,34 @@ def convert(self, text: str) -> List[Dict[str, str]]:
_state = True
otext = ""
i += ln
elif self._type(text[i]) != prev_type:
if text[i] in Ch.endmark:
otext += text[i]
_result.append(self._iconv.convert(otext, otext))
otext = ""
i += 1
_state = True
elif text[i] in self._iconv.LONG_SYMBOLS:
otext += text[i]
i += 1
_state = False
else:
prev_type = self._type(text[i])
if len(otext) > 0:
_result.append(self._iconv.convert(otext, otext))
otext = text[i]
_state = False
i += 1
else:
_state = False
otext = otext + text[i]
i += 1
else:
_state = False
otext = otext + text[i]
i += 1

if ord(otext[-1]) in Ch.endmark:
if otext[-1] in Ch.endmark:
_result.append(self._iconv.convert(otext, otext))
otext = ""
_state = True
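The branch on self._iconv.LONG_SYMBOLS above exists so that a prolonged sound mark ("ー", which K2.isRegion classifies as katakana) does not force a word break when it follows a run of another script. A hedged illustration with an input that is not taken from the tests, printing rather than asserting the exact romanization:

import pykakasi

# "ー" follows hiragana here; the LONG_SYMBOLS branch keeps it in the current
# run, so this should come back as a single result entry.
print(pykakasi.kakasi().convert("らーめん"))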
2 changes: 1 addition & 1 deletion src/pykakasi/legacy.py
@@ -213,7 +213,7 @@ def do(self, text: str) -> str:
self._flag["s"]
and otext[-len(self._separator) :] != self._separator
and i < len(text)
and not (ord(text[i]) in Ch.endmark)
and text[i] not in Ch.endmark
):
otext += self._separator

2 changes: 1 addition & 1 deletion src/pykakasi/properties.py
@@ -60,7 +60,7 @@ class Ch:
zenkaku_number_nine = 0xFF1A
zenkaku_A = 0xFF21
zenkaku_a = 0xFF41
endmark = [ord(a) for a in [")", "]", "!", ",", ".", u"\u3001", u"\u3002"]]
endmark = ")]!,.,\u3001\u3002"


Ch = Ch()
2 changes: 1 addition & 1 deletion src/pykakasi/properties.pyi
@@ -45,7 +45,7 @@ class Ch:
zenkaku_number_nine: int = ...
zenkaku_A: int = ...
zenkaku_a: int = ...
endmark: Any = ...
endmark: str = ...

class Convert_Tables:
symbol_table_1: Any = ...
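A side note on the Ch.endmark change above: it is now a plain string of end-mark characters instead of a list of code points, so callers test characters directly with the in operator, as the legacy.py hunk does. A minimal check, assuming pykakasi.properties imports as shown in the diff:

from pykakasi.properties import Ch

# endmark is a string now, matching the updated check
# `text[i] not in Ch.endmark` in legacy.py.
assert "\u3002" in Ch.endmark  # "。"
assert ")" in Ch.endmark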
23 changes: 14 additions & 9 deletions src/pykakasi/scripts.py
@@ -12,7 +12,7 @@
class IConv:

_MAXLEN: int = 32
_LONG_SYMBOLS: str = "\u30FC\u2015\u2212\uFF70" # "ー ― − ー "
LONG_SYMBOLS: str = "\u30FC\u2015\u2212\uFF70" # "ー ― − ー "
# _UNCHECKED_LONG_SYMBOLS: str = "\u002D\u2010\u2011\u2013\u2014" # "- ‐ ‑ – —"

def __init__(self):
@@ -47,7 +47,7 @@ def _s2a(self, text: str) -> str:
if l1 > 0:
result += t
i += l1
elif text[i] in self._LONG_SYMBOLS: # handle chōonpu sound marks
elif text[i] in self.LONG_SYMBOLS: # handle chōonpu sound marks
# use previous char as a transliteration for kana-dash
if len(result) > 0:
result += result[-1]
@@ -152,7 +152,8 @@ def __init__(self, mode, method="Hepburn"):
else:
self.convert = self.convert_noop

def isRegion(self, char):
@classmethod
def isRegion(cls, char):
return 0x3040 < ord(char[0]) < 0x3097 or 0x1B150 <= ord(char[0]) <= 0x1B152

def convert_a(self, text):
@@ -209,18 +210,21 @@ def __init__(self, mode, method="Hepburn"):
else:
self.convert = self.convert_noop

def isRegion(self, char):
@classmethod
def isRegion(cls, char):
ch = ord(char[0])
return (
self._is_katakana(ch)
or self._is_half_width_kana(ch)
cls._is_katakana(ch)
or cls._is_half_width_kana(ch)
or 0x1B164 <= ch <= 0x1B167
)

def _is_katakana(self, ch):
@classmethod
def _is_katakana(cls, ch):
return 0x30A0 < ch < 0x30FD

def _is_half_width_kana(self, ch):
@classmethod
def _is_half_width_kana(cls, ch):
return 0xFF65 < ch < 0xFF9F

def _convert_half_kana(self, text):
@@ -369,7 +373,8 @@ def __init__(self, mode):
else:
self.convert = self.convert_noop

def isRegion(self, char):
@classmethod
def isRegion(cls, char):
return Ch.space <= ord(char[0]) < Ch.delete

def _convert(self, text):
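The isRegion helpers in scripts.py become classmethods so that the new Kakasi._type() can call them on the classes A2, H2, and K2 without constructing converter instances. A quick sketch of that class-level usage, assuming the classes import from pykakasi.scripts exactly as the updated kakasi.py does:

from pykakasi.scripts import A2, H2, K2

# These calls now work directly on the classes, mirroring Kakasi._type().
assert K2.isRegion("ア")  # katakana
assert H2.isRegion("あ")  # hiragana
assert A2.isRegion("a")   # ASCII range (ALPHA in _TYPE)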
152 changes: 149 additions & 3 deletions tests/test_pykakasi_structured.py
@@ -387,6 +387,152 @@ def test_issue114():
def test_issue115():
kks = pykakasi.kakasi()
result = kks.convert("゙っ、") # \uFF9E
assert result[0]["hira"] == "\u309Bっ、"
assert result[0]["kana"] == "\uFF9Eッ、"
assert result[0]["hepburn"] == '"tsu,'
expected = [
{"hira": "\u309B", "kana": "\uFF9E", "hepburn": '"'},
{"hira": "っ、", "kana": "ッ、", "hepburn": "tsu,"},
]
for i in range(len(expected)):
assert result[i]["hira"] == expected[i]["hira"]
assert result[i]["kana"] == expected[i]["kana"]
assert result[i]["hepburn"] == expected[i]["hepburn"]


@pytest.mark.parametrize("case, expected", [("藍之介", "あいのすけ"), ("藍水", "らんすい")])
def test_kakasi_unidic_noun(case, expected):
kakasi = pykakasi.Kakasi()
result = kakasi.convert(case)
key = kakasi._jconv._kanwa._jisyo_table.get("85cd", None)
assert result[0]["orig"] == case
assert result[0]["hira"] == expected


@pytest.mark.parametrize(
"case, expected",
[
(
"バニーちゃんちのシャワーノズルの先端",
[
{
"orig": "バニー",
"hira": "ばにー",
"kana": "バニー",
"hepburn": "banii",
"kunrei": "banii",
"passport": "banii",
},
{
"orig": "ちゃんちの",
"hira": "ちゃんちの",
"kana": "チャンチノ",
"hepburn": "chanchino",
"kunrei": "tyantino",
"passport": "chanchino",
},
{
"orig": "シャワーノズル",
"hira": "しゃわーのずる",
"kana": "シャワーノズル",
"hepburn": "shawaanozuru",
"kunrei": "syawaanozuru",
"passport": "shawaanozuru",
},
{
"orig": "の",
"hira": "の",
"kana": "ノ",
"hepburn": "no",
"kunrei": "no",
"passport": "no",
},
{
"orig": "先端",
"hira": "せんたん",
"kana": "センタン",
"hepburn": "sentan",
"kunrei": "sentan",
"passport": "sentan",
},
],
),
(
"明日は明日の風が吹く",
[
{
"orig": "明日",
"hira": "あした",
"kana": "アシタ",
"hepburn": "ashita",
"kunrei": "asita",
"passport": "ashita",
},
{
"orig": "は",
"hira": "は",
"kana": "ハ",
"hepburn": "ha",
"kunrei": "ha",
"passport": "ha",
},
{
"orig": "明日",
"hira": "あした",
"kana": "アシタ",
"hepburn": "ashita",
"kunrei": "asita",
"passport": "ashita",
},
{
"orig": "の",
"hira": "の",
"kana": "ノ",
"hepburn": "no",
"kunrei": "no",
"passport": "no",
},
{
"orig": "風",
"hira": "かぜ",
"kana": "カゼ",
"hepburn": "kaze",
"kunrei": "kaze",
"passport": "kaze",
},
{
"orig": "が",
"hira": "が",
"kana": "ガ",
"hepburn": "ga",
"kunrei": "ga",
"passport": "ga",
},
{
"orig": "吹く",
"hira": "ふく",
"kana": "フク",
"hepburn": "fuku",
"kunrei": "fuku",
"passport": "fuku",
},
],
),
],
)
def test_kakasi_unihandecode(case, expected):
kakasi = pykakasi.Kakasi()
result = kakasi.convert(case)
if len(result) < len(expected):
for i, r in enumerate(result):
assert r["orig"] == expected[i]["orig"]
assert r["hira"] == expected[i]["hira"]
assert r["kana"] == expected[i]["kana"]
assert r["hepburn"] == expected[i]["hepburn"]
assert r["kunrei"] == expected[i]["kunrei"]
assert r["passport"] == expected[i]["passport"]
else:
for i, e in enumerate(expected):
assert result[i]["orig"] == e["orig"]
assert result[i]["hira"] == e["hira"]
assert result[i]["kana"] == e["kana"]
assert result[i]["hepburn"] == e["hepburn"]
assert result[i]["kunrei"] == e["kunrei"]
assert result[i]["passport"] == e["passport"]
