Skip to content
This repository has been archived by the owner on Jul 22, 2022. It is now read-only.

Commit

Permalink
Refactoring main loop
Browse files Browse the repository at this point in the history
Signed-off-by: Hiroshi Miura <[email protected]>
  • Loading branch information
miurahr committed May 19, 2021
1 parent 2039971 commit 4236aa8
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 66 deletions.
143 changes: 81 additions & 62 deletions src/pykakasi/kakasi.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
# Copyright 2011-2021 Hiroshi Miura <[email protected]>
#
import enum
from typing import Dict, List
from typing import Dict, List, Tuple

import jaconv

from .kanji import Itaiji, JConv
from .properties import Ch
from .scripts import A2, H2, IConv, K2
from .scripts import A2, H2, IConv, K2, Sym2


class PyKakasiException(Exception):
Expand Down Expand Up @@ -41,25 +41,23 @@ def __init__(self):
def normalize(cls, text):
return jaconv.normalize(text)

# Classify character c by asking K2, A2 and H2, in that order, whether c
# falls in their region. The first one that matches decides which _TYPE
# member is returned; a character none of them claims is _TYPE.SYMBOL.
def _type(self, c: str):
if K2.isRegion(c):
return _TYPE.KANA
elif A2.isRegion(c):
return _TYPE.ALPHA
elif H2.isRegion(c):
return _TYPE.HIRAGANA
# _TYPE.KANJI is never returned from this method: the Kanji check below
# is commented out.
# elif self._isKanji(c):
# return _TYPE.KANJI
else:
return _TYPE.SYMBOL
# Thin wrapper: return the result of A2.isRegion(c) unchanged.
def _isAlpha(self, c: str):
return A2.isRegion(c)

# Thin wrapper: return the result of Sym2.isRegion(c) unchanged.
def _isSymbol(self, c: str):
return Sym2.isRegion(c)

# Thin wrapper: return the result of K2.isRegion(c) unchanged.
def _isKana(self, c: str):
return K2.isRegion(c)

# Thin wrapper: return the result of H2.isRegion(c) unchanged.
def _isHira(self, c: str):
return H2.isRegion(c)

# Decide whether c counts as Kanji. Only the first character, c[0], is
# inspected: the result is truthy when its code point lies in the half-open
# range [0x3400, 0xE000) or when self._itaiji.haskey() accepts that code point.
# NOTE(review): the numeric range presumably targets the CJK ideograph
# blocks -- confirm which other blocks inside it are meant to match.
def _isKanji(self, c: str):
return 0x3400 <= ord(c[0]) < 0xE000 or self._itaiji.haskey(ord(c[0]))

def convert(self, text: str) -> List[Dict[str, str]]:
"""Convert input text to dictionary contains KANA, HIRA and romaji results."""
_state = True

if len(text) == 0:
return [
Expand All @@ -73,63 +71,84 @@ def convert(self, text: str) -> List[Dict[str, str]]:
}
]

otext = ""
original_text = ""
kana_text = ""
_result = []
i = 0
prev_type = _TYPE.KANJI
output_flag: Tuple[bool, bool, bool] = (False, False, False)

while i < len(text):
if self._isKanji(text[i]):
t, ln = self._jconv.convert(text[i:])
if ln <= 0:
# When JConv does not convert text
# FIXME: maybe a bug
_state = False
otext = otext + text[i] # pass through
i += 1
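# output_flag is a triple of booleans meaning
# (output buffer?, output text[i]?, copy text[i] and increment i?).
# Values assigned in this loop: (True, True, True), (False, False, True),
# (True, False, True), (False, True, True), (True, False, False)
# and (False, False, False).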
if text[i] in Ch.endmark:
prev_type = _TYPE.SYMBOL
output_flag = (True, True, True)
elif text[i] in Ch.long_symbols:
# FIXME: special case
output_flag = (False, False, True)
elif self._isSymbol(text[i]):
if prev_type != _TYPE.SYMBOL:
output_flag = (True, False, True)
else:
if _state:
_result.append(self._iconv.convert(otext + text[i : i + ln], t))
else:
_result.append(self._iconv.convert(otext, otext))
_result.append(self._iconv.convert(text[i : i + ln], t))
_state = True
otext = ""
output_flag = (False, True, True)
prev_type = _TYPE.SYMBOL
elif self._isKana(text[i]):
output_flag = (prev_type != _TYPE.KANA, False, True)
prev_type = _TYPE.KANA
elif self._isHira(text[i]):
output_flag = (prev_type != _TYPE.HIRAGANA, False, True)
prev_type = _TYPE.HIRAGANA
elif self._isAlpha(text[i]):
output_flag = (prev_type != _TYPE.ALPHA, False, True)
prev_type = _TYPE.ALPHA
elif self._isKanji(text[i]):
if len(original_text) > 0:
_result.append(self._iconv.convert(original_text, kana_text))
t, ln = self._jconv.convert(text[i:])
prev_type = _TYPE.KANJI
if ln > 0:
original_text = text[i : i + ln]
kana_text = t
i += ln
elif self._type(text[i]) != prev_type:
if text[i] in Ch.endmark:
otext += text[i]
_result.append(self._iconv.convert(otext, otext))
otext = ""
i += 1
_state = True
elif text[i] in self._iconv.LONG_SYMBOLS:
otext += text[i]
output_flag = (False, False, False)
else: # unknown kanji
original_text = text[i]
kana_text = ""
i += 1
_state = False
else:
prev_type = self._type(text[i])
if len(otext) > 0:
_result.append(self._iconv.convert(otext, otext))
otext = text[i]
_state = False
i += 1
else:
_state = False
otext = otext + text[i]
i += 1
output_flag = (True, False, False)
else:
_state = False
otext = otext + text[i]
if len(original_text) > 0:
_result.append(self._iconv.convert(original_text, kana_text))
_result.append(self._iconv.convert(text[i], ""))
i += 1
output_flag = (False, False, False)

# Convert to kana and Output based on flag
if output_flag[0] and output_flag[1]:
original_text += text[i]
kana_text += text[i]
_result.append(self._iconv.convert(original_text, kana_text))
original_text = ""
kana_text = ""
i += 1
elif output_flag[0] and output_flag[2]:
if len(original_text) > 0:
_result.append(self._iconv.convert(original_text, kana_text))
original_text = text[i]
kana_text = text[i]
i += 1
elif output_flag[2]:
original_text += text[i]
kana_text += text[i]
i += 1
else:
pass

if otext[-1] in Ch.endmark:
_result.append(self._iconv.convert(otext, otext))
otext = ""
_state = True

if otext:
# last word
_result.append(self._iconv.convert(otext, otext))
# last word
if len(original_text) > 0:
_result.append(self._iconv.convert(original_text, kana_text))

return _result
2 changes: 2 additions & 0 deletions src/pykakasi/properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ class Ch:
zenkaku_A = 0xFF21
zenkaku_a = 0xFF41
endmark = ")]!,.,\u3001\u3002"
long_symbols = "\u30FC\u2015\u2212\uFF70" # "ー ― − ー "
# _UNCHECKED_LONG_SYMBOLS: str = "\u002D\u2010\u2011\u2013\u2014" # "- ‐ ‑ – —"


Ch = Ch()
Expand Down
7 changes: 3 additions & 4 deletions src/pykakasi/scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
class IConv:

_MAXLEN: int = 32
LONG_SYMBOLS: str = "\u30FC\u2015\u2212\uFF70" # "ー ― − ー "
# _UNCHECKED_LONG_SYMBOLS: str = "\u002D\u2010\u2011\u2013\u2014" # "- ‐ ‑ – —"

def __init__(self):
self._hahconv = H2("a", method="Hepburn")
Expand Down Expand Up @@ -47,7 +45,7 @@ def _s2a(self, text: str) -> str:
if l1 > 0:
result += t
i += l1
elif text[i] in self.LONG_SYMBOLS: # handle chōonpu sound marks
elif text[i] in Ch.long_symbols: # handle chōonpu sound marks
# use previous char as a transliteration for kana-dash
if len(result) > 0:
result += result[-1]
Expand Down Expand Up @@ -315,7 +313,8 @@ def __init__(self, mode):
else:
self.convert = self.convert_noop

def isRegion(self, char):
@classmethod
def isRegion(cls, char: str):
c = ord(char[0])
return (
(Ch.ideographic_space <= c <= Ch.postal_mark_face)
Expand Down

0 comments on commit 4236aa8

Please sign in to comment.