This repository has been archived by the owner on Jul 22, 2022. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings - Fork 54
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix separation of KANA and HIRAGANA words (#143)
Fixed issue #142. Signed-off-by: Hiroshi Miura <[email protected]>
- Loading branch information
Showing 6 changed files with 217 additions and 19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,13 +3,14 @@ | |
# | ||
# Copyright 2011-2021 Hiroshi Miura <[email protected]> | ||
# | ||
import enum | ||
from typing import Dict, List | ||
|
||
import jaconv | ||
|
||
from .kanji import Itaiji, JConv | ||
from .properties import Ch | ||
from .scripts import IConv | ||
from .scripts import A2, H2, IConv, K2 | ||
|
||
|
||
class PyKakasiException(Exception): | ||
|
@@ -20,6 +21,14 @@ class UnknownCharacterException(PyKakasiException): | |
pass | ||
|
||
|
||
class _TYPE(enum.Enum): | ||
KANJI = 1 | ||
KANA = 2 | ||
HIRAGANA = 3 | ||
SYMBOL = 4 | ||
ALPHA = 5 | ||
|
||
|
||
class Kakasi: | ||
"""Kakasi is a conversion class for Japanese text.""" | ||
|
||
|
@@ -32,6 +41,19 @@ def __init__(self): | |
def normalize(cls, text): | ||
return jaconv.normalize(text) | ||
|
||
def _type(self, c: str): | ||
if K2.isRegion(c): | ||
return _TYPE.KANA | ||
elif A2.isRegion(c): | ||
return _TYPE.ALPHA | ||
elif H2.isRegion(c): | ||
return _TYPE.HIRAGANA | ||
# always not Kanji | ||
# elif self._isKanji(c): | ||
# return _TYPE.KANJI | ||
else: | ||
return _TYPE.SYMBOL | ||
|
||
def _isKanji(self, c: str): | ||
return 0x3400 <= ord(c[0]) < 0xE000 or self._itaiji.haskey(ord(c[0])) | ||
|
||
|
@@ -54,13 +76,16 @@ def convert(self, text: str) -> List[Dict[str, str]]: | |
otext = "" | ||
_result = [] | ||
i = 0 | ||
prev_type = _TYPE.KANJI | ||
|
||
while i < len(text): | ||
if self._isKanji(text[i]): | ||
t, ln = self._jconv.convert(text[i:]) | ||
if ln <= 0: # When JConv successfully convert text | ||
if ln <= 0: | ||
# When JConv does not convert text | ||
# FIXME: maybe a bug | ||
_state = False | ||
otext = otext + text[i] | ||
otext = otext + text[i] # pass through | ||
i += 1 | ||
else: | ||
if _state: | ||
|
@@ -71,12 +96,34 @@ def convert(self, text: str) -> List[Dict[str, str]]: | |
_state = True | ||
otext = "" | ||
i += ln | ||
elif self._type(text[i]) != prev_type: | ||
if text[i] in Ch.endmark: | ||
otext += text[i] | ||
_result.append(self._iconv.convert(otext, otext)) | ||
otext = "" | ||
i += 1 | ||
_state = True | ||
elif text[i] in self._iconv.LONG_SYMBOLS: | ||
otext += text[i] | ||
i += 1 | ||
_state = False | ||
else: | ||
prev_type = self._type(text[i]) | ||
if len(otext) > 0: | ||
_result.append(self._iconv.convert(otext, otext)) | ||
otext = text[i] | ||
_state = False | ||
i += 1 | ||
else: | ||
_state = False | ||
otext = otext + text[i] | ||
i += 1 | ||
else: | ||
_state = False | ||
otext = otext + text[i] | ||
i += 1 | ||
|
||
if ord(otext[-1]) in Ch.endmark: | ||
if otext[-1] in Ch.endmark: | ||
_result.append(self._iconv.convert(otext, otext)) | ||
otext = "" | ||
_state = True | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters