From 1d9a4ca3f987be516dcc190583d2a54d5253ce87 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Wed, 8 Dec 2021 20:30:10 +0900 Subject: [PATCH] Support latin1 characters with a reproducer for issue #150 Signed-off-by: Hiroshi Miura --- src/pykakasi/properties.py | 103 +++++++++++++++++++++++++++++- src/pykakasi/scripts.py | 3 + tests/test_pykakasi_structured.py | 10 +++ 3 files changed, 114 insertions(+), 2 deletions(-) diff --git a/src/pykakasi/properties.py b/src/pykakasi/properties.py index 3abf305..5cf3ff9 100644 --- a/src/pykakasi/properties.py +++ b/src/pykakasi/properties.py @@ -40,6 +40,8 @@ class Ch: bracket_bra = 0x7B tilda = 0x7E delete = 0x7F + latin1_inverted_exclam = 0x00A1 + latin1_y_diaeresis = 0x00FF ideographic_space = 0x3000 postal_mark_face = 0x3020 wavy_dash = 0x3030 @@ -101,7 +103,7 @@ class Convert_Tables: a2 f0 | Å ‰ ♯ ♭ ♪ † ‡ ¶ ◯ ---------------------------------------------------------- - Greek convertion table + Greek conversion table ---------------------------------------------------------- "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta", "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho", @@ -245,7 +247,7 @@ class Convert_Tables: ".", "/", ] - # cyriilic + # cyrillic cyrillic_table = { # basic cyrillic characters "\u0410": "A", "\u0411": "B", @@ -359,6 +361,103 @@ class Convert_Tables: "\uff40", ] # [\]^_` alpha_table_3 = ["\uff5b", "\uff5c", "\uff5d", "\uff5e"] # {|}~ + latin1_table = [ + "!", # inverted exclamation + "cent", # cent mark + "GBP", # pound mark + "currency", # currency mark + "yen", # Yen mark + "|", # broken bar + "ss", # section sign + "..", # diaeresis + "(c)", # copyright + "a", # Feminine Ordinal Indicator + "<<", # left pointing double angle + "not", # not sign + "-", # soft hyphen + "(R)", # registered + "~", # macron + ".", # degree symbol + "+-", # plus-minus sign + "^2", # superscript two + "^3", # superscript three + "`", # acute + "u", # micro sign + "D", # pilcrow sign 
+ ".", # middle dot + ",", # cedilla + "^1", # superscript one + "", # Masculine ordinal indicator + ">>", # right pointing double angle + "1/4", # Vulgar fraction one quarter + "1/2", # Vulgar fraction one half + "3/4", # Vulgar fraction three quarters + "?", # Inverted question mark + "A", + "A", + "A", + "A", + "A", + "A", + "AE", + "C", + "E", + "E", + "E", + "E", + "I", + "I", + "I", + "I", + "Eth", + "N", + "O", + "O", + "O", + "O", + "O", + "x", + "O", + "U", + "U", + "U", + "U", + "Y", + "", + "", + "a", + "a", + "a", + "a", + "a", + "a", + "ae", + "c", + "e", + "e", + "e", + "e", + "i", + "i", + "i", + "i", + "eth", + "n", + "o", + "o", + "o", + "o", + "o", + "/", + "o", + "u", + "u", + "u", + "u", + "y", + "", + "y", + ] Convert_Tables = Convert_Tables() diff --git a/src/pykakasi/scripts.py b/src/pykakasi/scripts.py index a3467f7..595f48f 100644 --- a/src/pykakasi/scripts.py +++ b/src/pykakasi/scripts.py @@ -324,6 +324,7 @@ def isRegion(cls, char: str): or (Ch.greece_alpha <= c <= Ch.greece_omega) or (Ch.cyrillic_A <= c <= Ch.cyrillic_ya) or (Ch.zenkaku_exc_mark <= c <= Ch.zenkaku_number_nine) + or (Ch.latin1_inverted_exclam <= c <= Ch.latin1_y_diaeresis) or (0xFF20 <= c <= 0xFF5E) or c == 0x0451 or c == 0x0401 @@ -351,6 +352,8 @@ def _convert(self, text): return chr(0x0041 + c - 0xFF21) # u\ff21A => u\0041:@A..Z[\]^_` elif 0xFF41 <= c < 0xFF5F: return chr(0x0061 + c - 0xFF41) # u\ff41a => u\0061:a..z{|} + elif Ch.latin1_inverted_exclam <= c <= Ch.latin1_y_diaeresis: + return Convert_Tables.latin1_table[c - Ch.latin1_inverted_exclam] else: return "" # pragma: no cover diff --git a/tests/test_pykakasi_structured.py b/tests/test_pykakasi_structured.py index 4c889ff..309cce7 100644 --- a/tests/test_pykakasi_structured.py +++ b/tests/test_pykakasi_structured.py @@ -623,3 +623,13 @@ def test_kakasi_unihandecode(case, expected): assert result[i]["hepburn"] == e["hepburn"] assert result[i]["kunrei"] == e["kunrei"] assert result[i]["passport"] == e["passport"] + 
+ +def test_issue_150(): + kakasi = pykakasi.kakasi() + result = kakasi.convert("三\u00D7五") + assert result[0]["hira"] == "さん" + assert result[1]["orig"] == "\u00D7" + assert result[1]["hira"] == "×" + assert result[2]["hira"] == "ご" + assert len(result) == 3