diff --git a/.gitignore b/.gitignore index abe3612..d468ba1 100644 --- a/.gitignore +++ b/.gitignore @@ -45,3 +45,4 @@ venv/ # ignore .ignore/ +.DS_Store diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..91cc012 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,9 @@ +# https://travis-ci.org/mpcabd/python-arabic-reshaper +language: python +python: + - "2.7" + - "3.6" +install: + - "pip install -e ." +script: + - "python setup.py test" diff --git a/MANIFEST.in b/MANIFEST.in index 4ee273c..4c933bf 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,3 @@ -include default-config.ini -include arabic_reshaper.py +include arabic_reshaper/default-config.ini +include arabic_reshaper/arabic_reshaper.py include README \ No newline at end of file diff --git a/__init__.py b/arabic_reshaper/__init__.py similarity index 100% rename from __init__.py rename to arabic_reshaper/__init__.py diff --git a/arabic_reshaper/arabic_reshaper.py b/arabic_reshaper/arabic_reshaper.py new file mode 100644 index 0000000..a7fff00 --- /dev/null +++ b/arabic_reshaper/arabic_reshaper.py @@ -0,0 +1,268 @@ +# -*- coding: utf-8 -*- + +# This work is licensed under the GNU Public License (GPL). +# To view a copy of this license, visit http://www.gnu.org/copyleft/gpl.html + +# Written by Abdullah Diab (mpcabd) +# Email: mpcabd@gmail.com +# Website: http://mpcabd.xyz + +# Ported and tweaked from Java to Python, from Better Arabic Reshaper +# [https://github.com/agawish/Better-Arabic-Reshaper/] + +# Usage: +# Install python-bidi [https://github.com/MeirKriheli/python-bidi], can be +# installed from pip `pip install python-bidi`. + +# import arabic_reshaper +# from bidi.algorithm import get_display +# reshaped_text = arabic_reshaper.reshape('اللغة العربية رائعة') +# bidi_text = get_display(reshaped_text) +# Now you can pass `bidi_text` to any function that handles +# displaying/printing of the text, like writing it to PIL Image or passing it +# to a PDF generating method. + +from __future__ import unicode_literals +from builtins import range + +import re +import os + +from configparser import ConfigParser +from itertools import repeat +from pkg_resources import resource_filename + +from .ligatures import * +from .letters import * + +HARAKAT_RE = re.compile( + '[' + '\u0610-\u061a' + '\u064b-\u065f' + '\u0670' + '\u06d6-\u06dc' + '\u06df-\u06e8' + '\u06ea-\u06ed' + '\u08d4-\u08e1' + '\u08d4-\u08ed' + '\u08e3-\u08ff' + ']', + + re.UNICODE | re.X +) + + +class ArabicReshaper(object): + """ + A class for Arabic reshaper, it allows for fine-tune configuration over the + API. + + If no configuration is passed to the constructor, the class will check for + an environment variable :envvar:`PYTHON_ARABIC_RESHAPER_CONFIGURATION_FILE` + , if the variable is available, the class will load the file pointed to by + the variable, and will read it as an ini file. + If the variable doesn't exist, the class will load with the default + configuration file :file:`default-config.ini` + + Check these links for information on the configuration files format: + + * Python 3: https://docs.python.org/3/library/configparser.html + * Python 2: https://docs.python.org/2/library/configparser.html + + See the default configuration file :file:`default-config.ini` for details + on how to configure your reshaper. + """ + def __init__(self, configuration=None, configuration_file=None): + super(ArabicReshaper, self).__init__() + + configuration_files = [ + resource_filename(__name__, 'default-config.ini') + ] + + if not os.path.exists(configuration_files[0]): + raise Exception( + ('Default configuration file {} not found,' + + ' check the module installation.').format( + configuration_files[0], + ) + ) + + loaded_from_envvar = False + + if not configuration_file: + configuration_file = os.getenv( + 'PYTHON_ARABIC_RESHAPER_CONFIGURATION_FILE' + ) + if configuration_file: + loaded_from_envvar = True + + if configuration_file: + if not os.path.exists(configuration_file): + raise Exception( + 'Configuration file {} not found{}.'.format( + configuration_file, + loaded_from_envvar and ( + ' it is set in your environment variable ' + + 'PYTHON_ARABIC_RESHAPER_CONFIGURATION_FILE' + ) or '' + ) + ) + configuration_files.append(configuration_file) + + configuration_parser = ConfigParser() + configuration_from_files = configuration_parser.read( + configuration_files + ) + + if configuration: + configuration_parser.read_dict({ + 'ArabicReshaper': configuration + }) + + if 'ArabicReshaper' not in configuration_parser: + raise ValueError( + 'Invalid configuration: ' + 'A section with the name ArabicReshaper was not found' + ) + + configuration = configuration_parser['ArabicReshaper'] + self.configuration = configuration + + @property + def _ligatures_re(self): + if not hasattr(self, '__ligatures_re'): + patterns = [] + re_group_index_to_ligature_forms = {} + index = 0 + FORMS = 1 + MATCH = 0 + for ligature_record in LIGATURES: + ligature, replacement = ligature_record + if not self.configuration.getboolean(ligature): + continue + re_group_index_to_ligature_forms[index] = replacement[FORMS] + patterns.append('({})'.format(replacement[MATCH])) + index += 1 + self._re_group_index_to_ligature_forms = ( + re_group_index_to_ligature_forms + ) + self.__ligatures_re = re.compile('|'.join(patterns), re.UNICODE) + return self.__ligatures_re + + def _get_ligature_forms_from_re_group_index(self, group_index): + if not hasattr(self, '_re_group_index_to_ligature_forms'): + self._ligatures_re + return self._re_group_index_to_ligature_forms[group_index] + + def reshape(self, text): + if not text: + return '' + + output = [] + + LETTER = 0 + FORM = 1 + NOT_SUPPORTED = -1 + + delete_harakat = self.configuration.getboolean('delete_harakat') + positions_harakat = {} + + for letter in text: + if HARAKAT_RE.match(letter): + if not delete_harakat: + position = len(output) - 1 + if position not in positions_harakat: + positions_harakat[position] = [] + positions_harakat[position].append(letter) + elif letter not in LETTERS: + output.append((letter, NOT_SUPPORTED)) + elif not output: + output.append((letter, ISOLATED)) + else: + previous_letter = output[-1] + if previous_letter[FORM] == NOT_SUPPORTED: + output.append((letter, ISOLATED)) + elif not connects_with_letter_before(letter): + output.append((letter, ISOLATED)) + elif not connects_with_letter_after( + previous_letter[LETTER] + ): + output.append((letter, ISOLATED)) + elif (previous_letter[FORM] == FINAL and not + connects_with_letters_before_and_after( + previous_letter[LETTER] + )): + output.append((letter, ISOLATED)) + elif previous_letter[FORM] == ISOLATED: + output[-1] = ( + previous_letter[LETTER], + INITIAL + ) + output.append((letter, FINAL)) + # Otherwise, we will change the previous letter to connect + # to the current letter + else: + output[-1] = ( + previous_letter[LETTER], + MEDIAL + ) + output.append((letter, FINAL)) + + if self.configuration.getboolean('support_ligatures'): + # Clean text from Harakat to be able to find ligatures + text = HARAKAT_RE.sub('', text) + for match in re.finditer(self._ligatures_re, text): + group_index = next(( + i for i, group in enumerate(match.groups()) if group + ), -1) + forms = self._get_ligature_forms_from_re_group_index( + group_index + ) + a, b = match.span() + a_form = output[a][FORM] + b_form = output[b - 1][FORM] + ligature_form = None + + # +-----------+----------+---------+---------+----------+ + # | a \ b | ISOLATED | INITIAL | MEDIAL | FINAL | + # +-----------+----------+---------+---------+----------+ + # | ISOLATED | ISOLATED | INITIAL | INITIAL | ISOLATED | + # | INITIAL | ISOLATED | INITIAL | INITIAL | ISOLATED | + # | MEDIAL | FINAL | MEDIAL | MEDIAL | FINAL | + # | FINAL | FINAL | MEDIAL | MEDIAL | FINAL | + # +-----------+----------+---------+---------+----------+ + + if a_form in (ISOLATED, INITIAL): + if b_form in (ISOLATED, FINAL): + ligature_form = ISOLATED + else: + ligature_form = INITIAL + else: + if b_form in (ISOLATED, FINAL): + ligature_form = FINAL + else: + ligature_form = MEDIAL + if not forms[ligature_form]: + continue + output[a] = (forms[ligature_form], NOT_SUPPORTED) + output[a+1:b] = repeat(('', NOT_SUPPORTED), b - 1 - a) + + result = [] + if not delete_harakat and -1 in positions_harakat: + result.extend(positions_harakat[-1]) + for i, o in enumerate(output): + if o[LETTER]: + if o[FORM] == NOT_SUPPORTED: + result.append(o[LETTER]) + else: + result.append(LETTERS[o[LETTER]][o[FORM]]) + + if not delete_harakat: + if i in positions_harakat: + result.extend(positions_harakat[i]) + + return ''.join(result) + + +default_reshaper = ArabicReshaper() +reshape = default_reshaper.reshape diff --git a/default-config.ini b/arabic_reshaper/default-config.ini similarity index 100% rename from default-config.ini rename to arabic_reshaper/default-config.ini diff --git a/arabic_reshaper/letters.py b/arabic_reshaper/letters.py new file mode 100644 index 0000000..1f9ce77 --- /dev/null +++ b/arabic_reshaper/letters.py @@ -0,0 +1,197 @@ +# Each letter is of the format: +# +# ('', ) +# +# And replacement is of the format: +# +# ('', '', '', '') +# +# Where is the string to replace, and is the replacement in +# case should be in isolated form, is the replacement in +# case should be in initial form, is the replacement in case +# should be in medial form, and is the replacement in case +# should be in final form. If no replacement is specified for a form, +# then no that means the letter doesn't support this form. + +ISOLATED = 0 +INITIAL = 1 +MEDIAL = 2 +FINAL = 3 + +LETTERS = { + # ARABIC LETTER HAMZA + '\u0621': ('\uFE80', '', '', ''), + # ARABIC LETTER ALEF WITH MADDA ABOVE + '\u0622': ('\uFE81', '', '', '\uFE82'), + # ARABIC LETTER ALEF WITH HAMZA ABOVE + '\u0623': ('\uFE83', '', '', '\uFE84'), + # ARABIC LETTER WAW WITH HAMZA ABOVE + '\u0624': ('\uFE85', '', '', '\uFE86'), + # ARABIC LETTER ALEF WITH HAMZA BELOW + '\u0625': ('\uFE87', '', '', '\uFE88'), + # ARABIC LETTER YEH WITH HAMZA ABOVE + '\u0626': ('\uFE89', '\uFE8B', '\uFE8C', '\uFE8A'), + # ARABIC LETTER ALEF + '\u0627': ('\uFE8D', '', '', '\uFE8E'), + # ARABIC LETTER BEH + '\u0628': ('\uFE8F', '\uFE91', '\uFE92', '\uFE90'), + # ARABIC LETTER TEH MARBUTA + '\u0629': ('\uFE93', '', '', '\uFE94'), + # ARABIC LETTER TEH + '\u062A': ('\uFE95', '\uFE97', '\uFE98', '\uFE96'), + # ARABIC LETTER THEH + '\u062B': ('\uFE99', '\uFE9B', '\uFE9C', '\uFE9A'), + # ARABIC LETTER JEEM + '\u062C': ('\uFE9D', '\uFE9F', '\uFEA0', '\uFE9E'), + # ARABIC LETTER HAH + '\u062D': ('\uFEA1', '\uFEA3', '\uFEA4', '\uFEA2'), + # ARABIC LETTER KHAH + '\u062E': ('\uFEA5', '\uFEA7', '\uFEA8', '\uFEA6'), + # ARABIC LETTER DAL + '\u062F': ('\uFEA9', '', '', '\uFEAA'), + # ARABIC LETTER THAL + '\u0630': ('\uFEAB', '', '', '\uFEAC'), + # ARABIC LETTER REH + '\u0631': ('\uFEAD', '', '', '\uFEAE'), + # ARABIC LETTER ZAIN + '\u0632': ('\uFEAF', '', '', '\uFEB0'), + # ARABIC LETTER SEEN + '\u0633': ('\uFEB1', '\uFEB3', '\uFEB4', '\uFEB2'), + # ARABIC LETTER SHEEN + '\u0634': ('\uFEB5', '\uFEB7', '\uFEB8', '\uFEB6'), + # ARABIC LETTER SAD + '\u0635': ('\uFEB9', '\uFEBB', '\uFEBC', '\uFEBA'), + # ARABIC LETTER DAD + '\u0636': ('\uFEBD', '\uFEBF', '\uFEC0', '\uFEBE'), + # ARABIC LETTER TAH + '\u0637': ('\uFEC1', '\uFEC3', '\uFEC4', '\uFEC2'), + # ARABIC LETTER ZAH + '\u0638': ('\uFEC5', '\uFEC7', '\uFEC8', '\uFEC6'), + # ARABIC LETTER AIN + '\u0639': ('\uFEC9', '\uFECB', '\uFECC', '\uFECA'), + # ARABIC LETTER GHAIN + '\u063A': ('\uFECD', '\uFECF', '\uFED0', '\uFECE'), + # ARABIC TATWEEL + '\u0640': ('\u0640', '\u0640', '\u0640', '\u0640'), + # ARABIC LETTER FEH + '\u0641': ('\uFED1', '\uFED3', '\uFED4', '\uFED2'), + # ARABIC LETTER QAF + '\u0642': ('\uFED5', '\uFED7', '\uFED8', '\uFED6'), + # ARABIC LETTER KAF + '\u0643': ('\uFED9', '\uFEDB', '\uFEDC', '\uFEDA'), + # ARABIC LETTER LAM + '\u0644': ('\uFEDD', '\uFEDF', '\uFEE0', '\uFEDE'), + # ARABIC LETTER MEEM + '\u0645': ('\uFEE1', '\uFEE3', '\uFEE4', '\uFEE2'), + # ARABIC LETTER NOON + '\u0646': ('\uFEE5', '\uFEE7', '\uFEE8', '\uFEE6'), + # ARABIC LETTER HEH + '\u0647': ('\uFEE9', '\uFEEB', '\uFEEC', '\uFEEA'), + # ARABIC LETTER WAW + '\u0648': ('\uFEED', '', '', '\uFEEE'), + # ARABIC LETTER ALEF MAKSURA + '\u0649': ('\uFEEF', '', '', '\uFEF0'), + # ARABIC LETTER YEH + '\u064A': ('\uFEF1', '\uFEF3', '\uFEF4', '\uFEF2'), + # ARABIC LETTER ALEF WASLA + '\u0671': ('\uFB50', '', '', '\uFB51'), + # ARABIC LETTER U WITH HAMZA ABOVE + '\u0677': ('\uFBDD', '', '', ''), + # ARABIC LETTER TTEH + '\u0679': ('\uFB66', '\uFB68', '\uFB69', '\uFB67'), + # ARABIC LETTER TTEHEH + '\u067A': ('\uFB5E', '\uFB60', '\uFB61', '\uFB5F'), + # ARABIC LETTER BEEH + '\u067B': ('\uFB52', '\uFB54', '\uFB55', '\uFB53'), + # ARABIC LETTER PEH + '\u067E': ('\uFB56', '\uFB58', '\uFB59', '\uFB57'), + # ARABIC LETTER TEHEH + '\u067F': ('\uFB62', '\uFB64', '\uFB65', '\uFB63'), + # ARABIC LETTER BEHEH + '\u0680': ('\uFB5A', '\uFB5C', '\uFB5D', '\uFB5B'), + # ARABIC LETTER NYEH + '\u0683': ('\uFB76', '\uFB78', '\uFB79', '\uFB77'), + # ARABIC LETTER DYEH + '\u0684': ('\uFB72', '\uFB74', '\uFB75', '\uFB73'), + # ARABIC LETTER TCHEH + '\u0686': ('\uFB7A', '\uFB7C', '\uFB7D', '\uFB7B'), + # ARABIC LETTER TCHEHEH + '\u0687': ('\uFB7E', '\uFB80', '\uFB81', '\uFB7F'), + # ARABIC LETTER DDAL + '\u0688': ('\uFB88', '', '', '\uFB89'), + # ARABIC LETTER DAHAL + '\u068C': ('\uFB84', '', '', '\uFB85'), + # ARABIC LETTER DDAHAL + '\u068D': ('\uFB82', '', '', '\uFB83'), + # ARABIC LETTER DUL + '\u068E': ('\uFB86', '', '', '\uFB87'), + # ARABIC LETTER RREH + '\u0691': ('\uFB8C', '', '', '\uFB8D'), + # ARABIC LETTER JEH + '\u0698': ('\uFB8A', '', '', '\uFB8B'), + # ARABIC LETTER VEH + '\u06A4': ('\uFB6A', '\uFB6C', '\uFB6D', '\uFB6B'), + # ARABIC LETTER PEHEH + '\u06A6': ('\uFB6E', '\uFB70', '\uFB71', '\uFB6F'), + # ARABIC LETTER KEHEH + '\u06A9': ('\uFB8E', '\uFB90', '\uFB91', '\uFB8F'), + # ARABIC LETTER NG + '\u06AD': ('\uFBD3', '\uFBD5', '\uFBD6', '\uFBD4'), + # ARABIC LETTER GAF + '\u06AF': ('\uFB92', '\uFB94', '\uFB95', '\uFB93'), + # ARABIC LETTER NGOEH + '\u06B1': ('\uFB9A', '\uFB9C', '\uFB9D', '\uFB9B'), + # ARABIC LETTER GUEH + '\u06B3': ('\uFB96', '\uFB98', '\uFB99', '\uFB97'), + # ARABIC LETTER NOON GHUNNA + '\u06BA': ('\uFB9E', '', '', '\uFB9F'), + # ARABIC LETTER RNOON + '\u06BB': ('\uFBA0', '\uFBA2', '\uFBA3', '\uFBA1'), + # ARABIC LETTER HEH DOACHASHMEE + '\u06BE': ('\uFBAA', '\uFBAC', '\uFBAD', '\uFBAB'), + # ARABIC LETTER HEH WITH YEH ABOVE + '\u06C0': ('\uFBA4', '', '', '\uFBA5'), + # ARABIC LETTER HEH GOAL + '\u06C1': ('\uFBA6', '\uFBA8', '\uFBA9', '\uFBA7'), + # ARABIC LETTER KIRGHIZ OE + '\u06C5': ('\uFBE0', '', '', '\uFBE1'), + # ARABIC LETTER OE + '\u06C6': ('\uFBD9', '', '', '\uFBDA'), + # ARABIC LETTER U + '\u06C7': ('\uFBD7', '', '', '\uFBD8'), + # ARABIC LETTER YU + '\u06C8': ('\uFBDB', '', '', '\uFBDC'), + # ARABIC LETTER KIRGHIZ YU + '\u06C9': ('\uFBE2', '', '', '\uFBE3'), + # ARABIC LETTER VE + '\u06CB': ('\uFBDE', '', '', '\uFBDF'), + # ARABIC LETTER FARSI YEH + '\u06CC': ('\uFBFC', '\uFBFE', '\uFBFF', '\uFBFD'), + # ARABIC LETTER E + '\u06D0': ('\uFBE4', '\uFBE6', '\uFBE7', '\uFBE5'), + # ARABIC LETTER YEH BARREE + '\u06D2': ('\uFBAE', '', '', '\uFBAF'), + # ARABIC LETTER YEH BARREE WITH HAMZA ABOVE + '\u06D3': ('\uFBB0', '', '', '\uFBB1'), +} + + +def connects_with_letter_before(letter): + if letter not in LETTERS: + return False + forms = LETTERS[letter] + return forms[FINAL] or forms[MEDIAL] + + +def connects_with_letter_after(letter): + if letter not in LETTERS: + return False + forms = LETTERS[letter] + return forms[INITIAL] or forms[MEDIAL] + + +def connects_with_letters_before_and_after(letter): + if letter not in LETTERS: + return False + forms = LETTERS[letter] + return forms[MEDIAL] diff --git a/arabic_reshaper.py b/arabic_reshaper/ligatures.py similarity index 66% rename from arabic_reshaper.py rename to arabic_reshaper/ligatures.py index edfec39..e769f88 100644 --- a/arabic_reshaper.py +++ b/arabic_reshaper/ligatures.py @@ -1,39 +1,3 @@ -# -*- coding: utf-8 -*- - -# This work is licensed under the GNU Public License (GPL). -# To view a copy of this license, visit http://www.gnu.org/copyleft/gpl.html - -# Written by Abdullah Diab (mpcabd) -# Email: mpcabd@gmail.com -# Website: http://mpcabd.xyz - -# Ported and tweaked from Java to Python, from Better Arabic Reshaper -# [https://github.com/agawish/Better-Arabic-Reshaper/] - -# Usage: -# Install python-bidi [https://github.com/MeirKriheli/python-bidi], can be -# installed from pip `pip install python-bidi`. - -# import arabic_reshaper -# from bidi.algorithm import get_display -# reshaped_text = arabic_reshaper.reshape('اللغة العربية رائعة') -# bidi_text = get_display(reshaped_text) -# Now you can pass `bidi_text` to any function that handles -# displaying/printing of the text, like writing it to PIL Image or passing it -# to a PDF generating method. - -from __future__ import unicode_literals -from builtins import range - -import re -import os - -from configparser import ConfigParser -from itertools import repeat -from pkg_resources import resource_filename - -# ----------------------- Begin: Ligatures Definitions ---------------------- # - # Each ligature is of the format: # # ('', ) @@ -54,6 +18,7 @@ # 2. Words # 3. Letters # This way we make sure we replace the longest ligatures first + LIGATURES = ( # Sentences ('ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM', ( @@ -959,428 +924,3 @@ '\u0638\u0645', ('\uFC28', '\uFCB9', '\uFD3B', ''), )), ) - -# ------------------------ End: Ligatures Definitions ----------------------- # - -# ------------------------ Begin: Letters Definitions ----------------------- # - -# Each letter is of the format: -# -# ('', ) -# -# And replacement is of the format: -# -# ('', '', '', '') -# -# Where is the string to replace, and is the replacement in -# case should be in isolated form, is the replacement in -# case should be in initial form, is the replacement in case -# should be in medial form, and is the replacement in case -# should be in final form. If no replacement is specified for a form, -# then no that means the letter doesn't support this form. - -ISOLATED = 0 -INITIAL = 1 -MEDIAL = 2 -FINAL = 3 - -LETTERS = { - # ARABIC LETTER HAMZA - '\u0621': ('\uFE80', '', '', ''), - # ARABIC LETTER ALEF WITH MADDA ABOVE - '\u0622': ('\uFE81', '', '', '\uFE82'), - # ARABIC LETTER ALEF WITH HAMZA ABOVE - '\u0623': ('\uFE83', '', '', '\uFE84'), - # ARABIC LETTER WAW WITH HAMZA ABOVE - '\u0624': ('\uFE85', '', '', '\uFE86'), - # ARABIC LETTER ALEF WITH HAMZA BELOW - '\u0625': ('\uFE87', '', '', '\uFE88'), - # ARABIC LETTER YEH WITH HAMZA ABOVE - '\u0626': ('\uFE89', '\uFE8B', '\uFE8C', '\uFE8A'), - # ARABIC LETTER ALEF - '\u0627': ('\uFE8D', '', '', '\uFE8E'), - # ARABIC LETTER BEH - '\u0628': ('\uFE8F', '\uFE91', '\uFE92', '\uFE90'), - # ARABIC LETTER TEH MARBUTA - '\u0629': ('\uFE93', '', '', '\uFE94'), - # ARABIC LETTER TEH - '\u062A': ('\uFE95', '\uFE97', '\uFE98', '\uFE96'), - # ARABIC LETTER THEH - '\u062B': ('\uFE99', '\uFE9B', '\uFE9C', '\uFE9A'), - # ARABIC LETTER JEEM - '\u062C': ('\uFE9D', '\uFE9F', '\uFEA0', '\uFE9E'), - # ARABIC LETTER HAH - '\u062D': ('\uFEA1', '\uFEA3', '\uFEA4', '\uFEA2'), - # ARABIC LETTER KHAH - '\u062E': ('\uFEA5', '\uFEA7', '\uFEA8', '\uFEA6'), - # ARABIC LETTER DAL - '\u062F': ('\uFEA9', '', '', '\uFEAA'), - # ARABIC LETTER THAL - '\u0630': ('\uFEAB', '', '', '\uFEAC'), - # ARABIC LETTER REH - '\u0631': ('\uFEAD', '', '', '\uFEAE'), - # ARABIC LETTER ZAIN - '\u0632': ('\uFEAF', '', '', '\uFEB0'), - # ARABIC LETTER SEEN - '\u0633': ('\uFEB1', '\uFEB3', '\uFEB4', '\uFEB2'), - # ARABIC LETTER SHEEN - '\u0634': ('\uFEB5', '\uFEB7', '\uFEB8', '\uFEB6'), - # ARABIC LETTER SAD - '\u0635': ('\uFEB9', '\uFEBB', '\uFEBC', '\uFEBA'), - # ARABIC LETTER DAD - '\u0636': ('\uFEBD', '\uFEBF', '\uFEC0', '\uFEBE'), - # ARABIC LETTER TAH - '\u0637': ('\uFEC1', '\uFEC3', '\uFEC4', '\uFEC2'), - # ARABIC LETTER ZAH - '\u0638': ('\uFEC5', '\uFEC7', '\uFEC8', '\uFEC6'), - # ARABIC LETTER AIN - '\u0639': ('\uFEC9', '\uFECB', '\uFECC', '\uFECA'), - # ARABIC LETTER GHAIN - '\u063A': ('\uFECD', '\uFECF', '\uFED0', '\uFECE'), - # ARABIC TATWEEL - '\u0640': ('\u0640', '\u0640', '\u0640', '\u0640'), - # ARABIC LETTER FEH - '\u0641': ('\uFED1', '\uFED3', '\uFED4', '\uFED2'), - # ARABIC LETTER QAF - '\u0642': ('\uFED5', '\uFED7', '\uFED8', '\uFED6'), - # ARABIC LETTER KAF - '\u0643': ('\uFED9', '\uFEDB', '\uFEDC', '\uFEDA'), - # ARABIC LETTER LAM - '\u0644': ('\uFEDD', '\uFEDF', '\uFEE0', '\uFEDE'), - # ARABIC LETTER MEEM - '\u0645': ('\uFEE1', '\uFEE3', '\uFEE4', '\uFEE2'), - # ARABIC LETTER NOON - '\u0646': ('\uFEE5', '\uFEE7', '\uFEE8', '\uFEE6'), - # ARABIC LETTER HEH - '\u0647': ('\uFEE9', '\uFEEB', '\uFEEC', '\uFEEA'), - # ARABIC LETTER WAW - '\u0648': ('\uFEED', '', '', '\uFEEE'), - # ARABIC LETTER ALEF MAKSURA - '\u0649': ('\uFEEF', '', '', '\uFEF0'), - # ARABIC LETTER YEH - '\u064A': ('\uFEF1', '\uFEF3', '\uFEF4', '\uFEF2'), - # ARABIC LETTER ALEF WASLA - '\u0671': ('\uFB50', '', '', '\uFB51'), - # ARABIC LETTER U WITH HAMZA ABOVE - '\u0677': ('\uFBDD', '', '', ''), - # ARABIC LETTER TTEH - '\u0679': ('\uFB66', '\uFB68', '\uFB69', '\uFB67'), - # ARABIC LETTER TTEHEH - '\u067A': ('\uFB5E', '\uFB60', '\uFB61', '\uFB5F'), - # ARABIC LETTER BEEH - '\u067B': ('\uFB52', '\uFB54', '\uFB55', '\uFB53'), - # ARABIC LETTER PEH - '\u067E': ('\uFB56', '\uFB58', '\uFB59', '\uFB57'), - # ARABIC LETTER TEHEH - '\u067F': ('\uFB62', '\uFB64', '\uFB65', '\uFB63'), - # ARABIC LETTER BEHEH - '\u0680': ('\uFB5A', '\uFB5C', '\uFB5D', '\uFB5B'), - # ARABIC LETTER NYEH - '\u0683': ('\uFB76', '\uFB78', '\uFB79', '\uFB77'), - # ARABIC LETTER DYEH - '\u0684': ('\uFB72', '\uFB74', '\uFB75', '\uFB73'), - # ARABIC LETTER TCHEH - '\u0686': ('\uFB7A', '\uFB7C', '\uFB7D', '\uFB7B'), - # ARABIC LETTER TCHEHEH - '\u0687': ('\uFB7E', '\uFB80', '\uFB81', '\uFB7F'), - # ARABIC LETTER DDAL - '\u0688': ('\uFB88', '', '', '\uFB89'), - # ARABIC LETTER DAHAL - '\u068C': ('\uFB84', '', '', '\uFB85'), - # ARABIC LETTER DDAHAL - '\u068D': ('\uFB82', '', '', '\uFB83'), - # ARABIC LETTER DUL - '\u068E': ('\uFB86', '', '', '\uFB87'), - # ARABIC LETTER RREH - '\u0691': ('\uFB8C', '', '', '\uFB8D'), - # ARABIC LETTER JEH - '\u0698': ('\uFB8A', '', '', '\uFB8B'), - # ARABIC LETTER VEH - '\u06A4': ('\uFB6A', '\uFB6C', '\uFB6D', '\uFB6B'), - # ARABIC LETTER PEHEH - '\u06A6': ('\uFB6E', '\uFB70', '\uFB71', '\uFB6F'), - # ARABIC LETTER KEHEH - '\u06A9': ('\uFB8E', '\uFB90', '\uFB91', '\uFB8F'), - # ARABIC LETTER NG - '\u06AD': ('\uFBD3', '\uFBD5', '\uFBD6', '\uFBD4'), - # ARABIC LETTER GAF - '\u06AF': ('\uFB92', '\uFB94', '\uFB95', '\uFB93'), - # ARABIC LETTER NGOEH - '\u06B1': ('\uFB9A', '\uFB9C', '\uFB9D', '\uFB9B'), - # ARABIC LETTER GUEH - '\u06B3': ('\uFB96', '\uFB98', '\uFB99', '\uFB97'), - # ARABIC LETTER NOON GHUNNA - '\u06BA': ('\uFB9E', '', '', '\uFB9F'), - # ARABIC LETTER RNOON - '\u06BB': ('\uFBA0', '\uFBA2', '\uFBA3', '\uFBA1'), - # ARABIC LETTER HEH DOACHASHMEE - '\u06BE': ('\uFBAA', '\uFBAC', '\uFBAD', '\uFBAB'), - # ARABIC LETTER HEH WITH YEH ABOVE - '\u06C0': ('\uFBA4', '', '', '\uFBA5'), - # ARABIC LETTER HEH GOAL - '\u06C1': ('\uFBA6', '\uFBA8', '\uFBA9', '\uFBA7'), - # ARABIC LETTER KIRGHIZ OE - '\u06C5': ('\uFBE0', '', '', '\uFBE1'), - # ARABIC LETTER OE - '\u06C6': ('\uFBD9', '', '', '\uFBDA'), - # ARABIC LETTER U - '\u06C7': ('\uFBD7', '', '', '\uFBD8'), - # ARABIC LETTER YU - '\u06C8': ('\uFBDB', '', '', '\uFBDC'), - # ARABIC LETTER KIRGHIZ YU - '\u06C9': ('\uFBE2', '', '', '\uFBE3'), - # ARABIC LETTER VE - '\u06CB': ('\uFBDE', '', '', '\uFBDF'), - # ARABIC LETTER FARSI YEH - '\u06CC': ('\uFBFC', '\uFBFE', '\uFBFF', '\uFBFD'), - # ARABIC LETTER E - '\u06D0': ('\uFBE4', '\uFBE6', '\uFBE7', '\uFBE5'), - # ARABIC LETTER YEH BARREE - '\u06D2': ('\uFBAE', '', '', '\uFBAF'), - # ARABIC LETTER YEH BARREE WITH HAMZA ABOVE - '\u06D3': ('\uFBB0', '', '', '\uFBB1'), -} - -HARAKAT_RE = re.compile( - '[' - '\u0610-\u061a' - '\u064b-\u065f' - '\u0670' - '\u06d6-\u06dc' - '\u06df-\u06e8' - '\u06ea-\u06ed' - '\u08d4-\u08e1' - '\u08d4-\u08ed' - '\u08e3-\u08ff' - ']', - - re.UNICODE | re.X -) - - -def _connects_with_letter_before(letter): - if letter not in LETTERS: - return False - forms = LETTERS[letter] - return forms[FINAL] or forms[MEDIAL] - - -def _connects_with_letter_after(letter): - if letter not in LETTERS: - return False - forms = LETTERS[letter] - return forms[INITIAL] or forms[MEDIAL] - - -def _connects_with_letters_before_and_after(letter): - if letter not in LETTERS: - return False - forms = LETTERS[letter] - return forms[MEDIAL] - -# ------------------------- End: Letters Definitions ------------------------ # - -# ----------------------------- Begin: Reshaper ---------------------------- # - - -class ArabicReshaper(object): - """ - A class for Arabic reshaper, it allows for fine-tune configuration over the - API. - - If no configuration is passed to the constructor, the class will check for - an environment variable :envvar:`PYTHON_ARABIC_RESHAPER_CONFIGURATION_FILE` - , if the variable is available, the class will load the file pointed to by - the variable, and will read it as an ini file. - If the variable doesn't exist, the class will load with the default - configuration file :file:`default-config.ini` - - Check these links for information on the configuration files format: - - * Python 3: https://docs.python.org/3/library/configparser.html - * Python 2: https://docs.python.org/2/library/configparser.html - - See the default configuration file :file:`default-config.ini` for details - on how to configure your reshaper. - """ - def __init__(self, configuration=None, configuration_file=None): - super(ArabicReshaper, self).__init__() - - configuration_files = [ - resource_filename(__name__, 'default-config.ini') - ] - - if not os.path.exists(configuration_files[0]): - raise Exception( - ('Default configuration file {} not found,' + - ' check the module installation.').format( - configuration_files[0], - ) - ) - - - loaded_from_envvar = False - - if not configuration_file: - configuration_file = os.getenv( - 'PYTHON_ARABIC_RESHAPER_CONFIGURATION_FILE' - ) - if configuration_file: - loaded_from_envvar = True - - if configuration_file: - if not os.path.exists(configuration_file): - raise Exception( - 'Configuration file {} not found{}.'.format( - configuration_file, - loaded_from_envvar and ( - ' it is set in your environment variable ' + - 'PYTHON_ARABIC_RESHAPER_CONFIGURATION_FILE' - ) or '' - ) - ) - configuration_files.append(configuration_file) - - configuration_parser = ConfigParser() - configuration_from_files = configuration_parser.read( - configuration_files - ) - - if configuration: - configuration_parser.read_dict({ - 'ArabicReshaper': configuration - }) - - if 'ArabicReshaper' not in configuration_parser: - raise ValueError( - 'Invalid configuration: ' - 'A section with the name ArabicReshaper was not found' - ) - - configuration = configuration_parser['ArabicReshaper'] - self.configuration = configuration - - @property - def _ligatures_re(self): - if not hasattr(self, '__ligatures_re'): - patterns = [] - re_group_index_to_ligature_forms = {} - index = 0 - FORMS = 1 - MATCH = 0 - for ligature_record in LIGATURES: - ligature, replacement = ligature_record - if not self.configuration.getboolean(ligature): - continue - re_group_index_to_ligature_forms[index] = replacement[FORMS] - patterns.append('({})'.format(replacement[MATCH])) - index += 1 - self._re_group_index_to_ligature_forms = ( - re_group_index_to_ligature_forms - ) - self.__ligatures_re = re.compile('|'.join(patterns), re.UNICODE) - return self.__ligatures_re - - def _get_ligature_forms_from_re_group_index(self, group_index): - if not hasattr(self, '_re_group_index_to_ligature_forms'): - self._ligatures_re - return self._re_group_index_to_ligature_forms[group_index] - - def reshape(self, text): - if not text: - return '' - - output = [] - - LETTER = 0 - FORM = 1 - NOT_SUPPORTED = -1 - - delete_harakat = self.configuration.getboolean('delete_harakat') - - for i in range(len(text)): - letter = text[i] - if delete_harakat and HARAKAT_RE.match(letter): - output.append(('', NOT_SUPPORTED)) - if letter not in LETTERS: - output.append((letter, NOT_SUPPORTED)) - elif not output: - output.append((letter, ISOLATED)) - else: - previous_output = output[-1] - if previous_output[FORM] == NOT_SUPPORTED: - output.append((letter, ISOLATED)) - elif not _connects_with_letter_before(letter): - output.append((letter, ISOLATED)) - elif not _connects_with_letter_after(previous_output[LETTER]): - output.append((letter, ISOLATED)) - elif (previous_output[FORM] == FINAL - and not _connects_with_letters_before_and_after( - previous_output[LETTER] - )): - output.append((letter, ISOLATED)) - elif previous_output[FORM] == ISOLATED: - output[-1] = (previous_output[LETTER], - INITIAL) - output.append((letter, FINAL)) - # Otherwise, we will change the previous letter to connect to - # the current letter - else: - output[-1] = (previous_output[LETTER], - MEDIAL) - output.append((letter, FINAL)) - - if self.configuration.getboolean('support_ligatures'): - for match in re.finditer(self._ligatures_re, text): - group_index = next(( - i for i, group in enumerate(match.groups()) if group - ), -1) - forms = self._get_ligature_forms_from_re_group_index( - group_index - ) - a, b = match.span() - a_form = output[a][FORM] - b_form = output[b - 1][FORM] - ligature_form = None - - # +-----------+----------+---------+---------+----------+ - # | a \ b | ISOLATED | INITIAL | MEDIAL | FINAL | - # +-----------+----------+---------+---------+----------+ - # | ISOLATED | ISOLATED | INITIAL | INITIAL | ISOLATED | - # | INITIAL | ISOLATED | INITIAL | INITIAL | ISOLATED | - # | MEDIAL | FINAL | MEDIAL | MEDIAL | FINAL | - # | FINAL | FINAL | MEDIAL | MEDIAL | FINAL | - # +-----------+----------+---------+---------+----------+ - - if a_form in (ISOLATED, INITIAL): - if b_form in (ISOLATED, FINAL): - ligature_form = ISOLATED - else: - ligature_form = INITIAL - else: - if b_form in (ISOLATED, FINAL): - ligature_form = FINAL - else: - ligature_form = MEDIAL - if not forms[ligature_form]: - continue - output[a] = (forms[ligature_form], NOT_SUPPORTED) - output[a+1:b] = repeat(('', NOT_SUPPORTED), b - 1 - a) - - return ''.join( - map( - lambda o: ( - o[FORM] == NOT_SUPPORTED - and o[LETTER] - or LETTERS[o[LETTER]][o[FORM]] - ), - filter(lambda o: o[LETTER], output), - ) - ) - -# ------------------------------ End: Reshaper ----------------------------- # - -# Exports - -default_reshaper = ArabicReshaper() -reshape = default_reshaper.reshape diff --git a/arabic_reshaper/tests/__init__.py b/arabic_reshaper/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/arabic_reshaper/tests/test_001_initialization.py b/arabic_reshaper/tests/test_001_initialization.py new file mode 100644 index 0000000..8f2f558 --- /dev/null +++ b/arabic_reshaper/tests/test_001_initialization.py @@ -0,0 +1,39 @@ +import unittest +import arabic_reshaper + + +class TestDefaultConfiguration(unittest.TestCase): + def setUp(self): + self.reshaper = arabic_reshaper.ArabicReshaper() + + def test_configuration_exists(self): + self.assertIsNotNone(self.reshaper.configuration) + + def test_language(self): + self.assertIn('language', self.reshaper.configuration) + self.assertIsNotNone(self.reshaper.configuration['language']) + self.assertTrue(self.reshaper.configuration['language']) + + def test_support_ligatures(self): + self.assertIn('support_ligatures', self.reshaper.configuration) + self.assertIsNotNone( + self.reshaper.configuration.getboolean('support_ligatures') + ) + + def test_delete_harakat(self): + self.assertIn('delete_harakat', self.reshaper.configuration) + self.assertIsNotNone( + self.reshaper.configuration.getboolean('delete_harakat') + ) + + def test_ligatures(self): + import arabic_reshaper.ligatures + for ligature in arabic_reshaper.ligatures.LIGATURES: + with self.subTest(ligature=ligature[0]): + self.assertIn(ligature[0], self.reshaper.configuration) + self.assertIsNotNone( + self.reshaper.configuration.getboolean(ligature[0]) + ) + +if __name__ == '__main__': + unittest.main() diff --git a/arabic_reshaper/tests/test_002_reshaping.py b/arabic_reshaper/tests/test_002_reshaping.py new file mode 100644 index 0000000..b37950a --- /dev/null +++ b/arabic_reshaper/tests/test_002_reshaping.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- + +import unittest +import arabic_reshaper + + +class TestDefaultReshaping(unittest.TestCase): + def setUp(self): + self.reshaper = arabic_reshaper.default_reshaper + self.cases = ( + ('السلام عليكم', 'ﺍﻟﺴﻼﻡ ﻋﻠﻴﻜﻢ'), + ('السَلَاْمٌ عَلَيْكُمْ', 'ﺍﻟﺴﻼﻡ ﻋﻠﻴﻜﻢ'), + ) + + def test_reshaping(self): + for i, case in enumerate(self.cases): + with self.subTest(i=i, case=case[0]): + self.assertEqual(case[1], self.reshaper.reshape(case[0])) + + +class TestReshapingWithHarakat(unittest.TestCase): + def setUp(self): + self.reshaper = arabic_reshaper.ArabicReshaper({ + 'delete_harakat': False + }) + self.cases = ( + ('السَلَاْمٌ عَلَيْكُمْ', 'ﺍﻟﺴَﻼَْﻡٌ ﻋَﻠَﻴْﻜُﻢْ'), + ) + + def test_reshaping(self): + for i, case in enumerate(self.cases): + with self.subTest(i=i, case=case[0]): + self.assertEqual(case[1], self.reshaper.reshape(case[0])) + +if __name__ == '__main__': + unittest.main() diff --git a/setup.py b/setup.py index da9f349..a296fba 100755 --- a/setup.py +++ b/setup.py @@ -1,26 +1,30 @@ #!/usr/bin/env python -# coding = utf-8 +# coding=utf-8 from setuptools import setup setup( - name = "arabic_reshaper", - description = "Reconstruct Arabic sentences to be used in applications that don't support Arabic", - version = '2.0.5', - platforms = "ALL", - license = "GPL", - packages = ['arabic_reshaper'], - install_requires = ['configparser', 'future'], - author = "Abdullah Diab", - author_email = "mpcabd@gmail.com", - maintainer = "Abdullah Diab", - maintainer_email = "mpcabd@gmail.com", - package_dir = { 'arabic_reshaper' : '.' }, - package_data = { 'arabic_reshaper' : [ 'default-config.ini' ] }, - keywords = "arabic shaping reshaping reshaper", - url = "https://mpcabd.xyz/python-arabic-text-reshaper/", - download_url = "https://github.com/mpcabd/python-arabic-reshaper/tarball/master", - classifiers = [ + name="arabic_reshaper", + description=("Reconstruct Arabic sentences to be used in" + " applications that don't support Arabic"), + version='2.0.6', + platforms="ALL", + license="GPL", + packages=['arabic_reshaper'], + install_requires=['configparser', 'future'], + author="Abdullah Diab", + author_email="mpcabd@gmail.com", + maintainer="Abdullah Diab", + maintainer_email="mpcabd@gmail.com", + package_dir={'arabic_reshaper': 'arabic_reshaper'}, + package_data={'arabic_reshaper': ['default-config.ini']}, + test_suite='arabic_reshaper.tests', + include_package_data=True, + keywords="arabic shaping reshaping reshaper", + url="https://mpcabd.xyz/python-arabic-text-reshaper/", + download_url=("https://github.com/mpcabd/" + "python-arabic-reshaper/tarball/master"), + classifiers=[ "Natural Language :: Arabic", "Operating System :: OS Independent", "Programming Language :: Python :: 2.7",