From d0d74fc1ab309e8448b494e366a3836a16d9fcf8 Mon Sep 17 00:00:00 2001 From: Abdullah Diab Date: Sat, 17 Jun 2017 20:09:23 +0200 Subject: [PATCH] More tests + delete_tatweel * Add `delete_tatweel` * Add more test cases --- README.md | 51 +++++-- arabic_reshaper/arabic_reshaper.py | 8 ++ arabic_reshaper/default-config.ini | 9 +- arabic_reshaper/letters.py | 4 +- .../tests/test_001_initialization.py | 29 ++-- arabic_reshaper/tests/test_002_reshaping.py | 136 ++++++++++++++++-- setup.py | 2 +- 7 files changed, 198 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index ec8fee8..19d8379 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,10 @@ in the reshaped text, you should enable this option if you are going to pass the reshaped text to `bidi.algorithm.get_display` because it will reverse the text and you'd end up with harakat applied to the next letter instead of the previous letter. +* `delete_tatweel` (Default `False`): When this is set to `True`, the reshaper +will delete the Tatweel character (U+0640) from the text before reshaping. This +can be useful when you want to support ligatures and don't care about Tatweel +getting deleted. Besides the settings above, you can enable/disable supported ligatures. 
For a full list of supported ligatures and their default status check the file @@ -109,10 +113,10 @@ from arabic_reshaper import ArabicReshaper configuration = { 'delete_harakat': False, 'support_ligatures': True, - 'RIAL SIGN': True, # Replace ريال with ﷼ + 'RIAL SIGN': True, # Replace ر ي ا ل with ﷼ } reshaper = ArabicReshaper(configuration=configuration) -text_to_be_reshaped = 'سعر المنتج ١٥٠ ريال' +text_to_be_reshaped = 'سعر المنتج ١٥٠ ر' + 'يال' # had to split the string for display reshaped_text = reshaper.reshape(text_to_be_reshaped) ``` @@ -126,13 +130,8 @@ constructor's `configuration_file` parameter like this: ``` from arabic_reshaper import ArabicReshaper -configuration = { - 'delete_harakat': False, - 'support_ligatures': True, - 'RIAL SIGN': True, # Replace ريال with ﷼ -} reshaper = ArabicReshaper(configuration_file='/path/to/your/config.ini') -text_to_be_reshaped = 'سعر المنتج ١٥٠ ريال' +text_to_be_reshaped = 'سعر المنتج ١٥٠ ر' + 'يال' # had to split the string for display reshaped_text = reshaper.reshape(text_to_be_reshaped) ``` @@ -179,6 +178,42 @@ https://github.com/mpcabd/python-arabic-reshaper/tarball/master ## Version History +### 2.0.8 + +* Added `delete_tatweel` +* Added more test cases + +### 2.0.7 + +* Fix tests for Python 2.7 + +### 2.0.6 + +* Fixed a bug with Harakat breaking the reshaping +* Wrote two small unit tests, more to come +* Moved letters and ligatures to separate files for readability/maintainability +* Moved package to its own folder for readability/maintainability + +### 2.0.5 + +Fix error message formatting + +### 2.0.4 + +Fix error message formatting + +### 2.0.3 + +Use `Exception` instead of `Error`. + +### 2.0.2 + +Use `pkg_resources.resource_filename` instead of depending on `__file__` to access `default-config.ini`. 
+ +### 2.0.1 + +Include default-config.ini in setup.py + ### 2.0.0 * Totally rewrote the code; diff --git a/arabic_reshaper/arabic_reshaper.py b/arabic_reshaper/arabic_reshaper.py index a7fff00..69e7486 100644 --- a/arabic_reshaper/arabic_reshaper.py +++ b/arabic_reshaper/arabic_reshaper.py @@ -165,6 +165,7 @@ def reshape(self, text): NOT_SUPPORTED = -1 delete_harakat = self.configuration.getboolean('delete_harakat') + delete_tatweel = self.configuration.getboolean('delete_tatweel') positions_harakat = {} for letter in text: @@ -174,6 +175,8 @@ def reshape(self, text): if position not in positions_harakat: positions_harakat[position] = [] positions_harakat[position].append(letter) + elif letter == TATWEEL and delete_tatweel: + pass elif letter not in LETTERS: output.append((letter, NOT_SUPPORTED)) elif not output: @@ -211,6 +214,11 @@ def reshape(self, text): if self.configuration.getboolean('support_ligatures'): # Clean text from Harakat to be able to find ligatures text = HARAKAT_RE.sub('', text) + + # Clean text from Tatweel to find ligatures if delete_tatweel + if delete_tatweel: + text = text.replace(TATWEEL, '') + for match in re.finditer(self._ligatures_re, text): group_index = next(( i for i, group in enumerate(match.groups()) if group diff --git a/arabic_reshaper/default-config.ini b/arabic_reshaper/default-config.ini index 48c6505..8a3ea5f 100644 --- a/arabic_reshaper/default-config.ini +++ b/arabic_reshaper/default-config.ini @@ -3,13 +3,16 @@ # More languages might be supported soon. language = Arabic +# Whether to delete the Harakat (Tashkeel) before reshaping or not. +delete_harakat = yes + +# Whether to delete the Tatweel (U+0640) before reshaping or not. +delete_tatweel = no + # Whether to use ligatures or not. # Serves as a shortcut to disable all ligatures. support_ligatures = yes -# Whether to delete the Harakat (Tashkeel) before reshaping or not. -delete_harakat = yes - # When `support_ligatures` is enabled. 
# Separate ligatures configuration take precedence over it. # When `support_ligatures` is disabled, diff --git a/arabic_reshaper/letters.py b/arabic_reshaper/letters.py index 3f1a482..7d46abe 100644 --- a/arabic_reshaper/letters.py +++ b/arabic_reshaper/letters.py @@ -20,6 +20,8 @@ MEDIAL = 2 FINAL = 3 +TATWEEL = '\u0640' + LETTERS = { # ARABIC LETTER HAMZA '\u0621': ('\uFE80', '', '', ''), @@ -74,7 +76,7 @@ # ARABIC LETTER GHAIN '\u063A': ('\uFECD', '\uFECF', '\uFED0', '\uFECE'), # ARABIC TATWEEL - '\u0640': ('\u0640', '\u0640', '\u0640', '\u0640'), + TATWEEL: (TATWEEL, TATWEEL, TATWEEL, TATWEEL), # ARABIC LETTER FEH '\u0641': ('\uFED1', '\uFED3', '\uFED4', '\uFED2'), # ARABIC LETTER QAF diff --git a/arabic_reshaper/tests/test_001_initialization.py b/arabic_reshaper/tests/test_001_initialization.py index 8a25859..b1c26f2 100644 --- a/arabic_reshaper/tests/test_001_initialization.py +++ b/arabic_reshaper/tests/test_001_initialization.py @@ -8,6 +8,12 @@ class TestDefaultConfiguration(unittest.TestCase): def setUp(self): self.reshaper = arabic_reshaper.ArabicReshaper() + def boolean_check(self, boolean): + self.assertIn(boolean, self.reshaper.configuration) + self.assertIsNotNone( + self.reshaper.configuration.getboolean(boolean) + ) + def test_configuration_exists(self): self.assertIsNotNone(self.reshaper.configuration) @@ -17,31 +23,22 @@ def test_language(self): self.assertTrue(self.reshaper.configuration['language']) def test_support_ligatures(self): - self.assertIn('support_ligatures', self.reshaper.configuration) - self.assertIsNotNone( - self.reshaper.configuration.getboolean('support_ligatures') - ) + self.boolean_check('support_ligatures') def test_delete_harakat(self): - self.assertIn('delete_harakat', self.reshaper.configuration) - self.assertIsNotNone( - self.reshaper.configuration.getboolean('delete_harakat') - ) + self.boolean_check('delete_harakat') + + def test_delete_tatweel(self): + self.boolean_check('delete_tatweel') def test_ligatures(self): 
import arabic_reshaper.ligatures for ligature in arabic_reshaper.ligatures.LIGATURES: if hasattr(self, 'subTest'): with self.subTest(ligature=ligature[0]): - self.assertIn(ligature[0], self.reshaper.configuration) - self.assertIsNotNone( - self.reshaper.configuration.getboolean(ligature[0]) - ) + self.boolean_check(ligature[0]) else: - self.assertIn(ligature[0], self.reshaper.configuration) - self.assertIsNotNone( - self.reshaper.configuration.getboolean(ligature[0]) - ) + self.boolean_check(ligature[0]) if __name__ == '__main__': unittest.main() diff --git a/arabic_reshaper/tests/test_002_reshaping.py b/arabic_reshaper/tests/test_002_reshaping.py index 3f86bca..c6e1951 100644 --- a/arabic_reshaper/tests/test_002_reshaping.py +++ b/arabic_reshaper/tests/test_002_reshaping.py @@ -6,21 +6,40 @@ import arabic_reshaper +def _reshaping_test(test): + for i, case in enumerate(test.cases): + def t(): test.assertEqual(case[1], test.reshaper.reshape(case[0])) + if hasattr(test, 'subTest'): + with test.subTest(i=i, case=case[0]): + t() + else: + t() + + class TestDefaultReshaping(unittest.TestCase): def setUp(self): self.reshaper = arabic_reshaper.default_reshaper self.cases = ( ('السلام عليكم', 'ﺍﻟﺴﻼﻡ ﻋﻠﻴﻜﻢ'), ('السَلَاْمٌ عَلَيْكُمْ', 'ﺍﻟﺴﻼﻡ ﻋﻠﻴﻜﻢ'), + ('اللغة العربية هي أكثر اللغات', 'ﺍﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ ﻫﻲ ﺃﻛﺜﺮ ﺍﻟﻠﻐﺎﺕ'), + ('تحدثاً ونطقاً ضمن مجموعة', 'ﺗﺤﺪﺛﺎ ﻭﻧﻄﻘﺎ ﺿﻤﻦ ﻣﺠﻤﻮﻋﺔ'), + ('اللغات السامية', 'ﺍﻟﻠﻐﺎﺕ ﺍﻟﺴﺎﻣﻴﺔ'), + ('العربية لغة رسمية في', 'ﺍﻟﻌﺮﺑﻴﺔ ﻟﻐﺔ ﺭﺳﻤﻴﺔ ﻓﻲ'), + ('كل دول الوطن العربي', 'ﻛﻞ ﺩﻭﻝ ﺍﻟﻮﻃﻦ ﺍﻟﻌﺮﺑﻲ'), + ('إضافة إلى كونها لغة', 'ﺇﺿﺎﻓﺔ ﺇﻟﻰ ﻛﻮﻧﻬﺎ ﻟﻐﺔ'), + ('رسمية في تشاد وإريتريا', 'ﺭﺳﻤﻴﺔ ﻓﻲ ﺗﺸﺎﺩ ﻭﺇﺭﻳﺘﺮﻳﺎ'), + ('وإسرائيل. وهي إحدى اللغات', 'ﻭﺇﺳﺮﺍﺋﻴﻞ. 
ﻭﻫﻲ ﺇﺣﺪﻯ ﺍﻟﻠﻐﺎﺕ'), + ('الرسمية الست في منظمة', 'ﺍﻟﺮﺳﻤﻴﺔ ﺍﻟﺴﺖ ﻓﻲ ﻣﻨﻈﻤﺔ'), + ('الأمم المتحدة، ويُحتفل', 'ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ، ﻭﻳﺤﺘﻔﻞ'), + ('باليوم العالمي للغة العربية', 'ﺑﺎﻟﻴﻮﻡ ﺍﻟﻌﺎﻟﻤﻲ ﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ'), + ('في 18 ديسمبر كذكرى اعتماد', 'ﻓﻲ 18 ﺩﻳﺴﻤﺒﺮ ﻛﺬﻛﺮﻯ ﺍﻋﺘﻤﺎﺩ'), + ('العربية بين لغات العمل في', 'ﺍﻟﻌﺮﺑﻴﺔ ﺑﻴﻦ ﻟﻐﺎﺕ ﺍﻟﻌﻤﻞ ﻓﻲ'), + ('الأمم المتحدة.', 'ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.'), ) def test_reshaping(self): - for i, case in enumerate(self.cases): - if hasattr(self, 'subTest'): - with self.subTest(i=i, case=case[0]): - self.assertEqual(case[1], self.reshaper.reshape(case[0])) - else: - self.assertEqual(case[1], self.reshaper.reshape(case[0])) + _reshaping_test(self) class TestReshapingWithHarakat(unittest.TestCase): @@ -30,15 +49,108 @@ def setUp(self): }) self.cases = ( ('السَلَاْمٌ عَلَيْكُمْ', 'ﺍﻟﺴَﻼَْﻡٌ ﻋَﻠَﻴْﻜُﻢْ'), + ('اللغة العربية هي أكثر اللغات', 'ﺍﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ ﻫﻲ ﺃﻛﺜﺮ ﺍﻟﻠﻐﺎﺕ'), + ('تحدثاً ونطقاً ضمن مجموعة', 'ﺗﺤﺪﺛﺎً ﻭﻧﻄﻘﺎً ﺿﻤﻦ ﻣﺠﻤﻮﻋﺔ'), + ('اللغات السامية', 'ﺍﻟﻠﻐﺎﺕ ﺍﻟﺴﺎﻣﻴﺔ'), + ('العربية لغة رسمية في', 'ﺍﻟﻌﺮﺑﻴﺔ ﻟﻐﺔ ﺭﺳﻤﻴﺔ ﻓﻲ'), + ('كل دول الوطن العربي', 'ﻛﻞ ﺩﻭﻝ ﺍﻟﻮﻃﻦ ﺍﻟﻌﺮﺑﻲ'), + ('إضافة إلى كونها لغة', 'ﺇﺿﺎﻓﺔ ﺇﻟﻰ ﻛﻮﻧﻬﺎ ﻟﻐﺔ'), + ('رسمية في تشاد وإريتريا', 'ﺭﺳﻤﻴﺔ ﻓﻲ ﺗﺸﺎﺩ ﻭﺇﺭﻳﺘﺮﻳﺎ'), + ('وإسرائيل. وهي إحدى اللغات', 'ﻭﺇﺳﺮﺍﺋﻴﻞ. 
ﻭﻫﻲ ﺇﺣﺪﻯ ﺍﻟﻠﻐﺎﺕ'), + ('الرسمية الست في منظمة', 'ﺍﻟﺮﺳﻤﻴﺔ ﺍﻟﺴﺖ ﻓﻲ ﻣﻨﻈﻤﺔ'), + ('الأمم المتحدة، ويُحتفل', 'ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ، ﻭﻳُﺤﺘﻔﻞ'), + ('باليوم العالمي للغة العربية', 'ﺑﺎﻟﻴﻮﻡ ﺍﻟﻌﺎﻟﻤﻲ ﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ'), + ('في 18 ديسمبر كذكرى اعتماد', 'ﻓﻲ 18 ﺩﻳﺴﻤﺒﺮ ﻛﺬﻛﺮﻯ ﺍﻋﺘﻤﺎﺩ'), + ('العربية بين لغات العمل في', 'ﺍﻟﻌﺮﺑﻴﺔ ﺑﻴﻦ ﻟﻐﺎﺕ ﺍﻟﻌﻤﻞ ﻓﻲ'), + ('الأمم المتحدة.', 'ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.'), + ) + + def test_reshaping(self): + _reshaping_test(self) + + +class TestReshapingWithHarakatWithoutLigatures(unittest.TestCase): + def setUp(self): + self.reshaper = arabic_reshaper.ArabicReshaper({ + 'delete_harakat': False, + 'support_ligatures': False + }) + self.cases = ( + ('السَلَاْمٌ عَلَيْكُمْ', 'ﺍﻟﺴَﻠَﺎْﻡٌ ﻋَﻠَﻴْﻜُﻢْ'), + ('اللغة العربية هي أكثر اللغات', 'ﺍﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ ﻫﻲ ﺃﻛﺜﺮ ﺍﻟﻠﻐﺎﺕ'), + ('تحدثاً ونطقاً ضمن مجموعة', 'ﺗﺤﺪﺛﺎً ﻭﻧﻄﻘﺎً ﺿﻤﻦ ﻣﺠﻤﻮﻋﺔ'), + ('اللغات السامية', 'ﺍﻟﻠﻐﺎﺕ ﺍﻟﺴﺎﻣﻴﺔ'), + ('العربية لغة رسمية في', 'ﺍﻟﻌﺮﺑﻴﺔ ﻟﻐﺔ ﺭﺳﻤﻴﺔ ﻓﻲ'), + ('كل دول الوطن العربي', 'ﻛﻞ ﺩﻭﻝ ﺍﻟﻮﻃﻦ ﺍﻟﻌﺮﺑﻲ'), + ('إضافة إلى كونها لغة', 'ﺇﺿﺎﻓﺔ ﺇﻟﻰ ﻛﻮﻧﻬﺎ ﻟﻐﺔ'), + ('رسمية في تشاد وإريتريا', 'ﺭﺳﻤﻴﺔ ﻓﻲ ﺗﺸﺎﺩ ﻭﺇﺭﻳﺘﺮﻳﺎ'), + ('وإسرائيل. وهي إحدى اللغات', 'ﻭﺇﺳﺮﺍﺋﻴﻞ. 
ﻭﻫﻲ ﺇﺣﺪﻯ ﺍﻟﻠﻐﺎﺕ'), + ('الرسمية الست في منظمة', 'ﺍﻟﺮﺳﻤﻴﺔ ﺍﻟﺴﺖ ﻓﻲ ﻣﻨﻈﻤﺔ'), + ('الأمم المتحدة، ويُحتفل', 'ﺍﻟﺄﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ، ﻭﻳُﺤﺘﻔﻞ'), + ('باليوم العالمي للغة العربية', 'ﺑﺎﻟﻴﻮﻡ ﺍﻟﻌﺎﻟﻤﻲ ﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ'), + ('في 18 ديسمبر كذكرى اعتماد', 'ﻓﻲ 18 ﺩﻳﺴﻤﺒﺮ ﻛﺬﻛﺮﻯ ﺍﻋﺘﻤﺎﺩ'), + ('العربية بين لغات العمل في', 'ﺍﻟﻌﺮﺑﻴﺔ ﺑﻴﻦ ﻟﻐﺎﺕ ﺍﻟﻌﻤﻞ ﻓﻲ'), + ('الأمم المتحدة.', 'ﺍﻟﺄﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.'), + ) + + def test_reshaping(self): + _reshaping_test(self) + + +class TestReshapingSomeLigatures(unittest.TestCase): + def setUp(self): + self.reshaper = arabic_reshaper.ArabicReshaper({ + 'delete_tatweel': True, + 'ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM': True, + 'ARABIC LIGATURE JALLAJALALOUHOU': True, + 'ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM': True, + 'ARABIC LIGATURE ALLAH ': True, + 'ARABIC LIGATURE AKBAR': True, + 'ARABIC LIGATURE ALAYHE': True, + 'ARABIC LIGATURE MOHAMMAD': True, + 'ARABIC LIGATURE RASOUL': True, + 'ARABIC LIGATURE SALAM': True, + 'ARABIC LIGATURE SALLA': True, + 'ARABIC LIGATURE WASALLAM': True, + }) + self.cases = ( + ('إِنَّهُ مِن سُلَيْمَانَ ' + 'وَإِنَّهُ بِسْمِ اللّـَهِ ' + 'الرَّحْمَـٰنِ الرَّحِيمِ ﴿٣٠﴾ ' + 'أَلَّا تَعْلُوا عَلَيَّ ' + 'وَأْتُونِي مُسْلِمِينَ ﴿٣١﴾', + + 'ﺇﻧﻪ ﻣﻦ ﺳﻠﻴﻤﺎﻥ ﻭﺇﻧﻪ ﷽ ﴿٣٠﴾ ' + 'ﺃﻻ ﺗﻌﻠﻮﺍ ﻋﻠﻲ ﻭﺃﺗﻮﻧﻲ ﻣﺴﻠﻤﻴﻦ ﴿٣١﴾'), + + ('فَذَكِّرْ إِنَّمَا أَنتَ' + ' مُذَكِّرٌ ﴿٢١﴾ لَّسْتَ' + ' عَلَيْهِم بِمُصَيْطِرٍ ﴿٢٢﴾' + ' إِلَّا مَن تَوَلَّىٰ' + ' وَكَفَرَ ﴿٢٣﴾ فَيُعَذِّبُهُ' + ' اللَّـهُ الْعَذَابَ' + ' الْأَكْبَرَ ﴿٢٤﴾', + + 'ﻓﺬﻛﺮ ﺇﻧﻤﺎ ﺃﻧﺖ' + ' ﻣﺬﻛﺮ ﴿٢١﴾ ﻟﺴﺖ' + ' ﻋﻠﻴﻬﻢ ﺑﻤﺼﻴﻄﺮ ﴿٢٢﴾' + ' ﺇﻻ ﻣﻦ ﺗﻮﻟﻰ' + ' ﻭﻛﻔﺮ ﴿٢٣﴾ ﻓﻴﻌﺬﺑﻪ' + ' ﷲ ﺍﻟﻌﺬﺍﺏ' + ' ﺍﻷﻛﺒﺮ ﴿٢٤﴾'), + + ('محمد رسول الله صلى الله عليه وسلم', + 'ﷴ ﷶ ﷲ ﷺ'), + + ('الله جل جلاله', + 'ﷲ ﷻ'), + + ('محمد رسول الله عليه صلى الله وسلم', + 'ﷴ ﷶ ﷲ ﷷ ﷹ ﷲ ﷸ'), ) def test_reshaping(self): - for i, case in enumerate(self.cases): - if hasattr(self, 'subTest'): - with self.subTest(i=i, case=case[0]): - self.assertEqual(case[1], self.reshaper.reshape(case[0])) - else: - self.assertEqual(case[1], self.reshaper.reshape(case[0])) 
+ _reshaping_test(self) if __name__ == '__main__': unittest.main() diff --git a/setup.py b/setup.py index 8a5e1af..87d88f9 100755 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ name="arabic_reshaper", description=("Reconstruct Arabic sentences to be used in" " applications that don't support Arabic"), - version='2.0.7', + version='2.0.8', platforms="ALL", license="GPL", packages=['arabic_reshaper'],