Skip to content

Commit

Permalink
More tests + delete_tatweel
Browse files Browse the repository at this point in the history
* Add `delete_tatweel`
* Add more test cases
  • Loading branch information
mpcabd committed Jun 17, 2017
1 parent 281a49e commit d0d74fc
Show file tree
Hide file tree
Showing 7 changed files with 198 additions and 41 deletions.
51 changes: 43 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ in the reshaped text, you should enable this option if you are going to pass
the reshaped text to `bidi.algorithm.get_display` because it will reverse the
text and you'd end up with harakat applied to the next letter instead of the
previous letter.
* `delete_tatweel` (Default `False`): When this is set to `True`, the reshaper
will delete the Tatweel character (U+0640) from the text before reshaping. This
can be useful when you want to support ligatures and don't care about Tatweel
getting deleted.

Besides the settings above, you can enable/disable supported ligatures. For a
full list of supported ligatures and their default status check the file
Expand All @@ -109,10 +113,10 @@ from arabic_reshaper import ArabicReshaper
configuration = {
'delete_harakat': False,
'support_ligatures': True,
'RIAL SIGN': True, # Replace ريال with ﷼
'RIAL SIGN': True, # Replace ر ي ا ل with ﷼
}
reshaper = ArabicReshaper(configuration=configuration)
text_to_be_reshaped = 'سعر المنتج ١٥٠ ريال'
text_to_be_reshaped = 'سعر المنتج ١٥٠ ر' + 'يال' # had to split the string for display
reshaped_text = reshaper.reshape(text_to_be_reshaped)
```

Expand All @@ -126,13 +130,8 @@ constructor's `configuration_file` parameter like this:

```
from arabic_reshaper import ArabicReshaper
configuration = {
'delete_harakat': False,
'support_ligatures': True,
'RIAL SIGN': True, # Replace ريال with ﷼
}
reshaper = ArabicReshaper(configuration_file='/path/to/your/config.ini')
text_to_be_reshaped = 'سعر المنتج ١٥٠ ريال'
text_to_be_reshaped = 'سعر المنتج ١٥٠ ر' + 'يال' # had to split the string for display
reshaped_text = reshaper.reshape(text_to_be_reshaped)
```

Expand Down Expand Up @@ -179,6 +178,42 @@ https://github.com/mpcabd/python-arabic-reshaper/tarball/master

## Version History

### 2.0.8

* Added `delete_tatweel`
* Added more test cases

### 2.0.7

* Fix tests for Python 2.7

### 2.0.6

* Fixed a bug with Harakat breaking the reshaping
* Wrote two small unit tests, more to come
* Moved letters and ligatures to separate files for readability/maintainability
* Moved package to its own folder for readability/maintainability

### 2.0.5

Fix error message formatting

### 2.0.4

Fix error message formatting

### 2.0.3

Use `Exception` instead of `Error`.

### 2.0.2

Use `pkg_resources.resource_filename` instead of depending on `__file__` to access `default-config.ini`.

### 2.0.1

Include default-config.ini in setup.py

### 2.0.0

* Totally rewrote the code;
Expand Down
8 changes: 8 additions & 0 deletions arabic_reshaper/arabic_reshaper.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ def reshape(self, text):
NOT_SUPPORTED = -1

delete_harakat = self.configuration.getboolean('delete_harakat')
delete_tatweel = self.configuration.getboolean('delete_tatweel')
positions_harakat = {}

for letter in text:
Expand All @@ -174,6 +175,8 @@ def reshape(self, text):
if position not in positions_harakat:
positions_harakat[position] = []
positions_harakat[position].append(letter)
elif letter == TATWEEL and delete_tatweel:
pass
elif letter not in LETTERS:
output.append((letter, NOT_SUPPORTED))
elif not output:
Expand Down Expand Up @@ -211,6 +214,11 @@ def reshape(self, text):
if self.configuration.getboolean('support_ligatures'):
# Clean text from Harakat to be able to find ligatures
text = HARAKAT_RE.sub('', text)

# Clean text from Tatweel to find ligatures if delete_tatweel
if delete_tatweel:
text = text.replace(TATWEEL, '')

for match in re.finditer(self._ligatures_re, text):
group_index = next((
i for i, group in enumerate(match.groups()) if group
Expand Down
9 changes: 6 additions & 3 deletions arabic_reshaper/default-config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,16 @@
# More languages might be supported soon.
language = Arabic

# Whether to delete the Harakat (Tashkeel) before reshaping or not.
delete_harakat = yes

# Whether to delete the Tatweel (U+0640) before reshaping or not.
delete_tatweel = no

# Whether to use ligatures or not.
# Serves as a shortcut to disable all ligatures.
support_ligatures = yes

# Whether to delete the Harakat (Tashkeel) before reshaping or not.
delete_harakat = yes

# When `support_ligatures` is enabled,
# separate ligature configurations take precedence over it.
# When `support_ligatures` is disabled,
Expand Down
4 changes: 3 additions & 1 deletion arabic_reshaper/letters.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
MEDIAL = 2
FINAL = 3

TATWEEL = '\u0640'

LETTERS = {
# ARABIC LETTER HAMZA
'\u0621': ('\uFE80', '', '', ''),
Expand Down Expand Up @@ -74,7 +76,7 @@
# ARABIC LETTER GHAIN
'\u063A': ('\uFECD', '\uFECF', '\uFED0', '\uFECE'),
# ARABIC TATWEEL
'\u0640': ('\u0640', '\u0640', '\u0640', '\u0640'),
TATWEEL: (TATWEEL, TATWEEL, TATWEEL, TATWEEL),
# ARABIC LETTER FEH
'\u0641': ('\uFED1', '\uFED3', '\uFED4', '\uFED2'),
# ARABIC LETTER QAF
Expand Down
29 changes: 13 additions & 16 deletions arabic_reshaper/tests/test_001_initialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ class TestDefaultConfiguration(unittest.TestCase):
def setUp(self):
self.reshaper = arabic_reshaper.ArabicReshaper()

def boolean_check(self, boolean):
    """Check that a boolean option is present in the configuration.

    ``boolean`` is the option name.  The check asserts that the name is a
    key of ``self.reshaper.configuration`` and that ``getboolean`` returns
    something other than ``None`` for it.
    """
    configuration = self.reshaper.configuration
    # Name the option in the failure message so that a failure raised from
    # a loop over many options identifies the offending one.
    self.assertIn(
        boolean, configuration,
        'option %r is missing from the configuration' % (boolean,)
    )
    self.assertIsNotNone(
        configuration.getboolean(boolean),
        'option %r has no boolean value' % (boolean,)
    )

def test_configuration_exists(self):
self.assertIsNotNone(self.reshaper.configuration)

Expand All @@ -17,31 +23,22 @@ def test_language(self):
self.assertTrue(self.reshaper.configuration['language'])

def test_support_ligatures(self):
self.assertIn('support_ligatures', self.reshaper.configuration)
self.assertIsNotNone(
self.reshaper.configuration.getboolean('support_ligatures')
)
self.boolean_check('support_ligatures')

def test_delete_harakat(self):
self.assertIn('delete_harakat', self.reshaper.configuration)
self.assertIsNotNone(
self.reshaper.configuration.getboolean('delete_harakat')
)
self.boolean_check('delete_harakat')

def test_delete_tatweel(self):
self.boolean_check('delete_tatweel')

def test_ligatures(self):
import arabic_reshaper.ligatures
for ligature in arabic_reshaper.ligatures.LIGATURES:
if hasattr(self, 'subTest'):
with self.subTest(ligature=ligature[0]):
self.assertIn(ligature[0], self.reshaper.configuration)
self.assertIsNotNone(
self.reshaper.configuration.getboolean(ligature[0])
)
self.boolean_check(ligature[0])
else:
self.assertIn(ligature[0], self.reshaper.configuration)
self.assertIsNotNone(
self.reshaper.configuration.getboolean(ligature[0])
)
self.boolean_check(ligature[0])

if __name__ == '__main__':
unittest.main()
136 changes: 124 additions & 12 deletions arabic_reshaper/tests/test_002_reshaping.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,40 @@
import arabic_reshaper


def _reshaping_test(test):
    """Assert that every case in ``test.cases`` reshapes as expected.

    ``test`` is a test-case object exposing ``cases`` (an iterable of
    sequences whose first item is the input text and whose second item is
    the expected output), ``reshaper`` (an object with a ``reshape``
    method) and ``assertEqual``.  When the object also provides
    ``subTest``, each case is checked inside its own ``subTest`` context,
    labelled with the case index and input text; otherwise the cases are
    checked directly and a failing assertion propagates out of the loop.
    """
    # Whether `subTest` is available does not change between cases, so look
    # it up once instead of on every iteration.
    sub_test = getattr(test, 'subTest', None)

    for i, case in enumerate(test.cases):
        text, expected = case[0], case[1]
        if sub_test is None:
            test.assertEqual(expected, test.reshaper.reshape(text))
        else:
            with sub_test(i=i, case=text):
                test.assertEqual(expected, test.reshaper.reshape(text))


class TestDefaultReshaping(unittest.TestCase):
def setUp(self):
self.reshaper = arabic_reshaper.default_reshaper
self.cases = (
('السلام عليكم', 'ﺍﻟﺴﻼﻡ ﻋﻠﻴﻜﻢ'),
('السَلَاْمٌ عَلَيْكُمْ', 'ﺍﻟﺴﻼﻡ ﻋﻠﻴﻜﻢ'),
('اللغة العربية هي أكثر اللغات', 'ﺍﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ ﻫﻲ ﺃﻛﺜﺮ ﺍﻟﻠﻐﺎﺕ'),
('تحدثاً ونطقاً ضمن مجموعة', 'ﺗﺤﺪﺛﺎ ﻭﻧﻄﻘﺎ ﺿﻤﻦ ﻣﺠﻤﻮﻋﺔ'),
('اللغات السامية', 'ﺍﻟﻠﻐﺎﺕ ﺍﻟﺴﺎﻣﻴﺔ'),
('العربية لغة رسمية في', 'ﺍﻟﻌﺮﺑﻴﺔ ﻟﻐﺔ ﺭﺳﻤﻴﺔ ﻓﻲ'),
('كل دول الوطن العربي', 'ﻛﻞ ﺩﻭﻝ ﺍﻟﻮﻃﻦ ﺍﻟﻌﺮﺑﻲ'),
('إضافة إلى كونها لغة', 'ﺇﺿﺎﻓﺔ ﺇﻟﻰ ﻛﻮﻧﻬﺎ ﻟﻐﺔ'),
('رسمية في تشاد وإريتريا', 'ﺭﺳﻤﻴﺔ ﻓﻲ ﺗﺸﺎﺩ ﻭﺇﺭﻳﺘﺮﻳﺎ'),
('وإسرائيل. وهي إحدى اللغات', 'ﻭﺇﺳﺮﺍﺋﻴﻞ. ﻭﻫﻲ ﺇﺣﺪﻯ ﺍﻟﻠﻐﺎﺕ'),
('الرسمية الست في منظمة', 'ﺍﻟﺮﺳﻤﻴﺔ ﺍﻟﺴﺖ ﻓﻲ ﻣﻨﻈﻤﺔ'),
('الأمم المتحدة، ويُحتفل', 'ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ، ﻭﻳﺤﺘﻔﻞ'),
('باليوم العالمي للغة العربية', 'ﺑﺎﻟﻴﻮﻡ ﺍﻟﻌﺎﻟﻤﻲ ﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ'),
('في 18 ديسمبر كذكرى اعتماد', 'ﻓﻲ 18 ﺩﻳﺴﻤﺒﺮ ﻛﺬﻛﺮﻯ ﺍﻋﺘﻤﺎﺩ'),
('العربية بين لغات العمل في', 'ﺍﻟﻌﺮﺑﻴﺔ ﺑﻴﻦ ﻟﻐﺎﺕ ﺍﻟﻌﻤﻞ ﻓﻲ'),
('الأمم المتحدة.', 'ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.'),
)

def test_reshaping(self):
for i, case in enumerate(self.cases):
if hasattr(self, 'subTest'):
with self.subTest(i=i, case=case[0]):
self.assertEqual(case[1], self.reshaper.reshape(case[0]))
else:
self.assertEqual(case[1], self.reshaper.reshape(case[0]))
_reshaping_test(self)


class TestReshapingWithHarakat(unittest.TestCase):
Expand All @@ -30,15 +49,108 @@ def setUp(self):
})
self.cases = (
('السَلَاْمٌ عَلَيْكُمْ', 'ﺍﻟﺴَﻼَْﻡٌ ﻋَﻠَﻴْﻜُﻢْ'),
('اللغة العربية هي أكثر اللغات', 'ﺍﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ ﻫﻲ ﺃﻛﺜﺮ ﺍﻟﻠﻐﺎﺕ'),
('تحدثاً ونطقاً ضمن مجموعة', 'ﺗﺤﺪﺛﺎً ﻭﻧﻄﻘﺎً ﺿﻤﻦ ﻣﺠﻤﻮﻋﺔ'),
('اللغات السامية', 'ﺍﻟﻠﻐﺎﺕ ﺍﻟﺴﺎﻣﻴﺔ'),
('العربية لغة رسمية في', 'ﺍﻟﻌﺮﺑﻴﺔ ﻟﻐﺔ ﺭﺳﻤﻴﺔ ﻓﻲ'),
('كل دول الوطن العربي', 'ﻛﻞ ﺩﻭﻝ ﺍﻟﻮﻃﻦ ﺍﻟﻌﺮﺑﻲ'),
('إضافة إلى كونها لغة', 'ﺇﺿﺎﻓﺔ ﺇﻟﻰ ﻛﻮﻧﻬﺎ ﻟﻐﺔ'),
('رسمية في تشاد وإريتريا', 'ﺭﺳﻤﻴﺔ ﻓﻲ ﺗﺸﺎﺩ ﻭﺇﺭﻳﺘﺮﻳﺎ'),
('وإسرائيل. وهي إحدى اللغات', 'ﻭﺇﺳﺮﺍﺋﻴﻞ. ﻭﻫﻲ ﺇﺣﺪﻯ ﺍﻟﻠﻐﺎﺕ'),
('الرسمية الست في منظمة', 'ﺍﻟﺮﺳﻤﻴﺔ ﺍﻟﺴﺖ ﻓﻲ ﻣﻨﻈﻤﺔ'),
('الأمم المتحدة، ويُحتفل', 'ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ، ﻭﻳُﺤﺘﻔﻞ'),
('باليوم العالمي للغة العربية', 'ﺑﺎﻟﻴﻮﻡ ﺍﻟﻌﺎﻟﻤﻲ ﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ'),
('في 18 ديسمبر كذكرى اعتماد', 'ﻓﻲ 18 ﺩﻳﺴﻤﺒﺮ ﻛﺬﻛﺮﻯ ﺍﻋﺘﻤﺎﺩ'),
('العربية بين لغات العمل في', 'ﺍﻟﻌﺮﺑﻴﺔ ﺑﻴﻦ ﻟﻐﺎﺕ ﺍﻟﻌﻤﻞ ﻓﻲ'),
('الأمم المتحدة.', 'ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.'),
)

def test_reshaping(self):
_reshaping_test(self)


class TestReshapingWithHarakatWithoutLigatures(unittest.TestCase):
def setUp(self):
self.reshaper = arabic_reshaper.ArabicReshaper({
'delete_harakat': False,
'support_ligatures': False
})
self.cases = (
('السَلَاْمٌ عَلَيْكُمْ', 'ﺍﻟﺴَﻠَﺎْﻡٌ ﻋَﻠَﻴْﻜُﻢْ'),
('اللغة العربية هي أكثر اللغات', 'ﺍﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ ﻫﻲ ﺃﻛﺜﺮ ﺍﻟﻠﻐﺎﺕ'),
('تحدثاً ونطقاً ضمن مجموعة', 'ﺗﺤﺪﺛﺎً ﻭﻧﻄﻘﺎً ﺿﻤﻦ ﻣﺠﻤﻮﻋﺔ'),
('اللغات السامية', 'ﺍﻟﻠﻐﺎﺕ ﺍﻟﺴﺎﻣﻴﺔ'),
('العربية لغة رسمية في', 'ﺍﻟﻌﺮﺑﻴﺔ ﻟﻐﺔ ﺭﺳﻤﻴﺔ ﻓﻲ'),
('كل دول الوطن العربي', 'ﻛﻞ ﺩﻭﻝ ﺍﻟﻮﻃﻦ ﺍﻟﻌﺮﺑﻲ'),
('إضافة إلى كونها لغة', 'ﺇﺿﺎﻓﺔ ﺇﻟﻰ ﻛﻮﻧﻬﺎ ﻟﻐﺔ'),
('رسمية في تشاد وإريتريا', 'ﺭﺳﻤﻴﺔ ﻓﻲ ﺗﺸﺎﺩ ﻭﺇﺭﻳﺘﺮﻳﺎ'),
('وإسرائيل. وهي إحدى اللغات', 'ﻭﺇﺳﺮﺍﺋﻴﻞ. ﻭﻫﻲ ﺇﺣﺪﻯ ﺍﻟﻠﻐﺎﺕ'),
('الرسمية الست في منظمة', 'ﺍﻟﺮﺳﻤﻴﺔ ﺍﻟﺴﺖ ﻓﻲ ﻣﻨﻈﻤﺔ'),
('الأمم المتحدة، ويُحتفل', 'ﺍﻟﺄﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ، ﻭﻳُﺤﺘﻔﻞ'),
('باليوم العالمي للغة العربية', 'ﺑﺎﻟﻴﻮﻡ ﺍﻟﻌﺎﻟﻤﻲ ﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ'),
('في 18 ديسمبر كذكرى اعتماد', 'ﻓﻲ 18 ﺩﻳﺴﻤﺒﺮ ﻛﺬﻛﺮﻯ ﺍﻋﺘﻤﺎﺩ'),
('العربية بين لغات العمل في', 'ﺍﻟﻌﺮﺑﻴﺔ ﺑﻴﻦ ﻟﻐﺎﺕ ﺍﻟﻌﻤﻞ ﻓﻲ'),
('الأمم المتحدة.', 'ﺍﻟﺄﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.'),
)

def test_reshaping(self):
_reshaping_test(self)


class TestReshapingSomeLigatures(unittest.TestCase):
def setUp(self):
self.reshaper = arabic_reshaper.ArabicReshaper({
'delete_tatweel': True,
'ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM': True,
'ARABIC LIGATURE JALLAJALALOUHOU': True,
'ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM': True,
'ARABIC LIGATURE ALLAH ': True,
'ARABIC LIGATURE AKBAR': True,
'ARABIC LIGATURE ALAYHE': True,
'ARABIC LIGATURE MOHAMMAD': True,
'ARABIC LIGATURE RASOUL': True,
'ARABIC LIGATURE SALAM': True,
'ARABIC LIGATURE SALLA': True,
'ARABIC LIGATURE WASALLAM': True,
})
self.cases = (
('إِنَّهُ مِن سُلَيْمَانَ '
'وَإِنَّهُ بِسْمِ اللّـَهِ '
'الرَّحْمَـٰنِ الرَّحِيمِ ﴿٣٠﴾ '
'أَلَّا تَعْلُوا عَلَيَّ '
'وَأْتُونِي مُسْلِمِينَ ﴿٣١﴾',

'ﺇﻧﻪ ﻣﻦ ﺳﻠﻴﻤﺎﻥ ﻭﺇﻧﻪ ﷽ ﴿٣٠﴾ '
'ﺃﻻ ﺗﻌﻠﻮﺍ ﻋﻠﻲ ﻭﺃﺗﻮﻧﻲ ﻣﺴﻠﻤﻴﻦ ﴿٣١﴾'),

('فَذَكِّرْ إِنَّمَا أَنتَ'
' مُذَكِّرٌ ﴿٢١﴾ لَّسْتَ'
' عَلَيْهِم بِمُصَيْطِرٍ ﴿٢٢﴾'
' إِلَّا مَن تَوَلَّىٰ'
' وَكَفَرَ ﴿٢٣﴾ فَيُعَذِّبُهُ'
' اللَّـهُ الْعَذَابَ'
' الْأَكْبَرَ ﴿٢٤﴾',

'ﻓﺬﻛﺮ ﺇﻧﻤﺎ ﺃﻧﺖ'
' ﻣﺬﻛﺮ ﴿٢١﴾ ﻟﺴﺖ'
' ﻋﻠﻴﻬﻢ ﺑﻤﺼﻴﻄﺮ ﴿٢٢﴾'
' ﺇﻻ ﻣﻦ ﺗﻮﻟﻰ'
' ﻭﻛﻔﺮ ﴿٢٣﴾ ﻓﻴﻌﺬﺑﻪ'
' ﷲ ﺍﻟﻌﺬﺍﺏ'
' ﺍﻷﻛﺒﺮ ﴿٢٤﴾'),

('محمد رسول الله صلى الله عليه وسلم',
'ﷴ ﷶ ﷲ ﷺ'),

('الله جل جلاله',
'ﷲ ﷻ'),

('محمد رسول الله عليه صلى الله وسلم',
'ﷴ ﷶ ﷲ ﷷ ﷹ ﷲ ﷸ'),
)

def test_reshaping(self):
for i, case in enumerate(self.cases):
if hasattr(self, 'subTest'):
with self.subTest(i=i, case=case[0]):
self.assertEqual(case[1], self.reshaper.reshape(case[0]))
else:
self.assertEqual(case[1], self.reshaper.reshape(case[0]))
_reshaping_test(self)

if __name__ == '__main__':
unittest.main()
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
name="arabic_reshaper",
description=("Reconstruct Arabic sentences to be used in"
" applications that don't support Arabic"),
version='2.0.7',
version='2.0.8',
platforms="ALL",
license="GPL",
packages=['arabic_reshaper'],
Expand Down

0 comments on commit d0d74fc

Please sign in to comment.