From 86bc129c2a71b486c86b1d28b639fb6eee85bc9e Mon Sep 17 00:00:00 2001 From: Juraj Michalak Date: Thu, 13 Oct 2022 15:57:38 +0200 Subject: [PATCH] fix issue #86 - a ligature was missed because regex matches of the ligature pattern in the string never overlap: when an earlier, overlapping ligature candidate was skipped due to a form mismatch, the later candidate was never found --- arabic_reshaper/arabic_reshaper.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arabic_reshaper/arabic_reshaper.py b/arabic_reshaper/arabic_reshaper.py index 4721a6a..a94cd0f 100644 --- a/arabic_reshaper/arabic_reshaper.py +++ b/arabic_reshaper/arabic_reshaper.py @@ -186,14 +186,17 @@ def reshape(self, text): if delete_tatweel: text = text.replace(TATWEEL, '') - for match in re.finditer(self._ligatures_re, text): + regex_start = 0 + matchIt = re.finditer(self._ligatures_re, text) + match = next(matchIt, None) + while match: group_index = next(( i for i, group in enumerate(match.groups()) if group ), -1) forms = self._get_ligature_forms_from_re_group_index( group_index ) - a, b = match.span() + a, b = tuple(i+regex_start for i in match.span()) a_form = output[a][FORM] b_form = output[b - 1][FORM] ligature_form = None @@ -218,9 +221,13 @@ def reshape(self, text): else: ligature_form = MEDIAL if not forms[ligature_form]: + regex_start = a+1 + matchIt = re.finditer(self._ligatures_re, text[regex_start:]) + match = next(matchIt, None) continue output[a] = (forms[ligature_form], NOT_SUPPORTED) output[a+1:b] = repeat(('', NOT_SUPPORTED), b - 1 - a) + match = next(matchIt, None) result = [] if not delete_harakat and -1 in positions_harakat: