From 6d086ffc5be71220d527495ad384834e4f54f340 Mon Sep 17 00:00:00 2001 From: Matthew Barnett Date: Sat, 22 Jun 2024 18:06:08 +0100 Subject: [PATCH] Git issue 535: Regex fails Unicode 15.1 GraphemeBreakTest due to missing new GB9c rule implementation --- changelog.txt | 4 ++++ regex_3/_regex.c | 33 +++++++++++++++++++++++++++++++++ regex_3/_regex_unicode.h | 5 +++++ tools/build_regex_unicode.py | 10 ++++++++++ 4 files changed, 52 insertions(+) diff --git a/changelog.txt b/changelog.txt index 7075963..8989726 100644 --- a/changelog.txt +++ b/changelog.txt @@ -1,3 +1,7 @@ +Version: 2024.6.22 + + Git issue 535: Regex fails Unicode 15.1 GraphemeBreakTest due to missing new GB9c rule implementation + Version: 2024.5.15 Git issue 530: hangs with fuzzy and optionals diff --git a/regex_3/_regex.c b/regex_3/_regex.c index d129632..4312513 100644 --- a/regex_3/_regex.c +++ b/regex_3/_regex.c @@ -1803,6 +1803,7 @@ static BOOL unicode_at_grapheme_boundary(RE_State* state, Py_ssize_t text_pos) Py_UCS4 right_char; RE_UINT32 left_prop; RE_UINT32 right_prop; + RE_UINT32 prop; Py_ssize_t pos; /* Break at the start and end of text, unless the text is empty. */ @@ -1873,6 +1874,38 @@ static BOOL unicode_at_grapheme_boundary(RE_State* state, Py_ssize_t text_pos) if (left_prop == RE_GBREAK_PREPEND) return FALSE; + /* The GB9c rule only applies to extended grapheme clusters: Do not break + * within certain combinations with Indic_Conjunct_Break (InCB)=Linker. + */ + /* GB9c */ + if (re_get_indic_conjunct_break(right_char) == RE_INCB_CONSONANT) { + BOOL has_linker; + + has_linker = FALSE; + pos = left_pos; + + do { + prop = re_get_indic_conjunct_break(char_at(state->text, pos)); + + switch (prop) { + case RE_INCB_LINKER: + has_linker = TRUE; + break; + case RE_INCB_EXTEND: + break; + case RE_INCB_CONSONANT: + if (has_linker) + return FALSE; + goto end_GB9c; + default: + goto end_GB9c; + } + + --pos; + } while (pos >= state->text_start); + } + +end_GB9c: /* Do not break within emoji modifier sequences or emoji zwj sequences. */ /* GB11 */ if (left_prop == RE_GBREAK_ZWJ && re_get_extended_pictographic(right_char)) diff --git a/regex_3/_regex_unicode.h b/regex_3/_regex_unicode.h index a1c3b10..cbdf858 100644 --- a/regex_3/_regex_unicode.h +++ b/regex_3/_regex_unicode.h @@ -198,6 +198,11 @@ typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 codepoint); #define RE_LBREAK_REGIONALINDICATOR 46 #define RE_LBREAK_EMODIFIER 47 +#define RE_INCB_NONE 0 +#define RE_INCB_EXTEND 1 +#define RE_INCB_CONSONANT 2 +#define RE_INCB_LINKER 3 + extern char* re_strings[1506]; extern RE_Property re_properties[183]; extern RE_PropertyValue re_property_values[1651]; diff --git a/tools/build_regex_unicode.py b/tools/build_regex_unicode.py index f19f89b..664fa00 100644 --- a/tools/build_regex_unicode.py +++ b/tools/build_regex_unicode.py @@ -1733,6 +1733,16 @@ def make_key(names): h_file.write('\n') + val_list = unique(properties[munge('Indic_Conjunct_Break')]['values'].values(), + key=id) + values = [(value['id'], value['names'][0]) for value in val_list] + + for val_id, name in sorted(values): + h_file.write('#define RE_INCB_{} {}\n'.format(munge(name), + val_id)) + + h_file.write('\n') + h_file.write('extern char* re_strings[{}];\n'.format(unicode_data['string_count'])) h_file.write('extern RE_Property re_properties[{}];\n'.format(unicode_data['property_table_count'])) h_file.write('extern RE_PropertyValue re_property_values[{}];\n'.format(unicode_data['valueset_table_count']))