From 6d086ffc5be71220d527495ad384834e4f54f340 Mon Sep 17 00:00:00 2001
From: Matthew Barnett <git@mrabarnett.plus.com>
Date: Sat, 22 Jun 2024 18:06:08 +0100
Subject: [PATCH] Git issue 535: Regex fails Unicode 15.1 GraphemeBreakTest due
 to missing new GB9c rule implementation

---
 changelog.txt                |  4 ++++
 regex_3/_regex.c             | 33 +++++++++++++++++++++++++++++++++
 regex_3/_regex_unicode.h     |  5 +++++
 tools/build_regex_unicode.py | 10 ++++++++++
 4 files changed, 52 insertions(+)

diff --git a/changelog.txt b/changelog.txt
index 7075963..8989726 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -1,3 +1,7 @@
+Version: 2024.6.22
+
+    Git issue 535: Regex fails Unicode 15.1 GraphemeBreakTest due to missing new GB9c rule implementation
+
 Version: 2024.5.15
 
     Git issue 530: hangs with fuzzy and optionals
diff --git a/regex_3/_regex.c b/regex_3/_regex.c
index d129632..4312513 100644
--- a/regex_3/_regex.c
+++ b/regex_3/_regex.c
@@ -1803,6 +1803,7 @@ static BOOL unicode_at_grapheme_boundary(RE_State* state, Py_ssize_t text_pos)
     Py_UCS4 right_char;
     RE_UINT32 left_prop;
     RE_UINT32 right_prop;
+    RE_UINT32 prop;
     Py_ssize_t pos;
 
     /* Break at the start and end of text, unless the text is empty. */
@@ -1873,6 +1874,38 @@ static BOOL unicode_at_grapheme_boundary(RE_State* state, Py_ssize_t text_pos)
     if (left_prop == RE_GBREAK_PREPEND)
         return FALSE;
 
+    /* Extended grapheme clusters only: do not break between two InCB=Consonant
+     * characters joined by a run of InCB=Extend/Linker containing a Linker.
+     */
+    /* GB9c */
+    if (re_get_indic_conjunct_break(right_char) == RE_INCB_CONSONANT) {
+        BOOL has_linker;
+
+        has_linker = FALSE;
+        pos = left_pos;
+
+        do {
+            prop = re_get_indic_conjunct_break(char_at(state->text, pos));
+
+            switch (prop) {
+            case RE_INCB_LINKER:
+                has_linker = TRUE;
+                break;
+            case RE_INCB_EXTEND:
+                break;
+            case RE_INCB_CONSONANT:
+                if (has_linker)
+                    return FALSE;
+                goto end_GB9c;
+            default:
+                goto end_GB9c;
+            }
+
+            --pos;
+        } while (pos >= state->text_start);
+    }
+
+end_GB9c:
     /* Do not break within emoji modifier sequences or emoji zwj sequences. */
     /* GB11 */
     if (left_prop == RE_GBREAK_ZWJ && re_get_extended_pictographic(right_char))
diff --git a/regex_3/_regex_unicode.h b/regex_3/_regex_unicode.h
index a1c3b10..cbdf858 100644
--- a/regex_3/_regex_unicode.h
+++ b/regex_3/_regex_unicode.h
@@ -198,6 +198,11 @@ typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 codepoint);
 #define RE_LBREAK_REGIONALINDICATOR 46
 #define RE_LBREAK_EMODIFIER 47
 
+#define RE_INCB_NONE 0
+#define RE_INCB_EXTEND 1
+#define RE_INCB_CONSONANT 2
+#define RE_INCB_LINKER 3
+
 extern char* re_strings[1506];
 extern RE_Property re_properties[183];
 extern RE_PropertyValue re_property_values[1651];
diff --git a/tools/build_regex_unicode.py b/tools/build_regex_unicode.py
index f19f89b..664fa00 100644
--- a/tools/build_regex_unicode.py
+++ b/tools/build_regex_unicode.py
@@ -1733,6 +1733,16 @@ def make_key(names):
 
         h_file.write('\n')
 
+        val_list = unique(properties[munge('Indic_Conjunct_Break')]['values'].values(),
+          key=id)
+        values = [(value['id'], value['names'][0]) for value in val_list]
+
+        for val_id, name in sorted(values):
+            h_file.write('#define RE_INCB_{} {}\n'.format(munge(name),
+              val_id))
+
+        h_file.write('\n')
+
         h_file.write('extern char* re_strings[{}];\n'.format(unicode_data['string_count']))
         h_file.write('extern RE_Property re_properties[{}];\n'.format(unicode_data['property_table_count']))
         h_file.write('extern RE_PropertyValue re_property_values[{}];\n'.format(unicode_data['valueset_table_count']))