From 91b761b027580a92ef54b9176d3646c5c60dbda7 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Tue, 2 Jan 2024 17:21:04 -0600 Subject: [PATCH] =?UTF-8?q?feat(core):=20fixup=20markers=20as=20regex=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - move prepend_marker out of line - replace bool for_regex with an enum - cleanup another instance of \b -> \u0008 #9121 --- core/src/ldml/ldml_transforms.cpp | 59 +++++++++++++----- core/src/ldml/ldml_transforms.hpp | 78 +++++++++--------------- core/tests/unit/ldml/test_transforms.cpp | 18 +++--- 3 files changed, 83 insertions(+), 72 deletions(-) diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index cf3085587bf..6f1ddcb0a5f 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -519,7 +519,7 @@ transform_entry::init() { // TODO-LDML: if we have mapFrom, may need to do other processing. std::u16string patstr = km::core::kmx::u32string_to_u16string(fFrom); // normalize, including markers, for regex - normalize_nfd_markers(patstr, true); + normalize_nfd_markers(patstr, regex_sentinel); UErrorCode status = U_ZERO_ERROR; /* const */ icu::UnicodeString patustr = icu::UnicodeString(patstr.data(), (int32_t)patstr.length()); // add '$' to match to end @@ -950,9 +950,9 @@ bool normalize_nfd(std::u16string &str) { return normalize(nfd, str, status); } -bool normalize_nfd_markers(std::u16string &str, marker_map &map, bool for_regex) { +bool normalize_nfd_markers(std::u16string &str, marker_map &map, marker_encoding encoding) { std::u32string rstr = km::core::kmx::u16string_to_u32string(str); - if(!normalize_nfd_markers(rstr, map, for_regex)) { + if(!normalize_nfd_markers(rstr, map, encoding)) { return false; } else { str = km::core::kmx::u32string_to_u16string(rstr); @@ -960,7 +960,7 @@ bool normalize_nfd_markers(std::u16string &str, marker_map &map, bool for_regex) } } -static void 
add_back_markers(std::u32string &str, const std::u32string &src, const marker_map &map, bool for_regex) { +static void add_back_markers(std::u32string &str, const std::u32string &src, const marker_map &map, marker_encoding encoding) { // need to reconstitute. marker_map map2(map); // make a copy of the map // clear the string @@ -970,7 +970,7 @@ static void add_back_markers(std::u32string &str, const std::u32string &src, con const auto ch = MARKER_BEFORE_EOT; const auto m = map2.find(ch); if (m != map2.end()) { - prepend_marker(str, m->second, for_regex); + prepend_marker(str, m->second, encoding); map2.erase(ch); // remove it } } @@ -981,7 +981,7 @@ static void add_back_markers(std::u32string &str, const std::u32string &src, con const auto m = map2.find(ch); if (m != map2.end()) { - prepend_marker(str, m->second, for_regex); + prepend_marker(str, m->second, encoding); map2.erase(ch); // remove it } } @@ -992,9 +992,9 @@ static void add_back_markers(std::u32string &str, const std::u32string &src, con * - doesn't support >1 marker per char - may need a set instead of a map! * - ideally this should be used on a normalization safe subsequence */ -bool normalize_nfd_markers(std::u32string &str, marker_map &map, bool for_regex) { +bool normalize_nfd_markers(std::u32string &str, marker_map &map, marker_encoding encoding) { /** original string, but no markers */ - std::u32string str_unmarked = remove_markers(str, map, for_regex); + std::u32string str_unmarked = remove_markers(str, map, encoding); /** original string, no markers, NFD */ std::u32string str_unmarked_nfd = str_unmarked; if(!normalize_nfd(str_unmarked_nfd)) { @@ -1006,14 +1006,14 @@ bool normalize_nfd_markers(std::u32string &str, marker_map &map, bool for_regex) // Normalization produced no change when markers were removed. // So, we'll call this a no-op. 
} else { - add_back_markers(str, str_unmarked_nfd, map, for_regex); + add_back_markers(str, str_unmarked_nfd, map, encoding); } return true; // all OK } -bool normalize_nfc_markers(std::u32string &str, marker_map &map, bool for_regex) { +bool normalize_nfc_markers(std::u32string &str, marker_map &map, marker_encoding encoding) { /** original string, but no markers */ - std::u32string str_unmarked = remove_markers(str, map, for_regex); + std::u32string str_unmarked = remove_markers(str, map, encoding); /** original string, no markers, NFC */ std::u32string str_unmarked_nfc = str_unmarked; if(!normalize_nfc(str_unmarked_nfc)) { @@ -1025,7 +1025,7 @@ bool normalize_nfc_markers(std::u32string &str, marker_map &map, bool for_regex) // Normalization produced no change when markers were removed. // So, we'll call this a no-op. } else { - add_back_markers(str, str_unmarked_nfc, map, for_regex); + add_back_markers(str, str_unmarked_nfc, map, encoding); } return true; // all OK } @@ -1048,6 +1048,36 @@ bool normalize_nfc(std::u16string &str) { return normalize(nfc, str, status); } +void +prepend_marker(std::u32string &str, KMX_DWORD marker, marker_encoding encoding) { + if (encoding == plain_sentinel) { + km_core_usv markstr[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, marker}; + str.insert(0, markstr, 3); + } else { + assert(encoding == regex_sentinel); + if (marker == LDML_MARKER_ANY_INDEX) { + // recreate the regex from back to front + str.insert(0, 1, U']'); + prepend_hex_quad(str, LDML_MARKER_MAX_INDEX); + str.insert(0, 1, U'u'); + str.insert(0, 1, U'\\'); + str.insert(0, 1, U'-'); + prepend_hex_quad(str, LDML_MARKER_MIN_INDEX); + str.insert(0, 1, U'u'); + str.insert(0, 1, U'\\'); + str.insert(0, 1, U'['); + str.insert(0, 1, LDML_MARKER_CODE); + str.insert(0, 1, LDML_UC_SENTINEL); + } else { + // add hex part + prepend_hex_quad(str, marker); + // add static part + km_core_usv markstr[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, u'\\', u'u'}; + str.insert(0, markstr, 4); + } + } 
+} + void prepend_hex_quad(std::u32string &str, KMX_DWORD marker) { for (auto i = 0; i < 4; i++) { @@ -1089,7 +1119,7 @@ KMX_DWORD parse_hex_quad(const km_core_usv hex_str[]) { return mark_no; } -std::u32string remove_markers(const std::u32string &str, marker_map *markers, bool for_regex) { +std::u32string remove_markers(const std::u32string &str, marker_map *markers, marker_encoding encoding) { std::u32string out; auto i = str.begin(); auto last = i; @@ -1116,11 +1146,12 @@ std::u32string remove_markers(const std::u32string &str, marker_map *markers, bo } KMX_DWORD marker_no; - if (!for_regex) { + if (encoding == plain_sentinel) { // #3 marker number marker_no = *i; i++; // if end, we'll break out of the loop } else { + assert(encoding == regex_sentinel); // is it an escape or a range? if (*i == U'\\') { if (++i == str.end()) { diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index 3dd3ab494b2..70f8b7b1f73 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -303,6 +303,14 @@ class transforms { /** indicates that the marker was before the end of text. */ const char32_t MARKER_BEFORE_EOT = km::core::kmx::Uni_FFFE_NONCHARACTER; +/** specify the type of encoding for marker text */ +enum marker_encoding { + /** encoding as UC_SENTINEL + CODE_DEADKEY + <marker> */ + plain_sentinel, + /** encoding as a regex matching the marker */ + regex_sentinel, +}; + /** map from following-char to marker number. 
*/ typedef std::map<char32_t, KMX_DWORD> marker_map; @@ -314,33 +322,33 @@ bool normalize_nfd(std::u16string &str); * @param markers will be populated with marker chars * @return false on failure **/ -bool normalize_nfd_markers(std::u32string &str, marker_map &markers, bool for_regex = false); -bool normalize_nfd_markers(std::u16string &str, marker_map &markers, bool for_regex = false); -inline bool normalize_nfd_markers(std::u32string &str, bool for_regex = false); -inline bool normalize_nfd_markers(std::u16string &str, bool for_regex = false); +bool normalize_nfd_markers(std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel); +bool normalize_nfd_markers(std::u16string &str, marker_map &markers, marker_encoding encoding = plain_sentinel); +inline bool normalize_nfd_markers(std::u32string &str, marker_encoding encoding = plain_sentinel); +inline bool normalize_nfd_markers(std::u16string &str, marker_encoding encoding = plain_sentinel); /** Normalize a u32string inplace to NFC, retaining markers. * @param markers will be populated with marker chars * @return false on failure **/ -bool normalize_nfc_markers(std::u32string &str, marker_map &markers, bool for_regex = false); -bool normalize_nfc_markers(std::u16string &str, marker_map &markers, bool for_regex = false); -inline bool normalize_nfc_markers(std::u32string &str, bool for_regex = false); -inline bool normalize_nfc_markers(std::u16string &str, bool for_regex = false); +bool normalize_nfc_markers(std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel); +bool normalize_nfc_markers(std::u16string &str, marker_map &markers, marker_encoding encoding = plain_sentinel); +inline bool normalize_nfc_markers(std::u32string &str, marker_encoding encoding = plain_sentinel); +inline bool normalize_nfc_markers(std::u16string &str, marker_encoding encoding = plain_sentinel); /** Normalize a u32string inplace to NFC. 
@return false on failure */ bool normalize_nfc(std::u32string &str); /** Normalize a u16string inplace to NFC. @return false on failure */ bool normalize_nfc(std::u16string &str); /** Remove markers and optionally note their glue characters in the map */ -std::u32string remove_markers(const std::u32string &str, marker_map *markers = nullptr, bool for_regex = false); +std::u32string remove_markers(const std::u32string &str, marker_map *markers = nullptr, marker_encoding encoding = plain_sentinel); /** same but with a reference */ -inline std::u32string remove_markers(const std::u32string &str, marker_map &markers, bool for_regex = false) { - return remove_markers(str, &markers, for_regex); +inline std::u32string remove_markers(const std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel) { + return remove_markers(str, &markers, encoding); } /** prepend the marker string in UC_SENTINEL format to the str */ -inline void prepend_marker(std::u32string &str, KMX_DWORD marker, bool for_regex = false); +void prepend_marker(std::u32string &str, KMX_DWORD marker, marker_encoding encoding = plain_sentinel); /** format 'marker' as 0001...FFFF and put it at the beginning of the string */ void prepend_hex_quad(std::u32string &str, KMX_DWORD marker); @@ -348,52 +356,24 @@ void prepend_hex_quad(std::u32string &str, KMX_DWORD marker); /** parse 0001...FFFF into a KMX_DWORD. 
Returns 0 on failure */ KMX_DWORD parse_hex_quad(const km_core_usv hex_str[]); -void -prepend_marker(std::u32string &str, KMX_DWORD marker, bool for_regex) { - if (!for_regex) { - km_core_usv markstr[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, marker}; - str.insert(0, markstr, 3); - } else { - if (marker == LDML_MARKER_ANY_INDEX) { - // recreate the regex from back to front - str.insert(0, 1, U']'); - prepend_hex_quad(str, LDML_MARKER_MAX_INDEX); - str.insert(0, 1, U'u'); - str.insert(0, 1, U'\\'); - str.insert(0, 1, U'-'); - prepend_hex_quad(str, LDML_MARKER_MIN_INDEX); - str.insert(0, 1, U'u'); - str.insert(0, 1, U'\\'); - str.insert(0, 1, U'['); - str.insert(0, 1, LDML_MARKER_CODE); - str.insert(0, 1, LDML_UC_SENTINEL); - } else { - // add hex part - prepend_hex_quad(str, marker); - // add static part - km_core_usv markstr[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, u'\\', u'u'}; - str.insert(0, markstr, 4); - } - } -} - -bool normalize_nfd_markers(std::u16string &str, bool for_regex) { +bool normalize_nfd_markers(std::u16string &str, marker_encoding encoding) { marker_map m; - return normalize_nfd_markers(str, m, for_regex); + return normalize_nfd_markers(str, m, encoding); } -bool normalize_nfc_markers(std::u16string &str, bool for_regex) { +bool normalize_nfc_markers(std::u16string &str, marker_encoding encoding) { marker_map m; - return normalize_nfc_markers(str, m, for_regex); + return normalize_nfc_markers(str, m, encoding); } -bool normalize_nfd_markers(std::u32string &str, bool for_regex) { + +bool normalize_nfd_markers(std::u32string &str, marker_encoding encoding) { marker_map m; - return normalize_nfd_markers(str, m, for_regex); + return normalize_nfd_markers(str, m, encoding); } -bool normalize_nfc_markers(std::u32string &str, bool for_regex) { +bool normalize_nfc_markers(std::u32string &str, marker_encoding encoding) { marker_map m; - return normalize_nfc_markers(str, m, for_regex); + return normalize_nfc_markers(str, m, encoding); } diff --git 
a/core/tests/unit/ldml/test_transforms.cpp b/core/tests/unit/ldml/test_transforms.cpp index 9a278d8051f..fae6eb33db4 100644 --- a/core/tests/unit/ldml/test_transforms.cpp +++ b/core/tests/unit/ldml/test_transforms.cpp @@ -770,11 +770,11 @@ int test_normalize() { } { - // u"4è\U0000ffff\b\U00000001̠" + // u"4è\U0000ffff\u0008\U00000001̠" marker_map map; std::cout << __FILE__ << ":" << __LINE__ << " - complex test 4a" << std::endl; - const std::u32string src = U"4e\u0300\uFFFF\b\u0001\u0320"; - const std::u32string expect = U"4e\uFFFF\b\u0001\u0320\u0300"; + const std::u32string src = U"4e\u0300\uFFFF\u0008\u0001\u0320"; + const std::u32string expect = U"4e\uFFFF\u0008\u0001\u0320\u0300"; std::u32string dst = src; assert(normalize_nfd_markers(dst, map)); if (dst != expect) { @@ -808,10 +808,10 @@ int test_normalize() { // from tests - regex edition marker_map map; std::cout << __FILE__ << ":" << __LINE__ << " - complex test 9c+regex" << std::endl; - const std::u32string src = U"9ce\u0300\uFFFF\b\\u0002\u0320\uFFFF\b\\u0001"; - const std::u32string expect = U"9ce\uFFFF\b\\u0002\u0320\u0300\uFFFF\b\\u0001"; + const std::u32string src = U"9ce\u0300\uFFFF\u0008\\u0002\u0320\uFFFF\u0008\\u0001"; + const std::u32string expect = U"9ce\uFFFF\u0008\\u0002\u0320\u0300\uFFFF\u0008\\u0001"; std::u32string dst = src; - assert(normalize_nfd_markers(dst, map, true)); // TODO-LDML: need regex flag + assert(normalize_nfd_markers(dst, map, regex_sentinel)); // markers in src use the regex encoding if (dst != expect) { std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl; std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; @@ -825,10 +825,10 @@ int test_normalize() { // from tests - regex edition marker_map map; std::cout << __FILE__ << ":" << __LINE__ << " - complex test \\m{.}" << std::endl; - const std::u32string src = U"9ce\u0300\uFFFF\b[\\u0001-\\uD7FE]\u0320\uFFFF\b\\u0001"; - const std::u32string expect = U"9ce\uFFFF\b[\\u0001-\\uD7FE]\u0320\u0300\uFFFF\b\\u0001"; + 
const std::u32string src = U"9ce\u0300\uFFFF\u0008[\\u0001-\\uD7FE]\u0320\uFFFF\u0008\\u0001"; + const std::u32string expect = U"9ce\uFFFF\u0008[\\u0001-\\uD7FE]\u0320\u0300\uFFFF\u0008\\u0001"; std::u32string dst = src; - assert(normalize_nfd_markers(dst, map, true)); // TODO-LDML: need regex flag + assert(normalize_nfd_markers(dst, map, regex_sentinel)); if (dst != expect) { std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl; std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl;