From 4108ecd2982ad09d6b979119fac72f83a13cffce Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 25 Jan 2024 17:45:28 -0600 Subject: [PATCH 1/2] =?UTF-8?q?feat(core):=20ldml=20marker=20normalization?= =?UTF-8?q?=20fix=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - decomp the 'glued' char before storing in map For: #10516 --- core/src/ldml/ldml_markers.cpp | 19 ++++- .../keyboards/k_008_transform_norm-test.xml | 38 +++++++++- .../ldml/keyboards/k_008_transform_norm.xml | 6 +- core/tests/unit/ldml/test_transforms.cpp | 73 +++++++++++++++++++ 4 files changed, 130 insertions(+), 6 deletions(-) diff --git a/core/src/ldml/ldml_markers.cpp b/core/src/ldml/ldml_markers.cpp index 10875c30ef7..51dc349925b 100644 --- a/core/src/ldml/ldml_markers.cpp +++ b/core/src/ldml/ldml_markers.cpp @@ -399,7 +399,8 @@ add_pending_markers( marker_map *markers, marker_list &last_markers, const std::u32string::const_iterator &last, - const std::u32string::const_iterator &end) { + const std::u32string::const_iterator &end, + const icu::Normalizer2 *nfd) { if(markers == nullptr || last_markers.empty()) { return; } @@ -407,8 +408,15 @@ add_pending_markers( if (last == end) { marker_ch = MARKER_BEFORE_EOT; } else { - marker_ch = *last; + icu::UnicodeString decomposition; + auto ch = *last; + if(!nfd->getDecomposition(ch, decomposition)) { + marker_ch = ch; + } else { + marker_ch = decomposition.char32At(0); // first UChar32 + } } + add_markers_to_map(*markers, marker_ch, last_markers); last_markers.clear(); // mark as already recorded } @@ -417,6 +425,9 @@ std::u32string remove_markers(const std::u32string &str, marker_map *markers, marker_encoding encoding) { std::u32string out; marker_list last_markers; + UErrorCode status = U_ZERO_ERROR; + const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); + UASSERT_SUCCESS(status); auto last = str.begin(); // points to the part of the string after the last matched marker for (auto i = str.begin(); i != str.end();) { @@ -425,7 +436,7 @@ remove_markers(const std::u32string &str, marker_map *markers, marker_encoding e // add any markers found before this entry, but only if there is intervening // text. This prevents the sentinel or the '\u' from becoming the attachment char. if (i != last) { - add_pending_markers(markers, last_markers, last, str.end()); + add_pending_markers(markers, last_markers, last, str.end(), nfd); out.append(last, i); // append any non-marker text since the end of the last marker last = i; // advance over text we've already appended } @@ -443,7 +454,7 @@ remove_markers(const std::u32string &str, marker_map *markers, marker_encoding e // add any remaining pending markers. // if last == str.end() then this wil be MARKER_BEFORE_EOT // otherwise it will be the glue character - add_pending_markers(markers, last_markers, last, str.end()); + add_pending_markers(markers, last_markers, last, str.end(), nfd); // get the suffix between the last marker and the end (could be nothing) out.append(last, str.end()); return out; diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml index 1611688b43c..7717ef5c30a 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml @@ -305,5 +305,41 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml index 2ed6a84bfea..aa121f66fdb 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml @@ -13,6 +13,7 @@ https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-keyboar + @@ -29,7 +30,7 @@ https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-keyboar - + @@ -84,6 +85,9 @@ https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-keyboar + + + diff --git a/core/tests/unit/ldml/test_transforms.cpp b/core/tests/unit/ldml/test_transforms.cpp index e6adc3c82f5..452c459cc05 100644 --- a/core/tests/unit/ldml/test_transforms.cpp +++ b/core/tests/unit/ldml/test_transforms.cpp @@ -950,11 +950,84 @@ test_normalize() { zassert_string_equal(dst_rem, expect_rem); std::u32string dst_nfd = src; assert(normalize_nfd_markers(dst_nfd)); + if (dst_nfd != expect_nfd) { + std::cout << "dst: " << Debug_UnicodeString(dst_nfd) << std::endl; + std::cout << "exp: " << Debug_UnicodeString(expect_nfd) << std::endl; + } + zassert_string_equal(dst_nfd, expect_nfd); + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - marker-before-NFC " << std::endl; + // KA \m O -> KA \m E AA + const std::u32string src = U"\u0995\uFFFF\u0008\u0001\u09CB"; + const std::u32string expect_rem = U"\u0995\u09CB"; + const std::u32string expect_nfd = U"\u0995\uFFFF\u0008\u0001\u09C7\u09BE"; + auto dst_rem = remove_markers(src, &map); + marker_map expm({{0x09C7, 0x1L}}); + zassert_string_equal(dst_rem, expect_rem); + std::u32string dst_nfd = src; + assert(normalize_nfd_markers(dst_nfd)); + if (dst_nfd != expect_nfd) { + std::cout << "dst: " << Debug_UnicodeString(dst_nfd) << std::endl; + std::cout << "exp: " << Debug_UnicodeString(expect_nfd) << std::endl; + } + zassert_string_equal(dst_nfd, expect_nfd); + assert_marker_map_equal(map, expm); + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - marker-before-NFC " << std::endl; + const std::u32string src = U"\u0995\u09BE\uFFFF\u0008\u0001\u09C7"; + const std::u32string expect_rem = U"\u0995\u09BE\u09C7"; + const std::u32string expect_nfd = src; // does not get reordered + auto dst_rem = remove_markers(src, &map); + marker_map expm({{0x09C7, 0x1L}}); + zassert_string_equal(dst_rem, expect_rem); + std::u32string dst_nfd = src; + assert(normalize_nfd_markers(dst_nfd)); if (dst_nfd != expect_nfd) { std::cout << "dst: " << Debug_UnicodeString(dst_nfd) << std::endl; std::cout << "exp: " << Debug_UnicodeString(expect_nfd) << std::endl; } zassert_string_equal(dst_nfd, expect_nfd); + assert_marker_map_equal(map, expm); + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - marker-before-NFC " << std::endl; + const std::u32string src = U"\u0995\u09BE\uFFFF\u0008\u0001\u09C7"; + const std::u32string expect_rem = U"\u0995\u09BE\u09C7"; + const std::u32string expect_nfd = src; // does not get reordered + auto dst_rem = remove_markers(src, &map); + marker_map expm({{0x09C7, 0x1L}}); + zassert_string_equal(dst_rem, expect_rem); + std::u32string dst_nfd = src; + assert(normalize_nfd_markers(dst_nfd)); + if (dst_nfd != expect_nfd) { + std::cout << "dst: " << Debug_UnicodeString(dst_nfd) << std::endl; + std::cout << "exp: " << Debug_UnicodeString(expect_nfd) << std::endl; + } + zassert_string_equal(dst_nfd, expect_nfd); + assert_marker_map_equal(map, expm); + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - marker-before-greek " << std::endl; + const std::u32string src = U"\u03B5\uFFFF\u0008\u0001\u0344"; + const std::u32string expect_rem = U"\u03B5\u0344"; + const std::u32string expect_nfd = U"\u03B5\uFFFF\u0008\u0001\u0308\u0301"; + auto dst_rem = remove_markers(src, &map); + marker_map expm({{0x0308, 0x1L}}); + zassert_string_equal(dst_rem, expect_rem); + std::u32string dst_nfd = src; + assert(normalize_nfd_markers(dst_nfd)); + if (dst_nfd != expect_nfd) { + std::cout << "dst: " << Debug_UnicodeString(dst_nfd) << std::endl; + std::cout << "exp: " << Debug_UnicodeString(expect_nfd) << std::endl; + } + zassert_string_equal(dst_nfd, expect_nfd); + assert_marker_map_equal(map, expm); } return EXIT_SUCCESS; From 580e372a7e04020a684acd5c8de5d8053c1c16e0 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Fri, 26 Jan 2024 14:45:54 -0600 Subject: [PATCH 2/2] =?UTF-8?q?feat(core):=20ldml=20marker=20normalization?= =?UTF-8?q?=20fix=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - document per code review For: #10516 --- core/src/ldml/ldml_markers.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/core/src/ldml/ldml_markers.cpp b/core/src/ldml/ldml_markers.cpp index 51dc349925b..1faed416cdd 100644 --- a/core/src/ldml/ldml_markers.cpp +++ b/core/src/ldml/ldml_markers.cpp @@ -381,7 +381,7 @@ KMX_DWORD parse_hex_quad(const km_core_usv hex_str[]) { } /** add the list to the map */ -static void add_markers_to_map(marker_map &markers, char32_t marker_ch, const marker_list &list) { +inline void add_markers_to_map(marker_map &markers, char32_t marker_ch, const marker_list &list) { for (auto i = list.begin(); i < list.end(); i++) { // marker_ch is duplicate, but keeps the structure more shallow. markers.emplace_back(marker_ch, *i); @@ -389,7 +389,7 @@ static void add_markers_to_map(marker_map &markers, char32_t marker_ch, const ma } /** - * Add any markers, if needed + * Add any markers, if needed. Inlined because we need to run it twice. * @param markers marker map or nullptr * @param last the 'last' parameter past the prior parsing * @param end end of the input string @@ -401,24 +401,34 @@ add_pending_markers( const std::u32string::const_iterator &last, const std::u32string::const_iterator &end, const icu::Normalizer2 *nfd) { + // quick check to see if there's no work to do. if(markers == nullptr || last_markers.empty()) { return; } + /** which character this marker is 'glued' to. */ char32_t marker_ch; if (last == end) { + // at end of text, so use a special value to indicate 'EOT'. marker_ch = MARKER_BEFORE_EOT; } else { icu::UnicodeString decomposition; - auto ch = *last; + auto ch = *last; // non-normalized character + + // if the character is composed, we need to use the first decomposed char + // as the 'glue'. if(!nfd->getDecomposition(ch, decomposition)) { + // char does not have a decomposition - so it may be used for the glue marker_ch = ch; } else { - marker_ch = decomposition.char32At(0); // first UChar32 + // 'glue' is the first codepoint of the decomposition. + marker_ch = decomposition.char32At(0); } } + // now, update the map with these markers (in order) on this character. add_markers_to_map(*markers, marker_ch, last_markers); - last_markers.clear(); // mark as already recorded + // clear the list + last_markers.clear(); } std::u32string