diff --git a/core/src/ldml/ldml_markers.cpp b/core/src/ldml/ldml_markers.cpp
index 10875c30ef7..1faed416cdd 100644
--- a/core/src/ldml/ldml_markers.cpp
+++ b/core/src/ldml/ldml_markers.cpp
@@ -381,7 +381,7 @@ KMX_DWORD parse_hex_quad(const km_core_usv hex_str[]) {
}
/** add the list to the map */
-static void add_markers_to_map(marker_map &markers, char32_t marker_ch, const marker_list &list) {
+inline void add_markers_to_map(marker_map &markers, char32_t marker_ch, const marker_list &list) {
for (auto i = list.begin(); i < list.end(); i++) {
// marker_ch is duplicate, but keeps the structure more shallow.
markers.emplace_back(marker_ch, *i);
@@ -389,7 +389,7 @@ static void add_markers_to_map(marker_map &markers, char32_t marker_ch, const ma
}
/**
- * Add any markers, if needed
+ * Add any markers, if needed. Inlined because we need to run it twice.
* @param markers marker map or nullptr
* @param last the 'last' parameter past the prior parsing
* @param end end of the input string
@@ -399,24 +399,45 @@ add_pending_markers(
marker_map *markers,
marker_list &last_markers,
const std::u32string::const_iterator &last,
- const std::u32string::const_iterator &end) {
+ const std::u32string::const_iterator &end,
+ const icu::Normalizer2 *nfd) {
+ // quick check to see if there's no work to do.
if(markers == nullptr || last_markers.empty()) {
return;
}
+ /** which character this marker is 'glued' to. */
char32_t marker_ch;
if (last == end) {
+ // at end of text, so use a special value to indicate 'EOT'.
marker_ch = MARKER_BEFORE_EOT;
} else {
- marker_ch = *last;
+ icu::UnicodeString decomposition;
+ auto ch = *last; // non-normalized character
+
+ // if the character is composed, we need to use the first decomposed char
+ // as the 'glue'.
+ if(!nfd->getDecomposition(ch, decomposition)) {
+ // char does not have a decomposition - so it may be used for the glue
+ marker_ch = ch;
+ } else {
+ // 'glue' is the first codepoint of the decomposition.
+ marker_ch = decomposition.char32At(0);
+ }
}
+
+ // now, update the map with these markers (in order) on this character.
add_markers_to_map(*markers, marker_ch, last_markers);
- last_markers.clear(); // mark as already recorded
+ // clear the list
+ last_markers.clear();
}
std::u32string
remove_markers(const std::u32string &str, marker_map *markers, marker_encoding encoding) {
std::u32string out;
marker_list last_markers;
+ UErrorCode status = U_ZERO_ERROR;
+ const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status);
+ UASSERT_SUCCESS(status);
auto last = str.begin(); // points to the part of the string after the last matched marker
for (auto i = str.begin(); i != str.end();) {
@@ -425,7 +446,7 @@ remove_markers(const std::u32string &str, marker_map *markers, marker_encoding e
// add any markers found before this entry, but only if there is intervening
// text. This prevents the sentinel or the '\u' from becoming the attachment char.
if (i != last) {
- add_pending_markers(markers, last_markers, last, str.end());
+ add_pending_markers(markers, last_markers, last, str.end(), nfd);
out.append(last, i); // append any non-marker text since the end of the last marker
last = i; // advance over text we've already appended
}
@@ -443,7 +464,7 @@ remove_markers(const std::u32string &str, marker_map *markers, marker_encoding e
// add any remaining pending markers.
// if last == str.end() then this wil be MARKER_BEFORE_EOT
// otherwise it will be the glue character
- add_pending_markers(markers, last_markers, last, str.end());
+ add_pending_markers(markers, last_markers, last, str.end(), nfd);
// get the suffix between the last marker and the end (could be nothing)
out.append(last, str.end());
return out;
diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml
index 1611688b43c..7717ef5c30a 100644
--- a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml
+++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml
@@ -305,5 +305,41 @@
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml
index 2ed6a84bfea..aa121f66fdb 100644
--- a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml
+++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml
@@ -13,6 +13,7 @@ https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-keyboar
+
@@ -29,7 +30,7 @@ https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-keyboar
-
+
@@ -84,6 +85,9 @@ https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-keyboar
+
+
+
diff --git a/core/tests/unit/ldml/test_transforms.cpp b/core/tests/unit/ldml/test_transforms.cpp
index e6adc3c82f5..452c459cc05 100644
--- a/core/tests/unit/ldml/test_transforms.cpp
+++ b/core/tests/unit/ldml/test_transforms.cpp
@@ -950,11 +950,84 @@ test_normalize() {
zassert_string_equal(dst_rem, expect_rem);
std::u32string dst_nfd = src;
assert(normalize_nfd_markers(dst_nfd));
+ if (dst_nfd != expect_nfd) {
+ std::cout << "dst: " << Debug_UnicodeString(dst_nfd) << std::endl;
+ std::cout << "exp: " << Debug_UnicodeString(expect_nfd) << std::endl;
+ }
+ zassert_string_equal(dst_nfd, expect_nfd);
+ }
+ {
+ marker_map map;
+ std::cout << __FILE__ << ":" << __LINE__ << " - marker-before-NFC " << std::endl;
+ // KA \m O -> KA \m E AA
+ const std::u32string src = U"\u0995\uFFFF\u0008\u0001\u09CB";
+ const std::u32string expect_rem = U"\u0995\u09CB";
+ const std::u32string expect_nfd = U"\u0995\uFFFF\u0008\u0001\u09C7\u09BE";
+ auto dst_rem = remove_markers(src, &map);
+ marker_map expm({{0x09C7, 0x1L}});
+ zassert_string_equal(dst_rem, expect_rem);
+ std::u32string dst_nfd = src;
+ assert(normalize_nfd_markers(dst_nfd));
+ if (dst_nfd != expect_nfd) {
+ std::cout << "dst: " << Debug_UnicodeString(dst_nfd) << std::endl;
+ std::cout << "exp: " << Debug_UnicodeString(expect_nfd) << std::endl;
+ }
+ zassert_string_equal(dst_nfd, expect_nfd);
+ assert_marker_map_equal(map, expm);
+ }
+ {
+ marker_map map;
+ std::cout << __FILE__ << ":" << __LINE__ << " - marker-before-NFC " << std::endl;
+ const std::u32string src = U"\u0995\u09BE\uFFFF\u0008\u0001\u09C7";
+ const std::u32string expect_rem = U"\u0995\u09BE\u09C7";
+ const std::u32string expect_nfd = src; // does not get reordered
+ auto dst_rem = remove_markers(src, &map);
+ marker_map expm({{0x09C7, 0x1L}});
+ zassert_string_equal(dst_rem, expect_rem);
+ std::u32string dst_nfd = src;
+ assert(normalize_nfd_markers(dst_nfd));
if (dst_nfd != expect_nfd) {
std::cout << "dst: " << Debug_UnicodeString(dst_nfd) << std::endl;
std::cout << "exp: " << Debug_UnicodeString(expect_nfd) << std::endl;
}
zassert_string_equal(dst_nfd, expect_nfd);
+ assert_marker_map_equal(map, expm);
+ }
+ {
+ marker_map map;
+ std::cout << __FILE__ << ":" << __LINE__ << " - marker-before-NFC " << std::endl;
+ const std::u32string src = U"\u0995\u09BE\uFFFF\u0008\u0001\u09C7";
+ const std::u32string expect_rem = U"\u0995\u09BE\u09C7";
+ const std::u32string expect_nfd = src; // does not get reordered
+ auto dst_rem = remove_markers(src, &map);
+ marker_map expm({{0x09C7, 0x1L}});
+ zassert_string_equal(dst_rem, expect_rem);
+ std::u32string dst_nfd = src;
+ assert(normalize_nfd_markers(dst_nfd));
+ if (dst_nfd != expect_nfd) {
+ std::cout << "dst: " << Debug_UnicodeString(dst_nfd) << std::endl;
+ std::cout << "exp: " << Debug_UnicodeString(expect_nfd) << std::endl;
+ }
+ zassert_string_equal(dst_nfd, expect_nfd);
+ assert_marker_map_equal(map, expm);
+ }
+ {
+ marker_map map;
+ std::cout << __FILE__ << ":" << __LINE__ << " - marker-before-greek " << std::endl;
+ const std::u32string src = U"\u03B5\uFFFF\u0008\u0001\u0344";
+ const std::u32string expect_rem = U"\u03B5\u0344";
+ const std::u32string expect_nfd = U"\u03B5\uFFFF\u0008\u0001\u0308\u0301";
+ auto dst_rem = remove_markers(src, &map);
+ marker_map expm({{0x0308, 0x1L}});
+ zassert_string_equal(dst_rem, expect_rem);
+ std::u32string dst_nfd = src;
+ assert(normalize_nfd_markers(dst_nfd));
+ if (dst_nfd != expect_nfd) {
+ std::cout << "dst: " << Debug_UnicodeString(dst_nfd) << std::endl;
+ std::cout << "exp: " << Debug_UnicodeString(expect_nfd) << std::endl;
+ }
+ zassert_string_equal(dst_nfd, expect_nfd);
+ assert_marker_map_equal(map, expm);
}
return EXIT_SUCCESS;