From b429dc3383e07be88b01142b28725a7477bda5ef Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Mon, 29 Jan 2024 11:48:07 -0600 Subject: [PATCH 1/3] =?UTF-8?q?feat(core):=20=20markers=20and=20reorder=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - reorders don't interact with markers, but must transit them - add updated tests - simplify reorder_group::apply(): the 'common prefix' discussion was out-of-date as std::mismatch is handled by the ldml_processor and the core context. - add_back_markers() promoted to SPI so it can be called from reordering. Fixes: #10516 --- core/src/ldml/ldml_markers.cpp | 2 +- core/src/ldml/ldml_markers.hpp | 2 + core/src/ldml/ldml_transforms.cpp | 42 +++++++++---------- .../ldml/keyboards/k_201_reorder_esk-test.xml | 42 +++++++++++++++++++ .../unit/ldml/keyboards/k_201_reorder_esk.xml | 15 ++++--- 5 files changed, 75 insertions(+), 28 deletions(-) diff --git a/core/src/ldml/ldml_markers.cpp b/core/src/ldml/ldml_markers.cpp index 1faed416cdd..38712c7f61c 100644 --- a/core/src/ldml/ldml_markers.cpp +++ b/core/src/ldml/ldml_markers.cpp @@ -58,7 +58,7 @@ bool normalize_nfd(std::u16string &str) { return normalize(nfd, str, status); } -static void add_back_markers(std::u32string &str, const std::u32string &src, marker_map &map, marker_encoding encoding) { +void add_back_markers(std::u32string &str, const std::u32string &src, marker_map &map, marker_encoding encoding) { // need to reconstitute. marker_map map2(map); // make a copy of the map // clear the string diff --git a/core/src/ldml/ldml_markers.hpp b/core/src/ldml/ldml_markers.hpp index 5527c851c7c..ff0aa057b9b 100644 --- a/core/src/ldml/ldml_markers.hpp +++ b/core/src/ldml/ldml_markers.hpp @@ -93,6 +93,8 @@ void prepend_hex_quad(std::u32string &str, KMX_DWORD marker); /** parse 0001...FFFF into a KMX_DWORD. Returns 0 on failure */ KMX_DWORD parse_hex_quad(const km_core_usv hex_str[]); +/** re-add markers */ +void add_back_markers(std::u32string &str, const std::u32string &src, marker_map &map, marker_encoding encoding); // bool normalize_nfc_markers(std::u16string &str, marker_encoding encoding) { // marker_map m; diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 9c0fd1f8d49..b4fa994de3d 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -331,15 +331,19 @@ reorder_group::apply(std::u32string &str) const { /** did we match anything */ bool some_match = false; + // markers need to 'pass thorugh' reorders. remove and re-add if needed + marker_map markers; + std::u32string out = remove_markers(str, markers, plain_sentinel); + // get a baseline sort key - auto sort_keys = reorder_sort_key::from(str); + auto sort_keys = reorder_sort_key::from(out); // apply ALL reorders in the group. for (const auto &r : list) { // work backward from end of string forward // That is, see if "abc" matches "abc" or "ab" or "a" - for (size_t s = str.size(); s > 0; s--) { - size_t submatch = r.match_end(str, 0, s); + for (size_t s = out.size(); s > 0; s--) { + size_t submatch = r.match_end(out, 0, s); if (submatch != 0) { #if KMXPLUS_DEBUG_TRANSFORM DebugTran("Matched: %S (off=%d, len=%d)", str.c_str(), 0, s); @@ -367,18 +371,6 @@ reorder_group::apply(std::u32string &str) const { } #endif - // TODO-LDML: for now, assume matches entire string. - // A needed optimization here would be to detect a common substring - // at the end of the old and new strings, and keep the match_len - // minimal. This could reduce thrash in core's context. - // However, the calling code does check for a common substring with mismatch() - size_t match_len = str.size(); - - // 'prefix' is the unmatched string before the match - // TODO-LDML: right now, this is empty, because match_len is the entire size. - std::u32string prefix = str; - prefix.resize(str.size() - match_len); // just the part before the matched part. - // Now, we need to actually do the sorting, but we must only sort // 'runs' beginning with 0-weight keys. @@ -420,7 +412,7 @@ reorder_group::apply(std::u32string &str) const { } // recombine into a string by pulling out the 'ch' value // that's in each sortkey element. - std::u32string newSuffix; + out.clear(); // will re-add all text signed char q = sort_keys.begin()->quaternary; // start with the first quaternary for (auto e = sort_keys.begin(); e < sort_keys.end(); e++, q++) { if (q != e->quaternary) { @@ -428,13 +420,12 @@ reorder_group::apply(std::u32string &str) const { applied = true; } // collect the characters - newSuffix.append(1, e->ch); + out.append(1, e->ch); } - if (applied) { - str.resize(prefix.size()); - str.append(newSuffix); - } else { + if (!applied) { DebugTran("Skip: sorting caused no reordering"); + // exit early to avoid string copying and possibly marker re-adding. + return false; // no change } #if KMXPLUS_DEBUG_TRANSFORM DebugTran("Sorted sortkey"); @@ -442,7 +433,14 @@ reorder_group::apply(std::u32string &str) const { r.dump(); } #endif - return applied; + if (markers.empty()) { + // no markers, just copy over to str + str = out; + } else { + // copy and re-add markers + add_back_markers(str, out, markers, plain_sentinel); + } + return true; // updated } transform_entry::transform_entry(const transform_entry &other) diff --git a/core/tests/unit/ldml/keyboards/k_201_reorder_esk-test.xml b/core/tests/unit/ldml/keyboards/k_201_reorder_esk-test.xml index e97ebd35b66..c1510ba1dae 100644 --- a/core/tests/unit/ldml/keyboards/k_201_reorder_esk-test.xml +++ b/core/tests/unit/ldml/keyboards/k_201_reorder_esk-test.xml @@ -21,4 +21,46 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/core/tests/unit/ldml/keyboards/k_201_reorder_esk.xml b/core/tests/unit/ldml/keyboards/k_201_reorder_esk.xml index 0b49446ab1f..5f3134ee05e 100644 --- a/core/tests/unit/ldml/keyboards/k_201_reorder_esk.xml +++ b/core/tests/unit/ldml/keyboards/k_201_reorder_esk.xml @@ -1,10 +1,5 @@ - - @@ -19,6 +14,10 @@ + + + + @@ -29,6 +28,12 @@ + + + + + + From 2c92cfea4bfe12cf52375bcad459e2a280515b84 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Wed, 31 Jan 2024 17:41:50 -0600 Subject: [PATCH 2/3] =?UTF-8?q?feat(core):=20reorder=20processing=20=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - update per code review - move quickcheck of map into add_back_markers() For: #10516 --- core/src/ldml/ldml_markers.cpp | 7 ++++--- core/src/ldml/ldml_transforms.cpp | 10 ++-------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/core/src/ldml/ldml_markers.cpp b/core/src/ldml/ldml_markers.cpp index 38712c7f61c..6840d47b2c5 100644 --- a/core/src/ldml/ldml_markers.cpp +++ b/core/src/ldml/ldml_markers.cpp @@ -59,6 +59,10 @@ bool normalize_nfd(std::u16string &str) { } void add_back_markers(std::u32string &str, const std::u32string &src, marker_map &map, marker_encoding encoding) { + if (map.empty()) { + // quick check, nothing to do if no markers + return; + } // need to reconstitute. marker_map map2(map); // make a copy of the map // clear the string @@ -110,9 +114,6 @@ bool normalize_nfd_markers_segment(std::u32string &str, marker_map &map, marker_ std::u32string str_unmarked_nfd = str_unmarked; if(!normalize_nfd(str_unmarked_nfd)) { return false; // normalize failed. - } else if (map.size() == 0) { - // no markers. Return the normalized unmarked str - str = str_unmarked_nfd; } else if (str_unmarked_nfd == str_unmarked) { // Normalization produced no change when markers were removed. // So, we'll call this a no-op. diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index b4fa994de3d..e383c2d0258 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -331,7 +331,7 @@ reorder_group::apply(std::u32string &str) const { /** did we match anything */ bool some_match = false; - // markers need to 'pass thorugh' reorders. remove and re-add if needed + // markers need to 'pass through' reorders. remove and re-add if needed marker_map markers; std::u32string out = remove_markers(str, markers, plain_sentinel); @@ -433,13 +433,7 @@ reorder_group::apply(std::u32string &str) const { r.dump(); } #endif - if (markers.empty()) { - // no markers, just copy over to str - str = out; - } else { - // copy and re-add markers - add_back_markers(str, out, markers, plain_sentinel); - } + add_back_markers(str, out, markers, plain_sentinel); return true; // updated } From ec03a681f7f958075770077ffb767ba48fa257b1 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Wed, 31 Jan 2024 22:16:50 -0600 Subject: [PATCH 3/3] =?UTF-8?q?feat(core):=20reorder=20processing=20=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - fix quickcheck of add_back_markers() For: #10516 --- core/src/ldml/ldml_markers.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/ldml/ldml_markers.cpp b/core/src/ldml/ldml_markers.cpp index 6840d47b2c5..13c711d9d06 100644 --- a/core/src/ldml/ldml_markers.cpp +++ b/core/src/ldml/ldml_markers.cpp @@ -61,6 +61,7 @@ bool normalize_nfd(std::u16string &str) { void add_back_markers(std::u32string &str, const std::u32string &src, marker_map &map, marker_encoding encoding) { if (map.empty()) { // quick check, nothing to do if no markers + str = src; return; } // need to reconstitute.