From efc8ef022f921482fd97f7d13fefe99d212829d0 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Fri, 13 Oct 2023 17:00:01 -0500 Subject: [PATCH 01/14] =?UTF-8?q?feat(core):=20ldml=20normalization=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - some failing marker tests For: #9468 --- .../keyboards/k_008_transform_norm-test.xml | 62 +++++++++++++++++++ .../ldml/keyboards/k_008_transform_norm.xml | 14 ++++- 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml index 025fe36a483..c1ff50973bd 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml @@ -133,4 +133,66 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml index 1162a7769e5..e47897b8581 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml @@ -15,6 +15,7 @@ from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-ke + @@ -26,7 +27,7 @@ from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-ke - + @@ -43,6 +44,17 @@ from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-ke + + + + + + + + + + + From 255960ecaed4a6e30c46be7910a96953d9c52e1d Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Fri, 20 Oct 2023 14:30:13 -0500 Subject: [PATCH 02/14] =?UTF-8?q?feat(core):=20ldml=20dx:=20dump=20vkey=20?= =?UTF-8?q?and=20modifier=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For: #9468 --- core/tests/unit/ldml/ldml.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/tests/unit/ldml/ldml.cpp b/core/tests/unit/ldml/ldml.cpp index eaf764e18eb..8397893033f 100644 --- a/core/tests/unit/ldml/ldml.cpp +++ b/core/tests/unit/ldml/ldml.cpp @@ -27,7 +27,7 @@ #include // for surrogate pair macros #include "ldml_test_source.hpp" - +#include "debuglog.h" namespace { bool g_beep_found = false; @@ -216,7 +216,8 @@ run_test(const km::core::path &source, const km::core::path &compiled, km::tests // handle backspace here if (action.type == km::tests::LDML_ACTION_KEY_EVENT) { auto &p = action.k; - std::cout << "- key action: 0x" << std::hex << p.vk << "/modifier 0x" << p.modifier_state << std::dec << std::endl; + std::cout << "- key action: " << km::kbp::kmx::Debug_VirtualKey(p.vk) << "/modifier " << km::kbp::kmx::Debug_ModifierName(p.modifier_state) << " 0x" << p.modifier_state + << std::dec << std::endl; // Because a normal system tracks caps lock state itself, // we mimic that in the tests. We assume caps lock state is // updated on key_down before the processor receives the From 015a738226d9169a3e6fd0360768a5d8510e7307 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Fri, 20 Oct 2023 14:38:59 -0500 Subject: [PATCH 03/14] =?UTF-8?q?feat(core):=20ldml=20marker=20normalizati?= =?UTF-8?q?on=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - add a new remove_markers(std::u32string) function - add test cases for text utils - update (failing) test cases for transform - improve documentation of append process - support KM_CORE_BT_UNKNOWN in ldml test - remove_markers with a map - update normalize test For: #9468 --- core/src/ldml/ldml_processor.cpp | 42 +++- core/src/ldml/ldml_transforms.cpp | 75 +++++++- core/src/ldml/ldml_transforms.hpp | 24 +++ .../keyboards/k_008_transform_norm-test.xml | 39 ++++ .../ldml/keyboards/k_008_transform_norm.xml | 8 + core/tests/unit/ldml/ldml.cpp | 23 ++- core/tests/unit/ldml/test_transforms.cpp | 179 +++++++++++++++++- 7 files changed, 359 insertions(+), 31 deletions(-) diff --git a/core/src/ldml/ldml_processor.cpp b/core/src/ldml/ldml_processor.cpp index f4bb2648f1e..41a731f9816 100644 --- a/core/src/ldml/ldml_processor.cpp +++ b/core/src/ldml/ldml_processor.cpp @@ -209,15 +209,20 @@ ldml_processor::process_event( // const size_t matchedContext = transforms->apply(ctxtstr, outputString); } KMX_DWORD last_char = 0UL; + KMX_DWORD last_marker = 0UL; // attempt to get the last char auto end = state->context().rbegin(); if(end != state->context().rend()) { if((*end).type == KM_CORE_CT_CHAR) { last_char = (*end).character; - // TODO-LDML: markers! + } else if((*end).type == KM_CORE_BT_MARKER) { + last_marker = (*end).marker; } } - if (last_char == 0UL) { + if (last_marker != 0UL) { + // delete of a marker + state->actions().push_backspace(KM_CORE_BT_MARKER, last_marker); + } else if (last_char == 0UL) { /* We couldn't find a character at end of context (context is empty), so we'll pass the backspace keystroke on to the app to process; the @@ -297,25 +302,44 @@ ldml_processor::process_key_string(km_core_state *state, const std::u16string &k // drop last 'matchedContext': ctxtstr.resize(ctxtstr.length() - matchedContext); ctxtstr.append(outputString); // TODO-LDML: should be able to do a normalization-safe append here. - assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: else fail? + assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: Need marker-safe normalize here. // Ok. We've done all the happy manipulations. /** NFC and no markers */ - std::u32string ctxtstr_cleanedup = ctxtstr; - // TODO-LDML: remove markers! - assert(ldml::normalize_nfc(ctxtstr_cleanedup)); // TODO-LDML: else fail? - - // find common prefix + std::u32string ctxtstr_cleanedup = ldml::remove_markers(ctxtstr); + assert(ldml::normalize_nfc(ctxtstr_cleanedup)); + + // find common prefix. + // For example, if the context previously had "aaBBBBB" and it is changing to "aaCCC" then we will have: + // - old_ctxtstr_changed = "BBBBB" + // - new_ctxtstr_changed = "CCC" + // So the BBBBB needs to be removed and then CCC added. auto ctxt_prefix = mismatch(old_ctxtstr_nfc.begin(), old_ctxtstr_nfc.end(), ctxtstr_cleanedup.begin(), ctxtstr_cleanedup.end()); - /** the part of the old str that changed */ + /** The part of the old string to be removed */ std::u32string old_ctxtstr_changed(ctxt_prefix.first,old_ctxtstr_nfc.end()); + /** The new context to be added */ std::u32string new_ctxtstr_changed(ctxt_prefix.second,ctxtstr_cleanedup.end()); // drop the old suffix. Note: this mutates old_ctxtstr_changed. remove_text(state, old_ctxtstr_changed, old_ctxtstr_changed.length()); assert(old_ctxtstr_changed.length() == 0); + // old_ctxtstr_changed is now empty because it's been removed. + // context is "aa" in the above example. emit_text(state, new_ctxtstr_changed); + + // TODO-LDML: need to emit marker here - need to emit text w/ markers, and handle appropriately. + // // TODO-LDML: 1-marker hack! + if (key_str32.length() == 3 && key_str32[0] == LDML_UC_SENTINEL && key_str32[1] == LDML_MARKER_CODE) { + state->context().push_marker(key_str32[2]); + } + + // for debugging! + { + std::u32string ctxtstr2; + (void)context_to_string(state, ctxtstr2, true); // with markers + (void)0; + } } void diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 6ae8feafc87..81817d2b34f 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -863,16 +863,6 @@ transforms::load( // string manipulation -bool normalize_nfd(std::u32string &str) { - std::u16string rstr = km::core::kmx::u32string_to_u16string(str); - if(!normalize_nfd(rstr)) { - return false; - } else { - str = km::core::kmx::u16string_to_u32string(rstr); - return true; - } -} - /** internal function to normalize with a specified mode */ static bool normalize(const icu::Normalizer2 *n, std::u16string &str, UErrorCode &status) { UASSERT_SUCCESS(status); @@ -886,6 +876,16 @@ static bool normalize(const icu::Normalizer2 *n, std::u16string &str, UErrorCode return U_SUCCESS(status); } +bool normalize_nfd(std::u32string &str) { + std::u16string rstr = km::core::kmx::u32string_to_u16string(str); + if(!normalize_nfd(rstr)) { + return false; + } else { + str = km::core::kmx::u16string_to_u32string(rstr); + return true; + } +} + bool normalize_nfd(std::u16string &str) { UErrorCode status = U_ZERO_ERROR; const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); @@ -893,6 +893,14 @@ bool normalize_nfd(std::u16string &str) { return normalize(nfd, str, status); } +bool normalize_nfd_markers(std::u16string &str, marker_map &_kmn_unused(map)) { + return normalize_nfd(str); +} + +bool normalize_nfd_markers(std::u32string &str, marker_map &_kmn_unused(map)) { + return normalize_nfd(str); +} + bool normalize_nfc(std::u32string &str) { std::u16string rstr = km::core::kmx::u32string_to_u16string(str); if(!normalize_nfc(rstr)) { @@ -910,6 +918,53 @@ bool normalize_nfc(std::u16string &str) { return normalize(nfc, str, status); } +std::u32string remove_markers(const std::u32string &str, marker_map *markers) { + std::u32string out; + auto i = str.begin(); + auto last = i; + for (i = find(i, str.end(), LDML_UC_SENTINEL); i != str.end(); i = find(i, str.end(), LDML_UC_SENTINEL)) { + // append any prefix (from prior pos'n to here) + out.append(last, i); + + // #1: LDML_UC_SENTINEL (what we searched for) + assert(*i == LDML_UC_SENTINEL); // assert that find() worked + i++; + last = i; + if (i == str.end()) { + break; // hit end + } + + // #2 LDML_MARKER_CODE + assert(*i == LDML_MARKER_CODE); + if (*i != LDML_MARKER_CODE) { + continue; // can't process this, get out + } + i++; + last = i; + if (i == str.end()) { + break; // hit end + } + + // #3 marker number + const KMX_DWORD marker_no = *i; + assert(marker_no >= LDML_MARKER_MIN_INDEX && marker_no <= LDML_MARKER_MAX_INDEX); + i++; // if end, we'll break out of the loop + last = i; + + // record the marker + if (markers != nullptr) { + if (i == str.end()) { + markers->emplace(MARKER_BEFORE_EOT, marker_no); + } else { + markers->emplace(*i, marker_no); + } + } + } + // get the suffix between the last marker and the end + out.append(last, str.end()); + return out; +} + } // namespace ldml } // namespace core } // namespace km diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index 044520ade5e..7e887a8fabf 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -8,6 +8,7 @@ #pragma once #include "kmx/kmx_plus.h" +#include "kmx/kmx_xstring.h" #include #include #include @@ -287,14 +288,37 @@ class transforms { // string routines +/** indicates that the marker was before the end of text. */ +const char32_t MARKER_BEFORE_EOT = km::kbp::kmx::Uni_FFFE_NONCHARACTER; + +/** map from following-char to marker number. */ +typedef std::map marker_map; + /** Normalize a u32string inplace to NFD. @return false on failure */ bool normalize_nfd(std::u32string &str); /** Normalize a u16string inplace to NFD. @return false on failure */ bool normalize_nfd(std::u16string &str); +/** Normalize a u32string inplace to NFD, retaining markers. + * @param markers will be populated with marker chars + * @return false on failure + **/ +bool normalize_nfd_markers(std::u32string &str, marker_map &markers); +/** Normalize a u16string inplace to NFD, retaining markers. + * @param markers will be populated with marker chars + * @return false on failure + **/ +bool normalize_nfd_markers(std::u16string &str, marker_map &markers); /** Normalize a u32string inplace to NFC. @return false on failure */ bool normalize_nfc(std::u32string &str); /** Normalize a u16string inplace to NFC. @return false on failure */ bool normalize_nfc(std::u16string &str); +/** Remove markers and optionally note their glue characters in the map */ +std::u32string remove_markers(const std::u32string &str, marker_map *markers = nullptr); +/** same but with a reference */ +inline std::u32string remove_markers(const std::u32string &str, marker_map &markers) { + return remove_markers(str, &markers); +} + } // namespace ldml } // namespace core diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml index c1ff50973bd..e24a512e8b5 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml @@ -195,4 +195,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml index e47897b8581..226890f854e 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml @@ -55,6 +55,14 @@ from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-ke + + + + + + + + diff --git a/core/tests/unit/ldml/ldml.cpp b/core/tests/unit/ldml/ldml.cpp index 8397893033f..ed3d0d40979 100644 --- a/core/tests/unit/ldml/ldml.cpp +++ b/core/tests/unit/ldml/ldml.cpp @@ -105,6 +105,7 @@ apply_action( assert(!context.empty()); assert(context.back().type == KM_CORE_CT_MARKER); context.pop_back(); + // no change to text store. } else if (text_store.length() > 0) { assert(!context.empty() && !text_store.empty()); km_core_usv ch = text_store.back(); @@ -118,11 +119,23 @@ apply_action( text_store.pop_back(); } } - assert(ch == act.backspace.expected_value); - - assert(context.back().type == KM_CORE_CT_CHAR); - assert(context.back().character == ch); - context.pop_back(); + if (act.backspace.expected_type == KM_CORE_BT_CHAR) { + assert(ch == act.backspace.expected_value); + assert(context.back().type == KM_CORE_CT_CHAR); + assert(context.back().character == ch); + context.pop_back(); + } else { + assert(act.backspace.expected_type == KM_CORE_BT_UNKNOWN); // else it must be unknown + // pop to first character or empty + while (!context.empty() && context.back().type != KM_CORE_CT_CHAR) { + // pop off all non-character entries + context.pop_back(); + } + if (!context.empty()) { + // pop off the next character + context.pop_back(); + } + } } break; case KM_CORE_IT_PERSIST_OPT: diff --git a/core/tests/unit/ldml/test_transforms.cpp b/core/tests/unit/ldml/test_transforms.cpp index cb0db12bfe4..57bdd849ee5 100644 --- a/core/tests/unit/ldml/test_transforms.cpp +++ b/core/tests/unit/ldml/test_transforms.cpp @@ -4,6 +4,7 @@ #include #include #include +#include "test_color.h" // TODO-LDML: normal asserts wern't working, so using some hacks. // #include "ldml_test_utils.hpp" @@ -11,13 +12,13 @@ // #include "debuglog.h" #ifndef zassert_string_equal -#define zassert_string_equal(actual, expected) \ - { \ - if (actual != expected) { \ - std::cerr << __FILE__ << ":" << __LINE__ << ": " \ - << "got: " << actual << " expected " << expected << std::endl; \ - return EXIT_FAILURE; \ - } \ +#define zassert_string_equal(actual, expected) \ + { \ + if (actual != expected) { \ + std::wcerr << __FILE__ << ":" << __LINE__ << ": " << console_color::fg(console_color::BRIGHT_RED) << "got: " << actual \ + << " expected " << expected << console_color::reset() << std::endl; \ + return EXIT_FAILURE; \ + } \ } #endif @@ -427,10 +428,167 @@ int test_map() { return EXIT_SUCCESS; } +int test_strutils() { + std::cout << "== " << __FUNCTION__ << std::endl; + + std::cout << __FILE__ << ":" << __LINE__ << " * remove_markers" << std::endl; + + + { + std::cout << __FILE__ << ":" << __LINE__ << " - basic test0" << std::endl; + const std::u32string src = U"abc"; + const std::u32string dst = remove_markers(src); + zassert_string_equal(dst, src); // unchanged + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - basic test" << std::endl; + const std::u32string src = U"abc"; + const std::u32string dst = remove_markers(src, map); + zassert_string_equal(dst, src); // unchanged + assert_equal(map.size(), 0); + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - marker test" << std::endl; + const std::u32string src = U"6\U0000ffff\U00000008\U00000001e"; + const std::u32string dst = remove_markers(src, map); + const std::u32string expect = U"6e"; + zassert_string_equal(dst, expect); + assert_equal(map.size(), 1); + assert_equal(map[U'e'], 0x1L); // marker 1 @ e + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - bad0" << std::endl; + const std::u32string src = U"6\U0000ffff\U00000008"; // missing trailing marker # + const std::u32string dst = remove_markers(src, map); + const std::u32string expect = U"6"; + zassert_string_equal(dst, expect); + assert_equal(map.size(), 0); + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - bad1" << std::endl; + const std::u32string src = U"6\U0000ffffq"; // missing code + const std::u32string dst = remove_markers(src, map); + const std::u32string expect = U"6q"; + zassert_string_equal(dst, expect); + assert_equal(map.size(), 0); + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - bad1" << std::endl; + const std::u32string src = U"6\U0000ffff"; // missing code + const std::u32string dst = remove_markers(src, map); + const std::u32string expect = U"6"; + zassert_string_equal(dst, expect); + assert_equal(map.size(), 0); + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - marker end test" << std::endl; + const std::u32string src = U"6\U0000ffff\U00000008\U00000001"; + const std::u32string dst = remove_markers(src, map); + const std::u32string expect = U"6"; + zassert_string_equal(dst, expect); + assert_equal(map.size(), 1); + assert_equal(map[MARKER_BEFORE_EOT], 0x1L); // marker 1 @ e + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - complex test" << std::endl; + const std::u32string src = U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300\U0000ffff\U00000008\U00000004"; + const std::u32string dst = remove_markers(src, map); + const std::u32string expect = U"6e\U00000320\U00000300"; + zassert_string_equal(dst, expect); + assert_equal(map.size(), 4); + assert_equal(map[U'e'], 0x1L); + assert_equal(map[0x0320], 0x2L); + assert_equal(map[0x0300], 0x3L); + assert_equal(map[MARKER_BEFORE_EOT], 0x4L); + } + + return EXIT_SUCCESS; +} + +int test_normalize() { + std::cout << "== " << __FUNCTION__ << std::endl; + + std::cout << __FILE__ << ":" << __LINE__ << " * normalize_nfd_markers" << std::endl; + + + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - noop test" << std::endl; + const std::u32string src = U"6e\U00000320\U00000300"; // already NFD + const std::u32string expect = src; + std::u32string dst = src; + assert(normalize_nfd_markers(dst, map)); + zassert_string_equal(dst, expect); + assert_equal(map.size(), 0); + } + + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - noop test w markers" << std::endl; + const std::u32string src = + U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300" + U"\U0000ffff\U00000008\U00000004"; + const std::u32string expect = + U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300" + U"\U0000ffff\U00000008\U00000004"; + std::u32string dst = src; + assert(normalize_nfd_markers(dst, map)); + zassert_string_equal(dst, expect); + assert_equal(map.size(), 4); + assert_equal(map[U'e'], 0x1L); + assert_equal(map[0x0320], 0x2L); + assert_equal(map[0x0300], 0x3L); + assert_equal(map[MARKER_BEFORE_EOT], 0x4L); + } + + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - complex test" << std::endl; + const std::u32string src = + U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300" + U"\U0000ffff\U00000008\U00000004"; + const std::u32string expect = + U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300" + U"\U0000ffff\U00000008\U00000004"; + std::u32string dst = src; + assert(normalize_nfd_markers(dst, map)); + zassert_string_equal(dst, expect); + assert_equal(map.size(), 4); + assert_equal(map[U'e'], 0x1L); + assert_equal(map[0x0320], 0x2L); + assert_equal(map[0x0300], 0x3L); + assert_equal(map[MARKER_BEFORE_EOT], 0x4L); + + } + + return EXIT_SUCCESS; +} + + int main(int argc, const char *argv[]) { int rc = EXIT_SUCCESS; + bool arg_color = false; + + int first_arg = 1; + + if (first_arg < argc) { + arg_color = std::string(argv[first_arg]) == "--color"; + if(arg_color) { + first_arg++; + } + } + + console_color::enabled = console_color::isaterminal() || arg_color; + if (test_transforms() != EXIT_SUCCESS) { rc = EXIT_FAILURE; } @@ -443,6 +601,13 @@ main(int argc, const char *argv[]) { rc = EXIT_FAILURE; } + if (test_strutils() != EXIT_SUCCESS) { + rc = EXIT_FAILURE; + } + + if (test_normalize() != EXIT_SUCCESS) { + rc = EXIT_FAILURE; + } if (rc == EXIT_FAILURE) { std::cout << "== FAILURE" << std::endl; From 0f0c03645145341246199c1cb97a37c62ef8c6f4 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Wed, 1 Nov 2023 16:07:57 -0500 Subject: [PATCH 04/14] =?UTF-8?q?chore(resources):=20ldml=20bn:=20add=20a?= =?UTF-8?q?=20test=20case=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - this sequence failed when run as a system kbd: (qfqf) Related to: #9916 --- .../ldml-keyboards/techpreview/test/bn-test.xml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/resources/standards-data/ldml-keyboards/techpreview/test/bn-test.xml b/resources/standards-data/ldml-keyboards/techpreview/test/bn-test.xml index ba0d7d84302..ad95223ca6e 100644 --- a/resources/standards-data/ldml-keyboards/techpreview/test/bn-test.xml +++ b/resources/standards-data/ldml-keyboards/techpreview/test/bn-test.xml @@ -2,7 +2,18 @@ - + + + + + + + + + + + + From a69feb583f3298485605bcf7b8cade9421472209 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Fri, 3 Nov 2023 12:20:02 -0500 Subject: [PATCH 05/14] =?UTF-8?q?feat(core):=20ldml=20marker=20normalizati?= =?UTF-8?q?on=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - km::kbp is soooo last month! - test_transforms can run NFD with markers, with some caveats. For: #9468 --- core/src/ldml/ldml_processor.cpp | 2 +- core/src/ldml/ldml_processor.hpp | 9 ----- core/src/ldml/ldml_transforms.cpp | 48 +++++++++++++++++++++++- core/src/ldml/ldml_transforms.hpp | 10 ++++- core/tests/unit/ldml/ldml.cpp | 2 +- core/tests/unit/ldml/test_transforms.cpp | 39 ++++++++++++++++--- 6 files changed, 91 insertions(+), 19 deletions(-) diff --git a/core/src/ldml/ldml_processor.cpp b/core/src/ldml/ldml_processor.cpp index 41a731f9816..cb1e8380314 100644 --- a/core/src/ldml/ldml_processor.cpp +++ b/core/src/ldml/ldml_processor.cpp @@ -457,7 +457,7 @@ ldml_processor::context_to_string(km_core_state *state, std::u32string &str, boo } else if (last_type == KM_CORE_BT_MARKER) { assert(km::core::kmx::is_valid_marker(c->marker)); if (include_markers) { - prepend_marker(str, c->marker); + ldml::prepend_marker(str, c->marker); } } else { break; diff --git a/core/src/ldml/ldml_processor.hpp b/core/src/ldml/ldml_processor.hpp index 79c6ae4ac4b..83274a44081 100644 --- a/core/src/ldml/ldml_processor.hpp +++ b/core/src/ldml/ldml_processor.hpp @@ -111,15 +111,6 @@ namespace core { */ static size_t context_to_string(km_core_state *state, std::u32string &str, bool include_markers = true); - /** prepend the marker string in UC_SENTINEL format to the str */ - inline static void prepend_marker(std::u32string &str, KMX_DWORD marker); }; - - void - ldml_processor::prepend_marker(std::u32string &str, KMX_DWORD marker) { - km_core_usv triple[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, marker}; - str.insert(0, triple, 3); - } - } // namespace core } // namespace km diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 81817d2b34f..3b3c7db9c3e 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -894,11 +894,55 @@ bool normalize_nfd(std::u16string &str) { } bool normalize_nfd_markers(std::u16string &str, marker_map &_kmn_unused(map)) { + // TODO-LDML return normalize_nfd(str); } -bool normalize_nfd_markers(std::u32string &str, marker_map &_kmn_unused(map)) { - return normalize_nfd(str); +/** + * TODO-LDML: + * - doesn't support >1 marker per char - may need a set instead of a map! + * - ideally this should be used on a normalization safe subsequence + */ +bool normalize_nfd_markers(std::u32string &str, marker_map &map) { + /** original string, but no markers */ + std::u32string str_unmarked = remove_markers(str, map); + /** original string, no markers, NFD */ + std::u32string str_unmarked_nfd = str_unmarked; + if(!normalize_nfd(str_unmarked_nfd)) { + return false; // normalize failed. + } else if (map.size() == 0) { + // no markers. Return the normalized unmarked str + str = str_unmarked_nfd; + } else if (str_unmarked_nfd == str_unmarked) { + // Normalization produced no change when markers were removed. + // So, we'll call this a no-op. + } else { + // need to reconstitute. + marker_map map2(map); // make a copy of the map + // clear the string + str.clear(); + // add the end-of-text marker + { + const auto ch = MARKER_BEFORE_EOT; + const auto m = map2.find(ch); + if (m != map2.end()) { + prepend_marker(str, m->second); + map2.erase(ch); // remove it + } + } + // go from end to beginning of string + for(auto p = str_unmarked_nfd.rbegin(); p != str_unmarked_nfd.rend(); p++) { + const auto ch = *p; + str.insert(0, 1, ch); // prepend + + const auto m = map2.find(ch); + if (m != map2.end()) { + prepend_marker(str, m->second); + map2.erase(ch); // remove it + } + } + } + return true; // all OK } bool normalize_nfc(std::u32string &str) { diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index 7e887a8fabf..df0da603bd2 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -289,7 +289,7 @@ class transforms { // string routines /** indicates that the marker was before the end of text. */ -const char32_t MARKER_BEFORE_EOT = km::kbp::kmx::Uni_FFFE_NONCHARACTER; +const char32_t MARKER_BEFORE_EOT = km::core::kmx::Uni_FFFE_NONCHARACTER; /** map from following-char to marker number. */ typedef std::map marker_map; @@ -319,6 +319,14 @@ inline std::u32string remove_markers(const std::u32string &str, marker_map &mark return remove_markers(str, &markers); } +/** prepend the marker string in UC_SENTINEL format to the str */ +inline static void prepend_marker(std::u32string &str, KMX_DWORD marker); + +void +prepend_marker(std::u32string &str, KMX_DWORD marker) { + km_core_usv triple[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, marker}; + str.insert(0, triple, 3); +} } // namespace ldml } // namespace core diff --git a/core/tests/unit/ldml/ldml.cpp b/core/tests/unit/ldml/ldml.cpp index ed3d0d40979..55229d88728 100644 --- a/core/tests/unit/ldml/ldml.cpp +++ b/core/tests/unit/ldml/ldml.cpp @@ -229,7 +229,7 @@ run_test(const km::core::path &source, const km::core::path &compiled, km::tests // handle backspace here if (action.type == km::tests::LDML_ACTION_KEY_EVENT) { auto &p = action.k; - std::cout << "- key action: " << km::kbp::kmx::Debug_VirtualKey(p.vk) << "/modifier " << km::kbp::kmx::Debug_ModifierName(p.modifier_state) << " 0x" << p.modifier_state + std::cout << "- key action: " << km::core::kmx::Debug_VirtualKey(p.vk) << "/modifier " << km::core::kmx::Debug_ModifierName(p.modifier_state) << " 0x" << p.modifier_state << std::dec << std::endl; // Because a normal system tracks caps lock state itself, // we mimic that in the tests. We assume caps lock state is diff --git a/core/tests/unit/ldml/test_transforms.cpp b/core/tests/unit/ldml/test_transforms.cpp index 57bdd849ee5..1f1da08d7c2 100644 --- a/core/tests/unit/ldml/test_transforms.cpp +++ b/core/tests/unit/ldml/test_transforms.cpp @@ -528,6 +528,16 @@ int test_normalize() { zassert_string_equal(dst, expect); assert_equal(map.size(), 0); } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - medium test" << std::endl; + const std::u32string src = U"6e\U00000300\U00000320"; // swapped + const std::u32string expect = U"6e\U00000320\U00000300"; // correct NFD + std::u32string dst = src; + assert(normalize_nfd_markers(dst, map)); + zassert_string_equal(dst, expect); + assert_equal(map.size(), 0); + } { marker_map map; @@ -551,12 +561,10 @@ int test_normalize() { { marker_map map; std::cout << __FILE__ << ":" << __LINE__ << " - complex test" << std::endl; - const std::u32string src = - U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300" - U"\U0000ffff\U00000008\U00000004"; + const std::u32string src = // already in order: 320+300 + U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300\U0000ffff\U00000008\U00000004"; const std::u32string expect = - U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300" - U"\U0000ffff\U00000008\U00000004"; + U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300\U0000ffff\U00000008\U00000004"; std::u32string dst = src; assert(normalize_nfd_markers(dst, map)); zassert_string_equal(dst, expect); @@ -566,6 +574,27 @@ int test_normalize() { assert_equal(map[0x0300], 0x3L); assert_equal(map[MARKER_BEFORE_EOT], 0x4L); + } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - complex test2" << std::endl; + const std::u32string src = // out of order, 300-320 + U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000300\U0000ffff\U00000008\U00000003\U00000320\U0000ffff\U00000008\U00000004"; + const std::u32string expect = + U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000003\U00000320\U0000ffff\U00000008\U00000002\U00000300\U0000ffff\U00000008\U00000004"; + std::u32string dst = src; + assert(normalize_nfd_markers(dst, map)); + if (dst != expect) { + std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl; + std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; + } + zassert_string_equal(dst, expect); + assert_equal(map.size(), 4); + assert_equal(map[U'e'], 0x1L); + assert_equal(map[0x0320], 0x3L); + assert_equal(map[0x0300], 0x2L); + assert_equal(map[MARKER_BEFORE_EOT], 0x4L); + } return EXIT_SUCCESS; From 72830edfad622dda8d04937f338ad157e6432900 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Mon, 6 Nov 2023 16:28:29 -0600 Subject: [PATCH 06/14] Apply suggestions from code review Co-authored-by: Marc Durdin --- core/tests/unit/ldml/test_transforms.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/core/tests/unit/ldml/test_transforms.cpp b/core/tests/unit/ldml/test_transforms.cpp index 1f1da08d7c2..e4d84f3c85b 100644 --- a/core/tests/unit/ldml/test_transforms.cpp +++ b/core/tests/unit/ldml/test_transforms.cpp @@ -545,9 +545,7 @@ int test_normalize() { const std::u32string src = U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300" U"\U0000ffff\U00000008\U00000004"; - const std::u32string expect = - U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300" - U"\U0000ffff\U00000008\U00000004"; + const std::u32string expect = src; std::u32string dst = src; assert(normalize_nfd_markers(dst, map)); zassert_string_equal(dst, expect); @@ -563,8 +561,7 @@ int test_normalize() { std::cout << __FILE__ << ":" << __LINE__ << " - complex test" << std::endl; const std::u32string src = // already in order: 320+300 U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300\U0000ffff\U00000008\U00000004"; - const std::u32string expect = - U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300\U0000ffff\U00000008\U00000004"; + const std::u32string expect = src; std::u32string dst = src; assert(normalize_nfd_markers(dst, map)); zassert_string_equal(dst, expect); From 78c2ac4aec78cd6b796a7834f23527dce0e63d8d Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Mon, 6 Nov 2023 18:23:40 -0600 Subject: [PATCH 07/14] =?UTF-8?q?feat(core):=20ldml=20marker=20normalizati?= =?UTF-8?q?on=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - refactor out backspace processing into a function. - for now, just drop any markers in the context when we're lopping off the end For: #9468 --- core/src/ldml/ldml_processor.cpp | 116 +++++++++++++----------------- core/src/ldml/ldml_processor.hpp | 3 + core/src/ldml/ldml_transforms.cpp | 72 ++++++++++++------- core/src/ldml/ldml_transforms.hpp | 32 ++++++++- core/tests/unit/ldml/ldml.cpp | 15 ++++ 5 files changed, 144 insertions(+), 94 deletions(-) diff --git a/core/src/ldml/ldml_processor.cpp b/core/src/ldml/ldml_processor.cpp index cb1e8380314..9c58a40d19f 100644 --- a/core/src/ldml/ldml_processor.cpp +++ b/core/src/ldml/ldml_processor.cpp @@ -196,47 +196,7 @@ ldml_processor::process_event( switch (vk) { // Special handling for backspace VK case KM_CORE_VKEY_BKSP: - { - if (!!bksp_transforms) { - // TODO-LDML: process bksp - // std::u16string outputString; - // // don't bother if no backspace transforms! - // // TODO-LDML: unroll ctxt into a str - // std::u16string ctxtstr; - // for (size_t i = 0; i < ctxt.size(); i++) { - // ctxtstr.append(ctxt[i]); - // } - // const size_t matchedContext = transforms->apply(ctxtstr, outputString); - } - KMX_DWORD last_char = 0UL; - KMX_DWORD last_marker = 0UL; - // attempt to get the last char - auto end = state->context().rbegin(); - if(end != state->context().rend()) { - if((*end).type == KM_CORE_CT_CHAR) { - last_char = (*end).character; - } else if((*end).type == KM_CORE_BT_MARKER) { - last_marker = (*end).marker; - } - } - if (last_marker != 0UL) { - // delete of a marker - state->actions().push_backspace(KM_CORE_BT_MARKER, last_marker); - } else if (last_char == 0UL) { - /* - We couldn't find a character at end of context (context is empty), - so we'll pass the backspace keystroke on to the app to process; the - app might want to use backspace to move between contexts or delete - a text box, etc. Or it might be a legacy app and we've had our caret - dumped in somewhere unknown, so we will have to depend on the app to - be sensible about backspacing because we know nothing. - */ - state->actions().push_backspace(KM_CORE_BT_UNKNOWN); - } else { - state->actions().push_backspace(KM_CORE_BT_CHAR, last_char); - state->context().pop_back(); - } - } + process_backspace(state); break; default: // all other VKs @@ -264,6 +224,44 @@ ldml_processor::process_event( return KM_CORE_STATUS_OK; } +void +ldml_processor::process_backspace(km_core_state *state) const { + if (!!bksp_transforms) { + // TODO-LDML: process bksp + // std::u16string outputString; + // // don't bother if no backspace transforms! + // // TODO-LDML: unroll ctxt into a str + // std::u16string ctxtstr; + // for (size_t i = 0; i < ctxt.size(); i++) { + // ctxtstr.append(ctxt[i]); + // } + // const size_t matchedContext = transforms->apply(ctxtstr, outputString); + } + + // Find out what the last actual character was and remove it. + // attempt to get the last char + auto end = state->context().rbegin(); + if (end != state->context().rend()) { + if ((*end).type == KM_CORE_CT_CHAR) { + state->actions().push_backspace(KM_CORE_BT_CHAR, (*end).character); + state->context().pop_back(); + return; + } else if ((*end).type == KM_CORE_BT_MARKER) { + state->actions().push_backspace(KM_CORE_BT_MARKER, (*end).marker); + state->context().pop_back(); + } + } + /* + We couldn't find a character at end of context (context is empty), + so we'll pass the backspace keystroke on to the app to process; the + app might want to use backspace to move between contexts or delete + a text box, etc. Or it might be a legacy app and we've had our caret + dumped in somewhere unknown, so we will have to depend on the app to + be sensible about backspacing because we know nothing. + */ + state->actions().push_backspace(KM_CORE_BT_UNKNOWN); +} + void ldml_processor::process_key_string(km_core_state *state, const std::u16string &key_str) const { // We know that key_str is not empty per the caller. @@ -272,19 +270,18 @@ ldml_processor::process_key_string(km_core_state *state, const std::u16string &k // we convert the keys str to UTF-32 here instead of using the emit_text() overload // so that we don't have to reconvert it inside the transform code. std::u32string key_str32 = kmx::u16string_to_u32string(key_str); - assert(ldml::normalize_nfd(key_str32)); // TODO-LDML: else fail? - + assert(ldml::normalize_nfd_markers(key_str32)); // TODO-LDML: else fail? // extract context string, in NFC std::u32string old_ctxtstr_nfc; (void)context_to_string(state, old_ctxtstr_nfc, false); - assert(ldml::normalize_nfc(old_ctxtstr_nfc)); // TODO-LDML: else fail? + assert(ldml::normalize_nfc_markers(old_ctxtstr_nfc)); // TODO-LDML: else fail? // context string in NFD std::u32string ctxtstr; (void)context_to_string(state, ctxtstr, true); // with markers // add the newly added key output to ctxtstr ctxtstr.append(key_str32); - assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: else fail? + assert(ldml::normalize_nfd_markers(ctxtstr)); // TODO-LDML: else fail? /** transform output string */ std::u32string outputString; @@ -302,13 +299,14 @@ ldml_processor::process_key_string(km_core_state *state, const std::u16string &k // drop last 'matchedContext': ctxtstr.resize(ctxtstr.length() - matchedContext); ctxtstr.append(outputString); // TODO-LDML: should be able to do a normalization-safe append here. - assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: Need marker-safe normalize here. + ldml::marker_map markers; + assert(ldml::normalize_nfd_markers(ctxtstr, markers)); // TODO-LDML: Need marker-safe normalize here. // Ok. We've done all the happy manipulations. /** NFC and no markers */ std::u32string ctxtstr_cleanedup = ldml::remove_markers(ctxtstr); - assert(ldml::normalize_nfc(ctxtstr_cleanedup)); + assert(ldml::normalize_nfc_markers(ctxtstr_cleanedup)); // find common prefix. // For example, if the context previously had "aaBBBBB" and it is changing to "aaCCC" then we will have: @@ -329,16 +327,9 @@ ldml_processor::process_key_string(km_core_state *state, const std::u16string &k emit_text(state, new_ctxtstr_changed); // TODO-LDML: need to emit marker here - need to emit text w/ markers, and handle appropriately. - // // TODO-LDML: 1-marker hack! + // // TODO-LDML: 1-marker hack! need to support a string with intermixed markers. if (key_str32.length() == 3 && key_str32[0] == LDML_UC_SENTINEL && key_str32[1] == LDML_MARKER_CODE) { - state->context().push_marker(key_str32[2]); - } - - // for debugging! - { - std::u32string ctxtstr2; - (void)context_to_string(state, ctxtstr2, true); // with markers - (void)0; + emit_marker(state, key_str32[2]); } } @@ -358,18 +349,7 @@ ldml_processor::remove_text(km_core_state *state, std::u32string &str, size_t le str.pop_back(); state->actions().push_backspace(KM_CORE_BT_CHAR, c->character); // Cause prior char to be removed } else if (type == KM_CORE_BT_MARKER) { - // it's a marker, 'worth' 3 uchars - assert(length >= 3); - assert(lastCtx == c->marker); // end of list - length -= 3; - // pop off the three-part sentinel string (in reverse order of course) - assert(str.back() == c->marker); // marker # - str.pop_back(); - assert(str.back() == LDML_MARKER_CODE); - str.pop_back(); - assert(str.back() == LDML_UC_SENTINEL); - str.pop_back(); - // push a special backspace to delete the marker + // just pop off any markers. state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker); } } diff --git a/core/src/ldml/ldml_processor.hpp b/core/src/ldml/ldml_processor.hpp index 83274a44081..92194ddad10 100644 --- a/core/src/ldml/ldml_processor.hpp +++ b/core/src/ldml/ldml_processor.hpp @@ -103,6 +103,9 @@ namespace core { /** process a typed key */ void process_key_string(km_core_state *state, const std::u16string &key_str) const; + /** process a backspace */ + void process_backspace(km_core_state *state) const; + /** * add the string+marker portion of the context to the beginning of str. * Stop when a non-string and non-marker is hit. diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 3b3c7db9c3e..012c6781606 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -898,6 +898,33 @@ bool normalize_nfd_markers(std::u16string &str, marker_map &_kmn_unused(map)) { return normalize_nfd(str); } +void add_back_markers(std::u32string &str, const std::u32string &src, const marker_map &map) { + // need to reconstitute. + marker_map map2(map); // make a copy of the map + // clear the string + str.clear(); + // add the end-of-text marker + { + const auto ch = MARKER_BEFORE_EOT; + const auto m = map2.find(ch); + if (m != map2.end()) { + prepend_marker(str, m->second); + map2.erase(ch); // remove it + } + } + // go from end to beginning of string + for (auto p = src.rbegin(); p != src.rend(); p++) { + const auto ch = *p; + str.insert(0, 1, ch); // prepend + + const auto m = map2.find(ch); + if (m != map2.end()) { + prepend_marker(str, m->second); + map2.erase(ch); // remove it + } + } +} + /** * TODO-LDML: * - doesn't support >1 marker per char - may need a set instead of a map! @@ -917,34 +944,31 @@ bool normalize_nfd_markers(std::u32string &str, marker_map &map) { // Normalization produced no change when markers were removed. // So, we'll call this a no-op. } else { - // need to reconstitute. - marker_map map2(map); // make a copy of the map - // clear the string - str.clear(); - // add the end-of-text marker - { - const auto ch = MARKER_BEFORE_EOT; - const auto m = map2.find(ch); - if (m != map2.end()) { - prepend_marker(str, m->second); - map2.erase(ch); // remove it - } - } - // go from end to beginning of string - for(auto p = str_unmarked_nfd.rbegin(); p != str_unmarked_nfd.rend(); p++) { - const auto ch = *p; - str.insert(0, 1, ch); // prepend - - const auto m = map2.find(ch); - if (m != map2.end()) { - prepend_marker(str, m->second); - map2.erase(ch); // remove it - } - } + add_back_markers(str, str_unmarked_nfd, map); } return true; // all OK } +bool normalize_nfc_markers(std::u32string &str, marker_map &map) { + /** original string, but no markers */ + std::u32string str_unmarked = remove_markers(str, map); + /** original string, no markers, NFC */ + std::u32string str_unmarked_nfc = str_unmarked; + if(!normalize_nfc(str_unmarked_nfc)) { + return false; // normalize failed. + } else if (map.size() == 0) { + // no markers. Return the normalized unmarked str + str = str_unmarked_nfc; + } else if (str_unmarked_nfc == str_unmarked) { + // Normalization produced no change when markers were removed. + // So, we'll call this a no-op. + } else { + add_back_markers(str, str_unmarked_nfc, map); + } + return true; // all OK +} + + bool normalize_nfc(std::u32string &str) { std::u16string rstr = km::core::kmx::u32string_to_u16string(str); if(!normalize_nfc(rstr)) { diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index df0da603bd2..4a2948613db 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -303,11 +303,19 @@ bool normalize_nfd(std::u16string &str); * @return false on failure **/ bool normalize_nfd_markers(std::u32string &str, marker_map &markers); -/** Normalize a u16string inplace to NFD, retaining markers. +bool normalize_nfd_markers(std::u16string &str, marker_map &markers); +inline bool normalize_nfd_markers(std::u32string &str); +inline bool normalize_nfd_markers(std::u16string &str); + +/** Normalize a u32string inplace to NFC, retaining markers. * @param markers will be populated with marker chars * @return false on failure **/ -bool normalize_nfd_markers(std::u16string &str, marker_map &markers); +bool normalize_nfc_markers(std::u32string &str, marker_map &markers); +bool normalize_nfc_markers(std::u16string &str, marker_map &markers); +inline bool normalize_nfc_markers(std::u32string &str); +inline bool normalize_nfc_markers(std::u16string &str); + /** Normalize a u32string inplace to NFC. @return false on failure */ bool normalize_nfc(std::u32string &str); /** Normalize a u16string inplace to NFC. @return false on failure */ @@ -328,6 +336,26 @@ prepend_marker(std::u32string &str, KMX_DWORD marker) { str.insert(0, triple, 3); } +bool normalize_nfd_markers(std::u16string &str) { + marker_map m; + return normalize_nfd_markers(str, m); +} + +bool normalize_nfc_markers(std::u16string &str) { + marker_map m; + return normalize_nfc_markers(str, m); +} +bool normalize_nfd_markers(std::u32string &str) { + marker_map m; + return normalize_nfd_markers(str, m); +} + +bool normalize_nfc_markers(std::u32string &str) { + marker_map m; + return normalize_nfc_markers(str, m); +} + + } // namespace ldml } // namespace core } // namespace km diff --git a/core/tests/unit/ldml/ldml.cpp b/core/tests/unit/ldml/ldml.cpp index 55229d88728..0f5ee82bbd7 100644 --- a/core/tests/unit/ldml/ldml.cpp +++ b/core/tests/unit/ldml/ldml.cpp @@ -169,6 +169,21 @@ verify_context(std::u16string& text_store, km_core_state* &test_state, std::vect km_core_cp *buf = new km_core_cp[n]; try_status(km_core_context_items_to_utf16(citems, buf, &n)); std::cout << "context : " << string_to_hex(buf) << " [" << buf << "]" << std::endl; + std::cout << "testcontext "; + for (auto i = test_context.begin(); i < test_context.end(); i++) { + switch(i->type) { + case KM_CORE_CT_CHAR: + std::cout << "U+" << std::hex << i->character << std::dec << " "; + break; + case KM_CORE_CT_MARKER: + std::cout << "\\m{" << i->character << "} "; + break; + default: + std::cout << "type#" << i->type << " "; + } + } + std::cout << std::endl; + std::cout << "context : " << string_to_hex(buf) << " [" << buf << "]" << std::endl; // Verify that both our local test_context and the core's test_state.context have // not diverged From 7b481945e600726e3bc8483e90392b66737f5d7d Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Wed, 8 Nov 2023 21:52:51 -0600 Subject: [PATCH 08/14] =?UTF-8?q?feat(core):=20test=20fix=20=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - fix a cast - fix some test cases For: #9468 --- core/src/debuglog.cpp | 2 +- core/tests/unit/ldml/keyboards/k_008_transform_norm.xml | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/core/src/debuglog.cpp b/core/src/debuglog.cpp index 0758aade331..408de2126cc 100644 --- a/core/src/debuglog.cpp +++ b/core/src/debuglog.cpp @@ -464,7 +464,7 @@ const char *Debug_UnicodeString(::std::u32string s, int x) { bufout[x][0] = 0; for (q = bufout[x]; (intptr_t)(q-bufout[x]) < (128*7) && p != s.end(); p++) { - snprintf(q, MEDIUM_BUF_SIZ - (q - bufout[x]), "U+%4.6X ", *p); + snprintf(q, MEDIUM_BUF_SIZ - (q - bufout[x]), "U+%4.6X ", (unsigned int)*p); q = strchr(q, 0); } return bufout[x]; diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml index 226890f854e..06ad0695f7d 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml @@ -46,20 +46,24 @@ from https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-ke + + + + From 2f843d98f3b665bc4e7ee4157fad75603945367b Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Tue, 14 Nov 2023 16:36:15 -0600 Subject: [PATCH 09/14] =?UTF-8?q?feat(core):=20marker=20normalization=20?= =?UTF-8?q?=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - go back to NFD for the context, for now - anticipating when the privatecontext is NFD but the public context is NFC - also update the test cases For: #9468 --- core/src/ldml/ldml_processor.cpp | 16 ++++++++-------- core/tests/unit/ldml/ldml.cpp | 4 ++++ core/tests/unit/ldml/ldml_test_source.cpp | 6 +++--- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/core/src/ldml/ldml_processor.cpp b/core/src/ldml/ldml_processor.cpp index 9c58a40d19f..ff2fc60b731 100644 --- a/core/src/ldml/ldml_processor.cpp +++ b/core/src/ldml/ldml_processor.cpp @@ -271,10 +271,10 @@ ldml_processor::process_key_string(km_core_state *state, const std::u16string &k // so that we don't have to reconvert it inside the transform code. std::u32string key_str32 = kmx::u16string_to_u32string(key_str); assert(ldml::normalize_nfd_markers(key_str32)); // TODO-LDML: else fail? - // extract context string, in NFC - std::u32string old_ctxtstr_nfc; - (void)context_to_string(state, old_ctxtstr_nfc, false); - assert(ldml::normalize_nfc_markers(old_ctxtstr_nfc)); // TODO-LDML: else fail? + // extract context string, in NFD + std::u32string old_ctxtstr_nfd; + (void)context_to_string(state, old_ctxtstr_nfd, false); + assert(ldml::normalize_nfd_markers(old_ctxtstr_nfd)); // TODO-LDML: else fail? // context string in NFD std::u32string ctxtstr; @@ -304,18 +304,18 @@ ldml_processor::process_key_string(km_core_state *state, const std::u16string &k // Ok. We've done all the happy manipulations. - /** NFC and no markers */ + /** NFD and no markers */ std::u32string ctxtstr_cleanedup = ldml::remove_markers(ctxtstr); - assert(ldml::normalize_nfc_markers(ctxtstr_cleanedup)); + assert(ldml::normalize_nfd_markers(ctxtstr_cleanedup)); // find common prefix. // For example, if the context previously had "aaBBBBB" and it is changing to "aaCCC" then we will have: // - old_ctxtstr_changed = "BBBBB" // - new_ctxtstr_changed = "CCC" // So the BBBBB needs to be removed and then CCC added. - auto ctxt_prefix = mismatch(old_ctxtstr_nfc.begin(), old_ctxtstr_nfc.end(), ctxtstr_cleanedup.begin(), ctxtstr_cleanedup.end()); + auto ctxt_prefix = mismatch(old_ctxtstr_nfd.begin(), old_ctxtstr_nfd.end(), ctxtstr_cleanedup.begin(), ctxtstr_cleanedup.end()); /** The part of the old string to be removed */ - std::u32string old_ctxtstr_changed(ctxt_prefix.first,old_ctxtstr_nfc.end()); + std::u32string old_ctxtstr_changed(ctxt_prefix.first,old_ctxtstr_nfd.end()); /** The new context to be added */ std::u32string new_ctxtstr_changed(ctxt_prefix.second,ctxtstr_cleanedup.end()); diff --git a/core/tests/unit/ldml/ldml.cpp b/core/tests/unit/ldml/ldml.cpp index 0f5ee82bbd7..546af55ef48 100644 --- a/core/tests/unit/ldml/ldml.cpp +++ b/core/tests/unit/ldml/ldml.cpp @@ -28,6 +28,9 @@ #include "ldml_test_source.hpp" #include "debuglog.h" + +#include "ldml/ldml_transforms.hpp" + namespace { bool g_beep_found = false; @@ -279,6 +282,7 @@ run_test(const km::core::path &source, const km::core::path &compiled, km::tests verify_context(text_store, test_state, test_context); } else if (action.type == km::tests::LDML_ACTION_CHECK_EXPECTED) { + assert(km::core::ldml::normalize_nfd(action.string)); // TODO-LDML: should be NFC std::cout << "- check expected" << std::endl; std::cout << "expected : " << string_to_hex(action.string) << " [" << action.string << "]" << std::endl; std::cout << "text store: " << string_to_hex(text_store) << " [" << text_store << "]" << std::endl; diff --git a/core/tests/unit/ldml/ldml_test_source.cpp b/core/tests/unit/ldml/ldml_test_source.cpp index 9171f25163f..93ddff9f7c9 100644 --- a/core/tests/unit/ldml/ldml_test_source.cpp +++ b/core/tests/unit/ldml/ldml_test_source.cpp @@ -472,7 +472,7 @@ LdmlJsonTestSource::next_action(ldml_action &fillin) { if (type == "check") { fillin.type = LDML_ACTION_CHECK_EXPECTED; fillin.string = LdmlTestSource::parse_u8_source_string(result.get()); - assert(km::core::ldml::normalize_nfc(fillin.string)); + assert(km::core::ldml::normalize_nfd(fillin.string)); // TODO-LDML: should be NFC return; } else if (type == "keystroke") { fillin.type = LDML_ACTION_KEY_EVENT; @@ -483,7 +483,7 @@ LdmlJsonTestSource::next_action(ldml_action &fillin) { } else if (type == "emit") { fillin.type = LDML_ACTION_EMIT_STRING; fillin.string = LdmlTestSource::parse_u8_source_string(to.get()); - assert(km::core::ldml::normalize_nfc(fillin.string)); + assert(km::core::ldml::normalize_nfd(fillin.string)); // TODO-LDML: should be NFC return; } else if (type == "backspace") { // backspace is handled as a key event @@ -507,7 +507,7 @@ int LdmlJsonTestSource::load(const nlohmann::json &data) { this->data = data; // TODO-LDML auto startContext = data["/startContext/to"_json_pointer]; context = LdmlTestSource::parse_u8_source_string(startContext); - assert(km::core::ldml::normalize_nfc(context)); + assert(km::core::ldml::normalize_nfd(context)); // TODO-LDML: should be NFC return 0; } From 07a4821336d88242d48175ca609e736c4910ea9c Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Wed, 15 Nov 2023 17:00:04 -0600 Subject: [PATCH 10/14] =?UTF-8?q?feat(core):=20marker=20normalization=20?= =?UTF-8?q?=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - a little further - couple places where "it wasn't plugged in" - adding some LDML-TODOs - marker creep - fixed one unnecessary alloc/dealloc For: #9468 --- core/src/ldml/ldml_transforms.cpp | 29 ++++++++++++------------ core/tests/unit/ldml/test_transforms.cpp | 18 ++++++++++++++- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 012c6781606..5773152ff9d 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -466,15 +466,13 @@ transform_entry::init() { return false; } // TODO-LDML: if we have mapFrom, may need to do other processing. - const std::u16string patstr = km::core::kmx::u32string_to_u16string(fFrom); + std::u16string patstr = km::core::kmx::u32string_to_u16string(fFrom); + // normalize, including markers + normalize_nfd_markers(patstr); UErrorCode status = U_ZERO_ERROR; - /* const */ icu::UnicodeString patustr_raw = icu::UnicodeString(patstr.data(), (int32_t)patstr.length()); + /* const */ icu::UnicodeString patustr = icu::UnicodeString(patstr.data(), (int32_t)patstr.length()); // add '$' to match to end - patustr_raw.append(u'$'); - icu::UnicodeString patustr; - const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); - // NFD normalize on pattern creation - nfd->normalize(patustr_raw, patustr, status); + patustr.append(u'$'); // TODO-LDML: may need to escape some markers. Marker #91 will look like a `[` to the pattern fFromPattern.reset(icu::RegexPattern::compile(patustr, 0, status)); return (UASSERT_SUCCESS(status)); } @@ -556,7 +554,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons } const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status); icu::UnicodeString rustr2; - nfd->normalize(rustr, rustr2, status); + nfd->normalize(rustr, rustr2, status); // TODO-LDML: must be normalize with markers! UASSERT_SUCCESS(status); // here we replace the match output. icu::UnicodeString entireOutput = matcher->replaceFirst(rustr2, status); @@ -567,7 +565,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons // normalize the replaced string icu::UnicodeString outu; - nfd->normalize(outu_raw, outu, status); + nfd->normalize(outu_raw, outu, status); // TODO-LDML: must be normalize with markers! UASSERT_SUCCESS(status); // Special case if there's no output, save some allocs @@ -585,8 +583,6 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons outu.toUTF32((UChar32 *)s, out32len + 1, status); UASSERT_SUCCESS(status); output.assign(s, out32len); - // now, build a u32string - std::u32string out32(s, out32len); // clean up buffer delete [] s; } @@ -893,9 +889,14 @@ bool normalize_nfd(std::u16string &str) { return normalize(nfd, str, status); } -bool normalize_nfd_markers(std::u16string &str, marker_map &_kmn_unused(map)) { - // TODO-LDML - return normalize_nfd(str); +bool normalize_nfd_markers(std::u16string &str, marker_map &map) { + std::u32string rstr = km::core::kmx::u16string_to_u32string(str); + if(!normalize_nfd_markers(rstr, map)) { + return false; + } else { + str = km::core::kmx::u32string_to_u16string(rstr); + return true; + } } void add_back_markers(std::u32string &str, const std::u32string &src, const marker_map &map) { diff --git a/core/tests/unit/ldml/test_transforms.cpp b/core/tests/unit/ldml/test_transforms.cpp index e4d84f3c85b..6960be0ff3d 100644 --- a/core/tests/unit/ldml/test_transforms.cpp +++ b/core/tests/unit/ldml/test_transforms.cpp @@ -508,7 +508,6 @@ int test_strutils() { assert_equal(map[0x0300], 0x3L); assert_equal(map[MARKER_BEFORE_EOT], 0x4L); } - return EXIT_SUCCESS; } @@ -594,6 +593,23 @@ int test_normalize() { } + { + // u"4è\U0000ffff\b\U00000001̠" + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - complex test 4a" << std::endl; + const std::u32string src = U"4e\u0300\uFFFF\b\u0001\u0320"; + const std::u32string expect = U"4e\uFFFF\b\u0001\u0320\u0300"; + std::u32string dst = src; + assert(normalize_nfd_markers(dst, map)); + if (dst != expect) { + std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl; + std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; + } + zassert_string_equal(dst, expect); + assert_equal(map.size(), 1); + assert_equal(map[0x0320], 0x1L); + } + return EXIT_SUCCESS; } From 5a04a812fa2f81a67a55a2c0e56630fb37985693 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Wed, 15 Nov 2023 17:09:26 -0600 Subject: [PATCH 11/14] =?UTF-8?q?feat(core):=20marker=20normalization=20?= =?UTF-8?q?=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - temporarily remove a backspace test until #9450 is fixed. For: #9468 --- .../unit/ldml/keyboards/k_008_transform_norm-test.xml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml index e24a512e8b5..b884ff8aaac 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml @@ -181,8 +181,11 @@ - - + + + + From e28f35903a00df633fcfb18d82c25fa854a6105e Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Wed, 15 Nov 2023 17:29:04 -0600 Subject: [PATCH 12/14] =?UTF-8?q?feat(core):=20marker=20normalization=20?= =?UTF-8?q?=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - test fix For: #9468 --- .../ldml/keyboards/k_008_transform_norm-test.xml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml index b884ff8aaac..2b96de271f1 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml @@ -191,11 +191,13 @@ + - + + @@ -223,18 +225,22 @@ - - + + + + + From 4bbafa69036853f6001401bc83fa621f8742e9fd Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 16 Nov 2023 12:11:41 -0600 Subject: [PATCH 13/14] =?UTF-8?q?feat(core):=20marker=20normalization=20?= =?UTF-8?q?=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - literally a bad assert. the error case is handled below, in fact the unit test tests for it. - for some reason, assert.h wasn't included in some cases locally. For: #9468 --- core/src/ldml/ldml_transforms.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 5773152ff9d..5c4ff428cd9 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -10,10 +10,7 @@ #include #include #include "kmx/kmx_xstring.h" - -#ifndef assert -#define assert(x) ((void)0) -#endif +#include namespace km { namespace core { @@ -1004,7 +1001,6 @@ std::u32string remove_markers(const std::u32string &str, marker_map *markers) { } // #2 LDML_MARKER_CODE - assert(*i == LDML_MARKER_CODE); if (*i != LDML_MARKER_CODE) { continue; // can't process this, get out } From f524519b3f2fbdb0dae6ff70f0060d85d98071f1 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Mon, 20 Nov 2023 16:57:33 -0600 Subject: [PATCH 14/14] =?UTF-8?q?feat(developer):=20ldml=20fix=20testcase?= =?UTF-8?q?=20processing=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - KM_CORE_BT_UNKNOWN should only be used with an empty context For: #9468 but related to #9450 --- core/tests/unit/ldml/ldml.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/core/tests/unit/ldml/ldml.cpp b/core/tests/unit/ldml/ldml.cpp index 546af55ef48..e22367a7f96 100644 --- a/core/tests/unit/ldml/ldml.cpp +++ b/core/tests/unit/ldml/ldml.cpp @@ -128,16 +128,9 @@ apply_action( assert(context.back().character == ch); context.pop_back(); } else { - assert(act.backspace.expected_type == KM_CORE_BT_UNKNOWN); // else it must be unknown - // pop to first character or empty - while (!context.empty() && context.back().type != KM_CORE_CT_CHAR) { - // pop off all non-character entries - context.pop_back(); - } - if (!context.empty()) { - // pop off the next character - context.pop_back(); - } + // assume it's otherwise KM-coRE_BT_UNKNOWN + assert(act.backspace.expected_type == KM_CORE_BT_UNKNOWN); + assert(context.empty()); // if KM_CORE_BT_UNKNOWN, context should be empty. } } break;