Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(core): ldml marker normalization 🙀 #9761

Merged
merged 21 commits into from
Nov 21, 2023
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
efc8ef0
feat(core): ldml normalization 🙀
srl295 Oct 13, 2023
255960e
feat(core): ldml dx: dump vkey and modifier 🙀
srl295 Oct 20, 2023
015a738
feat(core): ldml marker normalization 🙀
srl295 Oct 20, 2023
0f0c036
chore(resources): ldml bn: add a test case 🙀
srl295 Nov 1, 2023
a69feb5
feat(core): ldml marker normalization 🙀
srl295 Nov 3, 2023
72830ed
Apply suggestions from code review
srl295 Nov 6, 2023
78c2ac4
feat(core): ldml marker normalization 🙀
srl295 Nov 7, 2023
7b48194
feat(core): test fix 🙀
srl295 Nov 9, 2023
bf66909
Merge branch 'fix/common/10004-truthful-test-epic-ldml' into feat/cor…
srl295 Nov 14, 2023
2f843d9
feat(core): marker normalization 🙀
srl295 Nov 14, 2023
0841afb
Merge branch 'master' into feat/core/9468-marker-normalization-epic-ldml
srl295 Nov 15, 2023
07a4821
feat(core): marker normalization 🙀
srl295 Nov 15, 2023
5a04a81
feat(core): marker normalization 🙀
srl295 Nov 15, 2023
2163c1d
Merge branch 'master' into feat/core/9468-marker-normalization-epic-ldml
srl295 Nov 15, 2023
e28f359
feat(core): marker normalization 🙀
srl295 Nov 15, 2023
3905c6d
Merge branch 'master' into feat/core/9468-marker-normalization-epic-ldml
srl295 Nov 16, 2023
4bbafa6
feat(core): marker normalization 🙀
srl295 Nov 16, 2023
386ac30
Merge branch 'master' into feat/core/9468-marker-normalization-epic-ldml
srl295 Nov 16, 2023
934dd45
Merge branch 'master' into feat/core/9468-marker-normalization-epic-ldml
srl295 Nov 16, 2023
f524519
feat(developer): ldml fix testcase processing 🙀
srl295 Nov 20, 2023
5f6dafa
Merge branch 'master' into feat/core/9468-marker-normalization-epic-ldml
srl295 Nov 20, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 70 additions & 66 deletions core/src/ldml/ldml_processor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,42 +196,7 @@ ldml_processor::process_event(
switch (vk) {
// Special handling for backspace VK
case KM_CORE_VKEY_BKSP:
{
if (!!bksp_transforms) {
// TODO-LDML: process bksp
// std::u16string outputString;
// // don't bother if no backspace transforms!
// // TODO-LDML: unroll ctxt into a str
// std::u16string ctxtstr;
// for (size_t i = 0; i < ctxt.size(); i++) {
// ctxtstr.append(ctxt[i]);
// }
// const size_t matchedContext = transforms->apply(ctxtstr, outputString);
}
KMX_DWORD last_char = 0UL;
// attempt to get the last char
auto end = state->context().rbegin();
if(end != state->context().rend()) {
if((*end).type == KM_CORE_CT_CHAR) {
last_char = (*end).character;
// TODO-LDML: markers!
}
}
if (last_char == 0UL) {
/*
We couldn't find a character at end of context (context is empty),
so we'll pass the backspace keystroke on to the app to process; the
app might want to use backspace to move between contexts or delete
a text box, etc. Or it might be a legacy app and we've had our caret
dumped in somewhere unknown, so we will have to depend on the app to
be sensible about backspacing because we know nothing.
*/
state->actions().push_backspace(KM_CORE_BT_UNKNOWN);
} else {
state->actions().push_backspace(KM_CORE_BT_CHAR, last_char);
state->context().pop_back();
}
}
process_backspace(state);
break;
default:
// all other VKs
Expand Down Expand Up @@ -259,6 +224,44 @@ ldml_processor::process_event(
return KM_CORE_STATUS_OK;
}

/**
 * Process a backspace keystroke against the cached context.
 *
 * Markers at the end of the context are removed first (each one with a
 * KM_CORE_BT_MARKER backspace action); a marker is not a character, so
 * removing it alone does not complete the backspace. The first character
 * reached is then removed with a KM_CORE_BT_CHAR action. If no character
 * can be reached, a KM_CORE_BT_UNKNOWN backspace is pushed instead.
 *
 * @param state  the state whose context and action list are updated
 */
void
ldml_processor::process_backspace(km_core_state *state) const {
  if (!!bksp_transforms) {
    // TODO-LDML: process bksp
    // std::u16string outputString;
    // // don't bother if no backspace transforms!
    // // TODO-LDML: unroll ctxt into a str
    // std::u16string ctxtstr;
    // for (size_t i = 0; i < ctxt.size(); i++) {
    //   ctxtstr.append(ctxt[i]);
    // }
    // const size_t matchedContext = transforms->apply(ctxtstr, outputString);
  }

  // Walk backwards from the end of the context: drop any trailing markers,
  // then remove the last actual character.
  // The iterator is re-fetched after every pop_back(), since popping may
  // invalidate it.
  for (auto last = state->context().rbegin(); last != state->context().rend(); last = state->context().rbegin()) {
    if (last->type == KM_CORE_CT_CHAR) {
      state->actions().push_backspace(KM_CORE_BT_CHAR, last->character);
      state->context().pop_back();
      return;
    }
    // NOTE(review): this compares a context item type against a KM_CORE_BT_*
    // (backspace type) constant, as the rest of this file does. Presumably the
    // two enums share values - confirm, or switch to the KM_CORE_CT_* constant.
    if (last->type != KM_CORE_BT_MARKER) {
      // Neither a character nor a marker: stop and let the app handle it.
      break;
    }
    // Remove the marker, then keep looking for a character before it.
    state->actions().push_backspace(KM_CORE_BT_MARKER, last->marker);
    state->context().pop_back();
  }

  /*
    We couldn't find a character at end of context (context is empty, or
    holds no character that we can remove), so we'll pass the backspace
    keystroke on to the app to process; the app might want to use backspace
    to move between contexts or delete a text box, etc. Or it might be a
    legacy app and we've had our caret dumped in somewhere unknown, so we
    will have to depend on the app to be sensible about backspacing because
    we know nothing.
  */
  state->actions().push_backspace(KM_CORE_BT_UNKNOWN);
}

void
ldml_processor::process_key_string(km_core_state *state, const std::u16string &key_str) const {
// We know that key_str is not empty per the caller.
Expand All @@ -267,19 +270,18 @@ ldml_processor::process_key_string(km_core_state *state, const std::u16string &k
// we convert the keys str to UTF-32 here instead of using the emit_text() overload
// so that we don't have to reconvert it inside the transform code.
std::u32string key_str32 = kmx::u16string_to_u32string(key_str);
assert(ldml::normalize_nfd(key_str32)); // TODO-LDML: else fail?

// extract context string, in NFC
std::u32string old_ctxtstr_nfc;
(void)context_to_string(state, old_ctxtstr_nfc, false);
assert(ldml::normalize_nfc(old_ctxtstr_nfc)); // TODO-LDML: else fail?
assert(ldml::normalize_nfd_markers(key_str32)); // TODO-LDML: else fail?
// extract context string, in NFD
std::u32string old_ctxtstr_nfd;
(void)context_to_string(state, old_ctxtstr_nfd, false);
assert(ldml::normalize_nfd_markers(old_ctxtstr_nfd)); // TODO-LDML: else fail?

// context string in NFD
std::u32string ctxtstr;
(void)context_to_string(state, ctxtstr, true); // with markers
// add the newly added key output to ctxtstr
ctxtstr.append(key_str32);
assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: else fail?
assert(ldml::normalize_nfd_markers(ctxtstr)); // TODO-LDML: else fail?

/** transform output string */
std::u32string outputString;
Expand All @@ -297,25 +299,38 @@ ldml_processor::process_key_string(km_core_state *state, const std::u16string &k
// drop last 'matchedContext':
ctxtstr.resize(ctxtstr.length() - matchedContext);
ctxtstr.append(outputString); // TODO-LDML: should be able to do a normalization-safe append here.
assert(ldml::normalize_nfd(ctxtstr)); // TODO-LDML: else fail?
ldml::marker_map markers;
assert(ldml::normalize_nfd_markers(ctxtstr, markers)); // TODO-LDML: Need marker-safe normalize here.

// Ok. We've done all the happy manipulations.

/** NFC and no markers */
std::u32string ctxtstr_cleanedup = ctxtstr;
// TODO-LDML: remove markers!
assert(ldml::normalize_nfc(ctxtstr_cleanedup)); // TODO-LDML: else fail?

// find common prefix
auto ctxt_prefix = mismatch(old_ctxtstr_nfc.begin(), old_ctxtstr_nfc.end(), ctxtstr_cleanedup.begin(), ctxtstr_cleanedup.end());
/** the part of the old str that changed */
std::u32string old_ctxtstr_changed(ctxt_prefix.first,old_ctxtstr_nfc.end());
/** NFD and no markers */
std::u32string ctxtstr_cleanedup = ldml::remove_markers(ctxtstr);
assert(ldml::normalize_nfd_markers(ctxtstr_cleanedup));

// find common prefix.
// For example, if the context previously had "aaBBBBB" and it is changing to "aaCCC" then we will have:
// - old_ctxtstr_changed = "BBBBB"
// - new_ctxtstr_changed = "CCC"
// So the BBBBB needs to be removed and then CCC added.
auto ctxt_prefix = mismatch(old_ctxtstr_nfd.begin(), old_ctxtstr_nfd.end(), ctxtstr_cleanedup.begin(), ctxtstr_cleanedup.end());
/** The part of the old string to be removed */
std::u32string old_ctxtstr_changed(ctxt_prefix.first,old_ctxtstr_nfd.end());
/** The new context to be added */
std::u32string new_ctxtstr_changed(ctxt_prefix.second,ctxtstr_cleanedup.end());

// drop the old suffix. Note: this mutates old_ctxtstr_changed.
remove_text(state, old_ctxtstr_changed, old_ctxtstr_changed.length());
assert(old_ctxtstr_changed.length() == 0);
// old_ctxtstr_changed is now empty because it's been removed.
// context is "aa" in the above example.
emit_text(state, new_ctxtstr_changed);

// TODO-LDML: need to emit marker here - need to emit text w/ markers, and handle appropriately.
// // TODO-LDML: 1-marker hack! need to support a string with intermixed markers.
if (key_str32.length() == 3 && key_str32[0] == LDML_UC_SENTINEL && key_str32[1] == LDML_MARKER_CODE) {
emit_marker(state, key_str32[2]);
}
}

void
Expand All @@ -334,18 +349,7 @@ ldml_processor::remove_text(km_core_state *state, std::u32string &str, size_t le
str.pop_back();
state->actions().push_backspace(KM_CORE_BT_CHAR, c->character); // Cause prior char to be removed
} else if (type == KM_CORE_BT_MARKER) {
// it's a marker, 'worth' 3 uchars
assert(length >= 3);
assert(lastCtx == c->marker); // end of list
length -= 3;
// pop off the three-part sentinel string (in reverse order of course)
assert(str.back() == c->marker); // marker #
str.pop_back();
assert(str.back() == LDML_MARKER_CODE);
str.pop_back();
assert(str.back() == LDML_UC_SENTINEL);
str.pop_back();
// push a special backspace to delete the marker
// just pop off any markers.
state->actions().push_backspace(KM_CORE_BT_MARKER, c->marker);
}
}
Expand Down Expand Up @@ -433,7 +437,7 @@ ldml_processor::context_to_string(km_core_state *state, std::u32string &str, boo
} else if (last_type == KM_CORE_BT_MARKER) {
assert(km::core::kmx::is_valid_marker(c->marker));
if (include_markers) {
prepend_marker(str, c->marker);
ldml::prepend_marker(str, c->marker);
}
} else {
break;
Expand Down
12 changes: 3 additions & 9 deletions core/src/ldml/ldml_processor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ namespace core {
/** process a typed key */
void process_key_string(km_core_state *state, const std::u16string &key_str) const;

/** process a backspace */
void process_backspace(km_core_state *state) const;

/**
* add the string+marker portion of the context to the beginning of str.
* Stop when a non-string and non-marker is hit.
Expand All @@ -111,15 +114,6 @@ namespace core {
*/
static size_t context_to_string(km_core_state *state, std::u32string &str, bool include_markers = true);

/** prepend the marker string in UC_SENTINEL format to the str */
inline static void prepend_marker(std::u32string &str, KMX_DWORD marker);
};

void
ldml_processor::prepend_marker(std::u32string &str, KMX_DWORD marker) {
km_core_usv triple[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, marker};
str.insert(0, triple, 3);
}

} // namespace core
} // namespace km
Loading