Skip to content

Commit

Permalink
feat(core): fixup markers as regex 🙀
Browse files Browse the repository at this point in the history
- move prepend_marker out of line
- replace bool for_regex with an enum
- cleanup another instance of \b -> \u0008

#9121
  • Loading branch information
srl295 committed Jan 2, 2024
1 parent a6151e6 commit 91b761b
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 72 deletions.
59 changes: 45 additions & 14 deletions core/src/ldml/ldml_transforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@ transform_entry::init() {
// TODO-LDML: if we have mapFrom, may need to do other processing.
std::u16string patstr = km::core::kmx::u32string_to_u16string(fFrom);
// normalize, including markers, for regex
normalize_nfd_markers(patstr, true);
normalize_nfd_markers(patstr, regex_sentinel);
UErrorCode status = U_ZERO_ERROR;
/* const */ icu::UnicodeString patustr = icu::UnicodeString(patstr.data(), (int32_t)patstr.length());
// add '$' to match to end
Expand Down Expand Up @@ -950,17 +950,17 @@ bool normalize_nfd(std::u16string &str) {
return normalize(nfd, str, status);
}

bool normalize_nfd_markers(std::u16string &str, marker_map &map, bool for_regex) {
bool normalize_nfd_markers(std::u16string &str, marker_map &map, marker_encoding encoding) {
std::u32string rstr = km::core::kmx::u16string_to_u32string(str);
if(!normalize_nfd_markers(rstr, map, for_regex)) {
if(!normalize_nfd_markers(rstr, map, encoding)) {
return false;
} else {
str = km::core::kmx::u32string_to_u16string(rstr);
return true;
}
}

static void add_back_markers(std::u32string &str, const std::u32string &src, const marker_map &map, bool for_regex) {
static void add_back_markers(std::u32string &str, const std::u32string &src, const marker_map &map, marker_encoding encoding) {
// need to reconstitute.
marker_map map2(map); // make a copy of the map
// clear the string
Expand All @@ -970,7 +970,7 @@ static void add_back_markers(std::u32string &str, const std::u32string &src, con
const auto ch = MARKER_BEFORE_EOT;
const auto m = map2.find(ch);
if (m != map2.end()) {
prepend_marker(str, m->second, for_regex);
prepend_marker(str, m->second, encoding);
map2.erase(ch); // remove it
}
}
Expand All @@ -981,7 +981,7 @@ static void add_back_markers(std::u32string &str, const std::u32string &src, con

const auto m = map2.find(ch);
if (m != map2.end()) {
prepend_marker(str, m->second, for_regex);
prepend_marker(str, m->second, encoding);
map2.erase(ch); // remove it
}
}
Expand All @@ -992,9 +992,9 @@ static void add_back_markers(std::u32string &str, const std::u32string &src, con
* - doesn't support >1 marker per char - may need a set instead of a map!
* - ideally this should be used on a normalization safe subsequence
*/
bool normalize_nfd_markers(std::u32string &str, marker_map &map, bool for_regex) {
bool normalize_nfd_markers(std::u32string &str, marker_map &map, marker_encoding encoding) {
/** original string, but no markers */
std::u32string str_unmarked = remove_markers(str, map, for_regex);
std::u32string str_unmarked = remove_markers(str, map, encoding);
/** original string, no markers, NFD */
std::u32string str_unmarked_nfd = str_unmarked;
if(!normalize_nfd(str_unmarked_nfd)) {
Expand All @@ -1006,14 +1006,14 @@ bool normalize_nfd_markers(std::u32string &str, marker_map &map, bool for_regex)
// Normalization produced no change when markers were removed.
// So, we'll call this a no-op.
} else {
add_back_markers(str, str_unmarked_nfd, map, for_regex);
add_back_markers(str, str_unmarked_nfd, map, encoding);
}
return true; // all OK
}

bool normalize_nfc_markers(std::u32string &str, marker_map &map, bool for_regex) {
bool normalize_nfc_markers(std::u32string &str, marker_map &map, marker_encoding encoding) {
/** original string, but no markers */
std::u32string str_unmarked = remove_markers(str, map, for_regex);
std::u32string str_unmarked = remove_markers(str, map, encoding);
/** original string, no markers, NFC */
std::u32string str_unmarked_nfc = str_unmarked;
if(!normalize_nfc(str_unmarked_nfc)) {
Expand All @@ -1025,7 +1025,7 @@ bool normalize_nfc_markers(std::u32string &str, marker_map &map, bool for_regex)
// Normalization produced no change when markers were removed.
// So, we'll call this a no-op.
} else {
add_back_markers(str, str_unmarked_nfc, map, for_regex);
add_back_markers(str, str_unmarked_nfc, map, encoding);
}
return true; // all OK
}
Expand All @@ -1048,6 +1048,36 @@ bool normalize_nfc(std::u16string &str) {
return normalize(nfc, str, status);
}

void
prepend_marker(std::u32string &str, KMX_DWORD marker, marker_encoding encoding) {
if (encoding == plain_sentinel) {
km_core_usv markstr[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, marker};
str.insert(0, markstr, 3);
} else {
assert(encoding == regex_sentinel);
if (marker == LDML_MARKER_ANY_INDEX) {
// recreate the regex from back to front
str.insert(0, 1, U']');
prepend_hex_quad(str, LDML_MARKER_MAX_INDEX);
str.insert(0, 1, U'u');
str.insert(0, 1, U'\\');
str.insert(0, 1, U'-');
prepend_hex_quad(str, LDML_MARKER_MIN_INDEX);
str.insert(0, 1, U'u');
str.insert(0, 1, U'\\');
str.insert(0, 1, U'[');
str.insert(0, 1, LDML_MARKER_CODE);
str.insert(0, 1, LDML_UC_SENTINEL);
} else {
// add hex part
prepend_hex_quad(str, marker);
// add static part
km_core_usv markstr[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, u'\\', u'u'};
str.insert(0, markstr, 4);
}
}
}

void
prepend_hex_quad(std::u32string &str, KMX_DWORD marker) {
for (auto i = 0; i < 4; i++) {
Expand Down Expand Up @@ -1089,7 +1119,7 @@ KMX_DWORD parse_hex_quad(const km_core_usv hex_str[]) {
return mark_no;
}

std::u32string remove_markers(const std::u32string &str, marker_map *markers, bool for_regex) {
std::u32string remove_markers(const std::u32string &str, marker_map *markers, marker_encoding encoding) {
std::u32string out;
auto i = str.begin();
auto last = i;
Expand All @@ -1116,11 +1146,12 @@ std::u32string remove_markers(const std::u32string &str, marker_map *markers, bo
}

KMX_DWORD marker_no;
if (!for_regex) {
if (encoding == plain_sentinel) {
// #3 marker number
marker_no = *i;
i++; // if end, we'll break out of the loop
} else {
assert(encoding == regex_sentinel);
// is it an escape or a range?
if (*i == U'\\') {
if (++i == str.end()) {
Expand Down
78 changes: 29 additions & 49 deletions core/src/ldml/ldml_transforms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,14 @@ class transforms {
/** indicates that the marker was before the end of text. */
const char32_t MARKER_BEFORE_EOT = km::core::kmx::Uni_FFFE_NONCHARACTER;

/**
 * Selects how a marker is written into, and parsed out of, marker text.
 * Passed to prepend_marker(), remove_markers() and the
 * normalize_nfd_markers() / normalize_nfc_markers() helpers.
 */
enum marker_encoding {
/** plain form: LDML_UC_SENTINEL, LDML_MARKER_CODE, then the marker number as a single code point */
plain_sentinel,
/**
 * regex form: LDML_UC_SENTINEL, LDML_MARKER_CODE, then regex text matching the marker:
 * an escaped four-hex-digit code for one marker, or a bracketed range of such codes
 * for LDML_MARKER_ANY_INDEX
 */
regex_sentinel,
};

/** map from following-char to marker number. */
typedef std::map<char32_t, KMX_DWORD> marker_map;

Expand All @@ -314,86 +322,58 @@ bool normalize_nfd(std::u16string &str);
* @param markers will be populated with marker chars
* @return false on failure
**/
bool normalize_nfd_markers(std::u32string &str, marker_map &markers, bool for_regex = false);
bool normalize_nfd_markers(std::u16string &str, marker_map &markers, bool for_regex = false);
inline bool normalize_nfd_markers(std::u32string &str, bool for_regex = false);
inline bool normalize_nfd_markers(std::u16string &str, bool for_regex = false);
bool normalize_nfd_markers(std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel);
bool normalize_nfd_markers(std::u16string &str, marker_map &markers, marker_encoding encoding = plain_sentinel);
inline bool normalize_nfd_markers(std::u32string &str, marker_encoding encoding = plain_sentinel);
inline bool normalize_nfd_markers(std::u16string &str, marker_encoding encoding = plain_sentinel);

/** Normalize a u32string inplace to NFC, retaining markers.
* @param markers will be populated with marker chars
* @return false on failure
**/
bool normalize_nfc_markers(std::u32string &str, marker_map &markers, bool for_regex = false);
bool normalize_nfc_markers(std::u16string &str, marker_map &markers, bool for_regex = false);
inline bool normalize_nfc_markers(std::u32string &str, bool for_regex = false);
inline bool normalize_nfc_markers(std::u16string &str, bool for_regex = false);
bool normalize_nfc_markers(std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel);
bool normalize_nfc_markers(std::u16string &str, marker_map &markers, marker_encoding encoding = plain_sentinel);
inline bool normalize_nfc_markers(std::u32string &str, marker_encoding encoding = plain_sentinel);
inline bool normalize_nfc_markers(std::u16string &str, marker_encoding encoding = plain_sentinel);

/** Normalize a u32string inplace to NFC. @return false on failure */
bool normalize_nfc(std::u32string &str);
/** Normalize a u16string inplace to NFC. @return false on failure */
bool normalize_nfc(std::u16string &str);
/** Remove markers and optionally note their glue characters in the map */
std::u32string remove_markers(const std::u32string &str, marker_map *markers = nullptr, bool for_regex = false);
std::u32string remove_markers(const std::u32string &str, marker_map *markers = nullptr, marker_encoding encoding = plain_sentinel);
/** same but with a reference */
inline std::u32string remove_markers(const std::u32string &str, marker_map &markers, bool for_regex = false) {
return remove_markers(str, &markers, for_regex);
inline std::u32string remove_markers(const std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel) {
return remove_markers(str, &markers, encoding);
}

/** prepend the marker to str, either as a plain UC_SENTINEL sequence or as a regex that matches it */
inline void prepend_marker(std::u32string &str, KMX_DWORD marker, bool for_regex = false);
void prepend_marker(std::u32string &str, KMX_DWORD marker, marker_encoding encoding = plain_sentinel);

/** format 'marker' as 0001...FFFF and put it at the beginning of the string */
void prepend_hex_quad(std::u32string &str, KMX_DWORD marker);

/** parse 0001...FFFF into a KMX_DWORD. Returns 0 on failure */
KMX_DWORD parse_hex_quad(const km_core_usv hex_str[]);

/*
 * Prepend the encoded form of a marker to the front of str.
 * for_regex == false: inserts LDML_UC_SENTINEL, LDML_MARKER_CODE, marker.
 * for_regex == true:  inserts LDML_UC_SENTINEL, LDML_MARKER_CODE, then regex
 *                     text: an escaped four-hex-digit code for a single
 *                     marker, or a bracketed MIN-MAX range of such codes when
 *                     marker is LDML_MARKER_ANY_INDEX.
 */
void
prepend_marker(std::u32string &str, KMX_DWORD marker, bool for_regex) {
if (!for_regex) {
// plain form: three code points inserted at the front in one call
km_core_usv markstr[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, marker};
str.insert(0, markstr, 3);
} else {
if (marker == LDML_MARKER_ANY_INDEX) {
// recreate the regex from back to front
// (each call inserts at index 0, so the last insert ends up first in str)
str.insert(0, 1, U']');
prepend_hex_quad(str, LDML_MARKER_MAX_INDEX);
str.insert(0, 1, U'u');
str.insert(0, 1, U'\\');
str.insert(0, 1, U'-');
prepend_hex_quad(str, LDML_MARKER_MIN_INDEX);
str.insert(0, 1, U'u');
str.insert(0, 1, U'\\');
str.insert(0, 1, U'[');
str.insert(0, 1, LDML_MARKER_CODE);
str.insert(0, 1, LDML_UC_SENTINEL);
} else {
// add hex part
prepend_hex_quad(str, marker);
// add static part
// (sentinel, marker code, then a backslash and 'u' ahead of the hex digits)
km_core_usv markstr[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, u'\\', u'u'};
str.insert(0, markstr, 4);
}
}
}

bool normalize_nfd_markers(std::u16string &str, bool for_regex) {
bool normalize_nfd_markers(std::u16string &str, marker_encoding encoding) {
marker_map m;
return normalize_nfd_markers(str, m, for_regex);
return normalize_nfd_markers(str, m, encoding);
}

bool normalize_nfc_markers(std::u16string &str, bool for_regex) {
bool normalize_nfc_markers(std::u16string &str, marker_encoding encoding) {
marker_map m;
return normalize_nfc_markers(str, m, for_regex);
return normalize_nfc_markers(str, m, encoding);
}
bool normalize_nfd_markers(std::u32string &str, bool for_regex) {

bool normalize_nfd_markers(std::u32string &str, marker_encoding encoding) {
marker_map m;
return normalize_nfd_markers(str, m, for_regex);
return normalize_nfd_markers(str, m, encoding);
}

bool normalize_nfc_markers(std::u32string &str, bool for_regex) {
bool normalize_nfc_markers(std::u32string &str, marker_encoding encoding) {
marker_map m;
return normalize_nfc_markers(str, m, for_regex);
return normalize_nfc_markers(str, m, encoding);
}


Expand Down
18 changes: 9 additions & 9 deletions core/tests/unit/ldml/test_transforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -770,11 +770,11 @@ int test_normalize() {
}

{
// u"4è\U0000ffff\b\U00000001̠"
// u"4è\U0000ffff\u0008\U00000001̠"
marker_map map;
std::cout << __FILE__ << ":" << __LINE__ << " - complex test 4a" << std::endl;
const std::u32string src = U"4e\u0300\uFFFF\b\u0001\u0320";
const std::u32string expect = U"4e\uFFFF\b\u0001\u0320\u0300";
const std::u32string src = U"4e\u0300\uFFFF\u0008\u0001\u0320";
const std::u32string expect = U"4e\uFFFF\u0008\u0001\u0320\u0300";
std::u32string dst = src;
assert(normalize_nfd_markers(dst, map));
if (dst != expect) {
Expand Down Expand Up @@ -808,10 +808,10 @@ int test_normalize() {
// from tests - regex edition
marker_map map;
std::cout << __FILE__ << ":" << __LINE__ << " - complex test 9c+regex" << std::endl;
const std::u32string src = U"9ce\u0300\uFFFF\b\\u0002\u0320\uFFFF\b\\u0001";
const std::u32string expect = U"9ce\uFFFF\b\\u0002\u0320\u0300\uFFFF\b\\u0001";
const std::u32string src = U"9ce\u0300\uFFFF\u0008\\u0002\u0320\uFFFF\u0008\\u0001";
const std::u32string expect = U"9ce\uFFFF\u0008\\u0002\u0320\u0300\uFFFF\u0008\\u0001";
std::u32string dst = src;
assert(normalize_nfd_markers(dst, map, true)); // TODO-LDML: need regex flag
assert(normalize_nfd_markers(dst, map, regex_sentinel)); // TODO-LDML: need regex flag
if (dst != expect) {
std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl;
std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl;
Expand All @@ -825,10 +825,10 @@ int test_normalize() {
// from tests - regex edition
marker_map map;
std::cout << __FILE__ << ":" << __LINE__ << " - complex test \\m{.}" << std::endl;
const std::u32string src = U"9ce\u0300\uFFFF\b[\\u0001-\\uD7FE]\u0320\uFFFF\b\\u0001";
const std::u32string expect = U"9ce\uFFFF\b[\\u0001-\\uD7FE]\u0320\u0300\uFFFF\b\\u0001";
const std::u32string src = U"9ce\u0300\uFFFF\u0008[\\u0001-\\uD7FE]\u0320\uFFFF\u0008\\u0001";
const std::u32string expect = U"9ce\uFFFF\u0008[\\u0001-\\uD7FE]\u0320\u0300\uFFFF\u0008\\u0001";
std::u32string dst = src;
assert(normalize_nfd_markers(dst, map, true)); // TODO-LDML: need regex flag
assert(normalize_nfd_markers(dst, map, regex_sentinel));
if (dst != expect) {
std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl;
std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl;
Expand Down

0 comments on commit 91b761b

Please sign in to comment.