Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(core): prepare normalization code to call into JS under WASM 🙀 #11519

Merged
merged 3 commits into from
May 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions core/src/core_icu.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,23 @@
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/normalizer2.h"

#include "debuglog.h"
#include <assert.h>


/** @returns true on success */
inline bool uassert_success(const char *file, int line, const char *function, UErrorCode status) {
if (U_FAILURE(status)) {
DebugLog2(file, line, function, "U_FAILURE(%s)", u_errorName(status));
return false;
} else {
return true;
}
}

/**
 * Assert an ICU4C UErrorCode.
 *
 * The expansion is a comma expression with two operands:
 *  - the first, assert(), is for debug builds;
 *  - the second, uassert_success(), triggers the debug log on failure and
 *    provides the value of the whole expression.
 *
 * Note: `status` appears twice in the expansion, so pass a plain variable
 * rather than an expression with side effects.
 *
 * @param status the UErrorCode to check
 * @returns true on success, false on failure
 */
#define UASSERT_SUCCESS(status) (assert(U_SUCCESS(status)), uassert_success(__FILE__, __LINE__, __FUNCTION__, status))
2 changes: 1 addition & 1 deletion core/src/keyman_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
#include <keyman/keyman_core_api.h>
#include <keyman/keyman_core_api_actions.h>
#include <keyman/keyman_core_api_context.h>
#include <keyman/keyman_core_api_debug.h>
#include <keyman/keyman_core_api_debug.h>
28 changes: 2 additions & 26 deletions core/src/km_core_state_context_set_if_needed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#include "processor.hpp"
#include "state.hpp"
#include "debuglog.h"
#include "core_icu.h"
#include "util_normalize.hpp"
#include "kmx/kmx_xstring.h" // for Unicode routines

using namespace km::core;
Expand Down Expand Up @@ -94,7 +94,7 @@ km_core_state_context_set_if_needed(
km_core_cu const *new_cached_context = nullptr;

if (should_normalize(state)) {
if (!do_normalize_nfd(new_app_context, normalized_buffer)) {
if (!km::core::util::normalize_nfd(new_app_context, normalized_buffer)) {
return do_fail(app_context, cached_context, "could not normalize string");
}
new_cached_context = normalized_buffer.c_str();
Expand Down Expand Up @@ -283,30 +283,6 @@ get_context_items_change(
return change_type;
}

/**
 * Normalize a string to NFD using ICU, writing the result to a separate
 * output buffer.
 *
 * @param src  input text, passed to the icu::UnicodeString constructor
 * @param dst  receives the normalized text; left untouched on failure
 * @return     true on success, false if ICU reported an error
 */
bool do_normalize_nfd(km_core_cu const * src, std::u16string &dst) {
  UErrorCode err = U_ZERO_ERROR;
  const icu::Normalizer2 *normalizer = icu::Normalizer2::getNFDInstance(err);
  assert(U_SUCCESS(err));
  if (U_FAILURE(err)) {
    // TODO: log the failure code
    return false;
  }

  icu::UnicodeString decomposed;
  icu::UnicodeString input = icu::UnicodeString(src);
  normalizer->normalize(input, decomposed, err);
  assert(U_SUCCESS(err));
  if (U_FAILURE(err)) {
    // TODO: log the failure code
    return false;
  }

  dst.assign(decomposed.getBuffer(), decomposed.length());
  return true;
}

/**
* Clear the context when we have a failure so we don't end up with inconsistent
* context buffers, and log the error to our diagnostic log.
Expand Down
49 changes: 2 additions & 47 deletions core/src/ldml/ldml_markers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
#include <string>
#include "kmx/kmx_xstring.h"
#include <assert.h>
#include "util_normalize.hpp"

#include "ldml_utils.hpp"
#include <ldml/keyman_core_ldml.h>

namespace km {
Expand All @@ -26,51 +26,6 @@ const std::u32string RAW_PREFIX = U"\uffff\u0008";
const std::u32string REGEX_ANY_MATCH = U"[\\u0001-\\ud7fe]";
static_assert(LDML_MARKER_NO_INDEX < LDML_MARKER_MIN_INDEX, "LDML_MARKER_NO_INDEX must be < LDML_MARKER_MIN_INDEX");

// string manipulation

/**
 * Internal function to normalize with a specified mode.
 *
 * Note that this function _does_ assert on failure (in debug builds), so
 * callers are not required to assert its return code. The return value is
 * provided so that callers can exit (such as making no change) if there was
 * a failure.
 *
 * Also note that "failure" here is something catastrophic: ICU not initialized,
 * or, more likely, some low memory situation. Does not fail on "bad" data.
 *
 * In builds where assert() is compiled out, a failed incoming status or a
 * null normalizer makes the function return false without touching `str`,
 * rather than dereferencing `n`.
 *
 * @param n      the ICU Normalizer to use
 * @param str    input/output string; only modified on success
 * @param status error code, must be initialized on input; set to
 *               U_ILLEGAL_ARGUMENT_ERROR if `n` is null
 * @return false if failure
 */
static bool normalize(const icu::Normalizer2 *n, std::u16string &str, UErrorCode &status) {
  // Asserts in debug builds; otherwise logs and bails out before using `n`.
  if (!UASSERT_SUCCESS(status)) {
    return false;
  }
  assert(n != nullptr);
  if (n == nullptr) {
    status = U_ILLEGAL_ARGUMENT_ERROR;
    return false;
  }

  icu::UnicodeString dest;
  icu::UnicodeString src = icu::UnicodeString(str.data(), static_cast<int32_t>(str.length()));
  n->normalize(src, dest, status);
  // the next line here will assert
  if (UASSERT_SUCCESS(status)) {
    str.assign(dest.getBuffer(), dest.length());
  }
  return U_SUCCESS(status);
}

/**
 * Normalize a u32string inplace to NFD by converting to UTF-16, normalizing,
 * and converting back.
 *
 * @param str input/output string; only modified on success
 * @return false on failure
 */
bool normalize_nfd(std::u32string &str) {
  std::u16string utf16 = km::core::kmx::u32string_to_u16string(str);
  const bool ok = normalize_nfd(utf16);
  if (ok) {
    str = km::core::kmx::u16string_to_u32string(utf16);
  }
  return ok;
}

/**
 * Normalize a u16string inplace to NFD.
 *
 * @param str input/output string; only modified on success
 * @return false on failure, including failure to obtain the ICU NFD normalizer
 */
bool normalize_nfd(std::u16string &str) {
  UErrorCode status = U_ZERO_ERROR;
  const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status);
  // Asserts in debug builds; otherwise logs and stops here so that a null
  // normalizer is never passed on.
  if (!UASSERT_SUCCESS(status) || nfd == nullptr) {
    return false;
  }
  return normalize(nfd, str, status);
}

marker_entry::marker_entry(char32_t c) : ch(c), marker(LDML_MARKER_NO_INDEX), processed(false), end(true) {
}

Expand Down Expand Up @@ -146,7 +101,7 @@ bool normalize_nfd_markers_segment(std::u32string &str, marker_map &map, marker_
std::u32string str_unmarked = remove_markers(str, map, encoding);
/** original string, no markers, NFD */
std::u32string str_unmarked_nfd = str_unmarked;
if(!normalize_nfd(str_unmarked_nfd)) {
if(!km::core::util::normalize_nfd(str_unmarked_nfd)) {
return false; // normalize failed.
} else if (str_unmarked_nfd == str_unmarked) {
// Normalization produced no change when markers were removed.
Expand Down
19 changes: 0 additions & 19 deletions core/src/ldml/ldml_markers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,32 +71,13 @@ typedef std::deque<marker_entry> marker_map;
/** count number of non-end entries */
size_t count_markers(const marker_map &map);

/** Normalize a u32string inplace to NFD. @return false on failure */
bool normalize_nfd(std::u32string &str);
/** Normalize a u16string inplace to NFD. @return false on failure */
bool normalize_nfd(std::u16string &str);
/** Normalize a u32string inplace to NFD, retaining markers.
* @param markers will be populated with marker chars
* @return false on failure
**/
bool normalize_nfd_markers_segment(std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel);
bool normalize_nfd_markers(std::u32string &str, marker_encoding encoding = plain_sentinel);

// /** Normalize a u32string inplace to NFC, retaining markers.
// * @param markers will be populated with marker chars
// * @return false on failure
// **/
// bool normalize_nfd_markers_segment(std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel);
// bool normalize_nfd_markers_segment(std::u16string &str, marker_map &markers, marker_encoding encoding = plain_sentinel);
// inline bool normalize_nfc_markers(std::u32string &str, marker_encoding encoding = plain_sentinel);
// inline bool normalize_nfc_markers(std::u16string &str, marker_encoding encoding = plain_sentinel);

// /** Normalize a u32string inplace to NFC. @return false on failure */
// bool normalize_nfc(std::u32string &str);

// /** Normalize a u16string inplace to NFC. @return false on failure */
// bool normalize_nfc(std::u16string &str);

/** Remove markers and optionally note their glue characters in the map */
std::u32string remove_markers(const std::u32string &str, marker_map *markers = nullptr, marker_encoding encoding = plain_sentinel);

Expand Down
1 change: 0 additions & 1 deletion core/src/ldml/ldml_transforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
#include <string>
#include "kmx/kmx_xstring.h"
#include <assert.h>
#include "ldml_utils.hpp"

namespace km {
namespace core {
Expand Down
32 changes: 0 additions & 32 deletions core/src/ldml/ldml_utils.hpp

This file was deleted.

1 change: 1 addition & 0 deletions core/src/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ kmx_files = files(
'km_core_debug_api.cpp',
'km_core_processevent_api.cpp',
'jsonpp.cpp',
'util_normalize.cpp',
'ldml/ldml_processor.cpp',
'ldml/ldml_transforms.cpp',
'ldml/ldml_markers.cpp',
Expand Down
88 changes: 88 additions & 0 deletions core/src/util_normalize.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
Copyright: © SIL International.
Description: Normalization utilities
Create Date: 6 Jan 2024
Authors: Steven R. Loomis
*/

#include "util_normalize.hpp"

#include "core_icu.h"
#include "kmx/kmx_xstring.h"


namespace km {
namespace core {
namespace util {


/**
 * Internal function to normalize with a specified mode.
 *
 * Note that this function _does_ assert on failure (in debug builds), so
 * callers are not required to assert its return code. The return value is
 * provided so that callers can exit (such as making no change) if there was
 * a failure.
 *
 * Also note that "failure" here is something catastrophic: ICU not initialized,
 * or, more likely, some low memory situation. Does not fail on "bad" data.
 *
 * In builds where assert() is compiled out, a failed incoming status or a
 * null normalizer makes the function return false without touching `str`,
 * rather than dereferencing `n`.
 *
 * @param n      the ICU Normalizer to use
 * @param str    input/output string; only modified on success
 * @param status error code, must be initialized on input; set to
 *               U_ILLEGAL_ARGUMENT_ERROR if `n` is null
 * @return false if failure
 */
static bool normalize(const icu::Normalizer2 *n, std::u16string &str, UErrorCode &status) {
  // Asserts in debug builds; otherwise logs and bails out before using `n`.
  if (!UASSERT_SUCCESS(status)) {
    return false;
  }
  assert(n != nullptr);
  if (n == nullptr) {
    status = U_ILLEGAL_ARGUMENT_ERROR;
    return false;
  }

  icu::UnicodeString dest;
  icu::UnicodeString src = icu::UnicodeString(str.data(), static_cast<int32_t>(str.length()));
  n->normalize(src, dest, status);
  // the next line here will assert
  if (UASSERT_SUCCESS(status)) {
    str.assign(dest.getBuffer(), dest.length());
  }
  return U_SUCCESS(status);
}

/**
 * Normalize a u32string inplace to NFD by converting to UTF-16, normalizing,
 * and converting back.
 *
 * @param str input/output string; only modified on success
 * @return false on failure
 */
bool normalize_nfd(std::u32string &str) {
  std::u16string utf16 = km::core::kmx::u32string_to_u16string(str);
  const bool ok = km::core::util::normalize_nfd(utf16);
  if (ok) {
    str = km::core::kmx::u16string_to_u32string(utf16);
  }
  return ok;
}

/**
 * Normalize a u16string inplace to NFD.
 *
 * @param str input/output string; only modified on success
 * @return false on failure, including failure to obtain the ICU NFD normalizer
 */
bool normalize_nfd(std::u16string &str) {
  UErrorCode status = U_ZERO_ERROR;
  const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status);
  // Asserts in debug builds; otherwise logs and stops here so that a null
  // normalizer is never passed on.
  if (!UASSERT_SUCCESS(status) || nfd == nullptr) {
    return false;
  }
  return normalize(nfd, str, status);
}

/**
 * Normalize the input string to NFD using ICU, out of place.
 *
 * ICU failures assert in debug builds and are written to the debug log
 * (with the ICU error name) via UASSERT_SUCCESS, matching the in-place
 * overloads in this file.
 *
 * @param src  input text, passed to the icu::UnicodeString constructor
 * @param dst  receives the normalized text; left untouched on failure
 * @return     true on success, false if ICU reported an error
 */
bool normalize_nfd(km_core_cu const * src, std::u16string &dst) {
  UErrorCode icu_status = U_ZERO_ERROR;
  const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(icu_status);
  if (!UASSERT_SUCCESS(icu_status) || nfd == nullptr) {
    return false;
  }

  icu::UnicodeString udst;
  icu::UnicodeString usrc = icu::UnicodeString(src);
  nfd->normalize(usrc, udst, icu_status);
  if (!UASSERT_SUCCESS(icu_status)) {
    return false;
  }

  dst.assign(udst.getBuffer(), udst.length());
  return true;
}

}
}
}
28 changes: 28 additions & 0 deletions core/src/util_normalize.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/*
Copyright: © SIL International.
Description: Normalization and Regex utilities
Create Date: 23 May 2024
Authors: Steven R. Loomis
*/

#pragma once

#include <string>
#include "keyman_core.h"

namespace km {
namespace core {
namespace util {

/**
 * Normalize a u32string inplace to NFD.
 * @param str input/output string
 * @return false on failure
 */
bool normalize_nfd(std::u32string &str);

/**
 * Normalize a u16string inplace to NFD.
 * @param str input/output string
 * @return false on failure
 */
bool normalize_nfd(std::u16string &str);

/**
 * Normalize src to dst in NFD (out of place).
 * @param src input text
 * @param dst receives the normalized text
 * @return false on failure
 */
bool normalize_nfd(km_core_cu const * src, std::u16string &dst);

}
}
}
3 changes: 2 additions & 1 deletion core/tests/unit/ldml/ldml.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <test_color.h>

#include "keyman_core.h"
#include "util_normalize.hpp"

#include <kmx/kmx_xstring.h> // for surrogate pair macros

Expand Down Expand Up @@ -361,7 +362,7 @@ run_test(const km::core::path &source, const km::core::path &compiled, km::tests
} break;
case km::tests::LDML_ACTION_CHECK_EXPECTED: {
if (!normalization_disabled) {
assert(km::core::ldml::normalize_nfd(action.string)); // TODO-LDML: should be NFC
assert(km::core::util::normalize_nfd(action.string)); // TODO-LDML: should be NFC
}
std::cout << "- check expected" << std::endl;
std::cout << "expected : " << string_to_hex(action.string) << " [" << action.string << "]" << std::endl;
Expand Down
Loading
Loading