Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve macro handling of McBopomofoLM and expand test coverage #133

Merged
merged 6 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Engine/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ if (ENABLE_TEST)
add_executable(McBopomofoLMLibTest
AssociatedPhrasesV2Test.cpp
KeyValueBlobReaderTest.cpp
McBopomofoLMTest.cpp
MemoryMappedFileTest.cpp
ParselessLMTest.cpp
ParselessPhraseDBTest.cpp
Expand Down
5 changes: 5 additions & 0 deletions src/Engine/KeyValueBlobReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ class KeyValueBlobReader {
KeyValueBlobReader(const char* blob, size_t size)
: current_(blob), end_(blob + size) {}

KeyValueBlobReader(const KeyValueBlobReader&) = delete;
KeyValueBlobReader(KeyValueBlobReader&&) = delete;
KeyValueBlobReader& operator=(const KeyValueBlobReader&) = delete;
KeyValueBlobReader& operator=(KeyValueBlobReader&&) = delete;

// Parse the next key-value pair and return the state of the reader. If
// `out` is passed, out will be set to the produced key-value pair if there
// is one.
Expand Down
68 changes: 47 additions & 21 deletions src/Engine/McBopomofoLM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,8 @@

namespace McBopomofo {

McBopomofoLM::McBopomofoLM() {}

McBopomofoLM::~McBopomofoLM() {
languageModel_.close();
userPhrases_.close();
excludedPhrases_.close();
phraseReplacement_.close();
associatedPhrasesV2_.close();
}
// Prefix that marks a unigram value as a macro. Used by
// filterAndTransformUnigrams() to detect macros that are still unconverted
// after the macro converter has run.
static constexpr std::string_view kMacroPrefix = "MACRO@";
// Score compared against a unigram's score in the same check: a unigram is
// dropped as an unsupported macro only when its score equals this value AND
// its value still starts with kMacroPrefix.
static constexpr double kMacroScore = -8.0;

void McBopomofoLM::loadLanguageModel(const char* languageModelDataPath) {
if (languageModelDataPath) {
Expand Down Expand Up @@ -95,11 +88,11 @@ McBopomofoLM::getUnigrams(const std::string& key) {
if (excludedPhrases_.hasUnigrams(key)) {
std::vector<Formosa::Gramambular2::LanguageModel::Unigram>
excludedUnigrams = excludedPhrases_.getUnigrams(key);
transform(excludedUnigrams.begin(), excludedUnigrams.end(),
inserter(excludedValues, excludedValues.end()),
[](const Formosa::Gramambular2::LanguageModel::Unigram& u) {
return u.value();
});
std::transform(excludedUnigrams.begin(), excludedUnigrams.end(),
std::inserter(excludedValues, excludedValues.end()),
[](const Formosa::Gramambular2::LanguageModel::Unigram& u) {
return u.value();
});
}

if (userPhrases_.hasUnigrams(key)) {
Expand Down Expand Up @@ -177,7 +170,7 @@ bool McBopomofoLM::hasUnigrams(const std::string& key) {
return !getUnigrams(key).empty();
}

std::string McBopomofoLM::getReading(const std::string& value) {
std::string McBopomofoLM::getReading(const std::string& value) const {
std::vector<ParselessLM::FoundReading> foundReadings =
languageModel_.getReadings(value);
double topScore = std::numeric_limits<double>::lowest();
Expand All @@ -191,6 +184,12 @@ std::string McBopomofoLM::getReading(const std::string& value) {
return topValue;
}

// Returns the associated phrases for the given prefix value and its readings.
// This is a thin forwarder: the lookup is delegated unchanged to
// associatedPhrasesV2_.findPhrases().
std::vector<AssociatedPhrasesV2::Phrase> McBopomofoLM::findAssociatedPhrasesV2(
const std::string& prefixValue,
const std::vector<std::string>& prefixReadings) const {
return associatedPhrasesV2_.findPhrases(prefixValue, prefixReadings);
}

// Turns phrase replacement on or off by storing the flag in
// phraseReplacementEnabled_.
void McBopomofoLM::setPhraseReplacementEnabled(bool enabled) {
phraseReplacementEnabled_ = enabled;
}
Expand Down Expand Up @@ -246,11 +245,18 @@ McBopomofoLM::filterAndTransformUnigrams(
value = replacement;
}
}
if (macroConverter_) {
if (macroConverter_ != nullptr) {
std::string replacement = macroConverter_(value);
value = replacement;
}
if (externalConverterEnabled_ && externalConverter_) {

// Check if the string is an unsupported macro
if (unigram.score() == kMacroScore && value.size() > kMacroPrefix.size() &&
value.compare(0, kMacroPrefix.size(), kMacroPrefix) == 0) {
continue;
}

if (externalConverterEnabled_ && externalConverter_ != nullptr) {
std::string replacement = externalConverter_(value);
value = replacement;
}
Expand All @@ -262,10 +268,30 @@ McBopomofoLM::filterAndTransformUnigrams(
return results;
}

std::vector<AssociatedPhrasesV2::Phrase> McBopomofoLM::findAssociatedPhrasesV2(
const std::string& prefixValue,
const std::vector<std::string>& prefixReadings) const {
return associatedPhrasesV2_.findPhrases(prefixValue, prefixReadings);
// Loads the primary language model from an already-constructed phrase
// database (declared in the header as a testing aid for in-memory data).
// Ownership of `db` is transferred to languageModel_.
void McBopomofoLM::loadLanguageModel(std::unique_ptr<ParselessPhraseDB> db) {
// Close first so that calling this again replaces the current model.
languageModel_.close();
languageModel_.open(std::move(db));
}

// Loads the associated phrases from an already-constructed phrase database
// (declared in the header as a testing aid for in-memory data). Ownership of
// `db` is transferred to associatedPhrasesV2_.
void McBopomofoLM::loadAssociatedPhrasesV2(
std::unique_ptr<ParselessPhraseDB> db) {
// Close first so that calling this again replaces the current data.
associatedPhrasesV2_.close();
associatedPhrasesV2_.open(std::move(db));
}

// Loads the user phrases from the in-memory buffer `data` of size `length`
// (declared in the header as a testing aid). Only userPhrases_ is touched;
// excluded phrases are loaded separately via loadExcludedPhrases().
void McBopomofoLM::loadUserPhrases(const char* data, size_t length) {
// Close first so that calling this again replaces the current phrases.
userPhrases_.close();
userPhrases_.load(data, length);
}

// Loads the excluded phrases from the in-memory buffer `data` of size
// `length` (declared in the header as a testing aid).
void McBopomofoLM::loadExcludedPhrases(const char* data, size_t length) {
// Close first so that calling this again replaces the current phrases.
excludedPhrases_.close();
excludedPhrases_.load(data, length);
}

// Loads the phrase replacement map from the in-memory buffer `data` of size
// `length` (declared in the header as a testing aid).
void McBopomofoLM::loadPhraseReplacementMap(const char* data, size_t length) {
// Close first so that calling this again replaces the current map.
phraseReplacement_.close();
phraseReplacement_.load(data, length);
}

} // namespace McBopomofo
123 changes: 56 additions & 67 deletions src/Engine/McBopomofoLM.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
#ifndef SRC_ENGINE_MCBOPOMOFOLM_H_
#define SRC_ENGINE_MCBOPOMOFOLM_H_

#include <cstdio>
#include <functional>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
Expand All @@ -38,99 +38,85 @@

namespace McBopomofo {

/// McBopomofoLM is a facade for managing a set of models including
/// the input method language model, user phrases and excluded phrases.
///
/// It is the primary model class that the input controller and grammar builder
/// of McBopomofo talks to. When the grammar builder starts to build a sentence
/// from a series of BPMF readings, it passes the readings to the model to see
/// if there are valid unigrams, and use returned unigrams to produce the final
/// results.
///
/// McBopomofoLM combine and transform the unigrams from the primary language
/// model and user phrases. The process is
///
/// 1) Get the original unigrams.
/// 2) Drop the unigrams whose value is contained in the exclusion map.
/// 3) Replace the values of the unigrams using the phrase replacement map.
/// 4) Replace the values of the unigrams using an external converter lambda.
/// 5) Drop the duplicated phrases.
///
/// The controller can ask the model to load the primary input method language
/// model while launching and to load the user phrases anytime if the custom
/// files are modified. It does not keep the reference of the data paths but
/// you have to pass the paths when you ask it to do loading.
// McBopomofoLM manages the input method's language models and performs text
// and macro conversions.
//
// When the reading grid requests unigrams from McBopomofoLM, the LM combines
// and transforms the unigrams from the primary language model and user phrases.
// The process is
//
// 1. Get the original unigrams.
// 2. Drop the unigrams from the user-exclusion list.
// 3. Replace the unigram values specified by the user phrase replacement map.
// 4. Transform the unigram values with an external converter, if supplied.
// 5. Remove any duplicates.
//
// McBopomofoLM itself is not responsible for reloading custom models (user
// phrases, excluded phrases, and replacement map). The LM's owner, usually the
// input method controller, needs to take care of checking for updates and
// telling McBopomofoLM to reload as needed.
class McBopomofoLM : public Formosa::Gramambular2::LanguageModel {
public:
McBopomofoLM();
~McBopomofoLM() override;
McBopomofoLM() = default;

/// Asks to load the primary language model at the given path.
/// @param languageModelPath The path of the language model.
void loadLanguageModel(const char* languageModelDataPath);
/// If the data model is already loaded.
bool isDataModelLoaded();
McBopomofoLM(const McBopomofoLM&) = delete;
McBopomofoLM(McBopomofoLM&&) = delete;
McBopomofoLM& operator=(const McBopomofoLM&) = delete;
McBopomofoLM& operator=(McBopomofoLM&&) = delete;

/// Asks to load the associated phrases at the given path.
/// @param associatedPhrasesPath The path of the associated phrases.
void loadAssociatedPhrases(const char* associatedPhrasesPath);
// Loads (or reloads, if already loaded) the primary language model data file.
void loadLanguageModel(const char* languageModelDataPath);

/// If the associated phrases already loaded.
bool isAssociatedPhrasesLoaded();
bool isDataModelLoaded();

// Loads (or reloads if already loaded) the associated phrases data file.
void loadAssociatedPhrasesV2(const char* associatedPhrasesPath);

/// Asks to load the user phrases and excluded phrases at the given path.
/// @param userPhrasesPath The path of user phrases.
/// @param excludedPhrasesPath The path of excluded phrases.
// Loads (or reloads if already loaded) both the user phrases and the excluded
// phrases files. If either argument is nullptr, the corresponding file will
// not be loaded or reloaded.
void loadUserPhrases(const char* userPhrasesDataPath,
const char* excludedPhrasesDataPath);
/// Asks to load th phrase replacement table at the given path.
/// @param phraseReplacementPath The path of the phrase replacement table.

// Loads (or reloads if already loaded) the phrase replacement mapping file.
void loadPhraseReplacementMap(const char* phraseReplacementPath);

/// Returns a list of available unigram for the given key.
/// @param key A string represents the BPMF reading or a symbol key. For
/// example, it you pass "ㄇㄚ", it returns "嗎", "媽", and so on.
// Returns a list of unigrams for the reading. For example, if the reading is
// "ㄇㄚ", the return may be [unigram("嗎"), unigram("媽")] and so on.
std::vector<Formosa::Gramambular2::LanguageModel::Unigram> getUnigrams(
const std::string& key) override;
/// If the model has unigrams for the given key.
/// @param key The key.

bool hasUnigrams(const std::string& key) override;

/// Enables or disables phrase replacement.
std::string getReading(const std::string& value) const;

std::vector<AssociatedPhrasesV2::Phrase> findAssociatedPhrasesV2(
const std::string& prefixValue,
const std::vector<std::string>& prefixReadings) const;

void setPhraseReplacementEnabled(bool enabled);
/// If phrase replacement is enabled or not.
bool phraseReplacementEnabled() const;

/// Enables or disables the external converter.
void setExternalConverterEnabled(bool enabled);
/// If the external converted is enabled or not.
bool externalConverterEnabled() const;
/// Sets a lambda to let the values of unigrams could be converted by it.
void setExternalConverter(
std::function<std::string(const std::string&)> externalConverter);

/// Sets a lambda to convert the macro to a string.
void setMacroConverter(
std::function<std::string(const std::string&)> macroConverter);
std::string convertMacro(const std::string& input);

std::vector<AssociatedPhrasesV2::Phrase> findAssociatedPhrasesV2(
const std::string& prefixValue,
const std::vector<std::string>& prefixReadings) const;

/// Returns the top-scored reading from the base model, given the value.
std::string getReading(const std::string& value);
// Methods to allow loading in-memory data for testing purposes.
void loadLanguageModel(std::unique_ptr<ParselessPhraseDB> db);
void loadAssociatedPhrasesV2(std::unique_ptr<ParselessPhraseDB> db);
void loadUserPhrases(const char* data, size_t length);
void loadExcludedPhrases(const char* data, size_t length);
void loadPhraseReplacementMap(const char* data, size_t length);

protected:
/// Filters and converts the input unigrams and return a new list of unigrams.
///
/// @param unigrams The unigrams to be processed.
/// @param excludedValues The values to excluded unigrams.
/// @param insertedValues The values for unigrams already in the results.
/// It helps to prevent duplicated unigrams. Please note that the method
/// has a side effect that it inserts values to `insertedValues`.
// Filters and converts the input unigrams and returns a new list of unigrams.
// Unigrams whose values are found in `excludedValues` are removed, and the
// kept values will be inserted to the `insertedValues` set.
std::vector<Formosa::Gramambular2::LanguageModel::Unigram>
filterAndTransformUnigrams(
const std::vector<Formosa::Gramambular2::LanguageModel::Unigram> unigrams,
Expand All @@ -143,11 +129,14 @@ class McBopomofoLM : public Formosa::Gramambular2::LanguageModel {
PhraseReplacementMap phraseReplacement_;
AssociatedPhrasesV2 associatedPhrasesV2_;

std::function<std::string(const std::string&)> macroConverter_;
bool phraseReplacementEnabled_;
bool externalConverterEnabled_;
bool phraseReplacementEnabled_ = false;

bool externalConverterEnabled_ = false;
std::function<std::string(const std::string&)> externalConverter_;

std::function<std::string(const std::string&)> macroConverter_;
};

} // namespace McBopomofo

#endif // SRC_ENGINE_MCBOPOMOFOLM_H_
Loading