diff --git a/src/Engine/CMakeLists.txt b/src/Engine/CMakeLists.txt index edaa1c2..454ae88 100644 --- a/src/Engine/CMakeLists.txt +++ b/src/Engine/CMakeLists.txt @@ -43,6 +43,7 @@ if (ENABLE_TEST) add_executable(McBopomofoLMLibTest AssociatedPhrasesV2Test.cpp KeyValueBlobReaderTest.cpp + McBopomofoLMTest.cpp MemoryMappedFileTest.cpp ParselessLMTest.cpp ParselessPhraseDBTest.cpp diff --git a/src/Engine/KeyValueBlobReader.h b/src/Engine/KeyValueBlobReader.h index 2f5a77c..8421e96 100644 --- a/src/Engine/KeyValueBlobReader.h +++ b/src/Engine/KeyValueBlobReader.h @@ -74,6 +74,11 @@ class KeyValueBlobReader { KeyValueBlobReader(const char* blob, size_t size) : current_(blob), end_(blob + size) {} + KeyValueBlobReader(const KeyValueBlobReader&) = delete; + KeyValueBlobReader(KeyValueBlobReader&&) = delete; + KeyValueBlobReader& operator=(const KeyValueBlobReader&) = delete; + KeyValueBlobReader& operator=(KeyValueBlobReader&&) = delete; + // Parse the next key-value pair and return the state of the reader. If // `out` is passed, out will be set to the produced key-value pair if there // is one. diff --git a/src/Engine/McBopomofoLM.cpp b/src/Engine/McBopomofoLM.cpp index e577bb9..8cb1edd 100644 --- a/src/Engine/McBopomofoLM.cpp +++ b/src/Engine/McBopomofoLM.cpp @@ -33,15 +33,8 @@ namespace McBopomofo { -McBopomofoLM::McBopomofoLM() {} - -McBopomofoLM::~McBopomofoLM() { - languageModel_.close(); - userPhrases_.close(); - excludedPhrases_.close(); - phraseReplacement_.close(); - associatedPhrasesV2_.close(); -} +static constexpr std::string_view kMacroPrefix = "MACRO@"; +static constexpr double kMacroScore = -8.0; void McBopomofoLM::loadLanguageModel(const char* languageModelDataPath) { if (languageModelDataPath) { @@ -95,11 +88,11 @@ McBopomofoLM::getUnigrams(const std::string& key) { if (excludedPhrases_.hasUnigrams(key)) { std::vector excludedUnigrams = excludedPhrases_.getUnigrams(key); - transform(excludedUnigrams.begin(), excludedUnigrams.end(), - inserter(excludedValues, excludedValues.end()), - [](const Formosa::Gramambular2::LanguageModel::Unigram& u) { - return u.value(); - }); + std::transform(excludedUnigrams.begin(), excludedUnigrams.end(), + std::inserter(excludedValues, excludedValues.end()), + [](const Formosa::Gramambular2::LanguageModel::Unigram& u) { + return u.value(); + }); } if (userPhrases_.hasUnigrams(key)) { @@ -177,7 +170,7 @@ bool McBopomofoLM::hasUnigrams(const std::string& key) { return !getUnigrams(key).empty(); } -std::string McBopomofoLM::getReading(const std::string& value) { +std::string McBopomofoLM::getReading(const std::string& value) const { std::vector foundReadings = languageModel_.getReadings(value); double topScore = std::numeric_limits::lowest(); @@ -191,6 +184,12 @@ std::string McBopomofoLM::getReading(const std::string& value) { return topValue; } +std::vector McBopomofoLM::findAssociatedPhrasesV2( + const std::string& prefixValue, + const std::vector& prefixReadings) const { + return associatedPhrasesV2_.findPhrases(prefixValue, prefixReadings); +} + void McBopomofoLM::setPhraseReplacementEnabled(bool enabled) { phraseReplacementEnabled_ = enabled; } @@ -246,11 +245,18 @@ McBopomofoLM::filterAndTransformUnigrams( value = replacement; } } - if (macroConverter_) { + if (macroConverter_ != nullptr) { std::string replacement = macroConverter_(value); value = replacement; } - if (externalConverterEnabled_ && externalConverter_) { + + // Check if the string is an unsupported macro + if (unigram.score() == kMacroScore && value.size() > kMacroPrefix.size() && + value.compare(0, kMacroPrefix.size(), kMacroPrefix) == 0) { + continue; + } + + if (externalConverterEnabled_ && externalConverter_ != nullptr) { std::string replacement = externalConverter_(value); value = replacement; } @@ -262,10 +268,30 @@ McBopomofoLM::filterAndTransformUnigrams( return results; } -std::vector McBopomofoLM::findAssociatedPhrasesV2( - const std::string& prefixValue, - const std::vector& prefixReadings) const { - return associatedPhrasesV2_.findPhrases(prefixValue, prefixReadings); +void McBopomofoLM::loadLanguageModel(std::unique_ptr db) { + languageModel_.close(); + languageModel_.open(std::move(db)); +} + +void McBopomofoLM::loadAssociatedPhrasesV2( + std::unique_ptr db) { + associatedPhrasesV2_.close(); + associatedPhrasesV2_.open(std::move(db)); +} + +void McBopomofoLM::loadUserPhrases(const char* data, size_t length) { + userPhrases_.close(); + userPhrases_.load(data, length); +} + +void McBopomofoLM::loadExcludedPhrases(const char* data, size_t length) { + excludedPhrases_.close(); + excludedPhrases_.load(data, length); +} + +void McBopomofoLM::loadPhraseReplacementMap(const char* data, size_t length) { + phraseReplacement_.close(); + phraseReplacement_.load(data, length); } } // namespace McBopomofo diff --git a/src/Engine/McBopomofoLM.h b/src/Engine/McBopomofoLM.h index 32ef266..1492628 100644 --- a/src/Engine/McBopomofoLM.h +++ b/src/Engine/McBopomofoLM.h @@ -24,8 +24,8 @@ #ifndef SRC_ENGINE_MCBOPOMOFOLM_H_ #define SRC_ENGINE_MCBOPOMOFOLM_H_ -#include #include +#include #include #include #include @@ -38,99 +38,85 @@ namespace McBopomofo { -/// McBopomofoLM is a facade for managing a set of models including -/// the input method language model, user phrases and excluded phrases. -/// -/// It is the primary model class that the input controller and grammar builder -/// of McBopomofo talks to. When the grammar builder starts to build a sentence -/// from a series of BPMF readings, it passes the readings to the model to see -/// if there are valid unigrams, and use returned unigrams to produce the final -/// results. -/// -/// McBopomofoLM combine and transform the unigrams from the primary language -/// model and user phrases. The process is -/// -/// 1) Get the original unigrams. -/// 2) Drop the unigrams whose value is contained in the exclusion map. -/// 3) Replace the values of the unigrams using the phrase replacement map. -/// 4) Replace the values of the unigrams using an external converter lambda. -/// 5) Drop the duplicated phrases. -/// -/// The controller can ask the model to load the primary input method language -/// model while launching and to load the user phrases anytime if the custom -/// files are modified. It does not keep the reference of the data paths but -/// you have to pass the paths when you ask it to do loading. +// McBopomofoLM manages the input method's language models and performs text +// and macro conversions. +// +// When the reading grid requests unigrams from McBopomofoLM, the LM combines +// and transforms the unigrams from the primary language model and user phrases. +// The process is +// +// 1. Get the original unigrams. +// 2. Drop the unigrams from the user-exclusion list. +// 3. Replace the unigram values specified by the user phrase replacement map. +// 4. Transform the unigram values with an external converter, if supplied. +// 5. Remove any duplicates. +// +// McBopomofoLM itself is not responsible for reloading custom models (user +// phrases, excluded phrases, and replacement map). The LM's owner, usually the +// input method controller, needs to take care of checking for updates and +// telling McBopomofoLM to reload as needed. class McBopomofoLM : public Formosa::Gramambular2::LanguageModel { public: - McBopomofoLM(); - ~McBopomofoLM() override; + McBopomofoLM() = default; - /// Asks to load the primary language model at the given path. - /// @param languageModelPath The path of the language model. - void loadLanguageModel(const char* languageModelDataPath); - /// If the data model is already loaded. - bool isDataModelLoaded(); + McBopomofoLM(const McBopomofoLM&) = delete; + McBopomofoLM(McBopomofoLM&&) = delete; + McBopomofoLM& operator=(const McBopomofoLM&) = delete; + McBopomofoLM& operator=(McBopomofoLM&&) = delete; - /// Asks to load the associated phrases at the given path. - /// @param associatedPhrasesPath The path of the associated phrases. - void loadAssociatedPhrases(const char* associatedPhrasesPath); + // Loads (or reloads, if already loaded) the primary language model data file. + void loadLanguageModel(const char* languageModelDataPath); - /// If the associated phrases already loaded. - bool isAssociatedPhrasesLoaded(); + bool isDataModelLoaded(); + // Loads (or reloads if already loaded) the associated phrases data file. void loadAssociatedPhrasesV2(const char* associatedPhrasesPath); - /// Asks to load the user phrases and excluded phrases at the given path. - /// @param userPhrasesPath The path of user phrases. - /// @param excludedPhrasesPath The path of excluded phrases. + // Loads (or reloads if already loaded) both the user phrases and the excluded + // phrases files. If one argument is passed a nullptr, that file will not + // be loaded or reloaded. void loadUserPhrases(const char* userPhrasesDataPath, const char* excludedPhrasesDataPath); - /// Asks to load th phrase replacement table at the given path. - /// @param phraseReplacementPath The path of the phrase replacement table. + + // Loads (or reloads if already loaded) the phrase replacement mapping file. void loadPhraseReplacementMap(const char* phraseReplacementPath); - /// Returns a list of available unigram for the given key. - /// @param key A string represents the BPMF reading or a symbol key. For - /// example, it you pass "ㄇㄚ", it returns "嗎", "媽", and so on. + // Returns a list of unigrams for the reading. For example, if the reading is + // "ㄇㄚ", the return may be [unigram("嗎"), unigram("媽") and so on. std::vector getUnigrams( const std::string& key) override; - /// If the model has unigrams for the given key. - /// @param key The key. + bool hasUnigrams(const std::string& key) override; - /// Enables or disables phrase replacement. + std::string getReading(const std::string& value) const; + + std::vector findAssociatedPhrasesV2( + const std::string& prefixValue, + const std::vector& prefixReadings) const; + void setPhraseReplacementEnabled(bool enabled); - /// If phrase replacement is enabled or not. bool phraseReplacementEnabled() const; - /// Enables or disables the external converter. void setExternalConverterEnabled(bool enabled); - /// If the external converted is enabled or not. bool externalConverterEnabled() const; - /// Sets a lambda to let the values of unigrams could be converted by it. void setExternalConverter( std::function externalConverter); - /// Sets a lambda to convert the macro to a string. void setMacroConverter( std::function macroConverter); std::string convertMacro(const std::string& input); - std::vector findAssociatedPhrasesV2( - const std::string& prefixValue, - const std::vector& prefixReadings) const; - - /// Returns the top-scored reading from the base model, given the value. - std::string getReading(const std::string& value); + // Methods to allow loading in-memory data for testing purposes. + void loadLanguageModel(std::unique_ptr db); + void loadAssociatedPhrasesV2(std::unique_ptr db); + void loadUserPhrases(const char* data, size_t length); + void loadExcludedPhrases(const char* data, size_t length); + void loadPhraseReplacementMap(const char* data, size_t length); protected: - /// Filters and converts the input unigrams and return a new list of unigrams. - /// - /// @param unigrams The unigrams to be processed. - /// @param excludedValues The values to excluded unigrams. - /// @param insertedValues The values for unigrams already in the results. - /// It helps to prevent duplicated unigrams. Please note that the method - /// has a side effect that it inserts values to `insertedValues`. + // Filters and converts the input unigrams and returns a new list of unigrams. + // Unigrams whose values are found in `excludedValues` are removed, and the + // kept values will be inserted to the `insertedValues` set. std::vector filterAndTransformUnigrams( const std::vector unigrams, @@ -143,11 +129,14 @@ class McBopomofoLM : public Formosa::Gramambular2::LanguageModel { PhraseReplacementMap phraseReplacement_; AssociatedPhrasesV2 associatedPhrasesV2_; - std::function macroConverter_; - bool phraseReplacementEnabled_; - bool externalConverterEnabled_; + bool phraseReplacementEnabled_ = false; + + bool externalConverterEnabled_ = false; std::function externalConverter_; + + std::function macroConverter_; }; + } // namespace McBopomofo #endif // SRC_ENGINE_MCBOPOMOFOLM_H_ diff --git a/src/Engine/McBopomofoLMTest.cpp b/src/Engine/McBopomofoLMTest.cpp new file mode 100644 index 0000000..cc38b11 --- /dev/null +++ b/src/Engine/McBopomofoLMTest.cpp @@ -0,0 +1,286 @@ +// Copyright (c) 2024 and onwards The McBopomofo Authors. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include +#include + +#include "McBopomofoLM.h" +#include "gtest/gtest.h" + +namespace McBopomofo { + +constexpr char kPrimaryLMData[] = R"( +# format org.openvanilla.mcbopomofo.sorted +ㄇㄧㄥˊ 明 -3.07936356 +ㄇㄧㄥˊ 名 -3.12166252 +ㄇㄧㄥˊ 銘 -4.43019121 +ㄇㄧㄥˊ-ㄘˊ 名詞 -4.61364867 +ㄇㄧㄥˊ-ㄘˋ 名次 -5.47446950 +ㄉㄨㄥˋ 動 -2.83459585 +ㄉㄨㄥˋ 洞 -4.31757780 +ㄉㄨㄥˋ-ㄗㄨㄛˋ 動作 -4.17449149 +ㄐㄧㄣ-ㄊㄧㄢ 今天 -3.28959497 +ㄐㄧㄣ-ㄊㄧㄢ MACRO@DATE_TODAY_SHORT -8 +ㄐㄧㄣ-ㄊㄧㄢ MACRO@DATE_TODAY_MEDIUM -8 +ㄔㄥˊ-ㄕˋ 城市 -3.98856498 +ㄔㄥˊ-ㄕˋ 程式 -4.07624939 +ㄔㄥˊ-ㄕˋ 成事 -5.88664994 +ㄙㄜˋ-ㄍㄨˇ 澀谷 -6.78973993 +ㄙㄜˋ-ㄍㄨˇ 渋谷 -6.78973993 +)"; + +constexpr char kAssociatedPhrasesV2Data[] = R"( +# format org.openvanilla.mcbopomofo.sorted +名-ㄇㄧㄥˊ-下-ㄒㄧㄚˋ -5.7106 +名-ㄇㄧㄥˊ-不-ㄅㄨˊ-見-ㄐㄧㄢˋ-經-ㄐㄧㄥ-傳-ㄓㄨㄢˋ -5.9904 +)"; + +constexpr char kUserPhrasesData[] = R"( +茗 ㄇㄧㄥˊ +丼 ㄉㄨㄥˋ +名刺 ㄇㄧㄥˊ-ㄘˋ +程式 ㄔㄥˊ-ㄕˋ +)"; + +constexpr char kExcludedPhrasesData[] = R"( +動作 ㄉㄨㄥˋ-ㄗㄨㄛˋ +)"; + +constexpr char kPhreaseReplacementMapData[] = R"( +動作 动作 +澀谷 渋谷 +)"; + +TEST(McBopomofoLMTest, PrimaryLanguageModel) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + + EXPECT_TRUE(lm.hasUnigrams("ㄇㄧㄥˊ-ㄘˊ")); + EXPECT_FALSE(lm.hasUnigrams("ㄉㄨㄥˋ-ㄘˊ")); + auto unigrams = lm.getUnigrams("ㄇㄧㄥˊ-ㄘˊ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "名詞"); + EXPECT_LT(unigrams[0].score(), 0); +} + +TEST(McBopomofoLMTest, AssociatedPhrasesV2) { + McBopomofoLM lm; + auto db = std::make_unique( + kAssociatedPhrasesV2Data, sizeof(kAssociatedPhrasesV2Data)); + lm.loadAssociatedPhrasesV2(std::move(db)); + + auto phrases = lm.findAssociatedPhrasesV2("名", {"ㄇㄧㄥˊ"}); + EXPECT_FALSE(phrases.empty()); + EXPECT_EQ(phrases[0].value, "名下"); + EXPECT_EQ(phrases[0].readings, + (std::vector{"ㄇㄧㄥˊ", "ㄒㄧㄚˋ"})); + + EXPECT_TRUE(lm.findAssociatedPhrasesV2("銘", {"ㄇㄧㄥˊ"}).empty()); +} + +TEST(McBopomofoLMTest, UserPhrases) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + lm.loadUserPhrases(kUserPhrasesData, sizeof(kUserPhrasesData)); + + auto unigrams = lm.getUnigrams("ㄇㄧㄥˊ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "茗"); +} + +TEST(McBopomofoLMTest, ExcludedPhrases) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + + auto unigrams = lm.getUnigrams("ㄉㄨㄥˋ-ㄗㄨㄛˋ"); + EXPECT_FALSE(unigrams.empty()); + + lm.loadExcludedPhrases(kExcludedPhrasesData, sizeof(kExcludedPhrasesData)); + unigrams = lm.getUnigrams("ㄉㄨㄥˋ-ㄗㄨㄛˋ"); + EXPECT_TRUE(unigrams.empty()); +} + +TEST(McBopomofoLMTest, PhraseReplacementMap) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + lm.loadPhraseReplacementMap(kPhreaseReplacementMapData, + sizeof(kPhreaseReplacementMapData)); + auto unigrams = lm.getUnigrams("ㄉㄨㄥˋ-ㄗㄨㄛˋ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "動作"); + + lm.setPhraseReplacementEnabled(true); + unigrams = lm.getUnigrams("ㄉㄨㄥˋ-ㄗㄨㄛˋ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "动作"); +} + +TEST(McBopomofoLMTest, PhraseReplacementMapDeduplicates) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + lm.loadPhraseReplacementMap(kPhreaseReplacementMapData, + sizeof(kPhreaseReplacementMapData)); + auto unigrams = lm.getUnigrams("ㄙㄜˋ-ㄍㄨˇ"); + EXPECT_EQ(unigrams.size(), 2); + + lm.setPhraseReplacementEnabled(true); + unigrams = lm.getUnigrams("ㄙㄜˋ-ㄍㄨˇ"); + ASSERT_EQ(unigrams.size(), 1); + EXPECT_EQ(unigrams[0].value(), "渋谷"); +} + +TEST(McBopomofoLMTest, UserPhrasesOverrideDefaultLanguageModelPhrases) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + auto unigrams = lm.getUnigrams("ㄔㄥˊ-ㄕˋ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "城市"); + + lm.loadUserPhrases(kUserPhrasesData, sizeof(kUserPhrasesData)); + unigrams = lm.getUnigrams("ㄔㄥˊ-ㄕˋ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "程式"); +} + +TEST(McBopomofoLMTest, MonoSyllableUserPhrasesNeedRewrite) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + lm.loadUserPhrases(kUserPhrasesData, sizeof(kUserPhrasesData)); + + auto unigrams = lm.getUnigrams("ㄉㄨㄥˋ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "丼"); + // This user unigram's score is recomputed and must be < kUserUnigramScore. + EXPECT_LT(unigrams[0].score(), UserPhrasesLM::kUserUnigramScore); + EXPECT_GT(unigrams[0].score(), unigrams[1].score()); + // The delta between the two should be minuscule. + EXPECT_LT(std::abs(unigrams[0].score() - unigrams[1].score()), 0.000001); +} + +TEST(McBopomofoLMTest, MultipleSyllableUserPhrasesNeedNoRewrite) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + lm.loadUserPhrases(kUserPhrasesData, sizeof(kUserPhrasesData)); + + auto unigrams = lm.getUnigrams("ㄇㄧㄥˊ-ㄘˋ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "名刺"); + // This user unigram's score is not recomputed, so is still kUserUnigramScore. + EXPECT_EQ(unigrams[0].score(), UserPhrasesLM::kUserUnigramScore); + EXPECT_EQ(unigrams[1].value(), "名次"); + EXPECT_LT(unigrams[1].score(), 0); +} + +TEST(McBopomofoLMTest, ExternalConverterWhenNotSetThenEnablingItIsNoOp) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + lm.setExternalConverterEnabled(true); + + auto unigrams = lm.getUnigrams("ㄇㄧㄥˊ-ㄘˊ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "名詞"); +} + +TEST(McBopomofoLMTest, ExternalConverter) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + + auto unigrams = lm.getUnigrams("ㄇㄧㄥˊ-ㄘˊ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "名詞"); + + lm.setExternalConverterEnabled(true); + lm.setExternalConverter([](const auto& value) { return value + "!"; }); + + unigrams = lm.getUnigrams("ㄇㄧㄥˊ-ㄘˊ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "名詞!"); +} + +TEST(McBopomofoLMTest, ExternalConverterConversionResultsAreDeduplicated) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + + auto unigrams = lm.getUnigrams("ㄇㄧㄥˊ"); + ASSERT_GT(unigrams.size(), 1); + + lm.setExternalConverterEnabled(true); + lm.setExternalConverter([](const auto&) { return "!"; }); + + unigrams = lm.getUnigrams("ㄇㄧㄥˊ"); + ASSERT_EQ(unigrams.size(), 1); + EXPECT_EQ(unigrams[0].value(), "!"); +} + +TEST(McBopomofoLMTest, DefaultMacroConverterIsNoOp) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + + ASSERT_EQ(lm.convertMacro("MACRO@DATE_TODAY_SHORT"), + "MACRO@DATE_TODAY_SHORT"); +} + +TEST(McBopomofoLMTest, MacroValueNotRecognizedByMacroConverterIsFiltered) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + + lm.setMacroConverter([](const std::string& macro) { + if (macro == "MACRO@DATE_TODAY_SHORT") { + return std::string("6/10/21"); + } + return macro; + }); + + auto unigrams = lm.getUnigrams("ㄐㄧㄣ-ㄊㄧㄢ"); + ASSERT_EQ(unigrams.size(), 2); // only one macro is supported + EXPECT_EQ(unigrams[0].value(), "今天"); + EXPECT_EQ(unigrams[1].value(), "6/10/21"); +} + +} // namespace McBopomofo diff --git a/src/Engine/MemoryMappedFile.cpp b/src/Engine/MemoryMappedFile.cpp index e38e617..d6e8952 100644 --- a/src/Engine/MemoryMappedFile.cpp +++ b/src/Engine/MemoryMappedFile.cpp @@ -28,8 +28,24 @@ #include #include +#include + namespace McBopomofo { +MemoryMappedFile::MemoryMappedFile(MemoryMappedFile&& other) noexcept + : fd_(std::exchange(other.fd_, -1)), + data_(std::exchange(other.data_, nullptr)), + length_(std::exchange(other.length_, -1)) {} + +MemoryMappedFile& MemoryMappedFile::operator=( + MemoryMappedFile&& other) noexcept { + close(); + fd_ = std::exchange(other.fd_, -1); + data_ = std::exchange(other.data_, nullptr); + length_ = std::exchange(other.length_, 0); + return *this; +} + MemoryMappedFile::~MemoryMappedFile() { close(); } bool MemoryMappedFile::open(const char* path) { diff --git a/src/Engine/MemoryMappedFile.h b/src/Engine/MemoryMappedFile.h index fccb400..daf22eb 100644 --- a/src/Engine/MemoryMappedFile.h +++ b/src/Engine/MemoryMappedFile.h @@ -28,9 +28,22 @@ namespace McBopomofo { -// A wrapper for managing a memory-mapped file. Access is read-only. +// A wrapper for managing a memory-mapped file. +// +// On POSIX systems, we obtain a readable (PROT_READ) shared page (MAP_SHARED), +// and *file content* changes are reflected in the mapped memory. This class +// does not track the underlying file: it is up to the user of this class to +// decide what to do when the underlying file gets updated, resized, or removed. class MemoryMappedFile { public: + MemoryMappedFile() = default; + MemoryMappedFile(MemoryMappedFile&& other) noexcept; + MemoryMappedFile& operator=(MemoryMappedFile&& other) noexcept; + + // Forbids copying. + MemoryMappedFile& operator=(const MemoryMappedFile&) = delete; + MemoryMappedFile(const MemoryMappedFile&) = delete; + ~MemoryMappedFile(); bool open(const char* path); void close(); diff --git a/src/Engine/MemoryMappedFileTest.cpp b/src/Engine/MemoryMappedFileTest.cpp index 3ebac1b..a59f03e 100644 --- a/src/Engine/MemoryMappedFileTest.cpp +++ b/src/Engine/MemoryMappedFileTest.cpp @@ -69,6 +69,8 @@ TEST(MemoryMappedFileTest, BasicFunctionalities) { out.close(); MemoryMappedFile mf; + ASSERT_EQ(mf.length(), 0); + ASSERT_EQ(mf.data(), nullptr); bool open_result = mf.open(tmp_file_path.c_str()); ASSERT_TRUE(open_result); @@ -76,7 +78,6 @@ TEST(MemoryMappedFileTest, BasicFunctionalities) { EXPECT_TRUE(mf.data() != nullptr); EXPECT_EQ(memcmp(mf.data(), buf, kBufSize), 0); - delete[] buf; mf.close(); EXPECT_EQ(mf.length(), 0); EXPECT_EQ(mf.data(), nullptr); @@ -84,14 +85,41 @@ TEST(MemoryMappedFileTest, BasicFunctionalities) { // Should be a no-op. mf.close(); + MemoryMappedFile mf2; + open_result = mf2.open(tmp_file_path.c_str()); + EXPECT_TRUE(open_result); + EXPECT_TRUE(mf2.data() != nullptr); + + MemoryMappedFile mf3(std::move(mf2)); + EXPECT_TRUE(mf2.data() == nullptr); + EXPECT_TRUE(mf3.data() != nullptr); + + MemoryMappedFile mf4 = std::move(mf3); + EXPECT_TRUE(mf3.data() == nullptr); + EXPECT_TRUE(mf4.data() != nullptr); + + // Flip a byte of the underlying file. The map should reflect that. + FILE* f = fopen(tmp_file_path.c_str(), "r+b"); + EXPECT_NE(f, nullptr); + EXPECT_NE(fputc(~buf[0], f), EOF); + EXPECT_EQ(fclose(f), 0); + + EXPECT_NE(memcmp(mf4.data(), buf, kBufSize), 0); + // The rest of the data should be the same. + EXPECT_EQ(memcmp(mf4.data() + 1, buf + 1, kBufSize - 1), 0); + + mf4.close(); + EXPECT_TRUE(mf4.data() == nullptr); + std::filesystem::remove(tmp_file_path); + delete[] buf; // Opening a non-existence file. - MemoryMappedFile mf2; - open_result = mf2.open(tmp_file_path.c_str()); + MemoryMappedFile mf5; + open_result = mf5.open(tmp_file_path.c_str()); EXPECT_FALSE(open_result); - EXPECT_EQ(mf2.length(), 0); - EXPECT_EQ(mf2.data(), nullptr); + EXPECT_EQ(mf5.length(), 0); + EXPECT_EQ(mf5.data(), nullptr); } } // namespace McBopomofo diff --git a/src/Engine/ParselessLM.cpp b/src/Engine/ParselessLM.cpp index f055718..4dd5afb 100644 --- a/src/Engine/ParselessLM.cpp +++ b/src/Engine/ParselessLM.cpp @@ -32,11 +32,11 @@ #include #include -bool McBopomofo::ParselessLM::isLoaded() { - return mmapedFile_.data() != nullptr; -} +namespace McBopomofo { + +bool ParselessLM::isLoaded() { return mmapedFile_.data() != nullptr; } -bool McBopomofo::ParselessLM::open(const char* path) { +bool ParselessLM::open(const char* path) { if (!mmapedFile_.open(path)) { return false; } @@ -45,13 +45,22 @@ bool McBopomofo::ParselessLM::open(const char* path) { return true; } -void McBopomofo::ParselessLM::close() { +void ParselessLM::close() { mmapedFile_.close(); db_ = nullptr; } +bool ParselessLM::open(std::unique_ptr db) { + if (db_ != nullptr) { + return false; + } + + db_ = std::move(db); + return true; +} + std::vector -McBopomofo::ParselessLM::getUnigrams(const std::string& key) { +ParselessLM::getUnigrams(const std::string& key) { if (db_ == nullptr) { return {}; } @@ -99,7 +108,7 @@ McBopomofo::ParselessLM::getUnigrams(const std::string& key) { return results; } -bool McBopomofo::ParselessLM::hasUnigrams(const std::string& key) { +bool ParselessLM::hasUnigrams(const std::string& key) { if (db_ == nullptr) { return false; } @@ -107,13 +116,13 @@ bool McBopomofo::ParselessLM::hasUnigrams(const std::string& key) { return db_->findFirstMatchingLine(key + " ") != nullptr; } -std::vector -McBopomofo::ParselessLM::getReadings(const std::string& value) { +std::vector ParselessLM::getReadings( + const std::string& value) const { if (db_ == nullptr) { return {}; } - std::vector results; + std::vector results; // We append a space so that we only find rows with the exact value. We // are taking advantage of the fact that a well-form row in this LM must @@ -153,7 +162,9 @@ McBopomofo::ParselessLM::getReadings(const std::string& value) { if (it != row.end()) { score = std::stod(std::string(it, row.end())); } - results.emplace_back(McBopomofo::ParselessLM::FoundReading{key, score}); + results.emplace_back(ParselessLM::FoundReading{key, score}); } return results; } + +} // namespace McBopomofo diff --git a/src/Engine/ParselessLM.h b/src/Engine/ParselessLM.h index ff85b7b..cf9ba91 100644 --- a/src/Engine/ParselessLM.h +++ b/src/Engine/ParselessLM.h @@ -36,10 +36,19 @@ namespace McBopomofo { class ParselessLM : public Formosa::Gramambular2::LanguageModel { public: + ParselessLM() = default; + ParselessLM(const ParselessLM&) = delete; + ParselessLM(ParselessLM&&) = delete; + ParselessLM& operator=(const ParselessLM&) = delete; + ParselessLM& operator=(ParselessLM&&) = delete; + bool isLoaded(); bool open(const char* path); void close(); + // Allows the use of existing in-memory db. + bool open(std::unique_ptr db); + std::vector getUnigrams( const std::string& key) override; bool hasUnigrams(const std::string& key) override; @@ -50,7 +59,7 @@ class ParselessLM : public Formosa::Gramambular2::LanguageModel { }; // Look up reading by value. This is specific to ParselessLM only. - std::vector getReadings(const std::string& value); + std::vector getReadings(const std::string& value) const; private: MemoryMappedFile mmapedFile_; diff --git a/src/Engine/ParselessLMTest.cpp b/src/Engine/ParselessLMTest.cpp index 88d3594..8777063 100644 --- a/src/Engine/ParselessLMTest.cpp +++ b/src/Engine/ParselessLMTest.cpp @@ -29,6 +29,63 @@ namespace McBopomofo { +constexpr char kSample[] = R"( +# format org.openvanilla.mcbopomofo.sorted +ㄅㄚ 八 -3.27631260 +ㄅㄚ 吧 -3.59800309 +ㄅㄚ 巴 -3.80233706 +ㄅㄚ-ㄅㄞˇ 八百 -4.67026409 +ㄅㄚ-ㄅㄞˇ 捌佰 -7.26686119 +ㄅㄚ˙ 吧 -3.59800309 +)"; + +TEST(ParselessLMTest, IdempotentClose) { + auto db = std::make_unique(kSample, sizeof(kSample)); + + ParselessLM lm; + EXPECT_TRUE(lm.open(std::move(db))); + lm.close(); + + // Safe to do so. + lm.close(); +} + +TEST(ParselessLMTest, OpenFailsIfAlreadyOpened) { + auto db = std::make_unique(kSample, sizeof(kSample)); + + ParselessLM lm; + EXPECT_TRUE(lm.open(std::move(db))); + + auto db2 = std::make_unique(kSample, sizeof(kSample)); + EXPECT_FALSE(lm.open(std::move(db2))); +} + +TEST(ParselessLMTest, UnopenedInstanceReturnsNothing) { + ParselessLM lm; + EXPECT_FALSE(lm.hasUnigrams("八")); +} + +TEST(ParselessLMTest, ReturnsResults) { + ParselessLM lm; + auto db = std::make_unique(kSample, sizeof(kSample)); + EXPECT_TRUE(lm.open(std::move(db))); + + using Unigram = Formosa::Gramambular2::LanguageModel::Unigram; + std::vector unigrams = lm.getUnigrams("ㄅㄚ-ㄅㄞˇ"); + EXPECT_EQ(unigrams.size(), 2); + EXPECT_EQ(unigrams[0].value(), "八百"); + EXPECT_NEAR(unigrams[0].score(), -4.67026409, 0.00000001); + EXPECT_EQ(unigrams[1].value(), "捌佰"); + EXPECT_NEAR(unigrams[1].score(), -7.26686119, 0.00000001); + + std::vector readings = lm.getReadings("吧"); + EXPECT_EQ(readings.size(), 2); + EXPECT_EQ(readings[0].reading, "ㄅㄚ"); + EXPECT_NEAR(readings[0].score, -3.59800309, 0.00000001); + EXPECT_EQ(readings[1].reading, "ㄅㄚ˙"); + EXPECT_NEAR(readings[1].score, -3.59800309, 0.00000001); +} + TEST(ParselessLMTest, SanityCheckTest) { constexpr const char* data_path = "data.txt"; if (!std::filesystem::exists(data_path)) { diff --git a/src/Engine/ParselessPhraseDB.h b/src/Engine/ParselessPhraseDB.h index 2bc2d5d..384f919 100644 --- a/src/Engine/ParselessPhraseDB.h +++ b/src/Engine/ParselessPhraseDB.h @@ -43,6 +43,11 @@ class ParselessPhraseDB { ParselessPhraseDB(const char* buf, size_t length, bool validate_pragma = false); + ParselessPhraseDB(const ParselessPhraseDB&) = delete; + ParselessPhraseDB(ParselessPhraseDB&&) = delete; + ParselessPhraseDB& operator=(const ParselessPhraseDB&) = delete; + ParselessPhraseDB& operator=(ParselessPhraseDB&&) = delete; + // Find the rows that match the key. Note that prefix match is used. If you // need exact match, the key will need to have a delimiter (usually a space) // at the end. diff --git a/src/Engine/PhraseReplacementMap.cpp b/src/Engine/PhraseReplacementMap.cpp index f1c6848..9e634e3 100644 --- a/src/Engine/PhraseReplacementMap.cpp +++ b/src/Engine/PhraseReplacementMap.cpp @@ -39,24 +39,39 @@ bool PhraseReplacementMap::open(const char* path) { return false; } - KeyValueBlobReader reader(mmapedFile_.data(), mmapedFile_.length()); - KeyValueBlobReader::KeyValue keyValue; - while (reader.Next(&keyValue) == KeyValueBlobReader::State::HAS_PAIR) { - keyValueMap_[keyValue.key] = keyValue.value; - } - return true; + // MemoryMappedFile self-closes, and so this is fine. + return load(mmapedFile_.data(), mmapedFile_.length()); } void PhraseReplacementMap::close() { + keyValueMap_.clear(); mmapedFile_.close(); +} + +bool PhraseReplacementMap::load(const char* data, size_t length) { + if (mmapedFile_.data() != nullptr) { + // Cannot load while mmapedFile_ is already open. + return false; + } + + if (data == nullptr || length == 0) { + return false; + } keyValueMap_.clear(); + + KeyValueBlobReader reader(data, length); + KeyValueBlobReader::KeyValue keyValue; + while (reader.Next(&keyValue) == KeyValueBlobReader::State::HAS_PAIR) { + keyValueMap_[keyValue.key] = keyValue.value; + } + return true; } std::string PhraseReplacementMap::valueForKey(const std::string& key) const { auto iter = keyValueMap_.find(key); if (iter != keyValueMap_.end()) { const std::string_view& v = iter->second; - return {v.cbegin(), v.cend()}; + return std::string(v); } return {}; } diff --git a/src/Engine/PhraseReplacementMap.h b/src/Engine/PhraseReplacementMap.h index c01bdcf..1d2ce7b 100644 --- a/src/Engine/PhraseReplacementMap.h +++ b/src/Engine/PhraseReplacementMap.h @@ -34,8 +34,19 @@ namespace McBopomofo { class PhraseReplacementMap { public: + PhraseReplacementMap() = default; + PhraseReplacementMap(const PhraseReplacementMap&) = delete; + PhraseReplacementMap(PhraseReplacementMap&&) = delete; + PhraseReplacementMap& operator=(const PhraseReplacementMap&) = delete; + PhraseReplacementMap& operator=(PhraseReplacementMap&&) = delete; + bool open(const char* path); void close(); + + // Allows loading existing in-memory data. It's the caller's responsibility + // to make sure that data outlives this instance. + bool load(const char* data, size_t length); + std::string valueForKey(const std::string& key) const; protected: diff --git a/src/Engine/PhraseReplacementMapTest.cpp b/src/Engine/PhraseReplacementMapTest.cpp index c4f62f2..50ce7b9 100644 --- a/src/Engine/PhraseReplacementMapTest.cpp +++ b/src/Engine/PhraseReplacementMapTest.cpp @@ -31,27 +31,15 @@ namespace McBopomofo { TEST(PhraseReplacementMapTest, LenientReading) { - std::string tmp_name = - std::string(std::filesystem::temp_directory_path() / "test.txt"); - FILE* f = fopen(tmp_name.c_str(), "w"); - ASSERT_NE(f, nullptr); - - fprintf(f, "key value\n"); - fprintf(f, "key2\n"); // error line - fprintf(f, "key3 value2\n"); - int r = fclose(f); - ASSERT_EQ(r, 0); + constexpr char kTestData[] = "key value\nkey2\nkey3 value3"; PhraseReplacementMap map; - map.open(tmp_name.c_str()); + ASSERT_TRUE(map.load(kTestData, sizeof(kTestData))); ASSERT_EQ(map.valueForKey("key"), "value"); - ASSERT_EQ(map.valueForKey("key2"), ""); + ASSERT_TRUE(map.valueForKey("key2").empty()); // key2 causes parsing error, and the line that has key3 won't be parsed. - ASSERT_EQ(map.valueForKey("key3"), ""); - - r = remove(tmp_name.c_str()); - ASSERT_EQ(r, 0); + ASSERT_TRUE(map.valueForKey("key3").empty()); } } // namespace McBopomofo diff --git a/src/Engine/UserPhrasesLM.cpp b/src/Engine/UserPhrasesLM.cpp index 193d384..7d6e57f 100644 --- a/src/Engine/UserPhrasesLM.cpp +++ b/src/Engine/UserPhrasesLM.cpp @@ -40,19 +40,32 @@ bool UserPhrasesLM::open(const char* path) { return false; } - KeyValueBlobReader reader(mmapedFile_.data(), mmapedFile_.length()); - KeyValueBlobReader::KeyValue keyValue; - while (reader.Next(&keyValue) == KeyValueBlobReader::State::HAS_PAIR) { - // We invert the key and value, since in user phrases, "key" is the phrase - // value, and "value" is the BPMF reading. - keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key); - } - return true; + // MemoryMappedFile self-closes, and so this is fine. + return load(mmapedFile_.data(), mmapedFile_.length()); } void UserPhrasesLM::close() { + keyRowMap.clear(); mmapedFile_.close(); +} + +bool UserPhrasesLM::load(const char* data, size_t length) { + if (data == nullptr || length == 0) { + return false; + } + keyRowMap.clear(); + + KeyValueBlobReader reader(data, length); + KeyValueBlobReader::KeyValue keyValue; + while (reader.Next(&keyValue) == KeyValueBlobReader::State::HAS_PAIR) { + // The format of the user phrases files is a bit different. The first column + // is the phrase (value) and the second column is the Bopofomo reading. The + // KeyValueBlobReader::KeyValue's value is the second column, that's why + // it's used as the key of the keyRowMap here. + keyRowMap[keyValue.value].emplace_back(keyValue.key); + } + return true; } std::vector @@ -60,9 +73,9 @@ UserPhrasesLM::getUnigrams(const std::string& key) { std::vector v; auto iter = keyRowMap.find(key); if (iter != keyRowMap.end()) { - const std::vector& rows = iter->second; - for (const auto& row : rows) { - v.emplace_back(std::string(row.value), 0); + const std::vector& values = iter->second; + for (const auto& value : values) { + v.emplace_back(std::string(value), kUserUnigramScore); } } diff --git a/src/Engine/UserPhrasesLM.h b/src/Engine/UserPhrasesLM.h index c89a725..e58e048 100644 --- a/src/Engine/UserPhrasesLM.h +++ b/src/Engine/UserPhrasesLM.h @@ -37,22 +37,28 @@ namespace McBopomofo { class UserPhrasesLM : public Formosa::Gramambular2::LanguageModel { public: + UserPhrasesLM() = default; + UserPhrasesLM(const UserPhrasesLM&) = delete; + UserPhrasesLM(UserPhrasesLM&&) = delete; + UserPhrasesLM& operator=(const UserPhrasesLM&) = delete; + UserPhrasesLM& operator=(UserPhrasesLM&&) = delete; + bool open(const char* path); void close(); + // Allows loading existing in-memory data. It's the caller's responsibility + // to make sure that data outlives this instance. + bool load(const char* data, size_t length); + std::vector getUnigrams( const std::string& key) override; bool hasUnigrams(const std::string& key) override; - protected: - struct Row { - Row(std::string_view k, std::string_view v) : key(k), value(v) {} - const std::string_view key; - const std::string_view value; - }; + static constexpr double kUserUnigramScore = 0; + protected: MemoryMappedFile mmapedFile_; - std::map> keyRowMap; + std::map> keyRowMap; }; } // namespace McBopomofo diff --git a/src/Engine/UserPhrasesLMTest.cpp b/src/Engine/UserPhrasesLMTest.cpp index c2d566a..73e0126 100644 --- a/src/Engine/UserPhrasesLMTest.cpp +++ b/src/Engine/UserPhrasesLMTest.cpp @@ -31,28 +31,20 @@ namespace McBopomofo { TEST(UserPhrasesLMTest, LenientReading) { - std::string tmp_name = - std::string(std::filesystem::temp_directory_path() / "test.txt"); - - FILE* f = fopen(tmp_name.c_str(), "w"); - ASSERT_NE(f, nullptr); - - fprintf(f, "value1 reading1\n"); - fprintf(f, "value2 \n"); // error line - fprintf(f, "value3 reading2\n"); - int r = fclose(f); - ASSERT_EQ(r, 0); + constexpr char kTestData[] = "value1 reading1\nvalue2 \nvalue3 reading2"; UserPhrasesLM lm; - lm.open(tmp_name.c_str()); - ASSERT_TRUE(lm.hasUnigrams("reading1")); - ASSERT_FALSE(lm.hasUnigrams("value2")); + ASSERT_TRUE(lm.load(kTestData, sizeof(kTestData))); + EXPECT_TRUE(lm.hasUnigrams("reading1")); + EXPECT_FALSE(lm.hasUnigrams("value2")); // Anything after the error won't be parsed, so reading2 won't be found. - ASSERT_FALSE(lm.hasUnigrams("reading2")); + EXPECT_FALSE(lm.hasUnigrams("reading2")); - r = remove(tmp_name.c_str()); - ASSERT_EQ(r, 0); + std::vector results = + lm.getUnigrams("reading1"); + EXPECT_EQ(results[0].value(), "value1"); + EXPECT_EQ(results[0].score(), UserPhrasesLM::kUserUnigramScore); } } // namespace McBopomofo