From 2b383259bfe174c4dc8d0abbe38355e7966baa18 Mon Sep 17 00:00:00 2001 From: Lukhnos Liu Date: Thu, 14 Mar 2024 08:30:00 -0700 Subject: [PATCH 1/6] Make MemoryMappedFile movable and clarify contract --- src/Engine/MemoryMappedFile.cpp | 16 ++++++++++++ src/Engine/MemoryMappedFile.h | 15 +++++++++++- src/Engine/MemoryMappedFileTest.cpp | 38 +++++++++++++++++++++++++---- 3 files changed, 63 insertions(+), 6 deletions(-) diff --git a/src/Engine/MemoryMappedFile.cpp b/src/Engine/MemoryMappedFile.cpp index e38e617..d6e8952 100644 --- a/src/Engine/MemoryMappedFile.cpp +++ b/src/Engine/MemoryMappedFile.cpp @@ -28,8 +28,24 @@ #include #include +#include + namespace McBopomofo { +MemoryMappedFile::MemoryMappedFile(MemoryMappedFile&& other) noexcept + : fd_(std::exchange(other.fd_, -1)), + data_(std::exchange(other.data_, nullptr)), + length_(std::exchange(other.length_, -1)) {} + +MemoryMappedFile& MemoryMappedFile::operator=( + MemoryMappedFile&& other) noexcept { + close(); + fd_ = std::exchange(other.fd_, -1); + data_ = std::exchange(other.data_, nullptr); + length_ = std::exchange(other.length_, 0); + return *this; +} + MemoryMappedFile::~MemoryMappedFile() { close(); } bool MemoryMappedFile::open(const char* path) { diff --git a/src/Engine/MemoryMappedFile.h b/src/Engine/MemoryMappedFile.h index fccb400..daf22eb 100644 --- a/src/Engine/MemoryMappedFile.h +++ b/src/Engine/MemoryMappedFile.h @@ -28,9 +28,22 @@ namespace McBopomofo { -// A wrapper for managing a memory-mapped file. Access is read-only. +// A wrapper for managing a memory-mapped file. +// +// On POSIX systems, we obtain a readable (PROT_READ) shared page (MAP_SHARED), +// and *file content* changes are reflected in the mapped memory. This class +// does not track the underlying file: it is up to the user of this class to +// decide what to do when the underlying file gets updated, resized, or removed. class MemoryMappedFile { public: + MemoryMappedFile() = default; + MemoryMappedFile(MemoryMappedFile&& other) noexcept; + MemoryMappedFile& operator=(MemoryMappedFile&& other) noexcept; + + // Forbids copying. + MemoryMappedFile& operator=(const MemoryMappedFile&) = delete; + MemoryMappedFile(const MemoryMappedFile&) = delete; + ~MemoryMappedFile(); bool open(const char* path); void close(); diff --git a/src/Engine/MemoryMappedFileTest.cpp b/src/Engine/MemoryMappedFileTest.cpp index 3ebac1b..a59f03e 100644 --- a/src/Engine/MemoryMappedFileTest.cpp +++ b/src/Engine/MemoryMappedFileTest.cpp @@ -69,6 +69,8 @@ TEST(MemoryMappedFileTest, BasicFunctionalities) { out.close(); MemoryMappedFile mf; + ASSERT_EQ(mf.length(), 0); + ASSERT_EQ(mf.data(), nullptr); bool open_result = mf.open(tmp_file_path.c_str()); ASSERT_TRUE(open_result); @@ -76,7 +78,6 @@ TEST(MemoryMappedFileTest, BasicFunctionalities) { EXPECT_TRUE(mf.data() != nullptr); EXPECT_EQ(memcmp(mf.data(), buf, kBufSize), 0); - delete[] buf; mf.close(); EXPECT_EQ(mf.length(), 0); EXPECT_EQ(mf.data(), nullptr); @@ -84,14 +85,41 @@ TEST(MemoryMappedFileTest, BasicFunctionalities) { // Should be a no-op. mf.close(); + MemoryMappedFile mf2; + open_result = mf2.open(tmp_file_path.c_str()); + EXPECT_TRUE(open_result); + EXPECT_TRUE(mf2.data() != nullptr); + + MemoryMappedFile mf3(std::move(mf2)); + EXPECT_TRUE(mf2.data() == nullptr); + EXPECT_TRUE(mf3.data() != nullptr); + + MemoryMappedFile mf4 = std::move(mf3); + EXPECT_TRUE(mf3.data() == nullptr); + EXPECT_TRUE(mf4.data() != nullptr); + + // Flip a byte of the underlying file. The map should reflect that. + FILE* f = fopen(tmp_file_path.c_str(), "r+b"); + EXPECT_NE(f, nullptr); + EXPECT_NE(fputc(~buf[0], f), EOF); + EXPECT_EQ(fclose(f), 0); + + EXPECT_NE(memcmp(mf4.data(), buf, kBufSize), 0); + // The rest of the data should be the same. + EXPECT_EQ(memcmp(mf4.data() + 1, buf + 1, kBufSize - 1), 0); + + mf4.close(); + EXPECT_TRUE(mf4.data() == nullptr); + std::filesystem::remove(tmp_file_path); + delete[] buf; // Opening a non-existence file. - MemoryMappedFile mf2; - open_result = mf2.open(tmp_file_path.c_str()); + MemoryMappedFile mf5; + open_result = mf5.open(tmp_file_path.c_str()); EXPECT_FALSE(open_result); - EXPECT_EQ(mf2.length(), 0); - EXPECT_EQ(mf2.data(), nullptr); + EXPECT_EQ(mf5.length(), 0); + EXPECT_EQ(mf5.data(), nullptr); } } // namespace McBopomofo From 689397796ca896c177e96cabed433c1dd30f7d67 Mon Sep 17 00:00:00 2001 From: Lukhnos Liu Date: Thu, 14 Mar 2024 08:37:36 -0700 Subject: [PATCH 2/6] Forbid copying and moving of two classes --- src/Engine/KeyValueBlobReader.h | 5 +++++ src/Engine/ParselessPhraseDB.h | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/src/Engine/KeyValueBlobReader.h b/src/Engine/KeyValueBlobReader.h index 2f5a77c..8421e96 100644 --- a/src/Engine/KeyValueBlobReader.h +++ b/src/Engine/KeyValueBlobReader.h @@ -74,6 +74,11 @@ class KeyValueBlobReader { KeyValueBlobReader(const char* blob, size_t size) : current_(blob), end_(blob + size) {} + KeyValueBlobReader(const KeyValueBlobReader&) = delete; + KeyValueBlobReader(KeyValueBlobReader&&) = delete; + KeyValueBlobReader& operator=(const KeyValueBlobReader&) = delete; + KeyValueBlobReader& operator=(KeyValueBlobReader&&) = delete; + // Parse the next key-value pair and return the state of the reader. If // `out` is passed, out will be set to the produced key-value pair if there // is one. diff --git a/src/Engine/ParselessPhraseDB.h b/src/Engine/ParselessPhraseDB.h index 2bc2d5d..384f919 100644 --- a/src/Engine/ParselessPhraseDB.h +++ b/src/Engine/ParselessPhraseDB.h @@ -43,6 +43,11 @@ class ParselessPhraseDB { ParselessPhraseDB(const char* buf, size_t length, bool validate_pragma = false); + ParselessPhraseDB(const ParselessPhraseDB&) = delete; + ParselessPhraseDB(ParselessPhraseDB&&) = delete; + ParselessPhraseDB& operator=(const ParselessPhraseDB&) = delete; + ParselessPhraseDB& operator=(ParselessPhraseDB&&) = delete; + // Find the rows that match the key. Note that prefix match is used. If you // need exact match, the key will need to have a delimiter (usually a space) // at the end. From cfeb4f509f9d1e37d9c1940d9aea94c2cfa63121 Mon Sep 17 00:00:00 2001 From: Lukhnos Liu Date: Thu, 14 Mar 2024 17:23:02 -0700 Subject: [PATCH 3/6] API and test improvements for the LM classes - Allow loading existing in-memory data. - Improve tests. - Disallow copying and moving. - Clearly state that MemoryMappedFile does not manage underlying file changes. - Clearly state that if an existing block of data is used for load() in PhraseReplacementMap or UserPhrasesLM, it is the caller's responsibility to ensure that the data outlives the LM object. --- src/Engine/ParselessLM.cpp | 33 +++++++++----- src/Engine/ParselessLM.h | 11 ++++- src/Engine/ParselessLMTest.cpp | 57 +++++++++++++++++++++++++ src/Engine/PhraseReplacementMap.cpp | 29 ++++++++++--- src/Engine/PhraseReplacementMap.h | 11 +++++ src/Engine/PhraseReplacementMapTest.cpp | 20 ++------- src/Engine/UserPhrasesLM.cpp | 35 ++++++++++----- src/Engine/UserPhrasesLM.h | 18 +++++--- src/Engine/UserPhrasesLMTest.cpp | 25 ++++------- 9 files changed, 170 insertions(+), 69 deletions(-) diff --git a/src/Engine/ParselessLM.cpp b/src/Engine/ParselessLM.cpp index f055718..4dd5afb 100644 --- a/src/Engine/ParselessLM.cpp +++ b/src/Engine/ParselessLM.cpp @@ -32,11 +32,11 @@ #include #include -bool McBopomofo::ParselessLM::isLoaded() { - return mmapedFile_.data() != nullptr; -} +namespace McBopomofo { + +bool ParselessLM::isLoaded() { return mmapedFile_.data() != nullptr; } -bool McBopomofo::ParselessLM::open(const char* path) { +bool ParselessLM::open(const char* path) { if (!mmapedFile_.open(path)) { return false; } @@ -45,13 +45,22 @@ bool McBopomofo::ParselessLM::open(const char* path) { return true; } -void McBopomofo::ParselessLM::close() { +void ParselessLM::close() { mmapedFile_.close(); db_ = nullptr; } +bool ParselessLM::open(std::unique_ptr db) { + if (db_ != nullptr) { + return false; + } + + db_ = std::move(db); + return true; +} + std::vector -McBopomofo::ParselessLM::getUnigrams(const std::string& key) { +ParselessLM::getUnigrams(const std::string& key) { if (db_ == nullptr) { return {}; } @@ -99,7 +108,7 @@ McBopomofo::ParselessLM::getUnigrams(const std::string& key) { return results; } -bool McBopomofo::ParselessLM::hasUnigrams(const std::string& key) { +bool ParselessLM::hasUnigrams(const std::string& key) { if (db_ == nullptr) { return false; } @@ -107,13 +116,13 @@ bool McBopomofo::ParselessLM::hasUnigrams(const std::string& key) { return db_->findFirstMatchingLine(key + " ") != nullptr; } -std::vector -McBopomofo::ParselessLM::getReadings(const std::string& value) { +std::vector ParselessLM::getReadings( + const std::string& value) const { if (db_ == nullptr) { return {}; } - std::vector results; + std::vector results; // We append a space so that we only find rows with the exact value. We // are taking advantage of the fact that a well-form row in this LM must @@ -153,7 +162,9 @@ McBopomofo::ParselessLM::getReadings(const std::string& value) { if (it != row.end()) { score = std::stod(std::string(it, row.end())); } - results.emplace_back(McBopomofo::ParselessLM::FoundReading{key, score}); + results.emplace_back(ParselessLM::FoundReading{key, score}); } return results; } + +} // namespace McBopomofo diff --git a/src/Engine/ParselessLM.h b/src/Engine/ParselessLM.h index ff85b7b..cf9ba91 100644 --- a/src/Engine/ParselessLM.h +++ b/src/Engine/ParselessLM.h @@ -36,10 +36,19 @@ namespace McBopomofo { class ParselessLM : public Formosa::Gramambular2::LanguageModel { public: + ParselessLM() = default; + ParselessLM(const ParselessLM&) = delete; + ParselessLM(ParselessLM&&) = delete; + ParselessLM& operator=(const ParselessLM&) = delete; + ParselessLM& operator=(ParselessLM&&) = delete; + bool isLoaded(); bool open(const char* path); void close(); + // Allows the use of existing in-memory db. + bool open(std::unique_ptr db); + std::vector getUnigrams( const std::string& key) override; bool hasUnigrams(const std::string& key) override; @@ -50,7 +59,7 @@ class ParselessLM : public Formosa::Gramambular2::LanguageModel { }; // Look up reading by value. This is specific to ParselessLM only. - std::vector getReadings(const std::string& value); + std::vector getReadings(const std::string& value) const; private: MemoryMappedFile mmapedFile_; diff --git a/src/Engine/ParselessLMTest.cpp b/src/Engine/ParselessLMTest.cpp index 88d3594..8777063 100644 --- a/src/Engine/ParselessLMTest.cpp +++ b/src/Engine/ParselessLMTest.cpp @@ -29,6 +29,63 @@ namespace McBopomofo { +constexpr char kSample[] = R"( +# format org.openvanilla.mcbopomofo.sorted +ㄅㄚ 八 -3.27631260 +ㄅㄚ 吧 -3.59800309 +ㄅㄚ 巴 -3.80233706 +ㄅㄚ-ㄅㄞˇ 八百 -4.67026409 +ㄅㄚ-ㄅㄞˇ 捌佰 -7.26686119 +ㄅㄚ˙ 吧 -3.59800309 +)"; + +TEST(ParselessLMTest, IdempotentClose) { + auto db = std::make_unique(kSample, sizeof(kSample)); + + ParselessLM lm; + EXPECT_TRUE(lm.open(std::move(db))); + lm.close(); + + // Safe to do so. + lm.close(); +} + +TEST(ParselessLMTest, OpenFailsIfAlreadyOpened) { + auto db = std::make_unique(kSample, sizeof(kSample)); + + ParselessLM lm; + EXPECT_TRUE(lm.open(std::move(db))); + + auto db2 = std::make_unique(kSample, sizeof(kSample)); + EXPECT_FALSE(lm.open(std::move(db2))); +} + +TEST(ParselessLMTest, UnopenedInstanceReturnsNothing) { + ParselessLM lm; + EXPECT_FALSE(lm.hasUnigrams("八")); +} + +TEST(ParselessLMTest, ReturnsResults) { + ParselessLM lm; + auto db = std::make_unique(kSample, sizeof(kSample)); + EXPECT_TRUE(lm.open(std::move(db))); + + using Unigram = Formosa::Gramambular2::LanguageModel::Unigram; + std::vector unigrams = lm.getUnigrams("ㄅㄚ-ㄅㄞˇ"); + EXPECT_EQ(unigrams.size(), 2); + EXPECT_EQ(unigrams[0].value(), "八百"); + EXPECT_NEAR(unigrams[0].score(), -4.67026409, 0.00000001); + EXPECT_EQ(unigrams[1].value(), "捌佰"); + EXPECT_NEAR(unigrams[1].score(), -7.26686119, 0.00000001); + + std::vector readings = lm.getReadings("吧"); + EXPECT_EQ(readings.size(), 2); + EXPECT_EQ(readings[0].reading, "ㄅㄚ"); + EXPECT_NEAR(readings[0].score, -3.59800309, 0.00000001); + EXPECT_EQ(readings[1].reading, "ㄅㄚ˙"); + EXPECT_NEAR(readings[1].score, -3.59800309, 0.00000001); +} + TEST(ParselessLMTest, SanityCheckTest) { constexpr const char* data_path = "data.txt"; if (!std::filesystem::exists(data_path)) { diff --git a/src/Engine/PhraseReplacementMap.cpp b/src/Engine/PhraseReplacementMap.cpp index f1c6848..9e634e3 100644 --- a/src/Engine/PhraseReplacementMap.cpp +++ b/src/Engine/PhraseReplacementMap.cpp @@ -39,24 +39,39 @@ bool PhraseReplacementMap::open(const char* path) { return false; } - KeyValueBlobReader reader(mmapedFile_.data(), mmapedFile_.length()); - KeyValueBlobReader::KeyValue keyValue; - while (reader.Next(&keyValue) == KeyValueBlobReader::State::HAS_PAIR) { - keyValueMap_[keyValue.key] = keyValue.value; - } - return true; + // MemoryMappedFile self-closes, and so this is fine. + return load(mmapedFile_.data(), mmapedFile_.length()); } void PhraseReplacementMap::close() { + keyValueMap_.clear(); mmapedFile_.close(); +} + +bool PhraseReplacementMap::load(const char* data, size_t length) { + if (mmapedFile_.data() != nullptr) { + // Cannot load while mmapedFile_ is already open. + return false; + } + + if (data == nullptr || length == 0) { + return false; + } keyValueMap_.clear(); + + KeyValueBlobReader reader(data, length); + KeyValueBlobReader::KeyValue keyValue; + while (reader.Next(&keyValue) == KeyValueBlobReader::State::HAS_PAIR) { + keyValueMap_[keyValue.key] = keyValue.value; + } + return true; } std::string PhraseReplacementMap::valueForKey(const std::string& key) const { auto iter = keyValueMap_.find(key); if (iter != keyValueMap_.end()) { const std::string_view& v = iter->second; - return {v.cbegin(), v.cend()}; + return std::string(v); } return {}; } diff --git a/src/Engine/PhraseReplacementMap.h b/src/Engine/PhraseReplacementMap.h index c01bdcf..1d2ce7b 100644 --- a/src/Engine/PhraseReplacementMap.h +++ b/src/Engine/PhraseReplacementMap.h @@ -34,8 +34,19 @@ namespace McBopomofo { class PhraseReplacementMap { public: + PhraseReplacementMap() = default; + PhraseReplacementMap(const PhraseReplacementMap&) = delete; + PhraseReplacementMap(PhraseReplacementMap&&) = delete; + PhraseReplacementMap& operator=(const PhraseReplacementMap&) = delete; + PhraseReplacementMap& operator=(PhraseReplacementMap&&) = delete; + bool open(const char* path); void close(); + + // Allows loading existing in-memory data. It's the caller's responsibility + // to make sure that data outlives this instance. + bool load(const char* data, size_t length); + std::string valueForKey(const std::string& key) const; protected: diff --git a/src/Engine/PhraseReplacementMapTest.cpp b/src/Engine/PhraseReplacementMapTest.cpp index c4f62f2..50ce7b9 100644 --- a/src/Engine/PhraseReplacementMapTest.cpp +++ b/src/Engine/PhraseReplacementMapTest.cpp @@ -31,27 +31,15 @@ namespace McBopomofo { TEST(PhraseReplacementMapTest, LenientReading) { - std::string tmp_name = - std::string(std::filesystem::temp_directory_path() / "test.txt"); - FILE* f = fopen(tmp_name.c_str(), "w"); - ASSERT_NE(f, nullptr); - - fprintf(f, "key value\n"); - fprintf(f, "key2\n"); // error line - fprintf(f, "key3 value2\n"); - int r = fclose(f); - ASSERT_EQ(r, 0); + constexpr char kTestData[] = "key value\nkey2\nkey3 value3"; PhraseReplacementMap map; - map.open(tmp_name.c_str()); + ASSERT_TRUE(map.load(kTestData, sizeof(kTestData))); ASSERT_EQ(map.valueForKey("key"), "value"); - ASSERT_EQ(map.valueForKey("key2"), ""); + ASSERT_TRUE(map.valueForKey("key2").empty()); // key2 causes parsing error, and the line that has key3 won't be parsed. - ASSERT_EQ(map.valueForKey("key3"), ""); - - r = remove(tmp_name.c_str()); - ASSERT_EQ(r, 0); + ASSERT_TRUE(map.valueForKey("key3").empty()); } } // namespace McBopomofo diff --git a/src/Engine/UserPhrasesLM.cpp b/src/Engine/UserPhrasesLM.cpp index 193d384..7d6e57f 100644 --- a/src/Engine/UserPhrasesLM.cpp +++ b/src/Engine/UserPhrasesLM.cpp @@ -40,19 +40,32 @@ bool UserPhrasesLM::open(const char* path) { return false; } - KeyValueBlobReader reader(mmapedFile_.data(), mmapedFile_.length()); - KeyValueBlobReader::KeyValue keyValue; - while (reader.Next(&keyValue) == KeyValueBlobReader::State::HAS_PAIR) { - // We invert the key and value, since in user phrases, "key" is the phrase - // value, and "value" is the BPMF reading. - keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key); - } - return true; + // MemoryMappedFile self-closes, and so this is fine. + return load(mmapedFile_.data(), mmapedFile_.length()); } void UserPhrasesLM::close() { + keyRowMap.clear(); mmapedFile_.close(); +} + +bool UserPhrasesLM::load(const char* data, size_t length) { + if (data == nullptr || length == 0) { + return false; + } + keyRowMap.clear(); + + KeyValueBlobReader reader(data, length); + KeyValueBlobReader::KeyValue keyValue; + while (reader.Next(&keyValue) == KeyValueBlobReader::State::HAS_PAIR) { + // The format of the user phrases files is a bit different. The first column + // is the phrase (value) and the second column is the Bopofomo reading. The + // KeyValueBlobReader::KeyValue's value is the second column, that's why + // it's used as the key of the keyRowMap here. + keyRowMap[keyValue.value].emplace_back(keyValue.key); + } + return true; } std::vector @@ -60,9 +73,9 @@ UserPhrasesLM::getUnigrams(const std::string& key) { std::vector v; auto iter = keyRowMap.find(key); if (iter != keyRowMap.end()) { - const std::vector& rows = iter->second; - for (const auto& row : rows) { - v.emplace_back(std::string(row.value), 0); + const std::vector& values = iter->second; + for (const auto& value : values) { + v.emplace_back(std::string(value), kUserUnigramScore); } } diff --git a/src/Engine/UserPhrasesLM.h b/src/Engine/UserPhrasesLM.h index c89a725..d0d9f1d 100644 --- a/src/Engine/UserPhrasesLM.h +++ b/src/Engine/UserPhrasesLM.h @@ -37,22 +37,28 @@ namespace McBopomofo { class UserPhrasesLM : public Formosa::Gramambular2::LanguageModel { public: + UserPhrasesLM() = default; + UserPhrasesLM(const UserPhrasesLM&) = delete; + UserPhrasesLM(UserPhrasesLM&&) = delete; + UserPhrasesLM& operator=(const UserPhrasesLM&) = delete; + UserPhrasesLM& operator=(UserPhrasesLM&&) = delete; + bool open(const char* path); void close(); + // Allows loading existing in-memory data. It's the caller's responsibility + // to make sure that data outlives this instance. + bool load(const char* data, size_t length); + std::vector getUnigrams( const std::string& key) override; bool hasUnigrams(const std::string& key) override; protected: - struct Row { - Row(std::string_view k, std::string_view v) : key(k), value(v) {} - const std::string_view key; - const std::string_view value; - }; + static constexpr double kUserUnigramScore = 0; MemoryMappedFile mmapedFile_; - std::map> keyRowMap; + std::map> keyRowMap; }; } // namespace McBopomofo diff --git a/src/Engine/UserPhrasesLMTest.cpp b/src/Engine/UserPhrasesLMTest.cpp index c2d566a..236920c 100644 --- a/src/Engine/UserPhrasesLMTest.cpp +++ b/src/Engine/UserPhrasesLMTest.cpp @@ -31,28 +31,19 @@ namespace McBopomofo { TEST(UserPhrasesLMTest, LenientReading) { - std::string tmp_name = - std::string(std::filesystem::temp_directory_path() / "test.txt"); - - FILE* f = fopen(tmp_name.c_str(), "w"); - ASSERT_NE(f, nullptr); - - fprintf(f, "value1 reading1\n"); - fprintf(f, "value2 \n"); // error line - fprintf(f, "value3 reading2\n"); - int r = fclose(f); - ASSERT_EQ(r, 0); + constexpr char kTestData[] = "value1 reading1\nvalue2 \nvalue3 reading2"; UserPhrasesLM lm; - lm.open(tmp_name.c_str()); - ASSERT_TRUE(lm.hasUnigrams("reading1")); - ASSERT_FALSE(lm.hasUnigrams("value2")); + ASSERT_TRUE(lm.load(kTestData, sizeof(kTestData))); + EXPECT_TRUE(lm.hasUnigrams("reading1")); + EXPECT_FALSE(lm.hasUnigrams("value2")); // Anything after the error won't be parsed, so reading2 won't be found. - ASSERT_FALSE(lm.hasUnigrams("reading2")); + EXPECT_FALSE(lm.hasUnigrams("reading2")); - r = remove(tmp_name.c_str()); - ASSERT_EQ(r, 0); + std::vector results = + lm.getUnigrams("reading1"); + EXPECT_EQ(results[0].value(), "value1"); } } // namespace McBopomofo From cbb2d6457a838f223158aab01837e5ba86fa8808 Mon Sep 17 00:00:00 2001 From: Lukhnos Liu Date: Thu, 14 Mar 2024 11:44:49 -0700 Subject: [PATCH 4/6] Remove value if macro is unsupported; add tests - Removes macro values not handled by the macro converter - Reformats and clarifies McBopomofoLM comments - Rearranges the order of member functions - Adds tests --- src/Engine/CMakeLists.txt | 1 + src/Engine/McBopomofoLM.cpp | 64 ++++--- src/Engine/McBopomofoLM.h | 123 +++++++------- src/Engine/McBopomofoLMTest.cpp | 284 ++++++++++++++++++++++++++++++++ 4 files changed, 386 insertions(+), 86 deletions(-) create mode 100644 src/Engine/McBopomofoLMTest.cpp diff --git a/src/Engine/CMakeLists.txt b/src/Engine/CMakeLists.txt index edaa1c2..454ae88 100644 --- a/src/Engine/CMakeLists.txt +++ b/src/Engine/CMakeLists.txt @@ -43,6 +43,7 @@ if (ENABLE_TEST) add_executable(McBopomofoLMLibTest AssociatedPhrasesV2Test.cpp KeyValueBlobReaderTest.cpp + McBopomofoLMTest.cpp MemoryMappedFileTest.cpp ParselessLMTest.cpp ParselessPhraseDBTest.cpp diff --git a/src/Engine/McBopomofoLM.cpp b/src/Engine/McBopomofoLM.cpp index e577bb9..052e2a1 100644 --- a/src/Engine/McBopomofoLM.cpp +++ b/src/Engine/McBopomofoLM.cpp @@ -33,15 +33,8 @@ namespace McBopomofo { -McBopomofoLM::McBopomofoLM() {} - -McBopomofoLM::~McBopomofoLM() { - languageModel_.close(); - userPhrases_.close(); - excludedPhrases_.close(); - phraseReplacement_.close(); - associatedPhrasesV2_.close(); -} +static constexpr std::string_view kMacroPrefix = "MACRO@"; +static constexpr double kMacroScore = -8.0; void McBopomofoLM::loadLanguageModel(const char* languageModelDataPath) { if (languageModelDataPath) { @@ -95,11 +88,11 @@ McBopomofoLM::getUnigrams(const std::string& key) { if (excludedPhrases_.hasUnigrams(key)) { std::vector excludedUnigrams = excludedPhrases_.getUnigrams(key); - transform(excludedUnigrams.begin(), excludedUnigrams.end(), - inserter(excludedValues, excludedValues.end()), - [](const Formosa::Gramambular2::LanguageModel::Unigram& u) { - return u.value(); - }); + std::transform(excludedUnigrams.begin(), excludedUnigrams.end(), + std::inserter(excludedValues, excludedValues.end()), + [](const Formosa::Gramambular2::LanguageModel::Unigram& u) { + return u.value(); + }); } if (userPhrases_.hasUnigrams(key)) { @@ -177,7 +170,7 @@ bool McBopomofoLM::hasUnigrams(const std::string& key) { return !getUnigrams(key).empty(); } -std::string McBopomofoLM::getReading(const std::string& value) { +std::string McBopomofoLM::getReading(const std::string& value) const { std::vector foundReadings = languageModel_.getReadings(value); double topScore = std::numeric_limits::lowest(); @@ -191,6 +184,12 @@ std::string McBopomofoLM::getReading(const std::string& value) { return topValue; } +std::vector McBopomofoLM::findAssociatedPhrasesV2( + const std::string& prefixValue, + const std::vector& prefixReadings) const { + return associatedPhrasesV2_.findPhrases(prefixValue, prefixReadings); +} + void McBopomofoLM::setPhraseReplacementEnabled(bool enabled) { phraseReplacementEnabled_ = enabled; } @@ -250,6 +249,13 @@ McBopomofoLM::filterAndTransformUnigrams( std::string replacement = macroConverter_(value); value = replacement; } + + // Check if the string is an unsupported macro + if (unigram.score() == kMacroScore && value.size() > kMacroPrefix.size() && + value.compare(0, kMacroPrefix.size(), kMacroPrefix) == 0) { + continue; + } + if (externalConverterEnabled_ && externalConverter_) { std::string replacement = externalConverter_(value); value = replacement; @@ -262,10 +268,30 @@ McBopomofoLM::filterAndTransformUnigrams( return results; } -std::vector McBopomofoLM::findAssociatedPhrasesV2( - const std::string& prefixValue, - const std::vector& prefixReadings) const { - return associatedPhrasesV2_.findPhrases(prefixValue, prefixReadings); +void McBopomofoLM::loadLanguageModel(std::unique_ptr db) { + languageModel_.close(); + languageModel_.open(std::move(db)); +} + +void McBopomofoLM::loadAssociatedPhrasesV2( + std::unique_ptr db) { + associatedPhrasesV2_.close(); + associatedPhrasesV2_.open(std::move(db)); +} + +void McBopomofoLM::loadUserPhrases(const char* data, size_t length) { + userPhrases_.close(); + userPhrases_.load(data, length); +} + +void McBopomofoLM::loadExcludedPhrases(const char* data, size_t length) { + excludedPhrases_.close(); + excludedPhrases_.load(data, length); +} + +void McBopomofoLM::loadPhraseReplacementMap(const char* data, size_t length) { + phraseReplacement_.close(); + phraseReplacement_.load(data, length); } } // namespace McBopomofo diff --git a/src/Engine/McBopomofoLM.h b/src/Engine/McBopomofoLM.h index 32ef266..1492628 100644 --- a/src/Engine/McBopomofoLM.h +++ b/src/Engine/McBopomofoLM.h @@ -24,8 +24,8 @@ #ifndef SRC_ENGINE_MCBOPOMOFOLM_H_ #define SRC_ENGINE_MCBOPOMOFOLM_H_ -#include #include +#include #include #include #include @@ -38,99 +38,85 @@ namespace McBopomofo { -/// McBopomofoLM is a facade for managing a set of models including -/// the input method language model, user phrases and excluded phrases. -/// -/// It is the primary model class that the input controller and grammar builder -/// of McBopomofo talks to. When the grammar builder starts to build a sentence -/// from a series of BPMF readings, it passes the readings to the model to see -/// if there are valid unigrams, and use returned unigrams to produce the final -/// results. -/// -/// McBopomofoLM combine and transform the unigrams from the primary language -/// model and user phrases. The process is -/// -/// 1) Get the original unigrams. -/// 2) Drop the unigrams whose value is contained in the exclusion map. -/// 3) Replace the values of the unigrams using the phrase replacement map. -/// 4) Replace the values of the unigrams using an external converter lambda. -/// 5) Drop the duplicated phrases. -/// -/// The controller can ask the model to load the primary input method language -/// model while launching and to load the user phrases anytime if the custom -/// files are modified. It does not keep the reference of the data paths but -/// you have to pass the paths when you ask it to do loading. +// McBopomofoLM manages the input method's language models and performs text +// and macro conversions. +// +// When the reading grid requests unigrams from McBopomofoLM, the LM combines +// and transforms the unigrams from the primary language model and user phrases. +// The process is +// +// 1. Get the original unigrams. +// 2. Drop the unigrams from the user-exclusion list. +// 3. Replace the unigram values specified by the user phrase replacement map. +// 4. Transform the unigram values with an external converter, if supplied. +// 5. Remove any duplicates. +// +// McBopomofoLM itself is not responsible for reloading custom models (user +// phrases, excluded phrases, and replacement map). The LM's owner, usually the +// input method controller, needs to take care of checking for updates and +// telling McBopomofoLM to reload as needed. class McBopomofoLM : public Formosa::Gramambular2::LanguageModel { public: - McBopomofoLM(); - ~McBopomofoLM() override; + McBopomofoLM() = default; - /// Asks to load the primary language model at the given path. - /// @param languageModelPath The path of the language model. - void loadLanguageModel(const char* languageModelDataPath); - /// If the data model is already loaded. - bool isDataModelLoaded(); + McBopomofoLM(const McBopomofoLM&) = delete; + McBopomofoLM(McBopomofoLM&&) = delete; + McBopomofoLM& operator=(const McBopomofoLM&) = delete; + McBopomofoLM& operator=(McBopomofoLM&&) = delete; - /// Asks to load the associated phrases at the given path. - /// @param associatedPhrasesPath The path of the associated phrases. - void loadAssociatedPhrases(const char* associatedPhrasesPath); + // Loads (or reloads, if already loaded) the primary language model data file. + void loadLanguageModel(const char* languageModelDataPath); - /// If the associated phrases already loaded. - bool isAssociatedPhrasesLoaded(); + bool isDataModelLoaded(); + // Loads (or reloads if already loaded) the associated phrases data file. void loadAssociatedPhrasesV2(const char* associatedPhrasesPath); - /// Asks to load the user phrases and excluded phrases at the given path. - /// @param userPhrasesPath The path of user phrases. - /// @param excludedPhrasesPath The path of excluded phrases. + // Loads (or reloads if already loaded) both the user phrases and the excluded + // phrases files. If one argument is passed a nullptr, that file will not + // be loaded or reloaded. void loadUserPhrases(const char* userPhrasesDataPath, const char* excludedPhrasesDataPath); - /// Asks to load th phrase replacement table at the given path. - /// @param phraseReplacementPath The path of the phrase replacement table. + + // Loads (or reloads if already loaded) the phrase replacement mapping file. void loadPhraseReplacementMap(const char* phraseReplacementPath); - /// Returns a list of available unigram for the given key. - /// @param key A string represents the BPMF reading or a symbol key. For - /// example, it you pass "ㄇㄚ", it returns "嗎", "媽", and so on. + // Returns a list of unigrams for the reading. For example, if the reading is + // "ㄇㄚ", the return may be [unigram("嗎"), unigram("媽") and so on. std::vector getUnigrams( const std::string& key) override; - /// If the model has unigrams for the given key. - /// @param key The key. + bool hasUnigrams(const std::string& key) override; - /// Enables or disables phrase replacement. + std::string getReading(const std::string& value) const; + + std::vector findAssociatedPhrasesV2( + const std::string& prefixValue, + const std::vector& prefixReadings) const; + void setPhraseReplacementEnabled(bool enabled); - /// If phrase replacement is enabled or not. bool phraseReplacementEnabled() const; - /// Enables or disables the external converter. void setExternalConverterEnabled(bool enabled); - /// If the external converted is enabled or not. bool externalConverterEnabled() const; - /// Sets a lambda to let the values of unigrams could be converted by it. void setExternalConverter( std::function externalConverter); - /// Sets a lambda to convert the macro to a string. void setMacroConverter( std::function macroConverter); std::string convertMacro(const std::string& input); - std::vector findAssociatedPhrasesV2( - const std::string& prefixValue, - const std::vector& prefixReadings) const; - - /// Returns the top-scored reading from the base model, given the value. - std::string getReading(const std::string& value); + // Methods to allow loading in-memory data for testing purposes. + void loadLanguageModel(std::unique_ptr db); + void loadAssociatedPhrasesV2(std::unique_ptr db); + void loadUserPhrases(const char* data, size_t length); + void loadExcludedPhrases(const char* data, size_t length); + void loadPhraseReplacementMap(const char* data, size_t length); protected: - /// Filters and converts the input unigrams and return a new list of unigrams. - /// - /// @param unigrams The unigrams to be processed. - /// @param excludedValues The values to excluded unigrams. - /// @param insertedValues The values for unigrams already in the results. - /// It helps to prevent duplicated unigrams. Please note that the method - /// has a side effect that it inserts values to `insertedValues`. + // Filters and converts the input unigrams and returns a new list of unigrams. + // Unigrams whose values are found in `excludedValues` are removed, and the + // kept values will be inserted to the `insertedValues` set. std::vector filterAndTransformUnigrams( const std::vector unigrams, @@ -143,11 +129,14 @@ class McBopomofoLM : public Formosa::Gramambular2::LanguageModel { PhraseReplacementMap phraseReplacement_; AssociatedPhrasesV2 associatedPhrasesV2_; - std::function macroConverter_; - bool phraseReplacementEnabled_; - bool externalConverterEnabled_; + bool phraseReplacementEnabled_ = false; + + bool externalConverterEnabled_ = false; std::function externalConverter_; + + std::function macroConverter_; }; + } // namespace McBopomofo #endif // SRC_ENGINE_MCBOPOMOFOLM_H_ diff --git a/src/Engine/McBopomofoLMTest.cpp b/src/Engine/McBopomofoLMTest.cpp new file mode 100644 index 0000000..e45ee3b --- /dev/null +++ b/src/Engine/McBopomofoLMTest.cpp @@ -0,0 +1,284 @@ +// Copyright (c) 2024 and onwards The McBopomofo Authors. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include +#include + +#include "McBopomofoLM.h" +#include "gtest/gtest.h" + +namespace McBopomofo { + +constexpr char kPrimaryLMData[] = R"( +# format org.openvanilla.mcbopomofo.sorted +ㄇㄧㄥˊ 明 -3.07936356 +ㄇㄧㄥˊ 名 -3.12166252 +ㄇㄧㄥˊ 銘 -4.43019121 +ㄇㄧㄥˊ-ㄘˊ 名詞 -4.61364867 +ㄇㄧㄥˊ-ㄘˋ 名次 -5.47446950 +ㄉㄨㄥˋ 動 -2.83459585 +ㄉㄨㄥˋ 洞 -4.31757780 +ㄉㄨㄥˋ-ㄗㄨㄛˋ 動作 -4.17449149 +ㄐㄧㄣ-ㄊㄧㄢ 今天 -3.28959497 +ㄐㄧㄣ-ㄊㄧㄢ MACRO@DATE_TODAY_SHORT -8 +ㄐㄧㄣ-ㄊㄧㄢ MACRO@DATE_TODAY_MEDIUM -8 +ㄔㄥˊ-ㄕˋ 城市 -3.98856498 +ㄔㄥˊ-ㄕˋ 程式 -4.07624939 +ㄔㄥˊ-ㄕˋ 成事 -5.88664994 +ㄙㄜˋ-ㄍㄨˇ 澀谷 -6.78973993 +ㄙㄜˋ-ㄍㄨˇ 渋谷 -6.78973993 +)"; + +constexpr char kAssociatedPhrasesV2Data[] = R"( +# format org.openvanilla.mcbopomofo.sorted +名-ㄇㄧㄥˊ-下-ㄒㄧㄚˋ -5.7106 +名-ㄇㄧㄥˊ-不-ㄅㄨˊ-見-ㄐㄧㄢˋ-經-ㄐㄧㄥ-傳-ㄓㄨㄢˋ -5.9904 +)"; + +constexpr char kUserPhrasesData[] = R"( +茗 ㄇㄧㄥˊ +丼 ㄉㄨㄥˋ +名刺 ㄇㄧㄥˊ-ㄘˋ +程式 ㄔㄥˊ-ㄕˋ +)"; + +constexpr char kExcludedPhrasesData[] = R"( +動作 ㄉㄨㄥˋ-ㄗㄨㄛˋ +)"; + +constexpr char kPhreaseReplacementMapData[] = R"( +動作 动作 +澀谷 渋谷 +)"; + +TEST(McBopomofoLMTest, PrimaryLanguageModel) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + + EXPECT_TRUE(lm.hasUnigrams("ㄇㄧㄥˊ-ㄘˊ")); + EXPECT_FALSE(lm.hasUnigrams("ㄉㄨㄥˋ-ㄘˊ")); + auto unigrams = lm.getUnigrams("ㄇㄧㄥˊ-ㄘˊ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "名詞"); + EXPECT_LT(unigrams[0].score(), 0); +} + +TEST(McBopomofoLMTest, AssociatedPhrasesV2) { + McBopomofoLM lm; + auto db = std::make_unique( + kAssociatedPhrasesV2Data, sizeof(kAssociatedPhrasesV2Data)); + lm.loadAssociatedPhrasesV2(std::move(db)); + + auto phrases = lm.findAssociatedPhrasesV2("名", {"ㄇㄧㄥˊ"}); + EXPECT_FALSE(phrases.empty()); + EXPECT_EQ(phrases[0].value, "名下"); + EXPECT_EQ(phrases[0].readings, + (std::vector{"ㄇㄧㄥˊ", "ㄒㄧㄚˋ"})); + + EXPECT_TRUE(lm.findAssociatedPhrasesV2("銘", {"ㄇㄧㄥˊ"}).empty()); +} + +TEST(McBopomofoLMTest, UserPhrases) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + lm.loadUserPhrases(kUserPhrasesData, sizeof(kUserPhrasesData)); + + auto unigrams = lm.getUnigrams("ㄇㄧㄥˊ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "茗"); +} + +TEST(McBopomofoLMTest, ExcludedPhrases) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + + auto unigrams = lm.getUnigrams("ㄉㄨㄥˋ-ㄗㄨㄛˋ"); + EXPECT_FALSE(unigrams.empty()); + + lm.loadExcludedPhrases(kExcludedPhrasesData, sizeof(kExcludedPhrasesData)); + unigrams = lm.getUnigrams("ㄉㄨㄥˋ-ㄗㄨㄛˋ"); + EXPECT_TRUE(unigrams.empty()); +} + +TEST(McBopomofoLMTest, PhraseReplacementMap) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + lm.loadPhraseReplacementMap(kPhreaseReplacementMapData, + sizeof(kPhreaseReplacementMapData)); + auto unigrams = lm.getUnigrams("ㄉㄨㄥˋ-ㄗㄨㄛˋ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "動作"); + + lm.setPhraseReplacementEnabled(true); + unigrams = lm.getUnigrams("ㄉㄨㄥˋ-ㄗㄨㄛˋ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "动作"); +} + +TEST(McBopomofoLMTest, PhraseReplacementMapDeduplicates) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + lm.loadPhraseReplacementMap(kPhreaseReplacementMapData, + sizeof(kPhreaseReplacementMapData)); + auto unigrams = lm.getUnigrams("ㄙㄜˋ-ㄍㄨˇ"); + EXPECT_EQ(unigrams.size(), 2); + + lm.setPhraseReplacementEnabled(true); + unigrams = lm.getUnigrams("ㄙㄜˋ-ㄍㄨˇ"); + ASSERT_EQ(unigrams.size(), 1); + EXPECT_EQ(unigrams[0].value(), "渋谷"); +} + +TEST(McBopomofoLMTest, UserPhrasesOverrideDefaultLanguageModelPhrases) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + auto unigrams = lm.getUnigrams("ㄔㄥˊ-ㄕˋ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "城市"); + + lm.loadUserPhrases(kUserPhrasesData, sizeof(kUserPhrasesData)); + unigrams = lm.getUnigrams("ㄔㄥˊ-ㄕˋ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "程式"); +} + +TEST(McBopomofoLMTest, MonoSyllableUserPhrasesNeedRewrite) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + lm.loadUserPhrases(kUserPhrasesData, sizeof(kUserPhrasesData)); + + auto unigrams = lm.getUnigrams("ㄉㄨㄥˋ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "丼"); + EXPECT_LT(unigrams[0].score(), 0); // note: this is not 0! + EXPECT_GT(unigrams[0].score(), unigrams[1].score()); + // The delta between the two should be miniscule. + EXPECT_LT(std::abs(unigrams[0].score() - unigrams[1].score()), 0.000001); +} + +TEST(McBopomofoLMTest, MultipleSyllableUserPhrasesNeedNoRewrite) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + lm.loadUserPhrases(kUserPhrasesData, sizeof(kUserPhrasesData)); + + auto unigrams = lm.getUnigrams("ㄇㄧㄥˊ-ㄘˋ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "名刺"); + EXPECT_EQ(unigrams[0].score(), 0); // note: this is 0! + EXPECT_EQ(unigrams[1].value(), "名次"); + EXPECT_LT(unigrams[1].score(), 0); +} + +TEST(McBopomofoLMTest, ExternalConverterWhenNotSetThenEnablingItIsNoOp) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + lm.setExternalConverterEnabled(true); + + auto unigrams = lm.getUnigrams("ㄇㄧㄥˊ-ㄘˊ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "名詞"); +} + +TEST(McBopomofoLMTest, ExternalConverter) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + + auto unigrams = lm.getUnigrams("ㄇㄧㄥˊ-ㄘˊ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "名詞"); + + lm.setExternalConverterEnabled(true); + lm.setExternalConverter([](const auto& value) { return value + "!"; }); + + unigrams = lm.getUnigrams("ㄇㄧㄥˊ-ㄘˊ"); + ASSERT_FALSE(unigrams.empty()); + EXPECT_EQ(unigrams[0].value(), "名詞!"); +} + +TEST(McBopomofoLMTest, ExternalConverterConversionResultsAreDeduplicated) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + + auto unigrams = lm.getUnigrams("ㄇㄧㄥˊ"); + ASSERT_GT(unigrams.size(), 1); + + lm.setExternalConverterEnabled(true); + lm.setExternalConverter([](const auto&) { return "!"; }); + + unigrams = lm.getUnigrams("ㄇㄧㄥˊ"); + ASSERT_EQ(unigrams.size(), 1); + EXPECT_EQ(unigrams[0].value(), "!"); +} + +TEST(McBopomofoLMTest, DefaultMacroConverterIsNoOp) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + + ASSERT_EQ(lm.convertMacro("MACRO@DATE_TODAY_SHORT"), + "MACRO@DATE_TODAY_SHORT"); +} + +TEST(McBopomofoLMTest, MacroValueNotRecognizedByMacroConverterIsFiltered) { + McBopomofoLM lm; + auto db = std::make_unique(kPrimaryLMData, + sizeof(kPrimaryLMData)); + lm.loadLanguageModel(std::move(db)); + + lm.setMacroConverter([](const std::string& macro) { + if (macro == "MACRO@DATE_TODAY_SHORT") { + return std::string("6/10/21"); + } + return macro; + }); + + auto unigrams = lm.getUnigrams("ㄐㄧㄣ-ㄊㄧㄢ"); + ASSERT_EQ(unigrams.size(), 2); // only one macro is supported + EXPECT_EQ(unigrams[0].value(), "今天"); + EXPECT_EQ(unigrams[1].value(), "6/10/21"); +} + +} // namespace McBopomofo From 1545e23b5b4eca6e3bddf716ae72e079324889b8 Mon Sep 17 00:00:00 2001 From: Lukhnos Liu Date: Thu, 14 Mar 2024 20:23:32 -0700 Subject: [PATCH 5/6] Make use of UserPhrasesLM::kUserUnigramScore --- src/Engine/McBopomofoLMTest.cpp | 8 +++++--- src/Engine/UserPhrasesLM.h | 2 +- src/Engine/UserPhrasesLMTest.cpp | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/Engine/McBopomofoLMTest.cpp b/src/Engine/McBopomofoLMTest.cpp index e45ee3b..cc38b11 100644 --- a/src/Engine/McBopomofoLMTest.cpp +++ b/src/Engine/McBopomofoLMTest.cpp @@ -184,9 +184,10 @@ TEST(McBopomofoLMTest, MonoSyllableUserPhrasesNeedRewrite) { auto unigrams = lm.getUnigrams("ㄉㄨㄥˋ"); ASSERT_FALSE(unigrams.empty()); EXPECT_EQ(unigrams[0].value(), "丼"); - EXPECT_LT(unigrams[0].score(), 0); // note: this is not 0! + // This user unigram's score is recomputed and must be < kUserUnigramScore. + EXPECT_LT(unigrams[0].score(), UserPhrasesLM::kUserUnigramScore); EXPECT_GT(unigrams[0].score(), unigrams[1].score()); - // The delta between the two should be miniscule. + // The delta between the two should be minuscule. EXPECT_LT(std::abs(unigrams[0].score() - unigrams[1].score()), 0.000001); } @@ -200,7 +201,8 @@ TEST(McBopomofoLMTest, MultipleSyllableUserPhrasesNeedNoRewrite) { auto unigrams = lm.getUnigrams("ㄇㄧㄥˊ-ㄘˋ"); ASSERT_FALSE(unigrams.empty()); EXPECT_EQ(unigrams[0].value(), "名刺"); - EXPECT_EQ(unigrams[0].score(), 0); // note: this is 0! + // This user unigram's score is not recomputed, so is still kUserUnigramScore. + EXPECT_EQ(unigrams[0].score(), UserPhrasesLM::kUserUnigramScore); EXPECT_EQ(unigrams[1].value(), "名次"); EXPECT_LT(unigrams[1].score(), 0); } diff --git a/src/Engine/UserPhrasesLM.h b/src/Engine/UserPhrasesLM.h index d0d9f1d..e58e048 100644 --- a/src/Engine/UserPhrasesLM.h +++ b/src/Engine/UserPhrasesLM.h @@ -54,9 +54,9 @@ class UserPhrasesLM : public Formosa::Gramambular2::LanguageModel { const std::string& key) override; bool hasUnigrams(const std::string& key) override; - protected: static constexpr double kUserUnigramScore = 0; + protected: MemoryMappedFile mmapedFile_; std::map> keyRowMap; }; diff --git a/src/Engine/UserPhrasesLMTest.cpp b/src/Engine/UserPhrasesLMTest.cpp index 236920c..73e0126 100644 --- a/src/Engine/UserPhrasesLMTest.cpp +++ b/src/Engine/UserPhrasesLMTest.cpp @@ -44,6 +44,7 @@ TEST(UserPhrasesLMTest, LenientReading) { std::vector results = lm.getUnigrams("reading1"); EXPECT_EQ(results[0].value(), "value1"); + EXPECT_EQ(results[0].score(), UserPhrasesLM::kUserUnigramScore); } } // namespace McBopomofo From 5a10a34cbaae3c664550f7ee98ccff9794fd8932 Mon Sep 17 00:00:00 2001 From: Lukhnos Liu Date: Fri, 15 Mar 2024 08:02:30 -0700 Subject: [PATCH 6/6] Replace `if (x)` with `if (x != nullptr)` --- src/Engine/McBopomofoLM.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Engine/McBopomofoLM.cpp b/src/Engine/McBopomofoLM.cpp index 052e2a1..8cb1edd 100644 --- a/src/Engine/McBopomofoLM.cpp +++ b/src/Engine/McBopomofoLM.cpp @@ -245,7 +245,7 @@ McBopomofoLM::filterAndTransformUnigrams( value = replacement; } } - if (macroConverter_) { + if (macroConverter_ != nullptr) { std::string replacement = macroConverter_(value); value = replacement; } @@ -256,7 +256,7 @@ McBopomofoLM::filterAndTransformUnigrams( continue; } - if (externalConverterEnabled_ && externalConverter_) { + if (externalConverterEnabled_ && externalConverter_ != nullptr) { std::string replacement = externalConverter_(value); value = replacement; }