From b1ef71ebeb3be686ebd4b2c5dd1efbd2c9269cd8 Mon Sep 17 00:00:00 2001 From: Lukhnos Liu Date: Tue, 3 Oct 2023 19:57:23 -0700 Subject: [PATCH] Bring in lookup-by-value introduced in the Mac version Code from https://github.com/openvanilla/McBopomofo/pull/372 and https://github.com/openvanilla/McBopomofo/pull/374 --- src/Engine/McBopomofoLM.cpp | 16 +++++++++ src/Engine/McBopomofoLM.h | 3 ++ src/Engine/ParselessLM.cpp | 51 ++++++++++++++++++++++++++++ src/Engine/ParselessLM.h | 7 ++++ src/Engine/ParselessLMTest.cpp | 14 ++++++++ src/Engine/ParselessPhraseDB.cpp | 40 ++++++++++++++++++++++ src/Engine/ParselessPhraseDB.h | 9 ++++- src/Engine/ParselessPhraseDBTest.cpp | 26 ++++++++++++++ 8 files changed, 165 insertions(+), 1 deletion(-) diff --git a/src/Engine/McBopomofoLM.cpp b/src/Engine/McBopomofoLM.cpp index fd58a9a..f3e0787 100644 --- a/src/Engine/McBopomofoLM.cpp +++ b/src/Engine/McBopomofoLM.cpp @@ -23,7 +23,9 @@ #include "McBopomofoLM.h" #include +#include #include +#include namespace McBopomofo { @@ -135,6 +137,20 @@ bool McBopomofoLM::hasUnigrams(const std::string& key) return !getUnigrams(key).empty(); } +std::string McBopomofoLM::getReading(const std::string& value) /* picks the highest-scored reading that m_languageModel reports for value */ +{ + std::vector foundReadings = m_languageModel.getReadings(value); + double topScore = std::numeric_limits::lowest(); /* starting score; any reading scoring above it replaces it */ + std::string topValue; /* left empty when no reading is selected */ + for (const auto& foundReading : foundReadings) { + if (foundReading.score > topScore) { /* strict '>' keeps the earlier reading on ties */ + topValue = foundReading.reading; + topScore = foundReading.score; + } + } + return topValue; +} + void McBopomofoLM::setPhraseReplacementEnabled(bool enabled) { m_phraseReplacementEnabled = enabled; diff --git a/src/Engine/McBopomofoLM.h b/src/Engine/McBopomofoLM.h index 249a6de..80ab5e1 100644 --- a/src/Engine/McBopomofoLM.h +++ b/src/Engine/McBopomofoLM.h @@ -105,6 +105,9 @@ class McBopomofoLM : public Formosa::Gramambular2::LanguageModel { const std::vector associatedPhrasesForKey(const std::string& key); bool hasAssociatedPhrasesForKey(const std::string& key); 
+ /// Returns the top-scored reading from the base model, given the value. + std::string getReading(const std::string& value); + protected: /// Filters and converts the input unigrams and return a new list of unigrams. /// diff --git a/src/Engine/ParselessLM.cpp b/src/Engine/ParselessLM.cpp index 9b1d3e5..dd2c2e6 100644 --- a/src/Engine/ParselessLM.cpp +++ b/src/Engine/ParselessLM.cpp @@ -141,3 +141,54 @@ bool McBopomofo::ParselessLM::hasUnigrams(const std::string& key) return db_->findFirstMatchingLine(key + " ") != nullptr; } + +std::vector McBopomofo::ParselessLM::getReadings(const std::string& value) +{ + if (db_ == nullptr) { + return std::vector(); + } + + std::vector results; + + // We append a space so that we only find rows with the exact value. We + // are taking advantage of the fact that a well-formed row in this LM must + // be in the format of "key value score". + std::string actualValue = value + " "; + + for (const auto& row : db_->reverseFindRows(actualValue)) { + std::string key; + double score = 0; /* kept as-is when the row carries no score field */ + + // Move ahead until we encounter the first space. This is the key. + auto it = row.begin(); + while (it != row.end() && *it != ' ') { + ++it; + } + + key = std::string(row.begin(), it); + + // Read past the space. + if (it != row.end()) { + ++it; + } + + if (it != row.end()) { + // Now it is the start of the value portion, but we move ahead + // until we encounter the second space to skip this part. + while (it != row.end() && *it != ' ') { + ++it; + } + } + + // Read past the space. The remainder, if it exists, is the score. 
+ if (it != row.end()) { + ++it; + } + + if (it != row.end()) { + score = std::stod(std::string(it, row.end())); + } + results.emplace_back(McBopomofo::ParselessLM::FoundReading { key, score }); + } + return results; +} diff --git a/src/Engine/ParselessLM.h b/src/Engine/ParselessLM.h index 8ac27c7..f215b9e 100644 --- a/src/Engine/ParselessLM.h +++ b/src/Engine/ParselessLM.h @@ -45,6 +45,13 @@ class ParselessLM : public Formosa::Gramambular2::LanguageModel { const std::string& key) override; bool hasUnigrams(const std::string& key) override; + struct FoundReading { + std::string reading; + double score; + }; + // Look up reading by value. This is specific to ParselessLM only. + std::vector getReadings(const std::string& value); + private: int fd_ = -1; void* data_ = nullptr; diff --git a/src/Engine/ParselessLMTest.cpp b/src/Engine/ParselessLMTest.cpp index d73aaf6..3694d5b 100644 --- a/src/Engine/ParselessLMTest.cpp +++ b/src/Engine/ParselessLMTest.cpp @@ -53,6 +53,20 @@ TEST(ParselessLMTest, SanityCheckTest) unigrams = lm.getUnigrams("_punctuation_list"); ASSERT_GT(unigrams.size(), 0); + std::vector found_readings; + found_readings = lm.getReadings("不存在的詞"); + ASSERT_TRUE(found_readings.empty()); + + found_readings = lm.getReadings("讀音"); + ASSERT_EQ(found_readings.size(), 1); + + found_readings = lm.getReadings("鑰匙"); + ASSERT_GT(found_readings.size(), 1); + + found_readings = lm.getReadings("得"); + ASSERT_GT(found_readings.size(), 1); + ASSERT_EQ(found_readings[0].reading, "ㄉㄜˊ"); + lm.close(); } diff --git a/src/Engine/ParselessPhraseDB.cpp b/src/Engine/ParselessPhraseDB.cpp index c00cccb..c4ab86e 100644 --- a/src/Engine/ParselessPhraseDB.cpp +++ b/src/Engine/ParselessPhraseDB.cpp @@ -156,4 +156,44 @@ const char* ParselessPhraseDB::findFirstMatchingLine( return nullptr; } +std::vector ParselessPhraseDB::reverseFindRows( + const std::string_view& value) +{ + std::vector rows; + + const char* recordBegin = begin_; + + while (recordBegin < end_) { + const char* ptr 
= recordBegin; + + // skip over the key to find the field separator + while (ptr < end_ && *ptr != ' ') { + ++ptr; + } + // skip over the field separator. there should be just one, but loop just in case. + while (ptr < end_ && *ptr == ' ') { + ++ptr; + } + + // now walk to the end of this record + const char* recordEnd = ptr; + while (recordEnd < end_ && *recordEnd != '\n') { + ++recordEnd; + } + + if (value.length() <= static_cast<size_t>(end_ - ptr) && memcmp(ptr, value.data(), value.length()) == 0) { /* compare only when the whole value fits in the bytes left; a value ending exactly at end_ (last row, no trailing newline) is allowed */ + // prefix match, add entire record to return value + rows.emplace_back(recordBegin, recordEnd - recordBegin); + } + + // skip over to the next line start + recordBegin = recordEnd; + while (recordBegin < end_ && *recordBegin == '\n') { + ++recordBegin; + } + } + + return rows; +} + }; // namespace McBopomofo diff --git a/src/Engine/ParselessPhraseDB.h b/src/Engine/ParselessPhraseDB.h index 6d2d001..436a6ca 100644 --- a/src/Engine/ParselessPhraseDB.h +++ b/src/Engine/ParselessPhraseDB.h @@ -25,7 +25,6 @@ #define SOURCE_ENGINE_PARSELESSPHRASEDB_H_ #include -#include #include #include @@ -51,6 +50,14 @@ class ParselessPhraseDB { const char* findFirstMatchingLine(const std::string_view& key); + // Find the rows whose text past the key column plus the field separator + // is a prefix match of the given value. For example, if the row is + // "foo bar -1.00", the values "b", "ba", "bar", "bar ", "bar -1.00" are + // valid prefix matches, whereas the value "barr" isn't. This + // performs linear scan since, unlike lookup-by-key, it cannot take + // advantage of the fact that the underlying data is sorted by keys. 
+ std::vector reverseFindRows(const std::string_view& value); + private: const char* begin_; const char* end_; diff --git a/src/Engine/ParselessPhraseDBTest.cpp b/src/Engine/ParselessPhraseDBTest.cpp index 07c0b7d..a116989 100644 --- a/src/Engine/ParselessPhraseDBTest.cpp +++ b/src/Engine/ParselessPhraseDBTest.cpp @@ -195,4 +195,30 @@ TEST(ParselessPhraseDBTest, StressTest) } } +TEST(ParselessPhraseDBTest, LookUpByValue) +{ + std::string data = "a 1\nb 1 \nc 2\nd 3"; /* four rows; the last row, "d 3", has no trailing newline */ + ParselessPhraseDB db(data.c_str(), data.length()); + + std::vector rows; + rows = db.reverseFindRows("1"); + ASSERT_EQ(rows, (std::vector { "a 1", "b 1 " })); + + rows = db.reverseFindRows("2"); + ASSERT_EQ(rows, (std::vector { "c 2" })); + + // This is a quirk of the function, but is actually valid: the value is compared against the raw bytes that follow the key, so it can extend into the newline after the row. + rows = db.reverseFindRows("2\n"); + ASSERT_EQ(rows, (std::vector { "c 2" })); + + rows = db.reverseFindRows("22"); + ASSERT_TRUE(rows.empty()); + + rows = db.reverseFindRows("3\n"); /* nothing follows "d 3" in the data, so the newline has nothing to match */ + ASSERT_TRUE(rows.empty()); + + rows = db.reverseFindRows("4"); + ASSERT_TRUE(rows.empty()); +} + }; // namespace McBopomofo