Bring in lookup-by-value introduced in the Mac version

Code from openvanilla/McBopomofo#372 and openvanilla/McBopomofo#374
openvanilla · Oct 4, 2023 · b1ef71e · b1ef71e
1 parent 81ca619
commit b1ef71e
Show file tree

Hide file tree

Showing 8 changed files with 165 additions and 1 deletion.
diff --git a/src/Engine/McBopomofoLM.cpp b/src/Engine/McBopomofoLM.cpp
@@ -23,7 +23,9 @@
 
 #include "McBopomofoLM.h"
 #include <algorithm>
+#include <float.h>
 #include <iterator>
+#include <limits>
 
 namespace McBopomofo {
 
@@ -135,6 +137,20 @@ bool McBopomofoLM::hasUnigrams(const std::string& key)
     return !getUnigrams(key).empty();
 }
 
+std::string McBopomofoLM::getReading(const std::string& value)
+{
+    // Collect every reading the underlying language model has for this value,
+    // then keep the one with the strictly highest score. On a tie the
+    // earliest candidate stays selected; with no candidates the result is an
+    // empty string.
+    const auto candidates = m_languageModel.getReadings(value);
+
+    const ParselessLM::FoundReading* best = nullptr;
+    double bestScore = std::numeric_limits<double>::lowest();
+    for (const auto& candidate : candidates) {
+        if (candidate.score > bestScore) {
+            best = &candidate;
+            bestScore = candidate.score;
+        }
+    }
+
+    return best != nullptr ? best->reading : std::string();
+}
+
 void McBopomofoLM::setPhraseReplacementEnabled(bool enabled)
 {
     m_phraseReplacementEnabled = enabled;

diff --git a/src/Engine/McBopomofoLM.h b/src/Engine/McBopomofoLM.h
@@ -105,6 +105,9 @@ class McBopomofoLM : public Formosa::Gramambular2::LanguageModel {
     const std::vector<std::string> associatedPhrasesForKey(const std::string& key);
     bool hasAssociatedPhrasesForKey(const std::string& key);
 
+    /// Returns the top-scored reading from the base model, given the value.
+    /// If several readings share the top score, the first one found is
+    /// returned. Returns an empty string if no reading is found.
+    std::string getReading(const std::string& value);
+
 protected:
     /// Filters and converts the input unigrams and return a new list of unigrams.
     ///

diff --git a/src/Engine/ParselessLM.cpp b/src/Engine/ParselessLM.cpp
@@ -141,3 +141,54 @@ bool McBopomofo::ParselessLM::hasUnigrams(const std::string& key)
 
     return db_->findFirstMatchingLine(key + " ") != nullptr;
 }
+
+// Looks up every reading (the key column) recorded for the given value.
+//
+// Rows are expected to be in the form "key value score". A row without a
+// score column is reported with a score of 0. A row whose score column is not
+// a parsable number is skipped, so that a damaged data row cannot raise an
+// exception out of this lookup. Returns an empty vector when db_ is null or
+// when no row matches.
+std::vector<McBopomofo::ParselessLM::FoundReading> McBopomofo::ParselessLM::getReadings(const std::string& value)
+{
+    std::vector<McBopomofo::ParselessLM::FoundReading> results;
+    if (db_ == nullptr) {
+        return results;
+    }
+
+    // We append a space so that we only find rows with the exact value. We
+    // are taking advantage of the fact that a well-formed row in this LM must
+    // be in the format of "key value score".
+    const std::string actualValue = value + " ";
+
+    for (const auto& row : db_->reverseFindRows(actualValue)) {
+        // Everything before the first space is the key. A row without any
+        // space is treated as consisting of a key only.
+        const std::string::size_type keyEnd = row.find(' ');
+        std::string key = row.substr(0, keyEnd);
+        double score = 0;
+
+        // The value column starts right after the first space and runs up to
+        // the next space; we only need to know where it ends.
+        std::string::size_type valueEnd = std::string::npos;
+        if (keyEnd != std::string::npos) {
+            valueEnd = row.find(' ', keyEnd + 1);
+        }
+
+        // The remainder past that second space, if it exists, is the score.
+        if (valueEnd != std::string::npos && valueEnd + 1 < row.size()) {
+            try {
+                score = std::stod(row.substr(valueEnd + 1));
+            } catch (const std::exception&) {
+                // std::stod throws when the text is not a number or is out
+                // of range. Drop the row instead of letting the exception
+                // propagate to the caller.
+                continue;
+            }
+        }
+        results.emplace_back(McBopomofo::ParselessLM::FoundReading { key, score });
+    }
+    return results;
+}
diff --git a/src/Engine/ParselessLM.h b/src/Engine/ParselessLM.h
@@ -45,6 +45,13 @@ class ParselessLM : public Formosa::Gramambular2::LanguageModel {
         const std::string& key) override;
     bool hasUnigrams(const std::string& key) override;
 
+    // One result of a lookup-by-value: a reading and the score of its row.
+    struct FoundReading {
+        // The reading, i.e. the key column of the matching row.
+        std::string reading;
+        // The score column of the matching row (0 if the row has no score).
+        double score;
+    };
+    // Look up reading by value. This is specific to ParselessLM only.
+    // Returns one entry per matching row; empty if nothing matches.
+    std::vector<FoundReading> getReadings(const std::string& value);
+
 private:
     int fd_ = -1;
     void* data_ = nullptr;

diff --git a/src/Engine/ParselessLMTest.cpp b/src/Engine/ParselessLMTest.cpp
@@ -53,6 +53,20 @@ TEST(ParselessLMTest, SanityCheckTest)
     unigrams = lm.getUnigrams("_punctuation_list");
     ASSERT_GT(unigrams.size(), 0);
 
+    std::vector<ParselessLM::FoundReading> found_readings;
+    found_readings = lm.getReadings("不存在的詞");
+    ASSERT_TRUE(found_readings.empty());
+
+    found_readings = lm.getReadings("讀音");
+    ASSERT_EQ(found_readings.size(), 1);
+
+    found_readings = lm.getReadings("鑰匙");
+    ASSERT_GT(found_readings.size(), 1);
+
+    found_readings = lm.getReadings("得");
+    ASSERT_GT(found_readings.size(), 1);
+    ASSERT_EQ(found_readings[0].reading, "ㄉㄜˊ");
+
     lm.close();
 }
 

diff --git a/src/Engine/ParselessPhraseDB.cpp b/src/Engine/ParselessPhraseDB.cpp
@@ -156,4 +156,44 @@ const char* ParselessPhraseDB::findFirstMatchingLine(
     return nullptr;
 }
 
+// Linearly scans every record and returns those whose text, starting right
+// after the key and the field separator, begins with `value`. Each returned
+// string is the whole record without its line terminator.
+//
+// The comparison is a raw byte compare that is not limited to the record
+// itself, so a value ending in '\n' can still match a record that is followed
+// by another one.
+std::vector<std::string> ParselessPhraseDB::reverseFindRows(
+    const std::string_view& value)
+{
+    std::vector<std::string> rows;
+
+    const char* recordBegin = begin_;
+
+    while (recordBegin < end_) {
+        const char* ptr = recordBegin;
+
+        // skip over the key to find the field separator
+        while (ptr < end_ && *ptr != ' ') {
+            ++ptr;
+        }
+        // skip over the field separator. there should be just one, but loop just in case.
+        while (ptr < end_ && *ptr == ' ') {
+            ++ptr;
+        }
+
+        // now walk to the end of this record
+        const char* recordEnd = ptr;
+        while (recordEnd < end_ && *recordEnd != '\n') {
+            ++recordEnd;
+        }
+
+        // Compare only when `value` fits in what is left of the buffer.
+        // Checking against the remaining length lets a value that ends
+        // exactly at the end of the buffer match, and avoids forming a
+        // pointer past the end of the data.
+        const std::size_t remaining = static_cast<std::size_t>(end_ - ptr);
+        if (value.length() <= remaining && memcmp(ptr, value.data(), value.length()) == 0) {
+            // prefix match, add entire record to return value
+            rows.emplace_back(recordBegin, recordEnd - recordBegin);
+        }
+
+        // skip over to the next line start
+        recordBegin = recordEnd;
+        while (recordBegin < end_ && *recordBegin == '\n') {
+            ++recordBegin;
+        }
+    }
+
+    return rows;
+}
+
 }; // namespace McBopomofo
diff --git a/src/Engine/ParselessPhraseDB.h b/src/Engine/ParselessPhraseDB.h
@@ -25,7 +25,6 @@
 #define SOURCE_ENGINE_PARSELESSPHRASEDB_H_
 
 #include <cstddef>
-#include <cstdint>
 #include <string>
 #include <vector>
 
@@ -51,6 +50,14 @@ class ParselessPhraseDB {
 
     const char* findFirstMatchingLine(const std::string_view& key);
 
+    // Find the rows whose text past the key column plus the field separator
+    // is a prefix match of the given value. For example, if the row is
+    // "foo bar -1.00", the values "b", "ba", "bar", "bar ", "bar -1.00" are
+    // valid prefix matches, whereas the value "barr" isn't. This
+    // performs linear scan since, unlike lookup-by-key, it cannot take
+    // advantage of the fact that the underlying data is sorted by keys.
+    std::vector<std::string> reverseFindRows(const std::string_view& value);
+
 private:
     const char* begin_;
     const char* end_;

diff --git a/src/Engine/ParselessPhraseDBTest.cpp b/src/Engine/ParselessPhraseDBTest.cpp
@@ -195,4 +195,30 @@ TEST(ParselessPhraseDBTest, StressTest)
     }
 }
 
+// Exercises ParselessPhraseDB::reverseFindRows on a small in-memory database.
+TEST(ParselessPhraseDBTest, LookUpByValue)
+{
+    // Four records; the second has a trailing space after its value and the
+    // last one is not terminated by a newline.
+    std::string data = "a 1\nb 1 \nc 2\nd 3";
+    ParselessPhraseDB db(data.c_str(), data.length());
+
+    std::vector<std::string> rows;
+    // Rows are returned whole, so the trailing space of "b 1 " is kept.
+    rows = db.reverseFindRows("1");
+    ASSERT_EQ(rows, (std::vector<std::string> { "a 1", "b 1 " }));
+
+    rows = db.reverseFindRows("2");
+    ASSERT_EQ(rows, (std::vector<std::string> { "c 2" }));
+
+    // This is a quirk of the function, but is actually valid: the byte
+    // comparison is not limited to the record, so the newline that follows
+    // "c 2" in the data takes part in the match.
+    rows = db.reverseFindRows("2\n");
+    ASSERT_EQ(rows, (std::vector<std::string> { "c 2" }));
+
+    // The lookup value must be a prefix of the row's value, not the reverse.
+    rows = db.reverseFindRows("22");
+    ASSERT_TRUE(rows.empty());
+
+    // The value would have to extend past the end of the data, so the last
+    // record cannot match.
+    rows = db.reverseFindRows("3\n");
+    ASSERT_TRUE(rows.empty());
+
+    // No record has this value at all.
+    rows = db.reverseFindRows("4");
+    ASSERT_TRUE(rows.empty());
+}
+
 }; // namespace McBopomofo