From b1ef71ebeb3be686ebd4b2c5dd1efbd2c9269cd8 Mon Sep 17 00:00:00 2001
From: Lukhnos Liu <lukhnos@lukhnos.org>
Date: Tue, 3 Oct 2023 19:57:23 -0700
Subject: [PATCH] Bring in lookup-by-value introduced in the Mac version

Code from https://github.com/openvanilla/McBopomofo/pull/372 and
https://github.com/openvanilla/McBopomofo/pull/374
---
 src/Engine/McBopomofoLM.cpp          | 16 +++++++++
 src/Engine/McBopomofoLM.h            |  3 ++
 src/Engine/ParselessLM.cpp           | 51 ++++++++++++++++++++++++++++
 src/Engine/ParselessLM.h             |  7 ++++
 src/Engine/ParselessLMTest.cpp       | 14 ++++++++
 src/Engine/ParselessPhraseDB.cpp     | 40 ++++++++++++++++++++++
 src/Engine/ParselessPhraseDB.h       |  9 ++++-
 src/Engine/ParselessPhraseDBTest.cpp | 26 ++++++++++++++
 8 files changed, 165 insertions(+), 1 deletion(-)
diff --git a/src/Engine/McBopomofoLM.cpp b/src/Engine/McBopomofoLM.cpp
index fd58a9a..f3e0787 100644
--- a/src/Engine/McBopomofoLM.cpp
+++ b/src/Engine/McBopomofoLM.cpp
@@ -23,7 +23,9 @@
 
 #include "McBopomofoLM.h"
 #include <algorithm>
+#include <float.h>
 #include <iterator>
+#include <limits>
 
 namespace McBopomofo {
 
@@ -135,6 +137,20 @@ bool McBopomofoLM::hasUnigrams(const std::string& key)
     return !getUnigrams(key).empty();
 }
 
+std::string McBopomofoLM::getReading(const std::string& value)
+{
+    // Linear scan for the best score. On a tie the earliest reading is kept.
+    std::string bestReading;
+    double bestScore = std::numeric_limits<double>::lowest();
+    for (const auto& candidate : m_languageModel.getReadings(value)) {
+        if (candidate.score > bestScore) {
+            bestScore = candidate.score;
+            bestReading = candidate.reading;
+        }
+    }
+    return bestReading;
+}
+
 void McBopomofoLM::setPhraseReplacementEnabled(bool enabled)
 {
     m_phraseReplacementEnabled = enabled;
diff --git a/src/Engine/McBopomofoLM.h b/src/Engine/McBopomofoLM.h
index 249a6de..80ab5e1 100644
--- a/src/Engine/McBopomofoLM.h
+++ b/src/Engine/McBopomofoLM.h
@@ -105,6 +105,9 @@ class McBopomofoLM : public Formosa::Gramambular2::LanguageModel {
     const std::vector<std::string> associatedPhrasesForKey(const std::string& key);
     bool hasAssociatedPhrasesForKey(const std::string& key);
 
+    /// Returns the top-scored reading of `value` from the base model, or "" if none is found.
+    std::string getReading(const std::string& value);
+
 protected:
     /// Filters and converts the input unigrams and return a new list of unigrams.
     ///
diff --git a/src/Engine/ParselessLM.cpp b/src/Engine/ParselessLM.cpp
index 9b1d3e5..dd2c2e6 100644
--- a/src/Engine/ParselessLM.cpp
+++ b/src/Engine/ParselessLM.cpp
@@ -141,3 +141,54 @@ bool McBopomofo::ParselessLM::hasUnigrams(const std::string& key)
 
     return db_->findFirstMatchingLine(key + " ") != nullptr;
 }
+
+// Looks up the readings of a value.
+//
+// Every row handed back by the database is taken to be in the format of
+// "key value score", where the key is the reading. A row that ends before
+// the score column gets a score of 0. Note that std::stod throws if a score
+// column is present but cannot be parsed as a number.
+std::vector<McBopomofo::ParselessLM::FoundReading> McBopomofo::ParselessLM::getReadings(const std::string& value)
+{
+    std::vector<McBopomofo::ParselessLM::FoundReading> found;
+
+    // Without an open database there is nothing to search.
+    if (db_ == nullptr) {
+        return found;
+    }
+
+    // Splits one row into its reading and its score.
+    const auto parseRow = [](const std::string& row) {
+        // The key, i.e. the reading, is everything before the first space.
+        // If there is no space at all, substr() yields the whole row.
+        const std::string::size_type keyEnd = row.find(' ');
+        const std::string key = row.substr(0, keyEnd);
+
+        // The value column starts right after the first space and runs up
+        // to the second one. Its text is not needed, only where it ends.
+        std::string::size_type valueEnd = std::string::npos;
+        if (keyEnd != std::string::npos) {
+            valueEnd = row.find(' ', keyEnd + 1);
+        }
+
+        // The remainder past the second space, if it exists, is the score.
+        double score = 0;
+        if (valueEnd != std::string::npos && valueEnd + 1 < row.size()) {
+            score = std::stod(row.substr(valueEnd + 1));
+        }
+
+        return McBopomofo::ParselessLM::FoundReading { key, score };
+    };
+
+    // reverseFindRows() does a prefix match on the text that follows the key
+    // column. Appending a space makes the match end exactly where the value
+    // column ends, so that only rows with the exact value are found.
+    const std::string query = value + " ";
+
+    // Rows are visited in the order in which the database returns them.
+    for (const auto& row : db_->reverseFindRows(query)) {
+        found.emplace_back(parseRow(row));
+    }
+
+    return found;
+}
diff --git a/src/Engine/ParselessLM.h b/src/Engine/ParselessLM.h
index 8ac27c7..f215b9e 100644
--- a/src/Engine/ParselessLM.h
+++ b/src/Engine/ParselessLM.h
@@ -45,6 +45,13 @@ class ParselessLM : public Formosa::Gramambular2::LanguageModel {
         const std::string& key) override;
     bool hasUnigrams(const std::string& key) override;
 
+    struct FoundReading { // One row found when looking up by value.
+        std::string reading; // The row's key column.
+        double score = 0; // The row's score column; stays 0 if the row has none.
+    };
+    // Look up readings by value (linear scan). This is specific to ParselessLM only.
+    std::vector<FoundReading> getReadings(const std::string& value);
+
 private:
     int fd_ = -1;
     void* data_ = nullptr;
diff --git a/src/Engine/ParselessLMTest.cpp b/src/Engine/ParselessLMTest.cpp
index d73aaf6..3694d5b 100644
--- a/src/Engine/ParselessLMTest.cpp
+++ b/src/Engine/ParselessLMTest.cpp
@@ -53,6 +53,20 @@ TEST(ParselessLMTest, SanityCheckTest)
     unigrams = lm.getUnigrams("_punctuation_list");
     ASSERT_GT(unigrams.size(), 0);
 
+    std::vector<ParselessLM::FoundReading> found_readings;
+    found_readings = lm.getReadings("不存在的詞");
+    ASSERT_TRUE(found_readings.empty());
+
+    found_readings = lm.getReadings("讀音");
+    ASSERT_EQ(found_readings.size(), 1);
+
+    found_readings = lm.getReadings("鑰匙");
+    ASSERT_GT(found_readings.size(), 1);
+
+    found_readings = lm.getReadings("得");
+    ASSERT_GT(found_readings.size(), 1);
+    ASSERT_EQ(found_readings[0].reading, "ㄉㄜˊ");
+
     lm.close();
 }
 
diff --git a/src/Engine/ParselessPhraseDB.cpp b/src/Engine/ParselessPhraseDB.cpp
index c00cccb..c4ab86e 100644
--- a/src/Engine/ParselessPhraseDB.cpp
+++ b/src/Engine/ParselessPhraseDB.cpp
@@ -156,4 +156,44 @@ const char* ParselessPhraseDB::findFirstMatchingLine(
     return nullptr;
 }
 
+std::vector<std::string> ParselessPhraseDB::reverseFindRows(
+    const std::string_view& value)
+{
+    // Linear scan over all records. Returns, in data order, every record in
+    // which `value` is a prefix of the text after the key and its separator.
+    std::vector<std::string> rows;
+    const char* recordBegin = begin_;
+    while (recordBegin < end_) {
+        const char* ptr = recordBegin;
+
+        // skip over the key to find the field separator
+        while (ptr < end_ && *ptr != ' ') {
+            ++ptr;
+        }
+        // skip over the field separator. there should be just one, but loop just in case.
+        while (ptr < end_ && *ptr == ' ') {
+            ++ptr;
+        }
+
+        // now walk to the end of this record
+        const char* recordEnd = ptr;
+        while (recordEnd < end_ && *recordEnd != '\n') {
+            ++recordEnd;
+        }
+
+        // Bound the comparison by the bytes left in the buffer. A value that
+        // ends exactly at end_ (last record, no trailing newline) still matches.
+        const std::size_t remaining = static_cast<std::size_t>(end_ - ptr);
+        if (value.length() <= remaining && memcmp(ptr, value.data(), value.length()) == 0) {
+            rows.emplace_back(recordBegin, recordEnd - recordBegin);
+        }
+
+        // skip over to the next line start
+        recordBegin = recordEnd;
+        while (recordBegin < end_ && *recordBegin == '\n') {
+            ++recordBegin;
+        }
+    }
+    return rows;
+}
+
 }; // namespace McBopomofo
diff --git a/src/Engine/ParselessPhraseDB.h b/src/Engine/ParselessPhraseDB.h
index 6d2d001..436a6ca 100644
--- a/src/Engine/ParselessPhraseDB.h
+++ b/src/Engine/ParselessPhraseDB.h
@@ -25,7 +25,6 @@
 #define SOURCE_ENGINE_PARSELESSPHRASEDB_H_
 
 #include <cstddef>
-#include <cstdint>
 #include <string>
 #include <vector>
 
@@ -51,6 +50,14 @@ class ParselessPhraseDB {
 
     const char* findFirstMatchingLine(const std::string_view& key);
 
+    // Find the rows in which the given value is a prefix of the text that
+    // follows the key column and the field separator. For example, if the row
+    // is "foo bar -1.00", the values "b", "ba", "bar", "bar ", "bar -1.00" are
+    // valid prefix matches, whereas the value "barr" isn't. This performs a
+    // linear scan since, unlike lookup-by-key, it cannot take advantage of
+    // the fact that the underlying data is sorted by keys.
+    std::vector<std::string> reverseFindRows(const std::string_view& value);
+
 private:
     const char* begin_;
     const char* end_;
diff --git a/src/Engine/ParselessPhraseDBTest.cpp b/src/Engine/ParselessPhraseDBTest.cpp
index 07c0b7d..a116989 100644
--- a/src/Engine/ParselessPhraseDBTest.cpp
+++ b/src/Engine/ParselessPhraseDBTest.cpp
@@ -195,4 +195,30 @@ TEST(ParselessPhraseDBTest, StressTest)
     }
 }
 
+TEST(ParselessPhraseDBTest, LookUpByValue)
+{
+    std::string data = "a 1\nb 1 \nc 2\nd 3"; // four rows; the last one has no trailing newline
+    ParselessPhraseDB db(data.c_str(), data.length());
+
+    std::vector<std::string> rows;
+    rows = db.reverseFindRows("1"); // prefix match, so both "1" and "1 " qualify
+    ASSERT_EQ(rows, (std::vector<std::string> { "a 1", "b 1 " }));
+
+    rows = db.reverseFindRows("2");
+    ASSERT_EQ(rows, (std::vector<std::string> { "c 2" }));
+
+    // Quirk: the comparison may run past the row into its newline, so this matches too.
+    rows = db.reverseFindRows("2\n");
+    ASSERT_EQ(rows, (std::vector<std::string> { "c 2" }));
+
+    rows = db.reverseFindRows("22");
+    ASSERT_TRUE(rows.empty());
+
+    rows = db.reverseFindRows("3\n"); // "d 3" is the last row and no newline follows it
+    ASSERT_TRUE(rows.empty());
+
+    rows = db.reverseFindRows("4");
+    ASSERT_TRUE(rows.empty());
+}
+
 }; // namespace McBopomofo