Skip to content

Commit

Permalink
Bring in lookup-by-value introduced in the Mac version
Browse files Browse the repository at this point in the history
  • Loading branch information
lukhnos committed Oct 4, 2023
1 parent 81ca619 commit b1ef71e
Show file tree
Hide file tree
Showing 8 changed files with 165 additions and 1 deletion.
16 changes: 16 additions & 0 deletions src/Engine/McBopomofoLM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@

#include "McBopomofoLM.h"
#include <algorithm>
#include <float.h>
#include <iterator>
#include <limits>

namespace McBopomofo {

Expand Down Expand Up @@ -135,6 +137,20 @@ bool McBopomofoLM::hasUnigrams(const std::string& key)
return !getUnigrams(key).empty();
}

std::string McBopomofoLM::getReading(const std::string& value)
{
std::vector<ParselessLM::FoundReading> foundReadings = m_languageModel.getReadings(value);
double topScore = std::numeric_limits<double>::lowest();
std::string topValue;
for (const auto& foundReading : foundReadings) {
if (foundReading.score > topScore) {
topValue = foundReading.reading;
topScore = foundReading.score;
}
}
return topValue;
}

void McBopomofoLM::setPhraseReplacementEnabled(bool enabled)
{
m_phraseReplacementEnabled = enabled;
Expand Down
3 changes: 3 additions & 0 deletions src/Engine/McBopomofoLM.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ class McBopomofoLM : public Formosa::Gramambular2::LanguageModel {
const std::vector<std::string> associatedPhrasesForKey(const std::string& key);
bool hasAssociatedPhrasesForKey(const std::string& key);

/// Returns the top-scored reading from the base model, given the value.
/// Returns an empty string if the base model has no reading for the value.
std::string getReading(const std::string& value);

protected:
/// Filters and converts the input unigrams and return a new list of unigrams.
///
Expand Down
51 changes: 51 additions & 0 deletions src/Engine/ParselessLM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,54 @@ bool McBopomofo::ParselessLM::hasUnigrams(const std::string& key)

return db_->findFirstMatchingLine(key + " ") != nullptr;
}

std::vector<McBopomofo::ParselessLM::FoundReading> McBopomofo::ParselessLM::getReadings(const std::string& value)
{
if (db_ == nullptr) {
return std::vector<McBopomofo::ParselessLM::FoundReading>();
}

std::vector<McBopomofo::ParselessLM::FoundReading> results;

// We append a space so that we only find rows with the exact value. We
// are taking advantage of the fact that a well-form row in this LM must
// be in the format of "key value score".
std::string actualValue = value + " ";

for (const auto& row : db_->reverseFindRows(actualValue)) {
std::string key;
double score = 0;

// Move ahead until we encounter the first space. This is the key.
auto it = row.begin();
while (it != row.end() && *it != ' ') {
++it;
}

key = std::string(row.begin(), it);

// Read past the space.
if (it != row.end()) {
++it;
}

if (it != row.end()) {
// Now it is the start of the value portion, but we move ahead
// until we encounter the second space to skip this part.
while (it != row.end() && *it != ' ') {
++it;
}
}

// Read past the space. The remainder, if it exists, is the score.
if (it != row.end()) {
++it;
}

if (it != row.end()) {
score = std::stod(std::string(it, row.end()));
}
results.emplace_back(McBopomofo::ParselessLM::FoundReading { key, score });
}
return results;
}
7 changes: 7 additions & 0 deletions src/Engine/ParselessLM.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ class ParselessLM : public Formosa::Gramambular2::LanguageModel {
const std::string& key) override;
bool hasUnigrams(const std::string& key) override;

// A reading found when looking up by value, together with its score.
struct FoundReading {
    // The reading, i.e. the key column of the matching row.
    std::string reading;
    // The score of the matching row. Initialized so that a default-constructed
    // FoundReading never carries an indeterminate value.
    double score = 0;
};
// Looks up readings by value. This is specific to ParselessLM only.
// Returns one FoundReading (reading plus score) per matching row; the
// result is empty if no row matches.
std::vector<FoundReading> getReadings(const std::string& value);

private:
int fd_ = -1;
void* data_ = nullptr;
Expand Down
14 changes: 14 additions & 0 deletions src/Engine/ParselessLMTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,20 @@ TEST(ParselessLMTest, SanityCheckTest)
unigrams = lm.getUnigrams("_punctuation_list");
ASSERT_GT(unigrams.size(), 0);

std::vector<ParselessLM::FoundReading> found_readings;
found_readings = lm.getReadings("不存在的詞");
ASSERT_TRUE(found_readings.empty());

found_readings = lm.getReadings("讀音");
ASSERT_EQ(found_readings.size(), 1);

found_readings = lm.getReadings("鑰匙");
ASSERT_GT(found_readings.size(), 1);

found_readings = lm.getReadings("");
ASSERT_GT(found_readings.size(), 1);
ASSERT_EQ(found_readings[0].reading, "ㄉㄜˊ");

lm.close();
}

Expand Down
40 changes: 40 additions & 0 deletions src/Engine/ParselessPhraseDB.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,44 @@ const char* ParselessPhraseDB::findFirstMatchingLine(
return nullptr;
}

// Returns every record whose text right after the key and the field
// separator starts with `value`. Each returned string is the whole record,
// without the terminating newline.
//
// This is a linear scan over [begin_, end_). The comparison is done on the
// raw bytes following the separator, so a value containing '\n' can match
// across the end of a record (e.g. "2\n" matches the record "c 2" when another
// line follows it).
//
// A line that has no field separator has no value column and never matches.
std::vector<std::string> ParselessPhraseDB::reverseFindRows(
    const std::string_view& value)
{
    std::vector<std::string> rows;

    const char* recordBegin = begin_;

    while (recordBegin < end_) {
        const char* ptr = recordBegin;

        // Skip over the key to find the field separator. Also stop at the end
        // of the line so that a line lacking a separator is not merged with
        // the line that follows it.
        while (ptr < end_ && *ptr != ' ' && *ptr != '\n') {
            ++ptr;
        }
        const bool hasSeparator = ptr < end_ && *ptr == ' ';

        // Skip over the field separator. There should be just one, but loop
        // just in case.
        while (ptr < end_ && *ptr == ' ') {
            ++ptr;
        }

        // Now walk to the end of this record.
        const char* recordEnd = ptr;
        while (recordEnd < end_ && *recordEnd != '\n') {
            ++recordEnd;
        }

        // Compare lengths instead of forming ptr + value.length(): this never
        // creates a pointer beyond end_, and it allows a value that ends
        // exactly at end_ (the last record of data without a trailing newline).
        const std::size_t remaining = static_cast<std::size_t>(end_ - ptr);
        const bool fits = value.length() <= remaining;

        // An empty value is a prefix of anything; skipping memcmp in that case
        // avoids passing a possibly-null data() pointer to it.
        if (hasSeparator && fits
            && (value.empty() || memcmp(ptr, value.data(), value.length()) == 0)) {
            // Prefix match, add entire record to return value.
            rows.emplace_back(recordBegin, static_cast<std::size_t>(recordEnd - recordBegin));
        }

        // Skip over to the next line start.
        recordBegin = recordEnd;
        while (recordBegin < end_ && *recordBegin == '\n') {
            ++recordBegin;
        }
    }

    return rows;
}

}; // namespace McBopomofo
9 changes: 8 additions & 1 deletion src/Engine/ParselessPhraseDB.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
#define SOURCE_ENGINE_PARSELESSPHRASEDB_H_

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

Expand All @@ -51,6 +50,14 @@ class ParselessPhraseDB {

const char* findFirstMatchingLine(const std::string_view& key);

// Finds the rows whose text past the key column plus the field separator
// starts with the given value (a prefix match). For example, if the row is
// "foo bar -1.00", the values "b", "ba", "bar", "bar ", and "bar -1.00"
// are valid prefix matches, whereas the value "barr" isn't. Each matching
// row is returned in full. This performs a linear scan since, unlike
// lookup-by-key, it cannot take advantage of the fact that the underlying
// data is sorted by keys.
std::vector<std::string> reverseFindRows(const std::string_view& value);

private:
const char* begin_;
const char* end_;
Expand Down
26 changes: 26 additions & 0 deletions src/Engine/ParselessPhraseDBTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,4 +195,30 @@ TEST(ParselessPhraseDBTest, StressTest)
}
}

// Exercises lookup-by-value on a tiny four-record database whose last record
// has no trailing newline.
TEST(ParselessPhraseDBTest, LookUpByValue)
{
    using Rows = std::vector<std::string>;

    const std::string data = "a 1\nb 1 \nc 2\nd 3";
    ParselessPhraseDB db(data.c_str(), data.length());

    // Two records share the value "1"; both come back, in file order.
    ASSERT_EQ(db.reverseFindRows("1"), (Rows { "a 1", "b 1 " }));

    ASSERT_EQ(db.reverseFindRows("2"), (Rows { "c 2" }));

    // The value is matched against the raw bytes, so it may include the
    // newline that terminates the record. This is a quirk, but it is valid.
    ASSERT_EQ(db.reverseFindRows("2\n"), (Rows { "c 2" }));

    // A value longer than what is stored is not a prefix match.
    ASSERT_TRUE(db.reverseFindRows("22").empty());

    // The data ends right after "3", so there is no newline to match.
    ASSERT_TRUE(db.reverseFindRows("3\n").empty());

    // A value that occurs nowhere.
    ASSERT_TRUE(db.reverseFindRows("4").empty());
}

}; // namespace McBopomofo

0 comments on commit b1ef71e

Please sign in to comment.