Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bring in lookup-by-value introduced in the Mac version #86

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions src/Engine/McBopomofoLM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@

#include "McBopomofoLM.h"
#include <algorithm>
#include <float.h>
#include <iterator>
#include <limits>

namespace McBopomofo {

Expand Down Expand Up @@ -135,6 +137,20 @@ bool McBopomofoLM::hasUnigrams(const std::string& key)
return !getUnigrams(key).empty();
}

std::string McBopomofoLM::getReading(const std::string& value)
{
std::vector<ParselessLM::FoundReading> foundReadings = m_languageModel.getReadings(value);
double topScore = std::numeric_limits<double>::lowest();
std::string topValue;
for (const auto& foundReading : foundReadings) {
if (foundReading.score > topScore) {
topValue = foundReading.reading;
topScore = foundReading.score;
}
}
return topValue;
}

void McBopomofoLM::setPhraseReplacementEnabled(bool enabled)
{
m_phraseReplacementEnabled = enabled;
Expand Down
3 changes: 3 additions & 0 deletions src/Engine/McBopomofoLM.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ class McBopomofoLM : public Formosa::Gramambular2::LanguageModel {
const std::vector<std::string> associatedPhrasesForKey(const std::string& key);
bool hasAssociatedPhrasesForKey(const std::string& key);

/// Returns the top-scored reading from the base model, given the value.
/// If several readings share the top score, the first one found is returned.
/// Returns an empty string when the base model has no reading for the value.
std::string getReading(const std::string& value);

protected:
/// Filters and converts the input unigrams and return a new list of unigrams.
///
Expand Down
51 changes: 51 additions & 0 deletions src/Engine/ParselessLM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,54 @@ bool McBopomofo::ParselessLM::hasUnigrams(const std::string& key)

return db_->findFirstMatchingLine(key + " ") != nullptr;
}

std::vector<McBopomofo::ParselessLM::FoundReading> McBopomofo::ParselessLM::getReadings(const std::string& value)
{
if (db_ == nullptr) {
return std::vector<McBopomofo::ParselessLM::FoundReading>();
}

std::vector<McBopomofo::ParselessLM::FoundReading> results;

// We append a space so that we only find rows with the exact value. We
// are taking advantage of the fact that a well-form row in this LM must
// be in the format of "key value score".
std::string actualValue = value + " ";

for (const auto& row : db_->reverseFindRows(actualValue)) {
std::string key;
double score = 0;

// Move ahead until we encounter the first space. This is the key.
auto it = row.begin();
while (it != row.end() && *it != ' ') {
++it;
}

key = std::string(row.begin(), it);

// Read past the space.
if (it != row.end()) {
++it;
}

if (it != row.end()) {
// Now it is the start of the value portion, but we move ahead
// until we encounter the second space to skip this part.
while (it != row.end() && *it != ' ') {
++it;
}
}

// Read past the space. The remainder, if it exists, is the score.
if (it != row.end()) {
++it;
}

if (it != row.end()) {
score = std::stod(std::string(it, row.end()));
}
results.emplace_back(McBopomofo::ParselessLM::FoundReading { key, score });
}
return results;
}
7 changes: 7 additions & 0 deletions src/Engine/ParselessLM.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ class ParselessLM : public Formosa::Gramambular2::LanguageModel {
const std::string& key) override;
bool hasUnigrams(const std::string& key) override;

// One result of a lookup-by-value: a reading together with its score.
struct FoundReading {
// The reading, taken from the key column of the matching row.
std::string reading;
// The score parsed from the matching row; 0 if the row has no score column.
double score;
};
// Look up reading by value. This is specific to ParselessLM only.
// Returns one entry per matching row, or an empty vector if nothing matches
// or there is no underlying database (db_ is null).
std::vector<FoundReading> getReadings(const std::string& value);

private:
int fd_ = -1;
void* data_ = nullptr;
Expand Down
14 changes: 14 additions & 0 deletions src/Engine/ParselessLMTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,20 @@ TEST(ParselessLMTest, SanityCheckTest)
unigrams = lm.getUnigrams("_punctuation_list");
ASSERT_GT(unigrams.size(), 0);

std::vector<ParselessLM::FoundReading> found_readings;
found_readings = lm.getReadings("不存在的詞");
ASSERT_TRUE(found_readings.empty());

found_readings = lm.getReadings("讀音");
ASSERT_EQ(found_readings.size(), 1);

found_readings = lm.getReadings("鑰匙");
ASSERT_GT(found_readings.size(), 1);

found_readings = lm.getReadings("得");
ASSERT_GT(found_readings.size(), 1);
ASSERT_EQ(found_readings[0].reading, "ㄉㄜˊ");

lm.close();
}

Expand Down
40 changes: 40 additions & 0 deletions src/Engine/ParselessPhraseDB.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,44 @@ const char* ParselessPhraseDB::findFirstMatchingLine(
return nullptr;
}

/// Linearly scans the data and returns every record whose text after the key
/// column and the field separator starts with `value`.
///
/// Each returned string is the whole record, from the start of its key up to
/// (but not including) the terminating newline. The comparison is made
/// against the raw buffer and is not clipped to the end of the record, so a
/// `value` that contains a newline can match across a record boundary.
///
/// NOTE(review): the key scan stops only at a space, not at a newline, so a
/// line without any space is folded into the following record. Confirm that
/// the data never contains such lines.
std::vector<std::string> ParselessPhraseDB::reverseFindRows(
    const std::string_view& value)
{
    std::vector<std::string> rows;

    const char* recordBegin = begin_;

    while (recordBegin < end_) {
        const char* ptr = recordBegin;

        // skip over the key to find the field separator
        while (ptr < end_ && *ptr != ' ') {
            ++ptr;
        }
        // skip over the field separator. there should be just one, but loop just in case.
        while (ptr < end_ && *ptr == ' ') {
            ++ptr;
        }

        // now walk to the end of this record
        const char* recordEnd = ptr;
        while (recordEnd < end_ && *recordEnd != '\n') {
            ++recordEnd;
        }

        // Compare the value length against the bytes actually left in the
        // buffer. The check is inclusive so that a value ending exactly at
        // end_ still matches (e.g. the last record of data that has no
        // trailing newline). Working with the remaining size, rather than
        // with `ptr + value.length()`, also avoids forming a pointer beyond
        // the end of the buffer for long values.
        const std::size_t remaining = static_cast<std::size_t>(end_ - ptr);
        if (value.length() <= remaining && memcmp(ptr, value.data(), value.length()) == 0) {
            // prefix match, add entire record to return value
            rows.emplace_back(recordBegin, recordEnd - recordBegin);
        }

        // skip over to the next line start
        recordBegin = recordEnd;
        while (recordBegin < end_ && *recordBegin == '\n') {
            ++recordBegin;
        }
    }

    return rows;
}

}; // namespace McBopomofo
9 changes: 8 additions & 1 deletion src/Engine/ParselessPhraseDB.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
#define SOURCE_ENGINE_PARSELESSPHRASEDB_H_

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

Expand All @@ -51,6 +50,14 @@ class ParselessPhraseDB {

const char* findFirstMatchingLine(const std::string_view& key);

// Find the rows whose text past the key column plus the field separator
// starts with the given value. For example, if the row is
// "foo bar -1.00", the values "b", "ba", "bar", "bar ", "bar -1.00" are
// valid prefix matches, whereas the value "barr" isn't. Each match is
// returned as the whole row, without its trailing newline. This
// performs a linear scan since, unlike lookup-by-key, it cannot take
// advantage of the fact that the underlying data is sorted by keys.
std::vector<std::string> reverseFindRows(const std::string_view& value);

private:
const char* begin_;
const char* end_;
Expand Down
26 changes: 26 additions & 0 deletions src/Engine/ParselessPhraseDBTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,4 +195,30 @@ TEST(ParselessPhraseDBTest, StressTest)
}
}

// Exercises reverseFindRows() on a tiny in-memory database.
TEST(ParselessPhraseDBTest, LookUpByValue)
{
    using Rows = std::vector<std::string>;

    // Four records: the second one has a trailing space, and the last one is
    // not terminated by a newline.
    const std::string data = "a 1\nb 1 \nc 2\nd 3";
    ParselessPhraseDB db(data.c_str(), data.length());

    // Rows are returned whole, in the order they appear in the data.
    ASSERT_EQ(db.reverseFindRows("1"), (Rows { "a 1", "b 1 " }));
    ASSERT_EQ(db.reverseFindRows("2"), (Rows { "c 2" }));

    // This is a quirk of the function, but is actually valid.
    ASSERT_EQ(db.reverseFindRows("2\n"), (Rows { "c 2" }));

    // A value longer than the stored one is not a prefix of it.
    ASSERT_TRUE(db.reverseFindRows("22").empty());

    // No newline follows the last record, so this cannot match it.
    ASSERT_TRUE(db.reverseFindRows("3\n").empty());

    // A value that does not occur at all.
    ASSERT_TRUE(db.reverseFindRows("4").empty());
}

}; // namespace McBopomofo
Loading