Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor lookup-by-value, fix a bug, and add unit tests #374

Merged
merged 2 commits into from
Oct 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 11 additions & 27 deletions Source/Engine/McBopomofoLM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@

#include "McBopomofoLM.h"
#include <algorithm>
#include <iterator>
#include <float.h>
#include <iterator>
#include <limits>

namespace McBopomofo {

Expand Down Expand Up @@ -136,24 +137,18 @@ bool McBopomofoLM::hasUnigrams(const std::string& key)
return getUnigrams(key).size() > 0;
}

std::string McBopomofoLM::getReading(const std::string_view& value)
std::string McBopomofoLM::getReading(const std::string& value)
{
std::vector<std::string> records = m_languageModel.getReadings(value);

double highScore = -DBL_MAX;
std::string highScoringValue;
for (std::string record : records) {
std::vector<std::string_view> parts = split(record, ' ');
if (parts.size() == 3) {
double score = std::stod(std::string(parts[2]));
if (score > highScore) {
highScoringValue = std::string(parts[0]);
highScore = score;
}
std::vector<ParselessLM::FoundReading> foundReadings = m_languageModel.getReadings(value);
double topScore = std::numeric_limits<double>::lowest();
std::string topValue;
for (const auto& foundReading : foundReadings) {
if (foundReading.score > topScore) {
topValue = foundReading.reading;
topScore = foundReading.score;
}
}

return highScoringValue;
return topValue;
}

void McBopomofoLM::setPhraseReplacementEnabled(bool enabled)
Expand Down Expand Up @@ -222,15 +217,4 @@ bool McBopomofoLM::hasAssociatedPhrasesForKey(const std::string& key)
return m_associatedPhrases.hasValuesForKey(key);
}

/// Breaks `str` into the pieces that lie between occurrences of `delim`.
/// Runs of consecutive delimiters, as well as leading and trailing ones,
/// produce no empty pieces. The returned views point into `str`'s underlying
/// storage and do not own any characters.
std::vector<std::string_view> McBopomofoLM::split(const std::string_view& str, char delim) {
    std::vector<std::string_view> tokens;
    size_t cursor = 0;
    for (;;) {
        // Skip any delimiters in front of the next token.
        const size_t tokenBegin = str.find_first_not_of(delim, cursor);
        if (tokenBegin == std::string_view::npos) {
            break;
        }
        // `cursor` becomes npos when the token runs to the end of `str`;
        // substr() clamps the resulting length to what is available.
        cursor = str.find(delim, tokenBegin);
        tokens.emplace_back(str.substr(tokenBegin, cursor - tokenBegin));
    }
    return tokens;
}

} // namespace McBopomofo
13 changes: 2 additions & 11 deletions Source/Engine/McBopomofoLM.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,8 @@ class McBopomofoLM : public Formosa::Gramambular2::LanguageModel {
const std::vector<std::string> associatedPhrasesForKey(const std::string& key);
bool hasAssociatedPhrasesForKey(const std::string& key);

/// Returns a list of readings that match a given value.
/// @param value A string representing the text to look up reading candidates for. For example,
/// if you pass "說", it returns a list of records that include ㄕㄨㄛ, ㄕㄨㄟˋ, and ㄩㄝˋ.
/// @return Best reading found for the string, or an empty string if no matches are found.
std::string getReading(const std::string_view& value);
/// Returns the top-scored reading from the base model, given the value.
std::string getReading(const std::string& value);

protected:
/// Filters and converts the input unigrams and return a new list of unigrams.
Expand All @@ -123,12 +120,6 @@ class McBopomofoLM : public Formosa::Gramambular2::LanguageModel {
const std::unordered_set<std::string>& excludedValues,
std::unordered_set<std::string>& insertedValues);

/// Splits a string into parts
/// @param str The string to split.
/// @param delim Delimiter character in the string to split on.
/// @return vector of split-up strings
std::vector<std::string_view> split(const std::string_view& str, char delim);

ParselessLM m_languageModel;
UserPhrasesLM m_userPhrases;
UserPhrasesLM m_excludedPhrases;
Expand Down
50 changes: 46 additions & 4 deletions Source/Engine/ParselessLM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,11 +145,53 @@ bool McBopomofo::ParselessLM::hasUnigrams(const std::string& key)
return db_->findFirstMatchingLine(key + " ") != nullptr;
}

std::vector<std::string> McBopomofo::ParselessLM::getReadings(const std::string_view& value)
std::vector<McBopomofo::ParselessLM::FoundReading> McBopomofo::ParselessLM::getReadings(const std::string& value)
{
if (db_ == nullptr) {
return std::vector<std::string>();
return std::vector<McBopomofo::ParselessLM::FoundReading>();
}

return db_->reverseFindRows(value);

std::vector<McBopomofo::ParselessLM::FoundReading> results;

// We append a space so that we only find rows with the exact value. We
// are taking advantage of the fact that a well-formed row in this LM must
// be in the format of "key value score".
std::string actualValue = value + " ";

for (const auto& row : db_->reverseFindRows(actualValue)) {
std::string key;
double score = 0;

// Move ahead until we encounter the first space. This is the key.
auto it = row.begin();
while (it != row.end() && *it != ' ') {
++it;
}

key = std::string(row.begin(), it);

// Read past the space.
if (it != row.end()) {
++it;
}

if (it != row.end()) {
// Now it is the start of the value portion, but we move ahead
// until we encounter the second space to skip this part.
while (it != row.end() && *it != ' ') {
++it;
}
}

// Read past the space. The remainder, if it exists, is the score.
if (it != row.end()) {
++it;
}

if (it != row.end()) {
score = std::stod(std::string(it, row.end()));
}
results.emplace_back(McBopomofo::ParselessLM::FoundReading { key, score });
}
return results;
}
7 changes: 6 additions & 1 deletion Source/Engine/ParselessLM.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,12 @@ class ParselessLM : public Formosa::Gramambular2::LanguageModel {
const std::string& key) override;
bool hasUnigrams(const std::string& key) override;

std::vector<std::string> getReadings(const std::string_view& value);
// One result of a lookup-by-value: a reading together with its score.
struct FoundReading {
// The reading text.
std::string reading;
// The score that accompanies the reading.
double score;
};
// Look up reading by value. This is specific to ParselessLM only.
std::vector<FoundReading> getReadings(const std::string& value);

private:
int fd_ = -1;
Expand Down
14 changes: 14 additions & 0 deletions Source/Engine/ParselessLMTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,20 @@ TEST(ParselessLMTest, SanityCheckTest)
unigrams = lm.getUnigrams("_punctuation_list");
ASSERT_GT(unigrams.size(), 0);

std::vector<ParselessLM::FoundReading> found_readings;
found_readings = lm.getReadings("不存在的詞");
ASSERT_TRUE(found_readings.empty());

found_readings = lm.getReadings("讀音");
ASSERT_EQ(found_readings.size(), 1);

found_readings = lm.getReadings("鑰匙");
ASSERT_GT(found_readings.size(), 1);

found_readings = lm.getReadings("得");
ASSERT_GT(found_readings.size(), 1);
ASSERT_EQ(found_readings[0].reading, "ㄉㄜˊ");

lm.close();
}

Expand Down
10 changes: 5 additions & 5 deletions Source/Engine/ParselessPhraseDB.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ std::vector<std::string> ParselessPhraseDB::reverseFindRows(

while (recordBegin < end_) {
const char* ptr = recordBegin;

// skip over the key to find the field separator
while (ptr < end_ && *ptr != ' ') {
++ptr;
Expand All @@ -181,19 +181,19 @@ std::vector<std::string> ParselessPhraseDB::reverseFindRows(
while (ptr < end_ && *ptr == ' ') {
++ptr;
}

// now walk to the end of this record
const char* recordEnd = ptr;
while (recordEnd < end_ && *recordEnd != '\n') {
++recordEnd;
}

if (ptr + value.length() < end_ && memcmp(ptr, value.data(), value.length()) == 0) {
// prefix match, add entire record to return value
rows.emplace_back(recordBegin, recordEnd - recordBegin);
}
// skip over the record separator. there should be just one, but loop just in case.

// skip over to the next line start
recordBegin = recordEnd;
while (recordBegin < end_ && *recordBegin == '\n') {
++recordBegin;
Expand Down
9 changes: 6 additions & 3 deletions Source/Engine/ParselessPhraseDB.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,12 @@ class ParselessPhraseDB {

const char* findFirstMatchingLine(const std::string_view& key);

// Find the rows that (prefix-)match the value, useful for returning all the
// ways a phrase or character can be pronounced. Note that this is a potentially-
// slow linear search that cannot take advantage of the pre-sorting.
// Find the rows in which the given value is a prefix of the text that
// follows the key column and the field separator. For example, if the row
// is "foo bar -1.00", the values "b", "ba", "bar", "bar ", "bar -1.00" are
// valid prefix matches, whereas the value "barr" isn't. This performs a
// linear scan since, unlike lookup-by-key, it cannot take advantage of the
// fact that the underlying data is sorted by keys.
std::vector<std::string> reverseFindRows(const std::string_view& value);

private:
Expand Down
26 changes: 26 additions & 0 deletions Source/Engine/ParselessPhraseDBTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,4 +195,30 @@ TEST(ParselessPhraseDBTest, StressTest)
}
}

// Exercises reverseFindRows() on a small in-memory database of four
// "key value" rows. The second row carries a trailing space, and the last
// row is not followed by a newline.
TEST(ParselessPhraseDBTest, LookUpByValue)
{
std::string data = "a 1\nb 1 \nc 2\nd 3";
ParselessPhraseDB db(data.c_str(), data.length());

std::vector<std::string> rows;
// Two rows have the value "1"; each is returned whole, including the
// trailing space of the second row.
rows = db.reverseFindRows("1");
ASSERT_EQ(rows, (std::vector<std::string> { "a 1", "b 1 " }));

// Exactly one row has the value "2".
rows = db.reverseFindRows("2");
ASSERT_EQ(rows, (std::vector<std::string> { "c 2" }));

// This is a quirk of the function, but is actually valid: a lookup value
// that includes the newline following the row still matches, and the
// returned row does not contain that newline.
rows = db.reverseFindRows("2\n");
ASSERT_EQ(rows, (std::vector<std::string> { "c 2" }));

// "22" is longer than any stored value, so nothing matches.
rows = db.reverseFindRows("22");
ASSERT_TRUE(rows.empty());

// The last row has no newline after "3" in the data, so "3\n" matches
// nothing.
rows = db.reverseFindRows("3\n");
ASSERT_TRUE(rows.empty());

// No row has the value "4".
rows = db.reverseFindRows("4");
ASSERT_TRUE(rows.empty());
}

}; // namespace McBopomofo
2 changes: 1 addition & 1 deletion Source/LanguageModelManager.mm
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ + (nullable NSString *)readingFor:(NSString *)phrase {
}

std::string reading = gLanguageModelMcBopomofo.getReading(phrase.UTF8String);
return !reading.empty() ? [NSString stringWithCString:reading.c_str() encoding:NSUTF8StringEncoding] : nil;
return !reading.empty() ? [NSString stringWithUTF8String:reading.c_str()] : nil;
}

@end