Skip to content

Commit

Permalink
Implement lt-merge --unmerge
Browse files Browse the repository at this point in the history
cf. HEAD^
  • Loading branch information
unhammer committed Dec 19, 2024
1 parent 57a297e commit a7a5cdb
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 3 deletions.
61 changes: 61 additions & 0 deletions lttoolbox/fst_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2009,6 +2009,67 @@ FSTProcessor::quoteMerge(InputFile& input, UFILE *output)
}


/**
 * Copy the lexical-unit stream from `input` to `output`, "unmerging" every
 * lexical unit that carries a <MERGED> tag.
 *
 * For each unit read by the StreamReader:
 *  - the preceding blank and word-bound blank are written through as read;
 *  - if any reading contains the literal text "<MERGED>", only the last
 *    reading (the surface form) is written, without the surrounding ^...$
 *    and with one level of backslash escaping removed;
 *  - otherwise the unit is written back as ^reading/reading/...$.
 *
 * When the reader reports a null character, a '\0' is written and the output
 * is flushed.
 *
 * @param input  stream to read lexical units from
 * @param output stream that receives the (partially unmerged) result
 */
void
FSTProcessor::quoteUnmerge(InputFile &input, UFILE *output)
{
  StreamReader reader(&input);
  reader.alpha = &alphabet;
  reader.add_unknowns = true;

  while (!reader.at_eof) {
    reader.next();

    // TODO: look up in it.symbols instead (but need to make an alphabet then)
    bool unmerging = false;
    for (const StreamReader::Reading &it : reader.readings) {
      if (it.content.find(u"<MERGED>") != UString::npos) {
        unmerging = true;
        break; // one tagged reading is enough to decide
      }
    }

    write(reader.blank, output);
    write(reader.wblank, output);

    if (unmerging) {
      // Just output the last reading (surface form), removing one level of
      // escaping. There is at least one reading, since the scan above found
      // the tag in one of them.
      const StreamReader::Reading &lastReading = reader.readings.back();
      UString surface;
      surface.reserve(lastReading.content.size());
      bool escaping = false;
      for (const auto c : lastReading.content) {
        if (!escaping && c == u'\\') {
          // Drop this backslash; the next character is kept verbatim
          // (a lone backslash at the very end is therefore dropped).
          escaping = true;
          continue;
        }
        surface += c;
        escaping = false;
      }
      write(surface, output);
    }
    else if (!reader.readings.empty()) {
      // NB. ^$ will produce a readings vector of length 1 where the single
      // item is empty. EOF should give length 0.
      // (We *want* to keep ^$ in stream, but not print extra ^$ when there
      // was no ^$)
      u_fputc('^', output);
      bool seen_reading = false;
      for (const StreamReader::Reading &it : reader.readings) {
        if (seen_reading) {
          u_fputc('/', output);
        }
        write(it.content, output);
        seen_reading = true;
      }
      u_fputc('$', output);
    }

    if (reader.at_null) {
      u_fputc('\0', output);
      u_fflush(output);
    }
  }
}

bool
FSTProcessor::valid() const
{
Expand Down
1 change: 1 addition & 0 deletions lttoolbox/fst_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,7 @@ class FSTProcessor
UString biltransfull(UStringView input_word, bool with_delim = true);
void bilingual(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown);
void quoteMerge(InputFile& input, UFILE *output);
// Copies lexical units from input to output; a unit with a reading containing
// "<MERGED>" is replaced by its last reading with one level of backslash
// escaping removed, all other units are written back unchanged.
void quoteUnmerge(InputFile& input, UFILE *output);
std::pair<UString, int> biltransWithQueue(UStringView input_word, bool with_delim = true);
UString biltransWithoutQueue(UStringView input_word, bool with_delim = true);
void SAO(InputFile& input, UFILE *output);
Expand Down
7 changes: 6 additions & 1 deletion lttoolbox/lt_merge.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,12 @@ int main(int argc, char *argv[])
FSTProcessor fstp;
fstp.setNullFlush(true); // cf. description of cli["null-flush"]
fstp.initBiltrans();
fstp.quoteMerge(input, output);
if(unmerge) {
fstp.quoteUnmerge(input, output);
}
else {
fstp.quoteMerge(input, output);
}

return 0;
}
14 changes: 14 additions & 0 deletions tests/lt_merge/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,17 @@ class WordblankTest(MergeTest):
# Using r'' to avoid doubling escapes even more:
inputs = [r'^«/«<lquot><MERGE_BEG>$[[tf:i:a]]^ve\/ldig/v<adv>$[[/]]^»/»<rquot><MERGE_END>$']
expectedOutputs = [r'^«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»/«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»<MERGED>$']


# Runs the tool with --unmerge on a stream of three lexical units.
# The middle unit has a reading tagged <MERGED>; the expected output replaces
# it with its last reading («Se og Hør») without the surrounding ^...$, while
# the untagged units before and after it are expected to come out unchanged.
class SimpleUnmergeTest(MergeTest):
procflags = ['--unmerge']
# Using r'' to avoid doubling escapes even more:
inputs = [r'^ikkje<adv>/ikkje$ ^«Se og Hør»<MERGED>/«Se og Hør»$ ^då<adv>/då$']
expectedOutputs = [r'^ikkje<adv>/ikkje$ «Se og Hør» ^då<adv>/då$']


# Runs the tool with --unmerge on a <MERGED> unit whose surface form contains
# backslash-escaped characters. The expected output has exactly one level of
# escaping removed: \[\[ becomes [[, \/ becomes /, and \\\^ becomes \^.
class EscapedUnmergeTest(MergeTest):
procflags = ['--unmerge']
# Using r'' to avoid doubling escapes even more:
inputs = [r'^ikkje<adv>/ikkje$ ^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»<MERGED>/«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»$']
expectedOutputs = [r'^ikkje<adv>/ikkje$ «[[tf:i:a]]s\^å[[/]]»']
2 changes: 0 additions & 2 deletions tests/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
'lt_paradigm', 'lt_expand', 'lt_apply_acx', 'lt_compose',
'lt_tmxproc', 'lt_merge']

# modules = ['lt_merge']


if __name__ == "__main__":
os.chdir(os.path.dirname(__file__))
Expand Down

0 comments on commit a7a5cdb

Please sign in to comment.