Skip to content

Commit

Permalink
$ echo '^ikke/ikke<adv>$ ^«/«<lquot><MERGE_BEG>$^så/så<adv>$ ^ve\[dig…
Browse files Browse the repository at this point in the history
…/v<adv>$^»/»<rquot><MERGE_END>$ ^bra/bra<adj>$' | lttoolbox/lt-proc -b ../apertium-nno-nob/nob-nno.autobil.bin 2>/dev/null

^ikke/ikke<adv>$ ^«så ve\[dig»/quote<MERGED>$ ^bra/bra<adj>$

$ echo '^ja/j<ij>$ ^«/«<lquot><START>$[[tf:i:a]]^ve\/ldig/v<adv>$[[/]]^»/»<rquot><END>$' | lttoolbox/lt-proc  -b ../apertium-nno-nob/nob-nno.autobil.bin 2>/dev/null
^ja/j<ij>$ ^«/«<lquot><START>$[[tf:i:a]]^ve\/ldig/v<adv>$[[/]]^»/»<rquot><END>$

but we'll need an unmerge-command that outputs the surface form – and
this should also be protected from the generator and biltrans (we
don't want them to require regex matches for whatever).

Would it make sense to have a special hidden tag for "just pass
through the surface form"? That would be helpful for other things as
well, e.g. merge-names
  • Loading branch information
unhammer committed Nov 25, 2024
1 parent f771c90 commit 7521997
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 1 deletion.
81 changes: 81 additions & 0 deletions lttoolbox/fst_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1936,6 +1936,87 @@ FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim)
return compose(result, ""_u, with_delim, mark);
}

/**
 * Collapse every run of lexical units delimited by a reading containing
 * <MERGE_BEG> and a reading containing <MERGE_END> into a single lexical
 * unit of the form ^SURFACE/quote<MERGED>$, where SURFACE is the
 * concatenation of the first reading of each merged unit together with the
 * blanks between them. Everything outside such a run is copied through
 * (blank, word-bound blank, then ^reading/reading/...$).
 *
 * A null flush closes any merge that is in progress and is then forwarded to
 * the output, which is flushed. A merge that is still open at end of input
 * is emitted rather than dropped.
 *
 * @param input  stream of lexical units to read
 * @param output stream the (partly merged) lexical units are written to
 */
void
FSTProcessor::merge_quote(InputFile& input, UFILE *output)
{
  StreamReader reader(&input);
  reader.alpha = &alphabet;
  reader.add_unknowns = true;

  bool merging = false;
  UString surface;

  // Emit what has been accumulated so far as one merged LU and reset state.
  auto flush_merged = [&]() {
    u_fputc('^', output);
    write(surface, output);
    write("/quote<MERGED>$"_u, output);
    surface.clear();
    merging = false;
  };

  while (!reader.at_eof) {
    reader.next();

    bool end_merging = false;
    for (const StreamReader::Reading &it : reader.readings) {
      // TODO: look up in it.symbols instead (but need to make an alphabet then)
      if (it.content.find(u"<MERGE_BEG>") != UString::npos) {
        merging = true;
      }
      if (it.content.find(u"<MERGE_END>") != UString::npos) {
        end_merging = true;
      }
    }

    if (merging) {
      if (!surface.empty()) {
        // Blanks inside the merged span become part of the merged surface;
        // word-bound blank characters are escaped so they stay literal
        // inside the lexical unit.
        surface += reader.blank;
        for (const auto &c : reader.wblank) {
          if (escaped_chars.find(c) != escaped_chars.end()) {
            surface += u'\\';
          }
          surface += c;
        }
      }
      else {
        // The initial blank should just be output before the merged LU:
        write(reader.blank, output);
        write(reader.wblank, output);
      }
      if (!reader.readings.empty()) {
        // Only the first reading contributes to the merged surface
        // (presumably the surface form of the LU -- confirm against callers).
        surface += reader.readings[0].content;
      }
    }
    else {
      write(reader.blank, output);
      write(reader.wblank, output);
      if (!reader.readings.empty()) {
        // TODO: How to differentiate EOF from a lexical unit with no readings like ^$ ?
        // We *want* to keep ^$ in stream, but not print extra ^$ when there was no ^$
        u_fputc('^', output);
        bool seen_reading = false;
        for (const StreamReader::Reading &it : reader.readings) {
          if (seen_reading) {
            u_fputc('/', output);
          }
          write(it.content, output);
          seen_reading = true;
        }
        u_fputc('$', output);
      }
    }

    // Only close a merge that is actually open: a null flush or a stray
    // <MERGE_END> outside a merge must not produce an empty merged LU.
    if (merging && (end_merging || reader.at_null)) {
      flush_merged();
    }
    if (reader.at_null) {
      u_fputc('\0', output);
      u_fflush(output);
    }
  }

  // Input ended inside an unterminated merge: emit what was collected
  // instead of silently losing it.
  if (merging) {
    flush_merged();
  }
}


bool
FSTProcessor::valid() const
Expand Down
1 change: 1 addition & 0 deletions lttoolbox/fst_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,7 @@ class FSTProcessor
UString biltrans(UStringView input_word, bool with_delim = true);
UString biltransfull(UStringView input_word, bool with_delim = true);
void bilingual(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown);
void merge_quote(InputFile& input, UFILE *output);
std::pair<UString, int> biltransWithQueue(UStringView input_word, bool with_delim = true);
UString biltransWithoutQueue(UStringView input_word, bool with_delim = true);
void SAO(InputFile& input, UFILE *output);
Expand Down
3 changes: 2 additions & 1 deletion lttoolbox/lt_proc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,8 @@ int main(int argc, char *argv[])
case 'b':
fstp.initBiltrans();
checkValidity(fstp);
fstp.bilingual(input, output, bilmode);
// fstp.bilingual(input, output, bilmode);
fstp.merge_quote(input, output); // TODO: separate mode, is there use for an fst or should it just be a new command lt-merge that takes no fst?
break;

case 'e':
Expand Down

0 comments on commit 7521997

Please sign in to comment.