Skip to content

Commit

Permalink
new lt-merge command to merge LU's from BEG to END tag
Browse files Browse the repository at this point in the history
Since we need to unquote when generating before tf-inject, we need to
double-quote escaped chars here:

  $ echo '^ikke/ikke<adv>$ ^«/«<lquot><MERGE_BEG>$^til/til<pr>$ ^x\@y.com/x\@y.com<email>$^»/»<rquot><MERGE_END>$ ^da/da<adv>$' | lttoolbox/lt-merge
  ^ikke/ikke<adv>$ ^«til x\\\@y.com»/«til x\\\@y.com»<MERGED>$ ^da/da<adv>$

  $ echo '^«/«<lquot><MERGE_BEG>$[[tf:i:a]]^veldig/veldig<adv>$[[/]]^»/»<rquot><MERGE_END>$' | lttoolbox/lt-merge
  ^«\[\[tf:i:a\]\]veldig\[\[\/\]\]»/«\[\[tf:i:a\]\]veldig\[\[\/\]\]»<MERGED>$

If we run this between analysis and wblank-attach, then after
the `lt-proc -b generator.bin` step we should have e.g.

  ^ikkje<adv>/ikkje$ ^«til x\\\@y.com»<MERGED>/«til x\\\@y.com»$ ^då<adv>/då$

which after `cg-proc -1 -n -g genprefs.bin` would turn into

  ikkje «til x\@y.com» då

Note how \\\@ turned into \@ – we removed one layer of quoting, but
this is still in the apertium stream so special chars stay quoted
until the final tf-inject.

TODO:

We need to be able to pass MERGED stuff unchanged through biltrans and
generator. We would like to write <re>.+</re><i><s n="MERGED"/></i>, but
. is a literal period in re(!), and even ANY_CHAR doesn't seem to be
supported in lt-proc -b. It should be possible to support this with a
`step_case_override` in `FSTProcessor::biltrans`.

We need an `lt-merge --unmerge` to undo the merge:

  $ echo '^ikkje<adv>/ikkje$ ^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»<MERGED>/«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»$' | lt-merge --unmerge
  ^ikkje<adv>/ikkje$ «[[tf:i:a]]s\^å[[/]]»

which then becomes

  $ echo '^ikkje<adv>/ikkje$ «[[tf:i:a]]s\^å[[/]]»' |cg-proc -1ng nob-nno.genprefs.rlx.bin
  ikkje «[[tf:i:a]]s\^å[[/]]»

which tf-inject is happy to handle.
  • Loading branch information
unhammer committed Dec 10, 2024
1 parent 69eb42e commit 2bd9008
Show file tree
Hide file tree
Showing 8 changed files with 221 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ lttoolbox/liblttoolbox.so.*
/lttoolbox/lt-comp
/lttoolbox/lt-compose
/lttoolbox/lt-proc
/lttoolbox/lt-merge
/lttoolbox/lt-trim
/lttoolbox/Makefile
/lttoolbox/Makefile.in
Expand Down
3 changes: 3 additions & 0 deletions lttoolbox/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ target_link_libraries(lt-comp lttoolbox ${GETOPT_LIB})
add_executable(lt-proc lt_proc.cc)
target_link_libraries(lt-proc lttoolbox ${GETOPT_LIB})

add_executable(lt-merge lt_merge.cc)
target_link_libraries(lt-merge lttoolbox ${GETOPT_LIB})

add_executable(lt-expand lt_expand.cc)
target_link_libraries(lt-expand lttoolbox ${GETOPT_LIB})

Expand Down
81 changes: 81 additions & 0 deletions lttoolbox/fst_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1936,6 +1936,87 @@ FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim)
return compose(result, ""_u, with_delim, mark);
}

/**
 * Copy the stream from `input` to `output`, collapsing every run of
 * lexical units that starts at an LU with a reading containing
 * <MERGE_BEG> and ends at an LU with a reading containing <MERGE_END>
 * into a single LU of the form ^surface/surface<MERGED>$.
 *
 * - The blank and wordblank in front of the first merged LU are written
 *   unchanged before the merged LU.
 * - Blanks between merged LUs are appended to the surface as they are;
 *   wordblanks and the first reading of each LU are appended with one
 *   extra layer of escaping (see appendEscaped).
 * - LUs outside a merge are echoed with their readings joined by '/'.
 * - A merge that is still open at a null flush or at end of input is
 *   emitted rather than dropped; the null character itself is passed
 *   through and the output flushed.
 *
 * @param input  stream to read lexical units from
 * @param output stream the (partly merged) result is written to
 */
void
FSTProcessor::quoteMerge(InputFile& input, UFILE *output)
{
  StreamReader reader(&input);
  reader.alpha = &alphabet;
  reader.add_unknowns = true;

  bool merging = false;
  UString surface;
  while (!reader.at_eof) {
    reader.next();

    bool end_merging = false;
    for (const StreamReader::Reading &it : reader.readings) {
      // TODO: look up in it.symbols instead (but need to make an alphabet then)
      if (it.content.find(u"<MERGE_BEG>") != UString::npos) {
        merging = true;
      }
      if (it.content.find(u"<MERGE_END>") != UString::npos) {
        end_merging = true;
      }
    }

    if (merging) {
      if (surface.size() > 0) {
        // Inside the merge: blanks become part of the merged surface form.
        surface += reader.blank;
        appendEscaped(surface, reader.wblank);
      }
      else {
        // The initial blank should just be output before the merged LU:
        write(reader.blank, output);
        write(reader.wblank, output);
      }
      if (reader.readings.size() > 0) {
        // Double-escape the form since we'll unescape during lt-unmerge:
        appendEscaped(surface, reader.readings[0].content);
      }
    }
    else {
      write(reader.blank, output);
      write(reader.wblank, output);
      if (reader.readings.size() > 0) {
        // NB. ^$ will produce a readings vector of length 1 where the single item is empty. EOF should give length 0.
        // (We *want* to keep ^$ in stream, but not print extra ^$ when there was no ^$)
        u_fputc('^', output);
        bool seen_reading = false;
        for (const StreamReader::Reading &it : reader.readings) {
          if (seen_reading) {
            u_fputc('/', output);
          }
          write(it.content, output);
          seen_reading = true;
        }
        u_fputc('$', output);
      }
    }

    // Close the merge on <MERGE_END>, on a null flush, and also at end of
    // input, so that an unterminated merge is not silently discarded.
    if (end_merging || reader.at_null || reader.at_eof) {
      if (merging) {
        u_fputc('^', output);
        write(surface, output);
        u_fputc('/', output);
        write(surface, output);
        write("<MERGED>$"_u, output);
        merging = false;
      }
      surface.clear();
      if (reader.at_null) {
        u_fputc('\0', output);
        u_fflush(output);
      }
    }
  }
}


bool
FSTProcessor::valid() const
Expand Down
10 changes: 10 additions & 0 deletions lttoolbox/fst_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,15 @@ class FSTProcessor
}
}

/**
 * Append `from` to `to`, putting a backslash in front of every
 * character that is a member of escaped_chars.
 */
void appendEscaped(UString& to, const UString& from) {
  for (size_t i = 0; i < from.size(); ++i) {
    const auto ch = from[i];
    const bool needs_backslash = escaped_chars.find(ch) != escaped_chars.end();
    if (needs_backslash) {
      to += u'\\';
    }
    to += ch;
  }
}

public:

/*
Expand Down Expand Up @@ -496,6 +505,7 @@ class FSTProcessor
UString biltrans(UStringView input_word, bool with_delim = true);
UString biltransfull(UStringView input_word, bool with_delim = true);
void bilingual(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown);
void quoteMerge(InputFile& input, UFILE *output);
std::pair<UString, int> biltransWithQueue(UStringView input_word, bool with_delim = true);
UString biltransWithoutQueue(UStringView input_word, bool with_delim = true);
void SAO(InputFile& input, UFILE *output);
Expand Down
40 changes: 40 additions & 0 deletions lttoolbox/lt-merge.1
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
.Dd December 10, 2024
.Dt LT-MERGE 1
.Os Apertium
.Sh NAME
.Nm lt-merge
.Nd lexical merger for Apertium
.Sh SYNOPSIS
.Nm lt-merge
.Op Fl u
.Op Ar input_file Op Ar output_file
.Sh DESCRIPTION
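.Nm lt-merge
is the application responsible for merging and unmerging
lexical units.
.Pp
It replaces every run of lexical units, from one tagged <MERGE_BEG>
up to and including the next one tagged <MERGE_END>, with a single
lexical unit tagged <MERGED>.
Special characters inside the merged unit are escaped a second time
so that the merge can be undone later.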
.Sh OPTIONS
.Bl -tag -width Ds
.It Fl u , Fl Fl unmerge
Run in reverse: split previously merged lexical units apart again (not yet implemented).
.It Fl v , Fl Fl version
Display the version number.
.It Fl h , Fl Fl help
Display this help.
.El
\" .Sh FILES
\" .Bl -tag -width Ds
\" .It Ar input_file
\" The input compiled dictionary.
\" .El
.Sh SEE ALSO
.Xr apertium 1 ,
.Xr lt-proc 1
.Sh COPYRIGHT
Copyright \(co 2024 Universitat d'Alacant / Universidad de Alicante.
This is free software.
You may redistribute copies of it under the terms of
.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License .
.Sh BUGS
Many... lurking in the dark and waiting for you!
46 changes: 46 additions & 0 deletions lttoolbox/lt_merge.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* Copyright (C) 2024 Universitat d'Alacant / Universidad de Alicante
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see <https://www.gnu.org/licenses/>.
*/
#include <lttoolbox/fst_processor.h>
#include <lttoolbox/file_utils.h>
#include <lttoolbox/cli.h>
#include <lttoolbox/lt_locale.h>
#include <iostream>


/*
 * lt-merge entry point.
 *
 * Usage: lt-merge [-u] [input_file [output_file]]
 *
 * Reads the stream from input_file (the InputFile default when no name
 * is given), merges the lexical units between the MERGE_BEG and
 * MERGE_END tags via FSTProcessor::quoteMerge, and writes the result to
 * output_file.
 */
int main(int argc, char *argv[])
{
  LtLocale::tryToSetLocale();
  CLI cli("merge lexical units from the one tagged BEG until END", PACKAGE_VERSION);
  cli.add_file_arg("input_file");
  cli.add_file_arg("output_file");
  cli.add_bool_arg('u', "unmerge", "Undo the merge");
  cli.parse_args(argc, argv);

  // The reverse direction does not exist yet; refuse rather than
  // silently running a forward merge on data that should be unmerged.
  if (cli.get_bools()["unmerge"]) {
    std::cerr << "lt-merge: --unmerge is not implemented yet\n";
    return 1;
  }

  InputFile input;
  // Index 0 is input_file, index 1 is output_file (order of add_file_arg).
  // The input must be opened whenever *its own* name was given.
  if (!cli.get_files()[0].empty()) {
    input.open_or_exit(cli.get_files()[0].c_str());
  }
  UFILE* output = openOutTextFile(cli.get_files()[1]);

  FSTProcessor fstp;
  fstp.initBiltrans();
  fstp.quoteMerge(input, output);

  return 0;
}
37 changes: 37 additions & 0 deletions tests/lt_merge/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
import unittest
from basictest import ProcTest
import unittest

class MergeTest(unittest.TestCase, ProcTest):
    # Base fixture for the lt-merge tests: by default, an LU carrying no
    # merge tags is expected to come out exactly as it went in.
    inputs = ['^nochange<n>$']
    expectedOutputs = ['^nochange<n>$']
    procflags = []

    def compileTest(self, tmpd):
        # Nothing is compiled for these tests; just report success.
        return True # "pass"

    def openProc(self, tmpd):
        # Hand a fresh list to openPipe, not the class-level procflags itself.
        flags = self.procflags + []
        return self.openPipe('lt-merge', flags)


class SimpleTest(MergeTest):
    # The LUs from the one tagged <MERGE_BEG> through the one tagged
    # <MERGE_END> collapse into one <MERGED> LU; the LUs around them stay.
    inputs = [
        '^ikke/ikke<adv>$ ^«/«<lquot><MERGE_BEG>$^så/så<adv>$ ^veldig/v<adv>$^»/»<rquot><MERGE_END>$ ^bra/bra<adj>$',
    ]
    expectedOutputs = [
        '^ikke/ikke<adv>$ ^«så veldig»/«så veldig»<MERGED>$ ^bra/bra<adj>$',
    ]


class SingleTest(MergeTest):
    # A single LU carrying both tags is a merge of length one: only its
    # first (surface) part survives, repeated on both sides of the slash.
    inputs = [
        '^not/very<useful><MERGE_BEG><MERGE_END>$',
    ]
    expectedOutputs = [
        '^not/not<MERGED>$',
    ]


class EscapeTest(MergeTest):
    # An already-escaped character inside the merge gains one more layer
    # of escaping in the merged LU.
    # Using r'' to avoid doubling escapes even more:
    inputs = [
        r'^ikke/ikke<adv>$ ^«/«<lquot><MERGE_BEG>$^så/så<adv>$ ^ve\[dig/v<adv>$^»/»<rquot><MERGE_END>$ ^bra/bra<adj>$',
    ]
    expectedOutputs = [
        r'^ikke/ikke<adv>$ ^«så ve\\\[dig»/«så ve\\\[dig»<MERGED>$ ^bra/bra<adj>$',
    ]


class WordblankTest(MergeTest):
    # Wordblanks between merged LUs are kept inside the merged LU, with
    # their brackets and slashes escaped.
    # Using r'' to avoid doubling escapes even more:
    inputs = [
        r'^«/«<lquot><MERGE_BEG>$[[tf:i:a]]^ve\/ldig/v<adv>$[[/]]^»/»<rquot><MERGE_END>$',
    ]
    expectedOutputs = [
        r'^«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»/«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»<MERGED>$',
    ]
4 changes: 3 additions & 1 deletion tests/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@

modules = ['lt_proc', 'lt_trim', 'lt_print', 'lt_comp', 'lt_append',
'lt_paradigm', 'lt_expand', 'lt_apply_acx', 'lt_compose',
'lt_tmxproc']
'lt_tmxproc', 'lt_merge']

# modules = ['lt_merge']


if __name__ == "__main__":
Expand Down

0 comments on commit 2bd9008

Please sign in to comment.