From c7b2eb255aeb779dc158e1238ed422732e0a6c6f Mon Sep 17 00:00:00 2001 From: Kevin Brubeck Unhammer Date: Thu, 19 Dec 2024 15:54:47 +0100 Subject: [PATCH] Let lt-proc -b handle special ANY_CHAR tag (<ANY_CHAR> from lsx) --- lttoolbox/fst_processor.cc | 11 ++++++++++- lttoolbox/fst_processor.h | 5 +++++ lttoolbox/state.h | 6 +++--- tests/data/pass-through.lsx | 20 ++++++++++++++++++++ tests/lt_proc/__init__.py | 21 +++++++++++++++++++++ 5 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 tests/data/pass-through.lsx diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index f4a1a76..ff0adf7 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -627,6 +627,8 @@ void FSTProcessor::load(FILE *input) { readTransducerSet(input, alphabetic_chars, alphabet, transducers); + alphabet.includeSymbol("<ANY_CHAR>"_u); + any_char = alphabet("<ANY_CHAR>"_u); } void @@ -1755,7 +1757,14 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode) if (reader.readings[index].mark == '#') current_state.step('#'); for (size_t i = 0; i < symbols.size(); i++) { seenTags = seenTags || alphabet.isTag(symbols[i]); - current_state.step_case(symbols[i], beCaseSensitive(current_state)); + UString source; + alphabet.getSymbol(source, symbols[i]); + if(beCaseSensitive(current_state)) { // allow any_char + current_state.step_override(symbols[i], any_char, symbols[i]); + } + else { // include lower alt + current_state.step_override(symbols[i], towlower(symbols[i]), any_char, symbols[i]); + } if (current_state.isFinal(all_finals)) { queue_start = i; current_state.filterFinalsArray(result, diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index d53556f..a1a1cd7 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -252,6 +252,11 @@ class FSTProcessor */ int maxWeightClasses = INT_MAX; + /** + * The alphabet index of the <ANY_CHAR> tag + */ + int any_char; + /** * Prints an error of input stream and exits */ diff --git 
a/lttoolbox/state.h b/lttoolbox/state.h index 56a7d34..31d2032 100644 --- a/lttoolbox/state.h +++ b/lttoolbox/state.h @@ -113,9 +113,9 @@ class State /** * Make a transition, but overriding the output symbol - * @param input symbol - * @param output symbol we expect to appear - * @param output symbol we want to appear + * @param input symbol read from infile + * @param output symbol from the FST + * @param output symbol we want to appear in outfile */ void apply_override(int const input, int const old_sym, int const new_sym); diff --git a/tests/data/pass-through.lsx b/tests/data/pass-through.lsx new file mode 100644 index 0000000..ba2c875 --- /dev/null +++ b/tests/data/pass-through.lsx @@ -0,0 +1,20 @@ + + + + + + + + + + foo + + + +
+ + + + +
+
diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py index cae6568..d816f06 100644 --- a/tests/lt_proc/__init__.py +++ b/tests/lt_proc/__init__.py @@ -479,6 +479,11 @@ class BiltransGarbage(ProcTest): inputs = ['^$'] expectedOutputs = ['^$'] +class BiltransSimple(ProcTest): + procflags = ['-b', '-z'] + inputs = ['^abc$'] + expectedOutputs = ['^abc/ab$'] + class SlashesInTags(ProcTest): procdix = 'data/slash-tags.dix' procflags = ['-b', '-z'] @@ -496,5 +501,21 @@ class SlashesInTags(ProcTest): '^\\*lobwana1.1<1/2>/*lopwana1.1<1/2>$', '^\\*lobwana1.1<3/4>/@\\*lobwana1.1<3/4>$'] +class BiltransAnyChar(ProcTest): + procdix = 'data/pass-through.lsx' + procflags = ['-b', '-z'] + # Using r'' to avoid doubling escapes even more: + inputs = [r'^simple$'] + expectedOutputs = [r'^simple/simple$'] + + +class BiltransAnyCharEscapes(ProcTest): + procdix = 'data/pass-through.lsx' + procflags = ['-b', '-z'] + # Using r'' to avoid doubling escapes even more: + inputs = [r'^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»$'] + expectedOutputs = [r'^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»/«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»$'] + + # These fail on some systems: #from null_flush_invalid_stream_format import *