diff --git a/nisaba/scripts/abjad_alphabet/BUILD.bazel b/nisaba/scripts/abjad_alphabet/BUILD.bazel index c1f44c80..593e3757 100644 --- a/nisaba/scripts/abjad_alphabet/BUILD.bazel +++ b/nisaba/scripts/abjad_alphabet/BUILD.bazel @@ -59,18 +59,16 @@ nisaba_compile_multi_grm_py( "utf8": "reversible_roman_utf8.far", }, data = [ + ":nfc.far", + ":nfc_utf8.far", "//nisaba/scripts/abjad_alphabet/data/Arab:reversible_roman.tsv", - "//nisaba/scripts/abjad_alphabet/data/Arab:visual_norm.tsv", ], visibility = ["//visibility:public"], deps = [ ":util", - ":visual_norm_common", "//nisaba/scripts/utils:file", "//nisaba/scripts/utils:rewrite", - "//nisaba/scripts/utils:rule", "@org_opengrm_pynini//pynini", - "@org_opengrm_pynini//pynini/lib:byte", ], ) @@ -263,13 +261,9 @@ py_test( srcs_version = "PY3", deps = [ ":util", - "@org_opengrm_pynini//pynini", + "//nisaba/scripts/utils:test_util", "@io_abseil_py//absl/testing:absltest", "@io_abseil_py//absl/testing:parameterized", - "//nisaba/scripts/utils:file", - # TODO: Remove this dependency by using OpenFar/OpenFstFromFar calls. - "//nisaba/scripts/utils:test_util", - "@io_abseil_py//absl/logging", ], ) @@ -316,6 +310,7 @@ py_library( deps = [ ":util", "//nisaba/scripts/utils:far", + "//nisaba/scripts/utils:rewrite", "@org_opengrm_pynini//pynini", ], ) diff --git a/nisaba/scripts/abjad_alphabet/__init__.py b/nisaba/scripts/abjad_alphabet/__init__.py index 0f93d3d0..eca80ed9 100644 --- a/nisaba/scripts/abjad_alphabet/__init__.py +++ b/nisaba/scripts/abjad_alphabet/__init__.py @@ -14,6 +14,9 @@ """Python APIs for abjad / alphabet grammars.""" +# TODO: This library currently only supports `byte` tokens. Consider +# supporting `utf8` tokens too. 
+ import pathlib import re import string @@ -21,6 +24,7 @@ import pynini from nisaba.scripts.abjad_alphabet import util as u from nisaba.scripts.utils import far +from nisaba.scripts.utils import rewrite class _FarStore(object): @@ -37,7 +41,9 @@ def __init__(self) -> None: def ToReversibleRoman() -> far.Far.FstWrapper: - return _FARS.reversible_roman.Fst('FROM_ARAB') + fst = u.open_fst_from_far('reversible_roman', 'FROM_ARAB', 'byte') + # Allows out of script characters to pass through. + return far.Far.FstWrapper(rewrite.Rewrite(fst)) def FromReversibleRoman() -> far.Far.FstWrapper: @@ -70,7 +76,7 @@ def __init__(self, self._nfc = Nfc() self._visual_norm = VisualNorm(tag) except KeyError as error: - raise TagError('Unsupported language/script: {}'.format(error)) + raise TagError(f'Unsupported language/script: {tag}') from error else: self.accept_pat = re.compile(r'[^{}]+'.format(re.escape(ignore))) diff --git a/nisaba/scripts/abjad_alphabet/randgen_test.py b/nisaba/scripts/abjad_alphabet/randgen_test.py index e5cba46b..9091e323 100644 --- a/nisaba/scripts/abjad_alphabet/randgen_test.py +++ b/nisaba/scripts/abjad_alphabet/randgen_test.py @@ -16,35 +16,33 @@ import itertools -from absl import logging - from absl.testing import absltest from absl.testing import parameterized from nisaba.scripts.abjad_alphabet import util as u -from nisaba.scripts.utils import file as uf -from nisaba.scripts.utils import test_util as ut +from nisaba.scripts.utils import test_util -class FstRandgenTest(parameterized.TestCase, ut.FstRandgenTestCase): +class FstRandgenTest(parameterized.TestCase, test_util.FstRandgenTestCase): @parameterized.parameters('byte', 'utf8') def test_romanization_roundtrip(self, token_type: str): - far = uf.OpenFar(u.FAR_DIR, 'reversible_roman', token_type) + nfc = u.open_fst_from_far('nfc', 'ARAB', token_type) + far = u.open_far('reversible_roman', token_type) natv_to_latin = far['FROM_ARAB'] latin_to_natv = far['TO_ARAB'] - round_trip = natv_to_latin @ 
latin_to_natv - self.AssertFstProbablyFunctional(round_trip, token_type) + self.AssertFstProbablyIdentity( + [natv_to_latin, latin_to_natv], token_type, nfc) @parameterized.parameters(itertools.product( - u.LANGS, ('visual_norm', 'reading_norm'), ('byte', 'utf8'))) - def test_visual_or_reading_norm(self, lang: str, far_name: str, + ('visual_norm', 'reading_norm'), u.LANGS, ('byte', 'utf8'))) + def test_visual_or_reading_norm(self, far_name: str, lang: str, token_type: str): - fst = uf.OpenFstFromFar(u.FAR_DIR, far_name, token_type, lang) + fst = u.open_fst_from_far(far_name, lang, token_type) self.AssertFstProbablyFunctional(fst, token_type) @parameterized.parameters('byte', 'utf8') def test_nfc(self, token_type: str): - fst = uf.OpenFstFromFar(u.FAR_DIR, 'nfc', token_type, 'ARAB') + fst = u.open_fst_from_far('nfc', 'ARAB', token_type) self.AssertFstProbablyFunctional(fst, token_type) diff --git a/nisaba/scripts/abjad_alphabet/reversible_roman.py b/nisaba/scripts/abjad_alphabet/reversible_roman.py index 0f1d29bd..d6006a20 100644 --- a/nisaba/scripts/abjad_alphabet/reversible_roman.py +++ b/nisaba/scripts/abjad_alphabet/reversible_roman.py @@ -30,39 +30,24 @@ import pynini from pynini.export import multi_grm -from pynini.lib import byte from nisaba.scripts.abjad_alphabet import util -from nisaba.scripts.abjad_alphabet import visual_norm_common from nisaba.scripts.utils import file from nisaba.scripts.utils import rewrite -from nisaba.scripts.utils import rule def generator_main(exporter_map: multi_grm.ExporterMapping): """FSTs for language-agnostic reversible romanization of abjad/alphabets.""" - # Compile romanisation transducer. In the direction to Latin, NFC and then - # visual normalization are applied. They are not required in the opposite - # direction. 
for token_type in ('byte', 'utf8'): with pynini.default_token_type(token_type): - exporter = exporter_map[token_type] - sigma = byte.BYTE - if token_type == 'utf8': - sigma = util.sigma_from_common_data_files() + nfc = util.open_fst_from_far('nfc', 'ARAB', token_type) - # TODO: Currently, `prefix=(‘presentation_forms’,)` takes too long - # to process, so it is not specified in `script_common_fsts()` even though - # it should be included. - script_common_fsts = visual_norm_common.script_common_fsts(sigma) - roman_mapping_file = util.LANG_DIR / 'reversible_roman.tsv' - roman_fst = rule.fst_from_rule_file(roman_mapping_file, sigma) - fsts = script_common_fsts + [roman_fst] - exporter['FROM_ARAB'] = rewrite.ComposeFsts(fsts, sigma) + roman_tsv = util.LANG_DIR / 'reversible_roman.tsv' + roman = file.StringFile(roman_tsv).star.optimize() - # Transforming Latin to native is simpler. - roman_strings = file.StringFile(roman_mapping_file) - roman_inv_fst = pynini.invert(roman_strings).star - exporter['TO_ARAB'] = roman_inv_fst.optimize() + exporter = exporter_map[token_type] + # NFC is used for romanization, not de-romanization. + exporter['FROM_ARAB'] = rewrite.ComposeFsts([nfc, roman]) + exporter['TO_ARAB'] = roman.invert() if __name__ == '__main__': diff --git a/nisaba/scripts/abjad_alphabet/testdata/reversible_roman.textproto b/nisaba/scripts/abjad_alphabet/testdata/reversible_roman.textproto index e8081b4b..b8415f05 100644 --- a/nisaba/scripts/abjad_alphabet/testdata/reversible_roman.textproto +++ b/nisaba/scripts/abjad_alphabet/testdata/reversible_roman.textproto @@ -18,19 +18,24 @@ # Currently, test strings are gathered from ALA-LC (Urdu) specification. # TODO: Add test strings from rest of the languages as well. Out of script -# characters are pass through. -rewrite { - rule: "FROM_ARAB" - input: "Abæ آب" - output: "Abæ ʼ͟āb" -} +# characters should be pass through. 
+ +# rewrite { +# rule: "FROM_ARAB" +# input: "Abæ آب" +# output: "Abæ ʼ͟āb" +# } + +# TODO: Like NFC, visual norm should also be applied before romanization +# (Urdu U). However, this is not currently done as the build takes too long. +# Furthermore, Brahmic scripts do not apply visual norm before ISO. + +# rewrite { +# rule: "FROM_ARAB" +# input: "عضوُ" +# output: "ʻẓʉ" +# } -# Visual normalization applied prior to romanization (Urdu U). -rewrite { - rule: "FROM_ARAB" - input: "عضوُ" - output: "ʻẓʉ" -} rewrite { rule: "FROM_ARAB" input: "عضۇ" @@ -42,7 +47,7 @@ rewrite { output: "" } -# NFC + visual normalization applied prior to romanization. +# NFC applied prior to romanization. rewrite { rule: "FROM_ARAB" input: "آپ" diff --git a/nisaba/scripts/abjad_alphabet/util.py b/nisaba/scripts/abjad_alphabet/util.py index 6e51f94f..b6acacee 100644 --- a/nisaba/scripts/abjad_alphabet/util.py +++ b/nisaba/scripts/abjad_alphabet/util.py @@ -50,6 +50,11 @@ def sigma_from_common_data_files() -> pynini.Fst: return uc.derive_sigma(chars) +def open_far(far_name: str, token_type: str) -> pynini.Far: + """Loads Abjad-Alphabet FAR specified by `far_name`.""" + return uf.OpenFar(FAR_DIR, far_name, token_type) + + def open_fst_from_far(far_name: str, fst_name: str, token_type: str) -> pynini.Fst: """Loads FST given by `fst_name` from FAR specified by `far_name`.""" diff --git a/nisaba/scripts/brahmic/iso.py b/nisaba/scripts/brahmic/iso.py index ea5f9119..1db83f36 100644 --- a/nisaba/scripts/brahmic/iso.py +++ b/nisaba/scripts/brahmic/iso.py @@ -161,6 +161,13 @@ def _script_fsts(script: str, token_type: str) -> Tuple[p.Fst, p.Fst]: # out. nfc = u.OpenFstFromBrahmicFar('nfc', script, token_type) from_nfced_script = rw.ComposeFsts([nfc, from_script]) + + # TODO: The NFC form of Gurmukhi SHA is <SA, NUKTA>, which currently has + # the same romanization defined in the Guru/consonant. So NFC on TO_GURU is + # required currently. However that need not be the case. 
We could consider + # moving the SHA from common consonant mapping to script specific files + # without adding that to Gurmukhi. That would then align this code with + # Arabic, which does not do NFC on TO_ARAB. to_nfced_script = rw.ComposeFsts([to_script, nfc]) return (from_nfced_script, to_nfced_script) @@ -177,6 +184,9 @@ def generator_main(exporter_map: multi_grm.ExporterMapping): script = script.upper() exporter[f'FROM_{script}'] = from_script exporter[f'TO_{script}'] = to_script + # TODO: The following rewrite assumes 'byte' token type. It should be + # made available to 'utf8' as well. The corresponding 'utf8_test' is + # missing as well. exporter['FROM_BRAHMIC'] = rw.Rewrite(p.union(*from_script_fsts)) diff --git a/nisaba/scripts/utils/test_util.py b/nisaba/scripts/utils/test_util.py index 9cc2df9c..0a12253f 100644 --- a/nisaba/scripts/utils/test_util.py +++ b/nisaba/scripts/utils/test_util.py @@ -284,10 +284,17 @@ def _AssertFstSampledBehavior( with pynini.default_token_type(token_type): for ilabels in _OlabelsIter(input_samples): input_str_fsa = _LabelListToStringFsa(ilabels) + output_str = rewrite.ComposeFsts([input_str_fsa] + fsts) + + # Please note that `norm` is not idempotent if it is Arabic NFC which + # cannot handle the reordering of a large number of SHADDA, FATHA, + # FATHATAN, KASRA, etc. Even though this is only a theoretical + # possibility, for randgen test, the number of NFCs in the round trip + # should be the same as the count of NFCs applied to the input before + # the assert function. if norm_fst: - input_str_fsa @= norm_fst - output_fst = rewrite.ComposeFsts([input_str_fsa] + fsts) - assert_function(input_str_fsa, output_fst) + input_str_fsa = rewrite.ComposeFsts([input_str_fsa, norm_fst]) + assert_function(input_str_fsa, output_str) class FstTestCase(absltest.TestCase):