diff --git a/nisaba/scripts/natural_translit/utils/BUILD.bazel b/nisaba/scripts/natural_translit/utils/BUILD.bazel
index 3bcaa2de..cebcc21a 100644
--- a/nisaba/scripts/natural_translit/utils/BUILD.bazel
+++ b/nisaba/scripts/natural_translit/utils/BUILD.bazel
@@ -246,6 +246,30 @@ py_test(
     ],
 )
 
+py_library(
+    name = "alignment2",
+    srcs = ["alignment2.py"],
+    deps = [
+        ":expression",
+        ":operation",
+        ":type_op",  # Referenced only by commented-out code in alignment2.py.
+    ],
+)
+
+py_test(
+    name = "alignment2_test",
+    srcs = ["alignment2_test.py"],
+    main = "alignment2_test.py",
+    deps = [
+        ":alignment2",
+        ":expression",
+        ":operation",
+        ":symbol",
+        ":test_op",
+        "@io_abseil_py//absl/testing:absltest",
+    ],
+)
+
 py_library(
     name = "test_op",
     srcs = ["test_op.py"],
diff --git a/nisaba/scripts/natural_translit/utils/alignment2.py b/nisaba/scripts/natural_translit/utils/alignment2.py
new file mode 100644
index 00000000..fd5b2571
--- /dev/null
+++ b/nisaba/scripts/natural_translit/utils/alignment2.py
@@ -0,0 +1,358 @@
+# Copyright 2024 Nisaba Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Alignment class for defining a relation between two expressions.
+
+Attributes:
+  alias: Alias of the Expression.
+  left: Left side of the Alignment.
+  right: Right side of the Alignment.
+  preceding: Preceding context.
+  following: Following context.
+  from_bos: If True, preceding context starts from the beginning of the
+    sequence.
+  to_eos: If True, the following context ends at the end of the sequence.
+  operation: Operation that represents the relation between the sides of the
+    alignment.
+  priority: Priority of the alignment. When the applied costs of multiple
+    rules are equal, the rule with the highest priority will be applied.
+  applied_cost: Cost of the alignment when it's applied in context.
+  source: Source of the alignment.
+
+For inspection and debugging purposes, alignments are represented as strings in
+ the following format:
+
+([<preceding>] <left>:<right> [<following>], operation)
+
+Alignments can be defined in an inventory as a set of rules to build grammars,
+ or to assess the structural correspondence of two expressions.
+
+Example:
+  `([grapheme:nasal] a:b [grapheme:vowel], alignable (0.000))` means that
+  expression `a` is rewritten as expression `b` with `alignable` operation with
+  0 cost when it's preceded by a grapheme that corresponds to a nasal and
+  followed by a grapheme that corresponds to a vowel.
+
+Alignment sources:
+  ALIGNER: Alignments from an aligner output that doesn't correspond to a
+    predefined rule. E.g. identity or token boundary alignments.
+  CONSTANT: Alignment class constants.
+  ENGLISH: English alignables.
+  FOREIGN: Alignables for foreign languages other than English.
+  LEXICON: Alignables that come from a lexicon that will be prioritised over
+    other rules. E.g. frequent affixes or high profile entity names.
+  NATIVE: Alignables for the native language.
+  SPELLOUT: Alignables for spelled out letters.
+  UNSPECIFIED: Alignments from an unspecified source.
+"""
+
+import enum
+# from typing import Union
+from nisaba.scripts.natural_translit.utils import expression as exp
+from nisaba.scripts.natural_translit.utils import operation as op
+# from nisaba.scripts.natural_translit.utils
+# import type_op as ty
+
+
+class Alignment(exp.Expression):
+  """An Expression that aligns a left expression with a right expression.
+
+  A new alignment is unconstrained: both sides and both contexts are
+  `exp.Expression.ANY`, the operation is `unassigned`, and the applied cost is
+  the base cost of that operation.
+  """
+
+  class Source(enum.StrEnum):
+    """Where an alignment comes from."""
+
+    ALIGNER = 'aligner'
+    CONSTANT = 'constant'
+    ENGLISH = 'english'
+    FOREIGN = 'foreign'
+    LEXICON = 'lexicon'
+    NATIVE = 'native'
+    SPELLOUT = 'spellout'
+    UNSPECIFIED = 'unspecified'
+
+  def __init__(self, alias: str = ''):
+    """Initializes an alignment with default sides, context and operation.
+
+    Args:
+      alias: Alias passed on to the `exp.Expression` constructor.
+    """
+    super().__init__(alias)
+    unconstrained = exp.Expression.ANY
+    self.left = unconstrained
+    self.right = unconstrained
+    self.preceding = unconstrained
+    self.following = unconstrained
+    self.from_bos = self.to_eos = False
+    self.operation = op.Operation.COMMON.unassigned
+    self.priority = 0
+    self.applied_cost = self.operation.base_cost
+    self.source = Alignment.Source.UNSPECIFIED
+
+  def _side_str(self, side: exp.Expression) -> str:
+    """Returns the text of a lone item, or `str(side)` in every other case."""
+    holds_single_item = not side.is_any() and len(side) == 1
+    if holds_single_item:
+      return side.item(0).text
+    return str(side)
+
+  def _context_str(self, context: exp.Expression) -> str:
+    """Returns 'left:right' for an Alignment context, '' for anything else."""
+    if not context.is_any() and isinstance(context, Alignment):
+      left_text = self._side_str(context.left)
+      right_text = self._side_str(context.right)
+      return f'{left_text}:{right_text}'
+    return ''
+
+  def _pre_str(self) -> str:
+    """Formats the preceding context as '[<bos>context] ', or '' if empty."""
+    context_text = self._context_str(self.preceding)
+    bos_mark = ''
+    if self.from_bos:
+      bos_mark = exp.Atomic.CTRL.bos.text
+    # The boundary mark is only shown together with a non-empty context.
+    if not context_text:
+      return ''
+    return f'[{bos_mark}{context_text}] '
+
+  def _fol_str(self) -> str:
+    """Formats the following context as ' [context<eos>]', or '' if empty."""
+    context_text = self._context_str(self.following)
+    eos_mark = ''
+    if self.to_eos:
+      eos_mark = exp.Atomic.CTRL.eos.text
+    # The boundary mark is only shown together with a non-empty context.
+    if not context_text:
+      return ''
+    return f' [{context_text}{eos_mark}]'
+
+  def __str__(self):
+    """Returns '(<pre>left:right<fol>, operation)'; unassigned ops are omitted."""
+    operation_text = ''
+    if self.operation.is_assigned():
+      operation_text = ', ' + str(self.operation)
+    pieces = (
+        '(',
+        self._pre_str(),
+        self._side_str(self.left),
+        ':',
+        self._side_str(self.right),
+        self._fol_str(),
+        operation_text,
+        ')',
+    )
+    return ''.join(pieces)
+
+  def tsv_row(self) -> str:
+    """Returns alias, left text, right text, match and cost joined by tabs."""
+    left_text = ''.join(item.text for item in self.left)
+    right_text = ''.join(item.text for item in self.right)
+    columns = (
+        self.alias,
+        left_text,
+        right_text,
+        str(self.operation.match),
+        str(self.applied_cost),
+    )
+    return '\t'.join(columns)
+
+  def _set_side(self, side: exp.Expression.OR_SYMBOL) -> exp.Expression:
+    """Returns `side` if it is an Expression, else `exp.Atomic.read(side)`."""
+    if isinstance(side, exp.Expression):
+      return side
+    return exp.Atomic.read(side)
+
+  @classmethod
+  def simple(
+      cls,
+      left: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
+      right: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
+  ) -> 'Alignment':
+    """Returns an alignment with only its left and right sides set.
+
+    Args:
+      left: Expression or symbol for the left side.
+      right: Expression or symbol for the right side.
+    """
+    pair = cls()
+    pair.left = pair._set_side(left)
+    pair.right = pair._set_side(right)
+    return pair
+
+#   @classmethod
+#   def constant(
+#       cls,
+#       alias: str = '',
+#       left: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
+#       right: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
+#       operation: op.Operation = op.Operation.COMMON.unassigned,
+#   ) -> 'Alignment':
+#     alignment = cls(alias)
+#     alignment.left = alignment._set_side(left)
+#     alignment.right = alignment._set_side(right)
+#     alignment.operation = operation
+#     alignment.source = Alignment.Source.CONSTANT
+#     return alignment
+
+  # TODO: Expand context to allow Cat and Or of alignments.
+  # Eg. a rule with `preceding=((vowel_grapheme:any) | (any:vowel_phoneme)))`
+  # will apply if it's preceded by an alignment that has a vowel grapheme on the
+  # left side or a vowel phoneme on the right side, regardless of what they are
+  # aligned with.
+# def _set_context( +# self, left: exp.Expression.OR_SYMBOL, right: exp.Expression.OR_SYMBOL +# ) -> 'Alignment': +# if left.is_any() and right.is_any(): +# return Alignment.ANY +# return Alignment.simple(left, right) + +# @classmethod +# def rule( +# cls, +# alias: str = '', +# left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# preceding_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# preceding_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# following_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# following_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# from_bos: bool = False, +# to_eos: bool = False, +# operation: op.Operation = op.Operation.COMMON.alignable, +# priority: int = 0, +# applied_cost: Union[float, ty.Nothing] = ty.UNSPECIFIED, +# source: Source = Source.UNSPECIFIED, +# ) -> 'Alignment': +# rule = cls() +# rule.alias = alias +# rule.left = rule._set_side(left) +# rule.right = rule._set_side(right) +# rule.preceding = rule._set_context(preceding_left, preceding_right) +# rule.following = rule._set_context(following_left, following_right) +# rule.from_bos = from_bos +# rule.to_eos = to_eos +# rule.operation = operation +# rule.priority = priority +# if isinstance(applied_cost, float): +# rule.applied_cost = applied_cost +# else: +# rule.applied_cost = rule.operation.base_cost +# rule.source = source +# return rule + +# @classmethod +# def deletion( +# cls, +# alias: str = '', +# left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# preceding_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# preceding_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# following_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# following_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# from_bos: bool = False, +# to_eos: bool = False, +# operation: op.Operation = op.Operation.COMMON.deletion, +# priority: int = 0, +# applied_cost: Union[int, float, 
ty.Nothing] = ty.UNSPECIFIED, +# source: Source = Source.UNSPECIFIED, +# ) -> 'Alignment': +# return cls.rule( +# alias, +# left, +# exp.Atomic.CTRL.eps, +# preceding_left, +# preceding_right, +# following_left, +# following_right, +# from_bos, +# to_eos, +# operation, +# priority, +# applied_cost, +# source, +# ) + +# @classmethod +# def insertion( +# cls, +# alias: str = '', +# right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# preceding_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# preceding_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# following_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# following_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# from_bos: bool = False, +# to_eos: bool = False, +# operation: op.Operation = op.Operation.COMMON.insertion, +# priority: int = 0, +# applied_cost: Union[int, float, ty.Nothing] = ty.UNSPECIFIED, +# source: Source = Source.UNSPECIFIED, +# ) -> 'Alignment': +# return cls.rule( +# alias, +# exp.Atomic.CTRL.eps, +# right, +# preceding_left, +# preceding_right, +# following_left, +# following_right, +# from_bos, +# to_eos, +# operation, +# priority, +# applied_cost, +# source, +# ) + +# @classmethod +# def interchangeable( +# cls, +# alias: str = '', +# left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# preceding_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# preceding_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# following_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# following_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, +# from_bos: bool = False, +# to_eos: bool = False, +# operation: op.Operation = op.Operation.COMMON.interchangeable, +# priority: int = 0, +# applied_cost: Union[int, float, ty.Nothing] = ty.UNSPECIFIED, +# source: Source = Source.UNSPECIFIED, +# ) -> tuple['Alignment', 'Alignment']: +# common = ( +# preceding_left, +# preceding_right, +# following_left, +# 
following_right, +# from_bos, +# to_eos, +# operation, +# priority, +# applied_cost, +# source, +# ) +# left_to_right = cls.rule(alias + '_l2r', left, right, *common) +# right_to_left = cls.rule(alias + '_r2l', right, left, *common) +# return left_to_right, right_to_left + +# def is_any(self) -> bool: +# return self.left.is_any() and self.right.is_any() + +# def is_eps(self) -> bool: +# return self.left.is_eps() and self.right.is_eps() + +# def is_nor(self) -> bool: +# return self.left.is_nor() and self.right.is_nor() + +# def _copy_context( +# self, context: exp.Expression +# ) -> tuple[exp.Expression, exp.Expression]: +# if isinstance(context, Alignment): +# return context.left.copy(), context.right.copy() +# return exp.Expression.ANY, exp.Expression.ANY + +# def copy(self) -> 'Alignment': +# if ( +# self == Alignment.ANY +# or self == Alignment.EPSILON +# or self == Alignment.ERROR +# ): +# return self +# return Alignment.rule( +# self.alias, +# self.left.copy(), +# self.right.copy(), +# *self._copy_context(self.preceding), +# *self._copy_context(self.following), +# self.from_bos, +# self.to_eos, +# self.operation, +# self.priority, +# self.applied_cost, +# self.source, +# ) + + +# Alignment.ANY = Alignment.constant('any') +# Alignment.EPSILON = Alignment.constant( +# 'empty', exp.Atomic.CTRL.eps, exp.Atomic.CTRL.eps +# ) +# Alignment.ERROR = Alignment.constant( +# 'error', +# exp.Atomic.CTRL.nor, exp.Atomic.CTRL.nor, op.Operation.COMMON.error +# ) diff --git a/nisaba/scripts/natural_translit/utils/alignment2_test.py b/nisaba/scripts/natural_translit/utils/alignment2_test.py new file mode 100644 index 00000000..90cc8689 --- /dev/null +++ b/nisaba/scripts/natural_translit/utils/alignment2_test.py @@ -0,0 +1,125 @@ +# Copyright 2024 Nisaba Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from absl.testing import absltest
+from nisaba.scripts.natural_translit.utils import alignment2 as alg
+from nisaba.scripts.natural_translit.utils import expression as exp
+from nisaba.scripts.natural_translit.utils import operation as op
+from nisaba.scripts.natural_translit.utils import symbol as sym
+from nisaba.scripts.natural_translit.utils import test_op
+
+
+def _basic_sym(char):
+  """Returns a Symbol whose alias, text and raw form are all `char`."""
+  return sym.Symbol(alias=char, text=char, raw=char)
+
+
+# Symbols shared by the tests: one named NULL symbol plus the letters a-d.
+_SYM = sym.Symbol.Inventory(
+    'symbol',
+    sym.Symbol('nul', '◌', index=123, name='NULL'),
+    _basic_sym('a'),
+    _basic_sym('b'),
+    _basic_sym('c'),
+    _basic_sym('d'),
+)
+
+
+def _atomic_inventory() -> exp.Atomic.Inventory:
+  """Returns an Atomic inventory populated from the symbols in `_SYM`."""
+  atomics = exp.Atomic.Inventory('atomic')
+  # The loop variable is not called `sym`, so that it does not shadow the
+  # imported `sym` module inside the comprehension.
+  atomics.make_suppl(
+      'atm_sym', {exp.Atomic.read(symbol): symbol for symbol in _SYM}
+  )
+  # Iterating the `atm_sym` dict yields its keys, i.e. the atomics read above.
+  atomics.add_symbols(*atomics.atm_sym)
+  return atomics
+
+
+_ATM = _atomic_inventory()
+
+
+class AlignmentTest(test_op.TestCase):
+  """Tests for `alignment2.Alignment`."""
+
+  def test_alignment(self):
+    """A freshly constructed alignment has the `unassigned` operation."""
+    alignment = alg.Alignment('test')
+    self.assertEqual(alignment.operation, op.Operation.COMMON.unassigned)
+
+  # def test_simple(self):
+  #   simple = alg.Alignment.simple(_SYM.a, _ATM.b + _ATM.c)
+  #   self.assertIsInstance(simple.left, exp.Atomic)
+  #   self.assertEqual(simple.operation, op.Operation.COMMON.unassigned)
+  #   self.AssertStrEqual(simple, '(a:(b c))')
+
+  # def test_constants(self):
+  #   self.assertTrue(alg.Alignment.ANY.is_any())
+  #   self.AssertStrEqual(alg.Alignment.ANY, '(🝓*:🝓*)')
+  #   self.assertTrue(alg.Alignment.ANY.preceding.is_any())
+  #   self.assertEqual(alg.Alignment.ANY.source, alg.Alignment.Source.CONSTANT)
+
# self.assertTrue(alg.Alignment.EPSILON.is_eps()) + # self.AssertStrEqual(alg.Alignment.EPSILON, '(⍷:⍷)') + # self.assertTrue(alg.Alignment.ERROR.is_nor()) + # self.AssertStrEqual(alg.Alignment.ERROR, '(⍜:⍜)') + # self.assertEqual(alg.Alignment.ERROR.operation, op.Operation.COMMON.error) + + # def test_rule(self): + # rule = alg.Alignment.rule( + # 'test', + # _ATM.a, + # _ATM.b, + # preceding_left=_ATM.c, + # following_right=_ATM.d, + # applied_cost=0.1, + # ) + # self.AssertEquivalent(rule.left, _ATM.a) + # self.AssertEquivalent(rule.right, _ATM.b) + # self.assertEqual(rule.operation, op.Operation.COMMON.alignable) + # self.AssertStrEqual(rule, '([c:🝓*] a:b [🝓*:d], alignable (0.000))') + # self.AssertStrEqual(rule.tsv_row(), 'test a b alignable (0.000) 0.1') + + # def test_copy(self): + # rule1 = alg.Alignment.rule( + # 'test', + # _ATM.a, + # _ATM.b, + # preceding_left=_ATM.c, + # applied_cost=0.1, + # ) + # rule2 = rule1.copy() + # self.assertIs(alg.Alignment.ERROR.copy(), alg.Alignment.ERROR) + # self.assertIs(alg.Alignment().copy().preceding, alg.Alignment.ANY) + # self.assertIs(rule2.following, alg.Alignment.ANY) + # self.AssertStrEqual(rule1, rule2) + # self.assertEqual(rule1.applied_cost, rule2.applied_cost) + # self.assertEqual(rule1.source, rule2.source) + + # def test_deletion(self): + # rule = alg.Alignment.deletion( + # 'a_deletion', + # _ATM.a, + # preceding_right=_ATM.b, + # from_bos=True, + # ) + # self.AssertStrEqual(rule, '([⍄🝓*:b] a:⍷, deletion (1.000))') + + # def test_insertion(self): + # rule = alg.Alignment.insertion( + # 'a_insertion', + # _ATM.a, + # following_right=_ATM.b, + # to_eos=True, + # ) + # self.AssertStrEqual(rule, '(⍷:a [🝓*:b⍃], insertion (1.000))') + + # def test_interchangeable(self): + # rule1, rule2 = alg.Alignment.interchangeable('a_b', _ATM.a, _ATM.b) + # self.AssertStrEqual(rule1, '(a:b, interchangeable (0.100))') + # self.AssertStrEqual(rule2, '(b:a, interchangeable (0.100))') + +if __name__ == '__main__': + 
absltest.main() diff --git a/nisaba/scripts/natural_translit/utils/expression.py b/nisaba/scripts/natural_translit/utils/expression.py index 9c1a1b65..02cdc270 100644 --- a/nisaba/scripts/natural_translit/utils/expression.py +++ b/nisaba/scripts/natural_translit/utils/expression.py @@ -32,6 +32,8 @@ def __init__(self, alias: str = ''): self.index = hash(self) def __str__(self) -> str: + if self.is_any(): + return '🝓*' # U+1F753 return self.text def _str_items_list(self, *items: ...) -> list[str]: @@ -326,6 +328,8 @@ def is_suffix(self, other: 'Expression.OR_SYMBOL') -> bool: return self._symbol_contains(other) def copy(self) -> 'Expression': + if self == Expression.ANY: + return self return Expression(self.alias) def __add__(self, other: 'Expression') -> 'Cat': diff --git a/nisaba/scripts/natural_translit/utils/expression_test.py b/nisaba/scripts/natural_translit/utils/expression_test.py index 28331e1e..ad72b2c8 100644 --- a/nisaba/scripts/natural_translit/utils/expression_test.py +++ b/nisaba/scripts/natural_translit/utils/expression_test.py @@ -49,10 +49,12 @@ def test_atomic_read(self): self.assertIs(exp.Atomic.read(exp.Atomic.CTRL.unk), exp.Atomic.CTRL.unk) self.assertIs(_ATM.a.symbol, _SYM.a) - def test_control(self): + def test_constants(self): self.assertTrue(exp.Atomic.CTRL.unk.is_control()) self.assertTrue(exp.Atomic.CTRL.eps.is_eps()) self.assertTrue(exp.Atomic.CTRL.nor.is_nor()) + self.assertTrue(exp.Expression.ANY.is_any()) + self.AssertStrEqual(exp.Expression.ANY, '🝓*') def test_symbol_inventory_lookup(self): self.assertEqual(_ATM.lookup(_ATM.a, 'atm_sym'), _SYM.a) @@ -160,6 +162,7 @@ def test_copy(self): exp1_copy = exp1.copy() cat1 = exp.Cat(_ATM.a, _ATM.b) cat1_copy = cat1.copy() + self.assertIs(exp.Expression.ANY.copy(), exp.Expression.ANY) self.assertIsNot(exp1, exp1_copy) self.assertIs(exp.Atomic.CTRL.eps.copy(), exp.Atomic.CTRL.eps) self.assertIsNot(_ATM.a.copy(), _ATM.a) diff --git a/nisaba/scripts/natural_translit/utils/operation.py 
b/nisaba/scripts/natural_translit/utils/operation.py index dcfcb182..dcc84630 100644 --- a/nisaba/scripts/natural_translit/utils/operation.py +++ b/nisaba/scripts/natural_translit/utils/operation.py @@ -98,7 +98,7 @@ def __init__(self, alias: str, cost: float): self.unexpected = self def __str__(self): - return '%s (%d)' % (self.text, self.base_cost) + return '%s (%.3f)' % (self.text, self.base_cost) def is_assigned(self) -> bool: return not ( @@ -193,7 +193,14 @@ def add_operations(self, *operations: 'Operation') -> None: self.add_item(operation) def __str__(self): - return '\n'.join([operation.text for operation in self]) + return ( + '\n'.join([ + str(operation) + for operation in self + if operation.match == operation + ]) + + '\n' + ) # Current base costs in the common operation inventory are rough estimations and diff --git a/nisaba/scripts/natural_translit/utils/operation_test.py b/nisaba/scripts/natural_translit/utils/operation_test.py index 2cc9f9d8..f8317c92 100644 --- a/nisaba/scripts/natural_translit/utils/operation_test.py +++ b/nisaba/scripts/natural_translit/utils/operation_test.py @@ -63,12 +63,24 @@ def test_inventory(self): self.assertEqual(_OPS.alignable.inventory, _OPS) self.assertEqual(_OPS.index_lookup(101), _OPS.alignable) self.assertEqual(_OPS.index_lookup(120), _OPS.error) + self.assertEqual( + str(_OPS), + 'alignable (0.000)\n' + 'boundary (0.125)\n' + 'deletion (1.000)\n' + 'error (100.000)\n' + 'identity (0.100)\n' + 'insertion (1.000)\n' + 'interchangeable (0.100)\n' + 'punctuation (0.250)\n' + 'substitution (1.250)\n' + 'unassigned (10.000)\n', + ) def test_add_operations(self): - test = op.Operation('test', 1) - _OPS.add_operations(test) - self.assertIn(test, _OPS) - self.assertFalse(test.inventory, _OPS) + ops2 = op.Operation.Inventory('copy', *_OPS) + self.assertIn(_OPS.alignable, ops2) + self.assertNotEqual(_OPS.alignable.inventory, ops2) if __name__ == '__main__':