Skip to content

Commit

Permalink
[natural_translit] Add English letters to Latin inventory and English…
Browse files Browse the repository at this point in the history
… language params.

PiperOrigin-RevId: 720686055
  • Loading branch information
isingoo authored and copybara-github committed Jan 30, 2025
1 parent 018a0d5 commit 4dd30df
Show file tree
Hide file tree
Showing 12 changed files with 3,606 additions and 99 deletions.
10 changes: 10 additions & 0 deletions nisaba/scripts/natural_translit/language_params/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,16 @@ py_library(
],
)

# Language parameters for English (en.py).
py_library(
    name = "en",
    srcs = ["en.py"],
    deps = [
        "//nisaba/scripts/natural_translit/phonology/inventories:x_mul",
        "//nisaba/scripts/natural_translit/script:grapheme",
        "//nisaba/scripts/natural_translit/script/inventories:latn",
    ],
)

py_library(
name = "gu",
srcs = ["gu.py"],
Expand Down
231 changes: 231 additions & 0 deletions nisaba/scripts/natural_translit/language_params/en.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
# Copyright 2024 Nisaba Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Language parameters for English."""

from nisaba.scripts.natural_translit.phonology.inventories import x_mul
from nisaba.scripts.natural_translit.script import grapheme as g
from nisaba.scripts.natural_translit.script.inventories import latn as l


def _grapheme_inventory() -> g.Grapheme.Inventory:
  """Builds the grapheme inventory for English.

  The 26 basic Latin letters are imported as lower/upper case pairs. Each
  lowercase letter then receives descriptive features from the symbols of its
  common one-to-one grapheme-phoneme mappings, each uppercase letter receives
  the descriptives of its lowercase counterpart, and the `letter`, `vowel`
  and `consonant` supplements are registered.

  Returns:
    The result of `sync_atomics` called with the upper, lower, letter, vowel
    and consonant supplements of the new inventory.
  """
  latn = l.GRAPHEMES
  ph = x_mul.INVENTORY
  gr_features = g.Grapheme.GR_FEATURES
  gr = g.Grapheme.Inventory(gr_features.script.latn, 'en')

  # (lowercase, uppercase) pairs for a-z, in alphabetical order.
  case_pairs = [
      (letter, letter.upper)
      for letter in (
          getattr(latn, name) for name in 'abcdefghijklmnopqrstuvwxyz'
      )
  ]
  gr.import_as_feature_pairs(
      gr_features.case.lower, gr_features.case.upper, *case_pairs
  )
  gr.make_iterable_suppl('letter', *gr.upper, *gr.lower)

  # Descriptive features from common one-to-one grapheme-phoneme mappings.
  # Many-to-many values for substrings, universal mappings such as vowel
  # reduction, and common phonological operations such as palatalization
  # will be matched through g2p/g2g alignables and/or built-in phonological
  # rules.
  #
  # Initial values retrieved on 2025-01-24 from:
  # https://en.wikipedia.org/wiki/English_orthography#Spelling-to-sound_correspondences
  #
  # Each entry pairs a lowercase grapheme with the symbols whose descriptives
  # it takes. Entries are applied in list order; `gr.y` refers to `gr.i`, so
  # the `gr.i` entry must stay before the `gr.y` entry.
  mappings = [
      # Vowels.
      # Technically all durations are {short, long} through union of lax and
      # tense/heavy phonemes, but it's left as {any} for now.
      (
          # lax: man, tense: mane, heavy: mar, heavy-r: mare
          gr.a,
          (
              ph.ae,  # lax nucleus
              ph.e,  # tense nucleus
              ph.iy,  # tense glide
              ph.aw,  # heavy nucleus
              ph.eh,  # heavy-r nucleus
              ph.ec,  # heavy-r glide
          ),
      ),
      (
          # lax: met, tense: meet, heavy: her, heavy-r: here
          gr.e,
          (
              ph.eh,  # lax nucleus
              ph.i,  # tense nucleus
              ph.ex,  # heavy nucleus
              ph.iy,  # heavy-r nucleus
              ph.ec,  # heavy-r glide
          ),
      ),
      (
          # lax: win, tense: wine, heavy: fir, heavy-r: fire
          gr.i,
          (
              ph.iy,  # lax nucleus; tense, heavy-r glide
              ph.a,  # tense, heavy-r nucleus
              ph.ex,  # heavy nucleus
              ph.ec,  # heavy-r second glide
          ),
      ),
      (
          # lax: mop, tense: mope, heavy: for, heavy-r: fore
          gr.o,
          (
              ph.ow,  # lax nucleus
              ph.o,  # tense nucleus
              ph.uv,  # tense glide
              ph.oh,  # heavy, heavy-r nucleus
          ),
      ),
      (
          # lax: hug, push, tense: huge, heavy: cur, heavy-r: cure
          gr.u,
          (
              ph.ah,  # lax nucleus
              ph.uv,  # lax, heavy-r nucleus
              ph.u,  # tense nucleus
              ph.ec,  # heavy-r glide
          ),
      ),
      # Consonants.
      (gr.b, (ph.b,)),
      (
          gr.c,
          (
              ph.s,  # city
              ph.k,  # cat
          ),
      ),
      (gr.d, (ph.d,)),  # dog
      (gr.f, (ph.f,)),  # fine
      (
          gr.g,
          (
              ph.g,  # get
              ph.d,  # gin stop
              ph.zh,  # gin fricative
          ),
      ),
      (gr.h, (ph.h,)),  # honey
      (
          gr.j,
          (
              ph.d,  # jump stop
              ph.zh,  # jump fricative
              ph.y,  # hallelujah
              ph.h,  # jalapeno
          ),
      ),
      (gr.k, (ph.k,)),  # key
      (gr.l, (ph.l,)),  # line
      (gr.m, (ph.m,)),  # mine
      (gr.n, (ph.n,)),  # name
      (gr.p, (ph.p,)),  # pill
      (gr.q, (ph.k,)),  # quick
      (gr.r, (ph.r,)),  # red
      (
          gr.s,
          (
              ph.s,  # saw
              ph.z,  # prison
              ph.sh,  # sugar
              ph.zh,  # vision
          ),
      ),
      (
          gr.t,
          (
              ph.t,  # ten, righteous stop
              ph.sh,  # ration, righteous fricative
              ph.zh,  # equation
          ),
      ),
      (gr.v, (ph.v,)),  # vine
      (gr.w, (ph.w,)),  # water
      (
          gr.x,
          (
              ph.k,  # box, anxious, luxurious(gb) stop
              ph.s,  # box fricative
              ph.g,  # anxiety, luxurious(us) stop
              ph.z,  # anxiety fricative
              ph.zh,  # luxurious fricative
              ph.sh,  # anxious fricative
          ),
      ),
      (
          gr.y,
          (
              ph.y,  # yes
              gr.i,  # flynn, fry, fyrd, pyre
          ),
      ),
      (
          gr.z,
          (
              ph.z,  # zoo
              ph.zh,  # seizure
              ph.t,  # schizophrenia stop
              ph.s,  # schizophrenia fricative
          ),
      ),
  ]
  for grapheme, symbols in mappings:
    grapheme.update_descriptives_from_symbol(*symbols)

  # Uppercase letters take the descriptives of their lowercase counterparts.
  for capital in gr.upper:
    capital.update_descriptives_from_symbol(capital.lower)

  # `y` is listed both as a vowel and as a consonant.
  vowels = [gr.a, gr.e, gr.i, gr.o, gr.u, gr.y]
  consonants = [getattr(gr, name) for name in 'bcdfghjklmnpqrstvwxyz']
  upper_vowels = [vowel.upper for vowel in vowels]
  upper_consonants = [consonant.upper for consonant in consonants]
  gr.make_iterable_suppl('vowel', *vowels, *upper_vowels)
  gr.make_iterable_suppl('consonant', *consonants, *upper_consonants)

  supplements = [gr.upper, gr.lower, gr.letter, gr.vowel, gr.consonant]
  return gr.sync_atomics(supplements)


# English grapheme inventory, built once when this module is imported.
GRAPHEMES = _grapheme_inventory()
56 changes: 33 additions & 23 deletions nisaba/scripts/natural_translit/phonology/phonological_symbol.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

"""Classes for phonological symbols."""
from __future__ import annotations

from nisaba.scripts.natural_translit.phonology import descriptive_features
from nisaba.scripts.natural_translit.utils import expression as exp
Expand Down Expand Up @@ -41,14 +42,31 @@ def __init__(
ft.Feature.Profile(self.PH_DESCRIPTIVE_FEATURES, 'new')
)

def descriptives(self) -> ft.Feature.Profile:
  """Returns the `phonology_descriptive` feature profile of this symbol."""
  symbol_features = self.features
  return symbol_features.phonology_descriptive

def update_descriptives(
    self, *features: ft.Feature.ITERABLE
) -> PhonologicalSymbol:
  """Updates the descriptive feature profile with the given features.

  Args:
    *features: Features passed on to the `update` method of the profile
      returned by `descriptives()`.

  Returns:
    This symbol, so that calls can be chained.
  """
  profile = self.descriptives()
  profile.update(*features)
  return self

def update_descriptives_from_symbol(
    self, *symbols: PhonologicalSymbol
) -> PhonologicalSymbol:
  """Updates the descriptives from the union of the given symbols.

  Args:
    *symbols: Symbols whose `descriptives()` profiles are passed, in order,
      to `update_descriptives`.

  Returns:
    This symbol, so that calls can be chained.
  """
  profiles = [symbol.descriptives() for symbol in symbols]
  self.update_descriptives(*profiles)
  return self

class Inventory(sym.Symbol.Inventory):
"""Phonological symbol inventory."""

def __init__(self, alias: str, typed: ty.TypeOrNothing = ty.UNSPECIFIED):
super().__init__(alias, typed=ty.type_check(typed, PhonologicalSymbol))
self.atomics = i.Inventory()

def _add_symbol_and_atomic(self, symbol: 'PhonologicalSymbol') -> bool:
def _add_symbol_and_atomic(self, symbol: PhonologicalSymbol) -> bool:
"""Adds a phonological symbol to the inventory."""
return self._add_symbol(symbol) and self.atomics.add_item(
exp.Atomic.get_instance(symbol)
Expand All @@ -59,34 +77,33 @@ def or_from_suppl(self, suppl: ty.IterableThing) -> bool:
return self.atomics.add_suppl(exp.Or(*suppl, alias=suppl.alias))

def sync_atomics(
self, or_suppls: ty.ListOrNothing = ty.UNSPECIFIED
) -> 'PhonologicalSymbol.Inventory':
self, update_ors_from_suppls: ty.ListOrNothing = ty.UNSPECIFIED
) -> PhonologicalSymbol.Inventory:
"""Syncs the atomic inventory with the symbol inventory.
Updates the features of the atomic instance of each phonological symbol
to match the features of the symbol. Optionally updates the members of
Or supplements in atomics to include all members of the given supplements
in the list. For example, if an `inventory.vowel` iterable and a
corresponding `inventory.atomics.vowel` Or were initiated as `[a, e, i]`
and `(a | e | i)`, and later `[o, u]` was added to `inventory.vowel`,
this function will update `inventory.atomics.vowel` to
and `(a | e | i)` respectively, and later `[o, u]` was added to
`inventory.vowel`, this function will update `inventory.atomics.vowel` to
`(a | e | i | o | u)`.
Updates the atomic inventory with the supplements.
Args:
or_suppls: Optional list iterable supplements. If specified, the
corresponding Or supplement of the given supplements will be updated to
include all symbols.
update_ors_from_suppls: Optional list of iterable supplements. When
specified,
- if there's no corresponding Or in the atomics, a new one is created.
- if there is a corresponding Or, it's updated to include all symbols
in the given supplement.
Returns:
The inventory.
"""
for atomic in self.atomics:
for profile in atomic.features:
profile.update(atomic.symbol.features.get(profile.inventory))
for suppl in ty.type_check(or_suppls, []):
for suppl in ty.type_check(update_ors_from_suppls, []):
if suppl.alias not in self.atomics.suppl_aliases:
self.or_from_suppl(suppl)
self.atomics.get(suppl.alias).add(*suppl)
Expand Down Expand Up @@ -130,7 +147,7 @@ def copy(
language: str = '',
alias: str = '',
ipa: str = '',
) -> 'Phon':
) -> Phon:
"""Creates a copy of the Phon."""
return Phon(
language=language if language else self.language,
Expand All @@ -141,13 +158,6 @@ def copy(
features=self.features.copy(),
)

def update_descriptives(
    self, *features: ft.Feature.ITERABLE
) -> 'Phon':
  """Updates the descriptive features of the Phon.

  Args:
    *features: Features passed on to the `update` method of the
      `phonology_descriptive` profile.

  Returns:
    This Phon, so that calls can be chained.
  """
  profile = self.features.phonology_descriptive
  profile.update(*features)
  return self

class Inventory(PhonologicalSymbol.Inventory):
"""Phon inventory."""

Expand All @@ -156,14 +166,14 @@ def __init__(self, language: str = ''):
super().__init__(alias=language, typed=Phon)
self.language = language

def _add_phoneme(self, phoneme: 'Phon') -> bool:
def _add_phoneme(self, phoneme: Phon) -> bool:
"""Adds a phoneme to the inventory."""
phoneme.index = Phon.ReservedIndex.PHONEME_PREFIX + len(self) + 1
return self._add_symbol_and_atomic(phoneme)

def add_phonemes(
self, *phonemes: 'Phon', list_alias: str = ''
) -> list['Phon']:
self, *phonemes: Phon, list_alias: str = ''
) -> list[Phon]:
phs = [ph for ph in phonemes if self._add_phoneme(ph)]
if list_alias:
self.make_iterable_suppl(list_alias, *phs)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def _test_inventory() -> po.Phon.Inventory:
ph_inv.make_iterable_suppl('close_like', ph_inv.e)
ph_inv.or_from_suppl(ph_inv.close_like)
ph_inv.close_like.add(ph_inv.i)
return ph_inv.sync_atomics(or_suppls=[ph_inv.vowel, ph_inv.close_like])
return ph_inv.sync_atomics([ph_inv.vowel, ph_inv.close_like])


_TEST = _test_inventory()
Expand Down
4 changes: 1 addition & 3 deletions nisaba/scripts/natural_translit/script/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,9 @@ py_library(
name = "grapheme",
srcs = ["grapheme.py"],
deps = [
"//nisaba/scripts/natural_translit/phonology:descriptive_features",
"//nisaba/scripts/natural_translit/utils:expression",
"//nisaba/scripts/natural_translit/phonology:phonological_symbol",
"//nisaba/scripts/natural_translit/utils:feature",
"//nisaba/scripts/natural_translit/utils:inventory",
"//nisaba/scripts/natural_translit/utils:symbol",
"//nisaba/scripts/natural_translit/utils:type_op",
requirement("pycountry"),
],
Expand Down
Loading

0 comments on commit 4dd30df

Please sign in to comment.