Merge pull request #19 from bretttolbert/catalan-contd
improved Catalan support
bretttolbert authored Dec 22, 2023
2 parents a3c8591 + 4294660 commit 006e7c9
Showing 38 changed files with 5,465 additions and 2,876 deletions.
38 changes: 38 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,43 @@
# verbecc Changelog

- 1.9.3 [22 December 2023]
- Improved Catalan support
- Added more missing verb conjugation templates
- Can now conjugate 8578 verbs using 42 templates
- TODO: Still missing templates for 38 out of 8616 verbs
- See `test_inflector_ca.test_all_verbs_have_templates`
- Made Catalan inflector template matching looser (a sketch of the matching rule follows this entry)
- Added `verbecc.string_utils.get_common_letter_count`
- A template now matches if its ending has at least len()-1 characters in common with the infinitive ending (accents not counted)
- E.g. not only do `aure` and `eure` match, but so do `çar` and `cer`
- Enhanced conjugation template syntax: now supports stem-changing verbs through two new features:
- 1. New stem-modifying XML attribute: `modify-stem="strip-accents"`
- 2. Stem-modifying delete operator '`-`'
- e.g.
```xml
<template name="conèix:er" modify-stem="strip-accents">
  <passat-simple>
    <p><i>--guí</i></p>
  </passat-simple>
  <!-- ... -->
</template>
```
- With the above template, `conèix` + `--guí` = `jo coneguí` and `reconèix` + `--guí` = `jo reconeguí` (accents are stripped from the stem and each leading `-` deletes one trailing stem character; see the sketch after this entry)
- Added `gender` flag to support feminine pronouns
- Modified to put a `-` placeholder in the conjugation output for persons/tenses that aren't conjugated, e.g. for the impersonal verb `caldre`:
```python
"caldre",
"indicatiu",
"present",
False,
"f",
[
"-",
"-",
"ella cal",
"-",
"-",
"elles calen",
]
```
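
A minimal sketch of the looser ending-matching rule described above. It assumes `get_common_letter_count` compares the two endings letter by letter from the right after stripping accents; the exact counting logic inside `verbecc.string_utils` may differ.

```python
import unicodedata


def strip_accents(s: str) -> str:
    """Remove accent marks, e.g. 'çar' -> 'car', 'conèix' -> 'coneix'."""
    return "".join(
        c for c in unicodedata.normalize("NFD", s) if not unicodedata.combining(c)
    )


def get_common_letter_count(a: str, b: str) -> int:
    """Count aligned letters (compared from the right) that two endings share, ignoring accents."""
    a, b = strip_accents(a), strip_accents(b)
    return sum(1 for x, y in zip(reversed(a), reversed(b)) if x == y)


def endings_match(template_ending: str, infinitive_ending: str) -> bool:
    """Looser rule: all but at most one character of the template ending must match."""
    return get_common_letter_count(template_ending, infinitive_ending) >= len(template_ending) - 1


# 'çar' vs 'cer' -> stripped to 'car' vs 'cer': 2 of 3 letters agree, so they match
assert endings_match("çar", "cer")
assert endings_match("aure", "eure")
```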
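
A companion sketch of how the two stem-modification features could combine when a template ending is applied. `apply_ending` and `strip_stem_accents` are hypothetical names used only for illustration, not verbecc's actual API.

```python
import unicodedata


def strip_accents(s: str) -> str:
    return "".join(
        c for c in unicodedata.normalize("NFD", s) if not unicodedata.combining(c)
    )


def apply_ending(stem: str, ending: str, strip_stem_accents: bool = False) -> str:
    """Apply a template ending to a stem: modify-stem="strip-accents" de-accents the
    stem first, then each leading '-' deletes one character from the end of the stem."""
    if strip_stem_accents:
        stem = strip_accents(stem)
    while ending.startswith("-"):
        stem = stem[:-1]
        ending = ending[1:]
    return stem + ending


# conèix -> coneix (accents stripped) -> cone ('--' deletes two chars) + 'guí' = coneguí
assert apply_ending("conèix", "--guí", strip_stem_accents=True) == "coneguí"
assert apply_ending("reconèix", "--guí", strip_stem_accents=True) == "reconeguí"
```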

- 1.9.2 [17 December 2023]
- Fixed bug in new `localization` module
- Renamed `localize_mood` and `localize_tense` to `xmood` and `xtense`
85 changes: 47 additions & 38 deletions pyproject.toml
@@ -4,55 +4,61 @@ build-backend = "setuptools.build_meta"

[project]
name = "verbecc"
version = "1.9.2"
version = "1.9.3"
dependencies = [
"lxml",
"lxml-stubs",
"cython",
"numpy",
"scipy",
"scikit-learn"
"scikit-learn",
]
requires-python = ">=3.8"
authors = [
{name = "Brett Tolbert", email = "[email protected]"},
]
maintainers = [
{name = "Brett Tolbert", email = "[email protected]"}
]
authors = [{ name = "Brett Tolbert", email = "[email protected]" }]
maintainers = [{ name = "Brett Tolbert", email = "[email protected]" }]
description = "Verbs Completely Conjugated: machine learning conjugator for Catalan, French, Italian, Portuguese, Romanian and Spanish"
readme = "README.md"
license = {file = "LICENSE.txt"}
keywords= ['verb,', 'conjugator', 'conjugation', 'Catalan', 'French', 'Italian', 'Portuguese', 'Romanian', 'Spanish']
license = { file = "LICENSE.txt" }
keywords = [
'verb,',
'conjugator',
'conjugation',
'Catalan',
'French',
'Italian',
'Portuguese',
'Romanian',
'Spanish',
]
classifiers = [
'Development Status :: 5 - Production/Stable',
'Topic :: Software Development :: Libraries :: Python Modules',
'Topic :: Text Processing :: Linguistic',
'Topic :: Education',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Utilities',
'Intended Audience :: Developers',
'Intended Audience :: End Users/Desktop',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: GNU General Public License (GPL)',
'Natural Language :: Catalan',
'Natural Language :: French',
'Natural Language :: Italian',
'Natural Language :: Portuguese',
'Natural Language :: Romanian',
'Natural Language :: Spanish',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11'
'Development Status :: 5 - Production/Stable',
'Topic :: Software Development :: Libraries :: Python Modules',
'Topic :: Text Processing :: Linguistic',
'Topic :: Education',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Utilities',
'Intended Audience :: Developers',
'Intended Audience :: End Users/Desktop',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: GNU General Public License (GPL)',
'Natural Language :: Catalan',
'Natural Language :: French',
'Natural Language :: Italian',
'Natural Language :: Portuguese',
'Natural Language :: Romanian',
'Natural Language :: Spanish',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
]

[project.urls]
@@ -67,3 +73,6 @@ trained_models = ['verbecc/data/models/*']

[tool.setuptools.packages]
find = {}

[project.scripts]
verbecc-train-models = 'verbecc.utils:train_models'
@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-

from lxml import etree, objectify
import os

@@ -8,35 +6,38 @@
(e.g. compound tenses) from the mlconjug conjugation XML files
"""

working_dir = '../verbecc/data'
working_dir = "../verbecc/data"
in_file = "conjugations-ro.xml"
out_file = "conjugations-ro.mod.xml"


def remove_mood_tense(root, tenses_to_remove):
removed_elem_cnt = 0
for template_elem in root:
if template_elem.tag == 'template':
if template_elem.tag == "template":
for mood_elem in template_elem:
for tense_elem in mood_elem:
if tense_elem.tag in tenses_to_remove:
mood_elem.remove(tense_elem)
removed_elem_cnt += 1
print("removed {} elements".format(removed_elem_cnt))


def remove_mood(root, moods_to_remove):
removed_elem_cnt = 0
for template_elem in root:
if template_elem.tag == 'template':
if template_elem.tag == "template":
for mood_elem in template_elem:
if mood_elem.tag in moods_to_remove:
template_elem.remove(mood_elem)
removed_elem_cnt += 1
print("removed {} elements".format(removed_elem_cnt))


def move_tense(root, tense_name, old_mood, new_mood, remove_old_mood):
moved_elem_cnt = 0
for template_elem in root:
if template_elem.tag == 'template':
if template_elem.tag == "template":
tense_elem_to_move = None
# find tense to move
for mood_elem in template_elem:
@@ -56,19 +57,27 @@ def move_tense(root, tense_name, old_mood, new_mood, remove_old_mood):
moved_elem_cnt += 1
print("moved {} elements".format(moved_elem_cnt))


def main():
parser = etree.XMLParser(dtd_validation=False, encoding='utf-8')
parser = etree.XMLParser(dtd_validation=False, encoding="utf-8")
tree = etree.parse(os.path.join(working_dir, in_file), parser)
root = tree.getroot()

remove_mood(root, ['Conditional'])
#remove_mood_tense(root, ['perfect'])
#move_tense(root, 'Viitor-II-popular', 'Viitor', 'Indicativ', True)

with open(os.path.join(working_dir, out_file), 'wb') as f:
remove_mood(root, ["Conditional"])
# remove_mood_tense(root, ['perfect'])
# move_tense(root, 'Viitor-II-popular', 'Viitor', 'Indicativ', True)

with open(os.path.join(working_dir, out_file), "wb") as f:
objectify.deannotate(root, cleanup_namespaces=True)
xml = etree.tostring(tree, encoding='utf-8', method='xml', pretty_print=True, xml_declaration=True)
xml = etree.tostring(
tree,
encoding="utf-8",
method="xml",
pretty_print=True,
xml_declaration=True,
)
f.write(xml)

if __name__ == '__main__':

if __name__ == "__main__":
main()
2 changes: 0 additions & 2 deletions tests/test_conjugation_template.py
@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-

from unittest.mock import patch

import pytest
37 changes: 18 additions & 19 deletions tests/test_conjugator.py
@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-

import pytest
from verbecc import conjugator
from verbecc import string_utils
@@ -693,16 +691,17 @@ def test_conjugator_conjugate_imperatif_passe_with_etre():
},
}

test_conj_data = [
("manger", expected_resp_conj_manger),
("pouvoir", expected_resp_conj_pouvoir),
("Pouvoir", expected_resp_conj_pouvoir),
("pleuvoir", expected_resp_conj_pleuvoir),
("Se lever", expected_resp_conj_se_lever),
]


@pytest.mark.parametrize("infinitive,expected_resp", test_conj_data)
@pytest.mark.parametrize(
"infinitive,expected_resp",
[
("manger", expected_resp_conj_manger),
("pouvoir", expected_resp_conj_pouvoir),
("Pouvoir", expected_resp_conj_pouvoir),
("pleuvoir", expected_resp_conj_pleuvoir),
("Se lever", expected_resp_conj_se_lever),
],
)
def test_conjugator_conjugate(infinitive, expected_resp):
assert cg.conjugate(infinitive) == expected_resp

@@ -722,14 +721,14 @@ def test_conjugator_find_template_template_not_found():
cg.find_template("oops")


test_get_verbs_that_start_with_data = [
("lev", ["lever", "léviger", "levretter"]),
("Se lev", ["se lever", "se léviger", "se levretter"]),
("s'aim", ["s'aimanter", "s'aimer"]),
]


@pytest.mark.parametrize("query,expected_resp", test_get_verbs_that_start_with_data)
@pytest.mark.parametrize(
"query,expected_resp",
[
("lev", ["lever", "léviger", "levretter"]),
("Se lev", ["se lever", "se léviger", "se levretter"]),
("s'aim", ["s'aimanter", "s'aimer"]),
],
)
def test_conjugator_get_verbs_that_start_with(query, expected_resp):
assert set(cg.get_verbs_that_start_with(query, max_results=10)) == set(
expected_resp