Merge pull request #19 from bretttolbert/catalan-contd
improved Catalan support
bretttolbert authored Dec 22, 2023
2 parents a3c8591 + 4294660 commit 006e7c9
Showing 38 changed files with 5,465 additions and 2,876 deletions.
38 changes: 38 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,43 @@
# verbecc Changelog

- 1.9.3 [22 December 2023]
- Improved Catalan support
- Added more missing verb conjugation templates
- Can now conjugate 8578 verbs using 42 templates
- TODO: Still missing templates for 38 out of 8616 verbs
- See `test_inflector_ca.test_all_verbs_have_templates`
- Made Catalan inflector template matching looser (a sketch of the matching rule follows this entry)
- Added `verbecc.string_utils.get_common_letter_count`
- A template now matches if its ending has at least len()-1 characters in common with the infinitive ending (accents not counted)
- E.g. not only do `aure` and `eure` match, but so do `çar` and `cer`
- Enhanced conjugation template syntax: now supports stem-changing verbs through two new features:
- 1. New stem-modifying XML attribute: `modify-stem="strip-accents"`
- 2. Stem-modifying delete operator '`-`'
- e.g.
```xml
<template name="conèix:er" modify-stem="strip-accents">
  <passat-simple>
    <p><i>--guí</i></p>
  </passat-simple>
  <!-- ... -->
</template>
```
- With the above template, `conèix` + `--guí` = `jo coneguí` and `reconèix` + `--guí` = `jo reconeguí` (accents are stripped from the stem and each leading `-` deletes one trailing stem character; see the sketch after this entry)
- Added `gender` flag to support feminine pronouns
- Modified to put a `-` placeholder in the conjugation output for persons/tenses that aren't conjugated, e.g. for the impersonal verb `caldre`:
```python
"caldre",
"indicatiu",
"present",
False,
"f",
[
"-",
"-",
"ella cal",
"-",
"-",
"elles calen",
]
```
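
A minimal sketch of the looser ending-matching rule described above. It assumes `get_common_letter_count` compares the two endings letter by letter from the right after stripping accents; the exact counting logic inside `verbecc.string_utils` may differ.

```python
import unicodedata


def strip_accents(s: str) -> str:
    """Remove accent marks, e.g. 'çar' -> 'car', 'conèix' -> 'coneix'."""
    return "".join(
        c for c in unicodedata.normalize("NFD", s) if not unicodedata.combining(c)
    )


def get_common_letter_count(a: str, b: str) -> int:
    """Count aligned letters (compared from the right) that two endings share, ignoring accents."""
    a, b = strip_accents(a), strip_accents(b)
    return sum(1 for x, y in zip(reversed(a), reversed(b)) if x == y)


def endings_match(template_ending: str, infinitive_ending: str) -> bool:
    """Looser rule: all but at most one character of the template ending must match."""
    return get_common_letter_count(template_ending, infinitive_ending) >= len(template_ending) - 1


# 'çar' vs 'cer' -> stripped to 'car' vs 'cer': 2 of 3 letters agree, so they match
assert endings_match("çar", "cer")
assert endings_match("aure", "eure")
```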
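
A companion sketch of how the two stem-modification features could combine when a template ending is applied. `apply_ending` and `strip_stem_accents` are hypothetical names used only for illustration, not verbecc's actual API.

```python
import unicodedata


def strip_accents(s: str) -> str:
    return "".join(
        c for c in unicodedata.normalize("NFD", s) if not unicodedata.combining(c)
    )


def apply_ending(stem: str, ending: str, strip_stem_accents: bool = False) -> str:
    """Apply a template ending to a stem: modify-stem="strip-accents" de-accents the
    stem first, then each leading '-' deletes one character from the end of the stem."""
    if strip_stem_accents:
        stem = strip_accents(stem)
    while ending.startswith("-"):
        stem = stem[:-1]
        ending = ending[1:]
    return stem + ending


# conèix -> coneix (accents stripped) -> cone ('--' deletes two chars) + 'guí' = coneguí
assert apply_ending("conèix", "--guí", strip_stem_accents=True) == "coneguí"
assert apply_ending("reconèix", "--guí", strip_stem_accents=True) == "reconeguí"
```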

- 1.9.2 [17 December 2023]
- Fixed bug in new `localization` module
- Renamed `localize_mood` and `localize_tense` to `xmood` and `xtense`
85 changes: 47 additions & 38 deletions pyproject.toml
@@ -4,55 +4,61 @@ build-backend = "setuptools.build_meta"

[project]
name = "verbecc"
version = "1.9.2"
version = "1.9.3"
dependencies = [
"lxml",
"lxml-stubs",
"cython",
"numpy",
"scipy",
"scikit-learn"
"scikit-learn",
]
requires-python = ">=3.8"
authors = [
{name = "Brett Tolbert", email = "[email protected]"},
]
maintainers = [
{name = "Brett Tolbert", email = "[email protected]"}
]
authors = [{ name = "Brett Tolbert", email = "[email protected]" }]
maintainers = [{ name = "Brett Tolbert", email = "[email protected]" }]
description = "Verbs Completely Conjugated: machine learning conjugator for Catalan, French, Italian, Portuguese, Romanian and Spanish"
readme = "README.md"
license = {file = "LICENSE.txt"}
keywords= ['verb,', 'conjugator', 'conjugation', 'Catalan', 'French', 'Italian', 'Portuguese', 'Romanian', 'Spanish']
license = { file = "LICENSE.txt" }
keywords = [
'verb,',
'conjugator',
'conjugation',
'Catalan',
'French',
'Italian',
'Portuguese',
'Romanian',
'Spanish',
]
classifiers = [
'Development Status :: 5 - Production/Stable',
'Topic :: Software Development :: Libraries :: Python Modules',
'Topic :: Text Processing :: Linguistic',
'Topic :: Education',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Utilities',
'Intended Audience :: Developers',
'Intended Audience :: End Users/Desktop',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: GNU General Public License (GPL)',
'Natural Language :: Catalan',
'Natural Language :: French',
'Natural Language :: Italian',
'Natural Language :: Portuguese',
'Natural Language :: Romanian',
'Natural Language :: Spanish',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11'
'Development Status :: 5 - Production/Stable',
'Topic :: Software Development :: Libraries :: Python Modules',
'Topic :: Text Processing :: Linguistic',
'Topic :: Education',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Utilities',
'Intended Audience :: Developers',
'Intended Audience :: End Users/Desktop',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: GNU General Public License (GPL)',
'Natural Language :: Catalan',
'Natural Language :: French',
'Natural Language :: Italian',
'Natural Language :: Portuguese',
'Natural Language :: Romanian',
'Natural Language :: Spanish',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
]

[project.urls]
@@ -67,3 +73,6 @@ trained_models = ['verbecc/data/models/*']

[tool.setuptools.packages]
find = {}

[project.scripts]
verbecc-train-models = 'verbecc.utils:train_models'
@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-

from lxml import etree, objectify
import os

@@ -8,35 +6,38 @@
(e.g. compound tenses) from the mlconjug conjugation XML files
"""

working_dir = '../verbecc/data'
working_dir = "../verbecc/data"
in_file = "conjugations-ro.xml"
out_file = "conjugations-ro.mod.xml"


def remove_mood_tense(root, tenses_to_remove):
removed_elem_cnt = 0
for template_elem in root:
if template_elem.tag == 'template':
if template_elem.tag == "template":
for mood_elem in template_elem:
for tense_elem in mood_elem:
if tense_elem.tag in tenses_to_remove:
mood_elem.remove(tense_elem)
removed_elem_cnt += 1
print("removed {} elements".format(removed_elem_cnt))


def remove_mood(root, moods_to_remove):
removed_elem_cnt = 0
for template_elem in root:
if template_elem.tag == 'template':
if template_elem.tag == "template":
for mood_elem in template_elem:
if mood_elem.tag in moods_to_remove:
template_elem.remove(mood_elem)
removed_elem_cnt += 1
print("removed {} elements".format(removed_elem_cnt))


def move_tense(root, tense_name, old_mood, new_mood, remove_old_mood):
moved_elem_cnt = 0
for template_elem in root:
if template_elem.tag == 'template':
if template_elem.tag == "template":
tense_elem_to_move = None
# find tense to move
for mood_elem in template_elem:
@@ -56,19 +57,27 @@ def move_tense(root, tense_name, old_mood, new_mood, remove_old_mood):
moved_elem_cnt += 1
print("moved {} elements".format(moved_elem_cnt))


def main():
parser = etree.XMLParser(dtd_validation=False, encoding='utf-8')
parser = etree.XMLParser(dtd_validation=False, encoding="utf-8")
tree = etree.parse(os.path.join(working_dir, in_file), parser)
root = tree.getroot()

remove_mood(root, ['Conditional'])
#remove_mood_tense(root, ['perfect'])
#move_tense(root, 'Viitor-II-popular', 'Viitor', 'Indicativ', True)

with open(os.path.join(working_dir, out_file), 'wb') as f:
remove_mood(root, ["Conditional"])
# remove_mood_tense(root, ['perfect'])
# move_tense(root, 'Viitor-II-popular', 'Viitor', 'Indicativ', True)

with open(os.path.join(working_dir, out_file), "wb") as f:
objectify.deannotate(root, cleanup_namespaces=True)
xml = etree.tostring(tree, encoding='utf-8', method='xml', pretty_print=True, xml_declaration=True)
xml = etree.tostring(
tree,
encoding="utf-8",
method="xml",
pretty_print=True,
xml_declaration=True,
)
f.write(xml)

if __name__ == '__main__':

if __name__ == "__main__":
main()
2 changes: 0 additions & 2 deletions tests/test_conjugation_template.py
@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-

from unittest.mock import patch

import pytest
37 changes: 18 additions & 19 deletions tests/test_conjugator.py
@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-

import pytest
from verbecc import conjugator
from verbecc import string_utils
@@ -693,16 +691,17 @@ def test_conjugator_conjugate_imperatif_passe_with_etre():
},
}

test_conj_data = [
("manger", expected_resp_conj_manger),
("pouvoir", expected_resp_conj_pouvoir),
("Pouvoir", expected_resp_conj_pouvoir),
("pleuvoir", expected_resp_conj_pleuvoir),
("Se lever", expected_resp_conj_se_lever),
]


@pytest.mark.parametrize("infinitive,expected_resp", test_conj_data)
@pytest.mark.parametrize(
"infinitive,expected_resp",
[
("manger", expected_resp_conj_manger),
("pouvoir", expected_resp_conj_pouvoir),
("Pouvoir", expected_resp_conj_pouvoir),
("pleuvoir", expected_resp_conj_pleuvoir),
("Se lever", expected_resp_conj_se_lever),
],
)
def test_conjugator_conjugate(infinitive, expected_resp):
assert cg.conjugate(infinitive) == expected_resp

@@ -722,14 +721,14 @@ def test_conjugator_find_template_template_not_found():
cg.find_template("oops")


test_get_verbs_that_start_with_data = [
("lev", ["lever", "léviger", "levretter"]),
("Se lev", ["se lever", "se léviger", "se levretter"]),
("s'aim", ["s'aimanter", "s'aimer"]),
]


@pytest.mark.parametrize("query,expected_resp", test_get_verbs_that_start_with_data)
@pytest.mark.parametrize(
"query,expected_resp",
[
("lev", ["lever", "léviger", "levretter"]),
("Se lev", ["se lever", "se léviger", "se levretter"]),
("s'aim", ["s'aimanter", "s'aimer"]),
],
)
def test_conjugator_get_verbs_that_start_with(query, expected_resp):
assert set(cg.get_verbs_that_start_with(query, max_results=10)) == set(
expected_resp