From 1cbdbd10ce2abb52eef4f88a471af23f7d591a68 Mon Sep 17 00:00:00 2001 From: gram Date: Wed, 27 Sep 2023 14:37:12 +0200 Subject: [PATCH 1/3] Drop abydos --- README.md | 4 ---- constraints.txt | 10 ---------- setup.py | 3 --- tests/test_external.py | 10 ---------- textdistance/libraries.py | 6 ------ 5 files changed, 33 deletions(-) delete mode 100644 constraints.txt diff --git a/README.md b/README.md index 6fd96f2..b42a079 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,6 @@ hamming('text', 'testit') Supported libraries: -1. [abydos](https://github.com/chrislit/abydos) 1. [Distance](https://github.com/doukremt/distance) 1. [jellyfish](https://github.com/jamesturk/jellyfish) 1. [py_stringmatching](https://github.com/anhaidgroup/py_stringmatching) @@ -245,13 +244,11 @@ Without extras installation: | DamerauLevenshtein | rapidfuzz | 0.00312 | | DamerauLevenshtein | jellyfish | 0.00591 | | DamerauLevenshtein | pyxdameraulevenshtein | 0.03335 | -| DamerauLevenshtein | abydos | 0.63278 | | DamerauLevenshtein | **textdistance** | 0.83524 | | Hamming | Levenshtein | 0.00038 | | Hamming | rapidfuzz | 0.00044 | | Hamming | jellyfish | 0.00091 | | Hamming | distance | 0.00812 | -| Hamming | abydos | 0.00902 | | Hamming | **textdistance** | 0.03531 | | Jaro | rapidfuzz | 0.00092 | | Jaro | jellyfish | 0.00191 | @@ -265,7 +262,6 @@ Without extras installation: | Levenshtein | pylev | 0.15688 | | Levenshtein | distance | 0.28669 | | Levenshtein | **textdistance** | 0.53902 | -| Levenshtein | abydos | 1.25783 | Total: 24 libs. diff --git a/constraints.txt b/constraints.txt deleted file mode 100644 index d8cec6a..0000000 --- a/constraints.txt +++ /dev/null @@ -1,10 +0,0 @@ -abydos # https://github.com/chrislit/abydos -distance # https://github.com/doukremt/distance -jellyfish # https://github.com/jamesturk/jellyfish -numpy -py_stringmatching # https://github.com/anhaidgroup/py_stringmatching -pylev # https://github.com/toastdriven/pylev -python-Levenshtein # https://github.com/ztane/python-Levenshtein -pyxDamerauLevenshtein # https://github.com/gfairchild/pyxDamerauLevenshtein -tabulate - diff --git a/setup.py b/setup.py index 486d517..9441124 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,6 @@ extras = { # enough for simple usage 'extras': [ - 'abydos', 'jellyfish', # for DamerauLevenshtein 'numpy', # for SmithWaterman and other 'python-Levenshtein', # for Jaro and Levenshtein @@ -18,7 +17,6 @@ # needed for benchmarking, optimization and testing 'benchmark': [ # common - 'abydos', 'jellyfish', 'numpy', 'python-Levenshtein', @@ -68,7 +66,6 @@ 'rapidfuzz>=2.6.0', # only same length, any iterators of hashable elements 'jellyfish', # only strings, any length 'distance', # only same length, any iterators - 'abydos', # any iterators ], 'Jaro': [ 'rapidfuzz>=2.6.0', # any iterators of hashable elements diff --git a/tests/test_external.py b/tests/test_external.py index 47b2002..14e5b20 100644 --- a/tests/test_external.py +++ b/tests/test_external.py @@ -12,15 +12,7 @@ libraries = prototype.clone() -# numpy throws a bunch of warning about abydos using `np.int` isntead of `int`. -ABYDOS_WARNINGS = ( - 'ignore:`np.int` is a deprecated alias', - 'ignore:`np.float` is a deprecated alias', - 'ignore:Using or importing the ABCs', -) - -@pytest.mark.filterwarnings(*ABYDOS_WARNINGS) @pytest.mark.external @pytest.mark.parametrize('alg', libraries.get_algorithms()) @hypothesis.settings(deadline=None) @@ -45,7 +37,6 @@ def test_compare(left, right, alg): assert isclose(int_result, ext_result), str(lib) -@pytest.mark.filterwarnings(*ABYDOS_WARNINGS) @pytest.mark.external @pytest.mark.parametrize('alg', libraries.get_algorithms()) @hypothesis.given( @@ -80,7 +71,6 @@ def test_qval(left, right, alg, qval): assert isclose(int_result, ext_result), f'{lib}({repr(s1)}, {repr(s2)})' -@pytest.mark.filterwarnings(*ABYDOS_WARNINGS) @pytest.mark.external @pytest.mark.parametrize('alg', libraries.get_algorithms()) @hypothesis.given( diff --git a/textdistance/libraries.py b/textdistance/libraries.py index 39045aa..a0b9c16 100644 --- a/textdistance/libraries.py +++ b/textdistance/libraries.py @@ -167,17 +167,12 @@ class SameLengthTextLibrary(SameLengthLibrary, TextLibrary): reg = prototype.register alg = 'DamerauLevenshtein' -reg(alg, LibraryBase( - 'abydos.distance', 'DamerauLevenshtein', presets={}, attr='dist_abs', - conditions=dict(restricted=False), -)) reg(alg, LibraryBase('pyxdameraulevenshtein', 'damerau_levenshtein_distance', conditions=dict(restricted=True))) reg(alg, TextLibrary('jellyfish', 'damerau_levenshtein_distance', conditions=dict(restricted=False))) reg(alg, LibraryBase('rapidfuzz.distance.DamerauLevenshtein', 'distance', conditions=dict(restricted=False))) reg(alg, LibraryBase('rapidfuzz.distance.OSA', 'distance', conditions=dict(restricted=True))) alg = 'Hamming' -reg(alg, LibraryBase('abydos.distance', 'Hamming', presets={}, attr='dist_abs')) reg(alg, SameLengthLibrary('distance', 'hamming')) reg(alg, SameLengthTextLibrary('Levenshtein', 'hamming')) reg(alg, TextLibrary('jellyfish', 'hamming_distance')) @@ -197,7 +192,6 @@ class SameLengthTextLibrary(SameLengthLibrary, TextLibrary): # reg(alg, TextLibrary('Levenshtein', 'jaro_winkler', conditions=dict(winklerize=True))) alg = 'Levenshtein' -reg(alg, LibraryBase('abydos.distance', 'Levenshtein', presets={}, attr='dist_abs')) reg(alg, LibraryBase('distance', 'levenshtein')) reg(alg, LibraryBase('pylev', 'levenshtein')) reg(alg, TextLibrary('jellyfish', 'levenshtein_distance')) From 82cef6626f85f5821b9a3a2fa76cf382f88dcd71 Mon Sep 17 00:00:00 2001 From: gram Date: Wed, 27 Sep 2023 14:37:30 +0200 Subject: [PATCH 2/3] enable external tests on CI --- .github/workflows/main.yml | 40 +++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6ec3e95..cd5181c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -45,26 +45,26 @@ jobs: repo-token: ${{ github.token }} - run: task pytest-pure - # pytest-external: - # runs-on: ubuntu-latest - # strategy: - # fail-fast: false - # matrix: - # python-version: - # - "3.8" - # - "3.9" - # - "3.10" - # - "3.11" - # # - "3.12.0-rc.1" - # steps: - # - uses: actions/checkout@v3 - # - uses: actions/setup-python@v4 - # with: - # python-version: ${{ matrix.python-version }} - # - uses: arduino/setup-task@v1 - # with: - # repo-token: ${{ github.token }} - # - run: task pytest-external + pytest-external: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" + # - "3.12.0-rc.1" + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - uses: arduino/setup-task@v1 + with: + repo-token: ${{ github.token }} + - run: task pytest-external markdownlint-cli: runs-on: ubuntu-latest From e66149fff6b1adae65f51294edb646ded1e074b5 Mon Sep 17 00:00:00 2001 From: gram Date: Thu, 28 Sep 2023 09:08:35 +0200 Subject: [PATCH 3/3] skip unicode tests for jellyfish --- tests/test_external.py | 18 +++++++++++++++++- textdistance/libraries.py | 14 +++++++------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/tests/test_external.py b/tests/test_external.py index 14e5b20..cd7fddb 100644 --- a/tests/test_external.py +++ b/tests/test_external.py @@ -1,8 +1,12 @@ +from __future__ import annotations + # built-in +import string from math import isclose # external import hypothesis +import hypothesis.strategies import pytest # project @@ -22,6 +26,12 @@ ) def test_compare(left, right, alg): for lib in libraries.get_libs(alg): + + if lib.module_name == 'jellyfish': + ascii = set(string.printable) + if (set(left) | set(right)) - ascii: + continue + conditions = lib.conditions or {} internal_func = getattr(textdistance, alg)(external=False, **conditions) external_func = lib.get_function() @@ -44,8 +54,14 @@ def test_compare(left, right, alg): right=hypothesis.strategies.text(min_size=1), ) @pytest.mark.parametrize('qval', (None, 1, 2, 3)) -def test_qval(left, right, alg, qval): +def test_qval(left: str, right: str, alg: str, qval: int | None) -> None: for lib in libraries.get_libs(alg): + + if lib.module_name == 'jellyfish': + ascii = set(string.printable) + if (set(left) | set(right)) - ascii: + continue + conditions = lib.conditions or {} internal_func = getattr(textdistance, alg)(external=False, **conditions) external_func = lib.get_function() diff --git a/textdistance/libraries.py b/textdistance/libraries.py index a0b9c16..23d9393 100644 --- a/textdistance/libraries.py +++ b/textdistance/libraries.py @@ -39,12 +39,12 @@ def optimize(self) -> None: # sort libs by speed self.libs[alg].sort(key=lambda lib: libs_names.index([lib.module_name, lib.func_name])) - def get_algorithms(self) -> list: + def get_algorithms(self) -> list[str]: """Get list of available algorithms. """ return list(self.libs.keys()) - def get_libs(self, alg) -> list[LibraryBase]: + def get_libs(self, alg: str) -> list[LibraryBase]: """Get libs list for algorithm """ if alg not in self.libs: @@ -69,7 +69,7 @@ def __init__( *, presets: dict[str, Any] | None = None, attr: str | None = None, - conditions: dict[str, Any] | None = None, + conditions: dict[str, bool] | None = None, ) -> None: self.module_name = module_name self.func_name = func_name @@ -89,7 +89,7 @@ def check_conditions(self, obj: object, *sequences: Sequence) -> bool: return True - def prepare(self, *sequences) -> tuple: + def prepare(self, *sequences: Sequence) -> tuple: return sequences @property @@ -128,7 +128,7 @@ def __str__(self) -> str: class TextLibrary(LibraryBase): - def check_conditions(self, obj, *sequences: Sequence) -> bool: + def check_conditions(self, obj: object, *sequences: Sequence) -> bool: if not super().check_conditions(obj, *sequences): return False @@ -142,7 +142,7 @@ def check_conditions(self, obj, *sequences: Sequence) -> bool: return False return True - def prepare(self, *sequences) -> tuple: + def prepare(self, *sequences: Sequence) -> tuple: # convert list of letters to string if isinstance(sequences[0], (tuple, list)): sequences = tuple(map(lambda x: ''.join(x), sequences)) @@ -150,7 +150,7 @@ def prepare(self, *sequences) -> tuple: class SameLengthLibrary(LibraryBase): - def check_conditions(self, obj, *sequences: Sequence) -> bool: + def check_conditions(self, obj: object, *sequences: Sequence) -> bool: if not super().check_conditions(obj, *sequences): return False # compare only same length iterators