diff --git a/LICENSE b/LICENSE index d90e98b..86bc13a 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2022 mmb L (Python port https://github.com/mammothb/symspellpy) +Copyright (c) 2024 mmb L (Python port https://github.com/mammothb/symspellpy) Copyright (c) 2021 Wolf Garbe (Original C# implementation https://github.com/wolfgarbe/SymSpell) Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/README.md b/README.md index 2abb6ef..fa25c45 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ symspellpy
[![codecov](https://codecov.io/gh/mammothb/symspellpy/branch/master/graph/badge.svg)](https://codecov.io/gh/mammothb/symspellpy) ======== -symspellpy is a Python port of [SymSpell](https://github.com/wolfgarbe/SymSpell) v6.7.1, which provides much higher speed and lower memory consumption. Unit tests +symspellpy is a Python port of [SymSpell](https://github.com/wolfgarbe/SymSpell) v6.7.2, which provides much higher speed and lower memory consumption. Unit tests from the original project are implemented to ensure the accuracy of the port. Please note that the port has not been optimized for speed. diff --git a/docs/api/abstract_distance_comparer.rst b/docs/api/abstract_distance_comparer.rst new file mode 100644 index 0000000..f21a79f --- /dev/null +++ b/docs/api/abstract_distance_comparer.rst @@ -0,0 +1,9 @@ +************************** +abstract_distance_comparer +************************** + +Distance comparer interface +=========================== + +.. autoclass:: symspellpy.abstract_distance_comparer.AbstractDistanceComparer + :members: diff --git a/docs/api/editdistance.rst b/docs/api/editdistance.rst index 2d82e1e..70630bd 100644 --- a/docs/api/editdistance.rst +++ b/docs/api/editdistance.rst @@ -17,9 +17,6 @@ EditDistance class Distance comparer classes ========================= -.. autoclass:: symspellpy.editdistance.AbstractDistanceComparer - :members: - .. autoclass:: symspellpy.editdistance.DamerauOsa :members: diff --git a/docs/api/index.rst b/docs/api/index.rst index 33e2345..fa9caeb 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -11,5 +11,6 @@ Modules :maxdepth: 2 helpers.rst + abstract_distance_comparer.rst editdistance.rst symspellpy.rst diff --git a/docs/conf.py b/docs/conf.py index 9bd8175..9fd4aa3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,113 +1,114 @@ -# -*- coding: utf-8 -*- -# -# Configuration file for the Sphinx documentation builder. -# -# This file does only contain a selection of the most common options. 
For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/master/config - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. - -import os.path -import sys - -sys.path.insert(0, os.path.abspath("..")) - -import symspellpy - -# -- Project information ----------------------------------------------------- - -project = "symspellpy" -copyright = "2021, mmb L, Wolf Garbe" -author = "mmb L, Wolf Garbe" - -# The short X.Y version -version = "" -# The full version, including alpha/beta/rc tags -release = symspellpy.__version__ - - -# -- General configuration --------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.napoleon", - "sphinx.ext.viewcode", - "sphinx_autodoc_typehints", -] -# numpydoc_class_members_toctree = False -# numpydoc_show_inherited_class_members = False -highlight_language = "none" - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = ".rst" - -# The master toctree document. -master_doc = "index" - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. 
-language = None - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = None - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "sphinxdoc" - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# The default sidebars (for documents that don't match any pattern) are -# defined by theme itself. Builtin themes are using these templates by -# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', -# 'searchbox.html']``. -# -html_sidebars = {"**": ["globaltoc.html", "searchbox.html"]} - - -# -- Options for HTMLHelp output --------------------------------------------- - -# Output file base name for HTML help builder. -htmlhelp_basename = "symspellpydoc" +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. 
For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. + +import os.path +import sys + +sys.path.insert(0, os.path.abspath("..")) + +import symspellpy + +# -- Project information ----------------------------------------------------- + +project = "symspellpy" +copyright = "2021, mmb L, Wolf Garbe" +author = "mmb L, Wolf Garbe" + +# The short X.Y version +version = "" +# The full version, including alpha/beta/rc tags +release = symspellpy.__version__ + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", + "sphinx_autodoc_typehints", +] +# numpydoc_class_members_toctree = False +# numpydoc_show_inherited_class_members = False +highlight_language = "none" + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = ".rst" + +# The master toctree document. +master_doc = "index" + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. 
+language = "en" + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = None + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinxdoc" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = ["_static"] +html_static_path = [] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +html_sidebars = {"**": ["globaltoc.html", "searchbox.html"]} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. 
+htmlhelp_basename = "symspellpydoc" diff --git a/docs/examples/custom_distance_comparer.rst b/docs/examples/custom_distance_comparer.rst new file mode 100644 index 0000000..1d4dee2 --- /dev/null +++ b/docs/examples/custom_distance_comparer.rst @@ -0,0 +1,30 @@ +************************ +Custom distance comparer +************************ + +Basic usage +=========== + +Create a comparer class which satisfies the interface specified by +:class:`~symspellpy.abstract_distance_comparer.AbstractDistanceComparer`: + +.. code-block:: python + + from symspellpy.abstract_distance_comparer import AbstractDistanceComparer + from symspellpy.editdistance import DistanceAlgorithm, EditDistance + + class CustomComparer(AbstractDistanceComparer): + def distance(self, string_1, string_2, max_distance): + # Compare distance between string_1 and string_2 + return -1 if distance > max_distance else distance + + custom_comparer = EditDistance(DistanceAlgorithm.USER_PROVIDED, CustomComparer()) + sym_spell = SymSpell(distance_comparer=custom_comparer) + dictionary_path = pkg_resources.resource_filename( + "symspellpy", "frequency_bigramdictionary_en_243_342.txt" + ) + sym_spell.load_bigram_dictionary(dictionary_path, 0, 2) + + # Print out first 5 elements to demonstrate that dictionary is + # successfully loaded + print(list(islice(sym_spell.bigrams.items(), 5))) diff --git a/docs/examples/index.rst b/docs/examples/index.rst index f77a0bd..5129e01 100644 --- a/docs/examples/index.rst +++ b/docs/examples/index.rst @@ -8,6 +8,7 @@ Examples :maxdepth: 2 dictionary.rst + custom_distance_comparer.rst lookup.rst lookup_compound.rst word_segmentation.rst diff --git a/docs/index.rst b/docs/index.rst index 0178fe8..911e92b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -18,7 +18,7 @@ symspellpy ********** -symspellpy is a Python port of SymSpell_ v6.7, a Symmetric Delete +symspellpy is a Python port of SymSpell_ v6.7.2, a Symmetric Delete spelling correction algorithm which provides much 
higher speed and lower memory consumption. diff --git a/pyproject.toml b/pyproject.toml index 3ebb40c..f2fff1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,43 @@ [build-system] -requires = ["setuptools>=58.0.4", "wheel"] +requires = ["setuptools>=61.0.0", "wheel"] build-backend = "setuptools.build_meta" +[project] +name = "symspellpy" +dynamic = ["version"] +dependencies = [ + "editdistpy>=0.1.3", +] +requires-python = ">=3.8" +authors = [ + {name = "mmb L"}, +] +description = "Python SymSpell" +readme = "README.md" +license = {file = "LICENSE"} +keywords = ["spellchecker", "symspell", "word segmentation"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Natural Language :: English", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +[project.urls] +Repository = "https://github.com/mammothb/symspellpy" +Documentation = "https://symspellpy.readthedocs.io/en/latest" +Changelog = "https://github.com/mammothb/symspellpy/blob/master/CHANGELOG.md" + [tool.basedpyright] +ignore = ["tests"] pythonVersion = "3.8" reportUnusedCallResult = "none" @@ -17,3 +52,13 @@ indent-style = "space" line-ending = "auto" quote-style = "double" skip-magic-trailing-comma = false + +[tool.setuptools.dynamic] +version = {attr = "symspellpy.__version__"} + +[tool.setuptools.packages.find] +where = ["."] +include = ["symspellpy"] + +[tool.setuptools.package-data] +symspellpy = ["frequency_*.txt"] diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 5e20998..0000000 --- a/setup.cfg +++ /dev/null @@ -1,40 +0,0 @@ -[metadata] -name = symspellpy -version = attr: symspellpy.__version__ -description = 
Python SymSpell -long_description = file: README.md -long_description_content_type = text/markdown -author = mmb L -url = https://github.com/mammothb/symspellpy -project_urls = - Documentation = https://symspellpy.readthedocs.io/en/latest/ - Changelog = https://github.com/mammothb/symspellpy/blob/master/CHANGELOG.md -keywords = spellchecker, symspell, word segmentation -license = MIT -classifiers = - Development Status :: 4 - Beta - Intended Audience :: Developers - Intended Audience :: Education - Natural Language :: English - License :: OSI Approved :: MIT License - Programming Language :: Python - Programming Language :: Python :: 3 - Programming Language :: Python :: 3.8 - Programming Language :: Python :: 3.9 - Programming Language :: Python :: 3.10 - Programming Language :: Python :: 3.11 - Programming Language :: Python :: 3.12 - -[options] -zip_safe = False -include_package_data = True -packages = find: -python_requires = >=3.8 -install_requires = - editdistpy>=0.1.3 - -[options.package_data] -symspellpy = frequency_*.txt - -[options.packages.find] -include = symspellpy diff --git a/symspellpy/__init__.py b/symspellpy/__init__.py index 0a8c25d..f4b9455 100644 --- a/symspellpy/__init__.py +++ b/symspellpy/__init__.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2022 mmb L (Python port) +# Copyright (c) 2024 mmb L (Python port) # Copyright (c) 2021 Wolf Garbe (Original C# implementation) # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -19,7 +19,7 @@ .. moduleauthor:: Wolf Garbe """ -__version__ = "6.7.7" +__version__ = "6.7.8-rc1" from . 
import editdistance, helpers, logging from .symspellpy import SymSpell diff --git a/symspellpy/abstract_distance_comparer.py b/symspellpy/abstract_distance_comparer.py new file mode 100644 index 0000000..ba839fe --- /dev/null +++ b/symspellpy/abstract_distance_comparer.py @@ -0,0 +1,20 @@ +from abc import ABC, abstractmethod + + +class AbstractDistanceComparer(ABC): + """An interface to compute relative distance between two strings.""" + + @abstractmethod + def distance(self, string_1: str, string_2: str, max_distance: int) -> int: + """Returns a measure of the distance between two strings. + + Args: + string_1: One of the strings to compare. + string_2: The other string to compare. + max_distance: The maximum distance that is of interest. + + Returns: + -1 if the distance is greater than the max_distance, 0 if the strings + are equivalent, otherwise a positive number whose magnitude + increases as difference between the strings increases. + """ diff --git a/symspellpy/composition.py b/symspellpy/composition.py index c9ebd33..e47169f 100644 --- a/symspellpy/composition.py +++ b/symspellpy/composition.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2022 mmb L (Python port) +# Copyright (c) 2024 mmb L (Python port) # Copyright (c) 2021 Wolf Garbe (Original C# implementation) # # Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/symspellpy/editdistance.py b/symspellpy/editdistance.py index e38e558..b25fd7b 100644 --- a/symspellpy/editdistance.py +++ b/symspellpy/editdistance.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2022 mmb L (Python port) +# Copyright (c) 2024 mmb L (Python port) # Copyright (c) 2021 Wolf Garbe (Original C# implementation) # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -18,13 +18,14 @@ :synopsis: Module for edit distance algorithms. 
""" -from abc import ABC, abstractmethod +import warnings from enum import Enum -from typing import List +from typing import List, Optional from editdistpy import damerau_osa, levenshtein from symspellpy import helpers +from symspellpy.abstract_distance_comparer import AbstractDistanceComparer class DistanceAlgorithm(Enum): @@ -34,6 +35,7 @@ class DistanceAlgorithm(Enum): DAMERAU_OSA = 1 #: Damerau optimal string alignment algorithm LEVENSHTEIN_FAST = 2 #: Fast Levenshtein algorithm. DAMERAU_OSA_FAST = 3 #: Fast Damerau optimal string alignment algorithm + USER_PROVIDED = 4 #: User provided custom edit distance algorithm class EditDistance: @@ -53,7 +55,16 @@ class EditDistance: ValueError: If `algorithm` specifies an invalid distance algorithm. """ - def __init__(self, algorithm: DistanceAlgorithm) -> None: + def __init__( + self, + algorithm: DistanceAlgorithm, + comparer: Optional[AbstractDistanceComparer] = None, + ) -> None: + if algorithm != DistanceAlgorithm.USER_PROVIDED and comparer is not None: + warnings.warn( + f"A comparer is passed in but algorithm is not {DistanceAlgorithm.USER_PROVIDED.value}. A built-in comparer will be used." + ) + self._distance_comparer: AbstractDistanceComparer self._algorithm = algorithm if algorithm == DistanceAlgorithm.LEVENSHTEIN: @@ -64,6 +75,12 @@ def __init__(self, algorithm: DistanceAlgorithm) -> None: self._distance_comparer = LevenshteinFast() elif algorithm == DistanceAlgorithm.DAMERAU_OSA_FAST: self._distance_comparer = DamerauOsaFast() + elif algorithm == DistanceAlgorithm.USER_PROVIDED: + if not isinstance(comparer, AbstractDistanceComparer): + raise ValueError( + f"{algorithm.value} selected but no comparer passed in." 
+ ) + self._distance_comparer = comparer else: raise ValueError("unknown distance algorithm") @@ -82,25 +99,6 @@ def compare(self, string_1: str, string_2: str, max_distance: int) -> int: return self._distance_comparer.distance(string_1, string_2, max_distance) -class AbstractDistanceComparer(ABC): - """An interface to compute relative distance between two strings.""" - - @abstractmethod - def distance(self, string_1: str, string_2: str, max_distance: int) -> int: - """Returns a measure of the distance between two strings. - - Args: - string_1: One of the strings to compare. - string_2: The other string to compare. - max_distance: The maximum distance that is of interest. - - Returns: - -1 if the distance is greater than the max_distance, 0 if the strings - are equivalent, otherwise a positive number whose magnitude - increases as difference between the strings increases. - """ - - class Levenshtein(AbstractDistanceComparer): """Provides Levenshtein algorithm for computing edit distance metric between two strings. 
diff --git a/symspellpy/helpers.py b/symspellpy/helpers.py index 8d36079..ddac517 100644 --- a/symspellpy/helpers.py +++ b/symspellpy/helpers.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2022 mmb L (Python port) +# Copyright (c) 2024 mmb L (Python port) # Copyright (c) 2021 Wolf Garbe (Original C# implementation) # # Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/symspellpy/logging.py b/symspellpy/logging.py index 818e1a0..64b9b92 100644 --- a/symspellpy/logging.py +++ b/symspellpy/logging.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2022 mmb L (Python port) +# Copyright (c) 2024 mmb L (Python port) # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/symspellpy/pickle_mixin.py b/symspellpy/pickle_mixin.py index 59b010a..2e17158 100644 --- a/symspellpy/pickle_mixin.py +++ b/symspellpy/pickle_mixin.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2022 mmb L (Python port) +# Copyright (c) 2024 mmb L (Python port) # Copyright (c) 2021 Wolf Garbe (Original C# implementation) # # Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/symspellpy/suggest_item.py b/symspellpy/suggest_item.py index 6fc595f..e71128c 100644 --- a/symspellpy/suggest_item.py +++ b/symspellpy/suggest_item.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2022 mmb L (Python port) +# Copyright (c) 2024 mmb L (Python port) # Copyright (c) 2021 Wolf Garbe (Original C# implementation) # # Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/symspellpy/symspellpy.py b/symspellpy/symspellpy.py index 4b41b87..486626d 100644 --- a/symspellpy/symspellpy.py +++ b/symspellpy/symspellpy.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2022 mmb L (Python port) +# Copyright (c) 2024 mmb L (Python port) # Copyright (c) 2021 Wolf Garbe (Original C# 
implementation) # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -84,6 +84,7 @@ def __init__( max_dictionary_edit_distance: int = 2, prefix_length: int = 7, count_threshold: int = 1, + distance_comparer: Optional[EditDistance] = None, ) -> None: if max_dictionary_edit_distance < 0: raise ValueError("max_dictionary_edit_distance cannot be negative") @@ -95,6 +96,10 @@ def __init__( ) if count_threshold < 0: raise ValueError("count_threshold cannot be negative") + if distance_comparer is None: + self.distance_comparer = EditDistance(DistanceAlgorithm.DAMERAU_OSA_FAST) + else: + self.distance_comparer = distance_comparer self._words: Dict[str, int] = {} self._below_threshold_words: Dict[str, int] = {} self._bigrams: Dict[str, int] = {} @@ -104,7 +109,7 @@ def __init__( self._max_dictionary_edit_distance = max_dictionary_edit_distance self._prefix_length = prefix_length self._count_threshold = count_threshold - self._distance_algorithm = DistanceAlgorithm.DAMERAU_OSA_FAST + # self._distance_algorithm = DistanceAlgorithm.DAMERAU_OSA_FAST self._max_length = 0 @property @@ -129,19 +134,6 @@ def deletes(self) -> Dict[str, List[str]]: """ return self._deletes - @property - def distance_algorithm(self) -> DistanceAlgorithm: - """The current distance algorithm.""" - return self._distance_algorithm - - @distance_algorithm.setter - def distance_algorithm(self, value: DistanceAlgorithm) -> None: - if not isinstance(value, DistanceAlgorithm): - raise TypeError( - "can only assign DistanceAlgorithm type values to distance_algorithm" - ) - self._distance_algorithm = value - @property def entry_count(self) -> int: """Number of unique correct spelling words.""" @@ -445,7 +437,6 @@ def early_exit(): candidates.append(phrase[:phrase_prefix_len]) else: candidates.append(phrase) - distance_comparer = EditDistance(self._distance_algorithm) while candidate_pointer < len(candidates): candidate = candidates[candidate_pointer] candidate_pointer += 1 @@ 
-577,7 +568,7 @@ def early_exit(): if suggestion in considered_suggestions: continue considered_suggestions.add(suggestion) - distance = distance_comparer.compare( + distance = self.distance_comparer.compare( phrase, suggestion, max_edit_distance_2 ) if distance < 0: @@ -683,7 +674,6 @@ def lookup_compound( ) suggestions = [] suggestion_parts: List[SuggestItem] = [] - distance_comparer = EditDistance(self._distance_algorithm) # translate every item to its best suggestion, otherwise it remains # unchanged @@ -761,7 +751,7 @@ def lookup_compound( continue # select best suggestion for split pair tmp_term = f"{suggestions_1[0].term} {suggestions_2[0].term}" - tmp_distance = distance_comparer.compare( + tmp_distance = self.distance_comparer.compare( terms_1[i], tmp_term, max_edit_distance ) if tmp_distance < 0: @@ -858,7 +848,7 @@ def lookup_compound( joined_term = helpers.case_transfer_similar(phrase, joined_term) suggestion = SuggestItem( joined_term, - distance_comparer.compare(phrase, joined_term, 2**31 - 1), + self.distance_comparer.compare(phrase, joined_term, 2**31 - 1), int(joined_count), ) return [suggestion] diff --git a/symspellpy/verbosity.py b/symspellpy/verbosity.py index 89b6816..4bf52bd 100644 --- a/symspellpy/verbosity.py +++ b/symspellpy/verbosity.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2022 mmb L (Python port) +# Copyright (c) 2024 mmb L (Python port) # Copyright (c) 2021 Wolf Garbe (Original C# implementation) # # Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/tests/fortests/below_threshold_dict.txt b/tests/fortests/below_threshold_dict.txt new file mode 100644 index 0000000..c2e74b4 --- /dev/null +++ b/tests/fortests/below_threshold_dict.txt @@ -0,0 +1,3 @@ +below 8 +threshold 10 +word 10 diff --git a/tests/test_editdistance.py b/tests/test_editdistance.py index 1042862..63d83af 100644 --- a/tests/test_editdistance.py +++ b/tests/test_editdistance.py @@ -3,8 +3,8 @@ import pytest +from 
symspellpy.abstract_distance_comparer import AbstractDistanceComparer from symspellpy.editdistance import ( - AbstractDistanceComparer, DamerauOsa, DamerauOsaFast, DistanceAlgorithm, @@ -62,6 +62,11 @@ def expected_damerau_osa(string_1, string_2, max_distance): return distance if distance <= max_distance else -1 +class CustomDistanceComparer(AbstractDistanceComparer): + def distance(self, string_1: str, string_2: str, max_distance: int) -> int: + return -2 + + @pytest.fixture( params=["damerau_osa", "levenshtein", "damerau_osa_fast", "levenshtein_fast"] ) @@ -140,6 +145,11 @@ def test_unknown_distance_algorithm(self): _ = EditDistance(2) assert "unknown distance algorithm" == str(excinfo.value) + def test_missing_custom_comparer(self): + with pytest.raises(ValueError) as excinfo: + _ = EditDistance(DistanceAlgorithm.USER_PROVIDED) + assert "no comparer passed in" in str(excinfo.value) + def test_abstract_distance_comparer(self): with pytest.raises(TypeError) as excinfo: comparer = AbstractDistanceComparer() @@ -148,6 +158,11 @@ def test_abstract_distance_comparer(self): "Can't instantiate abstract class AbstractDistanceComparer" ) + def test_warn_when_builtin_comparer_override_custom_comparer(self): + with pytest.warns(UserWarning, match="A built-in comparer will be used.$"): + comparer = CustomDistanceComparer() + edit_distance = EditDistance(DistanceAlgorithm.LEVENSHTEIN, comparer) + def test_internal_distance_comparer(self, get_edit_distance): edit_distance, expected = get_edit_distance assert isinstance(edit_distance._distance_comparer, expected) @@ -162,6 +177,15 @@ def test_comparer_match_ref(self, get_comparer, get_strings): s1, s2, max_distance ) + def test_editdistance_use_custom_comparer(self, get_strings): + strings, max_distance = get_strings + comparer = CustomDistanceComparer() + edit_distance = EditDistance(DistanceAlgorithm.USER_PROVIDED, comparer) + + for s1 in strings: + for s2 in strings: + assert -2 == comparer.distance(s1, s2, max_distance) + 
def test_comparer_null_distance(self, get_comparer, get_short_and_long_strings): comparer, _ = get_comparer diff --git a/tests/test_symspellpy.py b/tests/test_symspellpy.py index b7c4cd7..ab48be6 100644 --- a/tests/test_symspellpy.py +++ b/tests/test_symspellpy.py @@ -4,11 +4,13 @@ import pytest from symspellpy import SymSpell, Verbosity -from symspellpy.editdistance import DistanceAlgorithm +from symspellpy.abstract_distance_comparer import AbstractDistanceComparer +from symspellpy.editdistance import DistanceAlgorithm, EditDistance from symspellpy.helpers import DictIO FORTESTS_DIR = Path(__file__).resolve().parent / "fortests" BAD_DICT_PATH = FORTESTS_DIR / "bad_dict.txt" +BELOW_THRESHOLD_DICT_PATH = FORTESTS_DIR / "below_threshold_dict.txt" BIG_MODIFIED_PATH = FORTESTS_DIR / "big_modified.txt" BIG_WORDS_PATH = FORTESTS_DIR / "big_words.txt" NON_EN_DICT_PATH = FORTESTS_DIR / "non_en_dict.txt" @@ -35,6 +37,11 @@ def get_dictionary_stream(request): yield dict_stream, request.param +class CustomDistanceComparer(AbstractDistanceComparer): + def distance(self, string_1: str, string_2: str, max_distance: int) -> int: + return 0 + + class TestSymSpellPy: def test_negative_max_dictionary_edit_distance(self): with pytest.raises(ValueError) as excinfo: @@ -63,26 +70,13 @@ def test_negative_count_threshold(self): _ = SymSpell(1, 3, -1) assert "count_threshold cannot be negative" == str(excinfo.value) - @pytest.mark.parametrize( - "algorithm", - [ - DistanceAlgorithm.LEVENSHTEIN, - DistanceAlgorithm.DAMERAU_OSA, - DistanceAlgorithm.LEVENSHTEIN_FAST, - DistanceAlgorithm.DAMERAU_OSA_FAST, - ], - ) - def test_set_distance_algorithm(self, symspell_default, algorithm): - symspell_default.distance_algorithm = algorithm - assert algorithm == symspell_default.distance_algorithm - - def test_set_invalid_distance_algorithm(self, symspell_default): - with pytest.raises(TypeError) as excinfo: - symspell_default.distance_algorithm = 1 - assert ( - "can only assign DistanceAlgorithm type 
values to distance_algorithm" - == str(excinfo.value) + def test_set_distance_comparer(self): + distance_comparer = EditDistance( + DistanceAlgorithm.USER_PROVIDED, CustomDistanceComparer() ) + sym_spell = SymSpell(distance_comparer=distance_comparer) + + assert distance_comparer == sym_spell.distance_comparer @pytest.mark.parametrize("symspell_short", [None, 0], indirect=True) def test_create_dictionary_entry_negative_count(self, symspell_short): @@ -196,6 +190,15 @@ def test_load_dictionary_count(self, symspell_default, dictionary_path): assert 82834 == symspell_default.word_count assert 676094 == symspell_default.entry_count + @pytest.mark.parametrize("symspell_short", [10], indirect=True) + def test_load_dictionary_below_threshold(self, symspell_short): + symspell_short.load_dictionary(BELOW_THRESHOLD_DICT_PATH, 0, 1) + + assert 1 == len(symspell_short.below_threshold_words) + assert 8 == symspell_short.below_threshold_words["below"] + + assert 2 == symspell_short.word_count + def test_load_dictionary_separator(self, symspell_default): assert symspell_default.load_dictionary(SEPARATOR_DICT_PATH, 0, 1, SEPARATOR) assert 5 == symspell_default.word_count