diff --git a/LICENSE b/LICENSE
index d90e98b..86bc13a 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2022 mmb L (Python port https://github.com/mammothb/symspellpy)
+Copyright (c) 2024 mmb L (Python port https://github.com/mammothb/symspellpy)
Copyright (c) 2021 Wolf Garbe (Original C# implementation https://github.com/wolfgarbe/SymSpell)
Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/README.md b/README.md
index 2abb6ef..fa25c45 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ symspellpy
[![codecov](https://codecov.io/gh/mammothb/symspellpy/branch/master/graph/badge.svg)](https://codecov.io/gh/mammothb/symspellpy)
========
-symspellpy is a Python port of [SymSpell](https://github.com/wolfgarbe/SymSpell) v6.7.1, which provides much higher speed and lower memory consumption. Unit tests
+symspellpy is a Python port of [SymSpell](https://github.com/wolfgarbe/SymSpell) v6.7.2, which provides much higher speed and lower memory consumption. Unit tests
from the original project are implemented to ensure the accuracy of the port.
Please note that the port has not been optimized for speed.
diff --git a/docs/api/abstract_distance_comparer.rst b/docs/api/abstract_distance_comparer.rst
new file mode 100644
index 0000000..f21a79f
--- /dev/null
+++ b/docs/api/abstract_distance_comparer.rst
@@ -0,0 +1,9 @@
+**************************
+abstract_distance_comparer
+**************************
+
+Distance comparer interface
+===========================
+
+.. autoclass:: symspellpy.abstract_distance_comparer.AbstractDistanceComparer
+ :members:
diff --git a/docs/api/editdistance.rst b/docs/api/editdistance.rst
index 2d82e1e..70630bd 100644
--- a/docs/api/editdistance.rst
+++ b/docs/api/editdistance.rst
@@ -17,9 +17,6 @@ EditDistance class
Distance comparer classes
=========================
-.. autoclass:: symspellpy.editdistance.AbstractDistanceComparer
- :members:
-
.. autoclass:: symspellpy.editdistance.DamerauOsa
:members:
diff --git a/docs/api/index.rst b/docs/api/index.rst
index 33e2345..fa9caeb 100644
--- a/docs/api/index.rst
+++ b/docs/api/index.rst
@@ -11,5 +11,6 @@ Modules
:maxdepth: 2
helpers.rst
+ abstract_distance_comparer.rst
editdistance.rst
symspellpy.rst
diff --git a/docs/conf.py b/docs/conf.py
index 9bd8175..9fd4aa3 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,113 +1,114 @@
-# -*- coding: utf-8 -*-
-#
-# Configuration file for the Sphinx documentation builder.
-#
-# This file does only contain a selection of the most common options. For a
-# full list see the documentation:
-# http://www.sphinx-doc.org/en/master/config
-
-# -- Path setup --------------------------------------------------------------
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-
-import os.path
-import sys
-
-sys.path.insert(0, os.path.abspath(".."))
-
-import symspellpy
-
-# -- Project information -----------------------------------------------------
-
-project = "symspellpy"
-copyright = "2021, mmb L, Wolf Garbe"
-author = "mmb L, Wolf Garbe"
-
-# The short X.Y version
-version = ""
-# The full version, including alpha/beta/rc tags
-release = symspellpy.__version__
-
-
-# -- General configuration ---------------------------------------------------
-
-# If your documentation needs a minimal Sphinx version, state it here.
-#
-# needs_sphinx = '1.0'
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
- "sphinx.ext.autodoc",
- "sphinx.ext.napoleon",
- "sphinx.ext.viewcode",
- "sphinx_autodoc_typehints",
-]
-# numpydoc_class_members_toctree = False
-# numpydoc_show_inherited_class_members = False
-highlight_language = "none"
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ["_templates"]
-
-# The suffix(es) of source filenames.
-# You can specify multiple suffix as a list of string:
-#
-# source_suffix = ['.rst', '.md']
-source_suffix = ".rst"
-
-# The master toctree document.
-master_doc = "index"
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = None
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = None
-
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages. See the documentation for
-# a list of builtin themes.
-#
-html_theme = "sphinxdoc"
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further. For a list of options available for each theme, see the
-# documentation.
-#
-# html_theme_options = {}
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ["_static"]
-
-# Custom sidebar templates, must be a dictionary that maps document names
-# to template names.
-#
-# The default sidebars (for documents that don't match any pattern) are
-# defined by theme itself. Builtin themes are using these templates by
-# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
-# 'searchbox.html']``.
-#
-html_sidebars = {"**": ["globaltoc.html", "searchbox.html"]}
-
-
-# -- Options for HTMLHelp output ---------------------------------------------
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = "symspellpydoc"
+# -*- coding: utf-8 -*-
+#
+# Configuration file for the Sphinx documentation builder.
+#
+# This file contains only a selection of the most common options. For a
+# full list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+
+import os.path
+import sys
+
+sys.path.insert(0, os.path.abspath(".."))
+
+import symspellpy
+
+# -- Project information -----------------------------------------------------
+
+project = "symspellpy"
+copyright = "2021, mmb L, Wolf Garbe"
+author = "mmb L, Wolf Garbe"
+
+# The short X.Y version
+version = ""
+# The full version, including alpha/beta/rc tags
+release = symspellpy.__version__
+
+
+# -- General configuration ---------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ "sphinx.ext.autodoc",
+ "sphinx.ext.napoleon",
+ "sphinx.ext.viewcode",
+ "sphinx_autodoc_typehints",
+]
+# numpydoc_class_members_toctree = False
+# numpydoc_show_inherited_class_members = False
+highlight_language = "none"
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = ".rst"
+
+# The master toctree document.
+master_doc = "index"
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = "en"
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = None
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = "sphinxdoc"
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+# html_static_path = ["_static"]
+html_static_path = []
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# The default sidebars (for documents that don't match any pattern) are
+# defined by theme itself. Builtin themes are using these templates by
+# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
+# 'searchbox.html']``.
+#
+html_sidebars = {"**": ["globaltoc.html", "searchbox.html"]}
+
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = "symspellpydoc"
diff --git a/docs/examples/custom_distance_comparer.rst b/docs/examples/custom_distance_comparer.rst
new file mode 100644
index 0000000..1d4dee2
--- /dev/null
+++ b/docs/examples/custom_distance_comparer.rst
@@ -0,0 +1,30 @@
+************************
+Custom distance comparer
+************************
+
+Basic usage
+===========
+
+Create a comparer class which satisfies the interface specified by
+:class:`~symspellpy.abstract_distance_comparer.AbstractDistanceComparer`:
+
+.. code-block:: python
+
+ from symspellpy.abstract_distance_comparer import AbstractDistanceComparer
+ from symspellpy.editdistance import DistanceAlgorithm, EditDistance
+
+ class CustomComparer(AbstractDistanceComparer):
+        def distance(self, string_1, string_2, max_distance):
+ # Compare distance between string_1 and string_2
+ return -1 if distance > max_distance else distance
+
+    custom_comparer = EditDistance(DistanceAlgorithm.USER_PROVIDED, CustomComparer())
+ sym_spell = SymSpell(distance_comparer=custom_comparer)
+ dictionary_path = pkg_resources.resource_filename(
+ "symspellpy", "frequency_bigramdictionary_en_243_342.txt"
+ )
+ sym_spell.load_bigram_dictionary(dictionary_path, 0, 2)
+
+ # Print out first 5 elements to demonstrate that dictionary is
+ # successfully loaded
+ print(list(islice(sym_spell.bigrams.items(), 5)))
diff --git a/docs/examples/index.rst b/docs/examples/index.rst
index f77a0bd..5129e01 100644
--- a/docs/examples/index.rst
+++ b/docs/examples/index.rst
@@ -8,6 +8,7 @@ Examples
:maxdepth: 2
dictionary.rst
+ custom_distance_comparer.rst
lookup.rst
lookup_compound.rst
word_segmentation.rst
diff --git a/docs/index.rst b/docs/index.rst
index 0178fe8..911e92b 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -18,7 +18,7 @@
symspellpy
**********
-symspellpy is a Python port of SymSpell_ v6.7, a Symmetric Delete
+symspellpy is a Python port of SymSpell_ v6.7.2, a Symmetric Delete
spelling correction algorithm which provides much higher speed and lower
memory consumption.
diff --git a/pyproject.toml b/pyproject.toml
index 3ebb40c..f2fff1a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,8 +1,43 @@
[build-system]
-requires = ["setuptools>=58.0.4", "wheel"]
+requires = ["setuptools>=61.0.0", "wheel"]
build-backend = "setuptools.build_meta"
+[project]
+name = "symspellpy"
+dynamic = ["version"]
+dependencies = [
+ "editdistpy>=0.1.3",
+]
+requires-python = ">=3.8"
+authors = [
+ {name = "mmb L"},
+]
+description = "Python SymSpell"
+readme = "README.md"
+license = {file = "LICENSE"}
+keywords = ["spellchecker", "symspell", "word segmentation"]
+classifiers = [
+ "Development Status :: 4 - Beta",
+ "Intended Audience :: Developers",
+ "Intended Audience :: Education",
+ "Natural Language :: English",
+ "License :: OSI Approved :: MIT License",
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+]
+
+[project.urls]
+Repository = "https://github.com/mammothb/symspellpy"
+Documentation = "https://symspellpy.readthedocs.io/en/latest"
+Changelog = "https://github.com/mammothb/symspellpy/blob/master/CHANGELOG.md"
+
[tool.basedpyright]
+ignore = ["tests"]
pythonVersion = "3.8"
reportUnusedCallResult = "none"
@@ -17,3 +52,13 @@ indent-style = "space"
line-ending = "auto"
quote-style = "double"
skip-magic-trailing-comma = false
+
+[tool.setuptools.dynamic]
+version = {attr = "symspellpy.__version__"}
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["symspellpy"]
+
+[tool.setuptools.package-data]
+symspellpy = ["frequency_*.txt"]
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 5e20998..0000000
--- a/setup.cfg
+++ /dev/null
@@ -1,40 +0,0 @@
-[metadata]
-name = symspellpy
-version = attr: symspellpy.__version__
-description = Python SymSpell
-long_description = file: README.md
-long_description_content_type = text/markdown
-author = mmb L
-url = https://github.com/mammothb/symspellpy
-project_urls =
- Documentation = https://symspellpy.readthedocs.io/en/latest/
- Changelog = https://github.com/mammothb/symspellpy/blob/master/CHANGELOG.md
-keywords = spellchecker, symspell, word segmentation
-license = MIT
-classifiers =
- Development Status :: 4 - Beta
- Intended Audience :: Developers
- Intended Audience :: Education
- Natural Language :: English
- License :: OSI Approved :: MIT License
- Programming Language :: Python
- Programming Language :: Python :: 3
- Programming Language :: Python :: 3.8
- Programming Language :: Python :: 3.9
- Programming Language :: Python :: 3.10
- Programming Language :: Python :: 3.11
- Programming Language :: Python :: 3.12
-
-[options]
-zip_safe = False
-include_package_data = True
-packages = find:
-python_requires = >=3.8
-install_requires =
- editdistpy>=0.1.3
-
-[options.package_data]
-symspellpy = frequency_*.txt
-
-[options.packages.find]
-include = symspellpy
diff --git a/symspellpy/__init__.py b/symspellpy/__init__.py
index 0a8c25d..f4b9455 100644
--- a/symspellpy/__init__.py
+++ b/symspellpy/__init__.py
@@ -1,6 +1,6 @@
# MIT License
#
-# Copyright (c) 2022 mmb L (Python port)
+# Copyright (c) 2024 mmb L (Python port)
# Copyright (c) 2021 Wolf Garbe (Original C# implementation)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -19,7 +19,7 @@
.. moduleauthor:: Wolf Garbe
"""
-__version__ = "6.7.7"
+__version__ = "6.7.8rc1"  # PEP 440 canonical pre-release form; matches built metadata
from . import editdistance, helpers, logging
from .symspellpy import SymSpell
diff --git a/symspellpy/abstract_distance_comparer.py b/symspellpy/abstract_distance_comparer.py
new file mode 100644
index 0000000..ba839fe
--- /dev/null
+++ b/symspellpy/abstract_distance_comparer.py
@@ -0,0 +1,20 @@
+from abc import ABC, abstractmethod
+
+
+class AbstractDistanceComparer(ABC):
+    """Interface for pluggable edit-distance algorithms used by ``EditDistance``."""
+
+    @abstractmethod
+    def distance(self, string_1: str, string_2: str, max_distance: int) -> int:
+        """Returns a measure of the distance between two strings.
+
+        Args:
+            string_1: One of the strings to compare.
+            string_2: The other string to compare.
+            max_distance: The maximum distance that is of interest.
+
+        Returns:
+            -1 if the distance is greater than the max_distance, 0 if the strings
+            are equivalent, otherwise a positive number whose magnitude
+            increases as the difference between the strings increases.
+        """
diff --git a/symspellpy/composition.py b/symspellpy/composition.py
index c9ebd33..e47169f 100644
--- a/symspellpy/composition.py
+++ b/symspellpy/composition.py
@@ -1,6 +1,6 @@
# MIT License
#
-# Copyright (c) 2022 mmb L (Python port)
+# Copyright (c) 2024 mmb L (Python port)
# Copyright (c) 2021 Wolf Garbe (Original C# implementation)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/symspellpy/editdistance.py b/symspellpy/editdistance.py
index e38e558..b25fd7b 100644
--- a/symspellpy/editdistance.py
+++ b/symspellpy/editdistance.py
@@ -1,6 +1,6 @@
# MIT License
#
-# Copyright (c) 2022 mmb L (Python port)
+# Copyright (c) 2024 mmb L (Python port)
# Copyright (c) 2021 Wolf Garbe (Original C# implementation)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -18,13 +18,14 @@
:synopsis: Module for edit distance algorithms.
"""
-from abc import ABC, abstractmethod
+import warnings
from enum import Enum
-from typing import List
+from typing import List, Optional
from editdistpy import damerau_osa, levenshtein
from symspellpy import helpers
+from symspellpy.abstract_distance_comparer import AbstractDistanceComparer
class DistanceAlgorithm(Enum):
@@ -34,6 +35,7 @@ class DistanceAlgorithm(Enum):
DAMERAU_OSA = 1 #: Damerau optimal string alignment algorithm
LEVENSHTEIN_FAST = 2 #: Fast Levenshtein algorithm.
DAMERAU_OSA_FAST = 3 #: Fast Damerau optimal string alignment algorithm
+ USER_PROVIDED = 4 #: User provided custom edit distance algorithm
class EditDistance:
@@ -53,7 +55,16 @@ class EditDistance:
ValueError: If `algorithm` specifies an invalid distance algorithm.
"""
- def __init__(self, algorithm: DistanceAlgorithm) -> None:
+ def __init__(
+ self,
+ algorithm: DistanceAlgorithm,
+ comparer: Optional[AbstractDistanceComparer] = None,
+ ) -> None:
+        if algorithm != DistanceAlgorithm.USER_PROVIDED and comparer is not None:
+            # Show the member name ("USER_PROVIDED"), not its numeric value ("4").
+            msg = f"A comparer is passed in but algorithm is not {DistanceAlgorithm.USER_PROVIDED.name}. A built-in comparer will be used."
+            warnings.warn(msg, stacklevel=2)  # attribute the warning to the caller
+
self._distance_comparer: AbstractDistanceComparer
self._algorithm = algorithm
if algorithm == DistanceAlgorithm.LEVENSHTEIN:
@@ -64,6 +75,12 @@ def __init__(self, algorithm: DistanceAlgorithm) -> None:
self._distance_comparer = LevenshteinFast()
elif algorithm == DistanceAlgorithm.DAMERAU_OSA_FAST:
self._distance_comparer = DamerauOsaFast()
+        elif algorithm == DistanceAlgorithm.USER_PROVIDED:
+            if not isinstance(comparer, AbstractDistanceComparer):
+                # Use the member name: the raw value ("4") means nothing to a user.
+                message = f"{algorithm.name} selected but no comparer passed in."
+                raise ValueError(message)
+            self._distance_comparer = comparer
else:
raise ValueError("unknown distance algorithm")
@@ -82,25 +99,6 @@ def compare(self, string_1: str, string_2: str, max_distance: int) -> int:
return self._distance_comparer.distance(string_1, string_2, max_distance)
-class AbstractDistanceComparer(ABC):
- """An interface to compute relative distance between two strings."""
-
- @abstractmethod
- def distance(self, string_1: str, string_2: str, max_distance: int) -> int:
- """Returns a measure of the distance between two strings.
-
- Args:
- string_1: One of the strings to compare.
- string_2: The other string to compare.
- max_distance: The maximum distance that is of interest.
-
- Returns:
- -1 if the distance is greater than the max_distance, 0 if the strings
- are equivalent, otherwise a positive number whose magnitude
- increases as difference between the strings increases.
- """
-
-
class Levenshtein(AbstractDistanceComparer):
"""Provides Levenshtein algorithm for computing edit distance metric between
two strings.
diff --git a/symspellpy/helpers.py b/symspellpy/helpers.py
index 8d36079..ddac517 100644
--- a/symspellpy/helpers.py
+++ b/symspellpy/helpers.py
@@ -1,6 +1,6 @@
# MIT License
#
-# Copyright (c) 2022 mmb L (Python port)
+# Copyright (c) 2024 mmb L (Python port)
# Copyright (c) 2021 Wolf Garbe (Original C# implementation)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/symspellpy/logging.py b/symspellpy/logging.py
index 818e1a0..64b9b92 100644
--- a/symspellpy/logging.py
+++ b/symspellpy/logging.py
@@ -1,6 +1,6 @@
# MIT License
#
-# Copyright (c) 2022 mmb L (Python port)
+# Copyright (c) 2024 mmb L (Python port)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
diff --git a/symspellpy/pickle_mixin.py b/symspellpy/pickle_mixin.py
index 59b010a..2e17158 100644
--- a/symspellpy/pickle_mixin.py
+++ b/symspellpy/pickle_mixin.py
@@ -1,6 +1,6 @@
# MIT License
#
-# Copyright (c) 2022 mmb L (Python port)
+# Copyright (c) 2024 mmb L (Python port)
# Copyright (c) 2021 Wolf Garbe (Original C# implementation)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/symspellpy/suggest_item.py b/symspellpy/suggest_item.py
index 6fc595f..e71128c 100644
--- a/symspellpy/suggest_item.py
+++ b/symspellpy/suggest_item.py
@@ -1,6 +1,6 @@
# MIT License
#
-# Copyright (c) 2022 mmb L (Python port)
+# Copyright (c) 2024 mmb L (Python port)
# Copyright (c) 2021 Wolf Garbe (Original C# implementation)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/symspellpy/symspellpy.py b/symspellpy/symspellpy.py
index 4b41b87..486626d 100644
--- a/symspellpy/symspellpy.py
+++ b/symspellpy/symspellpy.py
@@ -1,6 +1,6 @@
# MIT License
#
-# Copyright (c) 2022 mmb L (Python port)
+# Copyright (c) 2024 mmb L (Python port)
# Copyright (c) 2021 Wolf Garbe (Original C# implementation)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -84,6 +84,7 @@ def __init__(
max_dictionary_edit_distance: int = 2,
prefix_length: int = 7,
count_threshold: int = 1,
+ distance_comparer: Optional[EditDistance] = None,
) -> None:
if max_dictionary_edit_distance < 0:
raise ValueError("max_dictionary_edit_distance cannot be negative")
@@ -95,6 +96,10 @@ def __init__(
)
if count_threshold < 0:
raise ValueError("count_threshold cannot be negative")
+ if distance_comparer is None:
+ self.distance_comparer = EditDistance(DistanceAlgorithm.DAMERAU_OSA_FAST)
+ else:
+ self.distance_comparer = distance_comparer
self._words: Dict[str, int] = {}
self._below_threshold_words: Dict[str, int] = {}
self._bigrams: Dict[str, int] = {}
@@ -104,7 +109,7 @@ def __init__(
self._max_dictionary_edit_distance = max_dictionary_edit_distance
self._prefix_length = prefix_length
self._count_threshold = count_threshold
- self._distance_algorithm = DistanceAlgorithm.DAMERAU_OSA_FAST
+        # The distance algorithm is now owned by self.distance_comparer (set above).
self._max_length = 0
@property
@@ -129,19 +134,6 @@ def deletes(self) -> Dict[str, List[str]]:
"""
return self._deletes
- @property
- def distance_algorithm(self) -> DistanceAlgorithm:
- """The current distance algorithm."""
- return self._distance_algorithm
-
- @distance_algorithm.setter
- def distance_algorithm(self, value: DistanceAlgorithm) -> None:
- if not isinstance(value, DistanceAlgorithm):
- raise TypeError(
- "can only assign DistanceAlgorithm type values to distance_algorithm"
- )
- self._distance_algorithm = value
-
@property
def entry_count(self) -> int:
"""Number of unique correct spelling words."""
@@ -445,7 +437,6 @@ def early_exit():
candidates.append(phrase[:phrase_prefix_len])
else:
candidates.append(phrase)
- distance_comparer = EditDistance(self._distance_algorithm)
while candidate_pointer < len(candidates):
candidate = candidates[candidate_pointer]
candidate_pointer += 1
@@ -577,7 +568,7 @@ def early_exit():
if suggestion in considered_suggestions:
continue
considered_suggestions.add(suggestion)
- distance = distance_comparer.compare(
+ distance = self.distance_comparer.compare(
phrase, suggestion, max_edit_distance_2
)
if distance < 0:
@@ -683,7 +674,6 @@ def lookup_compound(
)
suggestions = []
suggestion_parts: List[SuggestItem] = []
- distance_comparer = EditDistance(self._distance_algorithm)
# translate every item to its best suggestion, otherwise it remains
# unchanged
@@ -761,7 +751,7 @@ def lookup_compound(
continue
# select best suggestion for split pair
tmp_term = f"{suggestions_1[0].term} {suggestions_2[0].term}"
- tmp_distance = distance_comparer.compare(
+ tmp_distance = self.distance_comparer.compare(
terms_1[i], tmp_term, max_edit_distance
)
if tmp_distance < 0:
@@ -858,7 +848,7 @@ def lookup_compound(
joined_term = helpers.case_transfer_similar(phrase, joined_term)
suggestion = SuggestItem(
joined_term,
- distance_comparer.compare(phrase, joined_term, 2**31 - 1),
+ self.distance_comparer.compare(phrase, joined_term, 2**31 - 1),
int(joined_count),
)
return [suggestion]
diff --git a/symspellpy/verbosity.py b/symspellpy/verbosity.py
index 89b6816..4bf52bd 100644
--- a/symspellpy/verbosity.py
+++ b/symspellpy/verbosity.py
@@ -1,6 +1,6 @@
# MIT License
#
-# Copyright (c) 2022 mmb L (Python port)
+# Copyright (c) 2024 mmb L (Python port)
# Copyright (c) 2021 Wolf Garbe (Original C# implementation)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/tests/fortests/below_threshold_dict.txt b/tests/fortests/below_threshold_dict.txt
new file mode 100644
index 0000000..c2e74b4
--- /dev/null
+++ b/tests/fortests/below_threshold_dict.txt
@@ -0,0 +1,3 @@
+below 8
+threshold 10
+word 10
diff --git a/tests/test_editdistance.py b/tests/test_editdistance.py
index 1042862..63d83af 100644
--- a/tests/test_editdistance.py
+++ b/tests/test_editdistance.py
@@ -3,8 +3,8 @@
import pytest
+from symspellpy.abstract_distance_comparer import AbstractDistanceComparer
from symspellpy.editdistance import (
- AbstractDistanceComparer,
DamerauOsa,
DamerauOsaFast,
DistanceAlgorithm,
@@ -62,6 +62,11 @@ def expected_damerau_osa(string_1, string_2, max_distance):
return distance if distance <= max_distance else -1
+class CustomDistanceComparer(AbstractDistanceComparer):
+    def distance(self, string_1: str, string_2: str, max_distance: int) -> int:
+        return -2  # constant outside the interface's documented -1/0/positive results
+
+
@pytest.fixture(
params=["damerau_osa", "levenshtein", "damerau_osa_fast", "levenshtein_fast"]
)
@@ -140,6 +145,11 @@ def test_unknown_distance_algorithm(self):
_ = EditDistance(2)
assert "unknown distance algorithm" == str(excinfo.value)
+ def test_missing_custom_comparer(self):
+ with pytest.raises(ValueError) as excinfo:
+ _ = EditDistance(DistanceAlgorithm.USER_PROVIDED)
+ assert "no comparer passed in" in str(excinfo.value)
+
def test_abstract_distance_comparer(self):
with pytest.raises(TypeError) as excinfo:
comparer = AbstractDistanceComparer()
@@ -148,6 +158,11 @@ def test_abstract_distance_comparer(self):
"Can't instantiate abstract class AbstractDistanceComparer"
)
+ def test_warn_when_builtin_comparer_override_custom_comparer(self):
+ with pytest.warns(UserWarning, match="A built-in comparer will be used.$"):
+ comparer = CustomDistanceComparer()
+ edit_distance = EditDistance(DistanceAlgorithm.LEVENSHTEIN, comparer)
+
def test_internal_distance_comparer(self, get_edit_distance):
edit_distance, expected = get_edit_distance
assert isinstance(edit_distance._distance_comparer, expected)
@@ -162,6 +177,15 @@ def test_comparer_match_ref(self, get_comparer, get_strings):
s1, s2, max_distance
)
+    def test_editdistance_use_custom_comparer(self, get_strings):
+        strings, max_distance = get_strings
+        comparer = CustomDistanceComparer()
+        edit_distance = EditDistance(DistanceAlgorithm.USER_PROVIDED, comparer)
+        # Go through EditDistance.compare so the USER_PROVIDED wiring is what is tested.
+        for s1 in strings:
+            for s2 in strings:
+                assert -2 == edit_distance.compare(s1, s2, max_distance)
+
def test_comparer_null_distance(self, get_comparer, get_short_and_long_strings):
comparer, _ = get_comparer
diff --git a/tests/test_symspellpy.py b/tests/test_symspellpy.py
index b7c4cd7..ab48be6 100644
--- a/tests/test_symspellpy.py
+++ b/tests/test_symspellpy.py
@@ -4,11 +4,13 @@
import pytest
from symspellpy import SymSpell, Verbosity
-from symspellpy.editdistance import DistanceAlgorithm
+from symspellpy.abstract_distance_comparer import AbstractDistanceComparer
+from symspellpy.editdistance import DistanceAlgorithm, EditDistance
from symspellpy.helpers import DictIO
FORTESTS_DIR = Path(__file__).resolve().parent / "fortests"
BAD_DICT_PATH = FORTESTS_DIR / "bad_dict.txt"
+BELOW_THRESHOLD_DICT_PATH = FORTESTS_DIR / "below_threshold_dict.txt"
BIG_MODIFIED_PATH = FORTESTS_DIR / "big_modified.txt"
BIG_WORDS_PATH = FORTESTS_DIR / "big_words.txt"
NON_EN_DICT_PATH = FORTESTS_DIR / "non_en_dict.txt"
@@ -35,6 +37,11 @@ def get_dictionary_stream(request):
yield dict_stream, request.param
+class CustomDistanceComparer(AbstractDistanceComparer):
+    def distance(self, string_1: str, string_2: str, max_distance: int) -> int:
+        return 0  # stub: every pair is reported as identical
+
+
class TestSymSpellPy:
def test_negative_max_dictionary_edit_distance(self):
with pytest.raises(ValueError) as excinfo:
@@ -63,26 +70,13 @@ def test_negative_count_threshold(self):
_ = SymSpell(1, 3, -1)
assert "count_threshold cannot be negative" == str(excinfo.value)
- @pytest.mark.parametrize(
- "algorithm",
- [
- DistanceAlgorithm.LEVENSHTEIN,
- DistanceAlgorithm.DAMERAU_OSA,
- DistanceAlgorithm.LEVENSHTEIN_FAST,
- DistanceAlgorithm.DAMERAU_OSA_FAST,
- ],
- )
- def test_set_distance_algorithm(self, symspell_default, algorithm):
- symspell_default.distance_algorithm = algorithm
- assert algorithm == symspell_default.distance_algorithm
-
- def test_set_invalid_distance_algorithm(self, symspell_default):
- with pytest.raises(TypeError) as excinfo:
- symspell_default.distance_algorithm = 1
- assert (
- "can only assign DistanceAlgorithm type values to distance_algorithm"
- == str(excinfo.value)
+ def test_set_distance_comparer(self):
+ distance_comparer = EditDistance(
+ DistanceAlgorithm.USER_PROVIDED, CustomDistanceComparer()
)
+ sym_spell = SymSpell(distance_comparer=distance_comparer)
+
+ assert distance_comparer == sym_spell.distance_comparer
@pytest.mark.parametrize("symspell_short", [None, 0], indirect=True)
def test_create_dictionary_entry_negative_count(self, symspell_short):
@@ -196,6 +190,15 @@ def test_load_dictionary_count(self, symspell_default, dictionary_path):
assert 82834 == symspell_default.word_count
assert 676094 == symspell_default.entry_count
+ @pytest.mark.parametrize("symspell_short", [10], indirect=True)
+ def test_load_dictionary_below_threshold(self, symspell_short):
+ symspell_short.load_dictionary(BELOW_THRESHOLD_DICT_PATH, 0, 1)
+
+ assert 1 == len(symspell_short.below_threshold_words)
+ assert 8 == symspell_short.below_threshold_words["below"]
+
+ assert 2 == symspell_short.word_count
+
def test_load_dictionary_separator(self, symspell_default):
assert symspell_default.load_dictionary(SEPARATOR_DICT_PATH, 0, 1, SEPARATOR)
assert 5 == symspell_default.word_count