From b6180ae78137f883e1ee91b6902315c7c2793105 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Mon, 5 Jul 2021 05:51:39 +0200 Subject: [PATCH] changed default value of kwarg max_n_matches to #strings in master --- CHANGELOG.md | 7 +++++++ README.md | 2 +- string_grouper/string_grouper.py | 6 +++--- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e73d33fd..f884329a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +## [0.5.1?] - 2021-07-05 + +* Improved the performance of the function `match_most_similar`. +* Changed the default value of the keyword argument `max_n_matches` to the number of strings in `master`. (`max_n_matches` is now defined as the maximum number of matches allowed per string in `duplicates` \[or `master` if `duplicates` is not given\]). + ## [0.5.0] - 2021-06-11 ### Added diff --git a/README.md b/README.md index 1b18c3c9..198669c2 100644 --- a/README.md +++ b/README.md @@ -136,7 +136,7 @@ All functions are built using a class **`StringGrouper`**. This class can be use * **`ngram_size`**: The amount of characters in each n-gram. Default is `3`. * **`tfidf_matrix_dtype`**: The datatype for the tf-idf values of the matrix components. Allowed values are `numpy.float32` and `numpy.float64`. Default is `numpy.float32`. (Note: `numpy.float32` often leads to faster processing and a smaller memory footprint albeit less numerical precision than `numpy.float64`.) * **`regex`**: The regex string used to clean-up the input string. Default is `"[,-./]|\s"`. - * **`max_n_matches`**: The maximum number of matches allowed per string in `master`. Default is the number of strings in `duplicates` (or `master`, if `duplicates` is not given). 
+ * **`max_n_matches`**: The maximum number of matches allowed per string in `duplicates` (or `master` if `duplicates` is not given). Default is the number of strings in `master`. * **`min_similarity`**: The minimum cosine similarity for two strings to be considered a match. Defaults to `0.8` * **`number_of_processes`**: The number of processes used by the cosine similarity calculation. Defaults to diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 2ac60825..14e14428 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -13,7 +13,6 @@ DEFAULT_NGRAM_SIZE: int = 3 DEFAULT_TFIDF_MATRIX_DTYPE: type = np.float32 # (only types np.float32 and np.float64 are allowed by sparse_dot_topn) DEFAULT_REGEX: str = r'[,-./]|\s' -DEFAULT_MAX_N_MATCHES: int = 20 DEFAULT_MIN_SIMILARITY: float = 0.8 # minimum cosine similarity for an item to be considered a match DEFAULT_N_PROCESSES: int = multiprocessing.cpu_count() - 1 DEFAULT_IGNORE_CASE: bool = True # ignores case by default @@ -209,7 +208,8 @@ class StringGrouperConfig(NamedTuple): (Note: np.float32 often leads to faster processing and a smaller memory footprint albeit less precision than np.float64.) :param regex: str. The regex string used to cleanup the input string. Default is '[,-./]|\s'. - :param max_n_matches: int. The maximum number of matches allowed per string. Default is 20. + :param max_n_matches: int. The maximum number of matches allowed per string in `duplicates` (or `master` + if `duplicates` is not given). Default will be set by StringGrouper. :param min_similarity: float. The minimum cosine similarity for two strings to be considered a match. Defaults to 0.8. :param number_of_processes: int. The number of processes used by the cosine similarity calculation. 
@@ -297,7 +297,7 @@ def __init__(self, master: pd.Series, self._config: StringGrouperConfig = StringGrouperConfig(**kwargs) if self._config.max_n_matches is None: - self._max_n_matches = len(self._master) if self._duplicates is None else len(self._duplicates) + self._max_n_matches = len(self._master) else: self._max_n_matches = self._config.max_n_matches