From 21c029e904230acb311b8ce7a903bb28cd6a8d61 Mon Sep 17 00:00:00 2001 From: Ruben Menke Date: Mon, 6 May 2024 10:18:11 +0200 Subject: [PATCH 01/14] started moving 0.5 to modernity. Works on 3.10 and 3.11 --- .gitignore | 5 + setup.py | 2 +- string_grouper/__init__.py | 10 +- string_grouper/string_grouper.py | 566 +++++++---- string_grouper/test/test_string_grouper.py | 909 +++++++++++------- string_grouper_utils/__init__.py | 7 +- string_grouper_utils/string_grouper_utils.py | 142 +-- .../test/test_string_grouper_utils.py | 339 ++++--- 8 files changed, 1255 insertions(+), 725 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..605c29c --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +env/ +env*/ +__pycache__/ +*.pyc +*.egg-info \ No newline at end of file diff --git a/setup.py b/setup.py index ec4bbb1..29b0270 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,6 @@ , 'scipy' , 'scikit-learn' , 'numpy' - , 'sparse_dot_topn>=0.3.1' + , 'sparse_dot_topn==0.3.6' #1.1.1 ] ) diff --git a/string_grouper/__init__.py b/string_grouper/__init__.py index 84e3abd..bcbd349 100644 --- a/string_grouper/__init__.py +++ b/string_grouper/__init__.py @@ -1,2 +1,8 @@ -from .string_grouper import compute_pairwise_similarities, group_similar_strings, match_most_similar, match_strings, \ -StringGrouperConfig, StringGrouper +from .string_grouper import ( + compute_pairwise_similarities, + group_similar_strings, + match_most_similar, + match_strings, + StringGrouperConfig, + StringGrouper, +) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index d161251..aec1239 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -3,48 +3,64 @@ import re import multiprocessing from sklearn.feature_extraction.text import TfidfVectorizer -from scipy.sparse.csr import csr_matrix -from scipy.sparse.lil import lil_matrix +from scipy.sparse import csr_matrix +from scipy.sparse import 
lil_matrix from scipy.sparse.csgraph import connected_components from typing import Tuple, NamedTuple, List, Optional, Union from sparse_dot_topn import awesome_cossim_topn from functools import wraps DEFAULT_NGRAM_SIZE: int = 3 -DEFAULT_TFIDF_MATRIX_DTYPE: type = np.float32 # (only types np.float32 and np.float64 are allowed by sparse_dot_topn) -DEFAULT_REGEX: str = r'[,-./]|\s' +DEFAULT_TFIDF_MATRIX_DTYPE: type = ( + np.float64 +) # (only types np.float32 and np.float64 are allowed by sparse_dot_topn) +DEFAULT_REGEX: str = r"[,-./]|\s" DEFAULT_MAX_N_MATCHES: int = 20 -DEFAULT_MIN_SIMILARITY: float = 0.8 # minimum cosine similarity for an item to be considered a match +DEFAULT_MIN_SIMILARITY: float = ( + 0.8 # minimum cosine similarity for an item to be considered a match +) DEFAULT_N_PROCESSES: int = multiprocessing.cpu_count() - 1 DEFAULT_IGNORE_CASE: bool = True # ignores case by default DEFAULT_DROP_INDEX: bool = False # includes index-columns in output -DEFAULT_REPLACE_NA: bool = False # when finding the most similar strings, does not replace NaN values in most +DEFAULT_REPLACE_NA: bool = ( + False # when finding the most similar strings, does not replace NaN values in most +) # similar string index-columns with corresponding duplicates-index values -DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity +DEFAULT_INCLUDE_ZEROES: bool = ( + True # when the minimum cosine similarity <=0, determines whether zero-similarity +) # matches appear in the output -GROUP_REP_CENTROID: str = 'centroid' # Option value to select the string in each group with the largest +GROUP_REP_CENTROID: str = ( + "centroid" # Option value to select the string in each group with the largest +) # similarity aggregate as group-representative: -GROUP_REP_FIRST: str = 'first' # Option value to select the first string in each group as group-representative: -DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID # chooses group centroid as 
group-representative by default +GROUP_REP_FIRST: str = "first" # Option value to select the first string in each group as group-representative: +DEFAULT_GROUP_REP: str = ( + GROUP_REP_CENTROID # chooses group centroid as group-representative by default +) # The following string constants are used by (but aren't [yet] options passed to) StringGrouper -DEFAULT_COLUMN_NAME: str = 'side' # used to name non-index columns of the output of StringGrouper.get_matches -DEFAULT_ID_NAME: str = 'id' # used to name id-columns in the output of StringGrouper.get_matches -LEFT_PREFIX: str = 'left_' # used to prefix columns on the left of the output of StringGrouper.get_matches -RIGHT_PREFIX: str = 'right_' # used to prefix columns on the right of the output of StringGrouper.get_matches -MOST_SIMILAR_PREFIX: str = 'most_similar_' # used to prefix columns of the output of +DEFAULT_COLUMN_NAME: str = ( + "side" # used to name non-index columns of the output of StringGrouper.get_matches +) +DEFAULT_ID_NAME: str = ( + "id" # used to name id-columns in the output of StringGrouper.get_matches +) +LEFT_PREFIX: str = "left_" # used to prefix columns on the left of the output of StringGrouper.get_matches +RIGHT_PREFIX: str = "right_" # used to prefix columns on the right of the output of StringGrouper.get_matches +MOST_SIMILAR_PREFIX: str = "most_similar_" # used to prefix columns of the output of # StringGrouper._get_nearest_matches -DEFAULT_MASTER_NAME: str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches -DEFAULT_MASTER_ID_NAME: str = f'{DEFAULT_MASTER_NAME}_{DEFAULT_ID_NAME}' # used to name id-column of the output of +DEFAULT_MASTER_NAME: str = "master" # used to name non-index column of the output of StringGrouper.get_nearest_matches +DEFAULT_MASTER_ID_NAME: str = f"{DEFAULT_MASTER_NAME}_{DEFAULT_ID_NAME}" # used to name id-column of the output of # StringGrouper.get_nearest_matches -GROUP_REP_PREFIX: str = 'group_rep_' # used to prefix and 
name columns of the output of StringGrouper._deduplicate +GROUP_REP_PREFIX: str = "group_rep_" # used to prefix and name columns of the output of StringGrouper._deduplicate # High level functions -def compute_pairwise_similarities(string_series_1: pd.Series, - string_series_2: pd.Series, - **kwargs) -> pd.Series: +def compute_pairwise_similarities( + string_series_1: pd.Series, string_series_2: pd.Series, **kwargs +) -> pd.Series: """ Computes the similarity scores between two Series of strings row-wise. @@ -56,9 +72,9 @@ def compute_pairwise_similarities(string_series_1: pd.Series, return StringGrouper(string_series_1, string_series_2, **kwargs).dot() -def group_similar_strings(strings_to_group: pd.Series, - string_ids: Optional[pd.Series] = None, - **kwargs) -> Union[pd.DataFrame, pd.Series]: +def group_similar_strings( + strings_to_group: pd.Series, string_ids: Optional[pd.Series] = None, **kwargs +) -> Union[pd.DataFrame, pd.Series]: """ If 'string_ids' is not given, finds all similar strings in 'strings_to_group' and returns a Series of strings of the same length as 'strings_to_group'. For each group of similar strings a single string @@ -74,15 +90,19 @@ def group_similar_strings(strings_to_group: pd.Series, :param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional) :return: pandas.Series or pandas.DataFrame. 
""" - string_grouper = StringGrouper(strings_to_group, master_id=string_ids, **kwargs).fit() + string_grouper = StringGrouper( + strings_to_group, master_id=string_ids, **kwargs + ).fit() return string_grouper.get_groups() -def match_most_similar(master: pd.Series, - duplicates: pd.Series, - master_id: Optional[pd.Series] = None, - duplicates_id: Optional[pd.Series] = None, - **kwargs) -> Union[pd.DataFrame, pd.Series]: +def match_most_similar( + master: pd.Series, + duplicates: pd.Series, + master_id: Optional[pd.Series] = None, + duplicates_id: Optional[pd.Series] = None, + **kwargs, +) -> Union[pd.DataFrame, pd.Series]: """ If no IDs ('master_id' and 'duplicates_id') are given, returns a Series of strings of the same length as 'duplicates' where for each string in duplicates the most similar string in 'master' is returned. @@ -103,19 +123,23 @@ def match_most_similar(master: pd.Series, :param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional) :return: pandas.Series or pandas.DataFrame. """ - string_grouper = StringGrouper(master, - duplicates=duplicates, - master_id=master_id, - duplicates_id=duplicates_id, - **kwargs).fit() + string_grouper = StringGrouper( + master, + duplicates=duplicates, + master_id=master_id, + duplicates_id=duplicates_id, + **kwargs, + ).fit() return string_grouper.get_groups() -def match_strings(master: pd.Series, - duplicates: Optional[pd.Series] = None, - master_id: Optional[pd.Series] = None, - duplicates_id: Optional[pd.Series] = None, - **kwargs) -> pd.DataFrame: +def match_strings( + master: pd.Series, + duplicates: Optional[pd.Series] = None, + master_id: Optional[pd.Series] = None, + duplicates_id: Optional[pd.Series] = None, + **kwargs, +) -> pd.DataFrame: """ Returns all highly similar strings. If only 'master' is given, it will return highly similar strings within master. This can be seen as an self-join. 
If both master and duplicates is given, it will return highly similar strings @@ -128,11 +152,13 @@ def match_strings(master: pd.Series, :param kwargs: All other keyword arguments are passed to StringGrouperConfig. :return: pandas.Dataframe. """ - string_grouper = StringGrouper(master, - duplicates=duplicates, - master_id=master_id, - duplicates_id=duplicates_id, - **kwargs).fit() + string_grouper = StringGrouper( + master, + duplicates=duplicates, + master_id=master_id, + duplicates_id=duplicates_id, + **kwargs, + ).fit() return string_grouper.get_matches() @@ -182,24 +208,30 @@ def wrapper(*args, **kwargs): if args[0].is_build: return f(*args, **kwargs) else: - raise StringGrouperNotFitException(f'{f.__name__} was called before the "fit" function was called.' - f' Make sure to run fit the StringGrouper first using ' - f'StringGrouper.fit()') + raise StringGrouperNotFitException( + f'{f.__name__} was called before the "fit" function was called.' + f" Make sure to run fit the StringGrouper first using " + f"StringGrouper.fit()" + ) return wrapper class StringGrouperNotFitException(Exception): """Raised when one of the public functions is called which requires the StringGrouper to be fit first""" + pass class StringGrouper(object): - def __init__(self, master: pd.Series, - duplicates: Optional[pd.Series] = None, - master_id: Optional[pd.Series] = None, - duplicates_id: Optional[pd.Series] = None, - **kwargs): + def __init__( + self, + master: pd.Series, + duplicates: Optional[pd.Series] = None, + master_id: Optional[pd.Series] = None, + duplicates_id: Optional[pd.Series] = None, + **kwargs, + ): """ StringGrouper is a class that holds the matrix with cosine similarities between the master and duplicates matrix. If duplicates is not given it is replaced by master. 
To build this matrix the `fit` function must be @@ -213,22 +245,32 @@ def __init__(self, master: pd.Series, :param kwargs: All other keyword arguments are passed to StringGrouperConfig """ # Validate match strings input - if not StringGrouper._is_series_of_strings(master) or \ - (duplicates is not None and not StringGrouper._is_series_of_strings(duplicates)): - raise TypeError('Input does not consist of pandas.Series containing only Strings') + if not StringGrouper._is_series_of_strings(master) or ( + duplicates is not None + and not StringGrouper._is_series_of_strings(duplicates) + ): + raise TypeError( + "Input does not consist of pandas.Series containing only Strings" + ) # Validate optional IDs input - if not StringGrouper._is_input_data_combination_valid(duplicates, master_id, duplicates_id): - raise Exception('List of data Series options is invalid') + if not StringGrouper._is_input_data_combination_valid( + duplicates, master_id, duplicates_id + ): + raise Exception("List of data Series options is invalid") StringGrouper._validate_id_data(master, duplicates, master_id, duplicates_id) self._master: pd.Series = master self._duplicates: pd.Series = duplicates if duplicates is not None else None self._master_id: pd.Series = master_id if master_id is not None else None - self._duplicates_id: pd.Series = duplicates_id if duplicates_id is not None else None + self._duplicates_id: pd.Series = ( + duplicates_id if duplicates_id is not None else None + ) self._config: StringGrouperConfig = StringGrouperConfig(**kwargs) if self._config.max_n_matches is None: - self._max_n_matches = len(self._master) if self._duplicates is None else len(self._duplicates) + self._max_n_matches = ( + len(self._master) if self._duplicates is None else len(self._duplicates) + ) else: self._max_n_matches = self._config.max_n_matches @@ -236,7 +278,9 @@ def __init__(self, master: pd.Series, self._validate_tfidf_matrix_dtype() self._validate_replace_na_and_drop() self.is_build = False # 
indicates if the grouper was fit or not - self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams, dtype=self._config.tfidf_matrix_dtype) + self._vectorizer = TfidfVectorizer( + min_df=1, analyzer=self.n_grams, dtype=self._config.tfidf_matrix_dtype + ) # After the StringGrouper is built, _matches_list will contain the indices and similarities of the matches self._matches_list: pd.DataFrame = pd.DataFrame() # _true_max_n_matches will contain the true maximum number of matches over all strings in master if @@ -252,16 +296,18 @@ def n_grams(self, string: str) -> List[str]: regex_pattern = self._config.regex if self._config.ignore_case and string is not None: string = string.lower() # lowercase to ignore all case - string = re.sub(regex_pattern, r'', string) + string = re.sub(regex_pattern, r"", string) n_grams = zip(*[string[i:] for i in range(ngram_size)]) - return [''.join(n_gram) for n_gram in n_grams] + return ["".join(n_gram) for n_gram in n_grams] - def fit(self) -> 'StringGrouper': + def fit(self) -> "StringGrouper": """Builds the _matches list which contains string matches indices and similarity""" master_matrix, duplicate_matrix = self._get_tf_idf_matrices() # Calculate the matches using the cosine similarity - matches, self._true_max_n_matches = self._build_matches(master_matrix, duplicate_matrix) + matches, self._true_max_n_matches = self._build_matches( + master_matrix, duplicate_matrix + ) if self._duplicates is None: # convert to lil format for best efficiency when setting matrix-elements @@ -282,16 +328,22 @@ def fit(self) -> 'StringGrouper': def dot(self) -> pd.Series: """Computes the row-wise similarity scores between strings in _master and _duplicates""" if len(self._master) != len(self._duplicates): - raise Exception("To perform this function, both input Series must have the same length.") + raise Exception( + "To perform this function, both input Series must have the same length." 
+ ) master_matrix, duplicate_matrix = self._get_tf_idf_matrices() # Calculate pairwise cosine similarities: - pairwise_similarities = np.asarray(master_matrix.multiply(duplicate_matrix).sum(axis=1)).squeeze(axis=1) - return pd.Series(pairwise_similarities, name='similarity', index=self._master.index) + pairwise_similarities = np.asarray( + master_matrix.multiply(duplicate_matrix).sum(axis=1) + ).squeeze(axis=1) + return pd.Series( + pairwise_similarities, name="similarity", index=self._master.index + ) @validate_is_fit - def get_matches(self, - ignore_index: Optional[bool] = None, - include_zeroes: Optional[bool] = None) -> pd.DataFrame: + def get_matches( + self, ignore_index: Optional[bool] = None, include_zeroes: Optional[bool] = None + ) -> pd.DataFrame: """ Returns a DataFrame with all the matches and their cosine similarity. If optional IDs are used, returned as extra columns with IDs matched to respective data rows @@ -301,10 +353,13 @@ def get_matches(self, :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches appear in the output. Defaults to self._config.include_zeroes. 
""" - def get_both_sides(master: pd.Series, - duplicates: pd.Series, - generic_name=(DEFAULT_COLUMN_NAME, DEFAULT_COLUMN_NAME), - drop_index=False): + + def get_both_sides( + master: pd.Series, + duplicates: pd.Series, + generic_name=(DEFAULT_COLUMN_NAME, DEFAULT_COLUMN_NAME), + drop_index=False, + ): lname, rname = generic_name left = master if master.name else master.rename(lname) left = left.iloc[matches_list.master_side].reset_index(drop=drop_index) @@ -313,7 +368,9 @@ def get_both_sides(master: pd.Series, else: right = duplicates if duplicates.name else duplicates.rename(rname) right = right.iloc[matches_list.dupe_side].reset_index(drop=drop_index) - return left, (right if isinstance(right, pd.Series) else right[right.columns[::-1]]) + return left, ( + right if isinstance(right, pd.Series) else right[right.columns[::-1]] + ) def prefix_column_names(data: Union[pd.Series, pd.DataFrame], prefix: str): if isinstance(data, pd.DataFrame): @@ -332,26 +389,33 @@ def prefix_column_names(data: Union[pd.Series, pd.DataFrame], prefix: str): # the fix includes zero-similarity matches that are missing by default # in _matches_list due to our use of sparse matrices non_matches_list = self._get_non_matches_list() - matches_list = self._matches_list if non_matches_list.empty else \ - pd.concat([self._matches_list, non_matches_list], axis=0, ignore_index=True) + matches_list = ( + self._matches_list + if non_matches_list.empty + else pd.concat( + [self._matches_list, non_matches_list], axis=0, ignore_index=True + ) + ) - left_side, right_side = get_both_sides(self._master, self._duplicates, drop_index=ignore_index) + left_side, right_side = get_both_sides( + self._master, self._duplicates, drop_index=ignore_index + ) similarity = matches_list.similarity.reset_index(drop=True) if self._master_id is None: return pd.concat( [ prefix_column_names(left_side, LEFT_PREFIX), similarity, - prefix_column_names(right_side, RIGHT_PREFIX) + prefix_column_names(right_side, RIGHT_PREFIX), ], 
- axis=1 + axis=1, ) else: left_side_id, right_side_id = get_both_sides( self._master_id, self._duplicates_id, (DEFAULT_ID_NAME, DEFAULT_ID_NAME), - drop_index=True + drop_index=True, ) return pd.concat( [ @@ -359,15 +423,15 @@ def prefix_column_names(data: Union[pd.Series, pd.DataFrame], prefix: str): prefix_column_names(left_side_id, LEFT_PREFIX), similarity, prefix_column_names(right_side_id, RIGHT_PREFIX), - prefix_column_names(right_side, RIGHT_PREFIX) + prefix_column_names(right_side, RIGHT_PREFIX), ], - axis=1 + axis=1, ) @validate_is_fit - def get_groups(self, - ignore_index: Optional[bool] = None, - replace_na: Optional[bool] = None) -> Union[pd.DataFrame, pd.Series]: + def get_groups( + self, ignore_index: Optional[bool] = None, replace_na: Optional[bool] = None + ) -> Union[pd.DataFrame, pd.Series]: """If there is only a master Series of strings, this will return a Series of 'master' strings. A single string in a group of near duplicates is chosen as 'master' and is returned for each string in the master Series. @@ -380,7 +444,7 @@ def get_groups(self, self._config.ignore_index. :param replace_na: whether or not to replace NaN values in most similar string index-columns with corresponding duplicates-index values. Defaults to self._config.replace_na. 
- """ + """ if ignore_index is None: ignore_index = self._config.ignore_index if self._duplicates is None: @@ -388,33 +452,41 @@ def get_groups(self, else: if replace_na is None: replace_na = self._config.replace_na - return self._get_nearest_matches(ignore_index=ignore_index, replace_na=replace_na) + return self._get_nearest_matches( + ignore_index=ignore_index, replace_na=replace_na + ) @validate_is_fit - def add_match(self, master_side: str, dupe_side: str) -> 'StringGrouper': + def add_match(self, master_side: str, dupe_side: str) -> "StringGrouper": """Adds a match if it wasn't found by the fit function""" master_indices, dupe_indices = self._get_indices_of(master_side, dupe_side) # add prior matches to new match - prior_matches = self._matches_list.master_side[self._matches_list.dupe_side.isin(dupe_indices)] - dupe_indices = dupe_indices.append(prior_matches) + prior_matches = self._matches_list.master_side[ + self._matches_list.dupe_side.isin(dupe_indices) + ] + dupe_indices = dupe_indices._append(prior_matches) dupe_indices.drop_duplicates(inplace=True) similarities = [1] # cross join the indices - new_matches = StringGrouper._cross_join(dupe_indices, master_indices, similarities) + new_matches = StringGrouper._cross_join( + dupe_indices, master_indices, similarities + ) # If we are de-duping within one Series, we need to make sure the matches stay symmetric if self._duplicates is None: new_matches = StringGrouper._make_symmetric(new_matches) # update the matches - self._matches_list = pd.concat([self._matches_list.drop_duplicates(), new_matches], ignore_index=True) + self._matches_list = pd.concat( + [self._matches_list.drop_duplicates(), new_matches], ignore_index=True + ) return self @validate_is_fit - def remove_match(self, master_side: str, dupe_side: str) -> 'StringGrouper': - """ Removes a match from the StringGrouper""" + def remove_match(self, master_side: str, dupe_side: str) -> "StringGrouper": + """Removes a match from the StringGrouper""" 
master_indices, dupe_indices = self._get_indices_of(master_side, dupe_side) # In the case of having only a master series, we need to remove both the master - dupe match # and the dupe - master match: @@ -424,9 +496,10 @@ def remove_match(self, master_side: str, dupe_side: str) -> 'StringGrouper': self._matches_list = self._matches_list[ ~( - (self._matches_list.master_side.isin(master_indices)) & - (self._matches_list.dupe_side.isin(dupe_indices)) - )] + (self._matches_list.master_side.isin(master_indices)) + & (self._matches_list.dupe_side.isin(dupe_indices)) + ) + ] return self def _get_tf_idf_matrices(self) -> Tuple[csr_matrix, csr_matrix]: @@ -453,106 +526,170 @@ def _fit_vectorizer(self) -> TfidfVectorizer: self._vectorizer.fit(strings) return self._vectorizer - def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix) -> csr_matrix: + def _build_matches( + self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix + ) -> csr_matrix: """Builds the cossine similarity matrix of two csr matrices""" tf_idf_matrix_1 = master_matrix tf_idf_matrix_2 = duplicate_matrix.transpose() optional_kwargs = { - 'return_best_ntop': True, - 'use_threads': self._config.number_of_processes > 1, - 'n_jobs': self._config.number_of_processes + "return_best_ntop": True, + "use_threads": self._config.number_of_processes > 1, + "n_jobs": self._config.number_of_processes, } return awesome_cossim_topn( - tf_idf_matrix_1, tf_idf_matrix_2, + tf_idf_matrix_1, + tf_idf_matrix_2, self._max_n_matches, self._config.min_similarity, - **optional_kwargs + **optional_kwargs, ) def _get_non_matches_list(self) -> pd.DataFrame: """Returns a list of all the indices of non-matching pairs (with similarity set to 0)""" - m_sz, d_sz = len(self._master), len(self._master if self._duplicates is None else self._duplicates) - all_pairs = pd.MultiIndex.from_product([range(m_sz), range(d_sz)], names=['master_side', 'dupe_side']) - matched_pairs = 
pd.MultiIndex.from_frame(self._matches_list[['master_side', 'dupe_side']]) + m_sz, d_sz = len(self._master), len( + self._master if self._duplicates is None else self._duplicates + ) + all_pairs = pd.MultiIndex.from_product( + [range(m_sz), range(d_sz)], names=["master_side", "dupe_side"] + ) + matched_pairs = pd.MultiIndex.from_frame( + self._matches_list[["master_side", "dupe_side"]] + ) missing_pairs = all_pairs.difference(matched_pairs) if missing_pairs.empty: return pd.DataFrame() - if (self._max_n_matches < self._true_max_n_matches): - raise Exception(f'\nERROR: Cannot return zero-similarity matches since \n' - f'\t\t max_n_matches={self._max_n_matches} is too small!\n' - f'\t\t Try setting max_n_matches={self._true_max_n_matches} (the \n' - f'\t\t true maximum number of matches over all strings in master)\n' - f'\t\t or greater or do not set this kwarg at all.') + if self._max_n_matches < self._true_max_n_matches: + raise Exception( + f"\nERROR: Cannot return zero-similarity matches since \n" + f"\t\t max_n_matches={self._max_n_matches} is too small!\n" + f"\t\t Try setting max_n_matches={self._true_max_n_matches} (the \n" + f"\t\t true maximum number of matches over all strings in master)\n" + f"\t\t or greater or do not set this kwarg at all." 
+ ) missing_pairs = missing_pairs.to_frame(index=False) - missing_pairs['similarity'] = 0 + missing_pairs["similarity"] = 0 return missing_pairs - def _get_nearest_matches(self, - ignore_index=False, - replace_na=False) -> Union[pd.DataFrame, pd.Series]: + def _get_nearest_matches( + self, ignore_index=False, replace_na=False + ) -> Union[pd.DataFrame, pd.Series]: prefix = MOST_SIMILAR_PREFIX - master_label = f'{prefix}{self._master.name if self._master.name else DEFAULT_MASTER_NAME}' + master_label = ( + f"{prefix}{self._master.name if self._master.name else DEFAULT_MASTER_NAME}" + ) master = self._master.rename(master_label).reset_index(drop=ignore_index) - dupes = self._duplicates.rename('duplicates').reset_index(drop=ignore_index) + dupes = self._duplicates.rename("duplicates").reset_index(drop=ignore_index) # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging if isinstance(dupes, pd.DataFrame): master.rename( - columns={col: f'{prefix}{col}' for col in master.columns if str(col) != master_label}, - inplace=True + columns={ + col: f"{prefix}{col}" + for col in master.columns + if str(col) != master_label + }, + inplace=True, ) if self._master_id is not None: - master_id_label = f'{prefix}{self._master_id.name if self._master_id.name else DEFAULT_MASTER_ID_NAME}' - master = pd.concat([master, self._master_id.rename(master_id_label).reset_index(drop=True)], axis=1) - dupes = pd.concat([dupes, self._duplicates_id.rename('duplicates_id').reset_index(drop=True)], axis=1) + master_id_label = f"{prefix}{self._master_id.name if self._master_id.name else DEFAULT_MASTER_ID_NAME}" + master = pd.concat( + [ + master, + self._master_id.rename(master_id_label).reset_index(drop=True), + ], + axis=1, + ) + dupes = pd.concat( + [ + dupes, + self._duplicates_id.rename("duplicates_id").reset_index(drop=True), + ], + axis=1, + ) - dupes_max_sim = self._matches_list.groupby('dupe_side').agg({'similarity': 'max'}).reset_index() - 
dupes_max_sim = dupes_max_sim.merge(self._matches_list, on=['dupe_side', 'similarity']) + dupes_max_sim = ( + self._matches_list.groupby("dupe_side") + .agg({"similarity": "max"}) + .reset_index() + ) + dupes_max_sim = dupes_max_sim.merge( + self._matches_list, on=["dupe_side", "similarity"] + ) # In case there are multiple equal similarities, we pick the one that comes first - dupes_max_sim = dupes_max_sim.groupby(['dupe_side']).agg({'master_side': 'min'}).reset_index() + dupes_max_sim = ( + dupes_max_sim.groupby(["dupe_side"]) + .agg({"master_side": "min"}) + .reset_index() + ) # First we add the duplicate strings - dupes_max_sim = dupes_max_sim.merge(dupes, left_on='dupe_side', right_index=True, how='outer') + dupes_max_sim = dupes_max_sim.merge( + dupes, left_on="dupe_side", right_index=True, how="outer" + ) # Now add the master strings - dupes_max_sim = dupes_max_sim.merge(master, left_on='master_side', right_index=True, how='left') + dupes_max_sim = dupes_max_sim.merge( + master, left_on="master_side", right_index=True, how="left" + ) # Update the master-series with the duplicates in cases were there is no match rows_to_update = dupes_max_sim[master_label].isnull() - dupes_max_sim.loc[rows_to_update, master_label] = dupes_max_sim[rows_to_update].duplicates + dupes_max_sim.loc[rows_to_update, master_label] = dupes_max_sim[ + rows_to_update + ].duplicates if self._master_id is not None: # Also update the master_id-series with the duplicates_id in cases were there is no match - dupes_max_sim.loc[rows_to_update, master_id_label] = dupes_max_sim[rows_to_update].duplicates_id + dupes_max_sim.loc[rows_to_update, master_id_label] = dupes_max_sim[ + rows_to_update + ].duplicates_id # For some weird reason, pandas' merge function changes int-datatype columns to float when NaN values # appear within them. 
So here we change them back to their original datatypes if possible: - if dupes_max_sim[master_id_label].dtype != self._master_id.dtype and \ - self._duplicates_id.dtype == self._master_id.dtype: - dupes_max_sim.loc[:, master_id_label] = \ - dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype) + if ( + dupes_max_sim[master_id_label].dtype != self._master_id.dtype + and self._duplicates_id.dtype == self._master_id.dtype + ): + dupes_max_sim.loc[:, master_id_label] = dupes_max_sim.loc[ + :, master_id_label + ].astype(self._master_id.dtype) # Prepare the output: - required_column_list = [master_label] if self._master_id is None else [master_id_label, master_label] - index_column_list = \ - [col for col in master.columns if col not in required_column_list] \ - if isinstance(master, pd.DataFrame) else [] + required_column_list = ( + [master_label] + if self._master_id is None + else [master_id_label, master_label] + ) + index_column_list = ( + [col for col in master.columns if col not in required_column_list] + if isinstance(master, pd.DataFrame) + else [] + ) if replace_na: # Update the master index-columns with the duplicates index-column values in cases were there is no match - dupes_index_columns = [col for col in dupes.columns if str(col) != 'duplicates'] - dupes_max_sim.loc[rows_to_update, index_column_list] = \ - dupes_max_sim.loc[rows_to_update, dupes_index_columns].values + dupes_index_columns = [ + col for col in dupes.columns if str(col) != "duplicates" + ] + dupes_max_sim.loc[rows_to_update, index_column_list] = dupes_max_sim.loc[ + rows_to_update, dupes_index_columns + ].values # Restore their original datatypes if possible: for m, d in zip(index_column_list, dupes_index_columns): - if dupes_max_sim[m].dtype != master[m].dtype and dupes[d].dtype == master[m].dtype: - dupes_max_sim.loc[:, m] = dupes_max_sim.loc[:, m].astype(master[m].dtype) + if ( + dupes_max_sim[m].dtype != master[m].dtype + and dupes[d].dtype == master[m].dtype + ): + 
dupes_max_sim.loc[:, m] = dupes_max_sim.loc[:, m].astype( + master[m].dtype + ) # Make sure to keep same order as duplicates - dupes_max_sim = dupes_max_sim.sort_values('dupe_side').set_index('dupe_side') + dupes_max_sim = dupes_max_sim.sort_values("dupe_side").set_index("dupe_side") output = dupes_max_sim[index_column_list + required_column_list] output.index = self._duplicates.index return output.squeeze(axis=1) @@ -564,13 +701,13 @@ def _deduplicate(self, ignore_index=False) -> Union[pd.DataFrame, pd.Series]: graph = csr_matrix( ( np.full(len(pairs), 1), - (pairs.master_side.to_numpy(), pairs.dupe_side.to_numpy()) + (pairs.master_side.to_numpy(), pairs.dupe_side.to_numpy()), ), - shape=(n, n) + shape=(n, n), ) # apply scipy.csgraph's clustering algorithm (result is a 1D numpy array of length n): _, groups = connected_components(csgraph=graph, directed=True) - group_of_master_index = pd.Series(groups, name='raw_group_id') + group_of_master_index = pd.Series(groups, name="raw_group_id") # merge groups with string indices to obtain two-column DataFrame: # note: the following line automatically creates a new column named 'index' with the corresponding indices: @@ -578,48 +715,73 @@ def _deduplicate(self, ignore_index=False) -> Union[pd.DataFrame, pd.Series]: # Determine weights for obtaining group representatives: # 1. option-setting group_rep='first': - group_of_master_index.rename(columns={'index': 'weight'}, inplace=True) - method = 'first' + group_of_master_index.rename(columns={"index": "weight"}, inplace=True) + method = "first" # 2. option-setting group_rep='centroid': if self._config.group_rep == GROUP_REP_CENTROID: # reuse the adjacency matrix built above (change the 1's to corresponding cosine similarities): - graph.data = pairs['similarity'].to_numpy() + graph.data = pairs["similarity"].to_numpy() # sum along the rows to obtain numpy 1D matrix of similarity aggregates then ... # ... 
convert to 1D numpy array (using asarray then squeeze) and then to Series: - group_of_master_index['weight'] = pd.Series(np.asarray(graph.sum(axis=1)).squeeze(axis=1)) - method = 'idxmax' + group_of_master_index["weight"] = pd.Series( + np.asarray(graph.sum(axis=1)).squeeze(axis=1) + ) + method = "idxmax" # Determine the group representatives AND merge with indices: # pandas groupby transform function and enlargement enable both respectively in one step: - group_of_master_index['group_rep'] = \ - group_of_master_index.groupby('raw_group_id', sort=False)['weight'].transform(method) + group_of_master_index["group_rep"] = group_of_master_index.groupby( + "raw_group_id", sort=False + )["weight"].transform(method) # Prepare the output: prefix = GROUP_REP_PREFIX - label = f'{prefix}{self._master.name}' if self._master.name else prefix[:-1] + label = f"{prefix}{self._master.name}" if self._master.name else prefix[:-1] # use group rep indexes obtained in the last step above to select the corresponding strings: - output = self._master.iloc[group_of_master_index.group_rep].rename(label).reset_index(drop=ignore_index) + output = ( + self._master.iloc[group_of_master_index.group_rep] + .rename(label) + .reset_index(drop=ignore_index) + ) if isinstance(output, pd.DataFrame): output.rename( - columns={col: f'{prefix}{col}' for col in output.columns if str(col) != label}, - inplace=True + columns={ + col: f"{prefix}{col}" for col in output.columns if str(col) != label + }, + inplace=True, ) if self._master_id is not None: - id_label = f'{prefix}{self._master_id.name if self._master_id.name else DEFAULT_ID_NAME}' + id_label = f"{prefix}{self._master_id.name if self._master_id.name else DEFAULT_ID_NAME}" # use group rep indexes obtained above to select the corresponding string IDs: - output_id = self._master_id.iloc[group_of_master_index.group_rep].rename(id_label).reset_index(drop=True) + output_id = ( + self._master_id.iloc[group_of_master_index.group_rep] + .rename(id_label) + 
.reset_index(drop=True) + ) output = pd.concat([output_id, output], axis=1) output.index = self._master.index return output - def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series, pd.Series]: + def _get_indices_of( + self, master_side: str, dupe_side: str + ) -> Tuple[pd.Series, pd.Series]: master_strings = self._master dupe_strings = self._master if self._duplicates is None else self._duplicates # Check if input is valid: - self._validate_strings_exist(master_side, dupe_side, master_strings, dupe_strings) + self._validate_strings_exist( + master_side, dupe_side, master_strings, dupe_strings + ) # Get the indices of the two strings - master_indices = master_strings[master_strings == master_side].index.to_series().reset_index(drop=True) - dupe_indices = dupe_strings[dupe_strings == dupe_side].index.to_series().reset_index(drop=True) + master_indices = ( + master_strings[master_strings == master_side] + .index.to_series() + .reset_index(drop=True) + ) + dupe_indices = ( + dupe_strings[dupe_strings == dupe_side] + .index.to_series() + .reset_index(drop=True) + ) return master_indices, dupe_indices def _validate_group_rep_specs(self): @@ -638,8 +800,13 @@ def _validate_tfidf_matrix_dtype(self): def _validate_replace_na_and_drop(self): if self._config.ignore_index and self._config.replace_na: - raise Exception("replace_na can only be set to True when ignore_index=False.") - if self._config.replace_na and self._master.index.nlevels != self._duplicates.index.nlevels: + raise Exception( + "replace_na can only be set to True when ignore_index=False." + ) + if ( + self._config.replace_na + and self._master.index.nlevels != self._duplicates.index.nlevels + ): raise Exception( "replace_na=True: Cannot replace NaN values of index-columns with the values of another " "index if the number of index-levels does not equal the number of index-columns." 
@@ -661,46 +828,65 @@ def _symmetrize_matrix(m_symmetric: lil_matrix) -> csr_matrix: def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: """Returns a list of all the indices of matches""" r, c = matches.nonzero() - matches_list = pd.DataFrame({'master_side': r.astype(np.int64), - 'dupe_side': c.astype(np.int64), - 'similarity': matches.data}) + matches_list = pd.DataFrame( + { + "master_side": r.astype(np.int64), + "dupe_side": c.astype(np.int64), + "similarity": matches.data, + } + ) return matches_list @staticmethod def _make_symmetric(new_matches: pd.DataFrame) -> pd.DataFrame: - columns_switched = pd.DataFrame({'master_side': new_matches.dupe_side, - 'dupe_side': new_matches.master_side, - 'similarity': new_matches.similarity}) + columns_switched = pd.DataFrame( + { + "master_side": new_matches.dupe_side, + "dupe_side": new_matches.master_side, + "similarity": new_matches.similarity, + } + ) return pd.concat([new_matches, columns_switched]) @staticmethod def _cross_join(dupe_indices, master_indices, similarities) -> pd.DataFrame: - x_join_index = pd.MultiIndex.from_product([master_indices, dupe_indices, similarities], - names=['master_side', 'dupe_side', 'similarity']) + x_join_index = pd.MultiIndex.from_product( + [master_indices, dupe_indices, similarities], + names=["master_side", "dupe_side", "similarity"], + ) x_joined_df = pd.DataFrame(index=x_join_index).reset_index() return x_joined_df @staticmethod def _validate_strings_exist(master_side, dupe_side, master_strings, dupe_strings): if not master_strings.isin([master_side]).any(): - raise ValueError(f'{master_side} not found in StringGrouper string series') + raise ValueError(f"{master_side} not found in StringGrouper string series") elif not dupe_strings.isin([dupe_side]).any(): - raise ValueError(f'{dupe_side} not found in StringGrouper dupe string series') + raise ValueError( + f"{dupe_side} not found in StringGrouper dupe string series" + ) @staticmethod def 
_is_series_of_strings(series_to_test: pd.Series) -> bool: if not isinstance(series_to_test, pd.Series): return False - elif series_to_test.to_frame().applymap( - lambda x: not isinstance(x, str) - ).squeeze(axis=1).any(): + elif ( + series_to_test.to_frame() + .map(lambda x: not isinstance(x, str)) + .squeeze(axis=1) + .any() + ): return False return True @staticmethod def _is_input_data_combination_valid(duplicates, master_id, duplicates_id) -> bool: - if duplicates is None and (duplicates_id is not None) \ - or duplicates is not None and ((master_id is None) ^ (duplicates_id is None)): + if ( + duplicates is None + and (duplicates_id is not None) + or duplicates is not None + and ((master_id is None) ^ (duplicates_id is None)) + ): return False else: return True @@ -708,6 +894,14 @@ def _is_input_data_combination_valid(duplicates, master_id, duplicates_id) -> bo @staticmethod def _validate_id_data(master, duplicates, master_id, duplicates_id): if master_id is not None and len(master) != len(master_id): - raise Exception('Both master and master_id must be pandas.Series of the same length.') - if duplicates is not None and duplicates_id is not None and len(duplicates) != len(duplicates_id): - raise Exception('Both duplicates and duplicates_id must be pandas.Series of the same length.') + raise Exception( + "Both master and master_id must be pandas.Series of the same length." + ) + if ( + duplicates is not None + and duplicates_id is not None + and len(duplicates) != len(duplicates_id) + ): + raise Exception( + "Both duplicates and duplicates_id must be pandas.Series of the same length." 
+ ) diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index f5f0aac..638fa84 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -1,12 +1,21 @@ import unittest import pandas as pd import numpy as np -from scipy.sparse.csr import csr_matrix -from string_grouper.string_grouper import DEFAULT_MIN_SIMILARITY, \ - DEFAULT_REGEX, DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \ - StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \ - match_most_similar, group_similar_strings, match_strings, \ - compute_pairwise_similarities +from scipy.sparse import csr_matrix +from string_grouper.string_grouper import ( + DEFAULT_MIN_SIMILARITY, + DEFAULT_REGEX, + DEFAULT_NGRAM_SIZE, + DEFAULT_N_PROCESSES, + DEFAULT_IGNORE_CASE, + StringGrouperConfig, + StringGrouper, + StringGrouperNotFitException, + match_most_similar, + group_similar_strings, + match_strings, + compute_pairwise_similarities, +) from unittest.mock import patch @@ -17,80 +26,171 @@ def mock_symmetrize_matrix(x: csr_matrix) -> csr_matrix: class SimpleExample(object): def __init__(self): self.customers_df = pd.DataFrame( - [ - ('BB016741P', 'Mega Enterprises Corporation', 'Address0', 'Tel0', 'Description0', 0.2), - ('CC082744L', 'Hyper Startup Incorporated', '', 'Tel1', '', 0.5), - ('AA098762D', 'Hyper Startup Inc.', 'Address2', 'Tel2', 'Description2', 0.3), - ('BB099931J', 'Hyper-Startup Inc.', 'Address3', 'Tel3', 'Description3', 0.1), - ('HH072982K', 'Hyper Hyper Inc.', 'Address4', '', 'Description4', 0.9), - ('EE059082Q', 'Mega Enterprises Corp.', 'Address5', 'Tel5', 'Description5', 1.0) - ], - columns=('Customer ID', 'Customer Name', 'Address', 'Tel', 'Description', 'weight') + [ + ( + "BB016741P", + "Mega Enterprises Corporation", + "Address0", + "Tel0", + "Description0", + 0.2, + ), + ("CC082744L", "Hyper Startup Incorporated", "", "Tel1", "", 0.5), + ( + "AA098762D", + "Hyper 
Startup Inc.", + "Address2", + "Tel2", + "Description2", + 0.3, + ), + ( + "BB099931J", + "Hyper-Startup Inc.", + "Address3", + "Tel3", + "Description3", + 0.1, + ), + ("HH072982K", "Hyper Hyper Inc.", "Address4", "", "Description4", 0.9), + ( + "EE059082Q", + "Mega Enterprises Corp.", + "Address5", + "Tel5", + "Description5", + 1.0, + ), + ], + columns=( + "Customer ID", + "Customer Name", + "Address", + "Tel", + "Description", + "weight", + ), ) self.customers_df2 = pd.DataFrame( - [ - ('BB016741P', 'Mega Enterprises Corporation', 'Address0', 'Tel0', 'Description0', 0.2), - ('CC082744L', 'Hyper Startup Incorporated', '', 'Tel1', '', 0.5), - ('AA098762D', 'Hyper Startup Inc.', 'Address2', 'Tel2', 'Description2', 0.3), - ('BB099931J', 'Hyper-Startup Inc.', 'Address3', 'Tel3', 'Description3', 0.1), - ('DD012339M', 'HyperStartup Inc.', 'Address4', 'Tel4', 'Description4', 0.1), - ('HH072982K', 'Hyper Hyper Inc.', 'Address5', '', 'Description5', 0.9), - ('EE059082Q', 'Mega Enterprises Corp.', 'Address6', 'Tel6', 'Description6', 1.0) - ], - columns=('Customer ID', 'Customer Name', 'Address', 'Tel', 'Description', 'weight') - ) - self.a_few_strings = pd.Series(['BB016741P', 'BB082744L', 'BB098762D', 'BB099931J', 'BB072982K', 'BB059082Q']) - self.one_string = pd.Series(['BB0']) - self.two_strings = pd.Series(['Hyper', 'Hyp']) - self.whatever_series_1 = pd.Series(['whatever']) + [ + ( + "BB016741P", + "Mega Enterprises Corporation", + "Address0", + "Tel0", + "Description0", + 0.2, + ), + ("CC082744L", "Hyper Startup Incorporated", "", "Tel1", "", 0.5), + ( + "AA098762D", + "Hyper Startup Inc.", + "Address2", + "Tel2", + "Description2", + 0.3, + ), + ( + "BB099931J", + "Hyper-Startup Inc.", + "Address3", + "Tel3", + "Description3", + 0.1, + ), + ( + "DD012339M", + "HyperStartup Inc.", + "Address4", + "Tel4", + "Description4", + 0.1, + ), + ("HH072982K", "Hyper Hyper Inc.", "Address5", "", "Description5", 0.9), + ( + "EE059082Q", + "Mega Enterprises Corp.", + "Address6", + 
"Tel6", + "Description6", + 1.0, + ), + ], + columns=( + "Customer ID", + "Customer Name", + "Address", + "Tel", + "Description", + "weight", + ), + ) + self.a_few_strings = pd.Series( + [ + "BB016741P", + "BB082744L", + "BB098762D", + "BB099931J", + "BB072982K", + "BB059082Q", + ] + ) + self.one_string = pd.Series(["BB0"]) + self.two_strings = pd.Series(["Hyper", "Hyp"]) + self.whatever_series_1 = pd.Series(["whatever"]) self.expected_result_with_zeroes = pd.DataFrame( [ - (1, 'Hyper Startup Incorporated', 0.08170638, 'whatever', 0), - (0, 'Mega Enterprises Corporation', 0., 'whatever', 0), - (2, 'Hyper Startup Inc.', 0., 'whatever', 0), - (3, 'Hyper-Startup Inc.', 0., 'whatever', 0), - (4, 'Hyper Hyper Inc.', 0., 'whatever', 0), - (5, 'Mega Enterprises Corp.', 0., 'whatever', 0) + (1, "Hyper Startup Incorporated", 0.08170638, "whatever", 0), + (0, "Mega Enterprises Corporation", 0.0, "whatever", 0), + (2, "Hyper Startup Inc.", 0.0, "whatever", 0), + (3, "Hyper-Startup Inc.", 0.0, "whatever", 0), + (4, "Hyper Hyper Inc.", 0.0, "whatever", 0), + (5, "Mega Enterprises Corp.", 0.0, "whatever", 0), + ], + columns=[ + "left_index", + "left_Customer Name", + "similarity", + "right_side", + "right_index", ], - columns=['left_index', 'left_Customer Name', 'similarity', 'right_side', 'right_index'] ) self.expected_result_centroid = pd.Series( [ - 'Mega Enterprises Corporation', - 'Hyper Startup Inc.', - 'Hyper Startup Inc.', - 'Hyper Startup Inc.', - 'Hyper Hyper Inc.', - 'Mega Enterprises Corporation' + "Mega Enterprises Corporation", + "Hyper Startup Inc.", + "Hyper Startup Inc.", + "Hyper Startup Inc.", + "Hyper Hyper Inc.", + "Mega Enterprises Corporation", ], - name='group_rep_Customer Name' + name="group_rep_Customer Name", ) self.expected_result_centroid_with_index_col = pd.DataFrame( [ - (0, 'Mega Enterprises Corporation'), - (2, 'Hyper Startup Inc.'), - (2, 'Hyper Startup Inc.'), - (2, 'Hyper Startup Inc.'), - (4, 'Hyper Hyper Inc.'), - (0, 'Mega Enterprises 
Corporation') + (0, "Mega Enterprises Corporation"), + (2, "Hyper Startup Inc."), + (2, "Hyper Startup Inc."), + (2, "Hyper Startup Inc."), + (4, "Hyper Hyper Inc."), + (0, "Mega Enterprises Corporation"), ], - columns=['group_rep_index', 'group_rep_Customer Name'] + columns=["group_rep_index", "group_rep_Customer Name"], ) self.expected_result_first = pd.Series( [ - 'Mega Enterprises Corporation', - 'Hyper Startup Incorporated', - 'Hyper Startup Incorporated', - 'Hyper Startup Incorporated', - 'Hyper Hyper Inc.', - 'Mega Enterprises Corporation' + "Mega Enterprises Corporation", + "Hyper Startup Incorporated", + "Hyper Startup Incorporated", + "Hyper Startup Incorporated", + "Hyper Hyper Inc.", + "Mega Enterprises Corporation", ], - name='group_rep_Customer Name' + name="group_rep_Customer Name", ) class StringGrouperConfigTest(unittest.TestCase): - def test_config_defaults(self): """Empty initialisation should set default values""" config = StringGrouperConfig() @@ -109,7 +209,9 @@ def test_config_immutable(self): def test_config_non_default_values(self): """Configurations should be immutable""" - config = StringGrouperConfig(min_similarity=0.1, max_n_matches=100, number_of_processes=1) + config = StringGrouperConfig( + min_similarity=0.1, max_n_matches=100, number_of_processes=1 + ) self.assertEqual(0.1, config.min_similarity) self.assertEqual(100, config.max_n_matches) self.assertEqual(1, config.number_of_processes) @@ -119,7 +221,7 @@ class StringGrouperTest(unittest.TestCase): def test_compute_pairwise_similarities(self): """tests the high-level function compute_pairwise_similarities""" simple_example = SimpleExample() - df1 = simple_example.customers_df['Customer Name'] + df1 = simple_example.customers_df["Customer Name"] df2 = simple_example.expected_result_centroid similarities = compute_pairwise_similarities(df1, df2) expected_result = pd.Series( @@ -129,68 +231,65 @@ def test_compute_pairwise_similarities(self): 1.0000000000000004, 1.0000000000000004, 
1.0, - 0.826462625999832 + 0.826462625999832, ], - name='similarity' + name="similarity", ) - expected_result = expected_result.astype(np.float32) + expected_result = expected_result.astype(np.float64) pd.testing.assert_series_equal(expected_result, similarities) def test_compute_pairwise_similarities_data_integrity(self): """tests that an exception is raised whenever the lengths of the two input series of the high-level function compute_pairwise_similarities are unequal""" simple_example = SimpleExample() - df1 = simple_example.customers_df['Customer Name'] + df1 = simple_example.customers_df["Customer Name"] df2 = simple_example.expected_result_centroid with self.assertRaises(Exception): _ = compute_pairwise_similarities(df1, df2[:-2]) - @patch('string_grouper.string_grouper.StringGrouper') + @patch("string_grouper.string_grouper.StringGrouper") def test_group_similar_strings(self, mock_StringGouper): """mocks StringGrouper to test if the high-level function group_similar_strings utilizes it as expected""" mock_StringGrouper_instance = mock_StringGouper.return_value mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance - mock_StringGrouper_instance.get_groups.return_value = 'whatever' + mock_StringGrouper_instance.get_groups.return_value = "whatever" test_series_1 = None test_series_id_1 = None - df = group_similar_strings( - test_series_1, - string_ids=test_series_id_1 - ) + df = group_similar_strings(test_series_1, string_ids=test_series_id_1) mock_StringGrouper_instance.fit.assert_called_once() mock_StringGrouper_instance.get_groups.assert_called_once() - self.assertEqual(df, 'whatever') + self.assertEqual(df, "whatever") - @patch('string_grouper.string_grouper.StringGrouper') + @patch("string_grouper.string_grouper.StringGrouper") def test_match_most_similar(self, mock_StringGouper): """mocks StringGrouper to test if the high-level function match_most_similar utilizes it as expected""" mock_StringGrouper_instance = 
mock_StringGouper.return_value mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance - mock_StringGrouper_instance.get_groups.return_value = 'whatever' + mock_StringGrouper_instance.get_groups.return_value = "whatever" test_series_1 = None test_series_2 = None test_series_id_1 = None test_series_id_2 = None df = match_most_similar( - test_series_1, - test_series_2, - master_id=test_series_id_1, - duplicates_id=test_series_id_2 - ) + test_series_1, + test_series_2, + master_id=test_series_id_1, + duplicates_id=test_series_id_2, + ) mock_StringGrouper_instance.fit.assert_called_once() mock_StringGrouper_instance.get_groups.assert_called_once() - self.assertEqual(df, 'whatever') + self.assertEqual(df, "whatever") - @patch('string_grouper.string_grouper.StringGrouper') + @patch("string_grouper.string_grouper.StringGrouper") def test_match_strings(self, mock_StringGouper): """mocks StringGrouper to test if the high-level function match_strings utilizes it as expected""" mock_StringGrouper_instance = mock_StringGouper.return_value mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance - mock_StringGrouper_instance.get_matches.return_value = 'whatever' + mock_StringGrouper_instance.get_matches.return_value = "whatever" test_series_1 = None test_series_id_1 = None @@ -198,63 +297,86 @@ def test_match_strings(self, mock_StringGouper): mock_StringGrouper_instance.fit.assert_called_once() mock_StringGrouper_instance.get_matches.assert_called_once() - self.assertEqual(df, 'whatever') + self.assertEqual(df, "whatever") @patch( - 'string_grouper.string_grouper.StringGrouper._symmetrize_matrix', - side_effect=mock_symmetrize_matrix + "string_grouper.string_grouper.StringGrouper._symmetrize_matrix", + side_effect=mock_symmetrize_matrix, ) - def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix_param): + def test_match_list_symmetry_without_symmetrize_function( + self, mock_symmetrize_matrix_param + ): 
"""mocks StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is - **partially** symmetric which often occurs when the kwarg max_n_matches is too small""" + **partially** symmetric which often occurs when the kwarg max_n_matches is too small + """ simple_example = SimpleExample() - df = simple_example.customers_df2['Customer Name'] + df = simple_example.customers_df2["Customer Name"] sg = StringGrouper(df, max_n_matches=2).fit() mock_symmetrize_matrix_param.assert_called_once() # obtain the upper and lower triangular parts of the matrix of matches: - upper = sg._matches_list[sg._matches_list['master_side'] < sg._matches_list['dupe_side']] - lower = sg._matches_list[sg._matches_list['master_side'] > sg._matches_list['dupe_side']] + upper = sg._matches_list[ + sg._matches_list["master_side"] < sg._matches_list["dupe_side"] + ] + lower = sg._matches_list[ + sg._matches_list["master_side"] > sg._matches_list["dupe_side"] + ] # switch the column names of lower triangular part (i.e., transpose) to convert it to upper triangular: - upper_prime = lower.rename(columns={'master_side': 'dupe_side', 'dupe_side': 'master_side'}) + upper_prime = lower.rename( + columns={"master_side": "dupe_side", "dupe_side": "master_side"} + ) # obtain the intersection between upper and upper_prime: - intersection = upper_prime.merge(upper, how='inner', on=['master_side', 'dupe_side']) + intersection = upper_prime.merge( + upper, how="inner", on=["master_side", "dupe_side"] + ) # if the intersection is empty then _matches_list is completely non-symmetric (this is acceptable) # if the intersection is not empty then at least some matches are repeated. # To make sure all (and not just some) matches are repeated, the lengths of # upper, upper_prime and their intersection should be identical. 
- self.assertFalse(intersection.empty or len(upper) == len(upper_prime) == len(intersection)) + self.assertFalse( + intersection.empty or len(upper) == len(upper_prime) == len(intersection) + ) def test_match_list_symmetry_with_symmetrize_function(self): """This test ensures that _matches_list is symmetric""" simple_example = SimpleExample() - df = simple_example.customers_df2['Customer Name'] + df = simple_example.customers_df2["Customer Name"] sg = StringGrouper(df, max_n_matches=2).fit() # Obtain the upper and lower triangular parts of the matrix of matches: - upper = sg._matches_list[sg._matches_list['master_side'] < sg._matches_list['dupe_side']] - lower = sg._matches_list[sg._matches_list['master_side'] > sg._matches_list['dupe_side']] + upper = sg._matches_list[ + sg._matches_list["master_side"] < sg._matches_list["dupe_side"] + ] + lower = sg._matches_list[ + sg._matches_list["master_side"] > sg._matches_list["dupe_side"] + ] # Switch the column names of the lower triangular part (i.e., transpose) to convert it to upper triangular: - upper_prime = lower.rename(columns={'master_side': 'dupe_side', 'dupe_side': 'master_side'}) + upper_prime = lower.rename( + columns={"master_side": "dupe_side", "dupe_side": "master_side"} + ) # Obtain the intersection between upper and upper_prime: - intersection = upper_prime.merge(upper, how='inner', on=['master_side', 'dupe_side']) + intersection = upper_prime.merge( + upper, how="inner", on=["master_side", "dupe_side"] + ) # If the intersection is empty this means _matches_list is completely non-symmetric (this is acceptable) # If the intersection is not empty this means at least some matches are repeated. # To make sure all (and not just some) matches are repeated, the lengths of # upper, upper_prime and their intersection should be identical. 
- self.assertTrue(intersection.empty or len(upper) == len(upper_prime) == len(intersection)) + self.assertTrue( + intersection.empty or len(upper) == len(upper_prime) == len(intersection) + ) @patch( - 'string_grouper.string_grouper.StringGrouper._fix_diagonal', - side_effect=mock_symmetrize_matrix + "string_grouper.string_grouper.StringGrouper._fix_diagonal", + side_effect=mock_symmetrize_matrix, ) def test_match_list_diagonal_without_the_fix(self, mock_fix_diagonal): """test fails whenever _matches_list's number of self-joins is not equal to the number of strings""" # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets; # for small datasets setting max_n_matches=1 reproduces the bug simple_example = SimpleExample() - df = simple_example.customers_df['Customer Name'] + df = simple_example.customers_df["Customer Name"] matches = match_strings(df, max_n_matches=1) mock_fix_diagonal.assert_called_once() - num_self_joins = len(matches[matches['left_index'] == matches['right_index']]) + num_self_joins = len(matches[matches["left_index"] == matches["right_index"]]) num_strings = len(df) self.assertNotEqual(num_self_joins, num_strings) @@ -263,26 +385,29 @@ def test_match_list_diagonal(self): # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets; # for small datasets setting max_n_matches=1 reproduces the bug simple_example = SimpleExample() - df = simple_example.customers_df['Customer Name'] + df = simple_example.customers_df["Customer Name"] matches = match_strings(df, max_n_matches=1) - num_self_joins = len(matches[matches['left_index'] == matches['right_index']]) + num_self_joins = len(matches[matches["left_index"] == matches["right_index"]]) num_strings = len(df) self.assertEqual(num_self_joins, num_strings) def test_zero_min_similarity(self): """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are - returned when min_similarity <= 
0. A bug related to this was first pointed out by @nbcvijanovic""" + returned when min_similarity <= 0. A bug related to this was first pointed out by @nbcvijanovic + """ simple_example = SimpleExample() - s_master = simple_example.customers_df['Customer Name'] + s_master = simple_example.customers_df["Customer Name"] s_dup = simple_example.whatever_series_1 matches = match_strings(s_master, s_dup, min_similarity=0) - pd.testing.assert_frame_equal(simple_example.expected_result_with_zeroes, matches) + pd.testing.assert_frame_equal( + simple_example.expected_result_with_zeroes, matches + ) def test_zero_min_similarity_small_max_n_matches(self): """This test ensures that a warning is issued when n_max_matches is suspected to be too small while min_similarity <= 0 and include_zeroes is True""" simple_example = SimpleExample() - s_master = simple_example.customers_df['Customer Name'] + s_master = simple_example.customers_df["Customer Name"] s_dup = simple_example.two_strings with self.assertRaises(Exception): _ = match_strings(s_master, s_dup, max_n_matches=1, min_similarity=0) @@ -292,194 +417,261 @@ def test_get_non_matches_empty_case(self): simple_example = SimpleExample() s_master = simple_example.a_few_strings s_dup = simple_example.one_string - sg = StringGrouper(s_master, s_dup, max_n_matches=len(s_master), min_similarity=0).fit() + sg = StringGrouper( + s_master, s_dup, max_n_matches=len(s_master), min_similarity=0 + ).fit() self.assertTrue(sg._get_non_matches_list().empty) def test_n_grams_case_unchanged(self): """Should return all ngrams in a string with case""" - test_series = pd.Series(pd.Series(['aa'])) + test_series = pd.Series(pd.Series(["aa"])) # Explicit do not ignore case sg = StringGrouper(test_series, ignore_case=False) - expected_result = ['McD', 'cDo', 'Don', 'ona', 'nal', 'ald', 'lds'] - self.assertListEqual(expected_result, sg.n_grams('McDonalds')) + expected_result = ["McD", "cDo", "Don", "ona", "nal", "ald", "lds"] + 
self.assertListEqual(expected_result, sg.n_grams("McDonalds")) def test_n_grams_ignore_case_to_lower(self): """Should return all case insensitive ngrams in a string""" - test_series = pd.Series(pd.Series(['aa'])) + test_series = pd.Series(pd.Series(["aa"])) # Explicit ignore case sg = StringGrouper(test_series, ignore_case=True) - expected_result = ['mcd', 'cdo', 'don', 'ona', 'nal', 'ald', 'lds'] - self.assertListEqual(expected_result, sg.n_grams('McDonalds')) + expected_result = ["mcd", "cdo", "don", "ona", "nal", "ald", "lds"] + self.assertListEqual(expected_result, sg.n_grams("McDonalds")) def test_n_grams_ignore_case_to_lower_with_defaults(self): """Should return all case insensitive ngrams in a string""" - test_series = pd.Series(pd.Series(['aa'])) + test_series = pd.Series(pd.Series(["aa"])) # Implicit default case (i.e. default behaviour) sg = StringGrouper(test_series) - expected_result = ['mcd', 'cdo', 'don', 'ona', 'nal', 'ald', 'lds'] - self.assertListEqual(expected_result, sg.n_grams('McDonalds')) + expected_result = ["mcd", "cdo", "don", "ona", "nal", "ald", "lds"] + self.assertListEqual(expected_result, sg.n_grams("McDonalds")) def test_build_matrix(self): """Should create a csr matrix only master""" - test_series = pd.Series(['foo', 'bar', 'baz']) + test_series = pd.Series(["foo", "bar", "baz"]) sg = StringGrouper(test_series) master, dupe = sg._get_tf_idf_matrices() - c = csr_matrix([[0., 0., 1.], - [1., 0., 0.], - [0., 1., 0.]]) + c = csr_matrix([[0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) np.testing.assert_array_equal(c.toarray(), master.toarray()) np.testing.assert_array_equal(c.toarray(), dupe.toarray()) def test_build_matrix_master_and_duplicates(self): """Should create a csr matrix for master and duplicates""" - test_series_1 = pd.Series(['foo', 'bar', 'baz']) - test_series_2 = pd.Series(['foo', 'bar', 'bop']) + test_series_1 = pd.Series(["foo", "bar", "baz"]) + test_series_2 = pd.Series(["foo", "bar", "bop"]) sg = 
StringGrouper(test_series_1, test_series_2) master, dupe = sg._get_tf_idf_matrices() - master_expected = csr_matrix([[0., 0., 0., 1.], - [1., 0., 0., 0.], - [0., 1., 0., 0.]]) - dupes_expected = csr_matrix([[0., 0., 0., 1.], - [1., 0., 0., 0.], - [0., 0., 1., 0.]]) + master_expected = csr_matrix( + [[0.0, 0.0, 0.0, 1.0], [1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]] + ) + dupes_expected = csr_matrix( + [[0.0, 0.0, 0.0, 1.0], [1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]] + ) np.testing.assert_array_equal(master_expected.toarray(), master.toarray()) np.testing.assert_array_equal(dupes_expected.toarray(), dupe.toarray()) def test_build_matches(self): """Should create the cosine similarity matrix of two series""" - test_series_1 = pd.Series(['foo', 'bar', 'baz']) - test_series_2 = pd.Series(['foo', 'bar', 'bop']) + test_series_1 = pd.Series(["foo", "bar", "baz"]) + test_series_2 = pd.Series(["foo", "bar", "bop"]) sg = StringGrouper(test_series_1, test_series_2) master, dupe = sg._get_tf_idf_matrices() - expected_matches = np.array([[1., 0., 0.], - [0., 1., 0.], - [0., 0., 0.]]) - np.testing.assert_array_equal(expected_matches, sg._build_matches(master, dupe)[0].toarray()) + expected_matches = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]]) + np.testing.assert_array_equal( + expected_matches, sg._build_matches(master, dupe)[0].toarray() + ) def test_build_matches_list(self): """Should create the cosine similarity matrix of two series""" - test_series_1 = pd.Series(['foo', 'bar', 'baz']) - test_series_2 = pd.Series(['foo', 'bar', 'bop']) + test_series_1 = pd.Series(["foo", "bar", "baz"]) + test_series_2 = pd.Series(["foo", "bar", "bop"]) sg = StringGrouper(test_series_1, test_series_2) sg = sg.fit() master = [0, 1] dupe_side = [0, 1] similarity = [1.0, 1.0] - expected_df = pd.DataFrame({'master_side': master, 'dupe_side': dupe_side, 'similarity': similarity}) - expected_df.loc[:, 'similarity'] = expected_df.loc[:, 
'similarity'].astype(sg._config.tfidf_matrix_dtype) + expected_df = pd.DataFrame( + {"master_side": master, "dupe_side": dupe_side, "similarity": similarity} + ) + expected_df.loc[:, "similarity"] = expected_df.loc[:, "similarity"].astype( + sg._config.tfidf_matrix_dtype + ) pd.testing.assert_frame_equal(expected_df, sg._matches_list) def test_case_insensitive_build_matches_list(self): """Should create the cosine similarity matrix of two case insensitive series""" - test_series_1 = pd.Series(['foo', 'BAR', 'baz']) - test_series_2 = pd.Series(['FOO', 'bar', 'bop']) + test_series_1 = pd.Series(["foo", "BAR", "baz"]) + test_series_2 = pd.Series(["FOO", "bar", "bop"]) sg = StringGrouper(test_series_1, test_series_2) sg = sg.fit() master = [0, 1] dupe_side = [0, 1] similarity = [1.0, 1.0] - expected_df = pd.DataFrame({'master_side': master, 'dupe_side': dupe_side, 'similarity': similarity}) - expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) + expected_df = pd.DataFrame( + {"master_side": master, "dupe_side": dupe_side, "similarity": similarity} + ) + expected_df.loc[:, "similarity"] = expected_df.loc[:, "similarity"].astype( + sg._config.tfidf_matrix_dtype + ) pd.testing.assert_frame_equal(expected_df, sg._matches_list) def test_get_matches_two_dataframes(self): - test_series_1 = pd.Series(['foo', 'bar', 'baz']) - test_series_2 = pd.Series(['foo', 'bar', 'bop']) + test_series_1 = pd.Series(["foo", "bar", "baz"]) + test_series_2 = pd.Series(["foo", "bar", "bop"]) sg = StringGrouper(test_series_1, test_series_2).fit() - left_side = ['foo', 'bar'] + left_side = ["foo", "bar"] left_index = [0, 1] - right_side = ['foo', 'bar'] + right_side = ["foo", "bar"] right_index = [0, 1] similarity = [1.0, 1.0] - expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, - 'similarity': similarity, - 'right_side': right_side, 'right_index': right_index}) - expected_df.loc[:, 'similarity'] = expected_df.loc[:, 
'similarity'].astype(sg._config.tfidf_matrix_dtype) + expected_df = pd.DataFrame( + { + "left_index": left_index, + "left_side": left_side, + "similarity": similarity, + "right_side": right_side, + "right_index": right_index, + } + ) + expected_df.loc[:, "similarity"] = expected_df.loc[:, "similarity"].astype( + sg._config.tfidf_matrix_dtype + ) pd.testing.assert_frame_equal(expected_df, sg.get_matches()) def test_get_matches_single(self): - test_series_1 = pd.Series(['foo', 'bar', 'baz', 'foo']) + test_series_1 = pd.Series(["foo", "bar", "baz", "foo"]) sg = StringGrouper(test_series_1) sg = sg.fit() - left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] - right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] + left_side = ["foo", "foo", "bar", "baz", "foo", "foo"] + right_side = ["foo", "foo", "bar", "baz", "foo", "foo"] left_index = [0, 0, 1, 2, 3, 3] right_index = [0, 3, 1, 2, 0, 3] similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] - expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, - 'similarity': similarity, - 'right_side': right_side, 'right_index': right_index}) - expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) + expected_df = pd.DataFrame( + { + "left_index": left_index, + "left_side": left_side, + "similarity": similarity, + "right_side": right_side, + "right_index": right_index, + } + ) + expected_df.loc[:, "similarity"] = expected_df.loc[:, "similarity"].astype( + sg._config.tfidf_matrix_dtype + ) pd.testing.assert_frame_equal(expected_df, sg.get_matches()) def test_get_matches_1_series_1_id_series(self): - test_series_1 = pd.Series(['foo', 'bar', 'baz', 'foo']) - test_series_id_1 = pd.Series(['A0', 'A1', 'A2', 'A3']) + test_series_1 = pd.Series(["foo", "bar", "baz", "foo"]) + test_series_id_1 = pd.Series(["A0", "A1", "A2", "A3"]) sg = StringGrouper(test_series_1, master_id=test_series_id_1) sg = sg.fit() - left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] - 
left_side_id = ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'] + left_side = ["foo", "foo", "bar", "baz", "foo", "foo"] + left_side_id = ["A0", "A0", "A1", "A2", "A3", "A3"] left_index = [0, 0, 1, 2, 3, 3] - right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] - right_side_id = ['A0', 'A3', 'A1', 'A2', 'A0', 'A3'] + right_side = ["foo", "foo", "bar", "baz", "foo", "foo"] + right_side_id = ["A0", "A3", "A1", "A2", "A0", "A3"] right_index = [0, 3, 1, 2, 0, 3] similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] - expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id, - 'similarity': similarity, - 'right_id': right_side_id, 'right_side': right_side, 'right_index': right_index}) - expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) + expected_df = pd.DataFrame( + { + "left_index": left_index, + "left_side": left_side, + "left_id": left_side_id, + "similarity": similarity, + "right_id": right_side_id, + "right_side": right_side, + "right_index": right_index, + } + ) + expected_df.loc[:, "similarity"] = expected_df.loc[:, "similarity"].astype( + sg._config.tfidf_matrix_dtype + ) pd.testing.assert_frame_equal(expected_df, sg.get_matches()) def test_get_matches_2_series_2_id_series(self): - test_series_1 = pd.Series(['foo', 'bar', 'baz']) - test_series_id_1 = pd.Series(['A0', 'A1', 'A2']) - test_series_2 = pd.Series(['foo', 'bar', 'bop']) - test_series_id_2 = pd.Series(['B0', 'B1', 'B2']) - sg = StringGrouper(test_series_1, test_series_2, duplicates_id=test_series_id_2, - master_id=test_series_id_1).fit() - left_side = ['foo', 'bar'] - left_side_id = ['A0', 'A1'] + test_series_1 = pd.Series(["foo", "bar", "baz"]) + test_series_id_1 = pd.Series(["A0", "A1", "A2"]) + test_series_2 = pd.Series(["foo", "bar", "bop"]) + test_series_id_2 = pd.Series(["B0", "B1", "B2"]) + sg = StringGrouper( + test_series_1, + test_series_2, + duplicates_id=test_series_id_2, + master_id=test_series_id_1, + 
).fit() + left_side = ["foo", "bar"] + left_side_id = ["A0", "A1"] left_index = [0, 1] - right_side = ['foo', 'bar'] - right_side_id = ['B0', 'B1'] + right_side = ["foo", "bar"] + right_side_id = ["B0", "B1"] right_index = [0, 1] similarity = [1.0, 1.0] - expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id, - 'similarity': similarity, - 'right_id': right_side_id, 'right_side': right_side, 'right_index': right_index}) - expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) + expected_df = pd.DataFrame( + { + "left_index": left_index, + "left_side": left_side, + "left_id": left_side_id, + "similarity": similarity, + "right_id": right_side_id, + "right_side": right_side, + "right_index": right_index, + } + ) + expected_df.loc[:, "similarity"] = expected_df.loc[:, "similarity"].astype( + sg._config.tfidf_matrix_dtype + ) pd.testing.assert_frame_equal(expected_df, sg.get_matches()) def test_get_matches_raises_exception_if_unexpected_options_given(self): # When the input id data does not correspond with its string data: - test_series_1 = pd.Series(['foo', 'bar', 'baz']) - bad_test_series_id_1 = pd.Series(['A0', 'A1']) - good_test_series_id_1 = pd.Series(['A0', 'A1', 'A2']) - test_series_2 = pd.Series(['foo', 'bar', 'bop']) - bad_test_series_id_2 = pd.Series(['B0', 'B1']) - good_test_series_id_2 = pd.Series(['B0', 'B1', 'B2']) + test_series_1 = pd.Series(["foo", "bar", "baz"]) + bad_test_series_id_1 = pd.Series(["A0", "A1"]) + good_test_series_id_1 = pd.Series(["A0", "A1", "A2"]) + test_series_2 = pd.Series(["foo", "bar", "bop"]) + bad_test_series_id_2 = pd.Series(["B0", "B1"]) + good_test_series_id_2 = pd.Series(["B0", "B1", "B2"]) with self.assertRaises(Exception): _ = StringGrouper(test_series_1, master_id=bad_test_series_id_1) with self.assertRaises(Exception): - _ = StringGrouper(test_series_1, duplicates=test_series_2, duplicates_id=bad_test_series_id_2, - 
master_id=good_test_series_id_1) + _ = StringGrouper( + test_series_1, + duplicates=test_series_2, + duplicates_id=bad_test_series_id_2, + master_id=good_test_series_id_1, + ) # When the input data is ok but the option combinations are invalid: with self.assertRaises(Exception): - _ = StringGrouper(test_series_1, test_series_2, master_id=good_test_series_id_1) + _ = StringGrouper( + test_series_1, test_series_2, master_id=good_test_series_id_1 + ) with self.assertRaises(Exception): - _ = StringGrouper(test_series_1, test_series_2, duplicates_id=good_test_series_id_2) + _ = StringGrouper( + test_series_1, test_series_2, duplicates_id=good_test_series_id_2 + ) with self.assertRaises(Exception): _ = StringGrouper(test_series_1, duplicates_id=good_test_series_id_2) with self.assertRaises(Exception): - _ = StringGrouper(test_series_1, master_id=good_test_series_id_1, duplicates_id=good_test_series_id_2) + _ = StringGrouper( + test_series_1, + master_id=good_test_series_id_1, + duplicates_id=good_test_series_id_2, + ) with self.assertRaises(Exception): - _ = StringGrouper(test_series_1, master_id=good_test_series_id_1, ignore_index=True, replace_na=True) + _ = StringGrouper( + test_series_1, + master_id=good_test_series_id_1, + ignore_index=True, + replace_na=True, + ) # Here we force an exception by making the number of index-levels of duplicates different from master: # and setting replace_na=True - test_series_2.index = pd.MultiIndex.from_tuples(list(zip(list('ABC'), [0, 1, 2]))) + test_series_2.index = pd.MultiIndex.from_tuples( + list(zip(list("ABC"), [0, 1, 2])) + ) with self.assertRaises(Exception): _ = StringGrouper(test_series_1, duplicates=test_series_2, replace_na=True) @@ -491,10 +683,8 @@ def test_get_groups_single_df_group_rep_default(self): pd.testing.assert_series_equal( simple_example.expected_result_centroid, group_similar_strings( - customers_df['Customer Name'], - min_similarity=0.6, - ignore_index=True - ) + customers_df["Customer Name"], 
min_similarity=0.6, ignore_index=True + ), ) def test_get_groups_single_valued_series(self): @@ -502,36 +692,31 @@ def test_get_groups_single_valued_series(self): since the input-series is also single-valued. This test was created in response to a bug discovered by George Walker""" pd.testing.assert_frame_equal( - pd.DataFrame([(0, "hello")], columns=['group_rep_index', 'group_rep']), - group_similar_strings( - pd.Series(["hello"]), - min_similarity=0.6 - ) + pd.DataFrame([(0, "hello")], columns=["group_rep_index", "group_rep"]), + group_similar_strings(pd.Series(["hello"]), min_similarity=0.6), ) pd.testing.assert_series_equal( - pd.Series(["hello"], name='group_rep'), + pd.Series(["hello"], name="group_rep"), group_similar_strings( - pd.Series(["hello"]), - min_similarity=0.6, - ignore_index=True - ) + pd.Series(["hello"]), min_similarity=0.6, ignore_index=True + ), ) pd.testing.assert_frame_equal( - pd.DataFrame([(0, "hello")], columns=['most_similar_index', 'most_similar_master']), + pd.DataFrame( + [(0, "hello")], columns=["most_similar_index", "most_similar_master"] + ), match_most_similar( - pd.Series(["hello"]), - pd.Series(["hello"]), - min_similarity=0.6 - ) + pd.Series(["hello"]), pd.Series(["hello"]), min_similarity=0.6 + ), ) pd.testing.assert_series_equal( - pd.Series(["hello"], name='most_similar_master'), + pd.Series(["hello"], name="most_similar_master"), match_most_similar( pd.Series(["hello"]), pd.Series(["hello"]), min_similarity=0.6, - ignore_index=True - ) + ignore_index=True, + ), ) def test_get_groups_single_df_keep_index(self): @@ -542,10 +727,8 @@ def test_get_groups_single_df_keep_index(self): pd.testing.assert_frame_equal( simple_example.expected_result_centroid_with_index_col, group_similar_strings( - customers_df['Customer Name'], - min_similarity=0.6, - ignore_index=False - ) + customers_df["Customer Name"], min_similarity=0.6, ignore_index=False + ), ) def test_get_groups_single_df_group_rep_centroid(self): @@ -556,11 +739,11 @@ def 
test_get_groups_single_df_group_rep_centroid(self): pd.testing.assert_series_equal( simple_example.expected_result_first, group_similar_strings( - customers_df['Customer Name'], - group_rep='first', + customers_df["Customer Name"], + group_rep="first", min_similarity=0.6, - ignore_index=True - ) + ignore_index=True, + ), ) def test_get_groups_single_df_group_rep_bad_option_value(self): @@ -569,259 +752,307 @@ def test_get_groups_single_df_group_rep_bad_option_value(self): customers_df = simple_example.customers_df with self.assertRaises(Exception): _ = group_similar_strings( - customers_df['Customer Name'], - group_rep='nonsense', - min_similarity=0.6 - ) + customers_df["Customer Name"], group_rep="nonsense", min_similarity=0.6 + ) def test_get_groups_single_df(self): """Should return a pd.Series object with the same length as the original df. The series object will contain a list of the grouped strings""" - test_series_1 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) + test_series_1 = pd.Series(["foooo", "bar", "baz", "foooob"]) sg = StringGrouper(test_series_1, ignore_index=True) sg = sg.fit() result = sg.get_groups() - expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'], name='group_rep') + expected_result = pd.Series(["foooo", "bar", "baz", "foooo"], name="group_rep") pd.testing.assert_series_equal(expected_result, result) def test_get_groups_1_string_series_1_id_series(self): """Should return a pd.DataFrame object with the same length as the original df. 
The series object will contain a list of the grouped strings""" - test_series_1 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) - test_series_id_1 = pd.Series(['A0', 'A1', 'A2', 'A3']) + test_series_1 = pd.Series(["foooo", "bar", "baz", "foooob"]) + test_series_id_1 = pd.Series(["A0", "A1", "A2", "A3"]) sg = StringGrouper(test_series_1, master_id=test_series_id_1, ignore_index=True) sg = sg.fit() result = sg.get_groups() - expected_result = pd.DataFrame(list(zip(['A0', 'A1', 'A2', 'A0'], ['foooo', 'bar', 'baz', 'foooo'])), - columns=['group_rep_id', 'group_rep']) + expected_result = pd.DataFrame( + list(zip(["A0", "A1", "A2", "A0"], ["foooo", "bar", "baz", "foooo"])), + columns=["group_rep_id", "group_rep"], + ) pd.testing.assert_frame_equal(expected_result, result) def test_get_groups_two_df(self): """Should return a pd.Series object with the length of the dupes. The series will contain the master string that matches the dupe with the highest similarity""" - test_series_1 = pd.Series(['foooo', 'bar', 'baz']) - test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) + test_series_1 = pd.Series(["foooo", "bar", "baz"]) + test_series_2 = pd.Series(["foooo", "bar", "baz", "foooob"]) sg = StringGrouper(test_series_1, test_series_2, ignore_index=True) sg = sg.fit() result = sg.get_groups() - expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'], name='most_similar_master') + expected_result = pd.Series( + ["foooo", "bar", "baz", "foooo"], name="most_similar_master" + ) pd.testing.assert_series_equal(expected_result, result) def test_get_groups_2_string_series_2_id_series(self): """Should return a pd.DataFrame object with the length of the dupes. 
The series will contain the master string that matches the dupe with the highest similarity""" - test_series_1 = pd.Series(['foooo', 'bar', 'baz']) - test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) - test_series_id_1 = pd.Series(['A0', 'A1', 'A2']) - test_series_id_2 = pd.Series(['B0', 'B1', 'B2', 'B3']) - sg = StringGrouper(test_series_1, - test_series_2, - master_id=test_series_id_1, - duplicates_id=test_series_id_2, - ignore_index=True) + test_series_1 = pd.Series(["foooo", "bar", "baz"]) + test_series_2 = pd.Series(["foooo", "bar", "baz", "foooob"]) + test_series_id_1 = pd.Series(["A0", "A1", "A2"]) + test_series_id_2 = pd.Series(["B0", "B1", "B2", "B3"]) + sg = StringGrouper( + test_series_1, + test_series_2, + master_id=test_series_id_1, + duplicates_id=test_series_id_2, + ignore_index=True, + ) sg = sg.fit() result = sg.get_groups() - expected_result = pd.DataFrame(list(zip(['A0', 'A1', 'A2', 'A0'], ['foooo', 'bar', 'baz', 'foooo'])), - columns=['most_similar_master_id', 'most_similar_master']) + expected_result = pd.DataFrame( + list(zip(["A0", "A1", "A2", "A0"], ["foooo", "bar", "baz", "foooo"])), + columns=["most_similar_master_id", "most_similar_master"], + ) pd.testing.assert_frame_equal(expected_result, result) - def test_get_groups_2_string_series_2_numeric_id_series_with_missing_master_value(self): + def test_get_groups_2_string_series_2_numeric_id_series_with_missing_master_value( + self, + ): """Should return a pd.DataFrame object with the length of the dupes. 
The series will contain the master string that matches the dupe with the highest similarity""" - test_series_1 = pd.Series(['foooo', 'bar', 'foooo']) - test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) + test_series_1 = pd.Series(["foooo", "bar", "foooo"]) + test_series_2 = pd.Series(["foooo", "bar", "baz", "foooob"]) test_series_id_1 = pd.Series([0, 1, 2]) test_series_id_2 = pd.Series([100, 101, 102, 103]) - sg = StringGrouper(test_series_1, - test_series_2, - master_id=test_series_id_1, - duplicates_id=test_series_id_2, - ignore_index=True) + sg = StringGrouper( + test_series_1, + test_series_2, + master_id=test_series_id_1, + duplicates_id=test_series_id_2, + ignore_index=True, + ) sg = sg.fit() result = sg.get_groups() - expected_result = pd.DataFrame(list(zip([0, 1, 102, 0], ['foooo', 'bar', 'baz', 'foooo'])), - columns=['most_similar_master_id', 'most_similar_master']) + expected_result = pd.DataFrame( + list(zip([0.0, 1.0, 102.0, 0.0], ["foooo", "bar", "baz", "foooo"])), + columns=["most_similar_master_id", "most_similar_master"], + ) pd.testing.assert_frame_equal(expected_result, result) - def test_get_groups_2_string_series_with_numeric_indexes_and_missing_master_value(self): + def test_get_groups_2_string_series_with_numeric_indexes_and_missing_master_value( + self, + ): """Should return a pd.DataFrame object with the length of the dupes. 
The series will contain the master string that matches the dupe with the highest similarity""" - test_series_1 = pd.Series(['foooo', 'bar', 'foooo'], index=[0, 1, 2]) - test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'], index=[100, 101, 102, 103]) + test_series_1 = pd.Series(["foooo", "bar", "foooo"], index=[0, 1, 2]) + test_series_2 = pd.Series( + ["foooo", "bar", "baz", "foooob"], index=[100, 101, 102, 103] + ) sg = StringGrouper(test_series_1, test_series_2, replace_na=True) sg = sg.fit() result = sg.get_groups() - expected_result = pd.DataFrame(list(zip([0, 1, 102, 0], ['foooo', 'bar', 'baz', 'foooo'])), - columns=['most_similar_index', 'most_similar_master'], - index=test_series_2.index) + expected_result = pd.DataFrame( + list(zip([0.0, 1.0, 102.0, 0.0], ["foooo", "bar", "baz", "foooo"])), + columns=["most_similar_index", "most_similar_master"], + index=test_series_2.index, + ) pd.testing.assert_frame_equal(expected_result, result) def test_get_groups_two_df_same_similarity(self): """Should return a pd.Series object with the length of the dupes. If there are two dupes with the same similarity, the first one is chosen""" - test_series_1 = pd.Series(['foooo', 'bar', 'baz', 'foooo']) - test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) + test_series_1 = pd.Series(["foooo", "bar", "baz", "foooo"]) + test_series_2 = pd.Series(["foooo", "bar", "baz", "foooob"]) sg = StringGrouper(test_series_1, test_series_2, ignore_index=True) sg = sg.fit() result = sg.get_groups() - expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'], name='most_similar_master') + expected_result = pd.Series( + ["foooo", "bar", "baz", "foooo"], name="most_similar_master" + ) pd.testing.assert_series_equal(expected_result, result) def test_get_groups_4_df_same_similarity(self): """Should return a pd.DataFrame object with the length of the dupes. 
If there are two dupes with the same similarity, the first one is chosen""" - test_series_1 = pd.Series(['foooo', 'bar', 'baz', 'foooo']) - test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) - test_series_id_1 = pd.Series(['A0', 'A1', 'A2', 'A3']) - test_series_id_2 = pd.Series(['B0', 'B1', 'B2', 'B3']) - sg = StringGrouper(test_series_1, - test_series_2, - master_id=test_series_id_1, - duplicates_id=test_series_id_2, - ignore_index=True) + test_series_1 = pd.Series(["foooo", "bar", "baz", "foooo"]) + test_series_2 = pd.Series(["foooo", "bar", "baz", "foooob"]) + test_series_id_1 = pd.Series(["A0", "A1", "A2", "A3"]) + test_series_id_2 = pd.Series(["B0", "B1", "B2", "B3"]) + sg = StringGrouper( + test_series_1, + test_series_2, + master_id=test_series_id_1, + duplicates_id=test_series_id_2, + ignore_index=True, + ) sg = sg.fit() result = sg.get_groups() - expected_result = pd.DataFrame(list(zip(['A0', 'A1', 'A2', 'A0'], ['foooo', 'bar', 'baz', 'foooo'])), - columns=['most_similar_master_id', 'most_similar_master']) + expected_result = pd.DataFrame( + list(zip(["A0", "A1", "A2", "A0"], ["foooo", "bar", "baz", "foooo"])), + columns=["most_similar_master_id", "most_similar_master"], + ) pd.testing.assert_frame_equal(expected_result, result) def test_get_groups_two_df_no_match(self): """Should return a pd.Series object with the length of the dupes. 
If no match is found in dupes, the original will be returned""" - test_series_1 = pd.Series(['foooo', 'bar', 'baz']) - test_series_2 = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob']) + test_series_1 = pd.Series(["foooo", "bar", "baz"]) + test_series_2 = pd.Series(["foooo", "dooz", "bar", "baz", "foooob"]) sg = StringGrouper(test_series_1, test_series_2, ignore_index=True) sg = sg.fit() result = sg.get_groups() - expected_result = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooo'], name='most_similar_master') + expected_result = pd.Series( + ["foooo", "dooz", "bar", "baz", "foooo"], name="most_similar_master" + ) pd.testing.assert_series_equal(expected_result, result) def test_get_groups_4_df_no_match(self): """Should return a pd.DataFrame object with the length of the dupes. If no match is found in dupes, the original will be returned""" - test_series_1 = pd.Series(['foooo', 'bar', 'baz']) - test_series_2 = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob']) - test_series_id_1 = pd.Series(['A0', 'A1', 'A2']) - test_series_id_2 = pd.Series(['B0', 'B1', 'B2', 'B3', 'B4']) - sg = StringGrouper(test_series_1, - test_series_2, - master_id=test_series_id_1, - duplicates_id=test_series_id_2, - ignore_index=True) + test_series_1 = pd.Series(["foooo", "bar", "baz"]) + test_series_2 = pd.Series(["foooo", "dooz", "bar", "baz", "foooob"]) + test_series_id_1 = pd.Series(["A0", "A1", "A2"]) + test_series_id_2 = pd.Series(["B0", "B1", "B2", "B3", "B4"]) + sg = StringGrouper( + test_series_1, + test_series_2, + master_id=test_series_id_1, + duplicates_id=test_series_id_2, + ignore_index=True, + ) sg = sg.fit() result = sg.get_groups() - expected_result = pd.DataFrame(list(zip( - ['A0', 'B1', 'A1', 'A2', 'A0'], ['foooo', 'dooz', 'bar', 'baz', 'foooo'] - )), - columns=['most_similar_master_id', 'most_similar_master'] + expected_result = pd.DataFrame( + list( + zip( + ["A0", "B1", "A1", "A2", "A0"], + ["foooo", "dooz", "bar", "baz", "foooo"], + ) + ), + 
columns=["most_similar_master_id", "most_similar_master"], ) pd.testing.assert_frame_equal(expected_result, result) def test_get_groups_raises_exception(self): """Should raise an exception if called before the StringGrouper is fit""" - test_series_1 = pd.Series(['foooo', 'bar', 'baz', 'foooo']) - test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) + test_series_1 = pd.Series(["foooo", "bar", "baz", "foooo"]) + test_series_2 = pd.Series(["foooo", "bar", "baz", "foooob"]) sg = StringGrouper(test_series_1, test_series_2) with self.assertRaises(StringGrouperNotFitException): _ = sg.get_groups() def test_add_match_raises_exception_if_string_not_present(self): - test_series_1 = pd.Series(['foooo', 'no match', 'baz', 'foooo']) - test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) + test_series_1 = pd.Series(["foooo", "no match", "baz", "foooo"]) + test_series_2 = pd.Series(["foooo", "bar", "baz", "foooob"]) sg = StringGrouper(test_series_1).fit() sg2 = StringGrouper(test_series_1, test_series_2).fit() with self.assertRaises(ValueError): - sg.add_match('doesnt exist', 'baz') + sg.add_match("doesnt exist", "baz") with self.assertRaises(ValueError): - sg.add_match('baz', 'doesnt exist') + sg.add_match("baz", "doesnt exist") with self.assertRaises(ValueError): - sg2.add_match('doesnt exist', 'baz') + sg2.add_match("doesnt exist", "baz") with self.assertRaises(ValueError): - sg2.add_match('baz', 'doesnt exist') + sg2.add_match("baz", "doesnt exist") def test_add_match_single_occurence(self): """Should add the match if there are no exact duplicates""" - test_series_1 = pd.Series(['foooo', 'no match', 'baz', 'foooo']) - test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) + test_series_1 = pd.Series(["foooo", "no match", "baz", "foooo"]) + test_series_2 = pd.Series(["foooo", "bar", "baz", "foooob"]) sg = StringGrouper(test_series_1).fit() - sg.add_match('no match', 'baz') + sg.add_match("no match", "baz") matches = sg.get_matches() - matches = 
matches[(matches.left_side == 'no match') & (matches.right_side == 'baz')] + matches = matches[ + (matches.left_side == "no match") & (matches.right_side == "baz") + ] self.assertEqual(1, matches.shape[0]) sg2 = StringGrouper(test_series_1, test_series_2).fit() - sg2.add_match('no match', 'bar') + sg2.add_match("no match", "bar") matches = sg2.get_matches() - matches = matches[(matches.left_side == 'no match') & (matches.right_side == 'bar')] + matches = matches[ + (matches.left_side == "no match") & (matches.right_side == "bar") + ] self.assertEqual(1, matches.shape[0]) def test_add_match_single_group_matches_symmetric(self): """New matches that are added to a SG with only a master series should be symmetric""" - test_series_1 = pd.Series(['foooo', 'no match', 'baz', 'foooo']) + test_series_1 = pd.Series(["foooo", "no match", "baz", "foooo"]) sg = StringGrouper(test_series_1).fit() - sg.add_match('no match', 'baz') + sg.add_match("no match", "baz") matches = sg.get_matches() - matches_1 = matches[(matches.left_side == 'no match') & (matches.right_side == 'baz')] + matches_1 = matches[ + (matches.left_side == "no match") & (matches.right_side == "baz") + ] self.assertEqual(1, matches_1.shape[0]) - matches_2 = matches[(matches.left_side == 'baz') & (matches.right_side == 'no match')] + matches_2 = matches[ + (matches.left_side == "baz") & (matches.right_side == "no match") + ] self.assertEqual(1, matches_2.shape[0]) def test_add_match_multiple_occurences(self): """Should add multiple matches if there are exact duplicates""" - test_series_1 = pd.Series(['foooo', 'no match', 'baz', 'foooo']) - test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) + test_series_1 = pd.Series(["foooo", "no match", "baz", "foooo"]) + test_series_2 = pd.Series(["foooo", "bar", "baz", "foooob"]) sg = StringGrouper(test_series_1, test_series_2).fit() - sg.add_match('foooo', 'baz') + sg.add_match("foooo", "baz") matches = sg.get_matches() - matches = matches[(matches.left_side == 
'foooo') & (matches.right_side == 'baz')] + matches = matches[ + (matches.left_side == "foooo") & (matches.right_side == "baz") + ] self.assertEqual(2, matches.shape[0]) def test_remove_match(self): """Should remove a match""" - test_series_1 = pd.Series(['foooo', 'no match', 'baz', 'foooob']) - test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) + test_series_1 = pd.Series(["foooo", "no match", "baz", "foooob"]) + test_series_2 = pd.Series(["foooo", "bar", "baz", "foooob"]) sg = StringGrouper(test_series_1).fit() - sg.remove_match('foooo', 'foooob') + sg.remove_match("foooo", "foooob") matches = sg.get_matches() - matches_1 = matches[(matches.left_side == 'foooo') & (matches.right_side == 'foooob')] + matches_1 = matches[ + (matches.left_side == "foooo") & (matches.right_side == "foooob") + ] # In the case of only a master series, the matches are recursive, so both variants are to be removed - matches_2 = matches[(matches.left_side == 'foooob') & (matches.right_side == 'foooo')] + matches_2 = matches[ + (matches.left_side == "foooob") & (matches.right_side == "foooo") + ] self.assertEqual(0, matches_1.shape[0]) self.assertEqual(0, matches_2.shape[0]) sg2 = StringGrouper(test_series_1, test_series_2).fit() - sg2.remove_match('foooo', 'foooob') + sg2.remove_match("foooo", "foooob") matches = sg2.get_matches() - matches = matches[(matches.left_side == 'foooo') & (matches.right_side == 'foooob')] + matches = matches[ + (matches.left_side == "foooo") & (matches.right_side == "foooob") + ] self.assertEqual(0, matches.shape[0]) def test_string_grouper_type_error(self): """StringGrouper should raise an typeerror master or duplicates are not a series of strings""" with self.assertRaises(TypeError): - _ = StringGrouper('foo', 'bar') + _ = StringGrouper("foo", "bar") with self.assertRaises(TypeError): - _ = StringGrouper(pd.Series(['foo', 'bar']), pd.Series(['foo', 1])) + _ = StringGrouper(pd.Series(["foo", "bar"]), pd.Series(["foo", 1])) with 
self.assertRaises(TypeError): - _ = StringGrouper(pd.Series(['foo', np.nan]), pd.Series(['foo', 'j'])) + _ = StringGrouper(pd.Series(["foo", np.nan]), pd.Series(["foo", "j"])) def test_prior_matches_added(self): """When a new match is added, any pre-existing matches should also be updated""" sample = [ - 'microsoftoffice 365 home', - 'microsoftoffice 365 pers', - 'microsoft office' - ] + "microsoftoffice 365 home", + "microsoftoffice 365 pers", + "microsoft office", + ] - df = pd.DataFrame(sample, columns=['name']) + df = pd.DataFrame(sample, columns=["name"]) - sg = StringGrouper(df['name'], ignore_index=True) + sg = StringGrouper(df["name"], ignore_index=True) sg = sg.fit() - sg = sg.add_match('microsoft office', 'microsoftoffice 365 home') - sg = sg.add_match('microsoftoffice 365 pers', 'microsoft office') - df['deduped'] = sg.get_groups() + sg = sg.add_match("microsoft office", "microsoftoffice 365 home") + sg = sg.add_match("microsoftoffice 365 pers", "microsoft office") + df["deduped"] = sg.get_groups() # All strings should now match to the same "master" string self.assertEqual(1, len(df.deduped.unique())) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/string_grouper_utils/__init__.py b/string_grouper_utils/__init__.py index 3ab821f..9f7ee83 100644 --- a/string_grouper_utils/__init__.py +++ b/string_grouper_utils/__init__.py @@ -1,2 +1,5 @@ -from .string_grouper_utils import new_group_rep_by_earliest_timestamp, new_group_rep_by_completeness, \ - new_group_rep_by_highest_weight +from .string_grouper_utils import ( + new_group_rep_by_earliest_timestamp, + new_group_rep_by_completeness, + new_group_rep_by_highest_weight, +) diff --git a/string_grouper_utils/string_grouper_utils.py b/string_grouper_utils/string_grouper_utils.py index e674367..79eab59 100644 --- a/string_grouper_utils/string_grouper_utils.py +++ b/string_grouper_utils/string_grouper_utils.py @@ -8,13 +8,15 @@ import pydoc -def 
new_group_rep_by_earliest_timestamp(grouped_data: pd.DataFrame, - group_col: Union[str, int], - record_id_col: Union[str, int], - timestamps: Union[pd.Series, str, int], - record_name_col: Optional[Union[str, int]] = None, - parserinfo=None, - **kwargs) -> Union[pd.DataFrame, pd.Series]: +def new_group_rep_by_earliest_timestamp( + grouped_data: pd.DataFrame, + group_col: Union[str, int], + record_id_col: Union[str, int], + timestamps: Union[pd.Series, str, int], + record_name_col: Optional[Union[str, int]] = None, + parserinfo=None, + **kwargs +) -> Union[pd.DataFrame, pd.Series]: """ Selects the oldest string in each group as group-representative. :param grouped_data: The grouped DataFrame @@ -34,19 +36,24 @@ def new_group_rep_by_earliest_timestamp(grouped_data: pd.DataFrame, """ if isinstance(timestamps, pd.Series): if len(grouped_data) != len(timestamps): - raise Exception('Both grouped_data and timestamps must be pandas.Series of the same length.') + raise Exception( + "Both grouped_data and timestamps must be pandas.Series of the same length." 
+ ) else: timestamps = get_column(timestamps, grouped_data) weights = parse_timestamps(timestamps, parserinfo, **kwargs) - return group_rep_transform('idxmin', weights, grouped_data, group_col, record_id_col, record_name_col) - - -def new_group_rep_by_completeness(grouped_data: pd.DataFrame, - group_col: Union[str, int], - record_id_col: Union[str, int], - record_name_col: Optional[Union[str, int]] = None, - tested_cols: Optional[Union[pd.DataFrame, List[Union[str, int]]]] = None - ) -> Union[pd.DataFrame, pd.Series]: + return group_rep_transform( + "idxmin", weights, grouped_data, group_col, record_id_col, record_name_col + ) + + +def new_group_rep_by_completeness( + grouped_data: pd.DataFrame, + group_col: Union[str, int], + record_id_col: Union[str, int], + record_name_col: Optional[Union[str, int]] = None, + tested_cols: Optional[Union[pd.DataFrame, List[Union[str, int]]]] = None, +) -> Union[pd.DataFrame, pd.Series]: """ Selects the string in the group with the most filled-in row/record as group-representative. :param grouped_data: The grouped DataFrame @@ -62,28 +69,33 @@ def new_group_rep_by_completeness(grouped_data: pd.DataFrame, """ if isinstance(tested_cols, pd.DataFrame): if len(grouped_data) != len(tested_cols): - raise Exception('Both grouped_data and tested_cols must be pandas.DataFrame of the same length.') + raise Exception( + "Both grouped_data and tested_cols must be pandas.DataFrame of the same length." 
+ ) elif tested_cols is not None: tested_cols = get_column(tested_cols, grouped_data) else: tested_cols = grouped_data def is_notnull_and_not_empty(x): - if x == '' or pd.isnull(x): + if x == "" or pd.isnull(x): return 0 else: return 1 - weights = tested_cols.applymap(is_notnull_and_not_empty).sum(axis=1) - return group_rep_transform('idxmax', weights, grouped_data, group_col, record_id_col, record_name_col) + weights = tested_cols.map(is_notnull_and_not_empty).sum(axis=1) + return group_rep_transform( + "idxmax", weights, grouped_data, group_col, record_id_col, record_name_col + ) -def new_group_rep_by_highest_weight(grouped_data: pd.DataFrame, - group_col: Union[str, int], - record_id_col: Union[str, int], - weights: Union[pd.Series, str, int], - record_name_col: Optional[Union[str, int]] = None, - ) -> Union[pd.DataFrame, pd.Series]: +def new_group_rep_by_highest_weight( + grouped_data: pd.DataFrame, + group_col: Union[str, int], + record_id_col: Union[str, int], + weights: Union[pd.Series, str, int], + record_name_col: Optional[Union[str, int]] = None, +) -> Union[pd.DataFrame, pd.Series]: """ Selects the string in the group with the largest weight as group-representative. :param grouped_data: The grouped DataFrame @@ -97,31 +109,50 @@ def new_group_rep_by_highest_weight(grouped_data: pd.DataFrame, """ if isinstance(weights, pd.Series): if len(grouped_data) != len(weights): - raise Exception('Both grouped_data and weights must be pandas.Series of the same length.') + raise Exception( + "Both grouped_data and weights must be pandas.Series of the same length." 
+ ) else: weights = get_column(weights, grouped_data) - return group_rep_transform('idxmax', weights, grouped_data, group_col, record_id_col, record_name_col) - - -def group_rep_transform(method: str, - weights: pd.Series, - grouped_data, - group_col, - record_id_col, - record_name_col) -> Union[pd.Series, pd.DataFrame]: + return group_rep_transform( + "idxmax", weights, grouped_data, group_col, record_id_col, record_name_col + ) + + +def group_rep_transform( + method: str, + weights: pd.Series, + grouped_data, + group_col, + record_id_col, + record_name_col, +) -> Union[pd.Series, pd.DataFrame]: stashed_index = grouped_data.index group_of_master_id = get_column(group_col, grouped_data).reset_index(drop=True) - group_of_master_id = group_of_master_id.rename('raw_group_id').reset_index().rename(columns={'index': 'weight'}) - group_of_master_id['weight'] = weights.reset_index(drop=True) - group_of_master_id['group_rep'] = \ - group_of_master_id.groupby('raw_group_id', sort=False)['weight'].transform(method) + group_of_master_id = ( + group_of_master_id.rename("raw_group_id") + .reset_index() + .rename(columns={"index": "weight"}) + ) + group_of_master_id["weight"] = weights.reset_index(drop=True) + group_of_master_id["group_rep"] = group_of_master_id.groupby( + "raw_group_id", sort=False + )["weight"].transform(method) record_id_col = get_column(record_id_col, grouped_data) - new_rep = record_id_col.iloc[group_of_master_id.group_rep].reset_index(drop=True).rename(None) + new_rep = ( + record_id_col.iloc[group_of_master_id.group_rep] + .reset_index(drop=True) + .rename(None) + ) if record_name_col is None: output = new_rep else: record_name_col = get_column(record_name_col, grouped_data) - new_rep_name = record_name_col.iloc[group_of_master_id.group_rep].reset_index(drop=True).rename(None) + new_rep_name = ( + record_name_col.iloc[group_of_master_id.group_rep] + .reset_index(drop=True) + .rename(None) + ) output = pd.concat([new_rep, new_rep_name], axis=1) 
output.index = stashed_index return output @@ -141,10 +172,12 @@ def parse_timestamps(timestamps: pd.Series, parserinfo=None, **kwargs) -> pd.Ser error_msg += " or datetime datatype or pandas Timestamp datatype or numbers" if is_series_of_type(str, timestamps): # if any of the strings is not datetime-like raise an exception - if timestamps.to_frame().applymap(is_date).squeeze().all(): + if timestamps.to_frame().map(is_date).squeeze().all(): # convert strings to numpy datetime64 - return timestamps.transform(lambda x: parse(x, parserinfo, **kwargs).astimezone(UTC)) - elif is_series_of_type(type(pd.Timestamp('15-1-2000')), timestamps): + return timestamps.transform( + lambda x: parse(x, parserinfo, **kwargs).astimezone(UTC) + ) + elif is_series_of_type(type(pd.Timestamp("15-1-2000")), timestamps): # convert pandas Timestamps to numpy datetime64 return timestamps.transform(lambda x: x.to_numpy()) elif is_series_of_type(datetime, timestamps): @@ -172,20 +205,19 @@ def is_date(string, parserinfo=None, **kwargs): def is_series_of_type(what: type, series_to_test: pd.Series) -> bool: - if series_to_test.to_frame().applymap( - lambda x: not isinstance(x, what) - ).squeeze().any(): + if series_to_test.to_frame().map(lambda x: not isinstance(x, what)).squeeze().any(): return False return True # The following lines modify and append the kwargs portion of the docstring of dateutil.parser.parse to # the docstring of new_group_rep_by_earliest_timestamp: -parse_docstring_kwargs = re.search(':param parserinfo:.*?:return:', pydoc.render_doc(parse), flags=re.DOTALL).group(0) +parse_docstring_kwargs = re.search( + ":param parserinfo:.*?:return:", pydoc.render_doc(parse), flags=re.DOTALL +).group(0) parse_docstring_kwargs = re.sub( - '``timestr``', - 'the strings containing the date/time-stamps', - parse_docstring_kwargs + "``timestr``", "the strings containing the date/time-stamps", parse_docstring_kwargs +) +new_group_rep_by_earliest_timestamp.__doc__ = ( + 
new_group_rep_by_earliest_timestamp.__doc__ + parse_docstring_kwargs[:-9] ) -new_group_rep_by_earliest_timestamp.__doc__ = new_group_rep_by_earliest_timestamp.__doc__ + \ - parse_docstring_kwargs[:-9] diff --git a/string_grouper_utils/test/test_string_grouper_utils.py b/string_grouper_utils/test/test_string_grouper_utils.py index 0c8a8ee..abb822e 100644 --- a/string_grouper_utils/test/test_string_grouper_utils.py +++ b/string_grouper_utils/test/test_string_grouper_utils.py @@ -1,83 +1,149 @@ import unittest import pandas as pd from dateutil.parser import parse -from string_grouper_utils.string_grouper_utils import new_group_rep_by_earliest_timestamp, \ - new_group_rep_by_completeness, new_group_rep_by_highest_weight +from string_grouper_utils.string_grouper_utils import ( + new_group_rep_by_earliest_timestamp, + new_group_rep_by_completeness, + new_group_rep_by_highest_weight, +) class SimpleExample(object): def __init__(self): self.customers_df = pd.DataFrame( [ - ('BB016741P', 'Mega Enterprises Corporation', 'Address0', 'Tel0', 'Description0', 0.2, - '2014-12-30 10:55:00-02:00', 'EE059082Q', 'Mega Enterprises Corp.'), - ('CC082744L', 'Hyper Startup Incorporated', '', 'Tel1', '', 0.5, '2017-01-01 20:23:15-05:00', - 'BB099931J', 'Hyper-Startup Inc.'), - ('AA098762D', 'Hyper Startup Inc.', 'Address2', 'Tel2', 'Description2', 0.3, - '2020-10-20 15:29:30+02:00', 'BB099931J', 'Hyper-Startup Inc.'), - ('BB099931J', 'Hyper-Startup Inc.', 'Address3', 'Tel3', 'Description3', 0.1, - '2013-07-01 03:34:45-05:00', 'BB099931J', 'Hyper-Startup Inc.'), - ('HH072982K', 'Hyper Hyper Inc.', 'Address4', '', 'Description4', 0.9, '2005-09-11 11:56:00-07:00', - 'HH072982K', 'Hyper Hyper Inc.'), - ('EE059082Q', 'Mega Enterprises Corp.', 'Address5', 'Tel5', 'Description5', 1.0, - '1998-04-14 09:21:11+00:00', 'EE059082Q', 'Mega Enterprises Corp.') + ( + "BB016741P", + "Mega Enterprises Corporation", + "Address0", + "Tel0", + "Description0", + 0.2, + "2014-12-30 10:55:00-02:00", + 
"EE059082Q", + "Mega Enterprises Corp.", + ), + ( + "CC082744L", + "Hyper Startup Incorporated", + "", + "Tel1", + "", + 0.5, + "2017-01-01 20:23:15-05:00", + "BB099931J", + "Hyper-Startup Inc.", + ), + ( + "AA098762D", + "Hyper Startup Inc.", + "Address2", + "Tel2", + "Description2", + 0.3, + "2020-10-20 15:29:30+02:00", + "BB099931J", + "Hyper-Startup Inc.", + ), + ( + "BB099931J", + "Hyper-Startup Inc.", + "Address3", + "Tel3", + "Description3", + 0.1, + "2013-07-01 03:34:45-05:00", + "BB099931J", + "Hyper-Startup Inc.", + ), + ( + "HH072982K", + "Hyper Hyper Inc.", + "Address4", + "", + "Description4", + 0.9, + "2005-09-11 11:56:00-07:00", + "HH072982K", + "Hyper Hyper Inc.", + ), + ( + "EE059082Q", + "Mega Enterprises Corp.", + "Address5", + "Tel5", + "Description5", + 1.0, + "1998-04-14 09:21:11+00:00", + "EE059082Q", + "Mega Enterprises Corp.", + ), ], - columns=('Customer ID', 'Customer Name', 'Address', 'Tel', 'Description', 'weight', 'timestamp', - 'group ID', 'group name') + columns=( + "Customer ID", + "Customer Name", + "Address", + "Tel", + "Description", + "weight", + "timestamp", + "group ID", + "group name", + ), ) # new_group_rep_by_earliest_timestamp(customers_df, 'group ID', 'Customer ID', 'timestamp') self.expected_result_TS = pd.Series( [ - 'EE059082Q', - 'BB099931J', - 'BB099931J', - 'BB099931J', - 'HH072982K', - 'EE059082Q', + "EE059082Q", + "BB099931J", + "BB099931J", + "BB099931J", + "HH072982K", + "EE059082Q", ] ) # new_group_rep_by_earliest_timestamp(customers_df, 'group ID', 'Customer ID', 'timestamp', 'Customer Name') self.expected_result_T = pd.DataFrame( [ - ('EE059082Q', 'Mega Enterprises Corp.'), - ('BB099931J', 'Hyper-Startup Inc.'), - ('BB099931J', 'Hyper-Startup Inc.'), - ('BB099931J', 'Hyper-Startup Inc.'), - ('HH072982K', 'Hyper Hyper Inc.'), - ('EE059082Q', 'Mega Enterprises Corp.') + ("EE059082Q", "Mega Enterprises Corp."), + ("BB099931J", "Hyper-Startup Inc."), + ("BB099931J", "Hyper-Startup Inc."), + ("BB099931J", 
"Hyper-Startup Inc."), + ("HH072982K", "Hyper Hyper Inc."), + ("EE059082Q", "Mega Enterprises Corp."), ] ) # new_group_rep_by_earliest_timestamp(customers_df, 'group ID', 'Customer ID', 'weight', 'Customer Name') self.expected_result_TW = pd.DataFrame( [ - ('BB016741P', 'Mega Enterprises Corporation'), - ('BB099931J', 'Hyper-Startup Inc.'), - ('BB099931J', 'Hyper-Startup Inc.'), - ('BB099931J', 'Hyper-Startup Inc.'), - ('HH072982K', 'Hyper Hyper Inc.'), - ('BB016741P', 'Mega Enterprises Corporation') + ("BB016741P", "Mega Enterprises Corporation"), + ("BB099931J", "Hyper-Startup Inc."), + ("BB099931J", "Hyper-Startup Inc."), + ("BB099931J", "Hyper-Startup Inc."), + ("HH072982K", "Hyper Hyper Inc."), + ("BB016741P", "Mega Enterprises Corporation"), ] ) # new_group_rep_by_highest_weight(customers_df, 'group ID', 'Customer ID', 'weight', 'Customer Name') self.expected_result_W = pd.DataFrame( [ - ('EE059082Q', 'Mega Enterprises Corp.'), - ('CC082744L', 'Hyper Startup Incorporated'), - ('CC082744L', 'Hyper Startup Incorporated'), - ('CC082744L', 'Hyper Startup Incorporated'), - ('HH072982K', 'Hyper Hyper Inc.'), - ('EE059082Q', 'Mega Enterprises Corp.') + ("EE059082Q", "Mega Enterprises Corp."), + ("CC082744L", "Hyper Startup Incorporated"), + ("CC082744L", "Hyper Startup Incorporated"), + ("CC082744L", "Hyper Startup Incorporated"), + ("HH072982K", "Hyper Hyper Inc."), + ("EE059082Q", "Mega Enterprises Corp."), ] ) # new_group_rep_by_highest_weight(customers_df, 'group ID', 'Customer ID', 'weight', 'Customer Name') self.expected_result_C = pd.DataFrame( [ - ('BB016741P', 'Mega Enterprises Corporation'), - ('AA098762D', 'Hyper Startup Inc.'), - ('AA098762D', 'Hyper Startup Inc.'), - ('AA098762D', 'Hyper Startup Inc.'), - ('HH072982K', 'Hyper Hyper Inc.'), - ('BB016741P', 'Mega Enterprises Corporation') + ("BB016741P", "Mega Enterprises Corporation"), + ("AA098762D", "Hyper Startup Inc."), + ("AA098762D", "Hyper Startup Inc."), + ("AA098762D", "Hyper Startup Inc."), + 
("HH072982K", "Hyper Hyper Inc."), + ("BB016741P", "Mega Enterprises Corporation"), ] ) @@ -85,49 +151,45 @@ def __init__(self): class StringGrouperUtilTest(unittest.TestCase): def test_group_rep_by_timestamp_return_series(self): """Should return a pd.Series object with the same length as the grouped_data. The series object will contain - a list of groups whose group-representatives have the earliest timestamp of the group""" + a list of groups whose group-representatives have the earliest timestamp of the group + """ simple_example = SimpleExample() customers_df = simple_example.customers_df pd.testing.assert_series_equal( simple_example.expected_result_TS, new_group_rep_by_earliest_timestamp( - customers_df, - 'group ID', - 'Customer ID', - 'timestamp' - ) + customers_df, "group ID", "Customer ID", "timestamp" + ), ) def test_group_rep_by_timestamp_return_dataframe(self): """Should return a pd.DataFrame object with the same length as the grouped_data. The DataFrame object will contain - a list of groups whose group-representatives have the earliest timestamp of the group""" + a list of groups whose group-representatives have the earliest timestamp of the group + """ simple_example = SimpleExample() customers_df = simple_example.customers_df pd.testing.assert_frame_equal( simple_example.expected_result_T, new_group_rep_by_earliest_timestamp( - customers_df, - 'group ID', - 'Customer ID', - 'timestamp', - 'Customer Name' - ) + customers_df, "group ID", "Customer ID", "timestamp", "Customer Name" + ), ) def test_group_rep_by_timestamp_series_input(self): """Should return a pd.DataFrame object with the same length as the grouped_data. 
The DataFrame object will contain - a list of groups whose group-representatives have the earliest timestamp of the group""" + a list of groups whose group-representatives have the earliest timestamp of the group + """ simple_example = SimpleExample() customers_df = simple_example.customers_df pd.testing.assert_frame_equal( simple_example.expected_result_T, new_group_rep_by_earliest_timestamp( customers_df, - 'group ID', - 'Customer ID', - customers_df['timestamp'], - 'Customer Name' - ) + "group ID", + "Customer ID", + customers_df["timestamp"], + "Customer Name", + ), ) def test_group_rep_by_timestamp_input_series_length(self): @@ -137,10 +199,10 @@ def test_group_rep_by_timestamp_input_series_length(self): with self.assertRaises(Exception): _ = new_group_rep_by_earliest_timestamp( customers_df, - 'group ID', - 'Customer ID', - customers_df['timestamp'].iloc[:-2], - 'Customer Name' + "group ID", + "Customer ID", + customers_df["timestamp"].iloc[:-2], + "Customer Name", ) def test_group_rep_by_timestamp_bad_input_timestamp_strings(self): @@ -150,46 +212,52 @@ def test_group_rep_by_timestamp_bad_input_timestamp_strings(self): with self.assertRaises(Exception): _ = new_group_rep_by_earliest_timestamp( customers_df, - 'group ID', - 'Customer ID', - customers_df['Customer ID'], - 'Customer Name' + "group ID", + "Customer ID", + customers_df["Customer ID"], + "Customer Name", ) def test_group_rep_by_timestamp_pandas_timestamps(self): """Should return a pd.DataFrame object with the same length as the grouped_data. 
The DataFrame object will contain - a list of groups whose group-representatives have the earliest timestamp of the group""" + a list of groups whose group-representatives have the earliest timestamp of the group + """ simple_example = SimpleExample() customers_df = simple_example.customers_df customers_df2 = customers_df.copy() - customers_df2['timestamp'] = customers_df2['timestamp'].transform(lambda t: pd.Timestamp(t)) + customers_df2["timestamp"] = customers_df2["timestamp"].transform( + lambda t: pd.Timestamp(t) + ) pd.testing.assert_frame_equal( simple_example.expected_result_T, new_group_rep_by_earliest_timestamp( customers_df2, - 'group ID', - 'Customer ID', - customers_df2['timestamp'], - 'Customer Name' - ) + "group ID", + "Customer ID", + customers_df2["timestamp"], + "Customer Name", + ), ) def test_group_rep_by_timestamp_dateutil_timestamps(self): """Should return a pd.DataFrame object with the same length as the grouped_data. The DataFrame object will contain - a list of groups whose group-representatives have the earliest timestamp of the group""" + a list of groups whose group-representatives have the earliest timestamp of the group + """ simple_example = SimpleExample() customers_df = simple_example.customers_df customers_df2 = customers_df.copy() - customers_df2['timestamp'] = customers_df2['timestamp'].transform(lambda t: parse(t)) + customers_df2["timestamp"] = customers_df2["timestamp"].transform( + lambda t: parse(t) + ) pd.testing.assert_frame_equal( simple_example.expected_result_T, new_group_rep_by_earliest_timestamp( customers_df2, - 'group ID', - 'Customer ID', - customers_df2['timestamp'], - 'Customer Name' - ) + "group ID", + "Customer ID", + customers_df2["timestamp"], + "Customer Name", + ), ) def test_group_rep_by_timestamp_bad_nonstring_timestamps(self): @@ -197,62 +265,61 @@ def test_group_rep_by_timestamp_bad_nonstring_timestamps(self): simple_example = SimpleExample() customers_df = simple_example.customers_df customers_df2 = 
customers_df.copy() - customers_df2.at[0, 'timestamp'] = 1.0 + customers_df2.at[0, "timestamp"] = 1.0 with self.assertRaises(Exception): _ = new_group_rep_by_earliest_timestamp( customers_df2, - 'group ID', - 'Customer ID', - customers_df2['timestamp'], - 'Customer Name' + "group ID", + "Customer ID", + customers_df2["timestamp"], + "Customer Name", ) def test_group_rep_by_timestamp_input_numbers(self): """Should return a pd.DataFrame object with the same length as the grouped_data. The DataFrame object will contain - a list of groups whose group-representatives have the earliest timestamp of the group""" + a list of groups whose group-representatives have the earliest timestamp of the group + """ simple_example = SimpleExample() customers_df = simple_example.customers_df pd.testing.assert_frame_equal( simple_example.expected_result_TW, new_group_rep_by_earliest_timestamp( customers_df, - 'group ID', - 'Customer ID', - customers_df['weight'], - 'Customer Name' - ) + "group ID", + "Customer ID", + customers_df["weight"], + "Customer Name", + ), ) def test_group_rep_by_weight(self): """Should return a pd.DataFrame object with the same length as the grouped_data. The DataFrame object will contain - a list of groups whose group-representatives have the highest weight of the group""" + a list of groups whose group-representatives have the highest weight of the group + """ simple_example = SimpleExample() customers_df = simple_example.customers_df pd.testing.assert_frame_equal( simple_example.expected_result_W, new_group_rep_by_highest_weight( - customers_df, - 'group ID', - 'Customer ID', - 'weight', - 'Customer Name' - ) + customers_df, "group ID", "Customer ID", "weight", "Customer Name" + ), ) def test_group_rep_by_weight_input_series(self): """Should return a pd.DataFrame object with the same length as the grouped_data. 
The DataFrame object will contain - a list of groups whose group-representatives have the highest weight of the group""" + a list of groups whose group-representatives have the highest weight of the group + """ simple_example = SimpleExample() customers_df = simple_example.customers_df pd.testing.assert_frame_equal( simple_example.expected_result_W, new_group_rep_by_highest_weight( customers_df, - 'group ID', - 'Customer ID', - customers_df['weight'], - 'Customer Name' - ) + "group ID", + "Customer ID", + customers_df["weight"], + "Customer Name", + ), ) def test_group_rep_by_weight_input_series_length(self): @@ -262,57 +329,49 @@ def test_group_rep_by_weight_input_series_length(self): with self.assertRaises(Exception): _ = new_group_rep_by_highest_weight( customers_df, - 'group ID', - 'Customer ID', - customers_df['weight'].iloc[:-2], - 'Customer Name' + "group ID", + "Customer ID", + customers_df["weight"].iloc[:-2], + "Customer Name", ) def test_group_rep_by_completeness_column_list(self): """Should return a pd.DataFrame object with the same length as the grouped_data. The DataFrame object will contain - a list of groups whose group-representatives have the most filled-in records of the group""" + a list of groups whose group-representatives have the most filled-in records of the group + """ simple_example = SimpleExample() customers_df = simple_example.customers_df pd.testing.assert_frame_equal( simple_example.expected_result_C, new_group_rep_by_completeness( - customers_df, - 'group ID', - 'Customer ID', - 'Customer Name', - [1, 2, 3, 4] - ) + customers_df, "group ID", "Customer ID", "Customer Name", [1, 2, 3, 4] + ), ) def test_group_rep_by_completeness_no_columns(self): """Should return a pd.DataFrame object with the same length as the grouped_data. 
The DataFrame object will contain - a list of groups whose group-representatives have the most filled-in records of the group""" + a list of groups whose group-representatives have the most filled-in records of the group + """ simple_example = SimpleExample() customers_df = simple_example.customers_df pd.testing.assert_frame_equal( simple_example.expected_result_C, new_group_rep_by_completeness( - customers_df, - 'group ID', - 'Customer ID', - 'Customer Name' - ) + customers_df, "group ID", "Customer ID", "Customer Name" + ), ) def test_group_rep_by_completeness_input_dataframe(self): """Should return a pd.DataFrame object with the same length as the grouped_data. The DataFrame object will contain - a list of groups whose group-representatives have the most filled-in records of the group""" + a list of groups whose group-representatives have the most filled-in records of the group + """ simple_example = SimpleExample() customers_df = simple_example.customers_df pd.testing.assert_frame_equal( simple_example.expected_result_C, new_group_rep_by_completeness( - customers_df, - 'group ID', - 'Customer ID', - 'Customer Name', - customers_df - ) + customers_df, "group ID", "Customer ID", "Customer Name", customers_df + ), ) def test_group_rep_by_completeness_input_dataframe_length(self): @@ -322,12 +381,12 @@ def test_group_rep_by_completeness_input_dataframe_length(self): with self.assertRaises(Exception): _ = new_group_rep_by_completeness( customers_df, - 'group ID', - 'Customer ID', - 'Customer Name', - customers_df.iloc[:-2, :] + "group ID", + "Customer ID", + "Customer Name", + customers_df.iloc[:-2, :], ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() From 2b5a1d6e06d2dbe1f51cc9f623374a8ece9578d1 Mon Sep 17 00:00:00 2001 From: Ruben Menke Date: Mon, 6 May 2024 13:41:02 +0200 Subject: [PATCH 02/14] moved to hatch --- LICENSE => LICENSE.txt | 0 pyproject.toml | 83 ++++++++++++++++++++++++++++++++++++++ setup.py | 30 -------------- 
string_grouper/__init__.py | 1 + 4 files changed, 84 insertions(+), 30 deletions(-) rename LICENSE => LICENSE.txt (100%) create mode 100644 pyproject.toml delete mode 100644 setup.py diff --git a/LICENSE b/LICENSE.txt similarity index 100% rename from LICENSE rename to LICENSE.txt diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c7fe4a9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,83 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "string-grouper" +dynamic = ["version"] +description = "String grouper contains functions to do string matching using TF-IDF and the cossine similarity. Based on https://bergvca.github.io/2017/10/14/super-fast-string-matching.html" +readme = "README.md" +license = "LISCENCE.txt" +requires-python = ">3.7" +authors = [ + { name = "Chris van den Berg", email = "fake_email@gmail.com" }, + { name = "Ruben Menke", email = "rum@bankingcircle.com" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [ + "numpy", + "pandas>=0.25.3", + "scikit-learn", + "scipy", + "sparse_dot_topn==0.3.6", +] + +[project.urls] +Homepage = "https://github.com/Bergvca/string_grouper" +Documentation = "https://github.com/unknown/string-grouper#readme" +Issues = "https://github.com/unknown/string-grouper/issues" +Source = "https://github.com/unknown/string-grouper" + +[tool.hatch.version] +path = "string_grouper/__init__.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/string_grouper", +] + + +[tool.hatch.envs.test] +dependencies = ["pytest", "pdbpp"] + 
+[tool.hatch.envs.test.scripts] +tests = "pytest {args}" +base = "pytest {args}" +cov = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=diss --cov=api --cov=llm {args}" +htmlcov = "pytest --cov-report=html --cov-config=pyproject.toml --cov=diss --cov=api --cov=llm {args}" +xmlcov = "pytest --cov-report=xml --cov-config=pyproject.toml --cov=diss --cov=api --cov=llm {args}" +code_check = "pre-commit run --all-files" + +[[tool.hatch.envs.test.matrix]] +python = ["3.10", "3.11"] + + + +[tool.black] +line-length = 120 +include = '\.pyi?$' +exclude = ''' +/( + \.git +| \.hg +| \.mypy_cache +| \.tox +| \.venv +| _build +| buck-out +| build +)/ +''' + +[tool.isort] +profile = "black" \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 29b0270..0000000 --- a/setup.py +++ /dev/null @@ -1,30 +0,0 @@ -from setuptools import setup -import pathlib - -# The directory containing this file -HERE = pathlib.Path(__file__).parent - -# The text of the README file -README = (HERE / "README.md").read_text() - -setup( - name='string_grouper', - version='0.5.0', - packages=['string_grouper'], - license='MIT License', - description='String grouper contains functions to do string matching using TF-IDF and the cossine similarity. 
' - 'Based on https://bergvca.github.io/2017/10/14/super-fast-string-matching.html', - author='Chris van den Berg', - long_description=README, - long_description_content_type="text/markdown", - author_email='fake_email@gmail.com', - url='https://github.com/Bergvca/string_grouper', - zip_safe=False, - python_requires='>3.7', - install_requires=['pandas>=0.25.3' - , 'scipy' - , 'scikit-learn' - , 'numpy' - , 'sparse_dot_topn==0.3.6' #1.1.1 - ] -) diff --git a/string_grouper/__init__.py b/string_grouper/__init__.py index bcbd349..8eff16d 100644 --- a/string_grouper/__init__.py +++ b/string_grouper/__init__.py @@ -6,3 +6,4 @@ StringGrouperConfig, StringGrouper, ) +__version__ = "0.5.0" \ No newline at end of file From 3f6bad4b69085113eaadf859743df71590d0c462 Mon Sep 17 00:00:00 2001 From: Ruben Menke Date: Mon, 6 May 2024 13:43:11 +0200 Subject: [PATCH 03/14] can't spell --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c7fe4a9..4c06212 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "string-grouper" dynamic = ["version"] description = "String grouper contains functions to do string matching using TF-IDF and the cossine similarity. 
Based on https://bergvca.github.io/2017/10/14/super-fast-string-matching.html" readme = "README.md" -license = "LISCENCE.txt" +license = "LICENSE.txt" requires-python = ">3.7" authors = [ { name = "Chris van den Berg", email = "fake_email@gmail.com" }, From b524c0fd05b22ebb63a15aca8edb05d764a61c87 Mon Sep 17 00:00:00 2001 From: Ruben Menke Date: Mon, 6 May 2024 13:46:32 +0200 Subject: [PATCH 04/14] remove the license notice --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4c06212..468af8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,6 @@ name = "string-grouper" dynamic = ["version"] description = "String grouper contains functions to do string matching using TF-IDF and the cossine similarity. Based on https://bergvca.github.io/2017/10/14/super-fast-string-matching.html" readme = "README.md" -license = "LICENSE.txt" requires-python = ">3.7" authors = [ { name = "Chris van den Berg", email = "fake_email@gmail.com" }, From 252770e8978577cb12cd6d663dff7f3e082df3de Mon Sep 17 00:00:00 2001 From: Ruben Menke Date: Mon, 6 May 2024 17:21:42 +0200 Subject: [PATCH 05/14] working on it --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 468af8e..b3ac2df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ "pandas>=0.25.3", "scikit-learn", "scipy", - "sparse_dot_topn==0.3.6", + "sparse_dot_topn>=1.1.1", ] [project.urls] From ae3b90004f3d8562a6a38781495388d2dc9c8358 Mon Sep 17 00:00:00 2001 From: Ruben Menke Date: Mon, 6 May 2024 17:23:22 +0200 Subject: [PATCH 06/14] moving over --- string_grouper/string_grouper.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index aec1239..9ca0ac5 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -7,7 +7,7 @@ from scipy.sparse 
import lil_matrix from scipy.sparse.csgraph import connected_components from typing import Tuple, NamedTuple, List, Optional, Union -from sparse_dot_topn import awesome_cossim_topn +from sparse_dot_topn import sp_matmul_topn from functools import wraps DEFAULT_NGRAM_SIZE: int = 3 @@ -313,7 +313,7 @@ def fit(self) -> "StringGrouper": # convert to lil format for best efficiency when setting matrix-elements matches = matches.tolil() # matrix diagonal elements must be exactly 1 (numerical precision errors introduced by - # floating-point computations in awesome_cossim_topn sometimes lead to unexpected results) + # floating-point computations in sp_matmul_topn sometimes lead to unexpected results) matches = StringGrouper._fix_diagonal(matches) if self._max_n_matches < self._true_max_n_matches: # the list of matches must be symmetric! (i.e., if A != B and A matches B; then B matches A) @@ -533,18 +533,13 @@ def _build_matches( tf_idf_matrix_1 = master_matrix tf_idf_matrix_2 = duplicate_matrix.transpose() - optional_kwargs = { - "return_best_ntop": True, - "use_threads": self._config.number_of_processes > 1, - "n_jobs": self._config.number_of_processes, - } - return awesome_cossim_topn( + return sp_matmul_topn( tf_idf_matrix_1, tf_idf_matrix_2, - self._max_n_matches, - self._config.min_similarity, - **optional_kwargs, + top_n=self._max_n_matches, + threshold=self._config.min_similarity, + sort=True, ) def _get_non_matches_list(self) -> pd.DataFrame: From 250b2e402270e950fd330f1ad2bc4f715bdf3aa0 Mon Sep 17 00:00:00 2001 From: Ruben Menke Date: Mon, 6 May 2024 19:37:51 +0200 Subject: [PATCH 07/14] removed awesome_cosine function --- string_grouper/string_grouper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 9ca0ac5..8385a77 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -305,9 +305,10 @@ def fit(self) -> "StringGrouper": master_matrix, 
duplicate_matrix = self._get_tf_idf_matrices() # Calculate the matches using the cosine similarity - matches, self._true_max_n_matches = self._build_matches( + matches = self._build_matches( master_matrix, duplicate_matrix ) + self._true_max_n_matches = np.diff(matches.indptr).max() if self._duplicates is None: # convert to lil format for best efficiency when setting matrix-elements From 74e2a1963a359b088ad1ce77c3715ae6ce31aace Mon Sep 17 00:00:00 2001 From: Ruben Menke Date: Mon, 6 May 2024 19:40:36 +0200 Subject: [PATCH 08/14] add unittest option --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b3ac2df..c43685c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,7 @@ include = [ dependencies = ["pytest", "pdbpp"] [tool.hatch.envs.test.scripts] -tests = "pytest {args}" +tests = "python -m unittest {args}" base = "pytest {args}" cov = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=diss --cov=api --cov=llm {args}" htmlcov = "pytest --cov-report=html --cov-config=pyproject.toml --cov=diss --cov=api --cov=llm {args}" From 2f95b77fdd9aefd3a090495be97dac1176ef8567 Mon Sep 17 00:00:00 2001 From: Ruben Menke Date: Mon, 6 May 2024 19:43:39 +0200 Subject: [PATCH 09/14] fix corresponding test --- string_grouper/test/test_string_grouper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 638fa84..b8307d3 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -480,7 +480,7 @@ def test_build_matches(self): expected_matches = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]]) np.testing.assert_array_equal( - expected_matches, sg._build_matches(master, dupe)[0].toarray() + expected_matches, sg._build_matches(master, dupe).toarray() ) def test_build_matches_list(self): From 
b62d461351606610086cdbbfc51f5cf2fff7c14f Mon Sep 17 00:00:00 2001 From: Ruben Menke Date: Mon, 6 May 2024 19:57:23 +0200 Subject: [PATCH 10/14] remove unnecessary test --- string_grouper/test/test_string_grouper.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index b8307d3..771435d 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -403,15 +403,6 @@ def test_zero_min_similarity(self): simple_example.expected_result_with_zeroes, matches ) - def test_zero_min_similarity_small_max_n_matches(self): - """This test ensures that a warning is issued when n_max_matches is suspected to be too small while - min_similarity <= 0 and include_zeroes is True""" - simple_example = SimpleExample() - s_master = simple_example.customers_df["Customer Name"] - s_dup = simple_example.two_strings - with self.assertRaises(Exception): - _ = match_strings(s_master, s_dup, max_n_matches=1, min_similarity=0) - def test_get_non_matches_empty_case(self): """This test ensures that _get_non_matches() returns an empty DataFrame when all pairs of strings match""" simple_example = SimpleExample() From d5c8a2a752146115a228253a473d0a9056f82582 Mon Sep 17 00:00:00 2001 From: Ruben Menke Date: Mon, 6 May 2024 20:09:35 +0200 Subject: [PATCH 11/14] remove the selective symmetrization --- string_grouper/string_grouper.py | 4 +--- string_grouper/test/test_string_grouper.py | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 8385a77..85d82fc 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -316,9 +316,7 @@ def fit(self) -> "StringGrouper": # matrix diagonal elements must be exactly 1 (numerical precision errors introduced by # floating-point computations in sp_matmul_topn sometimes lead to unexpected results) matches = 
StringGrouper._fix_diagonal(matches) - if self._max_n_matches < self._true_max_n_matches: - # the list of matches must be symmetric! (i.e., if A != B and A matches B; then B matches A) - matches = StringGrouper._symmetrize_matrix(matches) + matches = StringGrouper._symmetrize_matrix(matches) matches = matches.tocsr() # build list from matrix diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 771435d..f63ab0d 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -360,6 +360,7 @@ def test_match_list_symmetry_with_symmetrize_function(self): # If the intersection is not empty this means at least some matches are repeated. # To make sure all (and not just some) matches are repeated, the lengths of # upper, upper_prime and their intersection should be identical. + # breakpoint() self.assertTrue( intersection.empty or len(upper) == len(upper_prime) == len(intersection) ) From 6c053c6a79829ddf30e5bc517e404664e252b146 Mon Sep 17 00:00:00 2001 From: Ruben Menke Date: Mon, 6 May 2024 20:12:32 +0200 Subject: [PATCH 12/14] upgraded the github actions --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b29917e..b89793b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -10,7 +10,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.7, 3.8, 3.9] + python-version: ["3.10", "3.11", "3.12"] os: [ubuntu-latest, windows-latest] steps: From cca5727d53e6d16048f8795e99b90e72f0bcdc6b Mon Sep 17 00:00:00 2001 From: Ruben Menke Date: Mon, 6 May 2024 20:12:52 +0200 Subject: [PATCH 13/14] upgraded test.yml --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b89793b..d5bd146 100644 --- a/.github/workflows/test.yml +++ 
b/.github/workflows/test.yml @@ -3,7 +3,7 @@ on: pull_request: push: branches: - - master + - '*' jobs: test: From a13ee78e27c550c7b34172cf703a32293357534e Mon Sep 17 00:00:00 2001 From: Ruben Menke Date: Mon, 6 May 2024 20:24:14 +0200 Subject: [PATCH 14/14] update requirements --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c43685c..edfd68f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "string-grouper" dynamic = ["version"] description = "String grouper contains functions to do string matching using TF-IDF and the cossine similarity. Based on https://bergvca.github.io/2017/10/14/super-fast-string-matching.html" readme = "README.md" -requires-python = ">3.7" +requires-python = ">=3.10" authors = [ { name = "Chris van den Berg", email = "fake_email@gmail.com" }, { name = "Ruben Menke", email = "rum@bankingcircle.com" },