Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

created a more user-friendly error message when bad data is found #56

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

## [0.5.2?] - 2021-07-05

* Provided a more user-friendly error message to be issued when any entries in the input string-Series are not strings.

## [0.5.1?] - 2021-07-05

* Improved the performance of the function `match_most_similar`.
* Changed the default value of the keyword argument `max_n_matches` to the total number of strings in `master`. (`max_n_matches` is now defined as the maximum number of matches allowed per string in `duplicates` \[or `master` if `duplicates` is not given\]).

## [0.5.0] - 2021-06-11

### Added
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ All functions are built using a class **`StringGrouper`**. This class can be use
* **`ngram_size`**: The amount of characters in each n-gram. Default is `3`.
* **`tfidf_matrix_dtype`**: The datatype for the tf-idf values of the matrix components. Allowed values are `numpy.float32` and `numpy.float64`. Default is `numpy.float32`. (Note: `numpy.float32` often leads to faster processing and a smaller memory footprint albeit less numerical precision than `numpy.float64`.)
* **`regex`**: The regex string used to clean-up the input string. Default is `"[,-./]|\s"`.
* **`max_n_matches`**: The maximum number of matches allowed per string in `master`. Default is the number of strings in `duplicates` (or `master`, if `duplicates` is not given).
* **`max_n_matches`**: The maximum number of matching strings in `master` allowed per string in `duplicates` (or `master` if `duplicates` is not given). Default is the total number of strings in `master`.
* **`min_similarity`**: The minimum cosine similarity for two strings to be considered a match.
Defaults to `0.8`
* **`number_of_processes`**: The number of processes used by the cosine similarity calculation. Defaults to
Expand Down
201 changes: 156 additions & 45 deletions string_grouper/string_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
DEFAULT_NGRAM_SIZE: int = 3
DEFAULT_TFIDF_MATRIX_DTYPE: type = np.float32 # (only types np.float32 and np.float64 are allowed by sparse_dot_topn)
DEFAULT_REGEX: str = r'[,-./]|\s'
DEFAULT_MAX_N_MATCHES: int = 20
DEFAULT_MIN_SIMILARITY: float = 0.8 # minimum cosine similarity for an item to be considered a match
DEFAULT_N_PROCESSES: int = multiprocessing.cpu_count() - 1
DEFAULT_IGNORE_CASE: bool = True # ignores case by default
Expand Down Expand Up @@ -42,7 +41,28 @@
# High level functions


def compute_pairwise_similarities(string_series_1: pd.Series,
def who(bad_StringGrouper_param, param_1, param_name_1, param_2, param_name_2):
# Private utility function used by high-level functions (that call StringGrouper) to form a
# descriptive name for their series input parameter which caused the exception of type
# StringGrouperNotAllStringsException to occur
if bad_StringGrouper_param == 'master':
return f'\'{param_1.name}\' ({param_name_1})' if param_1.name else param_name_1
else:
return f'\'{param_2.name}\' ({param_name_2})' if param_2.name else param_name_2


def add_this_arg(func):
# Behind-the-scenes function-wrapper (to be used as decorator for high-level functions "func")
# that shifts the parameters of "func" to the right by one, inserting a reference to local
# function "this" in the first parameter position
def this(*args, **kwargs):
return func(this, *args, **kwargs)
return this


@add_this_arg
def compute_pairwise_similarities(this,
string_series_1: pd.Series,
string_series_2: pd.Series,
**kwargs) -> pd.Series:
"""
Expand All @@ -53,10 +73,21 @@ def compute_pairwise_similarities(string_series_1: pd.Series,
:param kwargs: All other keyword arguments are passed to StringGrouperConfig
:return: pandas.Series of similarity scores, the same length as string_series_1 and string_series_2
"""
return StringGrouper(string_series_1, string_series_2, **kwargs).dot()


def group_similar_strings(strings_to_group: pd.Series,
sg = StringGrouperPrime(string_series_1, string_series_2, **kwargs)
# error handler (for input Series with values that are not strings)
if sg.non_strings_present:
sname = who(sg.bad_series_name,
string_series_1, 'string_series_1',
string_series_2, 'string_series_2')
this.issues = sg.issues
this.issues.rename(f'Non-strings in Series {sname}', inplace=True)
raise TypeError(sg.error_msg(sname, 'compute_pairwise_similarities'))
return sg.dot()


@add_this_arg
def group_similar_strings(this,
strings_to_group: pd.Series,
string_ids: Optional[pd.Series] = None,
**kwargs) -> Union[pd.DataFrame, pd.Series]:
"""
Expand All @@ -74,11 +105,22 @@ def group_similar_strings(strings_to_group: pd.Series,
:param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional)
:return: pandas.Series or pandas.DataFrame.
"""
string_grouper = StringGrouper(strings_to_group, master_id=string_ids, **kwargs).fit()
return string_grouper.get_groups()


def match_most_similar(master: pd.Series,
sg = StringGrouperPrime(strings_to_group, master_id=string_ids, **kwargs)
# error handler (for input Series with values that are not strings)
if sg.non_strings_present:
sname = who(sg.bad_series_name,
strings_to_group, 'strings_to_group',
None, '')
this.issues = sg.issues
this.issues.rename(f'Non-strings in Series {sname}', inplace=True)
raise TypeError(sg.error_msg(sname, 'group_similar_strings'))
fit_sg = sg.fit()
return fit_sg.get_groups()


@add_this_arg
def match_most_similar(this,
master: pd.Series,
duplicates: pd.Series,
master_id: Optional[pd.Series] = None,
duplicates_id: Optional[pd.Series] = None,
Expand All @@ -103,15 +145,27 @@ def match_most_similar(master: pd.Series,
:param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional)
:return: pandas.Series or pandas.DataFrame.
"""
string_grouper = StringGrouper(master,
duplicates=duplicates,
master_id=master_id,
duplicates_id=duplicates_id,
**kwargs).fit()
return string_grouper.get_groups()


def match_strings(master: pd.Series,
kwargs['max_n_matches'] = 1
sg = StringGrouperPrime(master,
duplicates=duplicates,
master_id=master_id,
duplicates_id=duplicates_id,
**kwargs)
# error handler (for input Series with values that are not strings)
if sg.non_strings_present:
sname = who(sg.bad_series_name,
master, 'master',
duplicates, 'duplicates')
this.issues = sg.issues
this.issues.rename(f'Non-strings in Series {sname}', inplace=True)
raise TypeError(sg.error_msg(sname, 'match_most_similar'))
fit_sg = sg.fit()
return fit_sg.get_groups()


@add_this_arg
def match_strings(this,
master: pd.Series,
duplicates: Optional[pd.Series] = None,
master_id: Optional[pd.Series] = None,
duplicates_id: Optional[pd.Series] = None,
Expand All @@ -128,12 +182,20 @@ def match_strings(master: pd.Series,
:param kwargs: All other keyword arguments are passed to StringGrouperConfig.
:return: pandas.Dataframe.
"""
string_grouper = StringGrouper(master,
duplicates=duplicates,
master_id=master_id,
duplicates_id=duplicates_id,
**kwargs).fit()
return string_grouper.get_matches()
sg = StringGrouperPrime(master,
duplicates=duplicates,
master_id=master_id,
duplicates_id=duplicates_id,
**kwargs)
if sg.non_strings_present:
sname = who(sg.bad_series_name,
master, 'master',
duplicates, 'duplicates')
this.issues = sg.issues
this.issues.rename(f'Non-strings in Series {sname}', inplace=True)
raise TypeError(sg.error_msg(sname, 'match_strings'))
fit_sg = sg.fit()
return fit_sg.get_matches()


class StringGrouperConfig(NamedTuple):
Expand All @@ -146,7 +208,8 @@ class StringGrouperConfig(NamedTuple):
(Note: np.float32 often leads to faster processing and a smaller memory footprint albeit less precision
than np.float64.)
:param regex: str. The regex string used to cleanup the input string. Default is '[,-./]|\s'.
:param max_n_matches: int. The maximum number of matches allowed per string. Default is 20.
:param max_n_matches: int. The maximum number of matching strings in `master` allowed per string in
`duplicates` (or `master` if `duplicates` is not given). Default will be set by StringGrouper.
:param min_similarity: float. The minimum cosine similarity for two strings to be considered a match.
Defaults to 0.8.
:param number_of_processes: int. The number of processes used by the cosine similarity calculation.
Expand Down Expand Up @@ -194,6 +257,11 @@ class StringGrouperNotFitException(Exception):
pass


class StringGrouperNotAllStringsException(TypeError):
"""Raised when either input Series master or duplicates contains non-strings"""
pass


class StringGrouper(object):
def __init__(self, master: pd.Series,
duplicates: Optional[pd.Series] = None,
Expand All @@ -213,9 +281,10 @@ def __init__(self, master: pd.Series,
:param kwargs: All other keyword arguments are passed to StringGrouperConfig
"""
# Validate match strings input
if not StringGrouper._is_series_of_strings(master) or \
(duplicates is not None and not StringGrouper._is_series_of_strings(duplicates)):
raise TypeError('Input does not consist of pandas.Series containing only Strings')
self.issues: pd.Series = None
self._check_string_series(master, 'master')
if (duplicates is not None):
self._check_string_series(duplicates, 'duplicates')
# Validate optional IDs input
if not StringGrouper._is_input_data_combination_valid(duplicates, master_id, duplicates_id):
raise Exception('List of data Series options is invalid')
Expand All @@ -228,7 +297,7 @@ def __init__(self, master: pd.Series,

self._config: StringGrouperConfig = StringGrouperConfig(**kwargs)
if self._config.max_n_matches is None:
self._max_n_matches = len(self._master) if self._duplicates is None else len(self._duplicates)
self._max_n_matches = len(self._master)
else:
self._max_n_matches = self._config.max_n_matches

Expand Down Expand Up @@ -455,8 +524,8 @@ def _fit_vectorizer(self) -> TfidfVectorizer:

def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix) -> csr_matrix:
"""Builds the cossine similarity matrix of two csr matrices"""
tf_idf_matrix_1 = master_matrix
tf_idf_matrix_2 = duplicate_matrix.transpose()
tf_idf_matrix_1 = duplicate_matrix
tf_idf_matrix_2 = master_matrix.transpose()

optional_kwargs = {
'return_best_ntop': True,
Expand Down Expand Up @@ -622,6 +691,21 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series,
dupe_indices = dupe_strings[dupe_strings == dupe_side].index.to_series().reset_index(drop=True)
return master_indices, dupe_indices

def _check_string_series(self, series_to_test: pd.Series, which: str):
self.bad_series_name = which
StringGrouper._check_type(series_to_test, which)
self._check_content(series_to_test, which)

def _check_content(self, series_to_test: pd.Series, which: str):
non_strings_exist = series_to_test.to_frame().applymap(
lambda x: (not isinstance(x, str)) or len(x) == 0
).squeeze(axis=1)
if non_strings_exist.any():
self.issues = series_to_test[non_strings_exist]
sname = f' {series_to_test.name}' if series_to_test.name else ''
self.issues.rename(f'Non-strings in {which} Series{sname}', inplace=True)
raise StringGrouperNotAllStringsException

def _validate_group_rep_specs(self):
group_rep_options = (GROUP_REP_FIRST, GROUP_REP_CENTROID)
if self._config.group_rep not in group_rep_options:
Expand All @@ -645,6 +729,11 @@ def _validate_replace_na_and_drop(self):
"index if the number of index-levels does not equal the number of index-columns."
)

@staticmethod
def _check_type(series_to_test: pd.Series, which: str):
if not isinstance(series_to_test, pd.Series):
raise TypeError(f'Input {which} is not a pandas.Series containing only Strings')

@staticmethod
def _fix_diagonal(m: lil_matrix) -> csr_matrix:
r = np.arange(m.shape[0])
Expand All @@ -661,8 +750,8 @@ def _symmetrize_matrix(m_symmetric: lil_matrix) -> csr_matrix:
def _get_matches_list(matches: csr_matrix) -> pd.DataFrame:
"""Returns a list of all the indices of matches"""
r, c = matches.nonzero()
matches_list = pd.DataFrame({'master_side': r.astype(np.int64),
'dupe_side': c.astype(np.int64),
matches_list = pd.DataFrame({'master_side': c.astype(np.int64),
'dupe_side': r.astype(np.int64),
'similarity': matches.data})
return matches_list

Expand All @@ -687,16 +776,6 @@ def _validate_strings_exist(master_side, dupe_side, master_strings, dupe_strings
elif not dupe_strings.isin([dupe_side]).any():
raise ValueError(f'{dupe_side} not found in StringGrouper dupe string series')

@staticmethod
def _is_series_of_strings(series_to_test: pd.Series) -> bool:
if not isinstance(series_to_test, pd.Series):
return False
elif series_to_test.to_frame().applymap(
lambda x: not isinstance(x, str)
).squeeze(axis=1).any():
return False
return True

@staticmethod
def _is_input_data_combination_valid(duplicates, master_id, duplicates_id) -> bool:
if duplicates is None and (duplicates_id is not None) \
Expand All @@ -711,3 +790,35 @@ def _validate_id_data(master, duplicates, master_id, duplicates_id):
raise Exception('Both master and master_id must be pandas.Series of the same length.')
if duplicates is not None and duplicates_id is not None and len(duplicates) != len(duplicates_id):
raise Exception('Both duplicates and duplicates_id must be pandas.Series of the same length.')


class StringGrouperPrime(StringGrouper):
# (To be used in high-level functions)
# Child class of StringGrouper that captures information about the input Series
# that caused the StringGrouperNotAllStringsException even when the StringGrouper
# instance is not fully initialized
def __init__(self, master: pd.Series,
duplicates: Optional[pd.Series] = None,
master_id: Optional[pd.Series] = None,
duplicates_id: Optional[pd.Series] = None,
**kwargs):
self.issues = None
self.non_strings_present = False
self.bad_series_name = None
try:
super().__init__(master,
duplicates=duplicates,
master_id=master_id,
duplicates_id=duplicates_id,
**kwargs)
except StringGrouperNotAllStringsException:
self.non_strings_present = True

def error_msg(self, bad_series_name, function_name):
nl = ':\n'
return (
f'\n\nERROR: Input pandas Series {bad_series_name} contains values that are not strings!\n'
f'Display the pandas Series \'{function_name}.issues\' to find where these values are'
f'{nl if 0 < len(self.issues) < 12 else "."}'
f'{self.issues.to_frame() if 0 < len(self.issues) < 12 else ""}'
)
Loading