Skip to content

Commit

Permalink
added error-handler to capture non-strings in input Series
Browse files Browse the repository at this point in the history
  • Loading branch information
ParticularMiner committed May 9, 2021
1 parent 4a0b225 commit faa974c
Show file tree
Hide file tree
Showing 2 changed files with 176 additions and 45 deletions.
186 changes: 148 additions & 38 deletions string_grouper/string_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,32 @@
# StringGrouper.get_nearest_matches
GROUP_REP_PREFIX: str = 'group_rep_' # used to prefix and name columns of the output of StringGrouper._deduplicate


# High level functions


def compute_pairwise_similarities(string_series_1: pd.Series,
def who(bad_StringGrouper_param, param_1, param_name_1, param_2, param_name_2):
    """
    Form a descriptive name for the input Series that contained non-strings.

    Private utility used by the high-level functions (which call StringGrouper) to build the
    Series description that goes into their error message.

    :param bad_StringGrouper_param: name of the offending StringGrouper parameter; 'master'
        selects the first (param, param_name) pair, any other value selects the second pair.
    :param param_1: the object the high-level function passed as StringGrouper's master.
    :param param_name_1: name of that parameter in the high-level function's own signature.
    :param param_2: the object passed as StringGrouper's duplicates (may be None).
    :param param_name_2: name of that parameter in the high-level function's own signature.
    :return: "'<series name>' (<parameter name>)" if the selected object has a name,
        otherwise just the parameter name.
    """
    if bad_StringGrouper_param == 'master':
        series, param_name = param_1, param_name_1
    else:
        series, param_name = param_2, param_name_2
    # getattr keeps this helper from raising AttributeError when the selected object is None
    # (or anything else without a "name"), which would mask the error being reported
    series_name = getattr(series, 'name', None)
    # compare against None/'' explicitly so that falsy-but-valid names such as 0 are kept
    if series_name is None or series_name == '':
        return param_name
    return f'\'{series_name}\' ({param_name})'


def add_this_arg(func):
    """
    Decorator that passes the decorated function a reference to its own public wrapper.

    The returned wrapper "this" calls func(this, *args, **kwargs), i.e. the parameters of "func"
    are shifted to the right by one and the wrapper itself is inserted in the first position.
    This lets "func" store attributes on the object that callers actually hold.

    The wrapper keeps the name, docstring and module of "func", and advertises the signature
    of "func" without the injected first parameter, so that help() and introspection describe
    the function the way callers must invoke it.

    :param func: function whose first positional parameter receives the wrapper.
    :return: the wrapper function.
    """
    # imported locally so that this helper does not depend on the module's import block
    import functools
    import inspect

    @functools.wraps(func)
    def this(*args, **kwargs):
        return func(this, *args, **kwargs)

    # functools.wraps would otherwise make inspect.signature() report the injected first
    # parameter, which callers never supply themselves
    signature = inspect.signature(func)
    visible_parameters = list(signature.parameters.values())[1:]
    this.__signature__ = signature.replace(parameters=visible_parameters)
    return this


@add_this_arg
def compute_pairwise_similarities(this,
string_series_1: pd.Series,
string_series_2: pd.Series,
**kwargs) -> pd.Series:
"""
Expand All @@ -55,10 +77,21 @@ def compute_pairwise_similarities(string_series_1: pd.Series,
:param kwargs: All other keyword arguments are passed to StringGrouperConfig
:return: pandas.Series of similarity scores, the same length as string_series_1 and string_series_2
"""
return StringGrouper(string_series_1, string_series_2, **kwargs).dot()


def group_similar_strings(strings_to_group: pd.Series,
sg = StringGrouperPrime(string_series_1, string_series_2, **kwargs)
# error handler (for input Series with values that are not strings)
if sg.non_strings_present:
sname = who(sg.bad_series_name,
string_series_1, 'string_series_1',
string_series_2, 'string_series_2')
this.issues = sg.issues
this.issues.rename(f'Non-strings in Series {sname}', inplace=True)
raise TypeError(sg.error_msg(sname, 'compute_pairwise_similarities'))
return sg.dot()


@add_this_arg
def group_similar_strings(this,
strings_to_group: pd.Series,
string_ids: Optional[pd.Series] = None,
**kwargs) -> Union[pd.DataFrame, pd.Series]:
"""
Expand All @@ -76,11 +109,22 @@ def group_similar_strings(strings_to_group: pd.Series,
:param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional)
:return: pandas.Series or pandas.DataFrame.
"""
string_grouper = StringGrouper(strings_to_group, master_id=string_ids, **kwargs).fit()
return string_grouper.get_groups()


def match_most_similar(master: pd.Series,
sg = StringGrouperPrime(strings_to_group, master_id=string_ids, **kwargs)
# error handler (for input Series with values that are not strings)
if sg.non_strings_present:
sname = who(sg.bad_series_name,
strings_to_group, 'strings_to_group',
None, '')
this.issues = sg.issues
this.issues.rename(f'Non-strings in Series {sname}', inplace=True)
raise TypeError(sg.error_msg(sname, 'group_similar_strings'))
fit_sg = sg.fit()
return fit_sg.get_groups()


@add_this_arg
def match_most_similar(this,
master: pd.Series,
duplicates: pd.Series,
master_id: Optional[pd.Series] = None,
duplicates_id: Optional[pd.Series] = None,
Expand All @@ -105,15 +149,26 @@ def match_most_similar(master: pd.Series,
:param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional)
:return: pandas.Series or pandas.DataFrame.
"""
string_grouper = StringGrouper(master,
duplicates=duplicates,
master_id=master_id,
duplicates_id=duplicates_id,
**kwargs).fit()
return string_grouper.get_groups()


def match_strings(master: pd.Series,
sg = StringGrouperPrime(master,
duplicates=duplicates,
master_id=master_id,
duplicates_id=duplicates_id,
**kwargs)
# error handler (for input Series with values that are not strings)
if sg.non_strings_present:
sname = who(sg.bad_series_name,
master, 'master',
duplicates, 'duplicates')
this.issues = sg.issues
this.issues.rename(f'Non-strings in Series {sname}', inplace=True)
raise TypeError(sg.error_msg(sname, 'match_most_similar'))
fit_sg = sg.fit()
return fit_sg.get_groups()


@add_this_arg
def match_strings(this,
master: pd.Series,
duplicates: Optional[pd.Series] = None,
master_id: Optional[pd.Series] = None,
duplicates_id: Optional[pd.Series] = None,
Expand All @@ -130,12 +185,20 @@ def match_strings(master: pd.Series,
:param kwargs: All other keyword arguments are passed to StringGrouperConfig.
:return: pandas.Dataframe.
"""
string_grouper = StringGrouper(master,
duplicates=duplicates,
master_id=master_id,
duplicates_id=duplicates_id,
**kwargs).fit()
return string_grouper.get_matches()
sg = StringGrouperPrime(master,
duplicates=duplicates,
master_id=master_id,
duplicates_id=duplicates_id,
**kwargs)
if sg.non_strings_present:
sname = who(sg.bad_series_name,
master, 'master',
duplicates, 'duplicates')
this.issues = sg.issues
this.issues.rename(f'Non-strings in Series {sname}', inplace=True)
raise TypeError(sg.error_msg(sname, 'match_strings'))
fit_sg = sg.fit()
return fit_sg.get_matches()


class StringGrouperConfig(NamedTuple):
Expand Down Expand Up @@ -194,6 +257,10 @@ class StringGrouperNotFitException(Exception):
pass


class StringGrouperNotAllStringsException(TypeError):
    """
    Signals that an input Series (master or duplicates) holds values that are not strings.

    Derives from TypeError, so handlers written for TypeError also catch it.
    """

class StringGrouper(object):
def __init__(self, master: pd.Series,
duplicates: Optional[pd.Series] = None,
Expand All @@ -213,9 +280,9 @@ def __init__(self, master: pd.Series,
:param kwargs: All other keyword arguments are passed to StringGrouperConfig
"""
# Validate match strings input
if not StringGrouper._is_series_of_strings(master) or \
(duplicates is not None and not StringGrouper._is_series_of_strings(duplicates)):
raise TypeError('Input does not consist of pandas.Series containing only Strings')
self.issues: pd.Series = None
self._check_string_series(master, 'master')
if (duplicates is not None): self._check_string_series(duplicates, 'duplicates')
# Validate optional IDs input
if not StringGrouper._is_input_data_combination_valid(duplicates, master_id, duplicates_id):
raise Exception('List of data Series options is invalid')
Expand Down Expand Up @@ -601,6 +668,21 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series,
dupe_indices = dupe_strings[dupe_strings == dupe_side].index.to_series().reset_index(drop=True)
return master_indices, dupe_indices

def _check_string_series(self, series_to_test: pd.Series, which: str) -> None:
    """
    Validate one input Series: first its type, then its content.

    :param series_to_test: the object supplied as input.
    :param which: label of the input being validated ('master' or 'duplicates' at the call
        sites); it is recorded in self.bad_series_name before the checks run.
    """
    # recorded up front so the label is available if either check below raises
    self.bad_series_name = which
    StringGrouper._check_type(series_to_test, which)
    self._check_content(series_to_test, which)

def _check_content(self, series_to_test: pd.Series, which: str):
non_strings_exist = series_to_test.to_frame().applymap(
lambda x: (not isinstance(x, str)) or len(x) == 0
).squeeze(axis=1)
if non_strings_exist.any():
self.issues = series_to_test[non_strings_exist]
sname = f' {series_to_test.name}' if series_to_test.name else ''
self.issues.rename(f'Non-strings in {which} Series{sname}', inplace=True)
raise StringGrouperNotAllStringsException

def _validate_group_rep_specs(self):
group_rep_options = (GROUP_REP_FIRST, GROUP_REP_CENTROID)
if self._config.group_rep not in group_rep_options:
Expand All @@ -617,6 +699,11 @@ def _validate_replace_na_and_drop(self):
"index if the number of index-levels does not equal the number of index-columns."
)

@staticmethod
def _check_type(series_to_test: pd.Series, which: str):
if not isinstance(series_to_test, pd.Series):
raise TypeError(f'Input {which} is not a pandas.Series containing only Strings')

@staticmethod
def _symmetrize_matrix_and_fix_diagonal(AA: csr_matrix) -> csr_matrix:
A = AA.tolil()
Expand Down Expand Up @@ -656,16 +743,6 @@ def _validate_strings_exist(master_side, dupe_side, master_strings, dupe_strings
elif not dupe_strings.isin([dupe_side]).any():
raise ValueError(f'{dupe_side} not found in StringGrouper dupe string series')

@staticmethod
def _is_series_of_strings(series_to_test: pd.Series) -> bool:
if not isinstance(series_to_test, pd.Series):
return False
elif series_to_test.to_frame().applymap(
lambda x: not isinstance(x, str)
).squeeze(axis=1).any():
return False
return True

@staticmethod
def _is_input_data_combination_valid(duplicates, master_id, duplicates_id) -> bool:
if duplicates is None and (duplicates_id is not None) \
Expand All @@ -680,3 +757,36 @@ def _validate_id_data(master, duplicates, master_id, duplicates_id):
raise Exception('Both master and master_id must be pandas.Series of the same length.')
if duplicates is not None and duplicates_id is not None and len(duplicates) != len(duplicates_id):
raise Exception('Both duplicates and duplicates_id must be pandas.Series of the same length.')


class StringGrouperPrime(StringGrouper):
    """
    StringGrouper subclass intended for use by the high-level functions.

    Instead of letting StringGrouperNotAllStringsException escape from the constructor, it
    records that the exception occurred (non_strings_present), so the caller can still read
    the information gathered about the offending input Series even though the instance is
    not fully initialized.
    """

    def __init__(self, master: pd.Series,
                 duplicates: Optional[pd.Series] = None,
                 master_id: Optional[pd.Series] = None,
                 duplicates_id: Optional[pd.Series] = None,
                 **kwargs):
        """
        Same parameters as StringGrouper; all of them are forwarded unchanged.

        After construction, check non_strings_present before using the instance.
        """
        # defaults describing the "no problem found" state
        self.bad_series_name = None
        self.non_strings_present = False
        self.issues = None
        optional_inputs = dict(duplicates=duplicates,
                               master_id=master_id,
                               duplicates_id=duplicates_id)
        try:
            super().__init__(master, **optional_inputs, **kwargs)
        except StringGrouperNotAllStringsException:
            # swallow only this exception; the flag tells the caller to report the problem
            self.non_strings_present = True

    def error_msg(self, bad_series_name, function_name):
        """
        Compose the error text shown when an input Series contains non-strings.

        :param bad_series_name: description of the offending Series.
        :param function_name: name of the high-level function, used to tell the user where
            the "issues" Series can be found.
        :return: the message; the offending values themselves are appended only when there
            are between 1 and 11 of them.
        """
        show_values = 0 < len(self.issues) < 12
        if show_values:
            ending = f':\n{self.issues.to_frame()}'
        else:
            ending = '.'
        return (
            f'\n\nERROR: Input pandas Series {bad_series_name} contains values that are not strings!\n'
            f'Display the pandas Series \'{function_name}.issues\' to find where these values are'
            f'{ending}'
        )

35 changes: 28 additions & 7 deletions string_grouper/test/test_string_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from string_grouper.string_grouper import DEFAULT_MIN_SIMILARITY, \
DEFAULT_MAX_N_MATCHES, DEFAULT_REGEX, \
DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \
StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \
StringGrouperConfig, StringGrouper, \
StringGrouperNotFitException, StringGrouperNotAllStringsException, \
match_most_similar, group_similar_strings, match_strings,\
compute_pairwise_similarities
from unittest.mock import patch
Expand Down Expand Up @@ -144,12 +145,14 @@ def test_compute_pairwise_similarities_data_integrity(self):
with self.assertRaises(Exception):
_ = compute_pairwise_similarities(df1, df2[:-2])

@patch('string_grouper.string_grouper.StringGrouper')
@patch('string_grouper.string_grouper.StringGrouperPrime')
def test_group_similar_strings(self, mock_StringGouper):
"""Mocks StringGrouperPrime to test whether the high-level function group_similar_strings uses it as expected"""
mock_StringGrouper_instance = mock_StringGouper.return_value
mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance
mock_StringGrouper_instance.get_groups.return_value = 'whatever'
mock_StringGrouper_instance.non_strings_present = False
mock_StringGrouper_instance.error_msg.return_value = 'mock_error'

test_series_1 = None
test_series_id_1 = None
Expand All @@ -162,12 +165,14 @@ def test_group_similar_strings(self, mock_StringGouper):
mock_StringGrouper_instance.get_groups.assert_called_once()
self.assertEqual(df, 'whatever')

@patch('string_grouper.string_grouper.StringGrouper')
@patch('string_grouper.string_grouper.StringGrouperPrime')
def test_match_most_similar(self, mock_StringGouper):
"""Mocks StringGrouperPrime to test whether the high-level function match_most_similar uses it as expected"""
mock_StringGrouper_instance = mock_StringGouper.return_value
mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance
mock_StringGrouper_instance.get_groups.return_value = 'whatever'
mock_StringGrouper_instance.non_strings_present = False
mock_StringGrouper_instance.error_msg.return_value = 'mock_error'

test_series_1 = None
test_series_2 = None
Expand All @@ -184,12 +189,14 @@ def test_match_most_similar(self, mock_StringGouper):
mock_StringGrouper_instance.get_groups.assert_called_once()
self.assertEqual(df, 'whatever')

@patch('string_grouper.string_grouper.StringGrouper')
@patch('string_grouper.string_grouper.StringGrouperPrime')
def test_match_strings(self, mock_StringGouper):
"""Mocks StringGrouperPrime to test whether the high-level function match_strings uses it as expected"""
mock_StringGrouper_instance = mock_StringGouper.return_value
mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance
mock_StringGrouper_instance.get_matches.return_value = 'whatever'
mock_StringGrouper_instance.non_strings_present = False
mock_StringGrouper_instance.error_msg.return_value = 'mock_error'

test_series_1 = None
test_series_id_1 = None
Expand Down Expand Up @@ -792,18 +799,32 @@ def test_string_grouper_type_error(self):
"""StringGrouper should raise a TypeError if master or duplicates are not a series of strings"""
with self.assertRaises(TypeError):
_ = StringGrouper('foo', 'bar')
with self.assertRaises(TypeError):
with self.assertRaises(StringGrouperNotAllStringsException):
_ = StringGrouper(pd.Series(['foo', 'bar']), pd.Series(['foo', 1]))
with self.assertRaises(TypeError):
with self.assertRaises(StringGrouperNotAllStringsException):
_ = StringGrouper(pd.Series(['foo', np.nan]), pd.Series(['foo', 'j']))
with self.assertRaises(StringGrouperNotAllStringsException):
_ = StringGrouper(pd.Series(['foo', 'j']), pd.Series(['foo', np.nan]))

def test_not_all_strings_exception_in_high_level_fucntions(self):
    """Each high-level function should raise a TypeError when an input Series holds non-strings"""
    good_series = pd.Series(['foo', 'bar'])
    bad_series = pd.Series([None, 'foo', 1, np.nan], name='dupes')
    # (high-level function, positional arguments) pairs, checked in this order
    failing_calls = (
        (compute_pairwise_similarities, (good_series, bad_series.rename_axis('dupes_id'))),
        (group_similar_strings, (bad_series.rename_axis('string_id'),)),
        (match_most_similar, (bad_series.rename('master'), good_series)),
        (match_strings, (good_series, bad_series.rename('dupes').rename_axis('dupes_id'))),
    )
    for high_level_function, arguments in failing_calls:
        with self.assertRaises(TypeError):
            high_level_function(*arguments)

def test_prior_matches_added(self):
"""When a new match is added, any pre-existing matches should also be updated"""
sample = [
'microsoftoffice 365 home',
'microsoftoffice 365 pers',
'microsoft office'
]
]

df = pd.DataFrame(sample, columns=['name'])

Expand Down

0 comments on commit faa974c

Please sign in to comment.