Skip to content

Commit

Permalink
made PEP8-conforming modifications
Browse files Browse the repository at this point in the history
  • Loading branch information
ParticularMiner committed May 11, 2021
1 parent faa974c commit 0bc533f
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 59 deletions.
94 changes: 50 additions & 44 deletions string_grouper/string_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,26 @@
DEFAULT_IGNORE_CASE: bool = True # ignores case by default
DEFAULT_DROP_INDEX: bool = False # includes index-columns in output
DEFAULT_REPLACE_NA: bool = False # when finding the most similar strings, does not replace NaN values in most
# similar string index-columns with corresponding duplicates-index values
DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity
# matches appear in the output
# similar string index-columns with corresponding duplicates-index values
DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity
# matches appear in the output
DEFAULT_SUPPRESS_WARNING: bool = False # when the minimum cosine similarity <=0 and zero-similarity matches are
# requested, determines whether or not to suppress the message warning that
# max_n_matches may be too small
# requested, determines whether or not to suppress the message warning that max_n_matches may be too small
GROUP_REP_CENTROID: str = 'centroid' # Option value to select the string in each group with the largest
# similarity aggregate as group-representative:
# similarity aggregate as group-representative:
GROUP_REP_FIRST: str = 'first' # Option value to select the first string in each group as group-representative:
DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID # chooses group centroid as group-representative by default
DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID # chooses group centroid as group-representative by default

# The following string constants are used by (but aren't [yet] options passed to) StringGrouper
DEFAULT_COLUMN_NAME: str = 'side' # used to name non-index columns of the output of StringGrouper.get_matches
DEFAULT_ID_NAME: str = 'id' # used to name id-columns in the output of StringGrouper.get_matches
DEFAULT_ID_NAME: str = 'id' # used to name id-columns in the output of StringGrouper.get_matches
LEFT_PREFIX: str = 'left_' # used to prefix columns on the left of the output of StringGrouper.get_matches
RIGHT_PREFIX: str = 'right_' # used to prefix columns on the right of the output of StringGrouper.get_matches
MOST_SIMILAR_PREFIX: str = 'most_similar_' # used to prefix columns of the output of
# StringGrouper._get_nearest_matches
DEFAULT_MASTER_NAME: str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches
# StringGrouper._get_nearest_matches
DEFAULT_MASTER_NAME: str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches
DEFAULT_MASTER_ID_NAME: str = f'{DEFAULT_MASTER_NAME}_{DEFAULT_ID_NAME}' # used to name id-column of the output of
# StringGrouper.get_nearest_matches
# StringGrouper.get_nearest_matches
GROUP_REP_PREFIX: str = 'group_rep_' # used to prefix and name columns of the output of StringGrouper._deduplicate


Expand All @@ -65,7 +64,7 @@ def this(*args, **kwargs):


@add_this_arg
def compute_pairwise_similarities(this,
def compute_pairwise_similarities(this,
string_series_1: pd.Series,
string_series_2: pd.Series,
**kwargs) -> pd.Series:
Expand Down Expand Up @@ -214,11 +213,11 @@ class StringGrouperConfig(NamedTuple):
Defaults to number of cores on a machine - 1.
:param ignore_case: bool. Whether or not case should be ignored. Defaults to True (ignore case).
:param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to False.
:param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
:param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
appear in the output. Defaults to True.
:param suppress_warning: when min_similarity <=0 and include_zeroes=True, determines whether or not to suppress
the message warning that max_n_matches may be too small. Defaults to False.
:param replace_na: whether or not to replace NaN values in most similar string index-columns with
:param replace_na: whether or not to replace NaN values in most similar string index-columns with
corresponding duplicates-index values. Defaults to False.
:param group_rep: str. The scheme to select the group-representative. Default is 'centroid'.
The other choice is 'first'.
Expand Down Expand Up @@ -261,6 +260,7 @@ class StringGrouperNotAllStringsException(TypeError):
"""Raised when either input Series master or duplicates contains non-strings"""
pass


class StringGrouper(object):
def __init__(self, master: pd.Series,
duplicates: Optional[pd.Series] = None,
Expand All @@ -282,7 +282,8 @@ def __init__(self, master: pd.Series,
# Validate match strings input
self.issues: pd.Series = None
self._check_string_series(master, 'master')
if (duplicates is not None): self._check_string_series(duplicates, 'duplicates')
if (duplicates is not None):
self._check_string_series(duplicates, 'duplicates')
# Validate optional IDs input
if not StringGrouper._is_input_data_combination_valid(duplicates, master_id, duplicates_id):
raise Exception('List of data Series options is invalid')
Expand Down Expand Up @@ -320,7 +321,7 @@ def fit(self) -> 'StringGrouper':
matches = self._build_matches(master_matrix, duplicate_matrix)
if self._duplicates is None:
# the matrix of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A)
# and each of its diagonal components must be equal to 1
# and each of its diagonal components must be equal to 1
matches = StringGrouper._symmetrize_matrix_and_fix_diagonal(matches)
# retrieve all matches
self._matches_list = self._get_matches_list(matches)
Expand All @@ -339,15 +340,15 @@ def dot(self) -> pd.Series:
@validate_is_fit
def get_matches(self,
ignore_index: Optional[bool] = None,
include_zeroes: Optional[bool]=None,
suppress_warning: Optional[bool]=None) -> pd.DataFrame:
include_zeroes: Optional[bool] = None,
suppress_warning: Optional[bool] = None) -> pd.DataFrame:
"""
Returns a DataFrame with all the matches and their cosine similarity.
If optional IDs are used, returned as extra columns with IDs matched to respective data rows
:param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
:param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
self._config.ignore_index.
:param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
:param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
appear in the output. Defaults to self._config.include_zeroes.
:param suppress_warning: when min_similarity <=0 and include_zeroes=True, determines whether or not to suppress
the message warning that max_n_matches may be too small. Defaults to self._config.suppress_warning.
Expand All @@ -372,19 +373,22 @@ def prefix_column_names(data: Union[pd.Series, pd.DataFrame], prefix: str):
else:
return data.rename(f"{prefix}{data.name}")

if ignore_index is None: ignore_index = self._config.ignore_index
if include_zeroes is None: include_zeroes = self._config.include_zeroes
if suppress_warning is None: suppress_warning = self._config.suppress_warning
if ignore_index is None:
ignore_index = self._config.ignore_index
if include_zeroes is None:
include_zeroes = self._config.include_zeroes
if suppress_warning is None:
suppress_warning = self._config.suppress_warning
if self._config.min_similarity > 0 or not include_zeroes:
matches_list = self._matches_list
elif include_zeroes:
# Here's a fix to a bug pointed out by one GitHub user (@nbcvijanovic):
# the fix includes zero-similarity matches that are missing by default
# in _matches_list due to our use of sparse matrices
# the fix includes zero-similarity matches that are missing by default
# in _matches_list due to our use of sparse matrices
non_matches_list = self._get_non_matches_list(suppress_warning)
matches_list = self._matches_list if non_matches_list.empty else \
pd.concat([self._matches_list, non_matches_list], axis=0, ignore_index=True)

left_side, right_side = get_both_sides(self._master, self._duplicates, drop_index=ignore_index)
similarity = matches_list.similarity.reset_index(drop=True)
if self._master_id is None:
Expand Down Expand Up @@ -426,16 +430,18 @@ def get_groups(self,
If there are IDs (master_id and/or duplicates_id) then the IDs corresponding to the string outputs
above are returned as well altogether in a DataFrame.
:param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
:param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
self._config.ignore_index.
:param replace_na: whether or not to replace NaN values in most similar string index-columns with
:param replace_na: whether or not to replace NaN values in most similar string index-columns with
corresponding duplicates-index values. Defaults to self._config.replace_na.
"""
if ignore_index is None: ignore_index = self._config.ignore_index
if ignore_index is None:
ignore_index = self._config.ignore_index
if self._duplicates is None:
return self._deduplicate(ignore_index=ignore_index)
else:
if replace_na is None: replace_na = self._config.replace_na
if replace_na is None:
replace_na = self._config.replace_na
return self._get_nearest_matches(ignore_index=ignore_index, replace_na=replace_na)

@validate_is_fit
Expand Down Expand Up @@ -524,7 +530,8 @@ def _get_non_matches_list(self, suppress_warning=False) -> pd.DataFrame:
all_pairs = pd.MultiIndex.from_product([range(m_sz), range(d_sz)], names=['master_side', 'dupe_side'])
matched_pairs = pd.MultiIndex.from_frame(self._matches_list[['master_side', 'dupe_side']])
missing_pairs = all_pairs.difference(matched_pairs)
if missing_pairs.empty: return pd.DataFrame()
if missing_pairs.empty:
return pd.DataFrame()
if (self._config.max_n_matches < d_sz) and not suppress_warning:
warnings.warn(f'WARNING: max_n_matches={self._config.max_n_matches} may be too small!\n'
f'\t\t Some zero-similarity matches returned may be false!\n'
Expand All @@ -542,8 +549,8 @@ def _get_nearest_matches(self,
master_label = f'{prefix}{self._master.name if self._master.name else DEFAULT_MASTER_NAME}'
master = self._master.rename(master_label).reset_index(drop=ignore_index)
dupes = self._duplicates.rename('duplicates').reset_index(drop=ignore_index)
# Rename new master-columns to avoid possible conflict with new dupes-columns when later merging

# Rename new master-columns to avoid possible conflict with new dupes-columns when later merging
if isinstance(dupes, pd.DataFrame):
master.rename(
columns={col: f'{prefix}{col}' for col in master.columns if str(col) != master_label},
Expand Down Expand Up @@ -573,14 +580,14 @@ def _get_nearest_matches(self,
if self._master_id is not None:
# Also update the master_id-series with the duplicates_id in cases where there is no match
dupes_max_sim.loc[rows_to_update, master_id_label] = dupes_max_sim[rows_to_update].duplicates_id

# For some weird reason, pandas' merge function changes int-datatype columns to float when NaN values
# appear within them. So here we change them back to their original datatypes if possible:
if dupes_max_sim[master_id_label].dtype != self._master_id.dtype and \
self._duplicates_id.dtype == self._master_id.dtype:
self._duplicates_id.dtype == self._master_id.dtype:
dupes_max_sim.loc[:, master_id_label] = \
dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype)
dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype)

# Prepare the output:
required_column_list = [master_label] if self._master_id is None else [master_id_label, master_label]
index_column_list = \
Expand All @@ -590,13 +597,13 @@ def _get_nearest_matches(self,
# Update the master index-columns with the duplicates index-column values in cases where there is no match
dupes_index_columns = [col for col in dupes.columns if str(col) != 'duplicates']
dupes_max_sim.loc[rows_to_update, index_column_list] = \
dupes_max_sim.loc[rows_to_update, dupes_index_columns].values
dupes_max_sim.loc[rows_to_update, dupes_index_columns].values

# Restore their original datatypes if possible:
for m, d in zip(index_column_list, dupes_index_columns):
if dupes_max_sim[m].dtype != master[m].dtype and dupes[d].dtype == master[m].dtype:
dupes_max_sim.loc[:, m] = dupes_max_sim.loc[:, m].astype(master[m].dtype)

# Make sure to keep same order as duplicates
dupes_max_sim = dupes_max_sim.sort_values('dupe_side').set_index('dupe_side')
output = dupes_max_sim[index_column_list + required_column_list]
Expand Down Expand Up @@ -667,9 +674,9 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series,
master_indices = master_strings[master_strings == master_side].index.to_series().reset_index(drop=True)
dupe_indices = dupe_strings[dupe_strings == dupe_side].index.to_series().reset_index(drop=True)
return master_indices, dupe_indices

def _check_string_series(self, series_to_test: pd.Series, which: str):
self.bad_series_name = which
self.bad_series_name = which
StringGrouper._check_type(series_to_test, which)
self._check_content(series_to_test, which)

Expand Down Expand Up @@ -780,7 +787,7 @@ def __init__(self, master: pd.Series,
**kwargs)
except StringGrouperNotAllStringsException:
self.non_strings_present = True

def error_msg(self, bad_series_name, function_name):
nl = ':\n'
return (
Expand All @@ -789,4 +796,3 @@ def error_msg(self, bad_series_name, function_name):
f'{nl if 0 < len(self.issues) < 12 else "."}'
f'{self.issues.to_frame() if 0 < len(self.issues) < 12 else ""}'
)

Loading

0 comments on commit 0bc533f

Please sign in to comment.