Skip to content

Commit

Permalink
Remove string-grouper
Browse files Browse the repository at this point in the history
  • Loading branch information
luis11011 committed Sep 16, 2021
1 parent 2fbf5ec commit 373a114
Show file tree
Hide file tree
Showing 2 changed files with 0 additions and 118 deletions.
46 changes: 0 additions & 46 deletions conda/recipes/string-grouper/meta.yaml

This file was deleted.

72 changes: 0 additions & 72 deletions optimus/helpers/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,78 +283,6 @@ def deep_sort(obj):
return _sorted


def infer_dataframes_keys(df_left: pd.DataFrame, df_right: pd.DataFrame):
    """
    Infer the possible key columns in two data frames.

    Rows containing any null value are dropped and every remaining value is
    cast to ``str``. Three independent heuristics then each propose
    ``(left_column, right_column)`` pairs:

    1. Column names matching the id-like pattern (``_id``, ``id_``, `` id``),
       paired positionally, only when both frames have the same number of
       such columns.
    2. Columns in which every value passes ``fastnumbers.isint``, paired when
       their string-length statistic (see ``median_len``) is equal.
    3. Any pair of columns for which ``string_grouper.match_strings`` returns
       at least one row.

    :param df_left: Left data frame.
    :param df_right: Right data frame.
    :return: List of ``(count, left_column, right_column)`` tuples, where
        ``count`` is the number of times the pair was proposed.
    """
    from string_grouper import match_strings

    result = []

    df_left = df_left.dropna().astype(str)
    df_right = df_right.dropna().astype(str)

    # Search column names with *id* substring
    id_pattern = re.compile(r"_id| id|id_| id ")

    def check_ids_columns(_df):
        # str() keeps non-string labels (e.g. integer column names) from
        # making the regex raise TypeError; the original label is returned.
        return [x for x in _df.columns if id_pattern.search(str(x))]

    ids_columns_left = check_ids_columns(df_left)
    ids_columns_right = check_ids_columns(df_right)
    if len(ids_columns_left) == len(ids_columns_right):
        result.extend(zip(ids_columns_left, ids_columns_right))

    # Numeric columns: min / max string length
    def min_max_len(_df):
        """
        Return (names, min lengths, max lengths) for the columns whose values
        all pass fastnumbers.isint.
        """
        int_counts = _df.applymap(fastnumbers.isint).sum()
        int_columns_names = int_counts[int_counts == len(_df)].index.values
        string_len = _df[int_columns_names].applymap(len)
        return int_columns_names, string_len.min().values, string_len.max().values

    min_max_df_left = min_max_len(df_left)
    min_max_df_right = min_max_len(df_right)

    def median_len(arr, idx):
        """
        Length statistic used to compare integer columns.

        NOTE(review): despite the name this is not a median. It is the spread
        (max - min) of the string lengths, or the common length when min and
        max are equal. Kept as-is to preserve results; confirm the intent.

        :param arr: Tuple returned by min_max_len
        :param idx: Position of the column inside arr
        :return: The length statistic for that column
        """
        _min = arr[1][idx]
        _max = arr[2][idx]
        return _max - _min if _min != _max else _max

    left_names = min_max_df_left[0]
    right_names = min_max_df_right[0]
    # The right-hand statistics do not depend on the left column: compute once.
    right_medians = [median_len(min_max_df_right, j) for j in range(len(right_names))]

    for i, col_l in enumerate(left_names):
        median_left = median_len(min_max_df_left, i)
        for col_r, median_right in zip(right_names, right_medians):
            if median_left == median_right:
                result.append((col_l, col_r))

    # String Clustering
    for col_l in df_left:
        for col_r in df_right:
            try:
                m = match_strings(df_left[col_l], df_right[col_r], min_similarity=0.05)
            except ValueError:
                # Best-effort: a pair that match_strings rejects is skipped.
                continue
            if len(m) > 0:
                result.append((col_l, col_r))

    # Count tuples
    return [(count,) + item for item, count in Counter(result).items()]


def update_dict(d, u):
"""
Update only the given keys
Expand Down

0 comments on commit 373a114

Please sign in to comment.