Skip to content

Commit

Permalink
Merge pull request #39 from dsi-clinic/networkx_record_linkage
Browse files Browse the repository at this point in the history
Networkx record linkage
  • Loading branch information
averyschoen authored Mar 6, 2024
2 parents 0af314c + 51f82d4 commit 24ef142
Showing 1 changed file with 9 additions and 11 deletions.
20 changes: 9 additions & 11 deletions utils/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ def get_address_line_1_from_full_address(address: str) -> str:
... )
'1415 PARKER STREET'
"""
pass

address_tuples = usaddress.parse(
address
Expand Down Expand Up @@ -137,7 +136,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
>>> get_likely_name("Jane","","Doe, Jane, Elisabeth")
'Jane Elisabeth Doe'
"""
# first, convert any Nans to empty strings ''
# first, convert any NaNs to empty strings ''
first_name, last_name, full_name = [
"" if x is np.NAN else x for x in [first_name, last_name, full_name]
]
Expand All @@ -151,8 +150,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
if first_name + " " + last_name == full_name:
return full_name.title()

# some names have titles or professions associated with the name. We need to
# remove those from the name.
# remove titles or professions from the name
names = [first_name, last_name, full_name]

for i in range(len(names)):
Expand Down Expand Up @@ -278,7 +276,7 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
# first remove all duplicate entries:
new_df = df.drop_duplicates()

# now find the duplicates along all columns but the ID
# find the duplicates along all columns but the id
new_df = (
new_df.groupby(df.columns.difference(["id"]).tolist(), dropna=False)[
"id"
Expand All @@ -289,7 +287,7 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
)
new_df.index = new_df["duplicated"].str[0].tolist()

# now convert the duplicated column into a dictionary that will be
# convert the duplicated column into a dictionary that will be
# an output by only feeding the entries with duplicates
new_df = new_df.reset_index().rename(columns={"index": "id"})
convert_duplicates_to_dict(new_df[["id", "duplicated"]])
Expand All @@ -299,8 +297,8 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:

def cleaning_company_column(company_entry: str) -> str:
"""
Given a string, check if it contains a variation of self employed, unemployed,
or retired and return the standardized version.
Given a string, check if it contains a variation of self employed,
unemployed, or retired and return the standardized version.
Args:
company: string of inputted company names
Expand Down Expand Up @@ -419,7 +417,7 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str:
return address_line_1_components[i][0]
elif address_line_1_components[i][1] == "USPSBoxID":
return address_line_1_components[i][0]
raise ValueError("Can not find Address Number")
raise ValueError("Cannot find Address Number")


def splink_dedupe(
Expand Down Expand Up @@ -478,10 +476,10 @@ def splink_dedupe(
on="cluster_id",
how="left",
)
deduped_df.rename(columns={"cluster_id": "unique_id"}, inplace=True)
deduped_df = deduped_df.rename(columns={"cluster_id": "unique_id"})

convert_duplicates_to_dict(deduped_df)

deduped_df.drop(columns=["duplicated"])
deduped_df = deduped_df.drop(columns=["duplicated"])

return deduped_df

0 comments on commit 24ef142

Please sign in to comment.