Skip to content

Commit

Permalink
Merge pull request #39 from dsi-clinic/networkx_record_linkage
Browse files Browse the repository at this point in the history
Networkx record linkage
  • Loading branch information
averyschoen authored Mar 6, 2024
2 parents 0af314c + 51f82d4 commit 24ef142
Showing 1 changed file with 9 additions and 11 deletions.
20 changes: 9 additions & 11 deletions utils/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ def get_address_line_1_from_full_address(address: str) -> str:
... )
'1415 PARKER STREET'
"""
pass

address_tuples = usaddress.parse(
address
Expand Down Expand Up @@ -137,7 +136,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
>>> get_likely_name("Jane","","Doe, Jane, Elisabeth")
'Jane Elisabeth Doe'
"""
# first, convert any Nans to empty strings ''
# first, convert any NaNs to empty strings ''
first_name, last_name, full_name = [
"" if x is np.NAN else x for x in [first_name, last_name, full_name]
]
Expand All @@ -151,8 +150,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
if first_name + " " + last_name == full_name:
return full_name.title()

# some names have titles or professions associated with the name. We need to
# remove those from the name.
# remove titles or professions from the name
names = [first_name, last_name, full_name]

for i in range(len(names)):
Expand Down Expand Up @@ -278,7 +276,7 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
# first remove all duplicate entries:
new_df = df.drop_duplicates()

# now find the duplicates along all columns but the ID
# find the duplicates along all columns but the id
new_df = (
new_df.groupby(df.columns.difference(["id"]).tolist(), dropna=False)[
"id"
Expand All @@ -289,7 +287,7 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
)
new_df.index = new_df["duplicated"].str[0].tolist()

# now convert the duplicated column into a dictionary that will be
# convert the duplicated column into a dictionary that will be
# an output by only feeding the entries with duplicates
new_df = new_df.reset_index().rename(columns={"index": "id"})
convert_duplicates_to_dict(new_df[["id", "duplicated"]])
Expand All @@ -299,8 +297,8 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:

def cleaning_company_column(company_entry: str) -> str:
"""
Given a string, check if it contains a variation of self employed, unemployed,
or retired and return the standardized version.
Given a string, check if it contains a variation of self employed,
unemployed, or retired and return the standardized version.
Args:
company: string of inputted company names
Expand Down Expand Up @@ -419,7 +417,7 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str:
return address_line_1_components[i][0]
elif address_line_1_components[i][1] == "USPSBoxID":
return address_line_1_components[i][0]
raise ValueError("Can not find Address Number")
raise ValueError("Cannot find Address Number")


def splink_dedupe(
Expand Down Expand Up @@ -478,10 +476,10 @@ def splink_dedupe(
on="cluster_id",
how="left",
)
deduped_df.rename(columns={"cluster_id": "unique_id"}, inplace=True)
deduped_df = deduped_df.rename(columns={"cluster_id": "unique_id"})

convert_duplicates_to_dict(deduped_df)

deduped_df.drop(columns=["duplicated"])
deduped_df = deduped_df.drop(columns=["duplicated"])

return deduped_df

0 comments on commit 24ef142

Please sign in to comment.