diff --git a/utils/linkage.py b/utils/linkage.py index 32a44dfc..c89b5818 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -39,7 +39,6 @@ def get_address_line_1_from_full_address(address: str) -> str: ... ) '1415 PARKER STREET' """ - pass address_tuples = usaddress.parse( address @@ -137,7 +136,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: >>> get_likely_name("Jane","","Doe, Jane, Elisabeth") 'Jane Elisabeth Doe' """ - # first, convert any Nans to empty strings '' + # first, convert any NaNs to empty strings '' first_name, last_name, full_name = [ "" if x is np.NAN else x for x in [first_name, last_name, full_name] ] @@ -151,8 +150,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: if first_name + " " + last_name == full_name: return full_name.title() - # some names have titles or professions associated with the name. We need to - # remove those from the name. + # remove titles or professions from the name names = [first_name, last_name, full_name] for i in range(len(names)): @@ -278,7 +276,7 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: # first remove all duplicate entries: new_df = df.drop_duplicates() - # now find the duplicates along all columns but the ID + # find the duplicates along all columns but the id new_df = ( new_df.groupby(df.columns.difference(["id"]).tolist(), dropna=False)[ "id" @@ -289,7 +287,7 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: ) new_df.index = new_df["duplicated"].str[0].tolist() - # now convert the duplicated column into a dictionary that can will be + # convert the duplicated column into a dictionary that will be # an output by only feeding the entries with duplicates new_df = new_df.reset_index().rename(columns={"index": "id"}) convert_duplicates_to_dict(new_df[["id", "duplicated"]]) @@ -299,8 +297,8 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: def 
cleaning_company_column(company_entry: str) -> str: """ - Given a string, check if it contains a variation of self employed, unemployed, - or retired and return the standardized version. + Given a string, check if it contains a variation of self employed, + unemployed, or retired and return the standardized version. Args: company: string of inputted company names @@ -419,7 +417,7 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str: return address_line_1_components[i][0] elif address_line_1_components[i][1] == "USPSBoxID": return address_line_1_components[i][0] - raise ValueError("Can not find Address Number") + raise ValueError("Cannot find Address Number") def splink_dedupe( @@ -478,10 +476,10 @@ def splink_dedupe( on="cluster_id", how="left", ) - deduped_df.rename(columns={"cluster_id": "unique_id"}, inplace=True) + deduped_df = deduped_df.rename(columns={"cluster_id": "unique_id"}) convert_duplicates_to_dict(deduped_df) - deduped_df.drop(columns=["duplicated"]) + deduped_df = deduped_df.drop(columns=["duplicated"]) return deduped_df