From 7dc5b7054cd6f525c6c542d12bb7bb4b68487d11 Mon Sep 17 00:00:00 2001
From: Adil Kassim
Date: Thu, 8 Feb 2024 07:40:57 +0000
Subject: [PATCH 01/37] updating requirements.txt

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index db05b66f..0b21babc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,4 @@ Requests==2.31.0
 setuptools==68.0.0
 textdistance==4.6.1
 usaddress==0.5.4
+nameparser==1.1.3
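nameparser, pinned above, is the library the next patch leans on to split free-text names. A minimal sketch of the two calls the pipeline uses (the sample name is illustrative):

    from nameparser import HumanName

    name = HumanName("Dr. Jane Q. Public, Jr.")
    print(name.first, name.last)  # Jane Public
    print(name.as_dict()["first"])  # Jane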
From a3310a1871b0415140a4dad8bd41ab46d47fa97b Mon Sep 17 00:00:00 2001
From: Adil Kassim
Date: Thu, 8 Feb 2024 07:41:35 +0000
Subject: [PATCH 02/37] adding pre_process pipeline function

---
 utils/linkage.py | 144 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 144 insertions(+)

diff --git a/utils/linkage.py b/utils/linkage.py
index ac11a5ac..97b0ad6e 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -1,5 +1,10 @@
+import re
+from typing import Tuple
+
+import pandas as pd
 import textdistance as td
 import usaddress
+from nameparser import HumanName
 
 from utils.constants import COMPANY_TYPES
 
@@ -194,3 +199,142 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str:
         elif address_line_1_components[i][1] == "USPSBoxID":
             return address_line_1_components[i][0]
     raise ValueError("Can not find Address Number")
+
+
+def cleaning_company_column(company_entry: str) -> str:
+    """
+    Given a string, check if it contains a variation of self employed, unemployed,
+    or retired and return the standardized version.
+
+    Args:
+        company: string of inputted company names
+    Returns:
+        standardized for retired, self employed, and unemployed,
+        or original string if no match or empty string
+
+    >>> cleaning_company_column("Retireed")
+    'Retired'
+    >>> cleaning_company_column("self")
+    'Self Employed'
+    >>> cleaning_company_column("None")
+    'Unemployed'
+    >>> cleaning_company_column("N/A")
+    'Unemployed'
+    """
+
+    if not company_entry:
+        return company_entry
+
+    company_edited = company_entry.lower()
+
+    if company_edited == "n/a":
+        return "Unemployed"
+
+    company_edited = re.sub(r"[^\w\s]", "", company_edited)
+
+    if (
+        company_edited == "retired"
+        or company_edited == "retiree"
+        or company_edited == "retire"
+        or "retiree" in company_edited
+    ):
+        return "Retired"
+
+    elif (
+        "self employe" in company_edited
+        or "freelance" in company_edited
+        or company_edited == "self"
+        or company_edited == "independent contractor"
+    ):
+        return "Self Employed"
+    elif (
+        "unemploye" in company_edited
+        or company_edited == "none"
+        or company_edited == "not employed"
+        or company_edited == "nan"
+    ):
+        return "Unemployed"
+
+    else:
+        return company_edited
+
+
+def preprocess_pipeline(
+    individuals: pd.DataFrame,
+    Address: str,
+    organizations: pd.DataFrame,
+    transactions: pd.DataFrame,
+) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    Preprocesses data for record linkage
+
+    Args:
+        Individuals: dataframe of individual contributions
+        Address: column name of address
+        Organizations: dataframe of organization contributions
+        Transactions: dataframe of transactions
+    Returns:
+        preprocessed tuple of dataframes
+        first element is the individuals dataframe,
+        second element is the organizations dataframe,
+        third element is the transactions dataframe
+    """
+    # Preprocess organizations dataframe
+    organizations["name"] = (
+        organizations["name"].astype(str).apply(standardize_corp_names)
+    )
+
+    # Preprocess individuals dataframe
+    if "Unnamed: 0" in individuals.columns:
+        individuals.drop(columns="Unnamed: 0", inplace=True)
+
+    individuals = individuals.astype(
+        {"first_name": str, "last_name": str, "full_name": str, "company": str}
+    )
+
+    # Standardize company names in individuals dataframe
+    individuals["company"] = individuals["company"].apply(
+        standardize_corp_names
+    )
+    individuals["company"] = individuals["company"].apply(
+        cleaning_company_column
+    )
+
+    # Address functions, assuming address column is named 'address'
+    individuals["Address Line 1"] = individuals[Address].apply(
+        get_address_line_1_from_full_address
+    )
+    individuals["Street Name"] = individuals["Address Line 1"].apply(
+        get_street_from_address_line_1
+    )
+    individuals["Address Number"] = individuals["Address Line 1"].apply(
+        get_address_number_from_address_line_1
+    )
+
+    # Check if first name or last names are empty, if so, extract from full name column
+    individuals["full_name"] = individuals["full_name"].astype(str)
+    if individuals["first_name"].isnull().any():
+        name = (
+            individuals["full_name"]
+            .apply(HumanName)
+            .apply(lambda x: x.as_dict())
+        )
+        first_name = name.apply(lambda x: x["first"])
+        individuals["first_name"] = first_name
+
+    if individuals["last_name"].isnull().any():
+        name = (
+            individuals["full_name"]
+            .apply(HumanName)
+            .apply(lambda x: x.as_dict())
+        )
+        last_name = name.apply(lambda x: x["last"])
+        individuals["last_name"] = last_name
+
+    # Transactions
+    if "Unnamed: 0" in transactions.columns:
+        transactions.drop(columns="Unnamed: 0", inplace=True)
+
+    transactions["purpose"] = transactions["purpose"].str.upper()
+
+    return individuals, organizations, transactions

From c3c8defec982adfea07ebba96b735f6cfd5ec29e Mon Sep 17 00:00:00 2001
From: Adil Kassim
Date: Mon, 19 Feb 2024 16:09:46 +0000
Subject: [PATCH 03/37] preprocess file and function initial commit

---
 utils/preprocess.py | 81 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 utils/preprocess.py

diff --git a/utils/preprocess.py b/utils/preprocess.py
new file mode 100644
index 00000000..2831996a
--- /dev/null
+++ b/utils/preprocess.py
@@ -0,0 +1,81 @@
+from typing import Tuple
+
+import pandas as pd
+from nameparser import HumanName
+
+from utils.linkage import (
+    cleaning_company_column,
+    get_address_line_1_from_full_address,
+    get_address_number_from_address_line_1,
+    get_street_from_address_line_1,
+    standardize_corp_names,
+)
+
+
+def preprocess_pipeline(
+    individuals: pd.DataFrame,
+    organizations: pd.DataFrame,
+    transactions: pd.DataFrame,
+) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    Preprocesses data for record linkage
+
+    Args:
+        Individuals: dataframe of individual contributions
+        Organizations: dataframe of organization contributions
+        Transactions: dataframe of transactions
+    Returns:
+        preprocessed tuple of dataframes
+        first element is the individuals dataframe,
+        second element is the organizations dataframe,
+        third element is the transactions dataframe
+    """
+    # Preprocess organizations dataframe
+    organizations["name"] = (
+        organizations["name"].astype(str, skipna=True).apply(standardize_corp_names)
+    )
+
+    # Preprocess individuals dataframe
+    if "Unnamed: 0" in individuals.columns:
+        individuals.drop(columns="Unnamed: 0", inplace=True)
+
+    individuals = individuals.astype(
+        {"first_name": str, "last_name": str, "full_name": str, "company": str}
+    )
+
+    # Standardize company names in individuals dataframe
+    individuals["company"] = individuals["company"].apply(standardize_corp_names)
+    individuals["company"] =
individuals["company"].apply(cleaning_company_column) + + # Address functions, assuming address column is named 'address' + individuals["Address Line 1"] = individuals["Address"].apply( + get_address_line_1_from_full_address + ) + individuals["Street Name"] = individuals["Address Line 1"].apply( + get_street_from_address_line_1 + ) + individuals["Address Number"] = individuals["Address Line 1"].apply( + get_address_number_from_address_line_1 + ) + + # Check if first name or last names are empty, if so, extract from full name column + individuals["full_name"] = individuals["full_name"].astype(str)[ + individuals["full_name"].notnull() + ] + if individuals["first_name"].isnull().any(): + name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) + first_name = name.apply(lambda x: x["first"]) + individuals["first_name"] = first_name + + if individuals["last_name"].isnull().any(): + name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) + last_name = name.apply(lambda x: x["last"]) + individuals["last_name"] = last_name + + # Transactions + if "Unnamed: 0" in transactions.columns: + transactions.drop(columns="Unnamed: 0", inplace=True) + + transactions["purpose"] = transactions["purpose"].str.upper() + + return individuals, organizations, transactions From cccc7cc2665793e4777974b6464d29e9b594feb5 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 19 Feb 2024 16:21:54 +0000 Subject: [PATCH 04/37] slight edits --- utils/preprocess.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/utils/preprocess.py b/utils/preprocess.py index 2831996a..55a99810 100644 --- a/utils/preprocess.py +++ b/utils/preprocess.py @@ -32,7 +32,9 @@ def preprocess_pipeline( """ # Preprocess organizations dataframe organizations["name"] = ( - organizations["name"].astype(str, skipna=True).apply(standardize_corp_names) + organizations["name"] + .astype(str, skipna=True) + .apply(standardize_corp_names) ) # Preprocess individuals dataframe @@ -44,8 +46,12 @@ def preprocess_pipeline( ) # Standardize company names in individuals dataframe - individuals["company"] = individuals["company"].apply(standardize_corp_names) - individuals["company"] = individuals["company"].apply(cleaning_company_column) + individuals["company"] = individuals["company"].apply( + standardize_corp_names + ) + individuals["company"] = individuals["company"].apply( + cleaning_company_column + ) # Address functions, assuming address column is named 'address' individuals["Address Line 1"] = individuals["Address"].apply( @@ -63,12 +69,20 @@ def preprocess_pipeline( individuals["full_name"].notnull() ] if individuals["first_name"].isnull().any(): - name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) + name = ( + individuals["full_name"] + .apply(HumanName) + .apply(lambda x: x.as_dict()) + ) first_name = name.apply(lambda x: x["first"]) individuals["first_name"] = first_name if individuals["last_name"].isnull().any(): - name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) + name = ( + individuals["full_name"] + .apply(HumanName) + .apply(lambda x: x.as_dict()) + ) last_name = name.apply(lambda x: x["last"]) individuals["last_name"] = last_name From 57c6070bb8c85743a5ebb5b2584db5427b32a35a Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 19 Feb 2024 16:23:12 +0000 Subject: [PATCH 05/37] removing preprocess function from linkage.py --- utils/linkage.py | 84 ------------------------------------------------ 1 file 
changed, 84 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 97b0ad6e..2c80939a 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -1,10 +1,7 @@ import re -from typing import Tuple -import pandas as pd import textdistance as td import usaddress -from nameparser import HumanName from utils.constants import COMPANY_TYPES @@ -257,84 +254,3 @@ def cleaning_company_column(company_entry: str) -> str: else: return company_edited - - -def preprocess_pipeline( - individuals: pd.DataFrame, - Address: str, - organizations: pd.DataFrame, - transactions: pd.DataFrame, -) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ - Preprocesses data for record linkage - - Args: - Individuals: dataframe of individual contributions - Address: column name of address - Organizations: dataframe of organization contributions - Transactions: dataframe of transactions - Returns: - preprocessed tuple of dataframes - first element is the individuals dataframe, - second element is the organizations dataframe, - third element is the transactions dataframe - """ - # Preprocess organizations dataframe - organizations["name"] = ( - organizations["name"].astype(str).apply(standardize_corp_names) - ) - - # Preprocess individuals dataframe - if "Unnamed: 0" in individuals.columns: - individuals.drop(columns="Unnamed: 0", inplace=True) - - individuals = individuals.astype( - {"first_name": str, "last_name": str, "full_name": str, "company": str} - ) - - # Standardize company names in individuals dataframe - individuals["company"] = individuals["company"].apply( - standardize_corp_names - ) - individuals["company"] = individuals["company"].apply( - cleaning_company_column - ) - - # Address functions, assuming address column is named 'address' - individuals["Address Line 1"] = individuals[Address].apply( - get_address_line_1_from_full_address - ) - individuals["Street Name"] = individuals["Address Line 1"].apply( - get_street_from_address_line_1 - ) - individuals["Address Number"] = individuals["Address Line 1"].apply( - get_address_number_from_address_line_1 - ) - - # Check if first name or last names are empty, if so, extract from full name column - individuals["full_name"] = individuals["full_name"].astype(str) - if individuals["first_name"].isnull().any(): - name = ( - individuals["full_name"] - .apply(HumanName) - .apply(lambda x: x.as_dict()) - ) - first_name = name.apply(lambda x: x["first"]) - individuals["first_name"] = first_name - - if individuals["last_name"].isnull().any(): - name = ( - individuals["full_name"] - .apply(HumanName) - .apply(lambda x: x.as_dict()) - ) - last_name = name.apply(lambda x: x["last"]) - individuals["last_name"] = last_name - - # Transactions - if "Unnamed: 0" in transactions.columns: - transactions.drop(columns="Unnamed: 0", inplace=True) - - transactions["purpose"] = transactions["purpose"].str.upper() - - return individuals, organizations, transactions From 277663672fcd1faf2cee83f51096d39e71dedbbe Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 19 Feb 2024 16:24:17 +0000 Subject: [PATCH 06/37] slight changes --- utils/preprocess.py | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/utils/preprocess.py b/utils/preprocess.py index 55a99810..f3755eec 100644 --- a/utils/preprocess.py +++ b/utils/preprocess.py @@ -32,9 +32,7 @@ def preprocess_pipeline( """ # Preprocess organizations dataframe organizations["name"] = ( - organizations["name"] - .astype(str, skipna=True) - .apply(standardize_corp_names) + 
organizations["name"].astype(str, skipna=True).apply(standardize_corp_names) ) # Preprocess individuals dataframe @@ -46,14 +44,10 @@ def preprocess_pipeline( ) # Standardize company names in individuals dataframe - individuals["company"] = individuals["company"].apply( - standardize_corp_names - ) - individuals["company"] = individuals["company"].apply( - cleaning_company_column - ) + individuals["company"] = individuals["company"].apply(standardize_corp_names) + individuals["company"] = individuals["company"].apply(cleaning_company_column) - # Address functions, assuming address column is named 'address' + # Address functions, assuming address column is named 'Address' individuals["Address Line 1"] = individuals["Address"].apply( get_address_line_1_from_full_address ) @@ -69,20 +63,12 @@ def preprocess_pipeline( individuals["full_name"].notnull() ] if individuals["first_name"].isnull().any(): - name = ( - individuals["full_name"] - .apply(HumanName) - .apply(lambda x: x.as_dict()) - ) + name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) first_name = name.apply(lambda x: x["first"]) individuals["first_name"] = first_name if individuals["last_name"].isnull().any(): - name = ( - individuals["full_name"] - .apply(HumanName) - .apply(lambda x: x.as_dict()) - ) + name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) last_name = name.apply(lambda x: x["last"]) individuals["last_name"] = last_name From 1ea09b4034a687c458dad3d5cbe573c24b8bf59b Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 19 Feb 2024 13:50:39 -0600 Subject: [PATCH 07/37] Renaming File --- utils/{preprocess.py => linkage_pipeline.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename utils/{preprocess.py => linkage_pipeline.py} (100%) diff --git a/utils/preprocess.py b/utils/linkage_pipeline.py similarity index 100% rename from utils/preprocess.py rename to utils/linkage_pipeline.py From 21af2c951ae837a94a7603af71bdb267349b0f4d Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Thu, 22 Feb 2024 07:38:04 +0000 Subject: [PATCH 08/37] updates --- utils/linkage_pipeline.py | 87 ++++++++++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 16 deletions(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index f3755eec..0f7be5e5 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -3,10 +3,13 @@ import pandas as pd from nameparser import HumanName +from utils.constants import BASE_FILEPATH from utils.linkage import ( cleaning_company_column, + deduplicate_perfect_matches, get_address_line_1_from_full_address, get_address_number_from_address_line_1, + get_likely_name, get_street_from_address_line_1, standardize_corp_names, ) @@ -32,50 +35,102 @@ def preprocess_pipeline( """ # Preprocess organizations dataframe organizations["name"] = ( - organizations["name"].astype(str, skipna=True).apply(standardize_corp_names) + organizations["name"].astype(str).apply(standardize_corp_names) ) + if "Unnamed: 0" in organizations.columns: + organizations.drop(columns="Unnamed: 0", inplace=True) # Preprocess individuals dataframe if "Unnamed: 0" in individuals.columns: individuals.drop(columns="Unnamed: 0", inplace=True) individuals = individuals.astype( - {"first_name": str, "last_name": str, "full_name": str, "company": str} + { + "first_name": str, + "last_name": str, + "full_name": str, + "company": "string", + } ) # Standardize company names in individuals dataframe - individuals["company"] = 
individuals["company"].apply(standardize_corp_names) - individuals["company"] = individuals["company"].apply(cleaning_company_column) - - # Address functions, assuming address column is named 'Address' - individuals["Address Line 1"] = individuals["Address"].apply( - get_address_line_1_from_full_address - ) - individuals["Street Name"] = individuals["Address Line 1"].apply( - get_street_from_address_line_1 + individuals["company"] = ( + individuals["company"] + .loc[individuals["company"].notnull()] + .apply(standardize_corp_names) ) - individuals["Address Number"] = individuals["Address Line 1"].apply( - get_address_number_from_address_line_1 + individuals["company"] = ( + individuals["company"] + .loc[individuals["company"].notnull()] + .apply(cleaning_company_column) ) + # Address functions, assuming address column is named 'Address' + # If there is an "Address" column in the first place + if "Address" in individuals.columns: + individuals["Address"] = individuals["Address"].astype(str) + individuals["Address Line 1"] = individuals["Address"].apply( + get_address_line_1_from_full_address + ) + individuals["Street Name"] = individuals["Address Line 1"].apply( + get_street_from_address_line_1 + ) + individuals["Address Number"] = individuals["Address Line 1"].apply( + get_address_number_from_address_line_1 + ) + # Check if first name or last names are empty, if so, extract from full name column individuals["full_name"] = individuals["full_name"].astype(str)[ individuals["full_name"].notnull() ] if individuals["first_name"].isnull().any(): - name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) + name = ( + individuals["full_name"] + .apply(HumanName) + .apply(lambda x: x.as_dict()) + ) first_name = name.apply(lambda x: x["first"]) individuals["first_name"] = first_name if individuals["last_name"].isnull().any(): - name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) + name = ( + individuals["full_name"] + .apply(HumanName) + .apply(lambda x: x.as_dict()) + ) last_name = name.apply(lambda x: x["last"]) individuals["last_name"] = last_name - # Transactions + individuals["full_name"] = individuals.apply( + lambda row: get_likely_name( + row["first_name"], row["last_name"], row["full_name"] + ), + axis=1, + ) + if "Unnamed: 0" in transactions.columns: transactions.drop(columns="Unnamed: 0", inplace=True) transactions["purpose"] = transactions["purpose"].str.upper() return individuals, organizations, transactions + + +organizations = pd.read_csv( + BASE_FILEPATH / "output" / "complete_organizations_table.csv" +) + +individuals = pd.read_csv( + BASE_FILEPATH / "output" / "complete_individuals_table.csv" +) + +transactions = pd.read_csv( + BASE_FILEPATH / "output" / "complete_transactions_table.csv" +) + +individuals, organizations, transactions = preprocess_pipeline( + individuals, organizations, transactions +) + +individuals = deduplicate_perfect_matches(individuals) +organizations = deduplicate_perfect_matches(organizations) From 4d7bdfb9cfe95b7c0c8e98314b2ca2977fb8c266 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Thu, 22 Feb 2024 07:46:05 +0000 Subject: [PATCH 09/37] adding output csv --- utils/linkage_pipeline.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index 0f7be5e5..b5e4d451 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -134,3 +134,19 @@ def preprocess_pipeline( individuals = deduplicate_perfect_matches(individuals) 
organizations = deduplicate_perfect_matches(organizations)
+
+processed_individuals_output_path = (
+    BASE_FILEPATH / "output" / "processed_individuals_table.csv"
+)
+
+processed_organizations_output_path = (
+    BASE_FILEPATH / "output" / "processed_organizations_table.csv"
+)
+
+processed_transactions_output_path = (
+    BASE_FILEPATH / "output" / "processed_transactions_table.csv"
+)
+
+individuals.to_csv(processed_individuals_output_path)
+organizations.to_csv(processed_organizations_output_path)
+transactions.to_csv(processed_transactions_output_path)

From 42ca58e75333f4d91836b7dc64134adc0247810b Mon Sep 17 00:00:00 2001
From: Adil Kassim
Date: Wed, 28 Feb 2024 04:13:59 +0000
Subject: [PATCH 10/37] pipeline changes

---
 utils/linkage_pipeline.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py
index b5e4d451..613d3244 100644
--- a/utils/linkage_pipeline.py
+++ b/utils/linkage_pipeline.py
@@ -133,7 +133,6 @@ def preprocess_pipeline(
 )
 
 individuals = deduplicate_perfect_matches(individuals)
-organizations = deduplicate_perfect_matches(organizations)

From 77bc2b3e4ac276b2d290a092d8f1ae51dd6a41a4 Mon Sep 17 00:00:00 2001
From: Adil Kassim
Date: Wed, 28 Feb 2024 04:19:36 +0000
Subject: [PATCH 11/37] adding removed files

---
 utils/classify.py           |  75 +++++++++++++++++++++++++
 utils/tests/test_linkage.py | 107 ++++++++++++++++++++++++++++++++++++
 2 files changed, 182 insertions(+)
 create mode 100644 utils/classify.py
 create mode 100644 utils/tests/test_linkage.py

diff --git a/utils/classify.py b/utils/classify.py
new file mode 100644
index 00000000..db574ace
--- /dev/null
+++ b/utils/classify.py
@@ -0,0 +1,75 @@
+import pandas as pd
+
+# we want to run down a list of people and, hopefully, their addresses, plus a list of
+# corporations, groups, etc, and classify them, basically just looking for matches
+
+# do we want to just input all the names/people (there's not many, less than 200
+# for sure), give a string similarity match score, and extract the top ten for
+# manual review? this should give us a feeling for how to set our threshold
+# we might also, once we have all the data, buckle down and just classify
+# some of them manually
+
+inds_list = []
+
+# a list of individual names
+
+
+def similarity_calculator(
+    df: pd.DataFrame, subject: str, n: int, comparison_func
+) -> pd.DataFrame:
+    """Find best matches to a subject name in a pandas dataframe
+
+    For a given individual or organization, the subject, we search through the
+    'name' column of a dataframe, select the n highest matches according to a
+    selected comparison function, and return those as a dataframe. This is meant
+    to be used manually to search for matches. For quick automated processing, see
+    automated_classifier().
+ + Note that the comparison function must take in two inputs, both strings, and + output a percentage match + """ + + similarities_df = df.copy() + + similarities = similarities_df["name"].apply( + lambda x: comparison_func(x, subject) + ) + + similarities_df["similarities"] = similarities + + top_n_matches = similarities_df.sort_values( + by=["similarities"], ascending=False + )[0:n] + + return top_n_matches + + +def automated_classifier( + df: pd.DataFrame, subjects_dict: dict, threshold: float, comparison_func +): + """Using similarity_calculator, classify entities automatically + + Feeding a dictionary of names and the associated statuses, we compare + the string matches and, if they exceed a certain threshold, classify + them as belonging to some group specified in the subjects dictionary. + """ + + similarities_df = df.copy() + + for subject in subjects_dict: + similarities = similarities_df["name"].apply( + lambda x, sub=subject: comparison_func(x, sub) + ) + matches = similarities >= threshold + + status = subjects_dict[subject] + + similarities_df["classification"] = pd.Series(matches).apply( + lambda x, stat=status: stat if x else "neutral" + ) + + return similarities_df + + # we can use the indices and/or select manually, just add a new + # column to the subjects table + # that marks fossil fuels, green energy, or neither diff --git a/utils/tests/test_linkage.py b/utils/tests/test_linkage.py new file mode 100644 index 00000000..3695a399 --- /dev/null +++ b/utils/tests/test_linkage.py @@ -0,0 +1,107 @@ +import numpy as np +import pandas as pd +import pytest + +from utils.linkage import ( + calculate_row_similarity, + calculate_string_similarity, + row_matches, +) + +# import pytest + + +# creating a test for calculate_row_similarity and row_matches + +# to put in data: +d = { + "name": ["bob von rosevich", "anantarya smith", "bob j vonrosevich"], + "address": [ + "3 Blue Drive, Chicago", + "4 Blue Drive, Chicago", + "8 Fancy Way, Chicago", + ], +} +test_df = pd.DataFrame(data=d) + + +@pytest.fixture +def row_similarity_scen_1(): + return test_df + + +@pytest.fixture +def row_similarity_scen_2(): + return test_df + + +def test_row_similarity_scen_1(row_similarity_scen_1): + wrong = calculate_row_similarity( + row_similarity_scen_1.iloc[[0]], + row_similarity_scen_1.iloc[[1]], + np.array([0.8, 0.2]), + calculate_string_similarity, + ) + right = calculate_row_similarity( + row_similarity_scen_1.iloc[[0]], + row_similarity_scen_1.iloc[[2]], + np.array([0.8, 0.2]), + calculate_string_similarity, + ) + + assert right > wrong + + +def test_row_similarity_scen_2(row_similarity_scen_2): + wrong = calculate_row_similarity( + row_similarity_scen_2.iloc[[0]], + row_similarity_scen_2.iloc[[1]], + np.array([0.2, 0.8]), + calculate_string_similarity, + ) + right = calculate_row_similarity( + row_similarity_scen_2.iloc[[0]], + row_similarity_scen_2.iloc[[2]], + np.array([0.2, 0.8]), + calculate_string_similarity, + ) + + assert right < wrong + + +d2 = { + "name": [ + "bob von rosevich", + "anantarya smith", + "bob j vonrosevich", + "missy elliot", + "mr johnson", + "quarantin directino", + "missy eliot", + "joseph johnson", + ], + "address": [ + "3 Blue Drive, Chicago", + "4 Blue Drive, Chicago", + "8 Fancy Way, Chicago", + "8 Fancy Way, Evanston", + "17 Regular Road, Chicago", + "42 Hollywood Boulevard, Chicago", + "8 Fancy Way, Evanston", + "17 Regular Road, Chicago", + ], +} +test_df2 = pd.DataFrame(data=d2) + + +@pytest.fixture +def row_match_scen1(): + return test_df2 + + +def 
test_row_matches(row_match_scen1): + res = row_matches( + row_match_scen1, np.array([0.8, 0.2]), 0.9, calculate_string_similarity + ) + + assert res == {0: [2], 1: [], 2: [], 3: [6], 4: [], 5: [], 6: [], 7: []} From 3c619375b2a8a7a68a72fea4a97a2fee30360043 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 28 Feb 2024 15:54:25 +0000 Subject: [PATCH 12/37] proper updates --- utils/linkage_pipeline.py | 146 +++++++++++++++++++++----------------- 1 file changed, 82 insertions(+), 64 deletions(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index 613d3244..1f565446 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -1,5 +1,3 @@ -from typing import Tuple - import pandas as pd from nameparser import HumanName @@ -15,45 +13,29 @@ ) -def preprocess_pipeline( - individuals: pd.DataFrame, - organizations: pd.DataFrame, - transactions: pd.DataFrame, -) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: +def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: """ - Preprocesses data for record linkage + Given a dataframe of individual donors, preprocesses the data, + and return a cleaned dataframe. Args: - Individuals: dataframe of individual contributions - Organizations: dataframe of organization contributions - Transactions: dataframe of transactions + individuals: dataframe of individual contributions + Returns: - preprocessed tuple of dataframes - first element is the individuals dataframe, - second element is the organizations dataframe, - third element is the transactions dataframe + cleaned dataframe of individuals """ - # Preprocess organizations dataframe - organizations["name"] = ( - organizations["name"].astype(str).apply(standardize_corp_names) - ) - if "Unnamed: 0" in organizations.columns: - organizations.drop(columns="Unnamed: 0", inplace=True) - - # Preprocess individuals dataframe if "Unnamed: 0" in individuals.columns: individuals.drop(columns="Unnamed: 0", inplace=True) individuals = individuals.astype( { - "first_name": str, - "last_name": str, - "full_name": str, + "first_name": "string", + "last_name": "string", + "full_name": "string", "company": "string", } ) - # Standardize company names in individuals dataframe individuals["company"] = ( individuals["company"] .loc[individuals["company"].notnull()] @@ -66,7 +48,6 @@ def preprocess_pipeline( ) # Address functions, assuming address column is named 'Address' - # If there is an "Address" column in the first place if "Address" in individuals.columns: individuals["Address"] = individuals["Address"].astype(str) individuals["Address Line 1"] = individuals["Address"].apply( @@ -84,20 +65,12 @@ def preprocess_pipeline( individuals["full_name"].notnull() ] if individuals["first_name"].isnull().any(): - name = ( - individuals["full_name"] - .apply(HumanName) - .apply(lambda x: x.as_dict()) - ) + name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) first_name = name.apply(lambda x: x["first"]) individuals["first_name"] = first_name if individuals["last_name"].isnull().any(): - name = ( - individuals["full_name"] - .apply(HumanName) - .apply(lambda x: x.as_dict()) - ) + name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) last_name = name.apply(lambda x: x["last"]) individuals["last_name"] = last_name @@ -108,44 +81,89 @@ def preprocess_pipeline( axis=1, ) + return individuals + + +def preprocess_organizations(organizations: pd.DataFrame) -> pd.DataFrame: + """ + Given a dataframe of organization donors, preprocesses the data, 
+ and return a cleaned dataframe. + """ + if "Unnamed: 0" in organizations.columns: + organizations.drop(columns="Unnamed: 0", inplace=True) + + organizations["name"] = ( + organizations["name"] + .loc[organizations["name"].notnull()] + .apply(standardize_corp_names) + ) + + return organizations + + +def preprocess_transactions(transactions: pd.DataFrame) -> pd.DataFrame: + """ + Given a dataframe of transactions, preprocesses the data, + and return a cleaned dataframe. + + Args: + transactions: dataframe of transactions + + Returns: + cleaned dataframe of transactions + """ if "Unnamed: 0" in transactions.columns: transactions.drop(columns="Unnamed: 0", inplace=True) transactions["purpose"] = transactions["purpose"].str.upper() - return individuals, organizations, transactions + return transactions -organizations = pd.read_csv( - BASE_FILEPATH / "output" / "complete_organizations_table.csv" -) +def main(): + organizations = pd.read_csv( + BASE_FILEPATH / "output" / "complete_organizations_table.csv" + ) -individuals = pd.read_csv( - BASE_FILEPATH / "output" / "complete_individuals_table.csv" -) + individuals = pd.read_csv( + BASE_FILEPATH / "output" / "complete_individuals_table.csv" + ) -transactions = pd.read_csv( - BASE_FILEPATH / "output" / "complete_transactions_table.csv" -) + transactions = pd.read_csv( + BASE_FILEPATH / "output" / "complete_transactions_table.csv" + ) -individuals, organizations, transactions = preprocess_pipeline( - individuals, organizations, transactions -) + individuals = preprocess_individuals(individuals) + organizations = preprocess_organizations(organizations) + transactions = preprocess_transactions(transactions) -individuals = deduplicate_perfect_matches(individuals) + # Deduplicates perfect matches and creates a new csv file + # in output titled "deduplicated_UUIDs.csv" + individuals = deduplicate_perfect_matches(individuals) + organizations = deduplicate_perfect_matches(organizations) -processed_individuals_output_path = ( - BASE_FILEPATH / "output" / "processed_individuals_table.csv" -) + cleaned_individuals_output_path = ( + BASE_FILEPATH / "output" / "cleaned_individuals_table.csv" + ) -processed_organizations_output_path = ( - BASE_FILEPATH / "output" / "processed_organizations_table.csv" -) + cleaned_organizations_output_path = ( + BASE_FILEPATH / "output" / "cleaned_organizations_table.csv" + ) + + cleaned_transactions_output_path = ( + BASE_FILEPATH / "output" / "cleaned_transactions_table.csv" + ) + + deduped = pd.read_csv(BASE_FILEPATH / "output" / "deduplicated_UUIDs.csv") + + transactions[["donor_id", "recipient_id"]] = transactions[ + ["donor_id", "recipient_id"] + ].replace(deduped) + + individuals.to_csv(cleaned_individuals_output_path) + organizations.to_csv(cleaned_organizations_output_path) + transactions.to_csv(cleaned_transactions_output_path) -processed_transactions_output_path = ( - BASE_FILEPATH / "output" / "processed_transactions_table.csv" -) -individuals.to_csv(processed_individuals_output_path) -organizations.to_csv(processed_organizations_output_path) -transactions.to_csv(processed_transactions_output_path) +if __name__ == "__main__": + main() From 4e32543c82bec739f90cbf55a5749464d2a5851f Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 28 Feb 2024 16:03:19 +0000 Subject: [PATCH 13/37] removing duplicated function --- utils/linkage.py | 60 ------------------------------------------------ 1 file changed, 60 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 29319907..cae5024d 100644 --- 
a/utils/linkage.py +++ b/utils/linkage.py @@ -1,5 +1,3 @@ -import re - import textdistance as td import usaddress from names_dataset import NameDataset @@ -635,61 +633,3 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str: elif address_line_1_components[i][1] == "USPSBoxID": return address_line_1_components[i][0] raise ValueError("Can not find Address Number") - - -def cleaning_company_column(company_entry: str) -> str: - """ - Given a string, check if it contains a variation of self employed, unemployed, - or retired and return the standardized version. - - Args: - company: string of inputted company names - Returns: - standardized for retired, self employed, and unemployed, - or original string if no match or empty string - - >>> cleaning_company_column("Retireed") - 'Retired' - >>> cleaning_company_column("self") - 'Self Employed' - >>> cleaning_company_column("None") - 'Unemployed' - >>> cleaning_company_column("N/A") - 'Unemployed' - """ - - if not company_entry: - return company_entry - - company_edited = company_entry.lower() - - if company_edited == "n/a": - return "Unemployed" - - company_edited = re.sub(r"[^\w\s]", "", company_edited) - - if ( - company_edited == "retired" - or company_edited == "retiree" - or company_edited == "retire" - or "retiree" in company_edited - ): - return "Retired" - - elif ( - "self employe" in company_edited - or "freelance" in company_edited - or company_edited == "self" - or company_edited == "independent contractor" - ): - return "Self Employed" - elif ( - "unemploye" in company_edited - or company_edited == "none" - or company_edited == "not employed" - or company_edited == "nan" - ): - return "Unemployed" - - else: - return company_edited From d94243af71ebbcf97ef7ba50d1cc06f5e15a5ce4 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 28 Feb 2024 16:14:33 +0000 Subject: [PATCH 14/37] attempting to pass dev checks --- utils/linkage.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index cae5024d..a6fcbdab 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -340,9 +340,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: names[i] = determine_comma_role(names[i]) names[i] = names[i].replace(".", "").split(" ") - names[i] = [ - name_part for name_part in names[i] if name_part not in titles - ] + names[i] = [name_part for name_part in names[i] if name_part not in titles] names[i] = " ".join(names[i]) # one last check to remove any pieces that might add extra whitespace @@ -431,9 +429,7 @@ def name_rank(first_name: str, last_name: str) -> list: if first_name_result and isinstance(first_name_result, dict): first_name_data = first_name_result.get("first_name") if first_name_data and "rank" in first_name_data: - first_name_rank = first_name_data["rank"].get( - "United States", 0 - ) + first_name_rank = first_name_data["rank"].get("United States", 0) else: first_name_rank = None if isinstance(last_name, str): From df41e42d4134a50305f139ca0e7b53d181f31810 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 28 Feb 2024 16:21:24 +0000 Subject: [PATCH 15/37] reformatting files --- utils/linkage.py | 8 ++++++-- utils/linkage_pipeline.py | 12 ++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index a6fcbdab..cae5024d 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -340,7 +340,9 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: names[i] = 
determine_comma_role(names[i]) names[i] = names[i].replace(".", "").split(" ") - names[i] = [name_part for name_part in names[i] if name_part not in titles] + names[i] = [ + name_part for name_part in names[i] if name_part not in titles + ] names[i] = " ".join(names[i]) # one last check to remove any pieces that might add extra whitespace @@ -429,7 +431,9 @@ def name_rank(first_name: str, last_name: str) -> list: if first_name_result and isinstance(first_name_result, dict): first_name_data = first_name_result.get("first_name") if first_name_data and "rank" in first_name_data: - first_name_rank = first_name_data["rank"].get("United States", 0) + first_name_rank = first_name_data["rank"].get( + "United States", 0 + ) else: first_name_rank = None if isinstance(last_name, str): diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index 1f565446..779469b5 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -65,12 +65,20 @@ def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: individuals["full_name"].notnull() ] if individuals["first_name"].isnull().any(): - name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) + name = ( + individuals["full_name"] + .apply(HumanName) + .apply(lambda x: x.as_dict()) + ) first_name = name.apply(lambda x: x["first"]) individuals["first_name"] = first_name if individuals["last_name"].isnull().any(): - name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) + name = ( + individuals["full_name"] + .apply(HumanName) + .apply(lambda x: x.as_dict()) + ) last_name = name.apply(lambda x: x["last"]) individuals["last_name"] = last_name From 26d47736e65212150aff8e619d73f3723b859bdc Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Thu, 29 Feb 2024 15:01:36 +0000 Subject: [PATCH 16/37] classify function --- utils/linkage_pipeline.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index f13b4235..e9fcf06c 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -1,6 +1,5 @@ import pandas as pd - -# from classify import classify_wrapper +from classify import classify_wrapper from nameparser import HumanName from utils.constants import BASE_FILEPATH @@ -172,6 +171,8 @@ def main(): ["donor_id", "recipient_id"] ].replace(deduped) + individuals, organizations = classify_wrapper(individuals, organizations) + individuals.to_csv(cleaned_individuals_output_path, index=False) organizations.to_csv(cleaned_organizations_output_path, index=False) transactions.to_csv(cleaned_transactions_output_path, index=False) From 3266ce7e965a09e816c875e2e79be1d4f062f2df Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 4 Mar 2024 02:04:09 +0000 Subject: [PATCH 17/37] slight changes --- utils/linkage_pipeline.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index e9fcf06c..5f251f3b 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -68,20 +68,12 @@ def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: individuals["full_name"].notnull() ] if individuals["first_name"].isnull().any(): - name = ( - individuals["full_name"] - .apply(HumanName) - .apply(lambda x: x.as_dict()) - ) + name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) first_name = name.apply(lambda x: x["first"]) individuals["first_name"] = first_name if 
individuals["last_name"].isnull().any(): - name = ( - individuals["full_name"] - .apply(HumanName) - .apply(lambda x: x.as_dict()) - ) + name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) last_name = name.apply(lambda x: x["last"]) individuals["last_name"] = last_name @@ -167,12 +159,15 @@ def main(): deduped = pd.read_csv(BASE_FILEPATH / "output" / "deduplicated_UUIDs.csv") + # Classifies individuals and organizations with a new 'classification' + # column containing 'neutral', 'f', or 'c' + individuals, organizations = classify_wrapper(individuals, organizations) + + # Update the transactions table with the deduplicated UUIDs transactions[["donor_id", "recipient_id"]] = transactions[ ["donor_id", "recipient_id"] ].replace(deduped) - individuals, organizations = classify_wrapper(individuals, organizations) - individuals.to_csv(cleaned_individuals_output_path, index=False) organizations.to_csv(cleaned_organizations_output_path, index=False) transactions.to_csv(cleaned_transactions_output_path, index=False) From d262deeccc886c0103cc2c12866d83bb8b843370 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 4 Mar 2024 04:00:12 +0000 Subject: [PATCH 18/37] possible splink implementation fix --- utils/linkage_pipeline.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index 1b4db66e..9baa5204 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -171,9 +171,10 @@ def main(): deduped = pd.read_csv(BASE_FILEPATH / "output" / "deduplicated_UUIDs.csv") # Splink deduplication - individuals = splink_dedupe( - individuals, individuals_settings, individuals_blocking - ) + individuals["unique_id"] = individuals["id"] + organizations["unique_id"] = organizations["id"] + + individuals = splink_dedupe(individuals, individuals_settings, individuals_blocking) organizations = splink_dedupe( organizations, organizations_settings, organizations_blocking From b8da98e509ad572dc736228a49d0f067eed063e2 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 4 Mar 2024 17:26:11 +0000 Subject: [PATCH 19/37] updating splink function --- utils/linkage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/linkage.py b/utils/linkage.py index 5988a8e3..d7237037 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -683,7 +683,7 @@ def splink_dedupe( deduped_df = pd.merge( first_instance_df, - match_list_df[["cluster_id"]], + match_list_df[["cluster_id", "duplicated"]], on="cluster_id", how="left", ) From 0185093f0ca00189f9959399693d18127b530c0b Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 4 Mar 2024 17:26:27 +0000 Subject: [PATCH 20/37] pipeline updates --- utils/linkage_pipeline.py | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index 9baa5204..e80bd032 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -151,44 +151,40 @@ def main(): organizations = preprocess_organizations(organizations) transactions = preprocess_transactions(transactions) - # Deduplicates perfect matches and creates a new csv file - # in output titled "deduplicated_UUIDs.csv" + individuals, organizations = classify_wrapper(individuals, organizations) + individuals = deduplicate_perfect_matches(individuals) organizations = deduplicate_perfect_matches(organizations) - cleaned_individuals_output_path = ( - BASE_FILEPATH / "output" / "cleaned_individuals_table.csv" - ) 
- - cleaned_organizations_output_path = ( - BASE_FILEPATH / "output" / "cleaned_organizations_table.csv" - ) - - cleaned_transactions_output_path = ( - BASE_FILEPATH / "output" / "cleaned_transactions_table.csv" - ) - deduped = pd.read_csv(BASE_FILEPATH / "output" / "deduplicated_UUIDs.csv") - # Splink deduplication individuals["unique_id"] = individuals["id"] organizations["unique_id"] = organizations["id"] - individuals = splink_dedupe(individuals, individuals_settings, individuals_blocking) - organizations = splink_dedupe( organizations, organizations_settings, organizations_blocking ) - # Classifies individuals and organizations with a new 'classification' - # column containing 'neutral', 'f', or 'c' - individuals, organizations = classify_wrapper(individuals, organizations) + individuals = splink_dedupe( + individuals, individuals_settings, individuals_blocking + ) - # Update the transactions table with the deduplicated UUIDs transactions[["donor_id", "recipient_id"]] = transactions[ ["donor_id", "recipient_id"] ].replace(deduped) + cleaned_individuals_output_path = ( + BASE_FILEPATH / "output" / "cleaned_individuals_table.csv" + ) + + cleaned_organizations_output_path = ( + BASE_FILEPATH / "output" / "cleaned_organizations_table.csv" + ) + + cleaned_transactions_output_path = ( + BASE_FILEPATH / "output" / "cleaned_transactions_table.csv" + ) + individuals.to_csv(cleaned_individuals_output_path, index=False) organizations.to_csv(cleaned_organizations_output_path, index=False) transactions.to_csv(cleaned_transactions_output_path, index=False) From f05778b1171fe12f17d3d104311679b76f3a751d Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 4 Mar 2024 17:29:21 +0000 Subject: [PATCH 21/37] passing linter --- utils/linkage_pipeline.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index e80bd032..537e79d3 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -1,7 +1,7 @@ import pandas as pd -from classify import classify_wrapper from nameparser import HumanName +from utils.classify import classify_wrapper from utils.constants import ( BASE_FILEPATH, individuals_blocking, @@ -165,9 +165,7 @@ def main(): organizations, organizations_settings, organizations_blocking ) - individuals = splink_dedupe( - individuals, individuals_settings, individuals_blocking - ) + individuals = splink_dedupe(individuals, individuals_settings, individuals_blocking) transactions[["donor_id", "recipient_id"]] = transactions[ ["donor_id", "recipient_id"] From 6de450df8e8b91eb40b1803e4bbcd4f698dd9dea Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 4 Mar 2024 17:31:16 +0000 Subject: [PATCH 22/37] linter --- utils/linkage_pipeline.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index 537e79d3..ac911559 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -165,7 +165,9 @@ def main(): organizations, organizations_settings, organizations_blocking ) - individuals = splink_dedupe(individuals, individuals_settings, individuals_blocking) + individuals = splink_dedupe( + individuals, individuals_settings, individuals_blocking + ) transactions[["donor_id", "recipient_id"]] = transactions[ ["donor_id", "recipient_id"] From 51cc9def6d793165c9022c21124e4a40e30a6c38 Mon Sep 17 00:00:00 2001 From: Nicolas Posner Date: Mon, 4 Mar 2024 14:57:36 -0600 Subject: [PATCH 23/37] updated classify test --- utils/tests/test_classifier.py | 
45 ++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 utils/tests/test_classifier.py diff --git a/utils/tests/test_classifier.py b/utils/tests/test_classifier.py new file mode 100644 index 00000000..602c52ac --- /dev/null +++ b/utils/tests/test_classifier.py @@ -0,0 +1,45 @@ +import numpy as np +import pandas as pd +import pytest + +from utils.classify import matcher + +d = { + "name": [ + "bob von rosevich", + "anantarya smith", + "bob j vonrosevich", + "missy elliot", + "mr johnson", + "quarantin directino", + "missy eliot", + "joseph johnson", + ], + "address": [ + "3 Blue Drive, Chicago", + "4 Blue Drive, Chicago", + "8 Fancy Way, Chicago", + "8 Fancy Way, Evanston", + "17 Regular Road, Chicago", + "42 Hollywood Boulevard, Chicago", + "8 Fancy Way, Evanston", + "17 Regular Road, Chicago", + ], +} + +test_df = pd.DataFrame(data=d) + +test_df["classification"] = "neutral" + + +@pytest.fixture +def matcher_scen_1(): + return test_df + + +def test_matcher_scen_1(matcher_scen_1): + res = matcher(matcher_scen_1, "Fancy", "address", "f") + + assert np.all( + res == np.array(["bob j vonrosevich", "missy elliot", "missy eliot"]) + ) From 4cc7ce4c8a7d8edc50a7f032a96a25b0c74db60f Mon Sep 17 00:00:00 2001 From: Nicolas Posner Date: Mon, 4 Mar 2024 15:03:32 -0600 Subject: [PATCH 24/37] fix pytest --- utils/classify.py | 19 ++++++------------- utils/tests/test_classifier.py | 3 ++- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/utils/classify.py b/utils/classify.py index 3c24f941..4061970a 100644 --- a/utils/classify.py +++ b/utils/classify.py @@ -3,7 +3,9 @@ from utils.constants import c_org_names, f_companies, f_org_names -def classify_wrapper(individuals_df, organizations_df): +def classify_wrapper( + individuals_df: pd.DataFrame, organizations_df: pd.DataFrame +): """Wrapper for classificaiton in linkage pipeline Initialize the classify column in both dataframes and @@ -25,7 +27,7 @@ def classify_wrapper(individuals_df, organizations_df): return classified_individuals, classified_orgs -def matcher(df, substring, column, category): +def matcher(df: pd.DataFrame, substring: str, column: str, category: str): """Applies a label to the classification column based on substrings We run through a given column containing strings in the dataframe. 
We @@ -42,7 +44,7 @@ def matcher(df, substring, column, category): return df -def classify_individuals(individuals_df): +def classify_individuals(individuals_df: pd.DataFrame): """Part of the classification pipeline We apply the matcher function to the individuals dataframe @@ -56,7 +58,7 @@ def classify_individuals(individuals_df): return individuals_df -def classify_orgs(organizations_df): +def classify_orgs(organizations_df: pd.DataFrame): """Part of the classification pipeline We apply the matcher function to the organizations dataframe @@ -73,11 +75,6 @@ def classify_orgs(organizations_df): return organizations_df -inds_list = [] - -# a list of individual names - - def similarity_calculator( df: pd.DataFrame, subject: str, n: int, comparison_func ) -> pd.DataFrame: @@ -133,7 +130,3 @@ def automated_classifier( ) return similarities_df - - # we can use the indices and/or select manually, just add a new - # column to the subjects table - # that marks fossil fuels, green energy, or neither diff --git a/utils/tests/test_classifier.py b/utils/tests/test_classifier.py index 602c52ac..b6bce883 100644 --- a/utils/tests/test_classifier.py +++ b/utils/tests/test_classifier.py @@ -38,7 +38,8 @@ def matcher_scen_1(): def test_matcher_scen_1(matcher_scen_1): - res = matcher(matcher_scen_1, "Fancy", "address", "f") + matcher(matcher_scen_1, "Fancy", "address", "f") + res = test_df[test_df["classification"] == "f"]["name"].values assert np.all( res == np.array(["bob j vonrosevich", "missy elliot", "missy eliot"]) From 94f807c1693a04039c6f7f95114da81897ce489f Mon Sep 17 00:00:00 2001 From: Nicolas Posner Date: Mon, 4 Mar 2024 15:12:14 -0600 Subject: [PATCH 25/37] Revert "fix pytest" This reverts commit 4cc7ce4c8a7d8edc50a7f032a96a25b0c74db60f. i accidentally put this on the wrong branch --- utils/classify.py | 19 +++++++++++++------ utils/tests/test_classifier.py | 3 +-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/utils/classify.py b/utils/classify.py index 4061970a..3c24f941 100644 --- a/utils/classify.py +++ b/utils/classify.py @@ -3,9 +3,7 @@ from utils.constants import c_org_names, f_companies, f_org_names -def classify_wrapper( - individuals_df: pd.DataFrame, organizations_df: pd.DataFrame -): +def classify_wrapper(individuals_df, organizations_df): """Wrapper for classificaiton in linkage pipeline Initialize the classify column in both dataframes and @@ -27,7 +25,7 @@ def classify_wrapper( return classified_individuals, classified_orgs -def matcher(df: pd.DataFrame, substring: str, column: str, category: str): +def matcher(df, substring, column, category): """Applies a label to the classification column based on substrings We run through a given column containing strings in the dataframe. 
We @@ -44,7 +42,7 @@ def matcher(df: pd.DataFrame, substring: str, column: str, category: str): return df -def classify_individuals(individuals_df: pd.DataFrame): +def classify_individuals(individuals_df): """Part of the classification pipeline We apply the matcher function to the individuals dataframe @@ -58,7 +56,7 @@ def classify_individuals(individuals_df: pd.DataFrame): return individuals_df -def classify_orgs(organizations_df: pd.DataFrame): +def classify_orgs(organizations_df): """Part of the classification pipeline We apply the matcher function to the organizations dataframe @@ -75,6 +73,11 @@ def classify_orgs(organizations_df: pd.DataFrame): return organizations_df +inds_list = [] + +# a list of individual names + + def similarity_calculator( df: pd.DataFrame, subject: str, n: int, comparison_func ) -> pd.DataFrame: @@ -130,3 +133,7 @@ def automated_classifier( ) return similarities_df + + # we can use the indices and/or select manually, just add a new + # column to the subjects table + # that marks fossil fuels, green energy, or neither diff --git a/utils/tests/test_classifier.py b/utils/tests/test_classifier.py index b6bce883..602c52ac 100644 --- a/utils/tests/test_classifier.py +++ b/utils/tests/test_classifier.py @@ -38,8 +38,7 @@ def matcher_scen_1(): def test_matcher_scen_1(matcher_scen_1): - matcher(matcher_scen_1, "Fancy", "address", "f") - res = test_df[test_df["classification"] == "f"]["name"].values + res = matcher(matcher_scen_1, "Fancy", "address", "f") assert np.all( res == np.array(["bob j vonrosevich", "missy elliot", "missy eliot"]) From d62f3b70049e55b6f26eaad2774d9bd7dca8c2e3 Mon Sep 17 00:00:00 2001 From: Nicolas Posner Date: Mon, 4 Mar 2024 15:14:07 -0600 Subject: [PATCH 26/37] Revert "updated classify test" This reverts commit 51cc9def6d793165c9022c21124e4a40e30a6c38. 
accidentally on wrong branch --- utils/tests/test_classifier.py | 45 ---------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 utils/tests/test_classifier.py diff --git a/utils/tests/test_classifier.py b/utils/tests/test_classifier.py deleted file mode 100644 index 602c52ac..00000000 --- a/utils/tests/test_classifier.py +++ /dev/null @@ -1,45 +0,0 @@ -import numpy as np -import pandas as pd -import pytest - -from utils.classify import matcher - -d = { - "name": [ - "bob von rosevich", - "anantarya smith", - "bob j vonrosevich", - "missy elliot", - "mr johnson", - "quarantin directino", - "missy eliot", - "joseph johnson", - ], - "address": [ - "3 Blue Drive, Chicago", - "4 Blue Drive, Chicago", - "8 Fancy Way, Chicago", - "8 Fancy Way, Evanston", - "17 Regular Road, Chicago", - "42 Hollywood Boulevard, Chicago", - "8 Fancy Way, Evanston", - "17 Regular Road, Chicago", - ], -} - -test_df = pd.DataFrame(data=d) - -test_df["classification"] = "neutral" - - -@pytest.fixture -def matcher_scen_1(): - return test_df - - -def test_matcher_scen_1(matcher_scen_1): - res = matcher(matcher_scen_1, "Fancy", "address", "f") - - assert np.all( - res == np.array(["bob j vonrosevich", "missy elliot", "missy eliot"]) - ) From 743b30618689ec698101052c47c705e3476e85bc Mon Sep 17 00:00:00 2001 From: Nicolas Posner Date: Mon, 4 Mar 2024 20:59:25 -0600 Subject: [PATCH 27/37] updating readme and makefile as well as location of data for linkage_pipeline --- Makefile | 7 +++---- README.md | 9 +++++---- data/README.md | 6 ++++++ output/README.md | 4 ++++ utils/linkage_pipeline.py | 6 +++--- 5 files changed, 21 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 36577581..3de9758d 100644 --- a/Makefile +++ b/Makefile @@ -30,8 +30,7 @@ run-notebooks: --no-browser --allow-root -#running the linkage pipeline and creating the network graph -#still waiting on linkage_pipeline completion to get this into final shape +output_network_graph: + python linkage_pipeline.py -output network_graph: all_individuals.csv all_organizations.csv all_transactions.csv - python linkage_pipeline.py \ No newline at end of file +.PHONY: output_network_graph \ No newline at end of file diff --git a/README.md b/README.md index 879a41e0..4be8c9ba 100644 --- a/README.md +++ b/README.md @@ -44,9 +44,9 @@ If you prefer to develop inside a container with VS Code then do the following s 6. For future reference, the above pipeline also stores the information mapping given id to our database id (generated via uuid) in a csv file in the format of (state)IDMap.csv (example: ArizonaIDMap.csv) in the output folder ### Record Linkage and Network Pipeline -1. Save the standardized tables "complete_individuals_table.csv", "complete_organizations_table.csv", and "complete_transactions_table.csv" (collected from the above pipeline or data from the project's Google Drive) in the following format: repo_root / "output" / "file" -2. **UPDATE:** Run the pipeline by calling ```make run-linkage-pipeline```. This pipeline will perform conservative record linkage, attempt to classify entities as neutral, fossil fuels, or clean energy, and an interactive network visual -3. The pipeline will output the deduplicated tables saved as "cleaned_individuals_table.csv", "cleaned_organizations_table.csv", and "cleaned_transactions_table.csv". A mapping file, "deduplicated_UUIDs" tracks the UUIDs designated as duplicates. +1. 
Download the complete_data.zip file from the Climate Cabinet Data Clinic Google Drive using this link: https://drive.google.com/file/d/1zbjt7iBU0NAWSBcUyEsjvuumn3VgI4z9/view?usp=sharing. After downloading this .zip, unzip it to find three files: complete_individuals.csv, complete_organizations.csv, and complete_transactions.csv. Upload these files into the data folder and rename them to the filenames the pipeline reads: complete_individuals_table.csv, complete_organizations_table.csv, and complete_transactions_table.csv. They must follow this format: repo_root / "data" / "file" +2. Run the pipeline by calling ```make output_network_graph```. This pipeline will perform conservative record linkage, attempt to classify entities as neutral, fossil fuels, or clean energy, and create a NetworkX Graph object. +3. The pipeline will output a NetworkX Graph object and a txt file containing graph metrics into the output folder. ## Repository Structure @@ -65,7 +65,8 @@ If the data is larger than 50MB than you should not add it to the repo and inste This [README.md file](/data/README.md) should be kept up to date. ### output -Should contain work product generated by the analysis. Keep in mind that results should (generally) be excluded from the git repository. +This folder is empty by default. The final outputs of the Makefile will be placed here, consisting of a NetworkX Graph object and a txt file containing graph metrics. + ## Team Member diff --git a/data/README.md b/data/README.md index 5326bff8..df9336b7 100644 --- a/data/README.md +++ b/data/README.md @@ -2,6 +2,12 @@ This directory contains information for use in this project. +## Makefile and Final Pipeline +- This folder is empty by default. In order to run the Makefile, download the complete_data.zip file from the Climate Cabinet Data Clinic Google Drive using this link: https://drive.google.com/file/d/1zbjt7iBU0NAWSBcUyEsjvuumn3VgI4z9/view?usp=sharing + + - After downloading this .zip, unzip it to find three files: complete_individuals.csv, complete_organizations.csv, and complete_transactions.csv. Upload these files into the data folder and ensure that their names are correct. Once they are in place, you may run the Makefile. + + ## Arizona Campaign Finance Data ### Summary diff --git a/output/README.md b/output/README.md index 932298fd..06e91212 100644 --- a/output/README.md +++ b/output/README.md @@ -1,2 +1,6 @@ # Output README --- + +## Makefile and Final Pipeline + +- This folder is empty by default. The outputs of the Makefile process will be placed in this folder, consisting of a NetworkX Graph object and a txt file containing graph metrics.
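For orientation, here is a minimal sketch of how the pipeline's serialized graph output might be inspected after a run — assuming the graph is written with `nx.write_graphml` to `output/g.gml`, as the pipeline code later in this series does; the degree listing is purely illustrative and not part of the pipeline itself:

```python
# Minimal sketch: load and inspect the graph the pipeline serializes.
# Assumes the file was written with nx.write_graphml to output/g.gml,
# as in the linkage/network pipeline code later in this patch series.
from pathlib import Path

import networkx as nx

graph_path = Path("output") / "g.gml"
g = nx.read_graphml(graph_path)

print(f"nodes: {g.number_of_nodes()}, edges: {g.number_of_edges()}")

# Illustrative check: the ten highest-degree entities in the network.
top_nodes = sorted(g.degree, key=lambda pair: pair[1], reverse=True)[:10]
for node, degree in top_nodes:
    print(node, degree)
```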
diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index ac911559..499726e9 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -136,15 +136,15 @@ def preprocess_transactions(transactions: pd.DataFrame) -> pd.DataFrame: def main(): organizations = pd.read_csv( - BASE_FILEPATH / "output" / "complete_organizations_table.csv" + BASE_FILEPATH / "data" / "complete_organizations_table.csv" ) individuals = pd.read_csv( - BASE_FILEPATH / "output" / "complete_individuals_table.csv" + BASE_FILEPATH / "data" / "complete_individuals_table.csv" ) transactions = pd.read_csv( - BASE_FILEPATH / "output" / "complete_transactions_table.csv" + BASE_FILEPATH / "data" / "complete_transactions_table.csv" ) individuals = preprocess_individuals(individuals) From a571d91cb5238089aca9fde1f27878828cc7a08a Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Tue, 5 Mar 2024 06:21:34 +0000 Subject: [PATCH 28/37] slight update to splink_dedupe function --- utils/linkage.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index d7237037..484c1060 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -341,9 +341,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: names[i] = determine_comma_role(names[i]) names[i] = names[i].replace(".", "").split(" ") - names[i] = [ - name_part for name_part in names[i] if name_part not in titles - ] + names[i] = [name_part for name_part in names[i] if name_part not in titles] names[i] = " ".join(names[i]) # one last check to remove any pieces that might add extra whitespace @@ -432,9 +430,7 @@ def name_rank(first_name: str, last_name: str) -> list: if first_name_result and isinstance(first_name_result, dict): first_name_data = first_name_result.get("first_name") if first_name_data and "rank" in first_name_data: - first_name_rank = first_name_data["rank"].get( - "United States", 0 - ) + first_name_rank = first_name_data["rank"].get("United States", 0) else: first_name_rank = None if isinstance(last_name, str): @@ -636,9 +632,7 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str: raise ValueError("Can not find Address Number") -def splink_dedupe( - df: pd.DataFrame, settings: dict, blocking: list -) -> pd.DataFrame: +def splink_dedupe(df: pd.DataFrame, settings: dict, blocking: list) -> pd.DataFrame: """Given a dataframe and config settings, return a deduplicated dataframe @@ -689,6 +683,9 @@ def splink_dedupe( ) deduped_df.rename(columns={"cluster_id": "unique_id"}, inplace=True) + deduped_df["duplicated"] = deduped_df["duplicated"].apply( + lambda x: x if isinstance(x, list) else [x] + ) convert_duplicates_to_dict(deduped_df) deduped_df.drop(columns=["duplicated"]) From 1db28399005bb2c5ee38e9b4bfd3c6f2d3fb77c2 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Tue, 5 Mar 2024 06:22:21 +0000 Subject: [PATCH 29/37] pre-commit fixes --- utils/linkage.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 484c1060..43febf41 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -341,7 +341,9 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: names[i] = determine_comma_role(names[i]) names[i] = names[i].replace(".", "").split(" ") - names[i] = [name_part for name_part in names[i] if name_part not in titles] + names[i] = [ + name_part for name_part in names[i] if name_part not in titles + ] names[i] = " ".join(names[i]) # one last check to 
remove any pieces that might add extra whitespace @@ -430,7 +432,9 @@ def name_rank(first_name: str, last_name: str) -> list: if first_name_result and isinstance(first_name_result, dict): first_name_data = first_name_result.get("first_name") if first_name_data and "rank" in first_name_data: - first_name_rank = first_name_data["rank"].get("United States", 0) + first_name_rank = first_name_data["rank"].get( + "United States", 0 + ) else: first_name_rank = None if isinstance(last_name, str): @@ -632,7 +636,9 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str: raise ValueError("Can not find Address Number") -def splink_dedupe(df: pd.DataFrame, settings: dict, blocking: list) -> pd.DataFrame: +def splink_dedupe( + df: pd.DataFrame, settings: dict, blocking: list +) -> pd.DataFrame: """Given a dataframe and config settings, return a deduplicated dataframe From 7ebe2a26e3f8c18ec3f9aebd8aeae9872cfa1050 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 6 Mar 2024 01:38:37 +0000 Subject: [PATCH 30/37] slight changes --- utils/linkage_pipeline.py | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index 499726e9..b9a87fe8 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -19,6 +19,7 @@ splink_dedupe, standardize_corp_names, ) +from utils.network import construct_network_graph def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: @@ -95,6 +96,18 @@ def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: axis=1, ) + individuals["sort_priority"] = ( + ~individuals["first_name"].isna() + & ~individuals["last_name"].isna() + & ~individuals["company"].isna() + ) * 2 + (~individuals["party"].isna()) + + individuals = individuals.sort_values( + by="sort_priority", ascending=False + ).drop(columns=["sort_priority"]) + + individuals["unique_id"] = individuals["id"] + return individuals @@ -112,6 +125,8 @@ def preprocess_organizations(organizations: pd.DataFrame) -> pd.DataFrame: .apply(standardize_corp_names) ) + organizations["unique_id"] = organizations["id"] + return organizations @@ -131,6 +146,11 @@ def preprocess_transactions(transactions: pd.DataFrame) -> pd.DataFrame: transactions["purpose"] = transactions["purpose"].str.upper() + deduped = pd.read_csv(BASE_FILEPATH / "output" / "deduplicated_UUIDs.csv") + transactions[["donor_id", "recipient_id"]] = transactions[ + ["donor_id", "recipient_id"] + ].replace(deduped) + return transactions @@ -149,17 +169,13 @@ def main(): individuals = preprocess_individuals(individuals) organizations = preprocess_organizations(organizations) - transactions = preprocess_transactions(transactions) individuals, organizations = classify_wrapper(individuals, organizations) individuals = deduplicate_perfect_matches(individuals) organizations = deduplicate_perfect_matches(organizations) - deduped = pd.read_csv(BASE_FILEPATH / "output" / "deduplicated_UUIDs.csv") - - individuals["unique_id"] = individuals["id"] - organizations["unique_id"] = organizations["id"] + transactions = preprocess_transactions(transactions) organizations = splink_dedupe( organizations, organizations_settings, organizations_blocking @@ -169,10 +185,6 @@ def main(): individuals, individuals_settings, individuals_blocking ) - transactions[["donor_id", "recipient_id"]] = transactions[ - ["donor_id", "recipient_id"] - ].replace(deduped) - cleaned_individuals_output_path = ( BASE_FILEPATH / "output" / 
"cleaned_individuals_table.csv" ) @@ -189,6 +201,10 @@ def main(): organizations.to_csv(cleaned_organizations_output_path, index=False) transactions.to_csv(cleaned_transactions_output_path, index=False) + construct_network_graph( + 2018, 2024, [individuals, organizations, transactions] + ) + if __name__ == "__main__": main() From 9a0352151fb451ddf1846e89d2948921e3bee149 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 6 Mar 2024 01:44:30 +0000 Subject: [PATCH 31/37] renaming file --- utils/{linkage_pipeline.py => linkage_and_network_pipeline.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename utils/{linkage_pipeline.py => linkage_and_network_pipeline.py} (99%) diff --git a/utils/linkage_pipeline.py b/utils/linkage_and_network_pipeline.py similarity index 99% rename from utils/linkage_pipeline.py rename to utils/linkage_and_network_pipeline.py index b9a87fe8..134d5f2d 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_and_network_pipeline.py @@ -202,7 +202,7 @@ def main(): transactions.to_csv(cleaned_transactions_output_path, index=False) construct_network_graph( - 2018, 2024, [individuals, organizations, transactions] + 2018, 2023, [individuals, organizations, transactions] ) From d4161f61db0df0e24bc3bd002999ceca6b0f0c70 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 6 Mar 2024 02:07:32 +0000 Subject: [PATCH 32/37] updating functions to latest versions --- utils/linkage.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 28f12dd4..5791da59 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -137,7 +137,12 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: >>> get_likely_name("Jane","","Doe, Jane, Elisabeth") 'Jane Elisabeth Doe' """ - # first ensure clean input by deleting spaces: + # first, convert any Nans to empty strings '' + first_name, last_name, full_name = [ + "" if x is np.NAN else x for x in [first_name, last_name, full_name] + ] + + # second, ensure clean input by deleting spaces: first_name, last_name, full_name = list( map(lambda x: x.lower().strip(), [first_name, last_name, full_name]) ) @@ -220,21 +225,23 @@ def get_street_from_address_line_1(address_line_1: str) -> str: def convert_duplicates_to_dict(df: pd.DataFrame) -> None: - """Saves to the "output" directory a file where each row represents a string - matching to another string + """For each uuid, maps it to all other uuids for which it has been deemed a + match. - Given a dataframe where each row contains one string in a column and a list - of strings in another column, the function maps each string in the list to - the single string. + Given a dataframe where the uuids of all rows deemed similar are stored in a + list and all but the first row of each paired uuid is dropped, this function + maps the matched uuids to a single uuid. Args: - A pandas dataframe + A pandas df containing a column called 'duplicated', where each row is a + list of all uuids deemed a match. In each list, all uuids but the first + have their rows already dropped. Returns None. However it outputs a file to the output directory, with 2 - columns. The first, which indicates the duplicated UUIDs, is labeled - 'duplicated_uuids', and the 2nd, which shows the uuids to which the - deduplicated entries match to, is labeled 'mapped_uuids'. + columns. The first lists all the uuids in df, and is labeled + 'original_uuids.' 
The 2nd shows the uuids to which each entry is mapped, + and is labeled 'mapped_uuid'. """ deduped_dict = {} for i in range(len(df)): @@ -245,7 +252,7 @@ def convert_duplicates_to_dict(df: pd.DataFrame) -> None: # now convert dictionary into a csv file deduped_df = pd.DataFrame.from_dict(deduped_dict, "index") deduped_df = deduped_df.reset_index().rename( - columns={"index": "duplicated_uuids", 0: "mapped_uuids"} + columns={"index": "original_uuids", 0: "mapped_uuid"} ) deduped_df.to_csv( repo_root / "output" / "deduplicated_UUIDs.csv", @@ -273,7 +280,9 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: # now find the duplicates along all columns but the ID new_df = ( - new_df.groupby(df.columns[1:].tolist(), dropna=False)["id"] + new_df.groupby(df.columns.difference(["id"]).tolist(), dropna=False)[ + "id" + ] .agg(list) .reset_index() .rename(columns={"id": "duplicated"}) From 45347e26a39b50a951a69e2c77c50d16b1fd0bfc Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 6 Mar 2024 02:09:14 +0000 Subject: [PATCH 33/37] slight changes to match function changes in linkage.py --- utils/linkage_and_network_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/linkage_and_network_pipeline.py b/utils/linkage_and_network_pipeline.py index 134d5f2d..bd6bcfbd 100644 --- a/utils/linkage_and_network_pipeline.py +++ b/utils/linkage_and_network_pipeline.py @@ -89,13 +89,13 @@ def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: individuals["full_name"] = individuals.apply( lambda row: get_likely_name( - row["first_name"] if pd.notnull(row["first_name"]) else "", - row["last_name"] if pd.notnull(row["last_name"]) else "", - row["full_name"] if pd.notnull(row["full_name"]) else "", + row["first_name"], row["last_name"], row["full_name"] ), axis=1, ) + # Ensure that columns with values are prioritized and appear first + # important for splink implementation individuals["sort_priority"] = ( ~individuals["first_name"].isna() & ~individuals["last_name"].isna() From ad2ed0f5e9a30bd5c246c01ab4e3d4550a8f3dc3 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 6 Mar 2024 18:36:38 +0000 Subject: [PATCH 34/37] slight changes --- Makefile | 5 ++-- notebooks/Test.ipynb | 39 --------------------------- setup.py | 2 +- utils/linkage_and_network_pipeline.py | 34 +++++++++++++---------- 4 files changed, 24 insertions(+), 56 deletions(-) delete mode 100644 notebooks/Test.ipynb diff --git a/Makefile b/Makefile index 07383c3c..48879489 100644 --- a/Makefile +++ b/Makefile @@ -33,5 +33,6 @@ run-notebooks: #running the linkage pipeline and creating the network graph #still waiting on linkage_pipeline completion to get this into final shape -output network_graph: all_individuals.csv all_organizations.csv all_transactions.csv - python linkage_pipeline.py +run-linkage-and-network-pipeline: + docker build -t $(project_image_name) -f Dockerfile $(current_abs_path) + docker run -v $(current_abs_path):/project -t $(project_image_name) python utils/linkage_and_network_pipeline.py \ No newline at end of file diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb deleted file mode 100644 index 5df942e1..00000000 --- a/notebooks/Test.ipynb +++ /dev/null @@ -1,39 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Example Notebook file demonstrating how to use the file structure\n", - "from utils.preprocess_util_lib_example import save_random_dataframe\n", - "from pathlib import Path\n",
"\n", - "save_random_dataframe(Path(\"../output\"), Path(\"test.csv\"))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/setup.py b/setup.py index 63ef672a..07404acd 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup setup( - name="2023-fall-clinic-climate-cabinet", + name="2024-winter-clinic-climate-cabinet", version="0.1.0", packages=find_packages( include=[ diff --git a/utils/linkage_and_network_pipeline.py b/utils/linkage_and_network_pipeline.py index bd6bcfbd..7e5f8cec 100644 --- a/utils/linkage_and_network_pipeline.py +++ b/utils/linkage_and_network_pipeline.py @@ -1,3 +1,4 @@ +import networkx as nx import pandas as pd from nameparser import HumanName @@ -19,7 +20,11 @@ splink_dedupe, standardize_corp_names, ) -from utils.network import construct_network_graph +from utils.network import ( + create_network_graph, + combine_datasets_for_network_graph, + construct_network_graph, +) def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: @@ -102,9 +107,9 @@ def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: & ~individuals["company"].isna() ) * 2 + (~individuals["party"].isna()) - individuals = individuals.sort_values( - by="sort_priority", ascending=False - ).drop(columns=["sort_priority"]) + individuals = individuals.sort_values(by="sort_priority", ascending=False).drop( + columns=["sort_priority"] + ) individuals["unique_id"] = individuals["id"] @@ -159,9 +164,7 @@ def main(): BASE_FILEPATH / "data" / "complete_organizations_table.csv" ) - individuals = pd.read_csv( - BASE_FILEPATH / "data" / "complete_individuals_table.csv" - ) + individuals = pd.read_csv(BASE_FILEPATH / "data" / "complete_individuals_table.csv") transactions = pd.read_csv( BASE_FILEPATH / "data" / "complete_transactions_table.csv" @@ -175,15 +178,13 @@ def main(): individuals = deduplicate_perfect_matches(individuals) organizations = deduplicate_perfect_matches(organizations) - transactions = preprocess_transactions(transactions) - organizations = splink_dedupe( organizations, organizations_settings, organizations_blocking ) - individuals = splink_dedupe( - individuals, individuals_settings, individuals_blocking - ) + individuals = splink_dedupe(individuals, individuals_settings, individuals_blocking) + + transactions = preprocess_transactions(transactions) cleaned_individuals_output_path = ( BASE_FILEPATH / "output" / "cleaned_individuals_table.csv" @@ -201,9 +202,14 @@ def main(): organizations.to_csv(cleaned_organizations_output_path, index=False) transactions.to_csv(cleaned_transactions_output_path, index=False) - construct_network_graph( - 2018, 2023, [individuals, organizations, transactions] + aggreg_df = combine_datasets_for_network_graph( + [individuals, organizations, transactions] ) + g = create_network_graph(aggreg_df) + g_output_path = BASE_FILEPATH / "output" / "g.gml" + nx.write_graphml(g, g_output_path) + + construct_network_graph(2018, 2023, [individuals, organizations, transactions]) if __name__ == "__main__": From 4b0de47ef040c04e6adc78409e272fb04e132129 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 6 
Mar 2024 18:48:41 +0000 Subject: [PATCH 35/37] readme changes --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a610a4ec..da6bba26 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ If you prefer to develop inside a container with VS Code then do the following s ### Record Linkage and Network Pipeline 1. Save the standardized tables "complete_individuals_table.csv", "complete_organizations_table.csv", and "complete_transactions_table.csv" (collected from the above pipeline or data from the project's Google Drive) in the following format: repo_root / "output" / "file" -2. **UPDATE:** Run the pipeline by calling ```make run-linkage-pipeline```. This pipeline will perform conservative record linkage, attempt to classify entities as neutral, fossil fuels, or clean energy, convert the standardized tables into a NetworkX Graph, and show an interactive network visual. +2. **UPDATE:** Run the pipeline by calling ```make run-linkage-and-network-pipeline```. This pipeline will perform conservative record linkage, attempt to classify entities as neutral, fossil fuels, or clean energy, convert the standardized tables into a NetworkX Graph, and show an interactive network visual. 3. The pipeline will output the deduplicated tables saved as "cleaned_individuals_table.csv", "cleaned_organizations_table.csv", and "cleaned_transactions_table.csv". A mapping file, "deduplicated_UUIDs", tracks the UUIDs designated as duplicates. The pipeline will also output "Network Graph Node Data", which is the NetworkX Graph object converted into an adjacency list. ## Repository Structure From 0c7902394bce4864e984eab13305ba99855a86dd Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 6 Mar 2024 18:50:36 +0000 Subject: [PATCH 36/37] data/ readme changes --- data/README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/data/README.md b/data/README.md index df9336b7..5326bff8 100644 --- a/data/README.md +++ b/data/README.md @@ -2,12 +2,6 @@ This directory contains information for use in this project. -## Makefile and Final Pipeline -- This folder is empty by default. In order to run the Makefile, download the complete_data.zip file from the Climate Cabinet Data Clinic Google Drive using this link: https://drive.google.com/file/d/1zbjt7iBU0NAWSBcUyEsjvuumn3VgI4z9/view?usp=sharing - - - After downloading this .zip, unzip it to find three files: complete_individuals.csv, complete_organizations.csv, and complete_transactions.csv. Upload these files into the data folder and ensure that their names are correct. Once they are in place, you may run the Makefile.
- - ## Arizona Campaign Finance Data ### Summary From 48470c21eb14ca164516f241ab5d2646f008a318 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 6 Mar 2024 18:51:15 +0000 Subject: [PATCH 37/37] pre-commit formatting changes --- utils/linkage_and_network_pipeline.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/utils/linkage_and_network_pipeline.py b/utils/linkage_and_network_pipeline.py index 7e5f8cec..86e0ab62 100644 --- a/utils/linkage_and_network_pipeline.py +++ b/utils/linkage_and_network_pipeline.py @@ -21,9 +21,9 @@ standardize_corp_names, ) from utils.network import ( - create_network_graph, combine_datasets_for_network_graph, construct_network_graph, + create_network_graph, ) @@ -107,9 +107,9 @@ def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: & ~individuals["company"].isna() ) * 2 + (~individuals["party"].isna()) - individuals = individuals.sort_values(by="sort_priority", ascending=False).drop( - columns=["sort_priority"] - ) + individuals = individuals.sort_values( + by="sort_priority", ascending=False + ).drop(columns=["sort_priority"]) individuals["unique_id"] = individuals["id"] @@ -164,7 +164,9 @@ def main(): BASE_FILEPATH / "data" / "complete_organizations_table.csv" ) - individuals = pd.read_csv(BASE_FILEPATH / "data" / "complete_individuals_table.csv") + individuals = pd.read_csv( + BASE_FILEPATH / "data" / "complete_individuals_table.csv" + ) transactions = pd.read_csv( BASE_FILEPATH / "data" / "complete_transactions_table.csv" @@ -182,7 +184,9 @@ def main(): organizations, organizations_settings, organizations_blocking ) - individuals = splink_dedupe(individuals, individuals_settings, individuals_blocking) + individuals = splink_dedupe( + individuals, individuals_settings, individuals_blocking + ) transactions = preprocess_transactions(transactions) @@ -209,7 +213,9 @@ def main(): g_output_path = BASE_FILEPATH / "output" / "g.gml" nx.write_graphml(g, g_output_path) - construct_network_graph(2018, 2023, [individuals, organizations, transactions]) + construct_network_graph( + 2018, 2023, [individuals, organizations, transactions] + ) if __name__ == "__main__":
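To close, a sketch of the shape of the configuration that `splink_dedupe(df, settings, blocking)` consumes throughout this series. The project's actual `individuals_settings` and `individuals_blocking` live in `utils/constants.py` and are not shown in these patches, so the comparison columns, threshold, and blocking rule below are assumptions chosen only to illustrate a splink "dedupe_only" setup, not the project's real values:

```python
# Hypothetical sketch of the settings/blocking objects passed to
# splink_dedupe. The real values live in utils/constants.py; the
# comparison columns and threshold here are illustrative assumptions.
import splink.duckdb.comparison_library as cl

individuals_settings = {
    "link_type": "dedupe_only",
    "unique_id_column_name": "unique_id",
    "comparisons": [
        cl.exact_match("first_name"),
        cl.jaro_winkler_at_thresholds("last_name", [0.9]),
    ],
}

# splink scores only candidate pairs that share a block, so blocking
# rules trade recall for run time.
individuals_blocking = ["l.last_name = r.last_name"]
```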