From 7dc5b7054cd6f525c6c542d12bb7bb4b68487d11 Mon Sep 17 00:00:00 2001
From: Adil Kassim
Date: Thu, 8 Feb 2024 07:40:57 +0000
Subject: [PATCH 01/37] updating requirements.txt

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index db05b66f..0b21babc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,4 @@ Requests==2.31.0
 setuptools==68.0.0
 textdistance==4.6.1
 usaddress==0.5.4
+nameparser==1.1.3
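nameparser, pinned above, is the library the next patch leans on to split free-text names. A minimal sketch of the two calls the pipeline uses (the sample name is illustrative):

    from nameparser import HumanName

    name = HumanName("Dr. Jane Q. Public, Jr.")
    print(name.first, name.last)  # Jane Public
    print(name.as_dict()["first"])  # Jane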
From a3310a1871b0415140a4dad8bd41ab46d47fa97b Mon Sep 17 00:00:00 2001
From: Adil Kassim
Date: Thu, 8 Feb 2024 07:41:35 +0000
Subject: [PATCH 02/37] adding pre_process pipeline function

---
 utils/linkage.py | 144 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 144 insertions(+)

diff --git a/utils/linkage.py b/utils/linkage.py
index ac11a5ac..97b0ad6e 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -1,5 +1,10 @@
+import re
+from typing import Tuple
+
+import pandas as pd
 import textdistance as td
 import usaddress
+from nameparser import HumanName
 
 from utils.constants import COMPANY_TYPES
 
@@ -194,3 +199,142 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str:
         elif address_line_1_components[i][1] == "USPSBoxID":
             return address_line_1_components[i][0]
     raise ValueError("Can not find Address Number")
+
+
+def cleaning_company_column(company_entry: str) -> str:
+    """
+    Given a string, check if it contains a variation of self employed, unemployed,
+    or retired and return the standardized version.
+
+    Args:
+        company: string of inputted company names
+    Returns:
+        standardized for retired, self employed, and unemployed,
+        or original string if no match or empty string
+
+    >>> cleaning_company_column("Retireed")
+    'Retired'
+    >>> cleaning_company_column("self")
+    'Self Employed'
+    >>> cleaning_company_column("None")
+    'Unemployed'
+    >>> cleaning_company_column("N/A")
+    'Unemployed'
+    """
+
+    if not company_entry:
+        return company_entry
+
+    company_edited = company_entry.lower()
+
+    if company_edited == "n/a":
+        return "Unemployed"
+
+    company_edited = re.sub(r"[^\w\s]", "", company_edited)
+
+    if (
+        company_edited == "retired"
+        or company_edited == "retiree"
+        or company_edited == "retire"
+        or "retiree" in company_edited
+    ):
+        return "Retired"
+
+    elif (
+        "self employe" in company_edited
+        or "freelance" in company_edited
+        or company_edited == "self"
+        or company_edited == "independent contractor"
+    ):
+        return "Self Employed"
+    elif (
+        "unemploye" in company_edited
+        or company_edited == "none"
+        or company_edited == "not employed"
+        or company_edited == "nan"
+    ):
+        return "Unemployed"
+
+    else:
+        return company_edited
+
+
+def preprocess_pipeline(
+    individuals: pd.DataFrame,
+    Address: str,
+    organizations: pd.DataFrame,
+    transactions: pd.DataFrame,
+) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    Preprocesses data for record linkage
+
+    Args:
+        Individuals: dataframe of individual contributions
+        Address: column name of address
+        Organizations: dataframe of organization contributions
+        Transactions: dataframe of transactions
+    Returns:
+        preprocessed tuple of dataframes
+        first element is the individuals dataframe,
+        second element is the organizations dataframe,
+        third element is the transactions dataframe
+    """
+    # Preprocess organizations dataframe
+    organizations["name"] = (
+        organizations["name"].astype(str).apply(standardize_corp_names)
+    )
+
+    # Preprocess individuals dataframe
+    if "Unnamed: 0" in individuals.columns:
+        individuals.drop(columns="Unnamed: 0", inplace=True)
+
+    individuals = individuals.astype(
+        {"first_name": str, "last_name": str, "full_name": str, "company": str}
+    )
+
+    # Standardize company names in individuals dataframe
+    individuals["company"] = individuals["company"].apply(
+        standardize_corp_names
+    )
+    individuals["company"] = individuals["company"].apply(
+        cleaning_company_column
+    )
+
+    # Address functions, assuming address column is named 'address'
+    individuals["Address Line 1"] = individuals[Address].apply(
+        get_address_line_1_from_full_address
+    )
+    individuals["Street Name"] = individuals["Address Line 1"].apply(
+        get_street_from_address_line_1
+    )
+    individuals["Address Number"] = individuals["Address Line 1"].apply(
+        get_address_number_from_address_line_1
+    )
+
+    # Check if first name or last names are empty, if so, extract from full name column
+    individuals["full_name"] = individuals["full_name"].astype(str)
+    if individuals["first_name"].isnull().any():
+        name = (
+            individuals["full_name"]
+            .apply(HumanName)
+            .apply(lambda x: x.as_dict())
+        )
+        first_name = name.apply(lambda x: x["first"])
+        individuals["first_name"] = first_name
+
+    if individuals["last_name"].isnull().any():
+        name = (
+            individuals["full_name"]
+            .apply(HumanName)
+            .apply(lambda x: x.as_dict())
+        )
+        last_name = name.apply(lambda x: x["last"])
+        individuals["last_name"] = last_name
+
+    # Transactions
+    if "Unnamed: 0" in transactions.columns:
+        transactions.drop(columns="Unnamed: 0", inplace=True)
+
+    transactions["purpose"] = transactions["purpose"].str.upper()
+
+    return individuals, organizations, transactions

From c3c8defec982adfea07ebba96b735f6cfd5ec29e Mon Sep 17 00:00:00 2001
From: Adil Kassim
Date: Mon, 19 Feb 2024 16:09:46 +0000
Subject: [PATCH 03/37] preprocess file and function initial commit

---
 utils/preprocess.py | 81 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 utils/preprocess.py

diff --git a/utils/preprocess.py b/utils/preprocess.py
new file mode 100644
index 00000000..2831996a
--- /dev/null
+++ b/utils/preprocess.py
@@ -0,0 +1,81 @@
+from typing import Tuple
+
+import pandas as pd
+from nameparser import HumanName
+
+from utils.linkage import (
+    cleaning_company_column,
+    get_address_line_1_from_full_address,
+    get_address_number_from_address_line_1,
+    get_street_from_address_line_1,
+    standardize_corp_names,
+)
+
+
+def preprocess_pipeline(
+    individuals: pd.DataFrame,
+    organizations: pd.DataFrame,
+    transactions: pd.DataFrame,
+) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    Preprocesses data for record linkage
+
+    Args:
+        Individuals: dataframe of individual contributions
+        Organizations: dataframe of organization contributions
+        Transactions: dataframe of transactions
+    Returns:
+        preprocessed tuple of dataframes
+        first element is the individuals dataframe,
+        second element is the organizations dataframe,
+        third element is the transactions dataframe
+    """
+    # Preprocess organizations dataframe
+    organizations["name"] = (
+        organizations["name"].astype(str, skipna=True).apply(standardize_corp_names)
+    )
+
+    # Preprocess individuals dataframe
+    if "Unnamed: 0" in individuals.columns:
+        individuals.drop(columns="Unnamed: 0", inplace=True)
+
+    individuals = individuals.astype(
+        {"first_name": str, "last_name": str, "full_name": str, "company": str}
+    )
+
+    # Standardize company names in individuals dataframe
+    individuals["company"] = individuals["company"].apply(standardize_corp_names)
+    individuals["company"] =
individuals["company"].apply(cleaning_company_column) + + # Address functions, assuming address column is named 'address' + individuals["Address Line 1"] = individuals["Address"].apply( + get_address_line_1_from_full_address + ) + individuals["Street Name"] = individuals["Address Line 1"].apply( + get_street_from_address_line_1 + ) + individuals["Address Number"] = individuals["Address Line 1"].apply( + get_address_number_from_address_line_1 + ) + + # Check if first name or last names are empty, if so, extract from full name column + individuals["full_name"] = individuals["full_name"].astype(str)[ + individuals["full_name"].notnull() + ] + if individuals["first_name"].isnull().any(): + name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) + first_name = name.apply(lambda x: x["first"]) + individuals["first_name"] = first_name + + if individuals["last_name"].isnull().any(): + name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) + last_name = name.apply(lambda x: x["last"]) + individuals["last_name"] = last_name + + # Transactions + if "Unnamed: 0" in transactions.columns: + transactions.drop(columns="Unnamed: 0", inplace=True) + + transactions["purpose"] = transactions["purpose"].str.upper() + + return individuals, organizations, transactions From cccc7cc2665793e4777974b6464d29e9b594feb5 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 19 Feb 2024 16:21:54 +0000 Subject: [PATCH 04/37] slight edits --- utils/preprocess.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/utils/preprocess.py b/utils/preprocess.py index 2831996a..55a99810 100644 --- a/utils/preprocess.py +++ b/utils/preprocess.py @@ -32,7 +32,9 @@ def preprocess_pipeline( """ # Preprocess organizations dataframe organizations["name"] = ( - organizations["name"].astype(str, skipna=True).apply(standardize_corp_names) + organizations["name"] + .astype(str, skipna=True) + .apply(standardize_corp_names) ) # Preprocess individuals dataframe @@ -44,8 +46,12 @@ def preprocess_pipeline( ) # Standardize company names in individuals dataframe - individuals["company"] = individuals["company"].apply(standardize_corp_names) - individuals["company"] = individuals["company"].apply(cleaning_company_column) + individuals["company"] = individuals["company"].apply( + standardize_corp_names + ) + individuals["company"] = individuals["company"].apply( + cleaning_company_column + ) # Address functions, assuming address column is named 'address' individuals["Address Line 1"] = individuals["Address"].apply( @@ -63,12 +69,20 @@ def preprocess_pipeline( individuals["full_name"].notnull() ] if individuals["first_name"].isnull().any(): - name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) + name = ( + individuals["full_name"] + .apply(HumanName) + .apply(lambda x: x.as_dict()) + ) first_name = name.apply(lambda x: x["first"]) individuals["first_name"] = first_name if individuals["last_name"].isnull().any(): - name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) + name = ( + individuals["full_name"] + .apply(HumanName) + .apply(lambda x: x.as_dict()) + ) last_name = name.apply(lambda x: x["last"]) individuals["last_name"] = last_name From 57c6070bb8c85743a5ebb5b2584db5427b32a35a Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 19 Feb 2024 16:23:12 +0000 Subject: [PATCH 05/37] removing preprocess function from linkage.py --- utils/linkage.py | 84 ------------------------------------------------ 1 file 
changed, 84 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 97b0ad6e..2c80939a 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -1,10 +1,7 @@ import re -from typing import Tuple -import pandas as pd import textdistance as td import usaddress -from nameparser import HumanName from utils.constants import COMPANY_TYPES @@ -257,84 +254,3 @@ def cleaning_company_column(company_entry: str) -> str: else: return company_edited - - -def preprocess_pipeline( - individuals: pd.DataFrame, - Address: str, - organizations: pd.DataFrame, - transactions: pd.DataFrame, -) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ - Preprocesses data for record linkage - - Args: - Individuals: dataframe of individual contributions - Address: column name of address - Organizations: dataframe of organization contributions - Transactions: dataframe of transactions - Returns: - preprocessed tuple of dataframes - first element is the individuals dataframe, - second element is the organizations dataframe, - third element is the transactions dataframe - """ - # Preprocess organizations dataframe - organizations["name"] = ( - organizations["name"].astype(str).apply(standardize_corp_names) - ) - - # Preprocess individuals dataframe - if "Unnamed: 0" in individuals.columns: - individuals.drop(columns="Unnamed: 0", inplace=True) - - individuals = individuals.astype( - {"first_name": str, "last_name": str, "full_name": str, "company": str} - ) - - # Standardize company names in individuals dataframe - individuals["company"] = individuals["company"].apply( - standardize_corp_names - ) - individuals["company"] = individuals["company"].apply( - cleaning_company_column - ) - - # Address functions, assuming address column is named 'address' - individuals["Address Line 1"] = individuals[Address].apply( - get_address_line_1_from_full_address - ) - individuals["Street Name"] = individuals["Address Line 1"].apply( - get_street_from_address_line_1 - ) - individuals["Address Number"] = individuals["Address Line 1"].apply( - get_address_number_from_address_line_1 - ) - - # Check if first name or last names are empty, if so, extract from full name column - individuals["full_name"] = individuals["full_name"].astype(str) - if individuals["first_name"].isnull().any(): - name = ( - individuals["full_name"] - .apply(HumanName) - .apply(lambda x: x.as_dict()) - ) - first_name = name.apply(lambda x: x["first"]) - individuals["first_name"] = first_name - - if individuals["last_name"].isnull().any(): - name = ( - individuals["full_name"] - .apply(HumanName) - .apply(lambda x: x.as_dict()) - ) - last_name = name.apply(lambda x: x["last"]) - individuals["last_name"] = last_name - - # Transactions - if "Unnamed: 0" in transactions.columns: - transactions.drop(columns="Unnamed: 0", inplace=True) - - transactions["purpose"] = transactions["purpose"].str.upper() - - return individuals, organizations, transactions From 277663672fcd1faf2cee83f51096d39e71dedbbe Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 19 Feb 2024 16:24:17 +0000 Subject: [PATCH 06/37] slight changes --- utils/preprocess.py | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/utils/preprocess.py b/utils/preprocess.py index 55a99810..f3755eec 100644 --- a/utils/preprocess.py +++ b/utils/preprocess.py @@ -32,9 +32,7 @@ def preprocess_pipeline( """ # Preprocess organizations dataframe organizations["name"] = ( - organizations["name"] - .astype(str, skipna=True) - .apply(standardize_corp_names) + 
organizations["name"].astype(str, skipna=True).apply(standardize_corp_names) ) # Preprocess individuals dataframe @@ -46,14 +44,10 @@ def preprocess_pipeline( ) # Standardize company names in individuals dataframe - individuals["company"] = individuals["company"].apply( - standardize_corp_names - ) - individuals["company"] = individuals["company"].apply( - cleaning_company_column - ) + individuals["company"] = individuals["company"].apply(standardize_corp_names) + individuals["company"] = individuals["company"].apply(cleaning_company_column) - # Address functions, assuming address column is named 'address' + # Address functions, assuming address column is named 'Address' individuals["Address Line 1"] = individuals["Address"].apply( get_address_line_1_from_full_address ) @@ -69,20 +63,12 @@ def preprocess_pipeline( individuals["full_name"].notnull() ] if individuals["first_name"].isnull().any(): - name = ( - individuals["full_name"] - .apply(HumanName) - .apply(lambda x: x.as_dict()) - ) + name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) first_name = name.apply(lambda x: x["first"]) individuals["first_name"] = first_name if individuals["last_name"].isnull().any(): - name = ( - individuals["full_name"] - .apply(HumanName) - .apply(lambda x: x.as_dict()) - ) + name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) last_name = name.apply(lambda x: x["last"]) individuals["last_name"] = last_name From 1ea09b4034a687c458dad3d5cbe573c24b8bf59b Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 19 Feb 2024 13:50:39 -0600 Subject: [PATCH 07/37] Renaming File --- utils/{preprocess.py => linkage_pipeline.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename utils/{preprocess.py => linkage_pipeline.py} (100%) diff --git a/utils/preprocess.py b/utils/linkage_pipeline.py similarity index 100% rename from utils/preprocess.py rename to utils/linkage_pipeline.py From 21af2c951ae837a94a7603af71bdb267349b0f4d Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Thu, 22 Feb 2024 07:38:04 +0000 Subject: [PATCH 08/37] updates --- utils/linkage_pipeline.py | 87 ++++++++++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 16 deletions(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index f3755eec..0f7be5e5 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -3,10 +3,13 @@ import pandas as pd from nameparser import HumanName +from utils.constants import BASE_FILEPATH from utils.linkage import ( cleaning_company_column, + deduplicate_perfect_matches, get_address_line_1_from_full_address, get_address_number_from_address_line_1, + get_likely_name, get_street_from_address_line_1, standardize_corp_names, ) @@ -32,50 +35,102 @@ def preprocess_pipeline( """ # Preprocess organizations dataframe organizations["name"] = ( - organizations["name"].astype(str, skipna=True).apply(standardize_corp_names) + organizations["name"].astype(str).apply(standardize_corp_names) ) + if "Unnamed: 0" in organizations.columns: + organizations.drop(columns="Unnamed: 0", inplace=True) # Preprocess individuals dataframe if "Unnamed: 0" in individuals.columns: individuals.drop(columns="Unnamed: 0", inplace=True) individuals = individuals.astype( - {"first_name": str, "last_name": str, "full_name": str, "company": str} + { + "first_name": str, + "last_name": str, + "full_name": str, + "company": "string", + } ) # Standardize company names in individuals dataframe - individuals["company"] = 
individuals["company"].apply(standardize_corp_names) - individuals["company"] = individuals["company"].apply(cleaning_company_column) - - # Address functions, assuming address column is named 'Address' - individuals["Address Line 1"] = individuals["Address"].apply( - get_address_line_1_from_full_address - ) - individuals["Street Name"] = individuals["Address Line 1"].apply( - get_street_from_address_line_1 + individuals["company"] = ( + individuals["company"] + .loc[individuals["company"].notnull()] + .apply(standardize_corp_names) ) - individuals["Address Number"] = individuals["Address Line 1"].apply( - get_address_number_from_address_line_1 + individuals["company"] = ( + individuals["company"] + .loc[individuals["company"].notnull()] + .apply(cleaning_company_column) ) + # Address functions, assuming address column is named 'Address' + # If there is an "Address" column in the first place + if "Address" in individuals.columns: + individuals["Address"] = individuals["Address"].astype(str) + individuals["Address Line 1"] = individuals["Address"].apply( + get_address_line_1_from_full_address + ) + individuals["Street Name"] = individuals["Address Line 1"].apply( + get_street_from_address_line_1 + ) + individuals["Address Number"] = individuals["Address Line 1"].apply( + get_address_number_from_address_line_1 + ) + # Check if first name or last names are empty, if so, extract from full name column individuals["full_name"] = individuals["full_name"].astype(str)[ individuals["full_name"].notnull() ] if individuals["first_name"].isnull().any(): - name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) + name = ( + individuals["full_name"] + .apply(HumanName) + .apply(lambda x: x.as_dict()) + ) first_name = name.apply(lambda x: x["first"]) individuals["first_name"] = first_name if individuals["last_name"].isnull().any(): - name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) + name = ( + individuals["full_name"] + .apply(HumanName) + .apply(lambda x: x.as_dict()) + ) last_name = name.apply(lambda x: x["last"]) individuals["last_name"] = last_name - # Transactions + individuals["full_name"] = individuals.apply( + lambda row: get_likely_name( + row["first_name"], row["last_name"], row["full_name"] + ), + axis=1, + ) + if "Unnamed: 0" in transactions.columns: transactions.drop(columns="Unnamed: 0", inplace=True) transactions["purpose"] = transactions["purpose"].str.upper() return individuals, organizations, transactions + + +organizations = pd.read_csv( + BASE_FILEPATH / "output" / "complete_organizations_table.csv" +) + +individuals = pd.read_csv( + BASE_FILEPATH / "output" / "complete_individuals_table.csv" +) + +transactions = pd.read_csv( + BASE_FILEPATH / "output" / "complete_transactions_table.csv" +) + +individuals, organizations, transactions = preprocess_pipeline( + individuals, organizations, transactions +) + +individuals = deduplicate_perfect_matches(individuals) +organizations = deduplicate_perfect_matches(organizations) From 4d7bdfb9cfe95b7c0c8e98314b2ca2977fb8c266 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Thu, 22 Feb 2024 07:46:05 +0000 Subject: [PATCH 09/37] adding output csv --- utils/linkage_pipeline.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index 0f7be5e5..b5e4d451 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -134,3 +134,19 @@ def preprocess_pipeline( individuals = deduplicate_perfect_matches(individuals) 
organizations = deduplicate_perfect_matches(organizations)
+
+processed_individuals_output_path = (
+    BASE_FILEPATH / "output" / "processed_individuals_table.csv"
+)
+
+processed_organizations_output_path = (
+    BASE_FILEPATH / "output" / "processed_organizations_table.csv"
+)
+
+processed_transactions_output_path = (
+    BASE_FILEPATH / "output" / "processed_transactions_table.csv"
+)
+
+individuals.to_csv(processed_individuals_output_path)
+organizations.to_csv(processed_organizations_output_path)
+transactions.to_csv(processed_transactions_output_path)

From 42ca58e75333f4d91836b7dc64134adc0247810b Mon Sep 17 00:00:00 2001
From: Adil Kassim
Date: Wed, 28 Feb 2024 04:13:59 +0000
Subject: [PATCH 10/37] pipeline changes

---
 utils/linkage_pipeline.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py
index b5e4d451..613d3244 100644
--- a/utils/linkage_pipeline.py
+++ b/utils/linkage_pipeline.py
@@ -133,7 +133,6 @@ def preprocess_pipeline(
 )
 
 individuals = deduplicate_perfect_matches(individuals)
-organizations = deduplicate_perfect_matches(organizations)

From 77bc2b3e4ac276b2d290a092d8f1ae51dd6a41a4 Mon Sep 17 00:00:00 2001
From: Adil Kassim
Date: Wed, 28 Feb 2024 04:19:36 +0000
Subject: [PATCH 11/37] adding removed files

---
 utils/classify.py           |  75 +++++++++++++++++++++++++
 utils/tests/test_linkage.py | 107 ++++++++++++++++++++++++++++++++++++
 2 files changed, 182 insertions(+)
 create mode 100644 utils/classify.py
 create mode 100644 utils/tests/test_linkage.py

diff --git a/utils/classify.py b/utils/classify.py
new file mode 100644
index 00000000..db574ace
--- /dev/null
+++ b/utils/classify.py
@@ -0,0 +1,75 @@
+import pandas as pd
+
+# we want to run down a list of people and, hopefully, their addresses, plus a list of
+# corporations, groups, etc, and classify them, basically just looking for matches
+
+# do we want to just input all the names/people (there's not many, less than 200
+# for sure), give a string similarity match score, and extract the top ten for
+# manual review? this should give us a feeling for how to set our threshold
+# we might also, once we have all the data, buckle down and just classify
+# some of them manually
+
+inds_list = []
+
+# a list of individual names
+
+
+def similarity_calculator(
+    df: pd.DataFrame, subject: str, n: int, comparison_func
+) -> pd.DataFrame:
+    """Find best matches to a subject name in a pandas dataframe
+
+    For a given individual or organization, the subject, we search through the
+    'name' column of a dataframe, select the n highest matches according to a
+    selected comparison function, and return those as a dataframe. This is meant
+    to be used manually to search for matches. For quick automated processing, see
+    automated_classifier().
+ + Note that the comparison function must take in two inputs, both strings, and + output a percentage match + """ + + similarities_df = df.copy() + + similarities = similarities_df["name"].apply( + lambda x: comparison_func(x, subject) + ) + + similarities_df["similarities"] = similarities + + top_n_matches = similarities_df.sort_values( + by=["similarities"], ascending=False + )[0:n] + + return top_n_matches + + +def automated_classifier( + df: pd.DataFrame, subjects_dict: dict, threshold: float, comparison_func +): + """Using similarity_calculator, classify entities automatically + + Feeding a dictionary of names and the associated statuses, we compare + the string matches and, if they exceed a certain threshold, classify + them as belonging to some group specified in the subjects dictionary. + """ + + similarities_df = df.copy() + + for subject in subjects_dict: + similarities = similarities_df["name"].apply( + lambda x, sub=subject: comparison_func(x, sub) + ) + matches = similarities >= threshold + + status = subjects_dict[subject] + + similarities_df["classification"] = pd.Series(matches).apply( + lambda x, stat=status: stat if x else "neutral" + ) + + return similarities_df + + # we can use the indices and/or select manually, just add a new + # column to the subjects table + # that marks fossil fuels, green energy, or neither diff --git a/utils/tests/test_linkage.py b/utils/tests/test_linkage.py new file mode 100644 index 00000000..3695a399 --- /dev/null +++ b/utils/tests/test_linkage.py @@ -0,0 +1,107 @@ +import numpy as np +import pandas as pd +import pytest + +from utils.linkage import ( + calculate_row_similarity, + calculate_string_similarity, + row_matches, +) + +# import pytest + + +# creating a test for calculate_row_similarity and row_matches + +# to put in data: +d = { + "name": ["bob von rosevich", "anantarya smith", "bob j vonrosevich"], + "address": [ + "3 Blue Drive, Chicago", + "4 Blue Drive, Chicago", + "8 Fancy Way, Chicago", + ], +} +test_df = pd.DataFrame(data=d) + + +@pytest.fixture +def row_similarity_scen_1(): + return test_df + + +@pytest.fixture +def row_similarity_scen_2(): + return test_df + + +def test_row_similarity_scen_1(row_similarity_scen_1): + wrong = calculate_row_similarity( + row_similarity_scen_1.iloc[[0]], + row_similarity_scen_1.iloc[[1]], + np.array([0.8, 0.2]), + calculate_string_similarity, + ) + right = calculate_row_similarity( + row_similarity_scen_1.iloc[[0]], + row_similarity_scen_1.iloc[[2]], + np.array([0.8, 0.2]), + calculate_string_similarity, + ) + + assert right > wrong + + +def test_row_similarity_scen_2(row_similarity_scen_2): + wrong = calculate_row_similarity( + row_similarity_scen_2.iloc[[0]], + row_similarity_scen_2.iloc[[1]], + np.array([0.2, 0.8]), + calculate_string_similarity, + ) + right = calculate_row_similarity( + row_similarity_scen_2.iloc[[0]], + row_similarity_scen_2.iloc[[2]], + np.array([0.2, 0.8]), + calculate_string_similarity, + ) + + assert right < wrong + + +d2 = { + "name": [ + "bob von rosevich", + "anantarya smith", + "bob j vonrosevich", + "missy elliot", + "mr johnson", + "quarantin directino", + "missy eliot", + "joseph johnson", + ], + "address": [ + "3 Blue Drive, Chicago", + "4 Blue Drive, Chicago", + "8 Fancy Way, Chicago", + "8 Fancy Way, Evanston", + "17 Regular Road, Chicago", + "42 Hollywood Boulevard, Chicago", + "8 Fancy Way, Evanston", + "17 Regular Road, Chicago", + ], +} +test_df2 = pd.DataFrame(data=d2) + + +@pytest.fixture +def row_match_scen1(): + return test_df2 + + +def 
test_row_matches(row_match_scen1): + res = row_matches( + row_match_scen1, np.array([0.8, 0.2]), 0.9, calculate_string_similarity + ) + + assert res == {0: [2], 1: [], 2: [], 3: [6], 4: [], 5: [], 6: [], 7: []} From 3c619375b2a8a7a68a72fea4a97a2fee30360043 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 28 Feb 2024 15:54:25 +0000 Subject: [PATCH 12/37] proper updates --- utils/linkage_pipeline.py | 146 +++++++++++++++++++++----------------- 1 file changed, 82 insertions(+), 64 deletions(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index 613d3244..1f565446 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -1,5 +1,3 @@ -from typing import Tuple - import pandas as pd from nameparser import HumanName @@ -15,45 +13,29 @@ ) -def preprocess_pipeline( - individuals: pd.DataFrame, - organizations: pd.DataFrame, - transactions: pd.DataFrame, -) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: +def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: """ - Preprocesses data for record linkage + Given a dataframe of individual donors, preprocesses the data, + and return a cleaned dataframe. Args: - Individuals: dataframe of individual contributions - Organizations: dataframe of organization contributions - Transactions: dataframe of transactions + individuals: dataframe of individual contributions + Returns: - preprocessed tuple of dataframes - first element is the individuals dataframe, - second element is the organizations dataframe, - third element is the transactions dataframe + cleaned dataframe of individuals """ - # Preprocess organizations dataframe - organizations["name"] = ( - organizations["name"].astype(str).apply(standardize_corp_names) - ) - if "Unnamed: 0" in organizations.columns: - organizations.drop(columns="Unnamed: 0", inplace=True) - - # Preprocess individuals dataframe if "Unnamed: 0" in individuals.columns: individuals.drop(columns="Unnamed: 0", inplace=True) individuals = individuals.astype( { - "first_name": str, - "last_name": str, - "full_name": str, + "first_name": "string", + "last_name": "string", + "full_name": "string", "company": "string", } ) - # Standardize company names in individuals dataframe individuals["company"] = ( individuals["company"] .loc[individuals["company"].notnull()] @@ -66,7 +48,6 @@ def preprocess_pipeline( ) # Address functions, assuming address column is named 'Address' - # If there is an "Address" column in the first place if "Address" in individuals.columns: individuals["Address"] = individuals["Address"].astype(str) individuals["Address Line 1"] = individuals["Address"].apply( @@ -84,20 +65,12 @@ def preprocess_pipeline( individuals["full_name"].notnull() ] if individuals["first_name"].isnull().any(): - name = ( - individuals["full_name"] - .apply(HumanName) - .apply(lambda x: x.as_dict()) - ) + name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) first_name = name.apply(lambda x: x["first"]) individuals["first_name"] = first_name if individuals["last_name"].isnull().any(): - name = ( - individuals["full_name"] - .apply(HumanName) - .apply(lambda x: x.as_dict()) - ) + name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) last_name = name.apply(lambda x: x["last"]) individuals["last_name"] = last_name @@ -108,44 +81,89 @@ def preprocess_pipeline( axis=1, ) + return individuals + + +def preprocess_organizations(organizations: pd.DataFrame) -> pd.DataFrame: + """ + Given a dataframe of organization donors, preprocesses the data, 
+ and return a cleaned dataframe. + """ + if "Unnamed: 0" in organizations.columns: + organizations.drop(columns="Unnamed: 0", inplace=True) + + organizations["name"] = ( + organizations["name"] + .loc[organizations["name"].notnull()] + .apply(standardize_corp_names) + ) + + return organizations + + +def preprocess_transactions(transactions: pd.DataFrame) -> pd.DataFrame: + """ + Given a dataframe of transactions, preprocesses the data, + and return a cleaned dataframe. + + Args: + transactions: dataframe of transactions + + Returns: + cleaned dataframe of transactions + """ if "Unnamed: 0" in transactions.columns: transactions.drop(columns="Unnamed: 0", inplace=True) transactions["purpose"] = transactions["purpose"].str.upper() - return individuals, organizations, transactions + return transactions -organizations = pd.read_csv( - BASE_FILEPATH / "output" / "complete_organizations_table.csv" -) +def main(): + organizations = pd.read_csv( + BASE_FILEPATH / "output" / "complete_organizations_table.csv" + ) -individuals = pd.read_csv( - BASE_FILEPATH / "output" / "complete_individuals_table.csv" -) + individuals = pd.read_csv( + BASE_FILEPATH / "output" / "complete_individuals_table.csv" + ) -transactions = pd.read_csv( - BASE_FILEPATH / "output" / "complete_transactions_table.csv" -) + transactions = pd.read_csv( + BASE_FILEPATH / "output" / "complete_transactions_table.csv" + ) -individuals, organizations, transactions = preprocess_pipeline( - individuals, organizations, transactions -) + individuals = preprocess_individuals(individuals) + organizations = preprocess_organizations(organizations) + transactions = preprocess_transactions(transactions) -individuals = deduplicate_perfect_matches(individuals) + # Deduplicates perfect matches and creates a new csv file + # in output titled "deduplicated_UUIDs.csv" + individuals = deduplicate_perfect_matches(individuals) + organizations = deduplicate_perfect_matches(organizations) -processed_individuals_output_path = ( - BASE_FILEPATH / "output" / "processed_individuals_table.csv" -) + cleaned_individuals_output_path = ( + BASE_FILEPATH / "output" / "cleaned_individuals_table.csv" + ) -processed_organizations_output_path = ( - BASE_FILEPATH / "output" / "processed_organizations_table.csv" -) + cleaned_organizations_output_path = ( + BASE_FILEPATH / "output" / "cleaned_organizations_table.csv" + ) + + cleaned_transactions_output_path = ( + BASE_FILEPATH / "output" / "cleaned_transactions_table.csv" + ) + + deduped = pd.read_csv(BASE_FILEPATH / "output" / "deduplicated_UUIDs.csv") + + transactions[["donor_id", "recipient_id"]] = transactions[ + ["donor_id", "recipient_id"] + ].replace(deduped) + + individuals.to_csv(cleaned_individuals_output_path) + organizations.to_csv(cleaned_organizations_output_path) + transactions.to_csv(cleaned_transactions_output_path) -processed_transactions_output_path = ( - BASE_FILEPATH / "output" / "processed_transactions_table.csv" -) -individuals.to_csv(processed_individuals_output_path) -organizations.to_csv(processed_organizations_output_path) -transactions.to_csv(processed_transactions_output_path) +if __name__ == "__main__": + main() From 4e32543c82bec739f90cbf55a5749464d2a5851f Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 28 Feb 2024 16:03:19 +0000 Subject: [PATCH 13/37] removing duplicated function --- utils/linkage.py | 60 ------------------------------------------------ 1 file changed, 60 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 29319907..cae5024d 100644 --- 
a/utils/linkage.py +++ b/utils/linkage.py @@ -1,5 +1,3 @@ -import re - import textdistance as td import usaddress from names_dataset import NameDataset @@ -635,61 +633,3 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str: elif address_line_1_components[i][1] == "USPSBoxID": return address_line_1_components[i][0] raise ValueError("Can not find Address Number") - - -def cleaning_company_column(company_entry: str) -> str: - """ - Given a string, check if it contains a variation of self employed, unemployed, - or retired and return the standardized version. - - Args: - company: string of inputted company names - Returns: - standardized for retired, self employed, and unemployed, - or original string if no match or empty string - - >>> cleaning_company_column("Retireed") - 'Retired' - >>> cleaning_company_column("self") - 'Self Employed' - >>> cleaning_company_column("None") - 'Unemployed' - >>> cleaning_company_column("N/A") - 'Unemployed' - """ - - if not company_entry: - return company_entry - - company_edited = company_entry.lower() - - if company_edited == "n/a": - return "Unemployed" - - company_edited = re.sub(r"[^\w\s]", "", company_edited) - - if ( - company_edited == "retired" - or company_edited == "retiree" - or company_edited == "retire" - or "retiree" in company_edited - ): - return "Retired" - - elif ( - "self employe" in company_edited - or "freelance" in company_edited - or company_edited == "self" - or company_edited == "independent contractor" - ): - return "Self Employed" - elif ( - "unemploye" in company_edited - or company_edited == "none" - or company_edited == "not employed" - or company_edited == "nan" - ): - return "Unemployed" - - else: - return company_edited From d94243af71ebbcf97ef7ba50d1cc06f5e15a5ce4 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 28 Feb 2024 16:14:33 +0000 Subject: [PATCH 14/37] attempting to pass dev checks --- utils/linkage.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index cae5024d..a6fcbdab 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -340,9 +340,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: names[i] = determine_comma_role(names[i]) names[i] = names[i].replace(".", "").split(" ") - names[i] = [ - name_part for name_part in names[i] if name_part not in titles - ] + names[i] = [name_part for name_part in names[i] if name_part not in titles] names[i] = " ".join(names[i]) # one last check to remove any pieces that might add extra whitespace @@ -431,9 +429,7 @@ def name_rank(first_name: str, last_name: str) -> list: if first_name_result and isinstance(first_name_result, dict): first_name_data = first_name_result.get("first_name") if first_name_data and "rank" in first_name_data: - first_name_rank = first_name_data["rank"].get( - "United States", 0 - ) + first_name_rank = first_name_data["rank"].get("United States", 0) else: first_name_rank = None if isinstance(last_name, str): From df41e42d4134a50305f139ca0e7b53d181f31810 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 28 Feb 2024 16:21:24 +0000 Subject: [PATCH 15/37] reformatting files --- utils/linkage.py | 8 ++++++-- utils/linkage_pipeline.py | 12 ++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index a6fcbdab..cae5024d 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -340,7 +340,9 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: names[i] = 
determine_comma_role(names[i]) names[i] = names[i].replace(".", "").split(" ") - names[i] = [name_part for name_part in names[i] if name_part not in titles] + names[i] = [ + name_part for name_part in names[i] if name_part not in titles + ] names[i] = " ".join(names[i]) # one last check to remove any pieces that might add extra whitespace @@ -429,7 +431,9 @@ def name_rank(first_name: str, last_name: str) -> list: if first_name_result and isinstance(first_name_result, dict): first_name_data = first_name_result.get("first_name") if first_name_data and "rank" in first_name_data: - first_name_rank = first_name_data["rank"].get("United States", 0) + first_name_rank = first_name_data["rank"].get( + "United States", 0 + ) else: first_name_rank = None if isinstance(last_name, str): diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index 1f565446..779469b5 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -65,12 +65,20 @@ def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: individuals["full_name"].notnull() ] if individuals["first_name"].isnull().any(): - name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) + name = ( + individuals["full_name"] + .apply(HumanName) + .apply(lambda x: x.as_dict()) + ) first_name = name.apply(lambda x: x["first"]) individuals["first_name"] = first_name if individuals["last_name"].isnull().any(): - name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) + name = ( + individuals["full_name"] + .apply(HumanName) + .apply(lambda x: x.as_dict()) + ) last_name = name.apply(lambda x: x["last"]) individuals["last_name"] = last_name From 26d47736e65212150aff8e619d73f3723b859bdc Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Thu, 29 Feb 2024 15:01:36 +0000 Subject: [PATCH 16/37] classify function --- utils/linkage_pipeline.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index f13b4235..e9fcf06c 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -1,6 +1,5 @@ import pandas as pd - -# from classify import classify_wrapper +from classify import classify_wrapper from nameparser import HumanName from utils.constants import BASE_FILEPATH @@ -172,6 +171,8 @@ def main(): ["donor_id", "recipient_id"] ].replace(deduped) + individuals, organizations = classify_wrapper(individuals, organizations) + individuals.to_csv(cleaned_individuals_output_path, index=False) organizations.to_csv(cleaned_organizations_output_path, index=False) transactions.to_csv(cleaned_transactions_output_path, index=False) From 3266ce7e965a09e816c875e2e79be1d4f062f2df Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 4 Mar 2024 02:04:09 +0000 Subject: [PATCH 17/37] slight changes --- utils/linkage_pipeline.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index e9fcf06c..5f251f3b 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -68,20 +68,12 @@ def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: individuals["full_name"].notnull() ] if individuals["first_name"].isnull().any(): - name = ( - individuals["full_name"] - .apply(HumanName) - .apply(lambda x: x.as_dict()) - ) + name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) first_name = name.apply(lambda x: x["first"]) individuals["first_name"] = first_name if 
individuals["last_name"].isnull().any(): - name = ( - individuals["full_name"] - .apply(HumanName) - .apply(lambda x: x.as_dict()) - ) + name = individuals["full_name"].apply(HumanName).apply(lambda x: x.as_dict()) last_name = name.apply(lambda x: x["last"]) individuals["last_name"] = last_name @@ -167,12 +159,15 @@ def main(): deduped = pd.read_csv(BASE_FILEPATH / "output" / "deduplicated_UUIDs.csv") + # Classifies individuals and organizations with a new 'classification' + # column containing 'neutral', 'f', or 'c' + individuals, organizations = classify_wrapper(individuals, organizations) + + # Update the transactions table with the deduplicated UUIDs transactions[["donor_id", "recipient_id"]] = transactions[ ["donor_id", "recipient_id"] ].replace(deduped) - individuals, organizations = classify_wrapper(individuals, organizations) - individuals.to_csv(cleaned_individuals_output_path, index=False) organizations.to_csv(cleaned_organizations_output_path, index=False) transactions.to_csv(cleaned_transactions_output_path, index=False) From d262deeccc886c0103cc2c12866d83bb8b843370 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 4 Mar 2024 04:00:12 +0000 Subject: [PATCH 18/37] possible splink implementation fix --- utils/linkage_pipeline.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index 1b4db66e..9baa5204 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -171,9 +171,10 @@ def main(): deduped = pd.read_csv(BASE_FILEPATH / "output" / "deduplicated_UUIDs.csv") # Splink deduplication - individuals = splink_dedupe( - individuals, individuals_settings, individuals_blocking - ) + individuals["unique_id"] = individuals["id"] + organizations["unique_id"] = organizations["id"] + + individuals = splink_dedupe(individuals, individuals_settings, individuals_blocking) organizations = splink_dedupe( organizations, organizations_settings, organizations_blocking From b8da98e509ad572dc736228a49d0f067eed063e2 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 4 Mar 2024 17:26:11 +0000 Subject: [PATCH 19/37] updating splink function --- utils/linkage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/linkage.py b/utils/linkage.py index 5988a8e3..d7237037 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -683,7 +683,7 @@ def splink_dedupe( deduped_df = pd.merge( first_instance_df, - match_list_df[["cluster_id"]], + match_list_df[["cluster_id", "duplicated"]], on="cluster_id", how="left", ) From 0185093f0ca00189f9959399693d18127b530c0b Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 4 Mar 2024 17:26:27 +0000 Subject: [PATCH 20/37] pipeline updates --- utils/linkage_pipeline.py | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index 9baa5204..e80bd032 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -151,44 +151,40 @@ def main(): organizations = preprocess_organizations(organizations) transactions = preprocess_transactions(transactions) - # Deduplicates perfect matches and creates a new csv file - # in output titled "deduplicated_UUIDs.csv" + individuals, organizations = classify_wrapper(individuals, organizations) + individuals = deduplicate_perfect_matches(individuals) organizations = deduplicate_perfect_matches(organizations) - cleaned_individuals_output_path = ( - BASE_FILEPATH / "output" / "cleaned_individuals_table.csv" - ) 
- - cleaned_organizations_output_path = ( - BASE_FILEPATH / "output" / "cleaned_organizations_table.csv" - ) - - cleaned_transactions_output_path = ( - BASE_FILEPATH / "output" / "cleaned_transactions_table.csv" - ) - deduped = pd.read_csv(BASE_FILEPATH / "output" / "deduplicated_UUIDs.csv") - # Splink deduplication individuals["unique_id"] = individuals["id"] organizations["unique_id"] = organizations["id"] - individuals = splink_dedupe(individuals, individuals_settings, individuals_blocking) - organizations = splink_dedupe( organizations, organizations_settings, organizations_blocking ) - # Classifies individuals and organizations with a new 'classification' - # column containing 'neutral', 'f', or 'c' - individuals, organizations = classify_wrapper(individuals, organizations) + individuals = splink_dedupe( + individuals, individuals_settings, individuals_blocking + ) - # Update the transactions table with the deduplicated UUIDs transactions[["donor_id", "recipient_id"]] = transactions[ ["donor_id", "recipient_id"] ].replace(deduped) + cleaned_individuals_output_path = ( + BASE_FILEPATH / "output" / "cleaned_individuals_table.csv" + ) + + cleaned_organizations_output_path = ( + BASE_FILEPATH / "output" / "cleaned_organizations_table.csv" + ) + + cleaned_transactions_output_path = ( + BASE_FILEPATH / "output" / "cleaned_transactions_table.csv" + ) + individuals.to_csv(cleaned_individuals_output_path, index=False) organizations.to_csv(cleaned_organizations_output_path, index=False) transactions.to_csv(cleaned_transactions_output_path, index=False) From f05778b1171fe12f17d3d104311679b76f3a751d Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 4 Mar 2024 17:29:21 +0000 Subject: [PATCH 21/37] passing linter --- utils/linkage_pipeline.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index e80bd032..537e79d3 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -1,7 +1,7 @@ import pandas as pd -from classify import classify_wrapper from nameparser import HumanName +from utils.classify import classify_wrapper from utils.constants import ( BASE_FILEPATH, individuals_blocking, @@ -165,9 +165,7 @@ def main(): organizations, organizations_settings, organizations_blocking ) - individuals = splink_dedupe( - individuals, individuals_settings, individuals_blocking - ) + individuals = splink_dedupe(individuals, individuals_settings, individuals_blocking) transactions[["donor_id", "recipient_id"]] = transactions[ ["donor_id", "recipient_id"] From 6de450df8e8b91eb40b1803e4bbcd4f698dd9dea Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 4 Mar 2024 17:31:16 +0000 Subject: [PATCH 22/37] linter --- utils/linkage_pipeline.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index 537e79d3..ac911559 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -165,7 +165,9 @@ def main(): organizations, organizations_settings, organizations_blocking ) - individuals = splink_dedupe(individuals, individuals_settings, individuals_blocking) + individuals = splink_dedupe( + individuals, individuals_settings, individuals_blocking + ) transactions[["donor_id", "recipient_id"]] = transactions[ ["donor_id", "recipient_id"] From 51cc9def6d793165c9022c21124e4a40e30a6c38 Mon Sep 17 00:00:00 2001 From: Nicolas Posner Date: Mon, 4 Mar 2024 14:57:36 -0600 Subject: [PATCH 23/37] updated classify test --- utils/tests/test_classifier.py | 
45 ++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 utils/tests/test_classifier.py diff --git a/utils/tests/test_classifier.py b/utils/tests/test_classifier.py new file mode 100644 index 00000000..602c52ac --- /dev/null +++ b/utils/tests/test_classifier.py @@ -0,0 +1,45 @@ +import numpy as np +import pandas as pd +import pytest + +from utils.classify import matcher + +d = { + "name": [ + "bob von rosevich", + "anantarya smith", + "bob j vonrosevich", + "missy elliot", + "mr johnson", + "quarantin directino", + "missy eliot", + "joseph johnson", + ], + "address": [ + "3 Blue Drive, Chicago", + "4 Blue Drive, Chicago", + "8 Fancy Way, Chicago", + "8 Fancy Way, Evanston", + "17 Regular Road, Chicago", + "42 Hollywood Boulevard, Chicago", + "8 Fancy Way, Evanston", + "17 Regular Road, Chicago", + ], +} + +test_df = pd.DataFrame(data=d) + +test_df["classification"] = "neutral" + + +@pytest.fixture +def matcher_scen_1(): + return test_df + + +def test_matcher_scen_1(matcher_scen_1): + res = matcher(matcher_scen_1, "Fancy", "address", "f") + + assert np.all( + res == np.array(["bob j vonrosevich", "missy elliot", "missy eliot"]) + ) From 4cc7ce4c8a7d8edc50a7f032a96a25b0c74db60f Mon Sep 17 00:00:00 2001 From: Nicolas Posner Date: Mon, 4 Mar 2024 15:03:32 -0600 Subject: [PATCH 24/37] fix pytest --- utils/classify.py | 19 ++++++------------- utils/tests/test_classifier.py | 3 ++- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/utils/classify.py b/utils/classify.py index 3c24f941..4061970a 100644 --- a/utils/classify.py +++ b/utils/classify.py @@ -3,7 +3,9 @@ from utils.constants import c_org_names, f_companies, f_org_names -def classify_wrapper(individuals_df, organizations_df): +def classify_wrapper( + individuals_df: pd.DataFrame, organizations_df: pd.DataFrame +): """Wrapper for classificaiton in linkage pipeline Initialize the classify column in both dataframes and @@ -25,7 +27,7 @@ def classify_wrapper(individuals_df, organizations_df): return classified_individuals, classified_orgs -def matcher(df, substring, column, category): +def matcher(df: pd.DataFrame, substring: str, column: str, category: str): """Applies a label to the classification column based on substrings We run through a given column containing strings in the dataframe. 
We @@ -42,7 +44,7 @@ def matcher(df, substring, column, category): return df -def classify_individuals(individuals_df): +def classify_individuals(individuals_df: pd.DataFrame): """Part of the classification pipeline We apply the matcher function to the individuals dataframe @@ -56,7 +58,7 @@ def classify_individuals(individuals_df): return individuals_df -def classify_orgs(organizations_df): +def classify_orgs(organizations_df: pd.DataFrame): """Part of the classification pipeline We apply the matcher function to the organizations dataframe @@ -73,11 +75,6 @@ def classify_orgs(organizations_df): return organizations_df -inds_list = [] - -# a list of individual names - - def similarity_calculator( df: pd.DataFrame, subject: str, n: int, comparison_func ) -> pd.DataFrame: @@ -133,7 +130,3 @@ def automated_classifier( ) return similarities_df - - # we can use the indices and/or select manually, just add a new - # column to the subjects table - # that marks fossil fuels, green energy, or neither diff --git a/utils/tests/test_classifier.py b/utils/tests/test_classifier.py index 602c52ac..b6bce883 100644 --- a/utils/tests/test_classifier.py +++ b/utils/tests/test_classifier.py @@ -38,7 +38,8 @@ def matcher_scen_1(): def test_matcher_scen_1(matcher_scen_1): - res = matcher(matcher_scen_1, "Fancy", "address", "f") + matcher(matcher_scen_1, "Fancy", "address", "f") + res = test_df[test_df["classification"] == "f"]["name"].values assert np.all( res == np.array(["bob j vonrosevich", "missy elliot", "missy eliot"]) From 94f807c1693a04039c6f7f95114da81897ce489f Mon Sep 17 00:00:00 2001 From: Nicolas Posner Date: Mon, 4 Mar 2024 15:12:14 -0600 Subject: [PATCH 25/37] Revert "fix pytest" This reverts commit 4cc7ce4c8a7d8edc50a7f032a96a25b0c74db60f. i accidentally put this on the wrong branch --- utils/classify.py | 19 +++++++++++++------ utils/tests/test_classifier.py | 3 +-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/utils/classify.py b/utils/classify.py index 4061970a..3c24f941 100644 --- a/utils/classify.py +++ b/utils/classify.py @@ -3,9 +3,7 @@ from utils.constants import c_org_names, f_companies, f_org_names -def classify_wrapper( - individuals_df: pd.DataFrame, organizations_df: pd.DataFrame -): +def classify_wrapper(individuals_df, organizations_df): """Wrapper for classificaiton in linkage pipeline Initialize the classify column in both dataframes and @@ -27,7 +25,7 @@ def classify_wrapper( return classified_individuals, classified_orgs -def matcher(df: pd.DataFrame, substring: str, column: str, category: str): +def matcher(df, substring, column, category): """Applies a label to the classification column based on substrings We run through a given column containing strings in the dataframe. 
We @@ -44,7 +42,7 @@ def matcher(df: pd.DataFrame, substring: str, column: str, category: str): return df -def classify_individuals(individuals_df: pd.DataFrame): +def classify_individuals(individuals_df): """Part of the classification pipeline We apply the matcher function to the individuals dataframe @@ -58,7 +56,7 @@ def classify_individuals(individuals_df: pd.DataFrame): return individuals_df -def classify_orgs(organizations_df: pd.DataFrame): +def classify_orgs(organizations_df): """Part of the classification pipeline We apply the matcher function to the organizations dataframe @@ -75,6 +73,11 @@ def classify_orgs(organizations_df: pd.DataFrame): return organizations_df +inds_list = [] + +# a list of individual names + + def similarity_calculator( df: pd.DataFrame, subject: str, n: int, comparison_func ) -> pd.DataFrame: @@ -130,3 +133,7 @@ def automated_classifier( ) return similarities_df + + # we can use the indices and/or select manually, just add a new + # column to the subjects table + # that marks fossil fuels, green energy, or neither diff --git a/utils/tests/test_classifier.py b/utils/tests/test_classifier.py index b6bce883..602c52ac 100644 --- a/utils/tests/test_classifier.py +++ b/utils/tests/test_classifier.py @@ -38,8 +38,7 @@ def matcher_scen_1(): def test_matcher_scen_1(matcher_scen_1): - matcher(matcher_scen_1, "Fancy", "address", "f") - res = test_df[test_df["classification"] == "f"]["name"].values + res = matcher(matcher_scen_1, "Fancy", "address", "f") assert np.all( res == np.array(["bob j vonrosevich", "missy elliot", "missy eliot"]) From d62f3b70049e55b6f26eaad2774d9bd7dca8c2e3 Mon Sep 17 00:00:00 2001 From: Nicolas Posner Date: Mon, 4 Mar 2024 15:14:07 -0600 Subject: [PATCH 26/37] Revert "updated classify test" This reverts commit 51cc9def6d793165c9022c21124e4a40e30a6c38. 
accidentally on wrong branch --- utils/tests/test_classifier.py | 45 ---------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 utils/tests/test_classifier.py diff --git a/utils/tests/test_classifier.py b/utils/tests/test_classifier.py deleted file mode 100644 index 602c52ac..00000000 --- a/utils/tests/test_classifier.py +++ /dev/null @@ -1,45 +0,0 @@ -import numpy as np -import pandas as pd -import pytest - -from utils.classify import matcher - -d = { - "name": [ - "bob von rosevich", - "anantarya smith", - "bob j vonrosevich", - "missy elliot", - "mr johnson", - "quarantin directino", - "missy eliot", - "joseph johnson", - ], - "address": [ - "3 Blue Drive, Chicago", - "4 Blue Drive, Chicago", - "8 Fancy Way, Chicago", - "8 Fancy Way, Evanston", - "17 Regular Road, Chicago", - "42 Hollywood Boulevard, Chicago", - "8 Fancy Way, Evanston", - "17 Regular Road, Chicago", - ], -} - -test_df = pd.DataFrame(data=d) - -test_df["classification"] = "neutral" - - -@pytest.fixture -def matcher_scen_1(): - return test_df - - -def test_matcher_scen_1(matcher_scen_1): - res = matcher(matcher_scen_1, "Fancy", "address", "f") - - assert np.all( - res == np.array(["bob j vonrosevich", "missy elliot", "missy eliot"]) - ) From 743b30618689ec698101052c47c705e3476e85bc Mon Sep 17 00:00:00 2001 From: Nicolas Posner Date: Mon, 4 Mar 2024 20:59:25 -0600 Subject: [PATCH 27/37] updating readme and makefile as well as location of data for linkage_pipeline --- Makefile | 7 +++---- README.md | 9 +++++---- data/README.md | 6 ++++++ output/README.md | 4 ++++ utils/linkage_pipeline.py | 6 +++--- 5 files changed, 21 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 36577581..3de9758d 100644 --- a/Makefile +++ b/Makefile @@ -30,8 +30,7 @@ run-notebooks: --no-browser --allow-root -#running the linkage pipeline and creating the network graph -#still waiting on linkage_pipeline completion to get this into final shape +output_network_graph: + python linkage_pipeline.py -output network_graph: all_individuals.csv all_organizations.csv all_transactions.csv - python linkage_pipeline.py \ No newline at end of file +.PHONY: output_network_graph \ No newline at end of file diff --git a/README.md b/README.md index 879a41e0..4be8c9ba 100644 --- a/README.md +++ b/README.md @@ -44,9 +44,9 @@ If you prefer to develop inside a container with VS Code then do the following s 6. For future reference, the above pipeline also stores the information mapping given id to our database id (generated via uuid) in a csv file in the format of (state)IDMap.csv (example: ArizonaIDMap.csv) in the output folder ### Record Linkage and Network Pipeline -1. Save the standardized tables "complete_individuals_table.csv", "complete_organizations_table.csv", and "complete_transactions_table.csv" (collected from the above pipeline or data from the project's Google Drive) in the following format: repo_root / "output" / "file" -2. **UPDATE:** Run the pipeline by calling ```make run-linkage-pipeline```. This pipeline will perform conservative record linkage, attempt to classify entities as neutral, fossil fuels, or clean energy, and an interactive network visual -3. The pipeline will output the deduplicated tables saved as "cleaned_individuals_table.csv", "cleaned_organizations_table.csv", and "cleaned_transactions_table.csv". A mapping file, "deduplicated_UUIDs" tracks the UUIDs designated as duplicates. +1. 
Download the complete_data.zip file from the Climate Cabinet Data Clinic Google Drive using this link: https://drive.google.com/file/d/1zbjt7iBU0NAWSBcUyEsjvuumn3VgI4z9/view?usp=sharing. After downloading this .zip, unzip it to find three files: complete_individuals.csv, complete_organizations.csv, and complete_transactions.csv. Upload these files into the data folder and rename them to the filenames the pipeline reads: complete_individuals_table.csv, complete_organizations_table.csv, and complete_transactions_table.csv. They must follow this format: repo_root / "data" / "file" +2. Run the pipeline by calling ```make output_network_graph```. This pipeline will perform conservative record linkage, attempt to classify entities as neutral, fossil fuels, or clean energy, and create a NetworkX Graph object. +3. The pipeline will output a NetworkX Graph object and a txt file containing graph metrics into the output folder. ## Repository Structure @@ -65,7 +65,8 @@ If the data is larger than 50MB than you should not add it to the repo and inste This [README.md file](/data/README.md) should be kept up to date. ### output -Should contain work product generated by the analysis. Keep in mind that results should (generally) be excluded from the git repository. +This folder is empty by default. The final outputs of the Makefile will be placed here, consisting of a NetworkX Graph object and a txt file containing graph metrics. + ## Team Member diff --git a/data/README.md b/data/README.md index 5326bff8..df9336b7 100644 --- a/data/README.md +++ b/data/README.md @@ -2,6 +2,12 @@ This directory contains information for use in this project. +## Makefile and Final Pipeline +- This folder is empty by default. In order to run the Makefile, download the complete_data.zip file from the Climate Cabinet Data Clinic Google Drive using this link: https://drive.google.com/file/d/1zbjt7iBU0NAWSBcUyEsjvuumn3VgI4z9/view?usp=sharing + + - After downloading this .zip, unzip it to find three files: complete_individuals.csv, complete_organizations.csv, and complete_transactions.csv. Upload these files into the data folder and ensure that their names are correct. Once they are in place, you may run the Makefile. + + ## Arizona Campaign Finance Data ### Summary diff --git a/output/README.md b/output/README.md index 932298fd..06e91212 100644 --- a/output/README.md +++ b/output/README.md @@ -1,2 +1,6 @@ # Output README --- + +## Makefile and Final Pipeline + +- This folder is empty by default. The outputs of the Makefile process will be placed in this folder, consisting of a NetworkX Graph object and a txt file containing graph metrics.
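For orientation, here is a minimal sketch of how the pipeline's serialized graph output might be inspected after a run — assuming the graph is written with `nx.write_graphml` to `output/g.gml`, as the pipeline code later in this series does; the degree listing is purely illustrative and not part of the pipeline itself:

```python
# Minimal sketch: load and inspect the graph the pipeline serializes.
# Assumes the file was written with nx.write_graphml to output/g.gml,
# as in the linkage/network pipeline code later in this patch series.
from pathlib import Path

import networkx as nx

graph_path = Path("output") / "g.gml"
g = nx.read_graphml(graph_path)

print(f"nodes: {g.number_of_nodes()}, edges: {g.number_of_edges()}")

# Illustrative check: the ten highest-degree entities in the network.
top_nodes = sorted(g.degree, key=lambda pair: pair[1], reverse=True)[:10]
for node, degree in top_nodes:
    print(node, degree)
```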
diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index ac911559..499726e9 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -136,15 +136,15 @@ def preprocess_transactions(transactions: pd.DataFrame) -> pd.DataFrame: def main(): organizations = pd.read_csv( - BASE_FILEPATH / "output" / "complete_organizations_table.csv" + BASE_FILEPATH / "data" / "complete_organizations_table.csv" ) individuals = pd.read_csv( - BASE_FILEPATH / "output" / "complete_individuals_table.csv" + BASE_FILEPATH / "data" / "complete_individuals_table.csv" ) transactions = pd.read_csv( - BASE_FILEPATH / "output" / "complete_transactions_table.csv" + BASE_FILEPATH / "data" / "complete_transactions_table.csv" ) individuals = preprocess_individuals(individuals) From a571d91cb5238089aca9fde1f27878828cc7a08a Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Tue, 5 Mar 2024 06:21:34 +0000 Subject: [PATCH 28/37] slight update to splink_dedupe function --- utils/linkage.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index d7237037..484c1060 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -341,9 +341,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: names[i] = determine_comma_role(names[i]) names[i] = names[i].replace(".", "").split(" ") - names[i] = [ - name_part for name_part in names[i] if name_part not in titles - ] + names[i] = [name_part for name_part in names[i] if name_part not in titles] names[i] = " ".join(names[i]) # one last check to remove any pieces that might add extra whitespace @@ -432,9 +430,7 @@ def name_rank(first_name: str, last_name: str) -> list: if first_name_result and isinstance(first_name_result, dict): first_name_data = first_name_result.get("first_name") if first_name_data and "rank" in first_name_data: - first_name_rank = first_name_data["rank"].get( - "United States", 0 - ) + first_name_rank = first_name_data["rank"].get("United States", 0) else: first_name_rank = None if isinstance(last_name, str): @@ -636,9 +632,7 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str: raise ValueError("Can not find Address Number") -def splink_dedupe( - df: pd.DataFrame, settings: dict, blocking: list -) -> pd.DataFrame: +def splink_dedupe(df: pd.DataFrame, settings: dict, blocking: list) -> pd.DataFrame: """Given a dataframe and config settings, return a deduplicated dataframe @@ -689,6 +683,9 @@ def splink_dedupe( ) deduped_df.rename(columns={"cluster_id": "unique_id"}, inplace=True) + deduped_df["duplicated"] = deduped_df["duplicated"].apply( + lambda x: x if isinstance(x, list) else [x] + ) convert_duplicates_to_dict(deduped_df) deduped_df.drop(columns=["duplicated"]) From 1db28399005bb2c5ee38e9b4bfd3c6f2d3fb77c2 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Tue, 5 Mar 2024 06:22:21 +0000 Subject: [PATCH 29/37] pre-commit fixes --- utils/linkage.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 484c1060..43febf41 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -341,7 +341,9 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: names[i] = determine_comma_role(names[i]) names[i] = names[i].replace(".", "").split(" ") - names[i] = [name_part for name_part in names[i] if name_part not in titles] + names[i] = [ + name_part for name_part in names[i] if name_part not in titles + ] names[i] = " ".join(names[i]) # one last check to 
remove any pieces that might add extra whitespace @@ -430,7 +432,9 @@ def name_rank(first_name: str, last_name: str) -> list: if first_name_result and isinstance(first_name_result, dict): first_name_data = first_name_result.get("first_name") if first_name_data and "rank" in first_name_data: - first_name_rank = first_name_data["rank"].get("United States", 0) + first_name_rank = first_name_data["rank"].get( + "United States", 0 + ) else: first_name_rank = None if isinstance(last_name, str): @@ -632,7 +636,9 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str: raise ValueError("Can not find Address Number") -def splink_dedupe(df: pd.DataFrame, settings: dict, blocking: list) -> pd.DataFrame: +def splink_dedupe( + df: pd.DataFrame, settings: dict, blocking: list +) -> pd.DataFrame: """Given a dataframe and config settings, return a deduplicated dataframe From 7ebe2a26e3f8c18ec3f9aebd8aeae9872cfa1050 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 6 Mar 2024 01:38:37 +0000 Subject: [PATCH 30/37] slight changes --- utils/linkage_pipeline.py | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/utils/linkage_pipeline.py b/utils/linkage_pipeline.py index 499726e9..b9a87fe8 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_pipeline.py @@ -19,6 +19,7 @@ splink_dedupe, standardize_corp_names, ) +from utils.network import construct_network_graph def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: @@ -95,6 +96,18 @@ def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: axis=1, ) + individuals["sort_priority"] = ( + ~individuals["first_name"].isna() + & ~individuals["last_name"].isna() + & ~individuals["company"].isna() + ) * 2 + (~individuals["party"].isna()) + + individuals = individuals.sort_values( + by="sort_priority", ascending=False + ).drop(columns=["sort_priority"]) + + individuals["unique_id"] = individuals["id"] + return individuals @@ -112,6 +125,8 @@ def preprocess_organizations(organizations: pd.DataFrame) -> pd.DataFrame: .apply(standardize_corp_names) ) + organizations["unique_id"] = organizations["id"] + return organizations @@ -131,6 +146,11 @@ def preprocess_transactions(transactions: pd.DataFrame) -> pd.DataFrame: transactions["purpose"] = transactions["purpose"].str.upper() + deduped = pd.read_csv(BASE_FILEPATH / "output" / "deduplicated_UUIDs.csv") + transactions[["donor_id", "recipient_id"]] = transactions[ + ["donor_id", "recipient_id"] + ].replace(deduped) + return transactions @@ -149,17 +169,13 @@ def main(): individuals = preprocess_individuals(individuals) organizations = preprocess_organizations(organizations) - transactions = preprocess_transactions(transactions) individuals, organizations = classify_wrapper(individuals, organizations) individuals = deduplicate_perfect_matches(individuals) organizations = deduplicate_perfect_matches(organizations) - deduped = pd.read_csv(BASE_FILEPATH / "output" / "deduplicated_UUIDs.csv") - - individuals["unique_id"] = individuals["id"] - organizations["unique_id"] = organizations["id"] + transactions = preprocess_transactions(transactions) organizations = splink_dedupe( organizations, organizations_settings, organizations_blocking @@ -169,10 +185,6 @@ def main(): individuals, individuals_settings, individuals_blocking ) - transactions[["donor_id", "recipient_id"]] = transactions[ - ["donor_id", "recipient_id"] - ].replace(deduped) - cleaned_individuals_output_path = ( BASE_FILEPATH / "output" / 
"cleaned_individuals_table.csv" ) @@ -189,6 +201,10 @@ def main(): organizations.to_csv(cleaned_organizations_output_path, index=False) transactions.to_csv(cleaned_transactions_output_path, index=False) + construct_network_graph( + 2018, 2024, [individuals, organizations, transactions] + ) + if __name__ == "__main__": main() From 9a0352151fb451ddf1846e89d2948921e3bee149 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 6 Mar 2024 01:44:30 +0000 Subject: [PATCH 31/37] renaming file --- utils/{linkage_pipeline.py => linkage_and_network_pipeline.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename utils/{linkage_pipeline.py => linkage_and_network_pipeline.py} (99%) diff --git a/utils/linkage_pipeline.py b/utils/linkage_and_network_pipeline.py similarity index 99% rename from utils/linkage_pipeline.py rename to utils/linkage_and_network_pipeline.py index b9a87fe8..134d5f2d 100644 --- a/utils/linkage_pipeline.py +++ b/utils/linkage_and_network_pipeline.py @@ -202,7 +202,7 @@ def main(): transactions.to_csv(cleaned_transactions_output_path, index=False) construct_network_graph( - 2018, 2024, [individuals, organizations, transactions] + 2018, 2023, [individuals, organizations, transactions] ) From d4161f61db0df0e24bc3bd002999ceca6b0f0c70 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 6 Mar 2024 02:07:32 +0000 Subject: [PATCH 32/37] updating functions to latest versions --- utils/linkage.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 28f12dd4..5791da59 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -137,7 +137,12 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: >>> get_likely_name("Jane","","Doe, Jane, Elisabeth") 'Jane Elisabeth Doe' """ - # first ensure clean input by deleting spaces: + # first, convert any Nans to empty strings '' + first_name, last_name, full_name = [ + "" if x is np.NAN else x for x in [first_name, last_name, full_name] + ] + + # second, ensure clean input by deleting spaces: first_name, last_name, full_name = list( map(lambda x: x.lower().strip(), [first_name, last_name, full_name]) ) @@ -220,21 +225,23 @@ def get_street_from_address_line_1(address_line_1: str) -> str: def convert_duplicates_to_dict(df: pd.DataFrame) -> None: - """Saves to the "output" directory a file where each row represents a string - matching to another string + """For each uuid, maps it to all other uuids for which it has been deemed a + match. - Given a dataframe where each row contains one string in a column and a list - of strings in another column, the function maps each string in the list to - the single string. + Given a dataframe where the uuids of all rows deemed similar are stored in a + list and all but the first row of each paired uuid is dropped, this function + maps the matched uuids to a single uuid. Args: - A pandas dataframe + A pandas df containing a column called 'duplicated', where each row is a + list of all uuids deemed a match. In each list, all uuids but the first + have their rows already dropped. Returns None. However it outputs a file to the output directory, with 2 - columns. The first, which indicates the duplicated UUIDs, is labeled - 'duplicated_uuids', and the 2nd, which shows the uuids to which the - deduplicated entries match to, is labeled 'mapped_uuids'. + columns. The first lists all the uuids in df, and is labeled + 'original_uuids.' 
The 2nd shows the uuids to which each entry is mapped, + and is labeled 'mapped_uuid'. """ deduped_dict = {} for i in range(len(df)): @@ -245,7 +252,7 @@ def convert_duplicates_to_dict(df: pd.DataFrame) -> None: # now convert dictionary into a csv file deduped_df = pd.DataFrame.from_dict(deduped_dict, "index") deduped_df = deduped_df.reset_index().rename( - columns={"index": "duplicated_uuids", 0: "mapped_uuids"} + columns={"index": "original_uuids", 0: "mapped_uuid"} ) deduped_df.to_csv( repo_root / "output" / "deduplicated_UUIDs.csv", @@ -273,7 +280,9 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: # now find the duplicates along all columns but the ID new_df = ( - new_df.groupby(df.columns[1:].tolist(), dropna=False)["id"] + new_df.groupby(df.columns.difference(["id"]).tolist(), dropna=False)[ + "id" + ] .agg(list) .reset_index() .rename(columns={"id": "duplicated"}) From 45347e26a39b50a951a69e2c77c50d16b1fd0bfc Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 6 Mar 2024 02:09:14 +0000 Subject: [PATCH 33/37] slight changes to match function changes in linkage.py --- utils/linkage_and_network_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/linkage_and_network_pipeline.py b/utils/linkage_and_network_pipeline.py index 134d5f2d..bd6bcfbd 100644 --- a/utils/linkage_and_network_pipeline.py +++ b/utils/linkage_and_network_pipeline.py @@ -89,13 +89,13 @@ def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: individuals["full_name"] = individuals.apply( lambda row: get_likely_name( - row["first_name"] if pd.notnull(row["first_name"]) else "", - row["last_name"] if pd.notnull(row["last_name"]) else "", - row["full_name"] if pd.notnull(row["full_name"]) else "", + row["first_name"], row["last_name"], row["full_name"] ), axis=1, ) + # Ensure that columns with values are prioritized and appear first + # important for splink implementation individuals["sort_priority"] = ( ~individuals["first_name"].isna() & ~individuals["last_name"].isna() From ad2ed0f5e9a30bd5c246c01ab4e3d4550a8f3dc3 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 6 Mar 2024 18:36:38 +0000 Subject: [PATCH 34/37] slight changes --- Makefile | 5 ++-- notebooks/Test.ipynb | 39 --------------------------- setup.py | 2 +- utils/linkage_and_network_pipeline.py | 34 +++++++++++++---------- 4 files changed, 24 insertions(+), 56 deletions(-) delete mode 100644 notebooks/Test.ipynb diff --git a/Makefile b/Makefile index 07383c3c..48879489 100644 --- a/Makefile +++ b/Makefile @@ -33,5 +33,6 @@ run-notebooks: #running the linkage pipeline and creating the network graph #still waiting on linkage_pipeline completion to get this into final shape -output network_graph: all_individuals.csv all_organizations.csv all_transactions.csv - python linkage_pipeline.py +run-linkage-and-network-pipeline: + docker build -t $(project_image_name) -f Dockerfile $(current_abs_path) + docker run -v $(current_abs_path):/project -t $(project_image_name) python utils/linkage_and_network_pipeline.py \ No newline at end of file diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb deleted file mode 100644 index 5df942e1..00000000 --- a/notebooks/Test.ipynb +++ /dev/null @@ -1,39 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Example Notebook file demonstrating how to use the file structure\n", - "from utils.preprocess_util_lib_example import save_random_dataframe\n", - "from pathlib import Path\n",
"\n", - "save_random_dataframe(Path(\"../output\"), Path(\"test.csv\"))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/setup.py b/setup.py index 63ef672a..07404acd 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup setup( - name="2023-fall-clinic-climate-cabinet", + name="2024-winter-clinic-climate-cabinet", version="0.1.0", packages=find_packages( include=[ diff --git a/utils/linkage_and_network_pipeline.py b/utils/linkage_and_network_pipeline.py index bd6bcfbd..7e5f8cec 100644 --- a/utils/linkage_and_network_pipeline.py +++ b/utils/linkage_and_network_pipeline.py @@ -1,3 +1,4 @@ +import networkx as nx import pandas as pd from nameparser import HumanName @@ -19,7 +20,11 @@ splink_dedupe, standardize_corp_names, ) -from utils.network import construct_network_graph +from utils.network import ( + create_network_graph, + combine_datasets_for_network_graph, + construct_network_graph, +) def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: @@ -102,9 +107,9 @@ def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: & ~individuals["company"].isna() ) * 2 + (~individuals["party"].isna()) - individuals = individuals.sort_values( - by="sort_priority", ascending=False - ).drop(columns=["sort_priority"]) + individuals = individuals.sort_values(by="sort_priority", ascending=False).drop( + columns=["sort_priority"] + ) individuals["unique_id"] = individuals["id"] @@ -159,9 +164,7 @@ def main(): BASE_FILEPATH / "data" / "complete_organizations_table.csv" ) - individuals = pd.read_csv( - BASE_FILEPATH / "data" / "complete_individuals_table.csv" - ) + individuals = pd.read_csv(BASE_FILEPATH / "data" / "complete_individuals_table.csv") transactions = pd.read_csv( BASE_FILEPATH / "data" / "complete_transactions_table.csv" @@ -175,15 +178,13 @@ def main(): individuals = deduplicate_perfect_matches(individuals) organizations = deduplicate_perfect_matches(organizations) - transactions = preprocess_transactions(transactions) - organizations = splink_dedupe( organizations, organizations_settings, organizations_blocking ) - individuals = splink_dedupe( - individuals, individuals_settings, individuals_blocking - ) + individuals = splink_dedupe(individuals, individuals_settings, individuals_blocking) + + transactions = preprocess_transactions(transactions) cleaned_individuals_output_path = ( BASE_FILEPATH / "output" / "cleaned_individuals_table.csv" @@ -201,9 +202,14 @@ def main(): organizations.to_csv(cleaned_organizations_output_path, index=False) transactions.to_csv(cleaned_transactions_output_path, index=False) - construct_network_graph( - 2018, 2023, [individuals, organizations, transactions] + aggreg_df = combine_datasets_for_network_graph( + [individuals, organizations, transactions] ) + g = create_network_graph(aggreg_df) + g_output_path = BASE_FILEPATH / "output" / "g.gml" + nx.write_graphml(g, g_output_path) + + construct_network_graph(2018, 2023, [individuals, organizations, transactions]) if __name__ == "__main__": From 4b0de47ef040c04e6adc78409e272fb04e132129 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 6 
Mar 2024 18:48:41 +0000 Subject: [PATCH 35/37] readme changes --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a610a4ec..da6bba26 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ If you prefer to develop inside a container with VS Code then do the following s ### Record Linkage and Network Pipeline 1. Save the standardized tables "complete_individuals_table.csv", "complete_organizations_table.csv", and "complete_transactions_table.csv" (collected from the above pipeline or data from the project's Google Drive) in the following format: repo_root / "output" / "file" -2. **UPDATE:** Run the pipeline by calling ```make run-linkage-pipeline```. This pipeline will perform conservative record linkage, attempt to classify entities as neutral, fossil fuels, or clean energy, convert the standardized tables into a NetworkX Graph, and show an interactive network visual. +2. **UPDATE:** Run the pipeline by calling ```make run-linkage-and-network-pipeline```. This pipeline will perform conservative record linkage, attempt to classify entities as neutral, fossil fuels, or clean energy, convert the standardized tables into a NetworkX Graph, and show an interactive network visual. 3. The pipeline will output the deduplicated tables saved as "cleaned_individuals_table.csv", "cleaned_organizations_table.csv", and "cleaned_transactions_table.csv". A mapping file, "deduplicated_UUIDs", tracks the UUIDs designated as duplicates. The pipeline will also output "Network Graph Node Data", which is the NetworkX Graph object converted into an adjacency list. ## Repository Structure From 0c7902394bce4864e984eab13305ba99855a86dd Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 6 Mar 2024 18:50:36 +0000 Subject: [PATCH 36/37] data/ readme changes --- data/README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/data/README.md b/data/README.md index df9336b7..5326bff8 100644 --- a/data/README.md +++ b/data/README.md @@ -2,12 +2,6 @@ This directory contains information for use in this project. -## Makefile and Final Pipeline -- This folder is empty by default. In order to run the Makefile, download the complete_data.zip file from the Climate Cabinet Data Clinic Google Drive using this link: https://drive.google.com/file/d/1zbjt7iBU0NAWSBcUyEsjvuumn3VgI4z9/view?usp=sharing - - - After downloading this .zip, unzip it to find three files: complete_individuals.csv, complete_organizations.csv, and complete_transactions.csv. Upload these files into the data folder and ensure that their names are correct. Once they are in place, you may run the Makefile.
- - ## Arizona Campaign Finance Data ### Summary From 48470c21eb14ca164516f241ab5d2646f008a318 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 6 Mar 2024 18:51:15 +0000 Subject: [PATCH 37/37] pre-commit formatting changes --- utils/linkage_and_network_pipeline.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/utils/linkage_and_network_pipeline.py b/utils/linkage_and_network_pipeline.py index 7e5f8cec..86e0ab62 100644 --- a/utils/linkage_and_network_pipeline.py +++ b/utils/linkage_and_network_pipeline.py @@ -21,9 +21,9 @@ standardize_corp_names, ) from utils.network import ( - create_network_graph, combine_datasets_for_network_graph, construct_network_graph, + create_network_graph, ) @@ -107,9 +107,9 @@ def preprocess_individuals(individuals: pd.DataFrame) -> pd.DataFrame: & ~individuals["company"].isna() ) * 2 + (~individuals["party"].isna()) - individuals = individuals.sort_values(by="sort_priority", ascending=False).drop( - columns=["sort_priority"] - ) + individuals = individuals.sort_values( + by="sort_priority", ascending=False + ).drop(columns=["sort_priority"]) individuals["unique_id"] = individuals["id"] @@ -164,7 +164,9 @@ def main(): BASE_FILEPATH / "data" / "complete_organizations_table.csv" ) - individuals = pd.read_csv(BASE_FILEPATH / "data" / "complete_individuals_table.csv") + individuals = pd.read_csv( + BASE_FILEPATH / "data" / "complete_individuals_table.csv" + ) transactions = pd.read_csv( BASE_FILEPATH / "data" / "complete_transactions_table.csv" @@ -182,7 +184,9 @@ def main(): organizations, organizations_settings, organizations_blocking ) - individuals = splink_dedupe(individuals, individuals_settings, individuals_blocking) + individuals = splink_dedupe( + individuals, individuals_settings, individuals_blocking + ) transactions = preprocess_transactions(transactions) @@ -209,7 +213,9 @@ def main(): g_output_path = BASE_FILEPATH / "output" / "g.gml" nx.write_graphml(g, g_output_path) - construct_network_graph(2018, 2023, [individuals, organizations, transactions]) + construct_network_graph( + 2018, 2023, [individuals, organizations, transactions] + ) if __name__ == "__main__":
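To close, a sketch of the shape of the configuration that `splink_dedupe(df, settings, blocking)` consumes throughout this series. The project's actual `individuals_settings` and `individuals_blocking` live in `utils/constants.py` and are not shown in these patches, so the comparison columns, threshold, and blocking rule below are assumptions chosen only to illustrate a splink "dedupe_only" setup, not the project's real values:

```python
# Hypothetical sketch of the settings/blocking objects passed to
# splink_dedupe. The real values live in utils/constants.py; the
# comparison columns and threshold here are illustrative assumptions.
import splink.duckdb.comparison_library as cl

individuals_settings = {
    "link_type": "dedupe_only",
    "unique_id_column_name": "unique_id",
    "comparisons": [
        cl.exact_match("first_name"),
        cl.jaro_winkler_at_thresholds("last_name", [0.9]),
    ],
}

# splink scores only candidate pairs that share a block, so blocking
# rules trade recall for run time.
individuals_blocking = ["l.last_name = r.last_name"]
```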