From 46a5c3b8d297598d561da2aaecc3d90493f8427e Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@h001.ds.uchicago.edu>
Date: Thu, 18 Jan 2024 00:05:53 -0600
Subject: [PATCH 01/42] get_likely_name function done

---
 setup.py         |   2 +-
 utils/linkage.py | 120 +++++++++++++++++------------------------------
 2 files changed, 44 insertions(+), 78 deletions(-)

diff --git a/setup.py b/setup.py
index 63ef672..07404ac 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import find_packages, setup
 
 setup(
-    name="2023-fall-clinic-climate-cabinet",
+    name="2024-winter-clinic-climate-cabinet",
     version="0.1.0",
     packages=find_packages(
         include=[
diff --git a/utils/linkage.py b/utils/linkage.py
index aa56307..c3ddf1b 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -3,33 +3,6 @@
 """
 
 
-def calculate_string_similarity(string1: str, string2: str) -> float:
-    """Returns how similar two strings are on a scale of 0 to 1
-
-    The exact meaning of the metric is open, but the following must hold true:
-    1. equivalent strings must return 1
-    2. strings with no similar characters must return 0
-    3. strings with higher intuitive similarity must return higher scores
-
-    Args:
-        string1: any string
-        string2: any string
-    Returns:
-        similarity score
-
-    Sample Usage:
-    >>> calculate_string_similarity("exact match", "exact match")
-    1.0
-    >>> calculate_string_similarity("aaaaaa", "bbbbbbbbbbb")
-    0.0
-    >>> similar_score = calculate_string_similarity("very similar", "vary similar")
-    >>> different_score = calculate_string_similarity("very similar", "very not close")
-    >>> similar_socre > different_score
-    True
-    """
-    pass
-
-
 def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
     """Given name related columns, return a person's likely name
 
@@ -56,54 +29,47 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
     >>> get_likely_name("Jane Doe", "Doe", "Jane Doe")
     "Jane Doe"
     """
-    pass
-
-
-def get_address_line_1_from_full_address(address: str) -> str:
-    """Given a full address, return the first line of the formatted address
-
-    Address line 1 usually includes street address or PO Box information.
-
-    Args:
-        address: raw string representing full address
-    Returns:
-        address_line_1
-
-    Sample Usage:
-    >>> get_address_line_1_from_full_address("6727 W. Corrine Dr.  Peoria,AZ 85381")
-    "6727 W. Corrine Dr."
-    >>> get_address_line_1_from_full_address("P.O. Box 5456  Sun City West ,AZ 85375")
-    "P.O. Box 5456"
-    >>> get_address_line_1_from_full_address("119 S 5th St  Niles,MI 49120")
-    "119 S 5th St"
-    >>> get_address_line_1_from_full_address(
-    ...     "1415 PARKER STREET APT 251	DETROIT	MI	48214-0000"
-    ... )
-    "1415 PARKER STREET"
-    """
-    pass
 
-
-def get_street_from_address_line_1(address_line_1: str) -> str:
-    """Given an address line 1, return the street name
-
-    Args:
-        address_line_1: either street information or PO box
-    Returns:
-        street name
-    Raises:
-        ValueError: if string is malformed and no street can be reasonably
-            found.
-
-    >>> get_street_from_address_line_1("5645 N. UBER ST")
-    "UBER ST"
-    >>> get_street_from_address_line_1("")
-    Traceback (most recent call last):
-        ...
-    ValueError: address_line_1 must have whitespace
-    >>> get_street_from_address_line_1("PO Box 1111")
-    Traceback (most recent call last):
-        ...
-    ValueError: address_line_1 is PO Box
-    """
-    pass
+    # if data is clean:
+    if first_name + " " + last_name == full_name:
+        return full_name
+
+    # some names have titles or professions associated with the name. We need to
+    # remove those from the name.
+    titles = [
+        "mr",
+        "ms",
+        "mrs",
+        "miss",
+        "prof",
+        "dr",
+        "doctor",
+        "sir",
+        "madam",
+        "professor",
+    ]
+    names = [first_name, last_name, full_name]
+
+    for i in range(len(names)):
+        # if there is a ',' switch around the names
+        if "," in names[i]:
+            index = names[i].find(",")
+            first_part = names[i][index + 1 :]
+            last_part = names[i][0:index]
+            names[i] = first_part + " " + last_part
+
+        names[i] = names[i].lower().replace(".", "").split(" ")
+        names[i] = [
+            name_part for name_part in names[i] if name_part not in titles
+        ]
+        names[i] = " ".join(names[i])
+
+    names = " ".join(names)
+    names = names.split(" ")
+    final_name = []
+    [
+        final_name.append(x)
+        for x in names
+        if ((x not in final_name) & (len(x) != 0))
+    ]
+    return " ".join(final_name)

From 073c935e3861ebc12a086ae5bc01fee4acadc373 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@h001.ds.uchicago.edu>
Date: Thu, 18 Jan 2024 00:15:34 -0600
Subject: [PATCH 02/42] added .title() function to return proper name format

---
 utils/linkage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index c3ddf1b..7461049 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -72,4 +72,4 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
         for x in names
         if ((x not in final_name) & (len(x) != 0))
     ]
-    return " ".join(final_name)
+    return " ".join(final_name).title()

From 16a51dc7ba80ecd3ecdc4653f8623c1b5a8fb9a1 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@h001.ds.uchicago.edu>
Date: Thu, 18 Jan 2024 00:55:07 -0600
Subject: [PATCH 03/42] struggling with converting single quotes into double
 quotes for function output

---
 utils/linkage.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 7461049..f695a0a 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -67,9 +67,5 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
     names = " ".join(names)
     names = names.split(" ")
     final_name = []
-    [
-        final_name.append(x)
-        for x in names
-        if ((x not in final_name) & (len(x) != 0))
-    ]
-    return " ".join(final_name).title()
+    [final_name.append(x) for x in names if x not in final_name]
+    return " ".join(final_name).title().strip()

From c446aaf79b5843a416d6951cc19a7f1554347f1a Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@g005.ds.uchicago.edu>
Date: Sat, 20 Jan 2024 03:03:36 -0600
Subject: [PATCH 04/42] updates to get_likely_name function after feedback to
 consider generational suffixes and handle more edge cases

---
 utils/linkage.py | 64 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 54 insertions(+), 10 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index f695a0a..f43e09f 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -1,6 +1,47 @@
-"""
-Module for performing record linkage on state campaign finance dataset
-"""
+def determine_comma_role(name: str) -> str:
+    """Given a string (someone's name), attempts to determine the role of the
+    comma in the name and where it ought to belong.
+
+    Some assumptions are made:
+        * If a suffix is included in the name and the name is not just the last
+          name(i.e "Doe, Jr), the format is
+          (last_name suffix, first and middle name) i.e Doe iv, Jane Elisabeth
+
+        * If a comma is used anywhere else, it is in the format of
+          (last_name, first and middle name) i.e Doe, Jane Elisabeth
+
+    Args:
+        name: a string representing a name/names of individuals
+    Returns:
+        the name with or without a comma based on some conditions
+    """
+    suffixes = [
+        "sr",
+        "jr",
+        "i",
+        "ii",
+        "iii",
+        "iv",
+        "v",
+        "vi",
+        "vii",
+        "viii",
+        "ix",
+        "x",
+    ]
+    name_parts = name.split(",")
+    # if the comma is just in the end as a typo:
+    if len(name_parts[1]) == 0:
+        return name_parts[0]
+    # if just the suffix in the end, leave the name as it is
+    if name_parts[1].strip() in suffixes:
+        return name
+    # at this point either it's just poor name placement, or the suffix is
+    # in the beginning of the name. Either way, the first part of the list is the
+    # true last name.
+    last_part = name_parts.pop(0)
+    first_part = " ".join(name_parts)
+    return first_part + " " + last_part
 
 
 def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
@@ -29,6 +70,10 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
     >>> get_likely_name("Jane Doe", "Doe", "Jane Doe")
     "Jane Doe"
     """
+    # first ensure clean input by deleting spaces:
+    first_name, last_name, full_name = list(
+        map(lambda x: x.lower().strip(), [first_name, last_name, full_name])
+    )
 
     # if data is clean:
     if first_name + " " + last_name == full_name:
@@ -51,20 +96,19 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
     names = [first_name, last_name, full_name]
 
     for i in range(len(names)):
-        # if there is a ',' switch around the names
+        # if there is a ',' deal with it accordingly
         if "," in names[i]:
-            index = names[i].find(",")
-            first_part = names[i][index + 1 :]
-            last_part = names[i][0:index]
-            names[i] = first_part + " " + last_part
-
-        names[i] = names[i].lower().replace(".", "").split(" ")
+            names[i] = determine_comma_role(names[i])
+            print(names[i])
+        names[i] = names[i].replace(".", "").split(" ")
         names[i] = [
             name_part for name_part in names[i] if name_part not in titles
         ]
         names[i] = " ".join(names[i])
+        print(names[i])
 
     names = " ".join(names)
+    print("after comma: ", names)
     names = names.split(" ")
     final_name = []
     [final_name.append(x) for x in names if x not in final_name]

From efc02e22ebc298095c2abf7a4adcd13db02b2a2d Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@g005.ds.uchicago.edu>
Date: Sat, 20 Jan 2024 03:11:05 -0600
Subject: [PATCH 05/42] adjusted the sample usage output to single quotes as
 per Avery's suggestion

---
 utils/linkage.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index f43e09f..9b9ba22 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -62,13 +62,19 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
 
     Sample Usage:
     >>> get_likely_name("Jane", "Doe", "")
-    "Jane Doe"
+    'Jane Doe'
     >>> get_likely_name("", "", "Jane Doe")
-    "Jane Doe"
+    'Jane Doe'
     >>> get_likely_name("", "Doe, Jane", "")
-    "Jane Doe"
+    'Jane Doe'
     >>> get_likely_name("Jane Doe", "Doe", "Jane Doe")
-    "Jane Doe"
+    'Jane Doe'
+    >>> get_likely_name("Jane","","Doe, Sr")
+    'Jane Doe, Sr'
+    >>> get_likely_name("Jane Elisabeth Doe, IV","Elisabeth","Doe, IV)
+    'Jane Elisabeth Doe, Iv'
+    >>> get_likely_name("","",Jane Elisabeth Doe, IV")
+    'Jane Elisabeth Doe Iv'
     """
     # first ensure clean input by deleting spaces:
     first_name, last_name, full_name = list(
@@ -99,16 +105,14 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
         # if there is a ',' deal with it accordingly
         if "," in names[i]:
             names[i] = determine_comma_role(names[i])
-            print(names[i])
+
         names[i] = names[i].replace(".", "").split(" ")
         names[i] = [
             name_part for name_part in names[i] if name_part not in titles
         ]
         names[i] = " ".join(names[i])
-        print(names[i])
 
     names = " ".join(names)
-    print("after comma: ", names)
     names = names.split(" ")
     final_name = []
     [final_name.append(x) for x in names if x not in final_name]

From 6c37c4576c39ec2d1ac6856c036ed6dceef6c628 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@g005.ds.uchicago.edu>
Date: Sat, 20 Jan 2024 03:25:32 -0600
Subject: [PATCH 06/42] took care of empty strings that were adding extra
 whitespace to output

---
 utils/linkage.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 9b9ba22..521c75c 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -111,7 +111,9 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
             name_part for name_part in names[i] if name_part not in titles
         ]
         names[i] = " ".join(names[i])
-
+    
+    #one last check to remove any pieces that might add extra whitespace
+    names = list(filter(lambda x: x != '', names))
     names = " ".join(names)
     names = names.split(" ")
     final_name = []

From 81e52dbdb537e4ee6caae02462c49ba7a2ef1d1a Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@g005.ds.uchicago.edu>
Date: Sat, 20 Jan 2024 03:27:12 -0600
Subject: [PATCH 07/42] took care of empty strings that were adding extra
 whitespace to output

---
 utils/linkage.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 521c75c..4c1d24f 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -111,9 +111,9 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
             name_part for name_part in names[i] if name_part not in titles
         ]
         names[i] = " ".join(names[i])
-    
-    #one last check to remove any pieces that might add extra whitespace
-    names = list(filter(lambda x: x != '', names))
+
+    # one last check to remove any pieces that might add extra whitespace
+    names = list(filter(lambda x: x != "", names))
     names = " ".join(names)
     names = names.split(" ")
     final_name = []

From 2dcb7d9592be19be15e688101509a25581848dcc Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@g005.ds.uchicago.edu>
Date: Sat, 20 Jan 2024 03:30:06 -0600
Subject: [PATCH 08/42] fixed error in sample usage output

---
 utils/linkage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 4c1d24f..df15117 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -71,7 +71,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
     'Jane Doe'
     >>> get_likely_name("Jane","","Doe, Sr")
     'Jane Doe, Sr'
-    >>> get_likely_name("Jane Elisabeth Doe, IV","Elisabeth","Doe, IV)
+    >>> get_likely_name("Jane Elisabeth Doe, IV","Elisabeth","Doe, IV")
     'Jane Elisabeth Doe, Iv'
     >>> get_likely_name("","",Jane Elisabeth Doe, IV")
     'Jane Elisabeth Doe Iv'

From 20f4e938e09fa98d1f5acddf7e6eee5c8c2684b5 Mon Sep 17 00:00:00 2001
From: Adil Kassim <adilk@uchicago.edu>
Date: Thu, 25 Jan 2024 05:16:07 +0000
Subject: [PATCH 09/42] adding cleaning_company_column function

---
 utils/linkage.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/utils/linkage.py b/utils/linkage.py
index fe4dfd3..86485d3 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -1,6 +1,7 @@
 """
 Module for performing record linkage on state campaign finance dataset
 """
+import pandas as pd
 import usaddress
 
 
@@ -47,3 +48,54 @@ def get_street_from_address_line_1(address_line_1: str) -> str:
             string.append(key)
 
     return " ".join(string)
+
+
+"""
+Module for standardizing the 'company' columnn of the state campaign finance dataset
+"""
+
+
+def cleaning_company_column(company: str) -> str:
+    """
+    Given a string, check if it contains a variation of self employed, unemployed,
+    or retired and return the standardized version.
+
+    Args:
+        company: string of inputted company names
+    Returns:
+        standardized for retired, self employed, and unemployed,
+        or original string if no match or empty string
+
+    >>> cleaning_company_column("Retireed")
+    'Retired'
+    >>> cleaning_company_column("self")
+    'Self Employed'
+    >>> cleaning_company_column("None")
+    'Unemployed'
+    """
+    if pd.isnull(company):
+        return company
+
+    company_edited = company.lower()
+    company_edited = company_edited.strip()
+    company_edited = company_edited.replace(".", " ")
+    company_edited = company_edited.replace(",", " ")
+    company_edited = company_edited.replace("-", " ")
+
+    if "retire" in company_edited:
+        return "Retired"
+    elif "self employe" in company_edited or company_edited == "self":
+        return "Self Employed"
+    elif (
+        "unemploye" in company_edited
+        or company_edited == "none"
+        or company_edited == "not employed"
+    ):
+        return "Unemployed"
+
+    else:
+        return company
+
+
+# Example implementation of the function standardize_company_column for a dataframe
+# df['standardized_company'] = df['company'].apply(standardize_company_column)

From baf56f5707c31b222f97322c2b244892982873a5 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@g005.ds.uchicago.edu>
Date: Mon, 29 Jan 2024 10:26:49 -0600
Subject: [PATCH 10/42] testing if merge was done correctly after git pull

---
 requirements.txt  |  2 ++
 utils/linkage.py  | 86 +++++++++++++++++++++++++++++++++++++++++++++--
 utils/pipeline.py |  1 +
 3 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 6658f0e..944e1c5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,3 +17,5 @@ beautifulsoup4==4.11.1
 numpy==1.25.0
 Requests==2.31.0
 setuptools==68.0.0
+textdistance==4.6.1
+usaddress==0.5.4
\ No newline at end of file
diff --git a/utils/linkage.py b/utils/linkage.py
index df15117..e88a4a3 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -1,3 +1,41 @@
+"""
+Module for performing record linkage on state campaign finance dataset
+"""
+import textdistance as td
+import usaddress
+
+def calculate_string_similarity(string1: str, string2: str) -> float:
+    """Returns how similar two strings are on a scale of 0 to 1
+
+    This version utilizes Jaro-Winkler distance, which is a metric of
+    edit distance. Jaro-Winkler specially prioritizes the early
+    characters in a string.
+
+    Since the ends of strings are often more valuable in matching names
+    and addresses, we reverse the strings before matching them.
+
+    https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
+    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro-winkler.js
+
+    The exact meaning of the metric is open, but the following must hold true:
+    1. equivalent strings must return 1
+    2. strings with no similar characters must return 0
+    3. strings with higher intuitive similarity must return higher scores
+    similarity score
+
+    Sample Usage:
+    >>> calculate_string_similarity("exact match", "exact match")
+    1.0
+    >>> calculate_string_similarity("aaaaaa", "bbbbbbbbbbb")
+    0.0
+    >>> similar_score = calculate_string_similarity("very similar", "vary similar")
+    >>> different_score = calculate_string_similarity("very similar", "very not close")
+    >>> similar_score > different_score
+    True
+    """
+
+    return float(td.jaro_winkler(string1.lower()[::-1], string2.lower()[::-1]))
+
 def determine_comma_role(name: str) -> str:
     """Given a string (someone's name), attempts to determine the role of the
     comma in the name and where it ought to belong.
@@ -9,7 +47,6 @@ def determine_comma_role(name: str) -> str:
 
         * If a comma is used anywhere else, it is in the format of
           (last_name, first and middle name) i.e Doe, Jane Elisabeth
-
     Args:
         name: a string representing a name/names of individuals
     Returns:
@@ -73,7 +110,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
     'Jane Doe, Sr'
     >>> get_likely_name("Jane Elisabeth Doe, IV","Elisabeth","Doe, IV")
     'Jane Elisabeth Doe, Iv'
-    >>> get_likely_name("","",Jane Elisabeth Doe, IV")
+    >>> get_likely_name("","","Jane Elisabeth Doe, IV")
     'Jane Elisabeth Doe Iv'
     """
     # first ensure clean input by deleting spaces:
@@ -119,3 +156,48 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
     final_name = []
     [final_name.append(x) for x in names if x not in final_name]
     return " ".join(final_name).title().strip()
+    
+
+def get_street_from_address_line_1(address_line_1: str) -> str:
+    """Given an address line 1, return the street name
+
+    Args:
+        address_line_1: either street information or PO box
+    Returns:
+        street name
+    Raises:
+        ValueError: if string is malformed and no street can be reasonably
+            found.
+
+    >>> get_street_from_address_line_1("5645 N. UBER ST")
+    'UBER ST'
+    >>> get_street_from_address_line_1("")
+    Traceback (most recent call last):
+        ...
+    ValueError: address_line_1 must have whitespace
+    >>> get_street_from_address_line_1("PO Box 1111")
+    Traceback (most recent call last):
+        ...
+    ValueError: address_line_1 is PO Box
+    >>> get_street_from_address_line_1("300 59 St.")
+    '59 St.'
+    >>> get_street_from_address_line_1("Uber St.")
+    'Uber St.'
+    >>> get_street_from_address_line_1("3NW 59th St")
+    '59th St'
+    """
+    if not address_line_1 or address_line_1.isspace():
+        raise ValueError("address_line_1 must have whitespace")
+
+    address_line_lower = address_line_1.lower()
+
+    if "po box" in address_line_lower:
+        raise ValueError("address_line_1 is PO Box")
+
+    string = []
+    address = usaddress.parse(address_line_1)
+    for key, val in address:
+        if val in ["StreetName", "StreetNamePostType"]:
+            string.append(key)
+
+    return " ".join(string)
diff --git a/utils/pipeline.py b/utils/pipeline.py
index 7a288fd..e6b7a12 100644
--- a/utils/pipeline.py
+++ b/utils/pipeline.py
@@ -18,6 +18,7 @@
     single_state_organizations_tables = []
     single_state_transactions_tables = []
     for state_cleaner in state_cleaners:
+        print("Cleaning...")
         (
             individuals_table,
             organizations_table,

From 3d6500cfb5ef60aa3a745c593802cb605f840800 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@g005.ds.uchicago.edu>
Date: Mon, 29 Jan 2024 11:17:58 -0600
Subject: [PATCH 11/42] undoing the mistake of previous commit where I
 committed files from the data and output directories

---
 utils/linkage.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 1500dc0..e419c6f 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -4,6 +4,7 @@
 import textdistance as td
 import usaddress
 
+
 def calculate_string_similarity(string1: str, string2: str) -> float:
     """Returns how similar two strings are on a scale of 0 to 1
 
@@ -75,8 +76,8 @@ def determine_comma_role(name: str) -> str:
     if name_parts[1].strip() in suffixes:
         return name
     # at this point either it's just poor name placement, or the suffix is
-    # in the beginning of the name. Either way, the first part of the list is the
-    # true last name.
+    # in the beginning of the name. Either way, the first part of the list is
+    # the true last name.
     last_part = name_parts.pop(0)
     first_part = " ".join(name_parts)
     return first_part + " " + last_part
@@ -157,7 +158,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
     final_name = []
     [final_name.append(x) for x in names if x not in final_name]
     return " ".join(final_name).title().strip()
-    
+
 
 def get_street_from_address_line_1(address_line_1: str) -> str:
     """Given an address line 1, return the street name

From ca8b3f7aa83262e8c8de1064d962a2f19f16da86 Mon Sep 17 00:00:00 2001
From: npashilkar <npashilkar@uchicago.edu>
Date: Mon, 29 Jan 2024 23:21:27 -0600
Subject: [PATCH 12/42] standardizing corporate names function

---
 utils/linkage.py | 66 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/utils/linkage.py b/utils/linkage.py
index 44f24e5..f99ab5a 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -87,3 +87,69 @@ def get_street_from_address_line_1(address_line_1: str) -> str:
             string.append(key)
 
     return " ".join(string)
+
+
+def standardize_corp_names(company_name: str) -> str:
+    """Given an employer name, return the standardized version
+
+    Args:
+        company_name: corporate name
+    Returns:
+        standardized company name
+        
+    >>>standardize_corp_names('MI BEER WINE WHOLESALERS ASSOC')
+    'MI BEER WINE WHOLESALERS ASSOCIATION'
+    
+    >>>standardize_corp_names('MI COMMUNITY COLLEGE ASSOCIATION')
+    'MI COMMUNITY COLLEGE ASSOCIATION'
+    
+    >>>standardize_corp_names('STEPHANIES CHANGEMAKER FUND')
+    'STEPHANIES CHANGEMAKER FUND' 
+    
+    """
+
+
+    company_name_split = company_name.upper().split(' ')
+
+    company_types = {
+    'CORP': 'CORPORATION',
+    'CO': 'CORPORATION',
+    'LLC': 'LIMITED LIABILITY COMPANY',
+    'PTNR': 'PARTNERSHIP',
+    'LP': 'LIMITED PARTNERSHIP',
+    'LLP': 'LIMITED LIABILITY PARTNERSHIP',
+    'SOLE PROP': 'SOLE PROPRIETORSHIP',
+    'SP': 'SOLE PROPRIETORSHIP',
+    'NPO': 'NONPROFIT ORGANIZATION',
+    'PC': 'PROFESSIONAL CORPORATION',
+    'CO-OP': 'COOPERATIVE',
+    'LTD': 'LIMITED COMPANY',
+    'JSC': 'JOINT STOCK COMPANY',
+    'HOLDCO': 'HOLDING COMPANY',
+    'PLC': 'PUBLIC LIMITED COMPANY',
+    'PVT LTD': 'PRIVATE LIMITED COMPANY',
+    'INC': 'INCORPORATED',
+    'ASSOC': 'ASSOCIATION',
+    'FDN': 'FOUNDATION',
+    'TR': 'TRUST',
+    'SOC': 'SOCIETY',
+    'CONSORT': 'CONSORTIUM',
+    'SYND': 'SYNDICATE',
+    'GRP': 'GROUP',
+    'CORP SOLE': 'CORPORATION SOLE',
+    'JV': 'JOINT VENTURE',
+    'SUB': 'SUBSIDIARY',
+    'FRANCHISE': 'FRANCHISE',
+    'PA': 'PROFESSIONAL ASSOCIATION',
+    'CIC': 'COMMUNITY INTEREST COMPANY',
+    
+    'PAC': 'POLITICAL ACTION COMMITTEE'
+}
+
+    for i in range(len(company_name_split)):
+        if company_name_split[i] in list(company_types.keys()):
+            hold = company_name_split[i]
+            company_name_split[i] = company_types[hold]
+
+    new_company_name = ' '.join(company_name_split)
+    return new_company_name
\ No newline at end of file

From 663f08daf061f79cfba23a97cbaadfd9ff67d6a6 Mon Sep 17 00:00:00 2001
From: npashilkar <npashilkar@uchicago.edu>
Date: Tue, 30 Jan 2024 20:02:15 -0600
Subject: [PATCH 13/42] corp names function update

---
 utils/linkage.py | 82 +++++++++++++++++++++++-------------------------
 1 file changed, 40 insertions(+), 42 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index f99ab5a..65f4cb4 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -96,60 +96,58 @@ def standardize_corp_names(company_name: str) -> str:
         company_name: corporate name
     Returns:
         standardized company name
-        
+
     >>>standardize_corp_names('MI BEER WINE WHOLESALERS ASSOC')
     'MI BEER WINE WHOLESALERS ASSOCIATION'
-    
+
     >>>standardize_corp_names('MI COMMUNITY COLLEGE ASSOCIATION')
     'MI COMMUNITY COLLEGE ASSOCIATION'
-    
+
     >>>standardize_corp_names('STEPHANIES CHANGEMAKER FUND')
-    'STEPHANIES CHANGEMAKER FUND' 
-    
-    """
+    'STEPHANIES CHANGEMAKER FUND'
 
+    """
 
-    company_name_split = company_name.upper().split(' ')
+    company_name_split = company_name.upper().split(" ")
 
     company_types = {
-    'CORP': 'CORPORATION',
-    'CO': 'CORPORATION',
-    'LLC': 'LIMITED LIABILITY COMPANY',
-    'PTNR': 'PARTNERSHIP',
-    'LP': 'LIMITED PARTNERSHIP',
-    'LLP': 'LIMITED LIABILITY PARTNERSHIP',
-    'SOLE PROP': 'SOLE PROPRIETORSHIP',
-    'SP': 'SOLE PROPRIETORSHIP',
-    'NPO': 'NONPROFIT ORGANIZATION',
-    'PC': 'PROFESSIONAL CORPORATION',
-    'CO-OP': 'COOPERATIVE',
-    'LTD': 'LIMITED COMPANY',
-    'JSC': 'JOINT STOCK COMPANY',
-    'HOLDCO': 'HOLDING COMPANY',
-    'PLC': 'PUBLIC LIMITED COMPANY',
-    'PVT LTD': 'PRIVATE LIMITED COMPANY',
-    'INC': 'INCORPORATED',
-    'ASSOC': 'ASSOCIATION',
-    'FDN': 'FOUNDATION',
-    'TR': 'TRUST',
-    'SOC': 'SOCIETY',
-    'CONSORT': 'CONSORTIUM',
-    'SYND': 'SYNDICATE',
-    'GRP': 'GROUP',
-    'CORP SOLE': 'CORPORATION SOLE',
-    'JV': 'JOINT VENTURE',
-    'SUB': 'SUBSIDIARY',
-    'FRANCHISE': 'FRANCHISE',
-    'PA': 'PROFESSIONAL ASSOCIATION',
-    'CIC': 'COMMUNITY INTEREST COMPANY',
-    
-    'PAC': 'POLITICAL ACTION COMMITTEE'
-}
+        "CORP": "CORPORATION",
+        "CO": "CORPORATION",
+        "LLC": "LIMITED LIABILITY COMPANY",
+        "PTNR": "PARTNERSHIP",
+        "LP": "LIMITED PARTNERSHIP",
+        "LLP": "LIMITED LIABILITY PARTNERSHIP",
+        "SOLE PROP": "SOLE PROPRIETORSHIP",
+        "SP": "SOLE PROPRIETORSHIP",
+        "NPO": "NONPROFIT ORGANIZATION",
+        "PC": "PROFESSIONAL CORPORATION",
+        "CO-OP": "COOPERATIVE",
+        "LTD": "LIMITED COMPANY",
+        "JSC": "JOINT STOCK COMPANY",
+        "HOLDCO": "HOLDING COMPANY",
+        "PLC": "PUBLIC LIMITED COMPANY",
+        "PVT LTD": "PRIVATE LIMITED COMPANY",
+        "INC": "INCORPORATED",
+        "ASSOC": "ASSOCIATION",
+        "FDN": "FOUNDATION",
+        "TR": "TRUST",
+        "SOC": "SOCIETY",
+        "CONSORT": "CONSORTIUM",
+        "SYND": "SYNDICATE",
+        "GRP": "GROUP",
+        "CORP SOLE": "CORPORATION SOLE",
+        "JV": "JOINT VENTURE",
+        "SUB": "SUBSIDIARY",
+        "FRANCHISE": "FRANCHISE",
+        "PA": "PROFESSIONAL ASSOCIATION",
+        "CIC": "COMMUNITY INTEREST COMPANY",
+        "PAC": "POLITICAL ACTION COMMITTEE",
+    }
 
     for i in range(len(company_name_split)):
         if company_name_split[i] in list(company_types.keys()):
             hold = company_name_split[i]
             company_name_split[i] = company_types[hold]
 
-    new_company_name = ' '.join(company_name_split)
-    return new_company_name
\ No newline at end of file
+    new_company_name = " ".join(company_name_split)
+    return new_company_name

From 1ab1d4277f03f68a0fdf9b887af729f24f4e1d2c Mon Sep 17 00:00:00 2001
From: npashilkar <npashilkar@uchicago.edu>
Date: Tue, 30 Jan 2024 20:07:25 -0600
Subject: [PATCH 14/42] updated corp names

---
 utils/linkage.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 65f4cb4..49d120b 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -97,13 +97,13 @@ def standardize_corp_names(company_name: str) -> str:
     Returns:
         standardized company name
 
-    >>>standardize_corp_names('MI BEER WINE WHOLESALERS ASSOC')
+    >>> standardize_corp_names('MI BEER WINE WHOLESALERS ASSOC')
     'MI BEER WINE WHOLESALERS ASSOCIATION'
 
-    >>>standardize_corp_names('MI COMMUNITY COLLEGE ASSOCIATION')
+    >>> standardize_corp_names('MI COMMUNITY COLLEGE ASSOCIATION')
     'MI COMMUNITY COLLEGE ASSOCIATION'
 
-    >>>standardize_corp_names('STEPHANIES CHANGEMAKER FUND')
+    >>> standardize_corp_names('STEPHANIES CHANGEMAKER FUND')
     'STEPHANIES CHANGEMAKER FUND'
 
     """

From 6aad87ef2d9598a5745abf64d5eaf3326122041c Mon Sep 17 00:00:00 2001
From: npashilkar <npashilkar@uchicago.edu>
Date: Tue, 30 Jan 2024 21:29:16 -0600
Subject: [PATCH 15/42] moved dict to constants file

---
 utils/constants.py | 36 ++++++++++++++++++++++++++++++++++++
 utils/linkage.py   | 42 ++++++------------------------------------
 2 files changed, 42 insertions(+), 36 deletions(-)

diff --git a/utils/constants.py b/utils/constants.py
index b87d39d..f259db3 100644
--- a/utils/constants.py
+++ b/utils/constants.py
@@ -605,3 +605,39 @@
     " WV ",
     " WY ",
 ]
+
+# utils/linkage.py constants
+
+COMPANY_TYPES = {
+    "CORP": "CORPORATION",
+    "CO": "CORPORATION",
+    "LLC": "LIMITED LIABILITY COMPANY",
+    "PTNR": "PARTNERSHIP",
+    "LP": "LIMITED PARTNERSHIP",
+    "LLP": "LIMITED LIABILITY PARTNERSHIP",
+    "SOLE PROP": "SOLE PROPRIETORSHIP",
+    "SP": "SOLE PROPRIETORSHIP",
+    "NPO": "NONPROFIT ORGANIZATION",
+    "PC": "PROFESSIONAL CORPORATION",
+    "CO-OP": "COOPERATIVE",
+    "LTD": "LIMITED COMPANY",
+    "JSC": "JOINT STOCK COMPANY",
+    "HOLDCO": "HOLDING COMPANY",
+    "PLC": "PUBLIC LIMITED COMPANY",
+    "PVT LTD": "PRIVATE LIMITED COMPANY",
+    "INC": "INCORPORATED",
+    "ASSOC": "ASSOCIATION",
+    "FDN": "FOUNDATION",
+    "TR": "TRUST",
+    "SOC": "SOCIETY",
+    "CONSORT": "CONSORTIUM",
+    "SYND": "SYNDICATE",
+    "GRP": "GROUP",
+    "CORP SOLE": "CORPORATION SOLE",
+    "JV": "JOINT VENTURE",
+    "SUB": "SUBSIDIARY",
+    "FRANCHISE": "FRANCHISE",
+    "PA": "PROFESSIONAL ASSOCIATION",
+    "CIC": "COMMUNITY INTEREST COMPANY",
+    "PAC": "POLITICAL ACTION COMMITTEE",
+}
diff --git a/utils/linkage.py b/utils/linkage.py
index 49d120b..34b2579 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -1,3 +1,4 @@
+import constants
 import textdistance as td
 import usaddress
 
@@ -110,44 +111,13 @@ def standardize_corp_names(company_name: str) -> str:
 
     company_name_split = company_name.upper().split(" ")
 
-    company_types = {
-        "CORP": "CORPORATION",
-        "CO": "CORPORATION",
-        "LLC": "LIMITED LIABILITY COMPANY",
-        "PTNR": "PARTNERSHIP",
-        "LP": "LIMITED PARTNERSHIP",
-        "LLP": "LIMITED LIABILITY PARTNERSHIP",
-        "SOLE PROP": "SOLE PROPRIETORSHIP",
-        "SP": "SOLE PROPRIETORSHIP",
-        "NPO": "NONPROFIT ORGANIZATION",
-        "PC": "PROFESSIONAL CORPORATION",
-        "CO-OP": "COOPERATIVE",
-        "LTD": "LIMITED COMPANY",
-        "JSC": "JOINT STOCK COMPANY",
-        "HOLDCO": "HOLDING COMPANY",
-        "PLC": "PUBLIC LIMITED COMPANY",
-        "PVT LTD": "PRIVATE LIMITED COMPANY",
-        "INC": "INCORPORATED",
-        "ASSOC": "ASSOCIATION",
-        "FDN": "FOUNDATION",
-        "TR": "TRUST",
-        "SOC": "SOCIETY",
-        "CONSORT": "CONSORTIUM",
-        "SYND": "SYNDICATE",
-        "GRP": "GROUP",
-        "CORP SOLE": "CORPORATION SOLE",
-        "JV": "JOINT VENTURE",
-        "SUB": "SUBSIDIARY",
-        "FRANCHISE": "FRANCHISE",
-        "PA": "PROFESSIONAL ASSOCIATION",
-        "CIC": "COMMUNITY INTEREST COMPANY",
-        "PAC": "POLITICAL ACTION COMMITTEE",
-    }
-
     for i in range(len(company_name_split)):
-        if company_name_split[i] in list(company_types.keys()):
+        if company_name_split[i] in list(constants.COMPANY_TYPES.keys()):
             hold = company_name_split[i]
-            company_name_split[i] = company_types[hold]
+            company_name_split[i] = constants.COMPANY_TYPES[hold]
 
     new_company_name = " ".join(company_name_split)
     return new_company_name
+
+
+print(standardize_corp_names("MI BEER WINE WHOLESALERS ASSOCIATION"))

From 5b4de8c3c2c27164cf47df72d3eaa6101335cac1 Mon Sep 17 00:00:00 2001
From: npashilkar <npashilkar@uchicago.edu>
Date: Tue, 30 Jan 2024 21:47:56 -0600
Subject: [PATCH 16/42] updated constants file

---
 utils/linkage.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 34b2579..a26a9fe 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -118,6 +118,3 @@ def standardize_corp_names(company_name: str) -> str:
 
     new_company_name = " ".join(company_name_split)
     return new_company_name
-
-
-print(standardize_corp_names("MI BEER WINE WHOLESALERS ASSOCIATION"))

From e4fe9fc354e4429d17b754c32b083b0eaae6a4c6 Mon Sep 17 00:00:00 2001
From: npashilkar <npashilkar@uchicago.edu>
Date: Tue, 30 Jan 2024 21:53:36 -0600
Subject: [PATCH 17/42] updated constants file

---
 utils/linkage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index a26a9fe..faa8860 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -1,4 +1,4 @@
-import constants
+import utils.constants
 import textdistance as td
 import usaddress
 

From 844d20e5ddccd35514da4ef52fb321677c15e919 Mon Sep 17 00:00:00 2001
From: npashilkar <npashilkar@uchicago.edu>
Date: Tue, 30 Jan 2024 22:17:08 -0600
Subject: [PATCH 18/42] updated constants file

---
 utils/linkage.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index faa8860..9866a9b 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -1,4 +1,6 @@
-import utils.constants
+from utils.constants import (
+    COMPANY_TYPES
+)
 import textdistance as td
 import usaddress
 
@@ -112,9 +114,11 @@ def standardize_corp_names(company_name: str) -> str:
     company_name_split = company_name.upper().split(" ")
 
     for i in range(len(company_name_split)):
-        if company_name_split[i] in list(constants.COMPANY_TYPES.keys()):
+        if company_name_split[i] in list(COMPANY_TYPES.keys()):
             hold = company_name_split[i]
-            company_name_split[i] = constants.COMPANY_TYPES[hold]
+            company_name_split[i] = COMPANY_TYPES[hold]
 
     new_company_name = " ".join(company_name_split)
     return new_company_name
+
+print(standardize_corp_names('MI BEER WINE WHOLESALERS ASSOC'))

From 976fc3ff4608874a9259977b21397027073ecfd6 Mon Sep 17 00:00:00 2001
From: Adil Kassim <adilk@uchicago.edu>
Date: Wed, 31 Jan 2024 14:15:18 +0000
Subject: [PATCH 19/42] updated function

---
 utils/linkage.py | 52 +++++++++++++++++++++++++++++-------------------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 86485d3..1dbf54b 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -1,7 +1,8 @@
 """
 Module for performing record linkage on state campaign finance dataset
 """
-import pandas as pd
+import re
+
 import usaddress
 
 
@@ -50,12 +51,7 @@ def get_street_from_address_line_1(address_line_1: str) -> str:
     return " ".join(string)
 
 
-"""
-Module for standardizing the 'company' columnn of the state campaign finance dataset
-"""
-
-
-def cleaning_company_column(company: str) -> str:
+def cleaning_company_column(company_entry: str) -> str:
     """
     Given a string, check if it contains a variation of self employed, unemployed,
     or retired and return the standardized version.
@@ -72,30 +68,44 @@ def cleaning_company_column(company: str) -> str:
     'Self Employed'
     >>> cleaning_company_column("None")
     'Unemployed'
+    >>> cleaning_company_column("N/A")
+    'Unemployed'
+    >>> cleaning_company_column("nan")
+    'Unemployed'
     """
-    if pd.isnull(company):
-        return company
 
-    company_edited = company.lower()
-    company_edited = company_edited.strip()
-    company_edited = company_edited.replace(".", " ")
-    company_edited = company_edited.replace(",", " ")
-    company_edited = company_edited.replace("-", " ")
+    if not company_entry:
+        return company_entry
 
-    if "retire" in company_edited:
+    company_edited = company_entry.lower()
+
+    if company_edited == "n/a":
+        return "Unemployed"
+
+    company_edited = re.sub(r"[^\w\s]", "", company_edited)
+
+    if (
+        company_edited == "retired"
+        or company_edited == "retiree"
+        or company_edited == "retire"
+        or "retiree" in company_edited
+    ):
         return "Retired"
-    elif "self employe" in company_edited or company_edited == "self":
+
+    elif (
+        "self employe" in company_edited
+        or "freelance" in company_edited
+        or company_edited == "self"
+        or company_edited == "independent contractor"
+    ):
         return "Self Employed"
     elif (
         "unemploye" in company_edited
         or company_edited == "none"
         or company_edited == "not employed"
+        or company_edited == "nan"
     ):
         return "Unemployed"
 
     else:
-        return company
-
-
-# Example implementation of the function standardize_company_column for a dataframe
-# df['standardized_company'] = df['company'].apply(standardize_company_column)
+        return company_edited

From 87ea3da197ea722b5c54f99b7f5cdd29b890060d Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@g002.ds.uchicago.edu>
Date: Wed, 31 Jan 2024 09:02:44 -0600
Subject: [PATCH 20/42] Adding Avery's feedback

---
 utils/linkage.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index e419c6f..2cdd11b 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -53,6 +53,16 @@ def determine_comma_role(name: str) -> str:
         name: a string representing a name/names of individuals
     Returns:
         the name with or without a comma based on some conditions
+
+    Sample Usage:
+    >>> determine_comma_role("Jane Doe, Jr")
+    'Jane Doe, Jr'
+    >>> determine_comma_role("Doe, Jane Elisabeth")
+    ' Jane Elisabeth Doe'
+    >>> determine_comma_role("Jane Doe,")
+    'Jane Doe'
+    >>> determine_comma_role("DOe, Jane")
+    ' Jane Doe'
     """
     suffixes = [
         "sr",
@@ -68,19 +78,19 @@ def determine_comma_role(name: str) -> str:
         "ix",
         "x",
     ]
-    name_parts = name.split(",")
+    name_parts = name.lower().split(",")
     # if the comma is just in the end as a typo:
     if len(name_parts[1]) == 0:
-        return name_parts[0]
+        return name_parts[0].title()
     # if just the suffix in the end, leave the name as it is
     if name_parts[1].strip() in suffixes:
-        return name
+        return name.title()
     # at this point either it's just poor name placement, or the suffix is
     # in the beginning of the name. Either way, the first part of the list is
     # the true last name.
     last_part = name_parts.pop(0)
     first_part = " ".join(name_parts)
-    return first_part + " " + last_part
+    return first_part.title() + " " + last_part.title()
 
 
 def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
@@ -114,6 +124,8 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
     'Jane Elisabeth Doe, Iv'
     >>> get_likely_name("","","Jane Elisabeth Doe, IV")
     'Jane Elisabeth Doe Iv'
+    >>> get_likely_name("Jane","","Doe, Jane, Elisabeth")
+    'Jane Elisabeth Doe'
     """
     # first ensure clean input by deleting spaces:
     first_name, last_name, full_name = list(
@@ -154,10 +166,10 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
     # one last check to remove any pieces that might add extra whitespace
     names = list(filter(lambda x: x != "", names))
     names = " ".join(names)
-    names = names.split(" ")
+    names = names.title().replace("  ", " ").split(" ")
     final_name = []
     [final_name.append(x) for x in names if x not in final_name]
-    return " ".join(final_name).title().strip()
+    return " ".join(final_name).strip()
 
 
 def get_street_from_address_line_1(address_line_1: str) -> str:

From 23a8c1ffca9935aeef5b74341c7562eb1f020fe2 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@g002.ds.uchicago.edu>
Date: Wed, 31 Jan 2024 09:07:26 -0600
Subject: [PATCH 21/42] Adding Avery's feedback

---
 utils/linkage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 2cdd11b..0450fca 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -123,7 +123,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
     >>> get_likely_name("Jane Elisabeth Doe, IV","Elisabeth","Doe, IV")
     'Jane Elisabeth Doe, Iv'
     >>> get_likely_name("","","Jane Elisabeth Doe, IV")
-    'Jane Elisabeth Doe Iv'
+    'Jane Elisabeth Doe, Iv'
     >>> get_likely_name("Jane","","Doe, Jane, Elisabeth")
     'Jane Elisabeth Doe'
     """

From 4081715a2d4b83875c3def1c086f3d9f1b579e78 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@g002.ds.uchicago.edu>
Date: Wed, 31 Jan 2024 09:40:58 -0600
Subject: [PATCH 22/42] saving personal work before merging, no need to look or
 review @Avery @Trevor

---
 notebooks/Test.ipynb | 421 ++++++++++++++++++++++++++++++++++++++++++-
 utils/linkage.py     |  13 ++
 2 files changed, 433 insertions(+), 1 deletion(-)

diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb
index 5df942e..1176ab7 100644
--- a/notebooks/Test.ipynb
+++ b/notebooks/Test.ipynb
@@ -12,6 +12,425 @@
     "\n",
     "save_random_dataframe(Path(\"../output\"), Path(\"test.csv\"))"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def determine_comma_role(name: str) -> str:\n",
+    "    \"\"\"Given a string (someone's name), attempts to determine the role of the\n",
+    "    comma in the name and where it ought to belong.\n",
+    "\n",
+    "    Some assumptions are made:\n",
+    "        * If a suffix is included in the name and the name is not just the last\n",
+    "          name(i.e \"Doe, Jr), the format is\n",
+    "          (last_name suffix, first and middle name) i.e Doe iv, Jane Elisabeth\n",
+    "\n",
+    "        * If a comma is used anywhere else, it is in the format of\n",
+    "          (last_name, first and middle name) i.e Doe, Jane Elisabeth\n",
+    "    Args:\n",
+    "        name: a string representing a name/names of individuals\n",
+    "    Returns:\n",
+    "        the name with or without a comma based on some conditions\n",
+    "    \"\"\"\n",
+    "    suffixes = [\n",
+    "        \"sr\",\n",
+    "        \"jr\",\n",
+    "        \"i\",\n",
+    "        \"ii\",\n",
+    "        \"iii\",\n",
+    "        \"iv\",\n",
+    "        \"v\",\n",
+    "        \"vi\",\n",
+    "        \"vii\",\n",
+    "        \"viii\",\n",
+    "        \"ix\",\n",
+    "        \"x\",\n",
+    "    ]\n",
+    "    name_parts = name.lower().split(\",\")\n",
+    "    # if the comma is just in the end as a typo:\n",
+    "    if len(name_parts[1]) == 0:\n",
+    "        return name_parts[0].title()\n",
+    "    # if just the suffix in the end, leave the name as it is\n",
+    "    if name_parts[1].strip() in suffixes:\n",
+    "        return name.title()\n",
+    "    # at this point either it's just poor name placement, or the suffix is\n",
+    "    # in the beginning of the name. Either way, the first part of the list is\n",
+    "    # the true last name.\n",
+    "    last_part = name_parts.pop(0)\n",
+    "    first_part = \" \".join(name_parts)\n",
+    "    return first_part.title() + \" \" + last_part.title()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "' Jane  Jr Doe'"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "determine_comma_role(\"DOe, Jane, Jr\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:\n",
+    "    \"\"\"Given name related columns, return a person's likely name\n",
+    "\n",
+    "    Given different formatting used accross states, errors in data entry\n",
+    "    and missing data, it can be difficult to determine someone's actual\n",
+    "    name. For example, some states have a last name column with values like\n",
+    "    \"Doe, Jane\", where the person's first name appears to have been erroneously\n",
+    "    included.\n",
+    "\n",
+    "    Args:\n",
+    "        first_name: raw value of first name column\n",
+    "        last_name: raw value last name column\n",
+    "        full_name: raw value of name or full_name column\n",
+    "    Returns:\n",
+    "        The most likely full name of the person listed\n",
+    "\n",
+    "    Sample Usage:\n",
+    "    >>> get_likely_name(\"Jane\", \"Doe\", \"\")\n",
+    "    'Jane Doe'\n",
+    "    >>> get_likely_name(\"\", \"\", \"Jane Doe\")\n",
+    "    'Jane Doe'\n",
+    "    >>> get_likely_name(\"\", \"Doe, Jane\", \"\")\n",
+    "    'Jane Doe'\n",
+    "    >>> get_likely_name(\"Jane Doe\", \"Doe\", \"Jane Doe\")\n",
+    "    'Jane Doe'\n",
+    "    >>> get_likely_name(\"Jane\",\"\",\"Doe, Sr\")\n",
+    "    'Jane Doe, Sr'\n",
+    "    >>> get_likely_name(\"Jane Elisabeth Doe, IV\",\"Elisabeth\",\"Doe, IV\")\n",
+    "    'Jane Elisabeth Doe, Iv'\n",
+    "    >>> get_likely_name(\"\",\"\",\"Jane Elisabeth Doe, IV\")\n",
+    "    'Jane Elisabeth Doe Iv'\n",
+    "    \"\"\"\n",
+    "    # first ensure clean input by deleting spaces:\n",
+    "    first_name, last_name, full_name = list(\n",
+    "        map(lambda x: x.lower().strip(), [first_name, last_name, full_name])\n",
+    "    )\n",
+    "\n",
+    "    # if data is clean:\n",
+    "    if first_name + \" \" + last_name == full_name:\n",
+    "        return full_name\n",
+    "\n",
+    "    # some names have titles or professions associated with the name. We need to\n",
+    "    # remove those from the name.\n",
+    "    titles = [\n",
+    "        \"mr\",\n",
+    "        \"ms\",\n",
+    "        \"mrs\",\n",
+    "        \"miss\",\n",
+    "        \"prof\",\n",
+    "        \"dr\",\n",
+    "        \"doctor\",\n",
+    "        \"sir\",\n",
+    "        \"madam\",\n",
+    "        \"professor\",\n",
+    "    ]\n",
+    "    names = [first_name, last_name, full_name]\n",
+    "\n",
+    "    for i in range(len(names)):\n",
+    "        # if there is a ',' deal with it accordingly\n",
+    "        if \",\" in names[i]:\n",
+    "            names[i] = determine_comma_role(names[i])\n",
+    "\n",
+    "        names[i] = names[i].replace(\".\", \"\").split(\" \")\n",
+    "        names[i] = [\n",
+    "            name_part for name_part in names[i] if name_part not in titles\n",
+    "        ]\n",
+    "        names[i] = \" \".join(names[i])\n",
+    "\n",
+    "    # one last check to remove any pieces that might add extra whitespace\n",
+    "    names = list(filter(lambda x: x != \"\", names))\n",
+    "    names = \" \".join(names)\n",
+    "    names = names.title().replace(\"  \",\" \").split(\" \")\n",
+    "    final_name = []\n",
+    "    [final_name.append(x) for x in names if x not in final_name]\n",
+    "    return \" \".join(final_name).strip()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_4143866/1500712151.py:2: DtypeWarning: Columns (7,8) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  ind_df = pd.read_csv(\"../output/complete_individuals_table.csv\")\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Unnamed: 0</th>\n",
+       "      <th>id</th>\n",
+       "      <th>first_name</th>\n",
+       "      <th>last_name</th>\n",
+       "      <th>full_name</th>\n",
+       "      <th>entity_type</th>\n",
+       "      <th>state</th>\n",
+       "      <th>party</th>\n",
+       "      <th>company</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>1869727</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>william  \bstoner</td>\n",
+       "      <td>individual</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1779679</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>rm  coulon</td>\n",
+       "      <td>individual</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>area agency on aging</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>2277221</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>james  engelson</td>\n",
+       "      <td>individual</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>retired</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3</td>\n",
+       "      <td>2277156</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>marivic  franciaskinner</td>\n",
+       "      <td>individual</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>fibre source international corp</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4</td>\n",
+       "      <td>2341373</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>anthony  grindle</td>\n",
+       "      <td>individual</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>zimmerbiomet</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2505341</th>\n",
+       "      <td>861260</td>\n",
+       "      <td>6acfa74b-d5e1-4afd-b020-dbe429eb1c3f</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Melissa Hart</td>\n",
+       "      <td>Candidate</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>REP</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2505342</th>\n",
+       "      <td>861271</td>\n",
+       "      <td>f111045d-bc3d-4050-9ad7-b3b1e6d72e56</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Heather Miller</td>\n",
+       "      <td>Candidate</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>DEM</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2505343</th>\n",
+       "      <td>861277</td>\n",
+       "      <td>d40859d7-b523-4ef5-895b-c3a947ab582f</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Christopher M. Gebhard</td>\n",
+       "      <td>Candidate</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>REP</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2505344</th>\n",
+       "      <td>861775</td>\n",
+       "      <td>f5d76d43-86f4-40f9-aeb9-3df97ca8cdf0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>April Weaver</td>\n",
+       "      <td>Candidate</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>REP</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2505345</th>\n",
+       "      <td>861920</td>\n",
+       "      <td>1a0cf90d-3252-4c8d-b109-dea084a01f69</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Krista Paolucci</td>\n",
+       "      <td>Candidate</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>REP</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>2505346 rows × 9 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         Unnamed: 0                                    id first_name  \\\n",
+       "0                 0                               1869727        NaN   \n",
+       "1                 1                               1779679        NaN   \n",
+       "2                 2                               2277221        NaN   \n",
+       "3                 3                               2277156        NaN   \n",
+       "4                 4                               2341373        NaN   \n",
+       "...             ...                                   ...        ...   \n",
+       "2505341      861260  6acfa74b-d5e1-4afd-b020-dbe429eb1c3f        NaN   \n",
+       "2505342      861271  f111045d-bc3d-4050-9ad7-b3b1e6d72e56        NaN   \n",
+       "2505343      861277  d40859d7-b523-4ef5-895b-c3a947ab582f        NaN   \n",
+       "2505344      861775  f5d76d43-86f4-40f9-aeb9-3df97ca8cdf0        NaN   \n",
+       "2505345      861920  1a0cf90d-3252-4c8d-b109-dea084a01f69        NaN   \n",
+       "\n",
+       "        last_name                full_name entity_type state party  \\\n",
+       "0             NaN         william  \bstoner  individual   NaN   NaN   \n",
+       "1             NaN               rm  coulon  individual   NaN   NaN   \n",
+       "2             NaN          james  engelson  individual   NaN   NaN   \n",
+       "3             NaN  marivic  franciaskinner  individual   NaN   NaN   \n",
+       "4             NaN         anthony  grindle  individual   NaN   NaN   \n",
+       "...           ...                      ...         ...   ...   ...   \n",
+       "2505341       NaN             Melissa Hart   Candidate    PA   REP   \n",
+       "2505342       NaN           Heather Miller   Candidate    PA   DEM   \n",
+       "2505343       NaN   Christopher M. Gebhard   Candidate    PA   REP   \n",
+       "2505344       NaN             April Weaver   Candidate    PA   REP   \n",
+       "2505345       NaN          Krista Paolucci   Candidate    PA   REP   \n",
+       "\n",
+       "                                 company  \n",
+       "0                                    NaN  \n",
+       "1                   area agency on aging  \n",
+       "2                                retired  \n",
+       "3        fibre source international corp  \n",
+       "4                           zimmerbiomet  \n",
+       "...                                  ...  \n",
+       "2505341                              NaN  \n",
+       "2505342                              NaN  \n",
+       "2505343                              NaN  \n",
+       "2505344                              NaN  \n",
+       "2505345                              NaN  \n",
+       "\n",
+       "[2505346 rows x 9 columns]"
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "ind_df = pd.read_csv(\"../output/complete_individuals_table.csv\")\n",
+    "ind_df.sample(1000)\n",
+    "ind_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Doe, Jr, Jane'"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "determine_comma_role(\"Doe, Jr, Jane\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -30,7 +449,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.5"
+   "version": "3.11.7"
   },
   "orig_nbformat": 4
  },
diff --git a/utils/linkage.py b/utils/linkage.py
index 0450fca..f501897 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -3,6 +3,7 @@
 """
 import textdistance as td
 import usaddress
+import pandas as pd
 
 
 def calculate_string_similarity(string1: str, string2: str) -> float:
@@ -215,3 +216,15 @@ def get_street_from_address_line_1(address_line_1: str) -> str:
             string.append(key)
 
     return " ".join(string)
+
+def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
+    '''Given a dataframe, remove rows that have identical entry data beyond
+    UUIDs, and output a file mapping an entry to other the UUIDs of the
+    deduplicated rows
+    
+    Args:
+        a pandas dataframe containing contribution data
+    Returns:
+        a deduplicated pandas dataframe containing contribution data
+    '''
+    pass
\ No newline at end of file

From 3fcbc5b6539edc5fdf1102c9ec9d3727552c57ee Mon Sep 17 00:00:00 2001
From: npashilkar <npashilkar@uchicago.edu>
Date: Wed, 31 Jan 2024 09:51:34 -0600
Subject: [PATCH 23/42] precommit checks

---
 utils/linkage.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 9866a9b..5788eb0 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -1,9 +1,8 @@
-from utils.constants import (
-    COMPANY_TYPES
-)
 import textdistance as td
 import usaddress
 
+from utils.constants import COMPANY_TYPES
+
 """
 Module for performing record linkage on state campaign finance dataset
 """
@@ -121,4 +120,5 @@ def standardize_corp_names(company_name: str) -> str:
     new_company_name = " ".join(company_name_split)
     return new_company_name
 
-print(standardize_corp_names('MI BEER WINE WHOLESALERS ASSOC'))
+
+print(standardize_corp_names("MI BEER WINE WHOLESALERS ASSOC"))

From f07dae2a96ebc9ed00d7056721361d3684165b5c Mon Sep 17 00:00:00 2001
From: npashilkar <npashilkar@uchicago.edu>
Date: Wed, 31 Jan 2024 10:23:53 -0600
Subject: [PATCH 24/42] get address number from line 1 function

---
 utils/linkage.py | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index a96b816..1333024 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -165,4 +165,32 @@ def standardize_corp_names(company_name: str) -> str:
     return new_company_name
 
 
-print(standardize_corp_names("MI BEER WINE WHOLESALERS ASSOC"))
+def get_address_number_from_address_line_1(address_line_1: str) -> str:
+    """Given an address line 1, return the building number or po box
+
+    Args:
+        address_line_1: either street information or PO box
+    Returns:
+        address or po box number
+
+    Sample Usage:
+    >>> get_building_from_address_line_1('6727 W. Corrine Dr.  Peoria,AZ 85381')
+    '6727'
+    >>> get_building_from_address_line_1('P.O. Box 5456  Sun City West ,AZ 85375')
+    'P.O. Box 5456'
+    >>> get_building_from_address_line_1('119 S 5th St  Niles,MI 49120')
+    '119'
+    >>> get_building_from_address_line_1(
+    ...     '1415 PARKER STREET APT 251	DETROIT	MI	48214-0000'
+    ... )
+    '1415'
+    """
+
+    address_line_1_components = usaddress.parse(address_line_1)
+
+    for i in range(len(address_line_1_components)):
+        if address_line_1_components[i][1] == "AddressNumber":
+            return address_line_1_components[i][0]
+        elif address_line_1_components[i][1] == "USPSBoxID":
+            return address_line_1_components[i][0]
+    raise ValueError("Can not find Address Number")

From 8849f462925bbc3064f5f5539513cb16cf7c20b7 Mon Sep 17 00:00:00 2001
From: npashilkar <npashilkar@uchicago.edu>
Date: Wed, 31 Jan 2024 10:29:06 -0600
Subject: [PATCH 25/42] get address number from line 1 function

---
 utils/linkage.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 1333024..379e6d4 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -174,13 +174,13 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str:
         address or po box number
 
     Sample Usage:
-    >>> get_building_from_address_line_1('6727 W. Corrine Dr.  Peoria,AZ 85381')
+    >>> get_address_number_from_address_line_1('6727 W. Corrine Dr.  Peoria,AZ 85381')
     '6727'
-    >>> get_building_from_address_line_1('P.O. Box 5456  Sun City West ,AZ 85375')
+    >>> get_address_number_from_address_line_1('P.O. Box 5456  Sun City West ,AZ 85375')
     'P.O. Box 5456'
-    >>> get_building_from_address_line_1('119 S 5th St  Niles,MI 49120')
+    >>> get_address_number_from_address_line_1('119 S 5th St  Niles,MI 49120')
     '119'
-    >>> get_building_from_address_line_1(
+    >>> get_address_number_from_address_line_1(
     ...     '1415 PARKER STREET APT 251	DETROIT	MI	48214-0000'
     ... )
     '1415'

From d0086ef22db122a6e8bd6add3f7e2fdfcc9fb221 Mon Sep 17 00:00:00 2001
From: npashilkar <npashilkar@uchicago.edu>
Date: Wed, 31 Jan 2024 11:07:37 -0600
Subject: [PATCH 26/42] get address number from line 1 function

---
 utils/linkage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 379e6d4..ac11a5a 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -177,7 +177,7 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str:
     >>> get_address_number_from_address_line_1('6727 W. Corrine Dr.  Peoria,AZ 85381')
     '6727'
     >>> get_address_number_from_address_line_1('P.O. Box 5456  Sun City West ,AZ 85375')
-    'P.O. Box 5456'
+    '5456'
     >>> get_address_number_from_address_line_1('119 S 5th St  Niles,MI 49120')
     '119'
     >>> get_address_number_from_address_line_1(

From 5f65159fbe7d8752755e814878486d8f50697b48 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@fe01>
Date: Wed, 31 Jan 2024 23:48:08 -0600
Subject: [PATCH 27/42] attempt so far at dedup

---
 utils/linkage.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 6e8e6a5..f8ea7bb 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -271,4 +271,13 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
     Returns:
         a deduplicated pandas dataframe containing contribution data
     '''
-    pass
\ No newline at end of file
+    #first remove all duplicate entries:
+    new_df = df.drop_duplicates()
+
+    # now find the duplicates along all columns but the ID
+    cols = new_df.columns[1:]
+    duplicates = new_df[new_df.duplicated(cols)]        
+    new_df = new_df.drop(index=duplicates.index.tolist())
+    #for index in duplicates.index:
+
+    return new_df
\ No newline at end of file

From 71a3174aabda2137f4980cb8df7952374f3ca7a5 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@fe01>
Date: Thu, 1 Feb 2024 00:12:47 -0600
Subject: [PATCH 28/42] attempt so far at dedup

---
 utils/linkage.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index f8ea7bb..bc2f062 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -1,9 +1,9 @@
 """
 Module for performing record linkage on state campaign finance dataset
 """
+import pandas as pd
 import textdistance as td
 import usaddress
-import pandas as pd
 
 
 def get_address_line_1_from_full_address(address: str) -> str:
@@ -261,23 +261,24 @@ def get_street_from_address_line_1(address_line_1: str) -> str:
 
     return " ".join(string)
 
+
 def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
-    '''Given a dataframe, remove rows that have identical entry data beyond
+    """Given a dataframe, remove rows that have identical entry data beyond
     UUIDs, and output a file mapping an entry to other the UUIDs of the
     deduplicated rows
-    
+
     Args:
         a pandas dataframe containing contribution data
     Returns:
         a deduplicated pandas dataframe containing contribution data
-    '''
-    #first remove all duplicate entries:
+    """
+    # first remove all duplicate entries:
     new_df = df.drop_duplicates()
 
     # now find the duplicates along all columns but the ID
     cols = new_df.columns[1:]
-    duplicates = new_df[new_df.duplicated(cols)]        
+    duplicates = new_df[new_df.duplicated(cols)]
     new_df = new_df.drop(index=duplicates.index.tolist())
-    #for index in duplicates.index:
+    # for index in duplicates.index:
 
-    return new_df
\ No newline at end of file
+    return new_df

From 56cde5f003a2e3a49817e3c04e2305252110ef96 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@fe01>
Date: Thu, 1 Feb 2024 00:13:15 -0600
Subject: [PATCH 29/42] attempt so far at dedup

---
 utils/linkage.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index bc2f062..25e110d 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -279,6 +279,5 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
     cols = new_df.columns[1:]
     duplicates = new_df[new_df.duplicated(cols)]
     new_df = new_df.drop(index=duplicates.index.tolist())
-    # for index in duplicates.index:
 
     return new_df

From 161a175c8f31bf79fea702d7b7497cb33218bd0b Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@fe01>
Date: Thu, 1 Feb 2024 01:54:17 -0600
Subject: [PATCH 30/42] updates on linkage doc, ignore notebooks/Test.ipynb

---
 notebooks/Test.ipynb | 287 +++++++++++++++++++++++++++++++++++++------
 utils/linkage.py     |  27 +++-
 2 files changed, 276 insertions(+), 38 deletions(-)

diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb
index cf4679f..e4cac62 100644
--- a/notebooks/Test.ipynb
+++ b/notebooks/Test.ipynb
@@ -191,7 +191,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 45,
    "metadata": {},
    "outputs": [
     {
@@ -403,7 +403,7 @@
        "18                                            Paa Pac    PA  Organization  "
       ]
      },
-     "execution_count": 5,
+     "execution_count": 45,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -433,10 +433,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 46,
    "metadata": {},
    "outputs": [],
    "source": [
+    "from utils.constants import repo_root\n",
+    "def convert_duplicates_to_dict(df: pd.DataFrame)->pd.DataFrame:\n",
+    "    '''Takes a dataframe whose indexes are UUIDs, and a column that is a list of\n",
+    "    all other UUIDs that have duplicate values. The function then outputs a\n",
+    "    dictionary file where the deduped UUIDs map to the dataframe main UUID\n",
+    "    \n",
+    "    Args:\n",
+    "        A pandas dataframe with UUIDs as indexes and deduplicated UUIDs\n",
+    "        matching up to the index in the same row\n",
+    "        \n",
+    "    Returns\n",
+    "        None. However it outputs a dictionary\n",
+    "    '''\n",
+    "    #for index in df.index:\n",
+    "        \n",
+    "    #entities.to_csv(repo_root / \"output\" / \"deduplicated_UUIDs.csv\", index=False)\n",
+    "    pass\n",
+    "\n",
+    "\n",
     "def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:\n",
     "    '''Given a dataframe, remove rows that have identical entry data beyond\n",
     "    UUIDs, and output a file mapping an entry to other the UUIDs of the\n",
@@ -451,17 +470,20 @@
     "    new_df = df.drop_duplicates()\n",
     "\n",
     "    # now find the duplicates along all columns but the ID\n",
-    "    cols = new_df.columns[1:]\n",
-    "    duplicates = new_df[new_df.duplicated(cols)]        \n",
-    "    new_df = new_df.drop(index=duplicates.index.tolist())\n",
-    "    #for index in duplicates.index:\n",
+    "    new_df=new_df.groupby(df.columns[1:].tolist())[\"id\"].agg(list).reset_index().rename(columns={\"id\": \"duplicated\"})\n",
+    "    new_df.index=new_df[\"duplicated\"].str[0].tolist()\n",
+    "    new_df[\"duplicated\"]=new_df[\"duplicated\"].str[1:]\n",
     "\n",
+    "    # now convert the duplicated column into a dictionary that can will be\n",
+    "    # an output\n",
+    "    convert_duplicates_to_dict(new_df[['duplicated']])\n",
+    "    #new_df = new_df.drop(['duplicated'], axis=1)\n",
     "    return new_df"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 47,
    "metadata": {},
    "outputs": [
     {
@@ -485,63 +507,151 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>id</th>\n",
        "      <th>name</th>\n",
        "      <th>state</th>\n",
        "      <th>entity_type</th>\n",
+       "      <th>duplicated</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>16</th>\n",
-       "      <td>1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe</td>\n",
-       "      <td>MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC</td>\n",
+       "      <th>d31df1ca-714e-4a82-9e88-1892c0451a71</th>\n",
+       "      <td>COMMITTEE TO ELECT DR PATRICIA BERNARD</td>\n",
        "      <td>MI</td>\n",
        "      <td>committee</td>\n",
+       "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>17</th>\n",
-       "      <td>1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff</td>\n",
+       "      <th>910c4d36-b036-469e-aa2a-ea4ff8855a6c</th>\n",
+       "      <td>Citizens For Kail</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>Organization</td>\n",
+       "      <td>[]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd</th>\n",
        "      <td>MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC</td>\n",
        "      <td>MI</td>\n",
        "      <td>committee</td>\n",
+       "      <td>[1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>18</th>\n",
-       "      <td>1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd</td>\n",
+       "      <th>c875d7de-94be-42f1-b994-dd89b114d51e</th>\n",
+       "      <td>Pa Fraternal Order Of Police Pac</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>Organization</td>\n",
+       "      <td>[]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>60d454d1-3773-4d88-80e9-132c161da0f0</th>\n",
        "      <td>Paa Pac</td>\n",
        "      <td>PA</td>\n",
        "      <td>Organization</td>\n",
+       "      <td>[1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>f71341d7-d27e-47eb-9b66-903af39d6cb5</th>\n",
+       "      <td>Pabar Pac (Pa Bar Assn)</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>Organization</td>\n",
+       "      <td>[]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50c7d9a1-b448-46a5-8e2d-cd15b3097360</th>\n",
+       "      <td>REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...</td>\n",
+       "      <td>MI</td>\n",
+       "      <td>committee</td>\n",
+       "      <td>[]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>62ea1e9c-ac12-400c-b3dc-519389c0f7d3</th>\n",
+       "      <td>UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...</td>\n",
+       "      <td>MI</td>\n",
+       "      <td>committee</td>\n",
+       "      <td>[]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4db76e6e-f0d5-40eb-82de-6dbcdb562dd7</th>\n",
+       "      <td>Ugi Utilities Inc/Ugi Energy Services Llc Pac</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>Organization</td>\n",
+       "      <td>[]</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                      id  \\\n",
-       "16  1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe   \n",
-       "17  1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff   \n",
-       "18  1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd   \n",
+       "                                                                                   name  \\\n",
+       "d31df1ca-714e-4a82-9e88-1892c0451a71             COMMITTEE TO ELECT DR PATRICIA BERNARD   \n",
+       "910c4d36-b036-469e-aa2a-ea4ff8855a6c                                  Citizens For Kail   \n",
+       "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd     MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC   \n",
+       "c875d7de-94be-42f1-b994-dd89b114d51e                   Pa Fraternal Order Of Police Pac   \n",
+       "60d454d1-3773-4d88-80e9-132c161da0f0                                            Paa Pac   \n",
+       "f71341d7-d27e-47eb-9b66-903af39d6cb5                            Pabar Pac (Pa Bar Assn)   \n",
+       "50c7d9a1-b448-46a5-8e2d-cd15b3097360  REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...   \n",
+       "62ea1e9c-ac12-400c-b3dc-519389c0f7d3  UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...   \n",
+       "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7      Ugi Utilities Inc/Ugi Energy Services Llc Pac   \n",
+       "\n",
+       "                                     state   entity_type  \\\n",
+       "d31df1ca-714e-4a82-9e88-1892c0451a71    MI     committee   \n",
+       "910c4d36-b036-469e-aa2a-ea4ff8855a6c    PA  Organization   \n",
+       "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd    MI     committee   \n",
+       "c875d7de-94be-42f1-b994-dd89b114d51e    PA  Organization   \n",
+       "60d454d1-3773-4d88-80e9-132c161da0f0    PA  Organization   \n",
+       "f71341d7-d27e-47eb-9b66-903af39d6cb5    PA  Organization   \n",
+       "50c7d9a1-b448-46a5-8e2d-cd15b3097360    MI     committee   \n",
+       "62ea1e9c-ac12-400c-b3dc-519389c0f7d3    MI     committee   \n",
+       "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7    PA  Organization   \n",
        "\n",
-       "                                              name state   entity_type  \n",
-       "16  MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC    MI     committee  \n",
-       "17  MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC    MI     committee  \n",
-       "18                                         Paa Pac    PA  Organization  "
+       "                                                                             duplicated  \n",
+       "d31df1ca-714e-4a82-9e88-1892c0451a71                                                 []  \n",
+       "910c4d36-b036-469e-aa2a-ea4ff8855a6c                                                 []  \n",
+       "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd  [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc...  \n",
+       "c875d7de-94be-42f1-b994-dd89b114d51e                                                 []  \n",
+       "60d454d1-3773-4d88-80e9-132c161da0f0             [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd]  \n",
+       "f71341d7-d27e-47eb-9b66-903af39d6cb5                                                 []  \n",
+       "50c7d9a1-b448-46a5-8e2d-cd15b3097360                                                 []  \n",
+       "62ea1e9c-ac12-400c-b3dc-519389c0f7d3                                                 []  \n",
+       "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7                                                 []  "
       ]
      },
-     "execution_count": 7,
+     "execution_count": 47,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "x = deduplicate_perfect_matches(sample_df)\n",
-    "for i in range(len(x)):\n",
-    "    curr_row = x.loc[i]\n",
-    "    sample_df.loc[(sample_df.name == 'MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC') &\n",
-    "#              (sample_df.state == 'MI') &\n",
-    "#               (sample_df.entity_type == 'committee')]\n",
-    "x\n"
+    "x#[['duplicated']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[]\n",
+      "[]\n",
+      "['1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe', '1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff']\n",
+      "[]\n",
+      "['1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd']\n",
+      "[]\n",
+      "[]\n",
+      "[]\n",
+      "[]\n"
+     ]
+    }
+   ],
+   "source": [
+    "y = x[['duplicated']]\n",
+    "for i in range(len(y)):\n",
+    "    #print(y.iloc[i]['duplicated'])\n",
+    "    print(y.iloc[i]['duplicated'])"
    ]
   },
   {
@@ -637,21 +747,128 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 41,
    "metadata": {},
    "outputs": [
     {
      "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>state</th>\n",
+       "      <th>entity_type</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>d31df1ca-714e-4a82-9e88-1892c0451a71</th>\n",
+       "      <td>COMMITTEE TO ELECT DR PATRICIA BERNARD</td>\n",
+       "      <td>MI</td>\n",
+       "      <td>committee</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>910c4d36-b036-469e-aa2a-ea4ff8855a6c</th>\n",
+       "      <td>Citizens For Kail</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>Organization</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd</th>\n",
+       "      <td>MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC</td>\n",
+       "      <td>MI</td>\n",
+       "      <td>committee</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>c875d7de-94be-42f1-b994-dd89b114d51e</th>\n",
+       "      <td>Pa Fraternal Order Of Police Pac</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>Organization</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>60d454d1-3773-4d88-80e9-132c161da0f0</th>\n",
+       "      <td>Paa Pac</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>Organization</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>f71341d7-d27e-47eb-9b66-903af39d6cb5</th>\n",
+       "      <td>Pabar Pac (Pa Bar Assn)</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>Organization</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50c7d9a1-b448-46a5-8e2d-cd15b3097360</th>\n",
+       "      <td>REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...</td>\n",
+       "      <td>MI</td>\n",
+       "      <td>committee</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>62ea1e9c-ac12-400c-b3dc-519389c0f7d3</th>\n",
+       "      <td>UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...</td>\n",
+       "      <td>MI</td>\n",
+       "      <td>committee</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4db76e6e-f0d5-40eb-82de-6dbcdb562dd7</th>\n",
+       "      <td>Ugi Utilities Inc/Ugi Energy Services Llc Pac</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>Organization</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
       "text/plain": [
-       "['name', 'state', 'entity_type']"
+       "                                                                                   name  \\\n",
+       "d31df1ca-714e-4a82-9e88-1892c0451a71             COMMITTEE TO ELECT DR PATRICIA BERNARD   \n",
+       "910c4d36-b036-469e-aa2a-ea4ff8855a6c                                  Citizens For Kail   \n",
+       "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd     MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC   \n",
+       "c875d7de-94be-42f1-b994-dd89b114d51e                   Pa Fraternal Order Of Police Pac   \n",
+       "60d454d1-3773-4d88-80e9-132c161da0f0                                            Paa Pac   \n",
+       "f71341d7-d27e-47eb-9b66-903af39d6cb5                            Pabar Pac (Pa Bar Assn)   \n",
+       "50c7d9a1-b448-46a5-8e2d-cd15b3097360  REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...   \n",
+       "62ea1e9c-ac12-400c-b3dc-519389c0f7d3  UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...   \n",
+       "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7      Ugi Utilities Inc/Ugi Energy Services Llc Pac   \n",
+       "\n",
+       "                                     state   entity_type  \n",
+       "d31df1ca-714e-4a82-9e88-1892c0451a71    MI     committee  \n",
+       "910c4d36-b036-469e-aa2a-ea4ff8855a6c    PA  Organization  \n",
+       "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd    MI     committee  \n",
+       "c875d7de-94be-42f1-b994-dd89b114d51e    PA  Organization  \n",
+       "60d454d1-3773-4d88-80e9-132c161da0f0    PA  Organization  \n",
+       "f71341d7-d27e-47eb-9b66-903af39d6cb5    PA  Organization  \n",
+       "50c7d9a1-b448-46a5-8e2d-cd15b3097360    MI     committee  \n",
+       "62ea1e9c-ac12-400c-b3dc-519389c0f7d3    MI     committee  \n",
+       "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7    PA  Organization  "
       ]
      },
-     "execution_count": 19,
+     "execution_count": 41,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
-   "source": []
+   "source": [
+    "#for index in x.index:\n",
+    "#    print(index)\n",
+    "x"
+   ]
   },
   {
    "cell_type": "code",
diff --git a/utils/linkage.py b/utils/linkage.py
index 6ee0de0..01d05de 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -270,6 +270,22 @@ def get_street_from_address_line_1(address_line_1: str) -> str:
     return " ".join(string)
 
 
+def convert_duplicates_to_dict(df: pd.DataFrame) -> pd.DataFrame:
+    """Takes a dataframe whose indexes are UUIDs, and a column that is a list of
+    all other UUIDs that have duplicate values. The function then outputs a
+    dictionary file where the deduped UUIDs map to the dataframe main UUID
+
+    Args:
+        A pandas dataframe with UUIDs as indexes and deduplicated UUIDs
+        matching up to the index in the same row
+
+    Returns
+        None. However it outputs a dictionary
+    """
+    # df.to_csv(repo_root / "output" / "deduplicated_UUIDs.csv", index=False)
+    pass
+
+
 def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
     """Given a dataframe, remove rows that have identical entry data beyond
     UUIDs, and output a file mapping an entry to other the UUIDs of the
@@ -284,9 +300,14 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
     new_df = df.drop_duplicates()
 
     # now find the duplicates along all columns but the ID
-    cols = new_df.columns[1:]
-    duplicates = new_df[new_df.duplicated(cols)]
-    new_df = new_df.drop(index=duplicates.index.tolist())
+    new_df = (
+        new_df.groupby(df.columns[1:].tolist())["id"]
+        .agg(list)
+        .reset_index()
+        .rename(columns={"id": "duplicated"})
+    )
+    new_df.index = new_df["duplicated"].str[0].tolist()
+    new_df["duplicated"] = new_df["duplicated"].str[1:]
 
     return new_df
 

From b519fa164babf8498930abcddfcc0aa4abd8f135 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@fe01>
Date: Thu, 1 Feb 2024 09:22:40 -0600
Subject: [PATCH 31/42] modifications to dedup function, not yet done, no need
 to review yet

---
 notebooks/Test.ipynb | 195 +++++++++++++++++++++++++------------------
 utils/linkage.py     |   9 ++
 2 files changed, 124 insertions(+), 80 deletions(-)

diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb
index e4cac62..bc73185 100644
--- a/notebooks/Test.ipynb
+++ b/notebooks/Test.ipynb
@@ -191,7 +191,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [
     {
@@ -403,7 +403,7 @@
        "18                                            Paa Pac    PA  Organization  "
       ]
      },
-     "execution_count": 45,
+     "execution_count": 28,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -433,7 +433,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -475,15 +475,16 @@
     "    new_df[\"duplicated\"]=new_df[\"duplicated\"].str[1:]\n",
     "\n",
     "    # now convert the duplicated column into a dictionary that can will be\n",
-    "    # an output\n",
-    "    convert_duplicates_to_dict(new_df[['duplicated']])\n",
-    "    #new_df = new_df.drop(['duplicated'], axis=1)\n",
+    "    # an output by only feeding the entries with duplicates\n",
+    "    new_df = new_df.reset_index().rename(columns = {'index':'id'})\n",
+    "    convert_duplicates_to_dict(new_df[new_df['duplicated'].apply(lambda x: len(x))>0][['id','duplicated']])\n",
+    "    new_df = new_df.drop(['duplicated'], axis=1)\n",
     "    return new_df"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [
     {
@@ -507,74 +508,120 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>name</th>\n",
-       "      <th>state</th>\n",
-       "      <th>entity_type</th>\n",
+       "      <th>id</th>\n",
        "      <th>duplicated</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>d31df1ca-714e-4a82-9e88-1892c0451a71</th>\n",
-       "      <td>COMMITTEE TO ELECT DR PATRICIA BERNARD</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
+       "      <th>2</th>\n",
+       "      <td>1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd</td>\n",
+       "      <td>[1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>60d454d1-3773-4d88-80e9-132c161da0f0</td>\n",
+       "      <td>[1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd]</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                     id  \\\n",
+       "2  1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd   \n",
+       "4  60d454d1-3773-4d88-80e9-132c161da0f0   \n",
+       "\n",
+       "                                          duplicated  \n",
+       "2  [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc...  \n",
+       "4             [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd]  "
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "x = deduplicate_perfect_matches(sample_df)\n",
+    "#len(x.iloc[2]['duplicated'])\n",
+    "x[x['duplicated'].apply(lambda x: len(x)) > 0][['id','duplicated']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>duplicated</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>d31df1ca-714e-4a82-9e88-1892c0451a71</td>\n",
        "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>910c4d36-b036-469e-aa2a-ea4ff8855a6c</th>\n",
-       "      <td>Citizens For Kail</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
+       "      <th>1</th>\n",
+       "      <td>910c4d36-b036-469e-aa2a-ea4ff8855a6c</td>\n",
        "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd</th>\n",
-       "      <td>MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
+       "      <th>2</th>\n",
+       "      <td>1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd</td>\n",
        "      <td>[1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>c875d7de-94be-42f1-b994-dd89b114d51e</th>\n",
-       "      <td>Pa Fraternal Order Of Police Pac</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
+       "      <th>3</th>\n",
+       "      <td>c875d7de-94be-42f1-b994-dd89b114d51e</td>\n",
        "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>60d454d1-3773-4d88-80e9-132c161da0f0</th>\n",
-       "      <td>Paa Pac</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
+       "      <th>4</th>\n",
+       "      <td>60d454d1-3773-4d88-80e9-132c161da0f0</td>\n",
        "      <td>[1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>f71341d7-d27e-47eb-9b66-903af39d6cb5</th>\n",
-       "      <td>Pabar Pac (Pa Bar Assn)</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
+       "      <th>5</th>\n",
+       "      <td>f71341d7-d27e-47eb-9b66-903af39d6cb5</td>\n",
        "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>50c7d9a1-b448-46a5-8e2d-cd15b3097360</th>\n",
-       "      <td>REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
+       "      <th>6</th>\n",
+       "      <td>50c7d9a1-b448-46a5-8e2d-cd15b3097360</td>\n",
        "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>62ea1e9c-ac12-400c-b3dc-519389c0f7d3</th>\n",
-       "      <td>UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
+       "      <th>7</th>\n",
+       "      <td>62ea1e9c-ac12-400c-b3dc-519389c0f7d3</td>\n",
        "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>4db76e6e-f0d5-40eb-82de-6dbcdb562dd7</th>\n",
-       "      <td>Ugi Utilities Inc/Ugi Energy Services Llc Pac</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
+       "      <th>8</th>\n",
+       "      <td>4db76e6e-f0d5-40eb-82de-6dbcdb562dd7</td>\n",
        "      <td>[]</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -582,48 +629,36 @@
        "</div>"
       ],
       "text/plain": [
-       "                                                                                   name  \\\n",
-       "d31df1ca-714e-4a82-9e88-1892c0451a71             COMMITTEE TO ELECT DR PATRICIA BERNARD   \n",
-       "910c4d36-b036-469e-aa2a-ea4ff8855a6c                                  Citizens For Kail   \n",
-       "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd     MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC   \n",
-       "c875d7de-94be-42f1-b994-dd89b114d51e                   Pa Fraternal Order Of Police Pac   \n",
-       "60d454d1-3773-4d88-80e9-132c161da0f0                                            Paa Pac   \n",
-       "f71341d7-d27e-47eb-9b66-903af39d6cb5                            Pabar Pac (Pa Bar Assn)   \n",
-       "50c7d9a1-b448-46a5-8e2d-cd15b3097360  REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...   \n",
-       "62ea1e9c-ac12-400c-b3dc-519389c0f7d3  UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...   \n",
-       "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7      Ugi Utilities Inc/Ugi Energy Services Llc Pac   \n",
-       "\n",
-       "                                     state   entity_type  \\\n",
-       "d31df1ca-714e-4a82-9e88-1892c0451a71    MI     committee   \n",
-       "910c4d36-b036-469e-aa2a-ea4ff8855a6c    PA  Organization   \n",
-       "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd    MI     committee   \n",
-       "c875d7de-94be-42f1-b994-dd89b114d51e    PA  Organization   \n",
-       "60d454d1-3773-4d88-80e9-132c161da0f0    PA  Organization   \n",
-       "f71341d7-d27e-47eb-9b66-903af39d6cb5    PA  Organization   \n",
-       "50c7d9a1-b448-46a5-8e2d-cd15b3097360    MI     committee   \n",
-       "62ea1e9c-ac12-400c-b3dc-519389c0f7d3    MI     committee   \n",
-       "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7    PA  Organization   \n",
+       "                                     id  \\\n",
+       "0  d31df1ca-714e-4a82-9e88-1892c0451a71   \n",
+       "1  910c4d36-b036-469e-aa2a-ea4ff8855a6c   \n",
+       "2  1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd   \n",
+       "3  c875d7de-94be-42f1-b994-dd89b114d51e   \n",
+       "4  60d454d1-3773-4d88-80e9-132c161da0f0   \n",
+       "5  f71341d7-d27e-47eb-9b66-903af39d6cb5   \n",
+       "6  50c7d9a1-b448-46a5-8e2d-cd15b3097360   \n",
+       "7  62ea1e9c-ac12-400c-b3dc-519389c0f7d3   \n",
+       "8  4db76e6e-f0d5-40eb-82de-6dbcdb562dd7   \n",
        "\n",
-       "                                                                             duplicated  \n",
-       "d31df1ca-714e-4a82-9e88-1892c0451a71                                                 []  \n",
-       "910c4d36-b036-469e-aa2a-ea4ff8855a6c                                                 []  \n",
-       "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd  [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc...  \n",
-       "c875d7de-94be-42f1-b994-dd89b114d51e                                                 []  \n",
-       "60d454d1-3773-4d88-80e9-132c161da0f0             [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd]  \n",
-       "f71341d7-d27e-47eb-9b66-903af39d6cb5                                                 []  \n",
-       "50c7d9a1-b448-46a5-8e2d-cd15b3097360                                                 []  \n",
-       "62ea1e9c-ac12-400c-b3dc-519389c0f7d3                                                 []  \n",
-       "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7                                                 []  "
+       "                                          duplicated  \n",
+       "0                                                 []  \n",
+       "1                                                 []  \n",
+       "2  [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc...  \n",
+       "3                                                 []  \n",
+       "4             [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd]  \n",
+       "5                                                 []  \n",
+       "6                                                 []  \n",
+       "7                                                 []  \n",
+       "8                                                 []  "
       ]
      },
-     "execution_count": 47,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "x = deduplicate_perfect_matches(sample_df)\n",
-    "x#[['duplicated']]"
+    "x[['id','duplicated']]"
    ]
   },
   {
diff --git a/utils/linkage.py b/utils/linkage.py
index 01d05de..0d2ebf7 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -309,6 +309,15 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
     new_df.index = new_df["duplicated"].str[0].tolist()
     new_df["duplicated"] = new_df["duplicated"].str[1:]
 
+    # now convert the duplicated column into a dictionary that can will be
+    # an output by only feeding the entries with duplicates
+    new_df = new_df.reset_index().rename(columns={"index": "id"})
+    convert_duplicates_to_dict(
+        new_df[new_df["duplicated"].apply(lambda x: len(x)) > 0][
+            ["id", "duplicated"]
+        ]
+    )
+    new_df = new_df.drop(["duplicated"], axis=1)
     return new_df
 
 

From 4ac551fa498be733717a7f50af2084cb28e6c321 Mon Sep 17 00:00:00 2001
From: Adil Kassim <adilk@uchicago.edu>
Date: Fri, 2 Feb 2024 02:34:42 +0000
Subject: [PATCH 32/42] passing pre-commits and doctests

---
 utils/linkage.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 1dbf54b..d223617 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -70,8 +70,6 @@ def cleaning_company_column(company_entry: str) -> str:
     'Unemployed'
     >>> cleaning_company_column("N/A")
     'Unemployed'
-    >>> cleaning_company_column("nan")
-    'Unemployed'
     """
 
     if not company_entry:

From 37dcbf76a638c0007ff0de1620b93f6ec2f24ec3 Mon Sep 17 00:00:00 2001
From: Avery Schoen <33437601+averyschoen@users.noreply.github.com>
Date: Fri, 2 Feb 2024 14:02:34 -0600
Subject: [PATCH 33/42] Update linkage.py

---
 utils/linkage.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index c884238..74cbc93 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -252,4 +252,5 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str:
             return address_line_1_components[i][0]
         elif address_line_1_components[i][1] == "USPSBoxID":
             return address_line_1_components[i][0]
-    raise ValueError("Can not find Address Number")
\ No newline at end of file
+    raise ValueError("Can not find Address Number")
+    

From 7f9135f7acc77ee429557bc48d19d3d9a5f69cf6 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@g003.ds.uchicago.edu>
Date: Sun, 4 Feb 2024 14:31:07 -0600
Subject: [PATCH 34/42] finished dedup function with helper function to output
 to a csv_file in the output directory

---
 utils/linkage.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index ad5589a..f2242da 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -5,7 +5,7 @@
 import textdistance as td
 import usaddress
 
-from utils.constants import COMPANY_TYPES
+from utils.constants import COMPANY_TYPES, repo_root
 
 
 def get_address_line_1_from_full_address(address: str) -> str:
@@ -280,10 +280,25 @@ def convert_duplicates_to_dict(df: pd.DataFrame) -> pd.DataFrame:
         matching up to the index in the same row
 
     Returns
-        None. However it outputs a dictionary
+        None. However it outputs a dictionary to the output directory, with 2
+        columns. The first, which indicates the deduplicated UUIDs, is labeled
+        'duplicated_uuids', and the 2nd, which shows the uuids to which the
+        deduplicated entries match two, is labeled 'mapped_uuids'.
     """
-    # df.to_csv(repo_root / "output" / "deduplicated_UUIDs.csv", index=False)
-    pass
+    deduped_dict = {}
+    for i in range(len(df)):
+        deduped_uudis = df.iloc[i]["duplicated"]
+        for j in range(len(deduped_uudis)):
+            deduped_dict.update({deduped_uudis[j]: df.iloc[i]["id"]})
+
+    # now convert dictionary into a csv file
+    deduped_df = pd.DataFrame.from_dict(deduped_dict, "index")
+    deduped_df = deduped_df.reset_index().rename(
+        columns={"index": "duplicated_uuids", 0: "mapped_uuids"}
+    )
+    deduped_df.to_csv(
+        repo_root / "output" / "deduplicated_UUIDs.csv", index=False
+    )
 
 
 def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:

From fb106545507614b4306c7652589eb3dbf93a7059 Mon Sep 17 00:00:00 2001
From: Adil Kassim <adilk@uchicago.edu>
Date: Mon, 5 Feb 2024 01:13:27 +0000
Subject: [PATCH 35/42] updated function

---
 utils/linkage.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 74cbc93..26fbd5b 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -1,6 +1,7 @@
+import re
+
 import textdistance as td
 import usaddress
-import re
 
 from utils.constants import COMPANY_TYPES
 
@@ -253,4 +254,3 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str:
         elif address_line_1_components[i][1] == "USPSBoxID":
             return address_line_1_components[i][0]
     raise ValueError("Can not find Address Number")
-    

From 29ee6bb63e198256d83a22019f98561f303a764b Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@fe01>
Date: Tue, 6 Feb 2024 09:57:53 -0600
Subject: [PATCH 36/42] made modifications to the deduplication function

---
 utils/linkage.py | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index f2242da..5db8745 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -271,19 +271,21 @@ def get_street_from_address_line_1(address_line_1: str) -> str:
 
 
 def convert_duplicates_to_dict(df: pd.DataFrame) -> pd.DataFrame:
-    """Takes a dataframe whose indexes are UUIDs, and a column that is a list of
-    all other UUIDs that have duplicate values. The function then outputs a
-    dictionary file where the deduped UUIDs map to the dataframe main UUID
+    """Saves to the "output" directory a file mapping multiple strings to one
+    string
+
+    Given a dataframe where each row contains one string in a column and a list
+    of strings in another column, the function maps each string in the list to
+    the single string.
 
     Args:
-        A pandas dataframe with UUIDs as indexes and deduplicated UUIDs
-        matching up to the index in the same row
+        A pandas dataframe
 
     Returns
-        None. However it outputs a dictionary to the output directory, with 2
-        columns. The first, which indicates the deduplicated UUIDs, is labeled
+        None. However it outputs a file to the output directory, with 2
+        columns. The first, which indicates the duplicated UUIDs, is labeled
         'duplicated_uuids', and the 2nd, which shows the uuids to which the
-        deduplicated entries match two, is labeled 'mapped_uuids'.
+        deduplicated entries match to, is labeled 'mapped_uuids'.
     """
     deduped_dict = {}
     for i in range(len(df)):
@@ -297,14 +299,17 @@ def convert_duplicates_to_dict(df: pd.DataFrame) -> pd.DataFrame:
         columns={"index": "duplicated_uuids", 0: "mapped_uuids"}
     )
     deduped_df.to_csv(
-        repo_root / "output" / "deduplicated_UUIDs.csv", index=False
+        repo_root / "output" / "deduplicated_UUIDs.csv", index=False, mode="a"
     )
 
 
 def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
-    """Given a dataframe, remove rows that have identical entry data beyond
-    UUIDs, and output a file mapping an entry to other the UUIDs of the
-    deduplicated rows
+    """Return a dataframe with duplicated entries removed.
+
+    Given a dataframe, combines rows that have identical data beyond their
+    UUIDs, keeps the first UUID amond the similarly grouped UUIDs, and saves the
+    rest of the UUIDS to a file in the "output" directory linking them to the
+    first selected UUID.
 
     Args:
         a pandas dataframe containing contribution data
@@ -316,7 +321,7 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
 
     # now find the duplicates along all columns but the ID
     new_df = (
-        new_df.groupby(df.columns[1:].tolist())["id"]
+        new_df.groupby(df.columns[1:].tolist(), dropna=False)["id"]
         .agg(list)
         .reset_index()
         .rename(columns={"id": "duplicated"})

From cfa15d079459a30032a61325fa2f1dcf8a74e3f8 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@fe01>
Date: Tue, 6 Feb 2024 10:07:25 -0600
Subject: [PATCH 37/42] received a git push error stating that the tip of my
 branch is behind its remote counterpart...committing my changes before
 rebasing

---
 notebooks/Test.ipynb | 667 ++++++++++++++++++++++++++++---------------
 1 file changed, 431 insertions(+), 236 deletions(-)

diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb
index bc73185..188591d 100644
--- a/notebooks/Test.ipynb
+++ b/notebooks/Test.ipynb
@@ -15,7 +15,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -66,27 +66,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "' Jane  Jr Doe'"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "determine_comma_role(\"DOe, Jane, Jr\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -169,29 +149,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 7,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "str"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "import pandas as pd\n",
-    "orgs = pd.read_csv(\"../output/complete_organizations_table.csv\")\n",
-    "type(orgs.id.tolist()[1000])"
+    "orgs_sample = pd.read_csv(\"../output/complete_organizations_table.csv\",index_col=0).sample(10000)\n",
+    "inds_sample = pd.read_csv(\"../output/complete_individuals_table.csv\",index_col=0, low_memory=False).sample(10000)\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -403,7 +372,7 @@
        "18                                            Paa Pac    PA  Organization  "
       ]
      },
-     "execution_count": 28,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -433,7 +402,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -448,12 +417,21 @@
     "        matching up to the index in the same row\n",
     "        \n",
     "    Returns\n",
-    "        None. However it outputs a dictionary\n",
+    "        None. However it outputs a dictionary to the output directory, with 2\n",
+    "        columns. The first, which indicates the deduplicated UUIDs, is labeled\n",
+    "        'duplicated_uuids', and the 2nd, which shows the uuids to which the\n",
+    "        deduplicated entries match two, is labeled 'mapped_uuids'.\n",
     "    '''\n",
-    "    #for index in df.index:\n",
-    "        \n",
-    "    #entities.to_csv(repo_root / \"output\" / \"deduplicated_UUIDs.csv\", index=False)\n",
-    "    pass\n",
+    "    deduped_dict = {}\n",
+    "    for i in range(len(df)):\n",
+    "        deduped_uudis = df.iloc[i]['duplicated']\n",
+    "        for j in range(len(deduped_uudis)):\n",
+    "            deduped_dict.update({deduped_uudis[j]:df.iloc[i]['id']})\n",
+    "    \n",
+    "    # now convert dictionary into a csv file\n",
+    "    deduped_df = pd.DataFrame.from_dict(deduped_dict,'index') \n",
+    "    deduped_df = deduped_df.reset_index().rename(columns={\"index\":\"duplicated_uuids\", 0:\"mapped_uuids\"})\n",
+    "    deduped_df.to_csv(repo_root / \"output\" / \"deduplicated_UUIDs.csv\", index=False)\n",
     "\n",
     "\n",
     "def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:\n",
@@ -470,7 +448,7 @@
     "    new_df = df.drop_duplicates()\n",
     "\n",
     "    # now find the duplicates along all columns but the ID\n",
-    "    new_df=new_df.groupby(df.columns[1:].tolist())[\"id\"].agg(list).reset_index().rename(columns={\"id\": \"duplicated\"})\n",
+    "    new_df=new_df.groupby(df.columns[1:].tolist(),dropna=False)[\"id\"].agg(list).reset_index().rename(columns={\"id\": \"duplicated\"})\n",
     "    new_df.index=new_df[\"duplicated\"].str[0].tolist()\n",
     "    new_df[\"duplicated\"]=new_df[\"duplicated\"].str[1:]\n",
     "\n",
@@ -484,7 +462,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -509,48 +487,138 @@
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>id</th>\n",
-       "      <th>duplicated</th>\n",
+       "      <th>name</th>\n",
+       "      <th>state</th>\n",
+       "      <th>entity_type</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>43a79b93-fed7-4f3c-a279-0441cdc7e722</td>\n",
+       "      <td>14TH DISTRICT DEMOCRATIC PARTY</td>\n",
+       "      <td>MI</td>\n",
+       "      <td>corporation</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>215f3104-2df0-4799-9a13-d0c5ec27d6f2</td>\n",
+       "      <td>14TH DISTRICT DEMOCRATS</td>\n",
+       "      <td>MI</td>\n",
+       "      <td>corporation</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd</td>\n",
-       "      <td>[1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc...</td>\n",
+       "      <td>022d2951-8fe9-42d6-a6ac-01e82d90fa65</td>\n",
+       "      <td>21ST CENTURY MEDIA - MICHIGAN</td>\n",
+       "      <td>MI</td>\n",
+       "      <td>corporation</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>e1150dce-219c-4eef-995d-ee2759a92923</td>\n",
+       "      <td>360 TOUCH</td>\n",
+       "      <td>MI</td>\n",
+       "      <td>corporation</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>60d454d1-3773-4d88-80e9-132c161da0f0</td>\n",
-       "      <td>[1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd]</td>\n",
+       "      <td>88c3b805-e0f1-42d5-8b77-536734731c4a</td>\n",
+       "      <td>50+1 STRATEGIES LLC</td>\n",
+       "      <td>MI</td>\n",
+       "      <td>corporation</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2135</th>\n",
+       "      <td>f5fbf8f5-bd03-43f6-bfdd-42113bdd02a8</td>\n",
+       "      <td>Zoom</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>Organization</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2136</th>\n",
+       "      <td>616c47f1-39cc-4b12-a93d-f7d3bdc88047</td>\n",
+       "      <td>Zoom Video Communications</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>Organization</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2137</th>\n",
+       "      <td>df101e29-4adf-4496-8d96-9732d9f7dbc8</td>\n",
+       "      <td>Zoom.Us</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>Organization</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2138</th>\n",
+       "      <td>d02d1f6d-4a13-428e-a040-d35bd5cfcf9f</td>\n",
+       "      <td>Zupancich, Andrea Senate Committee</td>\n",
+       "      <td>GA</td>\n",
+       "      <td>Committee</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2139</th>\n",
+       "      <td>df42f2ec-9ee0-49d0-9020-d1a441ef8b42</td>\n",
+       "      <td>womenwinning State PAC</td>\n",
+       "      <td>MN</td>\n",
+       "      <td>Committee</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
+       "<p>2140 rows × 4 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                     id  \\\n",
-       "2  1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd   \n",
-       "4  60d454d1-3773-4d88-80e9-132c161da0f0   \n",
+       "                                        id  \\\n",
+       "0     43a79b93-fed7-4f3c-a279-0441cdc7e722   \n",
+       "1     215f3104-2df0-4799-9a13-d0c5ec27d6f2   \n",
+       "2     022d2951-8fe9-42d6-a6ac-01e82d90fa65   \n",
+       "3     e1150dce-219c-4eef-995d-ee2759a92923   \n",
+       "4     88c3b805-e0f1-42d5-8b77-536734731c4a   \n",
+       "...                                    ...   \n",
+       "2135  f5fbf8f5-bd03-43f6-bfdd-42113bdd02a8   \n",
+       "2136  616c47f1-39cc-4b12-a93d-f7d3bdc88047   \n",
+       "2137  df101e29-4adf-4496-8d96-9732d9f7dbc8   \n",
+       "2138  d02d1f6d-4a13-428e-a040-d35bd5cfcf9f   \n",
+       "2139  df42f2ec-9ee0-49d0-9020-d1a441ef8b42   \n",
        "\n",
-       "                                          duplicated  \n",
-       "2  [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc...  \n",
-       "4             [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd]  "
+       "                                       name state   entity_type  \n",
+       "0      14TH DISTRICT DEMOCRATIC PARTY          MI   corporation  \n",
+       "1      14TH DISTRICT DEMOCRATS                 MI   corporation  \n",
+       "2      21ST CENTURY MEDIA - MICHIGAN           MI   corporation  \n",
+       "3      360 TOUCH                               MI   corporation  \n",
+       "4      50+1 STRATEGIES LLC                     MI   corporation  \n",
+       "...                                     ...   ...           ...  \n",
+       "2135                                   Zoom    PA  Organization  \n",
+       "2136              Zoom Video Communications    PA  Organization  \n",
+       "2137                                Zoom.Us    PA  Organization  \n",
+       "2138     Zupancich, Andrea Senate Committee    GA     Committee  \n",
+       "2139                 womenwinning State PAC    MN     Committee  \n",
+       "\n",
+       "[2140 rows x 4 columns]"
       ]
      },
-     "execution_count": 32,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "x = deduplicate_perfect_matches(sample_df)\n",
+    "x = deduplicate_perfect_matches(orgs_sample)\n",
     "#len(x.iloc[2]['duplicated'])\n",
-    "x[x['duplicated'].apply(lambda x: len(x)) > 0][['id','duplicated']]"
+    "x"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -575,123 +643,257 @@
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>id</th>\n",
+       "      <th>first_name</th>\n",
+       "      <th>last_name</th>\n",
+       "      <th>full_name</th>\n",
+       "      <th>entity_type</th>\n",
+       "      <th>state</th>\n",
+       "      <th>party</th>\n",
+       "      <th>company</th>\n",
        "      <th>duplicated</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>d31df1ca-714e-4a82-9e88-1892c0451a71</td>\n",
+       "      <td>6c833843-2f4f-416c-9092-f1d95d9b27dc</td>\n",
+       "      <td>'JESSE' PHILIP</td>\n",
+       "      <td>SHERMAN</td>\n",
+       "      <td>'JESSE' PHILIP       SHERMAN                  ...</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>CA</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>910c4d36-b036-469e-aa2a-ea4ff8855a6c</td>\n",
+       "      <td>cdbe7cd4-f57b-4b89-b85d-d0b812e76aa4</td>\n",
+       "      <td>AARON</td>\n",
+       "      <td>AEBIG</td>\n",
+       "      <td>AARON                AEBIG                    ...</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>MI</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd</td>\n",
-       "      <td>[1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc...</td>\n",
+       "      <td>a7304cd4-76ae-4223-86c3-f50da82a62aa</td>\n",
+       "      <td>AARON</td>\n",
+       "      <td>BATES</td>\n",
+       "      <td>AARON                BATES                    ...</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>MI</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>c875d7de-94be-42f1-b994-dd89b114d51e</td>\n",
+       "      <td>cce5ccc0-cd28-4a6a-afdf-8a08ce31b94d</td>\n",
+       "      <td>AARON</td>\n",
+       "      <td>BIRD</td>\n",
+       "      <td>AARON                BIRD                     ...</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>WA</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>L0021</td>\n",
        "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>60d454d1-3773-4d88-80e9-132c161da0f0</td>\n",
-       "      <td>[1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd]</td>\n",
+       "      <td>1302bf1f-393b-43ed-a15d-8cf6e121223c</td>\n",
+       "      <td>AARON</td>\n",
+       "      <td>COHEN</td>\n",
+       "      <td>AARON                COHEN                    ...</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>IL</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>f71341d7-d27e-47eb-9b66-903af39d6cb5</td>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7182</th>\n",
+       "      <td>160a5c9e-d04a-40c9-a0fd-c28e21dd70dc</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Wilkinson, James</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>MN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>50c7d9a1-b448-46a5-8e2d-cd15b3097360</td>\n",
+       "      <th>7183</th>\n",
+       "      <td>7a19cbb7-d681-46a5-8f9f-1e7be7071f06</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Wolf, Linda</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>MN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>7</th>\n",
-       "      <td>62ea1e9c-ac12-400c-b3dc-519389c0f7d3</td>\n",
+       "      <th>7184</th>\n",
+       "      <td>ce5156f8-23d4-40e0-8711-f19bff942543</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Wollenburg, George</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>MN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>8</th>\n",
-       "      <td>4db76e6e-f0d5-40eb-82de-6dbcdb562dd7</td>\n",
+       "      <th>7185</th>\n",
+       "      <td>1948661</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>richard  3033 shoreham</td>\n",
+       "      <td>individual</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>[]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7186</th>\n",
+       "      <td>69744565-e7e4-47e1-8555-ede565fca705</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>wark, david</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>MN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>[]</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
+       "<p>7187 rows × 9 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                     id  \\\n",
-       "0  d31df1ca-714e-4a82-9e88-1892c0451a71   \n",
-       "1  910c4d36-b036-469e-aa2a-ea4ff8855a6c   \n",
-       "2  1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd   \n",
-       "3  c875d7de-94be-42f1-b994-dd89b114d51e   \n",
-       "4  60d454d1-3773-4d88-80e9-132c161da0f0   \n",
-       "5  f71341d7-d27e-47eb-9b66-903af39d6cb5   \n",
-       "6  50c7d9a1-b448-46a5-8e2d-cd15b3097360   \n",
-       "7  62ea1e9c-ac12-400c-b3dc-519389c0f7d3   \n",
-       "8  4db76e6e-f0d5-40eb-82de-6dbcdb562dd7   \n",
+       "                                        id            first_name  \\\n",
+       "0     6c833843-2f4f-416c-9092-f1d95d9b27dc  'JESSE' PHILIP         \n",
+       "1     cdbe7cd4-f57b-4b89-b85d-d0b812e76aa4  AARON                  \n",
+       "2     a7304cd4-76ae-4223-86c3-f50da82a62aa  AARON                  \n",
+       "3     cce5ccc0-cd28-4a6a-afdf-8a08ce31b94d  AARON                  \n",
+       "4     1302bf1f-393b-43ed-a15d-8cf6e121223c  AARON                  \n",
+       "...                                    ...                   ...   \n",
+       "7182  160a5c9e-d04a-40c9-a0fd-c28e21dd70dc                   NaN   \n",
+       "7183  7a19cbb7-d681-46a5-8f9f-1e7be7071f06                   NaN   \n",
+       "7184  ce5156f8-23d4-40e0-8711-f19bff942543                   NaN   \n",
+       "7185                               1948661                   NaN   \n",
+       "7186  69744565-e7e4-47e1-8555-ede565fca705                   NaN   \n",
+       "\n",
+       "                                 last_name  \\\n",
+       "0     SHERMAN                                \n",
+       "1     AEBIG                                  \n",
+       "2     BATES                                  \n",
+       "3     BIRD                                   \n",
+       "4     COHEN                                  \n",
+       "...                                    ...   \n",
+       "7182                                   NaN   \n",
+       "7183                                   NaN   \n",
+       "7184                                   NaN   \n",
+       "7185                                   NaN   \n",
+       "7186                                   NaN   \n",
        "\n",
-       "                                          duplicated  \n",
-       "0                                                 []  \n",
-       "1                                                 []  \n",
-       "2  [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc...  \n",
-       "3                                                 []  \n",
-       "4             [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd]  \n",
-       "5                                                 []  \n",
-       "6                                                 []  \n",
-       "7                                                 []  \n",
-       "8                                                 []  "
+       "                                              full_name entity_type state  \\\n",
+       "0     'JESSE' PHILIP       SHERMAN                  ...  Individual    CA   \n",
+       "1     AARON                AEBIG                    ...  Individual    MI   \n",
+       "2     AARON                BATES                    ...  Individual    MI   \n",
+       "3     AARON                BIRD                     ...  Individual    WA   \n",
+       "4     AARON                COHEN                    ...  Individual    IL   \n",
+       "...                                                 ...         ...   ...   \n",
+       "7182                                   Wilkinson, James  Individual    MN   \n",
+       "7183                                        Wolf, Linda  Individual    MN   \n",
+       "7184                                 Wollenburg, George  Individual    MN   \n",
+       "7185                             richard  3033 shoreham  individual   NaN   \n",
+       "7186                                        wark, david  Individual    MN   \n",
+       "\n",
+       "     party company duplicated  \n",
+       "0      NaN     NaN         []  \n",
+       "1      NaN     NaN         []  \n",
+       "2      NaN     NaN         []  \n",
+       "3      NaN   L0021         []  \n",
+       "4      NaN     NaN         []  \n",
+       "...    ...     ...        ...  \n",
+       "7182   NaN     NaN         []  \n",
+       "7183   NaN     NaN         []  \n",
+       "7184   NaN     NaN         []  \n",
+       "7185   NaN     NaN         []  \n",
+       "7186   NaN     NaN         []  \n",
+       "\n",
+       "[7187 rows x 9 columns]"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "x[['id','duplicated']]"
+    "y = inds_sample.drop_duplicates()\n",
+    "\n",
+    "# now find the duplicates along all columns but the ID\n",
+    "y=y.groupby(inds_sample.columns[1:].tolist(),dropna=False)[\"id\"].agg(list).reset_index().rename(columns={\"id\": \"duplicated\"})\n",
+    "y.index=y[\"duplicated\"].str[0].tolist()\n",
+    "y[\"duplicated\"]=y[\"duplicated\"].str[1:]\n",
+    "\n",
+    "# now convert the duplicated column into a dictionary that will be\n",
+    "# an output by only feeding the entries with duplicates\n",
+    "y = y.reset_index().rename(columns = {'index':'id'})\n",
+    "convert_duplicates_to_dict(y[y['duplicated'].apply(lambda x: len(x))>0][['id','duplicated']])\n",
+    "new_df = y.drop(['duplicated'], axis=1)\n",
+    "#return new_df\n",
+    "y"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[]\n",
-      "[]\n",
-      "['1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe', '1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff']\n",
-      "[]\n",
-      "['1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd']\n",
-      "[]\n",
-      "[]\n",
-      "[]\n",
-      "[]\n"
-     ]
+     "data": {
+      "text/plain": [
+       "Index(['first_name', 'last_name', 'full_name', 'entity_type', 'state', 'party',\n",
+       "       'company'],\n",
+       "      dtype='object')"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "y = x[['duplicated']]\n",
-    "for i in range(len(y)):\n",
-    "    #print(y.iloc[i]['duplicated'])\n",
-    "    print(y.iloc[i]['duplicated'])"
+    "inds_sample.columns[1:]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -715,74 +917,92 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>name</th>\n",
-       "      <th>state</th>\n",
-       "      <th>entity_type</th>\n",
-       "      <th>id</th>\n",
+       "      <th>Max Speed</th>\n",
+       "      <th>Animal</th>\n",
+       "      <th>Color</th>\n",
+       "      <th>Age</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>COMMITTEE TO ELECT DR PATRICIA BERNARD</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "      <td>2</td>\n",
+       "      <td>380.0</td>\n",
+       "      <td>None</td>\n",
+       "      <td>green</td>\n",
+       "      <td>2.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "      <td>4</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>Paa Pac</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
-       "      <td>2</td>\n",
+       "      <th>1</th>\n",
+       "      <td>370.0</td>\n",
+       "      <td>Falcon</td>\n",
+       "      <td>None</td>\n",
+       "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "      <td>3</td>\n",
+       "      <th>2</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>None</td>\n",
+       "      <td>yellow</td>\n",
+       "      <td>5.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>7</th>\n",
-       "      <td>UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "      <td>4</td>\n",
+       "      <th>3</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Parrot</td>\n",
+       "      <td>blue</td>\n",
+       "      <td>6.0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                                name state   entity_type  id\n",
-       "0             COMMITTEE TO ELECT DR PATRICIA BERNARD    MI     committee   2\n",
-       "2     MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC    MI     committee   4\n",
-       "4                                            Paa Pac    PA  Organization   2\n",
-       "6  REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...    MI     committee   3\n",
-       "7  UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...    MI     committee   4"
+       "   Max Speed  Animal   Color  Age\n",
+       "0      380.0    None   green  2.0\n",
+       "1      370.0  Falcon    None  NaN\n",
+       "2        NaN    None  yellow  5.0\n",
+       "3        NaN  Parrot    blue  6.0"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "x = sample_df.groupby(sample_df.columns[1:].tolist()).count().reset_index()\n",
-    "x.loc[x.id >1]"
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "df = pd.DataFrame({'Max Speed': [380., 370., np.nan, np.nan],\n",
+    "                   'Animal': ['None', 'Falcon', 'None', 'Parrot'],\n",
+    "                   'Color':['green',None,'yellow','blue'],\n",
+    "                   'Age':[2,np.nan,5,6]})\n",
+    "df"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fa2f9d5bb50>"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df= df.groupby(df.columns[1:].tolist(), dropna=False)[\"Max Speed\"]#.agg(list)#.reset_index()\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -806,114 +1026,89 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>name</th>\n",
-       "      <th>state</th>\n",
-       "      <th>entity_type</th>\n",
+       "      <th>Age</th>\n",
+       "      <th>Animal</th>\n",
+       "      <th>Color</th>\n",
+       "      <th>Max Speed</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>d31df1ca-714e-4a82-9e88-1892c0451a71</th>\n",
-       "      <td>COMMITTEE TO ELECT DR PATRICIA BERNARD</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>910c4d36-b036-469e-aa2a-ea4ff8855a6c</th>\n",
-       "      <td>Citizens For Kail</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd</th>\n",
-       "      <td>MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>c875d7de-94be-42f1-b994-dd89b114d51e</th>\n",
-       "      <td>Pa Fraternal Order Of Police Pac</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>60d454d1-3773-4d88-80e9-132c161da0f0</th>\n",
-       "      <td>Paa Pac</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>f71341d7-d27e-47eb-9b66-903af39d6cb5</th>\n",
-       "      <td>Pabar Pac (Pa Bar Assn)</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
+       "      <th>0</th>\n",
+       "      <td>2.0</td>\n",
+       "      <td>None</td>\n",
+       "      <td>green</td>\n",
+       "      <td>[380.0]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>50c7d9a1-b448-46a5-8e2d-cd15b3097360</th>\n",
-       "      <td>REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
+       "      <th>1</th>\n",
+       "      <td>5.0</td>\n",
+       "      <td>None</td>\n",
+       "      <td>yellow</td>\n",
+       "      <td>[nan]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>62ea1e9c-ac12-400c-b3dc-519389c0f7d3</th>\n",
-       "      <td>UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
+       "      <th>2</th>\n",
+       "      <td>6.0</td>\n",
+       "      <td>Parrot</td>\n",
+       "      <td>blue</td>\n",
+       "      <td>[nan]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>4db76e6e-f0d5-40eb-82de-6dbcdb562dd7</th>\n",
-       "      <td>Ugi Utilities Inc/Ugi Energy Services Llc Pac</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
+       "      <th>3</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Falcon</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>[370.0]</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                                                                   name  \\\n",
-       "d31df1ca-714e-4a82-9e88-1892c0451a71             COMMITTEE TO ELECT DR PATRICIA BERNARD   \n",
-       "910c4d36-b036-469e-aa2a-ea4ff8855a6c                                  Citizens For Kail   \n",
-       "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd     MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC   \n",
-       "c875d7de-94be-42f1-b994-dd89b114d51e                   Pa Fraternal Order Of Police Pac   \n",
-       "60d454d1-3773-4d88-80e9-132c161da0f0                                            Paa Pac   \n",
-       "f71341d7-d27e-47eb-9b66-903af39d6cb5                            Pabar Pac (Pa Bar Assn)   \n",
-       "50c7d9a1-b448-46a5-8e2d-cd15b3097360  REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...   \n",
-       "62ea1e9c-ac12-400c-b3dc-519389c0f7d3  UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...   \n",
-       "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7      Ugi Utilities Inc/Ugi Energy Services Llc Pac   \n",
-       "\n",
-       "                                     state   entity_type  \n",
-       "d31df1ca-714e-4a82-9e88-1892c0451a71    MI     committee  \n",
-       "910c4d36-b036-469e-aa2a-ea4ff8855a6c    PA  Organization  \n",
-       "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd    MI     committee  \n",
-       "c875d7de-94be-42f1-b994-dd89b114d51e    PA  Organization  \n",
-       "60d454d1-3773-4d88-80e9-132c161da0f0    PA  Organization  \n",
-       "f71341d7-d27e-47eb-9b66-903af39d6cb5    PA  Organization  \n",
-       "50c7d9a1-b448-46a5-8e2d-cd15b3097360    MI     committee  \n",
-       "62ea1e9c-ac12-400c-b3dc-519389c0f7d3    MI     committee  \n",
-       "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7    PA  Organization  "
+       "   Age  Animal   Color Max Speed\n",
+       "0  2.0    None   green   [380.0]\n",
+       "1  5.0    None  yellow     [nan]\n",
+       "2  6.0  Parrot    blue     [nan]\n",
+       "3  NaN  Falcon     NaN   [370.0]"
       ]
      },
-     "execution_count": 41,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "#for index in x.index:\n",
-    "#    print(index)\n",
-    "x"
+    "df = df.groupby((df.columns.difference(['Max Speed'])).tolist(),dropna=False)['Max Speed'].agg(list).reset_index()\n",
+    "df"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 24,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f59594f81d0>"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "from utils.constants import repo_root\n",
-    "entities.to_csv(repo_root / \"output\" / \"deduplicated_UUIDs.csv\", index=False)\n"
+    "df"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

From 3d26fdef9d0d56459c36f61cb7b4d9fa309f7925 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@g002.ds.uchicago.edu>
Date: Wed, 7 Feb 2024 09:47:25 -0600
Subject: [PATCH 38/42] trying to see what the git branch issues are...no need
 to review this commit

---
 notebooks/Test.ipynb | 333 ++++++++++++++++++++-----------------------
 1 file changed, 154 insertions(+), 179 deletions(-)

diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb
index 188591d..26d98b5 100644
--- a/notebooks/Test.ipynb
+++ b/notebooks/Test.ipynb
@@ -15,7 +15,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -66,7 +66,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -149,7 +149,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -160,7 +160,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -372,7 +372,7 @@
        "18                                            Paa Pac    PA  Organization  "
       ]
      },
-     "execution_count": 5,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -402,7 +402,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -431,7 +431,7 @@
     "    # now convert dictionary into a csv file\n",
     "    deduped_df = pd.DataFrame.from_dict(deduped_dict,'index') \n",
     "    deduped_df = deduped_df.reset_index().rename(columns={\"index\":\"duplicated_uuids\", 0:\"mapped_uuids\"})\n",
-    "    deduped_df.to_csv(repo_root / \"output\" / \"deduplicated_UUIDs.csv\", index=False)\n",
+    "    deduped_df.to_csv(repo_root / \"output\" / \"deduplicated_UUIDs.csv\", index=False, mode='a')\n",
     "\n",
     "\n",
     "def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:\n",
@@ -462,7 +462,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -495,36 +495,36 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>43a79b93-fed7-4f3c-a279-0441cdc7e722</td>\n",
-       "      <td>14TH DISTRICT DEMOCRATIC PARTY</td>\n",
+       "      <td>3246120d-45fc-4d19-adee-d2aa2c5be6db</td>\n",
+       "      <td>1 BOLD STEP</td>\n",
        "      <td>MI</td>\n",
        "      <td>corporation</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>215f3104-2df0-4799-9a13-d0c5ec27d6f2</td>\n",
-       "      <td>14TH DISTRICT DEMOCRATS</td>\n",
+       "      <td>8fc7e5d5-558d-42ea-bd9a-8e48a4a9a4bd</td>\n",
+       "      <td>12CDRC</td>\n",
        "      <td>MI</td>\n",
        "      <td>corporation</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>022d2951-8fe9-42d6-a6ac-01e82d90fa65</td>\n",
-       "      <td>21ST CENTURY MEDIA - MICHIGAN</td>\n",
+       "      <td>a5379930-7324-4f1d-b216-84d9e9ddea40</td>\n",
+       "      <td>303 MANAGEMENT INC.</td>\n",
        "      <td>MI</td>\n",
        "      <td>corporation</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>e1150dce-219c-4eef-995d-ee2759a92923</td>\n",
-       "      <td>360 TOUCH</td>\n",
+       "      <td>9064112f-ef40-4690-9d0a-782a2375feb0</td>\n",
+       "      <td>314 ACTION FUND</td>\n",
        "      <td>MI</td>\n",
        "      <td>corporation</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>88c3b805-e0f1-42d5-8b77-536734731c4a</td>\n",
-       "      <td>50+1 STRATEGIES LLC</td>\n",
+       "      <td>9e11e7ae-ee29-4a50-9720-41c6ac556a1f</td>\n",
+       "      <td>A T AND T MICHIGAN PAC</td>\n",
        "      <td>MI</td>\n",
        "      <td>corporation</td>\n",
        "    </tr>\n",
@@ -536,76 +536,76 @@
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2135</th>\n",
-       "      <td>f5fbf8f5-bd03-43f6-bfdd-42113bdd02a8</td>\n",
-       "      <td>Zoom</td>\n",
+       "      <th>2149</th>\n",
+       "      <td>d79f9729-c9af-4347-868a-ae6e6814a295</td>\n",
+       "      <td>Zach Kirk</td>\n",
        "      <td>PA</td>\n",
        "      <td>Organization</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2136</th>\n",
-       "      <td>616c47f1-39cc-4b12-a93d-f7d3bdc88047</td>\n",
-       "      <td>Zoom Video Communications</td>\n",
+       "      <th>2150</th>\n",
+       "      <td>fbfea472-e183-4479-b869-90eddfa5198c</td>\n",
+       "      <td>Zest Kitchen</td>\n",
        "      <td>PA</td>\n",
        "      <td>Organization</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2137</th>\n",
-       "      <td>df101e29-4adf-4496-8d96-9732d9f7dbc8</td>\n",
-       "      <td>Zoom.Us</td>\n",
+       "      <th>2151</th>\n",
+       "      <td>c105a4af-9fd4-4a5b-a7b8-1e8738ff39c6</td>\n",
+       "      <td>Zoom Us</td>\n",
        "      <td>PA</td>\n",
        "      <td>Organization</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2138</th>\n",
-       "      <td>d02d1f6d-4a13-428e-a040-d35bd5cfcf9f</td>\n",
-       "      <td>Zupancich, Andrea Senate Committee</td>\n",
-       "      <td>GA</td>\n",
-       "      <td>Committee</td>\n",
+       "      <th>2152</th>\n",
+       "      <td>59cc8db9-607e-4e1b-ba41-0850b6019360</td>\n",
+       "      <td>Zoom Video Communications Inc.</td>\n",
+       "      <td>PA</td>\n",
+       "      <td>Organization</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2139</th>\n",
-       "      <td>df42f2ec-9ee0-49d0-9020-d1a441ef8b42</td>\n",
-       "      <td>womenwinning State PAC</td>\n",
-       "      <td>MN</td>\n",
-       "      <td>Committee</td>\n",
+       "      <th>2153</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>MI</td>\n",
+       "      <td>corporation</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>2140 rows × 4 columns</p>\n",
+       "<p>2154 rows × 4 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
        "                                        id  \\\n",
-       "0     43a79b93-fed7-4f3c-a279-0441cdc7e722   \n",
-       "1     215f3104-2df0-4799-9a13-d0c5ec27d6f2   \n",
-       "2     022d2951-8fe9-42d6-a6ac-01e82d90fa65   \n",
-       "3     e1150dce-219c-4eef-995d-ee2759a92923   \n",
-       "4     88c3b805-e0f1-42d5-8b77-536734731c4a   \n",
+       "0     3246120d-45fc-4d19-adee-d2aa2c5be6db   \n",
+       "1     8fc7e5d5-558d-42ea-bd9a-8e48a4a9a4bd   \n",
+       "2     a5379930-7324-4f1d-b216-84d9e9ddea40   \n",
+       "3     9064112f-ef40-4690-9d0a-782a2375feb0   \n",
+       "4     9e11e7ae-ee29-4a50-9720-41c6ac556a1f   \n",
        "...                                    ...   \n",
-       "2135  f5fbf8f5-bd03-43f6-bfdd-42113bdd02a8   \n",
-       "2136  616c47f1-39cc-4b12-a93d-f7d3bdc88047   \n",
-       "2137  df101e29-4adf-4496-8d96-9732d9f7dbc8   \n",
-       "2138  d02d1f6d-4a13-428e-a040-d35bd5cfcf9f   \n",
-       "2139  df42f2ec-9ee0-49d0-9020-d1a441ef8b42   \n",
+       "2149  d79f9729-c9af-4347-868a-ae6e6814a295   \n",
+       "2150  fbfea472-e183-4479-b869-90eddfa5198c   \n",
+       "2151  c105a4af-9fd4-4a5b-a7b8-1e8738ff39c6   \n",
+       "2152  59cc8db9-607e-4e1b-ba41-0850b6019360   \n",
+       "2153                                   NaN   \n",
        "\n",
        "                                       name state   entity_type  \n",
-       "0      14TH DISTRICT DEMOCRATIC PARTY          MI   corporation  \n",
-       "1      14TH DISTRICT DEMOCRATS                 MI   corporation  \n",
-       "2      21ST CENTURY MEDIA - MICHIGAN           MI   corporation  \n",
-       "3      360 TOUCH                               MI   corporation  \n",
-       "4      50+1 STRATEGIES LLC                     MI   corporation  \n",
+       "0      1 BOLD STEP                             MI   corporation  \n",
+       "1      12CDRC                                  MI   corporation  \n",
+       "2      303 MANAGEMENT INC.                     MI   corporation  \n",
+       "3      314 ACTION FUND                         MI   corporation  \n",
+       "4      A T AND T MICHIGAN PAC                  MI   corporation  \n",
        "...                                     ...   ...           ...  \n",
-       "2135                                   Zoom    PA  Organization  \n",
-       "2136              Zoom Video Communications    PA  Organization  \n",
-       "2137                                Zoom.Us    PA  Organization  \n",
-       "2138     Zupancich, Andrea Senate Committee    GA     Committee  \n",
-       "2139                 womenwinning State PAC    MN     Committee  \n",
+       "2149                              Zach Kirk    PA  Organization  \n",
+       "2150                           Zest Kitchen    PA  Organization  \n",
+       "2151                                Zoom Us    PA  Organization  \n",
+       "2152         Zoom Video Communications Inc.    PA  Organization  \n",
+       "2153                                    NaN    MI   corporation  \n",
        "\n",
-       "[2140 rows x 4 columns]"
+       "[2154 rows x 4 columns]"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -618,7 +618,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -650,69 +650,63 @@
        "      <th>state</th>\n",
        "      <th>party</th>\n",
        "      <th>company</th>\n",
-       "      <th>duplicated</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>6c833843-2f4f-416c-9092-f1d95d9b27dc</td>\n",
-       "      <td>'JESSE' PHILIP</td>\n",
-       "      <td>SHERMAN</td>\n",
-       "      <td>'JESSE' PHILIP       SHERMAN                  ...</td>\n",
+       "      <td>f6df631a-e626-4861-b62b-e09512887bd3</td>\n",
+       "      <td>A SCOTT</td>\n",
+       "      <td>PARIS</td>\n",
+       "      <td>A SCOTT              PARIS                    ...</td>\n",
        "      <td>Individual</td>\n",
-       "      <td>CA</td>\n",
-       "      <td>NaN</td>\n",
+       "      <td>MI</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>[]</td>\n",
+       "      <td>NOT EMPLOYED</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>cdbe7cd4-f57b-4b89-b85d-d0b812e76aa4</td>\n",
-       "      <td>AARON</td>\n",
-       "      <td>AEBIG</td>\n",
-       "      <td>AARON                AEBIG                    ...</td>\n",
+       "      <td>075fb1c6-6c70-4ec6-a439-fcebb76c4e0a</td>\n",
+       "      <td>A. MARK</td>\n",
+       "      <td>GLICKSTEIN</td>\n",
+       "      <td>A. MARK              GLICKSTEIN               ...</td>\n",
        "      <td>Individual</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>NaN</td>\n",
+       "      <td>CA</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>[]</td>\n",
+       "      <td>PARTNERSHIP HEALTH PLAN OF CA</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>a7304cd4-76ae-4223-86c3-f50da82a62aa</td>\n",
-       "      <td>AARON</td>\n",
-       "      <td>BATES</td>\n",
-       "      <td>AARON                BATES                    ...</td>\n",
+       "      <td>4a3968f5-7f5e-4ed1-8f39-bfc70bc67af8</td>\n",
+       "      <td>A. MICHAEL</td>\n",
+       "      <td>PALIZZI</td>\n",
+       "      <td>A. MICHAEL           PALIZZI                  ...</td>\n",
        "      <td>Individual</td>\n",
        "      <td>MI</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>[]</td>\n",
+       "      <td>MILLER CANFIELD</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>cce5ccc0-cd28-4a6a-afdf-8a08ce31b94d</td>\n",
+       "      <td>bb952efc-3dba-4449-9405-ea65202fbbea</td>\n",
        "      <td>AARON</td>\n",
-       "      <td>BIRD</td>\n",
-       "      <td>AARON                BIRD                     ...</td>\n",
+       "      <td>ALDRICH</td>\n",
+       "      <td>AARON                ALDRICH                  ...</td>\n",
        "      <td>Individual</td>\n",
-       "      <td>WA</td>\n",
+       "      <td>MI</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>L0021</td>\n",
-       "      <td>[]</td>\n",
+       "      <td>MILLER PIPELINE CORP.</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>1302bf1f-393b-43ed-a15d-8cf6e121223c</td>\n",
+       "      <td>79ec4a73-f688-479a-a4e3-0b0a3813188a</td>\n",
        "      <td>AARON</td>\n",
-       "      <td>COHEN</td>\n",
-       "      <td>AARON                COHEN                    ...</td>\n",
+       "      <td>BLAND</td>\n",
+       "      <td>AARON                BLAND                    ...</td>\n",
        "      <td>Individual</td>\n",
-       "      <td>IL</td>\n",
+       "      <td>MI</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>...</th>\n",
@@ -724,59 +718,54 @@
        "      <td>...</td>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
-       "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>7182</th>\n",
-       "      <td>160a5c9e-d04a-40c9-a0fd-c28e21dd70dc</td>\n",
+       "      <th>7122</th>\n",
+       "      <td>a1a6ff3b-cfa4-4b84-bf8c-20984f9871f0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>Wilkinson, James</td>\n",
+       "      <td>Trone, Robert</td>\n",
        "      <td>Individual</td>\n",
        "      <td>MN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>7183</th>\n",
-       "      <td>7a19cbb7-d681-46a5-8f9f-1e7be7071f06</td>\n",
+       "      <th>7123</th>\n",
+       "      <td>37ab55f5-3613-469c-8b66-ac8888f5bcae</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>Wolf, Linda</td>\n",
+       "      <td>Wark, Mary Ann</td>\n",
        "      <td>Individual</td>\n",
        "      <td>MN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>7184</th>\n",
-       "      <td>ce5156f8-23d4-40e0-8711-f19bff942543</td>\n",
+       "      <th>7124</th>\n",
+       "      <td>92d5ac7c-4702-420c-97a7-656111677f5a</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>Wollenburg, George</td>\n",
+       "      <td>Wenstrom, Gene</td>\n",
        "      <td>Individual</td>\n",
        "      <td>MN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>7185</th>\n",
-       "      <td>1948661</td>\n",
+       "      <th>7125</th>\n",
+       "      <td>fa934bf1-f611-4cd3-9bff-451bdf2e5bd2</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>richard  3033 shoreham</td>\n",
-       "      <td>individual</td>\n",
-       "      <td>NaN</td>\n",
+       "      <td>Wika, Kevin</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>MN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>7186</th>\n",
-       "      <td>69744565-e7e4-47e1-8555-ede565fca705</td>\n",
+       "      <th>7126</th>\n",
+       "      <td>fb8bb833-7010-418a-9f24-1a29771e0b67</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>wark, david</td>\n",
@@ -784,111 +773,97 @@
        "      <td>MN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>[]</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>7187 rows × 9 columns</p>\n",
+       "<p>7127 rows × 8 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
        "                                        id            first_name  \\\n",
-       "0     6c833843-2f4f-416c-9092-f1d95d9b27dc  'JESSE' PHILIP         \n",
-       "1     cdbe7cd4-f57b-4b89-b85d-d0b812e76aa4  AARON                  \n",
-       "2     a7304cd4-76ae-4223-86c3-f50da82a62aa  AARON                  \n",
-       "3     cce5ccc0-cd28-4a6a-afdf-8a08ce31b94d  AARON                  \n",
-       "4     1302bf1f-393b-43ed-a15d-8cf6e121223c  AARON                  \n",
+       "0     f6df631a-e626-4861-b62b-e09512887bd3  A SCOTT                \n",
+       "1     075fb1c6-6c70-4ec6-a439-fcebb76c4e0a  A. MARK                \n",
+       "2     4a3968f5-7f5e-4ed1-8f39-bfc70bc67af8  A. MICHAEL             \n",
+       "3     bb952efc-3dba-4449-9405-ea65202fbbea  AARON                  \n",
+       "4     79ec4a73-f688-479a-a4e3-0b0a3813188a  AARON                  \n",
        "...                                    ...                   ...   \n",
-       "7182  160a5c9e-d04a-40c9-a0fd-c28e21dd70dc                   NaN   \n",
-       "7183  7a19cbb7-d681-46a5-8f9f-1e7be7071f06                   NaN   \n",
-       "7184  ce5156f8-23d4-40e0-8711-f19bff942543                   NaN   \n",
-       "7185                               1948661                   NaN   \n",
-       "7186  69744565-e7e4-47e1-8555-ede565fca705                   NaN   \n",
+       "7122  a1a6ff3b-cfa4-4b84-bf8c-20984f9871f0                   NaN   \n",
+       "7123  37ab55f5-3613-469c-8b66-ac8888f5bcae                   NaN   \n",
+       "7124  92d5ac7c-4702-420c-97a7-656111677f5a                   NaN   \n",
+       "7125  fa934bf1-f611-4cd3-9bff-451bdf2e5bd2                   NaN   \n",
+       "7126  fb8bb833-7010-418a-9f24-1a29771e0b67                   NaN   \n",
        "\n",
        "                                 last_name  \\\n",
-       "0     SHERMAN                                \n",
-       "1     AEBIG                                  \n",
-       "2     BATES                                  \n",
-       "3     BIRD                                   \n",
-       "4     COHEN                                  \n",
+       "0     PARIS                                  \n",
+       "1     GLICKSTEIN                             \n",
+       "2     PALIZZI                                \n",
+       "3     ALDRICH                                \n",
+       "4     BLAND                                  \n",
        "...                                    ...   \n",
-       "7182                                   NaN   \n",
-       "7183                                   NaN   \n",
-       "7184                                   NaN   \n",
-       "7185                                   NaN   \n",
-       "7186                                   NaN   \n",
+       "7122                                   NaN   \n",
+       "7123                                   NaN   \n",
+       "7124                                   NaN   \n",
+       "7125                                   NaN   \n",
+       "7126                                   NaN   \n",
        "\n",
        "                                              full_name entity_type state  \\\n",
-       "0     'JESSE' PHILIP       SHERMAN                  ...  Individual    CA   \n",
-       "1     AARON                AEBIG                    ...  Individual    MI   \n",
-       "2     AARON                BATES                    ...  Individual    MI   \n",
-       "3     AARON                BIRD                     ...  Individual    WA   \n",
-       "4     AARON                COHEN                    ...  Individual    IL   \n",
+       "0     A SCOTT              PARIS                    ...  Individual    MI   \n",
+       "1     A. MARK              GLICKSTEIN               ...  Individual    CA   \n",
+       "2     A. MICHAEL           PALIZZI                  ...  Individual    MI   \n",
+       "3     AARON                ALDRICH                  ...  Individual    MI   \n",
+       "4     AARON                BLAND                    ...  Individual    MI   \n",
        "...                                                 ...         ...   ...   \n",
-       "7182                                   Wilkinson, James  Individual    MN   \n",
-       "7183                                        Wolf, Linda  Individual    MN   \n",
-       "7184                                 Wollenburg, George  Individual    MN   \n",
-       "7185                             richard  3033 shoreham  individual   NaN   \n",
-       "7186                                        wark, david  Individual    MN   \n",
+       "7122                                      Trone, Robert  Individual    MN   \n",
+       "7123                                     Wark, Mary Ann  Individual    MN   \n",
+       "7124                                     Wenstrom, Gene  Individual    MN   \n",
+       "7125                                        Wika, Kevin  Individual    MN   \n",
+       "7126                                        wark, david  Individual    MN   \n",
        "\n",
-       "     party company duplicated  \n",
-       "0      NaN     NaN         []  \n",
-       "1      NaN     NaN         []  \n",
-       "2      NaN     NaN         []  \n",
-       "3      NaN   L0021         []  \n",
-       "4      NaN     NaN         []  \n",
-       "...    ...     ...        ...  \n",
-       "7182   NaN     NaN         []  \n",
-       "7183   NaN     NaN         []  \n",
-       "7184   NaN     NaN         []  \n",
-       "7185   NaN     NaN         []  \n",
-       "7186   NaN     NaN         []  \n",
+       "     party                        company  \n",
+       "0      NaN                   NOT EMPLOYED  \n",
+       "1      NaN  PARTNERSHIP HEALTH PLAN OF CA  \n",
+       "2      NaN                MILLER CANFIELD  \n",
+       "3      NaN          MILLER PIPELINE CORP.  \n",
+       "4      NaN                            NaN  \n",
+       "...    ...                            ...  \n",
+       "7122   NaN                            NaN  \n",
+       "7123   NaN                            NaN  \n",
+       "7124   NaN                            NaN  \n",
+       "7125   NaN                            NaN  \n",
+       "7126   NaN                            NaN  \n",
        "\n",
-       "[7187 rows x 9 columns]"
+       "[7127 rows x 8 columns]"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "y = inds_sample.drop_duplicates()\n",
-    "\n",
-    "# now find the duplicates along all columns but the ID\n",
-    "y=y.groupby(inds_sample.columns[1:].tolist(),dropna=False)[\"id\"].agg(list).reset_index().rename(columns={\"id\": \"duplicated\"})\n",
-    "y.index=y[\"duplicated\"].str[0].tolist()\n",
-    "y[\"duplicated\"]=y[\"duplicated\"].str[1:]\n",
-    "\n",
-    "# now convert the duplicated column into a dictionary that can will be\n",
-    "# an output by only feeding the entries with duplicates\n",
-    "y = y.reset_index().rename(columns = {'index':'id'})\n",
-    "convert_duplicates_to_dict(y[y['duplicated'].apply(lambda x: len(x))>0][['id','duplicated']])\n",
-    "new_df = y.drop(['duplicated'], axis=1)\n",
-    "#return new_df\n",
+    "y=deduplicate_perfect_matches(inds_sample)\n",
     "y"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "Index(['first_name', 'last_name', 'full_name', 'entity_type', 'state', 'party',\n",
-       "       'company'],\n",
-       "      dtype='object')"
+       "7207"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "inds_sample.columns[1:]"
+    "a = inds_sample.drop_duplicates()\n",
+    "len(a)"
    ]
   },
   {

From 5843485fbeb48f4adb4a20a86a79cece154e10c0 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@g002.ds.uchicago.edu>
Date: Wed, 7 Feb 2024 23:50:38 -0600
Subject: [PATCH 39/42] implementing PR feedback

---
 utils/linkage.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 5db8745..1b27a84 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -4,6 +4,7 @@
 import pandas as pd
 import textdistance as td
 import usaddress
+import os.path
 
 from utils.constants import COMPANY_TYPES, repo_root
 
@@ -270,9 +271,9 @@ def get_street_from_address_line_1(address_line_1: str) -> str:
     return " ".join(string)
 
 
-def convert_duplicates_to_dict(df: pd.DataFrame) -> pd.DataFrame:
-    """Saves to the "output" directory a file mapping multiple strings to one
-    string
+def convert_duplicates_to_dict(df: pd.DataFrame) -> None:
+    """Saves to the "output" directory a file where each row represents a string
+    matching to another string
 
     Given a dataframe where each row contains one string in a column and a list
     of strings in another column, the function maps each string in the list to
@@ -296,11 +297,9 @@ def convert_duplicates_to_dict(df: pd.DataFrame) -> pd.DataFrame:
     # now convert dictionary into a csv file
     deduped_df = pd.DataFrame.from_dict(deduped_dict, "index")
     deduped_df = deduped_df.reset_index().rename(
-        columns={"index": "duplicated_uuids", 0: "mapped_uuids"}
-    )
+        columns={"index": "duplicated_uuids", 0: "mapped_uuids"})
     deduped_df.to_csv(
-        repo_root / "output" / "deduplicated_UUIDs.csv", index=False, mode="a"
-    )
+        repo_root / "output" / "deduplicated_UUIDs.csv", index=False, mode="a", header= not os.path.exists('../output/deduplicated_UUIDs.csv'))
 
 
 def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:

From 97b89dd7dba65a71b0c3ba31225e559d16c21617 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@g002.ds.uchicago.edu>
Date: Wed, 7 Feb 2024 23:51:44 -0600
Subject: [PATCH 40/42] addressing linter tests failure due to formatting

---
 utils/linkage.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 1b27a84..0b8459d 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -1,10 +1,11 @@
 """
 Module for performing record linkage on state campaign finance dataset
 """
+import os.path
+
 import pandas as pd
 import textdistance as td
 import usaddress
-import os.path
 
 from utils.constants import COMPANY_TYPES, repo_root
 
@@ -297,9 +298,14 @@ def convert_duplicates_to_dict(df: pd.DataFrame) -> None:
     # now convert dictionary into a csv file
     deduped_df = pd.DataFrame.from_dict(deduped_dict, "index")
     deduped_df = deduped_df.reset_index().rename(
-        columns={"index": "duplicated_uuids", 0: "mapped_uuids"})
+        columns={"index": "duplicated_uuids", 0: "mapped_uuids"}
+    )
     deduped_df.to_csv(
-        repo_root / "output" / "deduplicated_UUIDs.csv", index=False, mode="a", header= not os.path.exists('../output/deduplicated_UUIDs.csv'))
+        repo_root / "output" / "deduplicated_UUIDs.csv",
+        index=False,
+        mode="a",
+        header=not os.path.exists("../output/deduplicated_UUIDs.csv"),
+    )
 
 
 def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:

From 665519241ceaafdd5361d69bbfc6162226ea46e9 Mon Sep 17 00:00:00 2001
From: Alan Mburu Kagiri <alankagiri@g002.ds.uchicago.edu>
Date: Wed, 14 Feb 2024 02:27:03 -0600
Subject: [PATCH 41/42] updates to dedup file and beginning steps on networkx

---
 requirements.txt | 1 +
 utils/linkage.py | 9 ++-------
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index db05b66..d28ae9f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,4 @@ Requests==2.31.0
 setuptools==68.0.0
 textdistance==4.6.1
 usaddress==0.5.4
+networkx~=3.1
\ No newline at end of file
diff --git a/utils/linkage.py b/utils/linkage.py
index f323188..ee8dcd6 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -190,7 +190,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
 
     # if data is clean:
     if first_name + " " + last_name == full_name:
-        return full_name
+        return full_name.title()
 
     # some names have titles or professions associated with the name. We need to
     # remove those from the name.
@@ -333,16 +333,11 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
         .rename(columns={"id": "duplicated"})
     )
     new_df.index = new_df["duplicated"].str[0].tolist()
-    new_df["duplicated"] = new_df["duplicated"].str[1:]
 
     # now convert the duplicated column into a dictionary that can will be
     # an output by only feeding the entries with duplicates
     new_df = new_df.reset_index().rename(columns={"index": "id"})
-    convert_duplicates_to_dict(
-        new_df[new_df["duplicated"].apply(lambda x: len(x)) > 0][
-            ["id", "duplicated"]
-        ]
-    )
+    convert_duplicates_to_dict(new_df[["id", "duplicated"]])
     new_df = new_df.drop(["duplicated"], axis=1)
     return new_df
 

From b24041d9b532a1c3e363e3ab8c70d8a7fd2d9d79 Mon Sep 17 00:00:00 2001
From: Avery Schoen <33437601+averyschoen@users.noreply.github.com>
Date: Wed, 14 Feb 2024 10:01:12 -0600
Subject: [PATCH 42/42] Delete notebooks/Test.ipynb

---
 notebooks/Test.ipynb | 1111 ------------------------------------------
 1 file changed, 1111 deletions(-)
 delete mode 100644 notebooks/Test.ipynb

diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb
deleted file mode 100644
index 26d98b5..0000000
--- a/notebooks/Test.ipynb
+++ /dev/null
@@ -1,1111 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Example Notebook file demonstrating how to use the file structure\n",
-    "from utils.preprocess_util_lib_example import save_random_dataframe\n",
-    "from pathlib import Path\n",
-    "\n",
-    "save_random_dataframe(Path(\"../output\"), Path(\"test.csv\"))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def determine_comma_role(name: str) -> str:\n",
-    "    \"\"\"Given a string (someone's name), attempts to determine the role of the\n",
-    "    comma in the name and where it ought to belong.\n",
-    "\n",
-    "    Some assumptions are made:\n",
-    "        * If a suffix is included in the name and the name is not just the last\n",
-    "          name(i.e \"Doe, Jr), the format is\n",
-    "          (last_name suffix, first and middle name) i.e Doe iv, Jane Elisabeth\n",
-    "\n",
-    "        * If a comma is used anywhere else, it is in the format of\n",
-    "          (last_name, first and middle name) i.e Doe, Jane Elisabeth\n",
-    "    Args:\n",
-    "        name: a string representing a name/names of individuals\n",
-    "    Returns:\n",
-    "        the name with or without a comma based on some conditions\n",
-    "    \"\"\"\n",
-    "    suffixes = [\n",
-    "        \"sr\",\n",
-    "        \"jr\",\n",
-    "        \"i\",\n",
-    "        \"ii\",\n",
-    "        \"iii\",\n",
-    "        \"iv\",\n",
-    "        \"v\",\n",
-    "        \"vi\",\n",
-    "        \"vii\",\n",
-    "        \"viii\",\n",
-    "        \"ix\",\n",
-    "        \"x\",\n",
-    "    ]\n",
-    "    name_parts = name.lower().split(\",\")\n",
-    "    # if the comma is just in the end as a typo:\n",
-    "    if len(name_parts[1]) == 0:\n",
-    "        return name_parts[0].title()\n",
-    "    # if just the suffix in the end, leave the name as it is\n",
-    "    if name_parts[1].strip() in suffixes:\n",
-    "        return name.title()\n",
-    "    # at this point either it's just poor name placement, or the suffix is\n",
-    "    # in the beginning of the name. Either way, the first part of the list is\n",
-    "    # the true last name.\n",
-    "    last_part = name_parts.pop(0)\n",
-    "    first_part = \" \".join(name_parts)\n",
-    "    return first_part.title() + \" \" + last_part.title()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:\n",
-    "    \"\"\"Given name related columns, return a person's likely name\n",
-    "\n",
-    "    Given different formatting used accross states, errors in data entry\n",
-    "    and missing data, it can be difficult to determine someone's actual\n",
-    "    name. For example, some states have a last name column with values like\n",
-    "    \"Doe, Jane\", where the person's first name appears to have been erroneously\n",
-    "    included.\n",
-    "\n",
-    "    Args:\n",
-    "        first_name: raw value of first name column\n",
-    "        last_name: raw value last name column\n",
-    "        full_name: raw value of name or full_name column\n",
-    "    Returns:\n",
-    "        The most likely full name of the person listed\n",
-    "\n",
-    "    Sample Usage:\n",
-    "    >>> get_likely_name(\"Jane\", \"Doe\", \"\")\n",
-    "    'Jane Doe'\n",
-    "    >>> get_likely_name(\"\", \"\", \"Jane Doe\")\n",
-    "    'Jane Doe'\n",
-    "    >>> get_likely_name(\"\", \"Doe, Jane\", \"\")\n",
-    "    'Jane Doe'\n",
-    "    >>> get_likely_name(\"Jane Doe\", \"Doe\", \"Jane Doe\")\n",
-    "    'Jane Doe'\n",
-    "    >>> get_likely_name(\"Jane\",\"\",\"Doe, Sr\")\n",
-    "    'Jane Doe, Sr'\n",
-    "    >>> get_likely_name(\"Jane Elisabeth Doe, IV\",\"Elisabeth\",\"Doe, IV\")\n",
-    "    'Jane Elisabeth Doe, Iv'\n",
-    "    >>> get_likely_name(\"\",\"\",\"Jane Elisabeth Doe, IV\")\n",
-    "    'Jane Elisabeth Doe Iv'\n",
-    "    \"\"\"\n",
-    "    # first ensure clean input by deleting spaces:\n",
-    "    first_name, last_name, full_name = list(\n",
-    "        map(lambda x: x.lower().strip(), [first_name, last_name, full_name])\n",
-    "    )\n",
-    "\n",
-    "    # if data is clean:\n",
-    "    if first_name + \" \" + last_name == full_name:\n",
-    "        return full_name\n",
-    "\n",
-    "    # some names have titles or professions associated with the name. We need to\n",
-    "    # remove those from the name.\n",
-    "    titles = [\n",
-    "        \"mr\",\n",
-    "        \"ms\",\n",
-    "        \"mrs\",\n",
-    "        \"miss\",\n",
-    "        \"prof\",\n",
-    "        \"dr\",\n",
-    "        \"doctor\",\n",
-    "        \"sir\",\n",
-    "        \"madam\",\n",
-    "        \"professor\",\n",
-    "    ]\n",
-    "    names = [first_name, last_name, full_name]\n",
-    "\n",
-    "    for i in range(len(names)):\n",
-    "        # if there is a ',' deal with it accordingly\n",
-    "        if \",\" in names[i]:\n",
-    "            names[i] = determine_comma_role(names[i])\n",
-    "\n",
-    "        names[i] = names[i].replace(\".\", \"\").split(\" \")\n",
-    "        names[i] = [\n",
-    "            name_part for name_part in names[i] if name_part not in titles\n",
-    "        ]\n",
-    "        names[i] = \" \".join(names[i])\n",
-    "\n",
-    "    # one last check to remove any pieces that might add extra whitespace\n",
-    "    names = list(filter(lambda x: x != \"\", names))\n",
-    "    names = \" \".join(names)\n",
-    "    names = names.title().replace(\"  \",\" \").split(\" \")\n",
-    "    final_name = []\n",
-    "    [final_name.append(x) for x in names if x not in final_name]\n",
-    "    return \" \".join(final_name).strip()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "orgs_sample = pd.read_csv(\"../output/complete_organizations_table.csv\",index_col=0).sample(10000)\n",
-    "inds_sample = pd.read_csv(\"../output/complete_individuals_table.csv\",index_col=0, low_memory=False).sample(10000)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>name</th>\n",
-       "      <th>state</th>\n",
-       "      <th>entity_type</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>50c7d9a1-b448-46a5-8e2d-cd15b3097360</td>\n",
-       "      <td>REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>50c7d9a1-b448-46a5-8e2d-cd15b3097360</td>\n",
-       "      <td>REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>50c7d9a1-b448-46a5-8e2d-cd15b3097360</td>\n",
-       "      <td>REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>62ea1e9c-ac12-400c-b3dc-519389c0f7d3</td>\n",
-       "      <td>UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>62ea1e9c-ac12-400c-b3dc-519389c0f7d3</td>\n",
-       "      <td>UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>62ea1e9c-ac12-400c-b3dc-519389c0f7d3</td>\n",
-       "      <td>UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>d31df1ca-714e-4a82-9e88-1892c0451a71</td>\n",
-       "      <td>COMMITTEE TO ELECT DR PATRICIA BERNARD</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7</th>\n",
-       "      <td>d31df1ca-714e-4a82-9e88-1892c0451a71</td>\n",
-       "      <td>COMMITTEE TO ELECT DR PATRICIA BERNARD</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>8</th>\n",
-       "      <td>62ea1e9c-ac12-400c-b3dc-519389c0f7d3</td>\n",
-       "      <td>UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9</th>\n",
-       "      <td>4db76e6e-f0d5-40eb-82de-6dbcdb562dd7</td>\n",
-       "      <td>Ugi Utilities Inc/Ugi Energy Services Llc Pac</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10</th>\n",
-       "      <td>f71341d7-d27e-47eb-9b66-903af39d6cb5</td>\n",
-       "      <td>Pabar Pac (Pa Bar Assn)</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>11</th>\n",
-       "      <td>c875d7de-94be-42f1-b994-dd89b114d51e</td>\n",
-       "      <td>Pa Fraternal Order Of Police Pac</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>12</th>\n",
-       "      <td>910c4d36-b036-469e-aa2a-ea4ff8855a6c</td>\n",
-       "      <td>Citizens For Kail</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>13</th>\n",
-       "      <td>60d454d1-3773-4d88-80e9-132c161da0f0</td>\n",
-       "      <td>Paa Pac</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>14</th>\n",
-       "      <td>1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd</td>\n",
-       "      <td>MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15</th>\n",
-       "      <td>1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd</td>\n",
-       "      <td>MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>16</th>\n",
-       "      <td>1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe</td>\n",
-       "      <td>MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>17</th>\n",
-       "      <td>1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff</td>\n",
-       "      <td>MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>committee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>18</th>\n",
-       "      <td>1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd</td>\n",
-       "      <td>Paa Pac</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                      id  \\\n",
-       "0   50c7d9a1-b448-46a5-8e2d-cd15b3097360   \n",
-       "1   50c7d9a1-b448-46a5-8e2d-cd15b3097360   \n",
-       "2   50c7d9a1-b448-46a5-8e2d-cd15b3097360   \n",
-       "3   62ea1e9c-ac12-400c-b3dc-519389c0f7d3   \n",
-       "4   62ea1e9c-ac12-400c-b3dc-519389c0f7d3   \n",
-       "5   62ea1e9c-ac12-400c-b3dc-519389c0f7d3   \n",
-       "6   d31df1ca-714e-4a82-9e88-1892c0451a71   \n",
-       "7   d31df1ca-714e-4a82-9e88-1892c0451a71   \n",
-       "8   62ea1e9c-ac12-400c-b3dc-519389c0f7d3   \n",
-       "9   4db76e6e-f0d5-40eb-82de-6dbcdb562dd7   \n",
-       "10  f71341d7-d27e-47eb-9b66-903af39d6cb5   \n",
-       "11  c875d7de-94be-42f1-b994-dd89b114d51e   \n",
-       "12  910c4d36-b036-469e-aa2a-ea4ff8855a6c   \n",
-       "13  60d454d1-3773-4d88-80e9-132c161da0f0   \n",
-       "14  1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd   \n",
-       "15  1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd   \n",
-       "16  1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe   \n",
-       "17  1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff   \n",
-       "18  1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd   \n",
-       "\n",
-       "                                                 name state   entity_type  \n",
-       "0   REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...    MI     committee  \n",
-       "1   REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...    MI     committee  \n",
-       "2   REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...    MI     committee  \n",
-       "3   UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...    MI     committee  \n",
-       "4   UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...    MI     committee  \n",
-       "5   UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...    MI     committee  \n",
-       "6              COMMITTEE TO ELECT DR PATRICIA BERNARD    MI     committee  \n",
-       "7              COMMITTEE TO ELECT DR PATRICIA BERNARD    MI     committee  \n",
-       "8   UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...    MI     committee  \n",
-       "9       Ugi Utilities Inc/Ugi Energy Services Llc Pac    PA  Organization  \n",
-       "10                            Pabar Pac (Pa Bar Assn)    PA  Organization  \n",
-       "11                   Pa Fraternal Order Of Police Pac    PA  Organization  \n",
-       "12                                  Citizens For Kail    PA  Organization  \n",
-       "13                                            Paa Pac    PA  Organization  \n",
-       "14     MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC    MI     committee  \n",
-       "15     MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC    MI     committee  \n",
-       "16     MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC    MI     committee  \n",
-       "17     MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC    MI     committee  \n",
-       "18                                            Paa Pac    PA  Organization  "
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "data = {'id':['50c7d9a1-b448-46a5-8e2d-cd15b3097360','50c7d9a1-b448-46a5-8e2d-cd15b3097360','50c7d9a1-b448-46a5-8e2d-cd15b3097360',\n",
-    "              '62ea1e9c-ac12-400c-b3dc-519389c0f7d3','62ea1e9c-ac12-400c-b3dc-519389c0f7d3','62ea1e9c-ac12-400c-b3dc-519389c0f7d3',\n",
-    "              'd31df1ca-714e-4a82-9e88-1892c0451a71','d31df1ca-714e-4a82-9e88-1892c0451a71','62ea1e9c-ac12-400c-b3dc-519389c0f7d3',\n",
-    "              '4db76e6e-f0d5-40eb-82de-6dbcdb562dd7','f71341d7-d27e-47eb-9b66-903af39d6cb5','c875d7de-94be-42f1-b994-dd89b114d51e',\n",
-    "              '910c4d36-b036-469e-aa2a-ea4ff8855a6c','60d454d1-3773-4d88-80e9-132c161da0f0','1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd',\n",
-    "              '1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd','1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe','1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff',\n",
-    "              '1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd'],\n",
-    "        'name':['REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN PAC','REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN PAC',\n",
-    "                'REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN PAC','UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB',\n",
-    "                'UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB','UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB',\n",
-    "                'COMMITTEE TO ELECT DR PATRICIA BERNARD','COMMITTEE TO ELECT DR PATRICIA BERNARD','UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB',\n",
-    "                'Ugi Utilities Inc/Ugi Energy Services Llc Pac','Pabar Pac (Pa Bar Assn)','Pa Fraternal Order Of Police Pac','Citizens For Kail',\n",
-    "                'Paa Pac','MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC','MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC',\n",
-    "                'MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC','MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC','Paa Pac'],\n",
-    "        'state':['MI','MI','MI','MI','MI','MI','MI','MI','MI','PA','PA','PA','PA','PA','MI','MI','MI','MI','PA'],\n",
-    "        'entity_type':['committee','committee','committee','committee','committee','committee','committee','committee','committee',\n",
-    "                       'Organization','Organization','Organization','Organization','Organization','committee','committee','committee','committee','Organization']}\n",
-    "\n",
-    "sample_df = pd.DataFrame(data)\n",
-    "sample_df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from utils.constants import repo_root\n",
-    "def convert_duplicates_to_dict(df: pd.DataFrame)->pd.DataFrame:\n",
-    "    '''Takes a dataframe whose indexes are UUIDs, and a column that is a list of\n",
-    "    all other UUIDs that have duplicate values. The function then outputs a\n",
-    "    dictionary file where the deduped UUIDs map to the dataframe main UUID\n",
-    "    \n",
-    "    Args:\n",
-    "        A pandas dataframe with UUIDs as indexes and deduplicated UUIDs\n",
-    "        matching up to the index in the same row\n",
-    "        \n",
-    "    Returns\n",
-    "        None. However it outputs a dictionary to the output directory, with 2\n",
-    "        columns. The first, which indicates the deduplicated UUIDs, is labeled\n",
-    "        'duplicated_uuids', and the 2nd, which shows the uuids to which the\n",
-    "        deduplicated entries match two, is labeled 'mapped_uuids'.\n",
-    "    '''\n",
-    "    deduped_dict = {}\n",
-    "    for i in range(len(df)):\n",
-    "        deduped_uudis = df.iloc[i]['duplicated']\n",
-    "        for j in range(len(deduped_uudis)):\n",
-    "            deduped_dict.update({deduped_uudis[j]:df.iloc[i]['id']})\n",
-    "    \n",
-    "    # now convert dictionary into a csv file\n",
-    "    deduped_df = pd.DataFrame.from_dict(deduped_dict,'index') \n",
-    "    deduped_df = deduped_df.reset_index().rename(columns={\"index\":\"duplicated_uuids\", 0:\"mapped_uuids\"})\n",
-    "    deduped_df.to_csv(repo_root / \"output\" / \"deduplicated_UUIDs.csv\", index=False, mode='a')\n",
-    "\n",
-    "\n",
-    "def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:\n",
-    "    '''Given a dataframe, remove rows that have identical entry data beyond\n",
-    "    UUIDs, and output a file mapping an entry to other the UUIDs of the\n",
-    "    deduplicated rows\n",
-    "    \n",
-    "    Args:\n",
-    "        a pandas dataframe containing contribution data\n",
-    "    Returns:\n",
-    "        a deduplicated pandas dataframe containing contribution data\n",
-    "    '''\n",
-    "    #first remove all duplicate entries:\n",
-    "    new_df = df.drop_duplicates()\n",
-    "\n",
-    "    # now find the duplicates along all columns but the ID\n",
-    "    new_df=new_df.groupby(df.columns[1:].tolist(),dropna=False)[\"id\"].agg(list).reset_index().rename(columns={\"id\": \"duplicated\"})\n",
-    "    new_df.index=new_df[\"duplicated\"].str[0].tolist()\n",
-    "    new_df[\"duplicated\"]=new_df[\"duplicated\"].str[1:]\n",
-    "\n",
-    "    # now convert the duplicated column into a dictionary that can will be\n",
-    "    # an output by only feeding the entries with duplicates\n",
-    "    new_df = new_df.reset_index().rename(columns = {'index':'id'})\n",
-    "    convert_duplicates_to_dict(new_df[new_df['duplicated'].apply(lambda x: len(x))>0][['id','duplicated']])\n",
-    "    new_df = new_df.drop(['duplicated'], axis=1)\n",
-    "    return new_df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>name</th>\n",
-       "      <th>state</th>\n",
-       "      <th>entity_type</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>3246120d-45fc-4d19-adee-d2aa2c5be6db</td>\n",
-       "      <td>1 BOLD STEP</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>corporation</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>8fc7e5d5-558d-42ea-bd9a-8e48a4a9a4bd</td>\n",
-       "      <td>12CDRC</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>corporation</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>a5379930-7324-4f1d-b216-84d9e9ddea40</td>\n",
-       "      <td>303 MANAGEMENT INC.</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>corporation</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>9064112f-ef40-4690-9d0a-782a2375feb0</td>\n",
-       "      <td>314 ACTION FUND</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>corporation</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>9e11e7ae-ee29-4a50-9720-41c6ac556a1f</td>\n",
-       "      <td>A T AND T MICHIGAN PAC</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>corporation</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2149</th>\n",
-       "      <td>d79f9729-c9af-4347-868a-ae6e6814a295</td>\n",
-       "      <td>Zach Kirk</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2150</th>\n",
-       "      <td>fbfea472-e183-4479-b869-90eddfa5198c</td>\n",
-       "      <td>Zest Kitchen</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2151</th>\n",
-       "      <td>c105a4af-9fd4-4a5b-a7b8-1e8738ff39c6</td>\n",
-       "      <td>Zoom Us</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2152</th>\n",
-       "      <td>59cc8db9-607e-4e1b-ba41-0850b6019360</td>\n",
-       "      <td>Zoom Video Communications Inc.</td>\n",
-       "      <td>PA</td>\n",
-       "      <td>Organization</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2153</th>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>corporation</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>2154 rows × 4 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                        id  \\\n",
-       "0     3246120d-45fc-4d19-adee-d2aa2c5be6db   \n",
-       "1     8fc7e5d5-558d-42ea-bd9a-8e48a4a9a4bd   \n",
-       "2     a5379930-7324-4f1d-b216-84d9e9ddea40   \n",
-       "3     9064112f-ef40-4690-9d0a-782a2375feb0   \n",
-       "4     9e11e7ae-ee29-4a50-9720-41c6ac556a1f   \n",
-       "...                                    ...   \n",
-       "2149  d79f9729-c9af-4347-868a-ae6e6814a295   \n",
-       "2150  fbfea472-e183-4479-b869-90eddfa5198c   \n",
-       "2151  c105a4af-9fd4-4a5b-a7b8-1e8738ff39c6   \n",
-       "2152  59cc8db9-607e-4e1b-ba41-0850b6019360   \n",
-       "2153                                   NaN   \n",
-       "\n",
-       "                                       name state   entity_type  \n",
-       "0      1 BOLD STEP                             MI   corporation  \n",
-       "1      12CDRC                                  MI   corporation  \n",
-       "2      303 MANAGEMENT INC.                     MI   corporation  \n",
-       "3      314 ACTION FUND                         MI   corporation  \n",
-       "4      A T AND T MICHIGAN PAC                  MI   corporation  \n",
-       "...                                     ...   ...           ...  \n",
-       "2149                              Zach Kirk    PA  Organization  \n",
-       "2150                           Zest Kitchen    PA  Organization  \n",
-       "2151                                Zoom Us    PA  Organization  \n",
-       "2152         Zoom Video Communications Inc.    PA  Organization  \n",
-       "2153                                    NaN    MI   corporation  \n",
-       "\n",
-       "[2154 rows x 4 columns]"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "x = deduplicate_perfect_matches(orgs_sample)\n",
-    "#len(x.iloc[2]['duplicated'])\n",
-    "x"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>first_name</th>\n",
-       "      <th>last_name</th>\n",
-       "      <th>full_name</th>\n",
-       "      <th>entity_type</th>\n",
-       "      <th>state</th>\n",
-       "      <th>party</th>\n",
-       "      <th>company</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>f6df631a-e626-4861-b62b-e09512887bd3</td>\n",
-       "      <td>A SCOTT</td>\n",
-       "      <td>PARIS</td>\n",
-       "      <td>A SCOTT              PARIS                    ...</td>\n",
-       "      <td>Individual</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NOT EMPLOYED</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>075fb1c6-6c70-4ec6-a439-fcebb76c4e0a</td>\n",
-       "      <td>A. MARK</td>\n",
-       "      <td>GLICKSTEIN</td>\n",
-       "      <td>A. MARK              GLICKSTEIN               ...</td>\n",
-       "      <td>Individual</td>\n",
-       "      <td>CA</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>PARTNERSHIP HEALTH PLAN OF CA</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>4a3968f5-7f5e-4ed1-8f39-bfc70bc67af8</td>\n",
-       "      <td>A. MICHAEL</td>\n",
-       "      <td>PALIZZI</td>\n",
-       "      <td>A. MICHAEL           PALIZZI                  ...</td>\n",
-       "      <td>Individual</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>MILLER CANFIELD</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>bb952efc-3dba-4449-9405-ea65202fbbea</td>\n",
-       "      <td>AARON</td>\n",
-       "      <td>ALDRICH</td>\n",
-       "      <td>AARON                ALDRICH                  ...</td>\n",
-       "      <td>Individual</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>MILLER PIPELINE CORP.</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>79ec4a73-f688-479a-a4e3-0b0a3813188a</td>\n",
-       "      <td>AARON</td>\n",
-       "      <td>BLAND</td>\n",
-       "      <td>AARON                BLAND                    ...</td>\n",
-       "      <td>Individual</td>\n",
-       "      <td>MI</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7122</th>\n",
-       "      <td>a1a6ff3b-cfa4-4b84-bf8c-20984f9871f0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>Trone, Robert</td>\n",
-       "      <td>Individual</td>\n",
-       "      <td>MN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7123</th>\n",
-       "      <td>37ab55f5-3613-469c-8b66-ac8888f5bcae</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>Wark, Mary Ann</td>\n",
-       "      <td>Individual</td>\n",
-       "      <td>MN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7124</th>\n",
-       "      <td>92d5ac7c-4702-420c-97a7-656111677f5a</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>Wenstrom, Gene</td>\n",
-       "      <td>Individual</td>\n",
-       "      <td>MN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7125</th>\n",
-       "      <td>fa934bf1-f611-4cd3-9bff-451bdf2e5bd2</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>Wika, Kevin</td>\n",
-       "      <td>Individual</td>\n",
-       "      <td>MN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7126</th>\n",
-       "      <td>fb8bb833-7010-418a-9f24-1a29771e0b67</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>wark, david</td>\n",
-       "      <td>Individual</td>\n",
-       "      <td>MN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>7127 rows × 8 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                        id            first_name  \\\n",
-       "0     f6df631a-e626-4861-b62b-e09512887bd3  A SCOTT                \n",
-       "1     075fb1c6-6c70-4ec6-a439-fcebb76c4e0a  A. MARK                \n",
-       "2     4a3968f5-7f5e-4ed1-8f39-bfc70bc67af8  A. MICHAEL             \n",
-       "3     bb952efc-3dba-4449-9405-ea65202fbbea  AARON                  \n",
-       "4     79ec4a73-f688-479a-a4e3-0b0a3813188a  AARON                  \n",
-       "...                                    ...                   ...   \n",
-       "7122  a1a6ff3b-cfa4-4b84-bf8c-20984f9871f0                   NaN   \n",
-       "7123  37ab55f5-3613-469c-8b66-ac8888f5bcae                   NaN   \n",
-       "7124  92d5ac7c-4702-420c-97a7-656111677f5a                   NaN   \n",
-       "7125  fa934bf1-f611-4cd3-9bff-451bdf2e5bd2                   NaN   \n",
-       "7126  fb8bb833-7010-418a-9f24-1a29771e0b67                   NaN   \n",
-       "\n",
-       "                                 last_name  \\\n",
-       "0     PARIS                                  \n",
-       "1     GLICKSTEIN                             \n",
-       "2     PALIZZI                                \n",
-       "3     ALDRICH                                \n",
-       "4     BLAND                                  \n",
-       "...                                    ...   \n",
-       "7122                                   NaN   \n",
-       "7123                                   NaN   \n",
-       "7124                                   NaN   \n",
-       "7125                                   NaN   \n",
-       "7126                                   NaN   \n",
-       "\n",
-       "                                              full_name entity_type state  \\\n",
-       "0     A SCOTT              PARIS                    ...  Individual    MI   \n",
-       "1     A. MARK              GLICKSTEIN               ...  Individual    CA   \n",
-       "2     A. MICHAEL           PALIZZI                  ...  Individual    MI   \n",
-       "3     AARON                ALDRICH                  ...  Individual    MI   \n",
-       "4     AARON                BLAND                    ...  Individual    MI   \n",
-       "...                                                 ...         ...   ...   \n",
-       "7122                                      Trone, Robert  Individual    MN   \n",
-       "7123                                     Wark, Mary Ann  Individual    MN   \n",
-       "7124                                     Wenstrom, Gene  Individual    MN   \n",
-       "7125                                        Wika, Kevin  Individual    MN   \n",
-       "7126                                        wark, david  Individual    MN   \n",
-       "\n",
-       "     party                        company  \n",
-       "0      NaN                   NOT EMPLOYED  \n",
-       "1      NaN  PARTNERSHIP HEALTH PLAN OF CA  \n",
-       "2      NaN                MILLER CANFIELD  \n",
-       "3      NaN          MILLER PIPELINE CORP.  \n",
-       "4      NaN                            NaN  \n",
-       "...    ...                            ...  \n",
-       "7122   NaN                            NaN  \n",
-       "7123   NaN                            NaN  \n",
-       "7124   NaN                            NaN  \n",
-       "7125   NaN                            NaN  \n",
-       "7126   NaN                            NaN  \n",
-       "\n",
-       "[7127 rows x 8 columns]"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "y=deduplicate_perfect_matches(inds_sample)\n",
-    "y"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "7207"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "a = inds_sample.drop_duplicates()\n",
-    "len(a)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Max Speed</th>\n",
-       "      <th>Animal</th>\n",
-       "      <th>Color</th>\n",
-       "      <th>Age</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>380.0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>green</td>\n",
-       "      <td>2.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>370.0</td>\n",
-       "      <td>Falcon</td>\n",
-       "      <td>None</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>NaN</td>\n",
-       "      <td>None</td>\n",
-       "      <td>yellow</td>\n",
-       "      <td>5.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>NaN</td>\n",
-       "      <td>Parrot</td>\n",
-       "      <td>blue</td>\n",
-       "      <td>6.0</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   Max Speed  Animal   Color  Age\n",
-       "0      380.0    None   green  2.0\n",
-       "1      370.0  Falcon    None  NaN\n",
-       "2        NaN    None  yellow  5.0\n",
-       "3        NaN  Parrot    blue  6.0"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "df = pd.DataFrame({'Max Speed': [380., 370., np.nan, np.nan],\n",
-    "                   'Animal': ['None', 'Falcon', 'None', 'Parrot'],\n",
-    "                   'Color':['green',None,'yellow','blue'],\n",
-    "                   'Age':[2,np.nan,5,6]})\n",
-    "df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fa2f9d5bb50>"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df= df.groupby(df.columns[1:].tolist(), dropna=False)[\"Max Speed\"]#.agg(list)#.reset_index()\n",
-    "df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Age</th>\n",
-       "      <th>Animal</th>\n",
-       "      <th>Color</th>\n",
-       "      <th>Max Speed</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>2.0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>green</td>\n",
-       "      <td>[380.0]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>5.0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>yellow</td>\n",
-       "      <td>[nan]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>6.0</td>\n",
-       "      <td>Parrot</td>\n",
-       "      <td>blue</td>\n",
-       "      <td>[nan]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>NaN</td>\n",
-       "      <td>Falcon</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>[370.0]</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   Age  Animal   Color Max Speed\n",
-       "0  2.0    None   green   [380.0]\n",
-       "1  5.0    None  yellow     [nan]\n",
-       "2  6.0  Parrot    blue     [nan]\n",
-       "3  NaN  Falcon     NaN   [370.0]"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df = df.groupby((df.columns.difference(['Max Speed'])).tolist(),dropna=False)['Max Speed'].agg(list).reset_index()\n",
-    "df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f59594f81d0>"
-      ]
-     },
-     "execution_count": 24,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.7"
-  },
-  "orig_nbformat": 4
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}