From 46a5c3b8d297598d561da2aaecc3d90493f8427e Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Thu, 18 Jan 2024 00:05:53 -0600 Subject: [PATCH 01/42] get_likely function done --- setup.py | 2 +- utils/linkage.py | 120 +++++++++++++++++------------------------------ 2 files changed, 44 insertions(+), 78 deletions(-) diff --git a/setup.py b/setup.py index 63ef672..07404ac 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup setup( - name="2023-fall-clinic-climate-cabinet", + name="2024-winter-clinic-climate-cabinet", version="0.1.0", packages=find_packages( include=[ diff --git a/utils/linkage.py b/utils/linkage.py index aa56307..c3ddf1b 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -3,33 +3,6 @@ """ -def calculate_string_similarity(string1: str, string2: str) -> float: - """Returns how similar two strings are on a scale of 0 to 1 - - The exact meaning of the metric is open, but the following must hold true: - 1. equivalent strings must return 1 - 2. strings with no similar characters must return 0 - 3. strings with higher intuitive similarity must return higher scores - - Args: - string1: any string - string2: any string - Returns: - similarity score - - Sample Usage: - >>> calculate_string_similarity("exact match", "exact match") - 1.0 - >>> calculate_string_similarity("aaaaaa", "bbbbbbbbbbb") - 0.0 - >>> similar_score = calculate_string_similarity("very similar", "vary similar") - >>> different_score = calculate_string_similarity("very similar", "very not close") - >>> similar_socre > different_score - True - """ - pass - - def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: """Given name related columns, return a person's likely name @@ -56,54 +29,47 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: >>> get_likely_name("Jane Doe", "Doe", "Jane Doe") "Jane Doe" """ - pass - - -def get_address_line_1_from_full_address(address: str) -> str: - """Given a full address, return the first line of the formatted address - - Address line 1 usually includes street address or PO Box information. - - Args: - address: raw string representing full address - Returns: - address_line_1 - - Sample Usage: - >>> get_address_line_1_from_full_address("6727 W. Corrine Dr. Peoria,AZ 85381") - "6727 W. Corrine Dr." - >>> get_address_line_1_from_full_address("P.O. Box 5456 Sun City West ,AZ 85375") - "P.O. Box 5456" - >>> get_address_line_1_from_full_address("119 S 5th St Niles,MI 49120") - "119 S 5th St" - >>> get_address_line_1_from_full_address( - ... "1415 PARKER STREET APT 251 DETROIT MI 48214-0000" - ... ) - "1415 PARKER STREET" - """ - pass - -def get_street_from_address_line_1(address_line_1: str) -> str: - """Given an address line 1, return the street name - - Args: - address_line_1: either street information or PO box - Returns: - street name - Raises: - ValueError: if string is malformed and no street can be reasonably - found. - - >>> get_street_from_address_line_1("5645 N. UBER ST") - "UBER ST" - >>> get_street_from_address_line_1("") - Traceback (most recent call last): - ... - ValueError: address_line_1 must have whitespace - >>> get_street_from_address_line_1("PO Box 1111") - Traceback (most recent call last): - ... - ValueError: address_line_1 is PO Box - """ - pass + # if data is clean: + if first_name + " " + last_name == full_name: + return full_name + + # some names have titles or professions associated with the name. We need to + # remove those from the name. + titles = [ + "mr", + "ms", + "mrs", + "miss", + "prof", + "dr", + "doctor", + "sir", + "madam", + "professor", + ] + names = [first_name, last_name, full_name] + + for i in range(len(names)): + # if there is a ',' switch around the names + if "," in names[i]: + index = names[i].find(",") + first_part = names[i][index + 1 :] + last_part = names[i][0:index] + names[i] = first_part + " " + last_part + + names[i] = names[i].lower().replace(".", "").split(" ") + names[i] = [ + name_part for name_part in names[i] if name_part not in titles + ] + names[i] = " ".join(names[i]) + + names = " ".join(names) + names = names.split(" ") + final_name = [] + [ + final_name.append(x) + for x in names + if ((x not in final_name) & (len(x) != 0)) + ] + return " ".join(final_name) From 073c935e3861ebc12a086ae5bc01fee4acadc373 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Thu, 18 Jan 2024 00:15:34 -0600 Subject: [PATCH 02/42] added .title() function to return proper name format --- utils/linkage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/linkage.py b/utils/linkage.py index c3ddf1b..7461049 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -72,4 +72,4 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: for x in names if ((x not in final_name) & (len(x) != 0)) ] - return " ".join(final_name) + return " ".join(final_name).title() From 16a51dc7ba80ecd3ecdc4653f8623c1b5a8fb9a1 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Thu, 18 Jan 2024 00:55:07 -0600 Subject: [PATCH 03/42] struggling with converting single quotes into double quotes for function output --- utils/linkage.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 7461049..f695a0a 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -67,9 +67,5 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: names = " ".join(names) names = names.split(" ") final_name = [] - [ - final_name.append(x) - for x in names - if ((x not in final_name) & (len(x) != 0)) - ] - return " ".join(final_name).title() + [final_name.append(x) for x in names if x not in final_name] + return " ".join(final_name).title().strip() From c446aaf79b5843a416d6951cc19a7f1554347f1a Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Sat, 20 Jan 2024 03:03:36 -0600 Subject: [PATCH 04/42] updates to get_likely_name function after feedback to consider generational suffixes and handle more edge cases --- utils/linkage.py | 64 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 10 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index f695a0a..f43e09f 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -1,6 +1,47 @@ -""" -Module for performing record linkage on state campaign finance dataset -""" +def determine_comma_role(name: str) -> str: + """Given a string (someone's name), attempts to determine the role of the + comma in the name and where it ought to belong. + + Some assumptions are made: + * If a suffix is included in the name and the name is not just the last + name(i.e "Doe, Jr), the format is + (last_name suffix, first and middle name) i.e Doe iv, Jane Elisabeth + + * If a comma is used anywhere else, it is in the format of + (last_name, first and middle name) i.e Doe, Jane Elisabeth + + Args: + name: a string representing a name/names of individuals + Returns: + the name with or without a comma based on some conditions + """ + suffixes = [ + "sr", + "jr", + "i", + "ii", + "iii", + "iv", + "v", + "vi", + "vii", + "viii", + "ix", + "x", + ] + name_parts = name.split(",") + # if the comma is just in the end as a typo: + if len(name_parts[1]) == 0: + return name_parts[0] + # if just the suffix in the end, leave the name as it is + if name_parts[1].strip() in suffixes: + return name + # at this point either it's just poor name placement, or the suffix is + # in the beginning of the name. Either way, the first part of the list is the + # true last name. + last_part = name_parts.pop(0) + first_part = " ".join(name_parts) + return first_part + " " + last_part def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: @@ -29,6 +70,10 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: >>> get_likely_name("Jane Doe", "Doe", "Jane Doe") "Jane Doe" """ + # first ensure clean input by deleting spaces: + first_name, last_name, full_name = list( + map(lambda x: x.lower().strip(), [first_name, last_name, full_name]) + ) # if data is clean: if first_name + " " + last_name == full_name: @@ -51,20 +96,19 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: names = [first_name, last_name, full_name] for i in range(len(names)): - # if there is a ',' switch around the names + # if there is a ',' deal with it accordingly if "," in names[i]: - index = names[i].find(",") - first_part = names[i][index + 1 :] - last_part = names[i][0:index] - names[i] = first_part + " " + last_part - - names[i] = names[i].lower().replace(".", "").split(" ") + names[i] = determine_comma_role(names[i]) + print(names[i]) + names[i] = names[i].replace(".", "").split(" ") names[i] = [ name_part for name_part in names[i] if name_part not in titles ] names[i] = " ".join(names[i]) + print(names[i]) names = " ".join(names) + print("after comma: ", names) names = names.split(" ") final_name = [] [final_name.append(x) for x in names if x not in final_name] From efc02e22ebc298095c2abf7a4adcd13db02b2a2d Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Sat, 20 Jan 2024 03:11:05 -0600 Subject: [PATCH 05/42] adjusted the sample usage output to single quotes as per Avery's suggestion --- utils/linkage.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index f43e09f..9b9ba22 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -62,13 +62,19 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: Sample Usage: >>> get_likely_name("Jane", "Doe", "") - "Jane Doe" + 'Jane Doe' >>> get_likely_name("", "", "Jane Doe") - "Jane Doe" + 'Jane Doe' >>> get_likely_name("", "Doe, Jane", "") - "Jane Doe" + 'Jane Doe' >>> get_likely_name("Jane Doe", "Doe", "Jane Doe") - "Jane Doe" + 'Jane Doe' + >>> get_likely_name("Jane","","Doe, Sr") + 'Jane Doe, Sr' + >>> get_likely_name("Jane Elisabeth Doe, IV","Elisabeth","Doe, IV) + 'Jane Elisabeth Doe, Iv' + >>> get_likely_name("","",Jane Elisabeth Doe, IV") + 'Jane Elisabeth Doe Iv' """ # first ensure clean input by deleting spaces: first_name, last_name, full_name = list( @@ -99,16 +105,14 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: # if there is a ',' deal with it accordingly if "," in names[i]: names[i] = determine_comma_role(names[i]) - print(names[i]) + names[i] = names[i].replace(".", "").split(" ") names[i] = [ name_part for name_part in names[i] if name_part not in titles ] names[i] = " ".join(names[i]) - print(names[i]) names = " ".join(names) - print("after comma: ", names) names = names.split(" ") final_name = [] [final_name.append(x) for x in names if x not in final_name] From 6c37c4576c39ec2d1ac6856c036ed6dceef6c628 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Sat, 20 Jan 2024 03:25:32 -0600 Subject: [PATCH 06/42] took care of empty strings that were adding extra whitespace to o output --- utils/linkage.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/utils/linkage.py b/utils/linkage.py index 9b9ba22..521c75c 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -111,7 +111,9 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: name_part for name_part in names[i] if name_part not in titles ] names[i] = " ".join(names[i]) - + + #one last check to remove any pieces that might add extra whitespace + names = list(filter(lambda x: x != '', names)) names = " ".join(names) names = names.split(" ") final_name = [] From 81e52dbdb537e4ee6caae02462c49ba7a2ef1d1a Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Sat, 20 Jan 2024 03:27:12 -0600 Subject: [PATCH 07/42] took care of empty strings that were adding extra whitespace to output --- utils/linkage.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 521c75c..4c1d24f 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -111,9 +111,9 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: name_part for name_part in names[i] if name_part not in titles ] names[i] = " ".join(names[i]) - - #one last check to remove any pieces that might add extra whitespace - names = list(filter(lambda x: x != '', names)) + + # one last check to remove any pieces that might add extra whitespace + names = list(filter(lambda x: x != "", names)) names = " ".join(names) names = names.split(" ") final_name = [] From 2dcb7d9592be19be15e688101509a25581848dcc Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Sat, 20 Jan 2024 03:30:06 -0600 Subject: [PATCH 08/42] fixed error in sample usage output --- utils/linkage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/linkage.py b/utils/linkage.py index 4c1d24f..df15117 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -71,7 +71,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: 'Jane Doe' >>> get_likely_name("Jane","","Doe, Sr") 'Jane Doe, Sr' - >>> get_likely_name("Jane Elisabeth Doe, IV","Elisabeth","Doe, IV) + >>> get_likely_name("Jane Elisabeth Doe, IV","Elisabeth","Doe, IV") 'Jane Elisabeth Doe, Iv' >>> get_likely_name("","",Jane Elisabeth Doe, IV") 'Jane Elisabeth Doe Iv' From 20f4e938e09fa98d1f5acddf7e6eee5c8c2684b5 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Thu, 25 Jan 2024 05:16:07 +0000 Subject: [PATCH 09/42] adding cleaning_company_column function --- utils/linkage.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/utils/linkage.py b/utils/linkage.py index fe4dfd3..86485d3 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -1,6 +1,7 @@ """ Module for performing record linkage on state campaign finance dataset """ +import pandas as pd import usaddress @@ -47,3 +48,54 @@ def get_street_from_address_line_1(address_line_1: str) -> str: string.append(key) return " ".join(string) + + +""" +Module for standardizing the 'company' columnn of the state campaign finance dataset +""" + + +def cleaning_company_column(company: str) -> str: + """ + Given a string, check if it contains a variation of self employed, unemployed, + or retired and return the standardized version. + + Args: + company: string of inputted company names + Returns: + standardized for retired, self employed, and unemployed, + or original string if no match or empty string + + >>> cleaning_company_column("Retireed") + 'Retired' + >>> cleaning_company_column("self") + 'Self Employed' + >>> cleaning_company_column("None") + 'Unemployed' + """ + if pd.isnull(company): + return company + + company_edited = company.lower() + company_edited = company_edited.strip() + company_edited = company_edited.replace(".", " ") + company_edited = company_edited.replace(",", " ") + company_edited = company_edited.replace("-", " ") + + if "retire" in company_edited: + return "Retired" + elif "self employe" in company_edited or company_edited == "self": + return "Self Employed" + elif ( + "unemploye" in company_edited + or company_edited == "none" + or company_edited == "not employed" + ): + return "Unemployed" + + else: + return company + + +# Example implementation of the function standardize_company_column for a dataframe +# df['standardized_company'] = df['company'].apply(standardize_company_column) From baf56f5707c31b222f97322c2b244892982873a5 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Mon, 29 Jan 2024 10:26:49 -0600 Subject: [PATCH 10/42] testing if merge was done correctly after git pull --- requirements.txt | 2 ++ utils/linkage.py | 86 +++++++++++++++++++++++++++++++++++++++++++++-- utils/pipeline.py | 1 + 3 files changed, 87 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6658f0e..944e1c5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,3 +17,5 @@ beautifulsoup4==4.11.1 numpy==1.25.0 Requests==2.31.0 setuptools==68.0.0 +textdistance==4.6.1 +usaddress==0.5.4 \ No newline at end of file diff --git a/utils/linkage.py b/utils/linkage.py index df15117..e88a4a3 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -1,3 +1,41 @@ +""" +Module for performing record linkage on state campaign finance dataset +""" +import textdistance as td +import usaddress + +def calculate_string_similarity(string1: str, string2: str) -> float: + """Returns how similar two strings are on a scale of 0 to 1 + + This version utilizes Jaro-Winkler distance, which is a metric of + edit distance. Jaro-Winkler specially prioritizes the early + characters in a string. + + Since the ends of strings are often more valuable in matching names + and addresses, we reverse the strings before matching them. + + https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance + https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro-winkler.js + + The exact meaning of the metric is open, but the following must hold true: + 1. equivalent strings must return 1 + 2. strings with no similar characters must return 0 + 3. strings with higher intuitive similarity must return higher scores + similarity score + + Sample Usage: + >>> calculate_string_similarity("exact match", "exact match") + 1.0 + >>> calculate_string_similarity("aaaaaa", "bbbbbbbbbbb") + 0.0 + >>> similar_score = calculate_string_similarity("very similar", "vary similar") + >>> different_score = calculate_string_similarity("very similar", "very not close") + >>> similar_score > different_score + True + """ + + return float(td.jaro_winkler(string1.lower()[::-1], string2.lower()[::-1])) + def determine_comma_role(name: str) -> str: """Given a string (someone's name), attempts to determine the role of the comma in the name and where it ought to belong. @@ -9,7 +47,6 @@ def determine_comma_role(name: str) -> str: * If a comma is used anywhere else, it is in the format of (last_name, first and middle name) i.e Doe, Jane Elisabeth - Args: name: a string representing a name/names of individuals Returns: @@ -73,7 +110,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: 'Jane Doe, Sr' >>> get_likely_name("Jane Elisabeth Doe, IV","Elisabeth","Doe, IV") 'Jane Elisabeth Doe, Iv' - >>> get_likely_name("","",Jane Elisabeth Doe, IV") + >>> get_likely_name("","","Jane Elisabeth Doe, IV") 'Jane Elisabeth Doe Iv' """ # first ensure clean input by deleting spaces: @@ -119,3 +156,48 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: final_name = [] [final_name.append(x) for x in names if x not in final_name] return " ".join(final_name).title().strip() + + +def get_street_from_address_line_1(address_line_1: str) -> str: + """Given an address line 1, return the street name + + Args: + address_line_1: either street information or PO box + Returns: + street name + Raises: + ValueError: if string is malformed and no street can be reasonably + found. + + >>> get_street_from_address_line_1("5645 N. UBER ST") + 'UBER ST' + >>> get_street_from_address_line_1("") + Traceback (most recent call last): + ... + ValueError: address_line_1 must have whitespace + >>> get_street_from_address_line_1("PO Box 1111") + Traceback (most recent call last): + ... + ValueError: address_line_1 is PO Box + >>> get_street_from_address_line_1("300 59 St.") + '59 St.' + >>> get_street_from_address_line_1("Uber St.") + 'Uber St.' + >>> get_street_from_address_line_1("3NW 59th St") + '59th St' + """ + if not address_line_1 or address_line_1.isspace(): + raise ValueError("address_line_1 must have whitespace") + + address_line_lower = address_line_1.lower() + + if "po box" in address_line_lower: + raise ValueError("address_line_1 is PO Box") + + string = [] + address = usaddress.parse(address_line_1) + for key, val in address: + if val in ["StreetName", "StreetNamePostType"]: + string.append(key) + + return " ".join(string) diff --git a/utils/pipeline.py b/utils/pipeline.py index 7a288fd..e6b7a12 100644 --- a/utils/pipeline.py +++ b/utils/pipeline.py @@ -18,6 +18,7 @@ single_state_organizations_tables = [] single_state_transactions_tables = [] for state_cleaner in state_cleaners: + print("Cleaning...") ( individuals_table, organizations_table, From 3d6500cfb5ef60aa3a745c593802cb605f840800 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Mon, 29 Jan 2024 11:17:58 -0600 Subject: [PATCH 11/42] undoing the mistake of previous commit where I committed files from the data and output directories --- utils/linkage.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 1500dc0..e419c6f 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -4,6 +4,7 @@ import textdistance as td import usaddress + def calculate_string_similarity(string1: str, string2: str) -> float: """Returns how similar two strings are on a scale of 0 to 1 @@ -75,8 +76,8 @@ def determine_comma_role(name: str) -> str: if name_parts[1].strip() in suffixes: return name # at this point either it's just poor name placement, or the suffix is - # in the beginning of the name. Either way, the first part of the list is the - # true last name. + # in the beginning of the name. Either way, the first part of the list is + # the true last name. last_part = name_parts.pop(0) first_part = " ".join(name_parts) return first_part + " " + last_part @@ -157,7 +158,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: final_name = [] [final_name.append(x) for x in names if x not in final_name] return " ".join(final_name).title().strip() - + def get_street_from_address_line_1(address_line_1: str) -> str: """Given an address line 1, return the street name From ca8b3f7aa83262e8c8de1064d962a2f19f16da86 Mon Sep 17 00:00:00 2001 From: npashilkar Date: Mon, 29 Jan 2024 23:21:27 -0600 Subject: [PATCH 12/42] standardizing corporate names function --- utils/linkage.py | 66 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/utils/linkage.py b/utils/linkage.py index 44f24e5..f99ab5a 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -87,3 +87,69 @@ def get_street_from_address_line_1(address_line_1: str) -> str: string.append(key) return " ".join(string) + + +def standardize_corp_names(company_name: str) -> str: + """Given an employer name, return the standardized version + + Args: + company_name: corporate name + Returns: + standardized company name + + >>>standardize_corp_names('MI BEER WINE WHOLESALERS ASSOC') + 'MI BEER WINE WHOLESALERS ASSOCIATION' + + >>>standardize_corp_names('MI COMMUNITY COLLEGE ASSOCIATION') + 'MI COMMUNITY COLLEGE ASSOCIATION' + + >>>standardize_corp_names('STEPHANIES CHANGEMAKER FUND') + 'STEPHANIES CHANGEMAKER FUND' + + """ + + + company_name_split = company_name.upper().split(' ') + + company_types = { + 'CORP': 'CORPORATION', + 'CO': 'CORPORATION', + 'LLC': 'LIMITED LIABILITY COMPANY', + 'PTNR': 'PARTNERSHIP', + 'LP': 'LIMITED PARTNERSHIP', + 'LLP': 'LIMITED LIABILITY PARTNERSHIP', + 'SOLE PROP': 'SOLE PROPRIETORSHIP', + 'SP': 'SOLE PROPRIETORSHIP', + 'NPO': 'NONPROFIT ORGANIZATION', + 'PC': 'PROFESSIONAL CORPORATION', + 'CO-OP': 'COOPERATIVE', + 'LTD': 'LIMITED COMPANY', + 'JSC': 'JOINT STOCK COMPANY', + 'HOLDCO': 'HOLDING COMPANY', + 'PLC': 'PUBLIC LIMITED COMPANY', + 'PVT LTD': 'PRIVATE LIMITED COMPANY', + 'INC': 'INCORPORATED', + 'ASSOC': 'ASSOCIATION', + 'FDN': 'FOUNDATION', + 'TR': 'TRUST', + 'SOC': 'SOCIETY', + 'CONSORT': 'CONSORTIUM', + 'SYND': 'SYNDICATE', + 'GRP': 'GROUP', + 'CORP SOLE': 'CORPORATION SOLE', + 'JV': 'JOINT VENTURE', + 'SUB': 'SUBSIDIARY', + 'FRANCHISE': 'FRANCHISE', + 'PA': 'PROFESSIONAL ASSOCIATION', + 'CIC': 'COMMUNITY INTEREST COMPANY', + + 'PAC': 'POLITICAL ACTION COMMITTEE' +} + + for i in range(len(company_name_split)): + if company_name_split[i] in list(company_types.keys()): + hold = company_name_split[i] + company_name_split[i] = company_types[hold] + + new_company_name = ' '.join(company_name_split) + return new_company_name \ No newline at end of file From 663f08daf061f79cfba23a97cbaadfd9ff67d6a6 Mon Sep 17 00:00:00 2001 From: npashilkar Date: Tue, 30 Jan 2024 20:02:15 -0600 Subject: [PATCH 13/42] corp names function update --- utils/linkage.py | 82 +++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index f99ab5a..65f4cb4 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -96,60 +96,58 @@ def standardize_corp_names(company_name: str) -> str: company_name: corporate name Returns: standardized company name - + >>>standardize_corp_names('MI BEER WINE WHOLESALERS ASSOC') 'MI BEER WINE WHOLESALERS ASSOCIATION' - + >>>standardize_corp_names('MI COMMUNITY COLLEGE ASSOCIATION') 'MI COMMUNITY COLLEGE ASSOCIATION' - + >>>standardize_corp_names('STEPHANIES CHANGEMAKER FUND') - 'STEPHANIES CHANGEMAKER FUND' - - """ + 'STEPHANIES CHANGEMAKER FUND' + """ - company_name_split = company_name.upper().split(' ') + company_name_split = company_name.upper().split(" ") company_types = { - 'CORP': 'CORPORATION', - 'CO': 'CORPORATION', - 'LLC': 'LIMITED LIABILITY COMPANY', - 'PTNR': 'PARTNERSHIP', - 'LP': 'LIMITED PARTNERSHIP', - 'LLP': 'LIMITED LIABILITY PARTNERSHIP', - 'SOLE PROP': 'SOLE PROPRIETORSHIP', - 'SP': 'SOLE PROPRIETORSHIP', - 'NPO': 'NONPROFIT ORGANIZATION', - 'PC': 'PROFESSIONAL CORPORATION', - 'CO-OP': 'COOPERATIVE', - 'LTD': 'LIMITED COMPANY', - 'JSC': 'JOINT STOCK COMPANY', - 'HOLDCO': 'HOLDING COMPANY', - 'PLC': 'PUBLIC LIMITED COMPANY', - 'PVT LTD': 'PRIVATE LIMITED COMPANY', - 'INC': 'INCORPORATED', - 'ASSOC': 'ASSOCIATION', - 'FDN': 'FOUNDATION', - 'TR': 'TRUST', - 'SOC': 'SOCIETY', - 'CONSORT': 'CONSORTIUM', - 'SYND': 'SYNDICATE', - 'GRP': 'GROUP', - 'CORP SOLE': 'CORPORATION SOLE', - 'JV': 'JOINT VENTURE', - 'SUB': 'SUBSIDIARY', - 'FRANCHISE': 'FRANCHISE', - 'PA': 'PROFESSIONAL ASSOCIATION', - 'CIC': 'COMMUNITY INTEREST COMPANY', - - 'PAC': 'POLITICAL ACTION COMMITTEE' -} + "CORP": "CORPORATION", + "CO": "CORPORATION", + "LLC": "LIMITED LIABILITY COMPANY", + "PTNR": "PARTNERSHIP", + "LP": "LIMITED PARTNERSHIP", + "LLP": "LIMITED LIABILITY PARTNERSHIP", + "SOLE PROP": "SOLE PROPRIETORSHIP", + "SP": "SOLE PROPRIETORSHIP", + "NPO": "NONPROFIT ORGANIZATION", + "PC": "PROFESSIONAL CORPORATION", + "CO-OP": "COOPERATIVE", + "LTD": "LIMITED COMPANY", + "JSC": "JOINT STOCK COMPANY", + "HOLDCO": "HOLDING COMPANY", + "PLC": "PUBLIC LIMITED COMPANY", + "PVT LTD": "PRIVATE LIMITED COMPANY", + "INC": "INCORPORATED", + "ASSOC": "ASSOCIATION", + "FDN": "FOUNDATION", + "TR": "TRUST", + "SOC": "SOCIETY", + "CONSORT": "CONSORTIUM", + "SYND": "SYNDICATE", + "GRP": "GROUP", + "CORP SOLE": "CORPORATION SOLE", + "JV": "JOINT VENTURE", + "SUB": "SUBSIDIARY", + "FRANCHISE": "FRANCHISE", + "PA": "PROFESSIONAL ASSOCIATION", + "CIC": "COMMUNITY INTEREST COMPANY", + "PAC": "POLITICAL ACTION COMMITTEE", + } for i in range(len(company_name_split)): if company_name_split[i] in list(company_types.keys()): hold = company_name_split[i] company_name_split[i] = company_types[hold] - new_company_name = ' '.join(company_name_split) - return new_company_name \ No newline at end of file + new_company_name = " ".join(company_name_split) + return new_company_name From 1ab1d4277f03f68a0fdf9b887af729f24f4e1d2c Mon Sep 17 00:00:00 2001 From: npashilkar Date: Tue, 30 Jan 2024 20:07:25 -0600 Subject: [PATCH 14/42] updated corp names --- utils/linkage.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 65f4cb4..49d120b 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -97,13 +97,13 @@ def standardize_corp_names(company_name: str) -> str: Returns: standardized company name - >>>standardize_corp_names('MI BEER WINE WHOLESALERS ASSOC') + >>> standardize_corp_names('MI BEER WINE WHOLESALERS ASSOC') 'MI BEER WINE WHOLESALERS ASSOCIATION' - >>>standardize_corp_names('MI COMMUNITY COLLEGE ASSOCIATION') + >>> standardize_corp_names('MI COMMUNITY COLLEGE ASSOCIATION') 'MI COMMUNITY COLLEGE ASSOCIATION' - >>>standardize_corp_names('STEPHANIES CHANGEMAKER FUND') + >>> standardize_corp_names('STEPHANIES CHANGEMAKER FUND') 'STEPHANIES CHANGEMAKER FUND' """ From 6aad87ef2d9598a5745abf64d5eaf3326122041c Mon Sep 17 00:00:00 2001 From: npashilkar Date: Tue, 30 Jan 2024 21:29:16 -0600 Subject: [PATCH 15/42] moved dict to constants file --- utils/constants.py | 36 ++++++++++++++++++++++++++++++++++++ utils/linkage.py | 42 ++++++------------------------------------ 2 files changed, 42 insertions(+), 36 deletions(-) diff --git a/utils/constants.py b/utils/constants.py index b87d39d..f259db3 100644 --- a/utils/constants.py +++ b/utils/constants.py @@ -605,3 +605,39 @@ " WV ", " WY ", ] + +# utils/linkage.py constants + +COMPANY_TYPES = { + "CORP": "CORPORATION", + "CO": "CORPORATION", + "LLC": "LIMITED LIABILITY COMPANY", + "PTNR": "PARTNERSHIP", + "LP": "LIMITED PARTNERSHIP", + "LLP": "LIMITED LIABILITY PARTNERSHIP", + "SOLE PROP": "SOLE PROPRIETORSHIP", + "SP": "SOLE PROPRIETORSHIP", + "NPO": "NONPROFIT ORGANIZATION", + "PC": "PROFESSIONAL CORPORATION", + "CO-OP": "COOPERATIVE", + "LTD": "LIMITED COMPANY", + "JSC": "JOINT STOCK COMPANY", + "HOLDCO": "HOLDING COMPANY", + "PLC": "PUBLIC LIMITED COMPANY", + "PVT LTD": "PRIVATE LIMITED COMPANY", + "INC": "INCORPORATED", + "ASSOC": "ASSOCIATION", + "FDN": "FOUNDATION", + "TR": "TRUST", + "SOC": "SOCIETY", + "CONSORT": "CONSORTIUM", + "SYND": "SYNDICATE", + "GRP": "GROUP", + "CORP SOLE": "CORPORATION SOLE", + "JV": "JOINT VENTURE", + "SUB": "SUBSIDIARY", + "FRANCHISE": "FRANCHISE", + "PA": "PROFESSIONAL ASSOCIATION", + "CIC": "COMMUNITY INTEREST COMPANY", + "PAC": "POLITICAL ACTION COMMITTEE", +} diff --git a/utils/linkage.py b/utils/linkage.py index 49d120b..34b2579 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -1,3 +1,4 @@ +import constants import textdistance as td import usaddress @@ -110,44 +111,13 @@ def standardize_corp_names(company_name: str) -> str: company_name_split = company_name.upper().split(" ") - company_types = { - "CORP": "CORPORATION", - "CO": "CORPORATION", - "LLC": "LIMITED LIABILITY COMPANY", - "PTNR": "PARTNERSHIP", - "LP": "LIMITED PARTNERSHIP", - "LLP": "LIMITED LIABILITY PARTNERSHIP", - "SOLE PROP": "SOLE PROPRIETORSHIP", - "SP": "SOLE PROPRIETORSHIP", - "NPO": "NONPROFIT ORGANIZATION", - "PC": "PROFESSIONAL CORPORATION", - "CO-OP": "COOPERATIVE", - "LTD": "LIMITED COMPANY", - "JSC": "JOINT STOCK COMPANY", - "HOLDCO": "HOLDING COMPANY", - "PLC": "PUBLIC LIMITED COMPANY", - "PVT LTD": "PRIVATE LIMITED COMPANY", - "INC": "INCORPORATED", - "ASSOC": "ASSOCIATION", - "FDN": "FOUNDATION", - "TR": "TRUST", - "SOC": "SOCIETY", - "CONSORT": "CONSORTIUM", - "SYND": "SYNDICATE", - "GRP": "GROUP", - "CORP SOLE": "CORPORATION SOLE", - "JV": "JOINT VENTURE", - "SUB": "SUBSIDIARY", - "FRANCHISE": "FRANCHISE", - "PA": "PROFESSIONAL ASSOCIATION", - "CIC": "COMMUNITY INTEREST COMPANY", - "PAC": "POLITICAL ACTION COMMITTEE", - } - for i in range(len(company_name_split)): - if company_name_split[i] in list(company_types.keys()): + if company_name_split[i] in list(constants.COMPANY_TYPES.keys()): hold = company_name_split[i] - company_name_split[i] = company_types[hold] + company_name_split[i] = constants.COMPANY_TYPES[hold] new_company_name = " ".join(company_name_split) return new_company_name + + +print(standardize_corp_names("MI BEER WINE WHOLESALERS ASSOCIATION")) From 5b4de8c3c2c27164cf47df72d3eaa6101335cac1 Mon Sep 17 00:00:00 2001 From: npashilkar Date: Tue, 30 Jan 2024 21:47:56 -0600 Subject: [PATCH 16/42] updated constants file --- utils/linkage.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 34b2579..a26a9fe 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -118,6 +118,3 @@ def standardize_corp_names(company_name: str) -> str: new_company_name = " ".join(company_name_split) return new_company_name - - -print(standardize_corp_names("MI BEER WINE WHOLESALERS ASSOCIATION")) From e4fe9fc354e4429d17b754c32b083b0eaae6a4c6 Mon Sep 17 00:00:00 2001 From: npashilkar Date: Tue, 30 Jan 2024 21:53:36 -0600 Subject: [PATCH 17/42] updated constants file --- utils/linkage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/linkage.py b/utils/linkage.py index a26a9fe..faa8860 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -1,4 +1,4 @@ -import constants +import utils.constants import textdistance as td import usaddress From 844d20e5ddccd35514da4ef52fb321677c15e919 Mon Sep 17 00:00:00 2001 From: npashilkar Date: Tue, 30 Jan 2024 22:17:08 -0600 Subject: [PATCH 18/42] updated constants file --- utils/linkage.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index faa8860..9866a9b 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -1,4 +1,6 @@ -import utils.constants +from utils.constants import ( + COMPANY_TYPES +) import textdistance as td import usaddress @@ -112,9 +114,11 @@ def standardize_corp_names(company_name: str) -> str: company_name_split = company_name.upper().split(" ") for i in range(len(company_name_split)): - if company_name_split[i] in list(constants.COMPANY_TYPES.keys()): + if company_name_split[i] in list(COMPANY_TYPES.keys()): hold = company_name_split[i] - company_name_split[i] = constants.COMPANY_TYPES[hold] + company_name_split[i] = COMPANY_TYPES[hold] new_company_name = " ".join(company_name_split) return new_company_name + +print(standardize_corp_names('MI BEER WINE WHOLESALERS ASSOC')) From 976fc3ff4608874a9259977b21397027073ecfd6 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 31 Jan 2024 14:15:18 +0000 Subject: [PATCH 19/42] updated function --- utils/linkage.py | 52 +++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 86485d3..1dbf54b 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -1,7 +1,8 @@ """ Module for performing record linkage on state campaign finance dataset """ -import pandas as pd +import re + import usaddress @@ -50,12 +51,7 @@ def get_street_from_address_line_1(address_line_1: str) -> str: return " ".join(string) -""" -Module for standardizing the 'company' columnn of the state campaign finance dataset -""" - - -def cleaning_company_column(company: str) -> str: +def cleaning_company_column(company_entry: str) -> str: """ Given a string, check if it contains a variation of self employed, unemployed, or retired and return the standardized version. @@ -72,30 +68,44 @@ def cleaning_company_column(company: str) -> str: 'Self Employed' >>> cleaning_company_column("None") 'Unemployed' + >>> cleaning_company_column("N/A") + 'Unemployed' + >>> cleaning_company_column("nan") + 'Unemployed' """ - if pd.isnull(company): - return company - company_edited = company.lower() - company_edited = company_edited.strip() - company_edited = company_edited.replace(".", " ") - company_edited = company_edited.replace(",", " ") - company_edited = company_edited.replace("-", " ") + if not company_entry: + return company_entry - if "retire" in company_edited: + company_edited = company_entry.lower() + + if company_edited == "n/a": + return "Unemployed" + + company_edited = re.sub(r"[^\w\s]", "", company_edited) + + if ( + company_edited == "retired" + or company_edited == "retiree" + or company_edited == "retire" + or "retiree" in company_edited + ): return "Retired" - elif "self employe" in company_edited or company_edited == "self": + + elif ( + "self employe" in company_edited + or "freelance" in company_edited + or company_edited == "self" + or company_edited == "independent contractor" + ): return "Self Employed" elif ( "unemploye" in company_edited or company_edited == "none" or company_edited == "not employed" + or company_edited == "nan" ): return "Unemployed" else: - return company - - -# Example implementation of the function standardize_company_column for a dataframe -# df['standardized_company'] = df['company'].apply(standardize_company_column) + return company_edited From 87ea3da197ea722b5c54f99b7f5cdd29b890060d Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Wed, 31 Jan 2024 09:02:44 -0600 Subject: [PATCH 20/42] Adding Avery's feedback --- utils/linkage.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index e419c6f..2cdd11b 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -53,6 +53,16 @@ def determine_comma_role(name: str) -> str: name: a string representing a name/names of individuals Returns: the name with or without a comma based on some conditions + + Sample Usage: + >>> determine_comma_role("Jane Doe, Jr") + 'Jane Doe, Jr' + >>> determine_comma_role("Doe, Jane Elisabeth") + ' Jane Elisabeth Doe' + >>> determine_comma_role("Jane Doe,") + 'Jane Doe' + >>> determine_comma_role("DOe, Jane") + ' Jane Doe' """ suffixes = [ "sr", @@ -68,19 +78,19 @@ def determine_comma_role(name: str) -> str: "ix", "x", ] - name_parts = name.split(",") + name_parts = name.lower().split(",") # if the comma is just in the end as a typo: if len(name_parts[1]) == 0: - return name_parts[0] + return name_parts[0].title() # if just the suffix in the end, leave the name as it is if name_parts[1].strip() in suffixes: - return name + return name.title() # at this point either it's just poor name placement, or the suffix is # in the beginning of the name. Either way, the first part of the list is # the true last name. last_part = name_parts.pop(0) first_part = " ".join(name_parts) - return first_part + " " + last_part + return first_part.title() + " " + last_part.title() def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: @@ -114,6 +124,8 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: 'Jane Elisabeth Doe, Iv' >>> get_likely_name("","","Jane Elisabeth Doe, IV") 'Jane Elisabeth Doe Iv' + >>> get_likely_name("Jane","","Doe, Jane, Elisabeth") + 'Jane Elisabeth Doe' """ # first ensure clean input by deleting spaces: first_name, last_name, full_name = list( @@ -154,10 +166,10 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: # one last check to remove any pieces that might add extra whitespace names = list(filter(lambda x: x != "", names)) names = " ".join(names) - names = names.split(" ") + names = names.title().replace(" ", " ").split(" ") final_name = [] [final_name.append(x) for x in names if x not in final_name] - return " ".join(final_name).title().strip() + return " ".join(final_name).strip() def get_street_from_address_line_1(address_line_1: str) -> str: From 23a8c1ffca9935aeef5b74341c7562eb1f020fe2 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Wed, 31 Jan 2024 09:07:26 -0600 Subject: [PATCH 21/42] Adding Avery's feedback --- utils/linkage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/linkage.py b/utils/linkage.py index 2cdd11b..0450fca 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -123,7 +123,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: >>> get_likely_name("Jane Elisabeth Doe, IV","Elisabeth","Doe, IV") 'Jane Elisabeth Doe, Iv' >>> get_likely_name("","","Jane Elisabeth Doe, IV") - 'Jane Elisabeth Doe Iv' + 'Jane Elisabeth Doe, Iv' >>> get_likely_name("Jane","","Doe, Jane, Elisabeth") 'Jane Elisabeth Doe' """ From 4081715a2d4b83875c3def1c086f3d9f1b579e78 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Wed, 31 Jan 2024 09:40:58 -0600 Subject: [PATCH 22/42] saving personal work before merging, no need to look or review @Avery @Trevor --- notebooks/Test.ipynb | 421 ++++++++++++++++++++++++++++++++++++++++++- utils/linkage.py | 13 ++ 2 files changed, 433 insertions(+), 1 deletion(-) diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb index 5df942e..1176ab7 100644 --- a/notebooks/Test.ipynb +++ b/notebooks/Test.ipynb @@ -12,6 +12,425 @@ "\n", "save_random_dataframe(Path(\"../output\"), Path(\"test.csv\"))" ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def determine_comma_role(name: str) -> str:\n", + " \"\"\"Given a string (someone's name), attempts to determine the role of the\n", + " comma in the name and where it ought to belong.\n", + "\n", + " Some assumptions are made:\n", + " * If a suffix is included in the name and the name is not just the last\n", + " name(i.e \"Doe, Jr), the format is\n", + " (last_name suffix, first and middle name) i.e Doe iv, Jane Elisabeth\n", + "\n", + " * If a comma is used anywhere else, it is in the format of\n", + " (last_name, first and middle name) i.e Doe, Jane Elisabeth\n", + " Args:\n", + " name: a string representing a name/names of individuals\n", + " Returns:\n", + " the name with or without a comma based on some conditions\n", + " \"\"\"\n", + " suffixes = [\n", + " \"sr\",\n", + " \"jr\",\n", + " \"i\",\n", + " \"ii\",\n", + " \"iii\",\n", + " \"iv\",\n", + " \"v\",\n", + " \"vi\",\n", + " \"vii\",\n", + " \"viii\",\n", + " \"ix\",\n", + " \"x\",\n", + " ]\n", + " name_parts = name.lower().split(\",\")\n", + " # if the comma is just in the end as a typo:\n", + " if len(name_parts[1]) == 0:\n", + " return name_parts[0].title()\n", + " # if just the suffix in the end, leave the name as it is\n", + " if name_parts[1].strip() in suffixes:\n", + " return name.title()\n", + " # at this point either it's just poor name placement, or the suffix is\n", + " # in the beginning of the name. Either way, the first part of the list is\n", + " # the true last name.\n", + " last_part = name_parts.pop(0)\n", + " first_part = \" \".join(name_parts)\n", + " return first_part.title() + \" \" + last_part.title()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "' Jane Jr Doe'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "determine_comma_role(\"DOe, Jane, Jr\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:\n", + " \"\"\"Given name related columns, return a person's likely name\n", + "\n", + " Given different formatting used accross states, errors in data entry\n", + " and missing data, it can be difficult to determine someone's actual\n", + " name. For example, some states have a last name column with values like\n", + " \"Doe, Jane\", where the person's first name appears to have been erroneously\n", + " included.\n", + "\n", + " Args:\n", + " first_name: raw value of first name column\n", + " last_name: raw value last name column\n", + " full_name: raw value of name or full_name column\n", + " Returns:\n", + " The most likely full name of the person listed\n", + "\n", + " Sample Usage:\n", + " >>> get_likely_name(\"Jane\", \"Doe\", \"\")\n", + " 'Jane Doe'\n", + " >>> get_likely_name(\"\", \"\", \"Jane Doe\")\n", + " 'Jane Doe'\n", + " >>> get_likely_name(\"\", \"Doe, Jane\", \"\")\n", + " 'Jane Doe'\n", + " >>> get_likely_name(\"Jane Doe\", \"Doe\", \"Jane Doe\")\n", + " 'Jane Doe'\n", + " >>> get_likely_name(\"Jane\",\"\",\"Doe, Sr\")\n", + " 'Jane Doe, Sr'\n", + " >>> get_likely_name(\"Jane Elisabeth Doe, IV\",\"Elisabeth\",\"Doe, IV\")\n", + " 'Jane Elisabeth Doe, Iv'\n", + " >>> get_likely_name(\"\",\"\",\"Jane Elisabeth Doe, IV\")\n", + " 'Jane Elisabeth Doe Iv'\n", + " \"\"\"\n", + " # first ensure clean input by deleting spaces:\n", + " first_name, last_name, full_name = list(\n", + " map(lambda x: x.lower().strip(), [first_name, last_name, full_name])\n", + " )\n", + "\n", + " # if data is clean:\n", + " if first_name + \" \" + last_name == full_name:\n", + " return full_name\n", + "\n", + " # some names have titles or professions associated with the name. We need to\n", + " # remove those from the name.\n", + " titles = [\n", + " \"mr\",\n", + " \"ms\",\n", + " \"mrs\",\n", + " \"miss\",\n", + " \"prof\",\n", + " \"dr\",\n", + " \"doctor\",\n", + " \"sir\",\n", + " \"madam\",\n", + " \"professor\",\n", + " ]\n", + " names = [first_name, last_name, full_name]\n", + "\n", + " for i in range(len(names)):\n", + " # if there is a ',' deal with it accordingly\n", + " if \",\" in names[i]:\n", + " names[i] = determine_comma_role(names[i])\n", + "\n", + " names[i] = names[i].replace(\".\", \"\").split(\" \")\n", + " names[i] = [\n", + " name_part for name_part in names[i] if name_part not in titles\n", + " ]\n", + " names[i] = \" \".join(names[i])\n", + "\n", + " # one last check to remove any pieces that might add extra whitespace\n", + " names = list(filter(lambda x: x != \"\", names))\n", + " names = \" \".join(names)\n", + " names = names.title().replace(\" \",\" \").split(\" \")\n", + " final_name = []\n", + " [final_name.append(x) for x in names if x not in final_name]\n", + " return \" \".join(final_name).strip()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_4143866/1500712151.py:2: DtypeWarning: Columns (7,8) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " ind_df = pd.read_csv(\"../output/complete_individuals_table.csv\")\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0idfirst_namelast_namefull_nameentity_typestatepartycompany
001869727NaNNaNwilliam \bstonerindividualNaNNaNNaN
111779679NaNNaNrm coulonindividualNaNNaNarea agency on aging
222277221NaNNaNjames engelsonindividualNaNNaNretired
332277156NaNNaNmarivic franciaskinnerindividualNaNNaNfibre source international corp
442341373NaNNaNanthony grindleindividualNaNNaNzimmerbiomet
..............................
25053418612606acfa74b-d5e1-4afd-b020-dbe429eb1c3fNaNNaNMelissa HartCandidatePAREPNaN
2505342861271f111045d-bc3d-4050-9ad7-b3b1e6d72e56NaNNaNHeather MillerCandidatePADEMNaN
2505343861277d40859d7-b523-4ef5-895b-c3a947ab582fNaNNaNChristopher M. GebhardCandidatePAREPNaN
2505344861775f5d76d43-86f4-40f9-aeb9-3df97ca8cdf0NaNNaNApril WeaverCandidatePAREPNaN
25053458619201a0cf90d-3252-4c8d-b109-dea084a01f69NaNNaNKrista PaolucciCandidatePAREPNaN
\n", + "

2505346 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 id first_name \\\n", + "0 0 1869727 NaN \n", + "1 1 1779679 NaN \n", + "2 2 2277221 NaN \n", + "3 3 2277156 NaN \n", + "4 4 2341373 NaN \n", + "... ... ... ... \n", + "2505341 861260 6acfa74b-d5e1-4afd-b020-dbe429eb1c3f NaN \n", + "2505342 861271 f111045d-bc3d-4050-9ad7-b3b1e6d72e56 NaN \n", + "2505343 861277 d40859d7-b523-4ef5-895b-c3a947ab582f NaN \n", + "2505344 861775 f5d76d43-86f4-40f9-aeb9-3df97ca8cdf0 NaN \n", + "2505345 861920 1a0cf90d-3252-4c8d-b109-dea084a01f69 NaN \n", + "\n", + " last_name full_name entity_type state party \\\n", + "0 NaN william \bstoner individual NaN NaN \n", + "1 NaN rm coulon individual NaN NaN \n", + "2 NaN james engelson individual NaN NaN \n", + "3 NaN marivic franciaskinner individual NaN NaN \n", + "4 NaN anthony grindle individual NaN NaN \n", + "... ... ... ... ... ... \n", + "2505341 NaN Melissa Hart Candidate PA REP \n", + "2505342 NaN Heather Miller Candidate PA DEM \n", + "2505343 NaN Christopher M. Gebhard Candidate PA REP \n", + "2505344 NaN April Weaver Candidate PA REP \n", + "2505345 NaN Krista Paolucci Candidate PA REP \n", + "\n", + " company \n", + "0 NaN \n", + "1 area agency on aging \n", + "2 retired \n", + "3 fibre source international corp \n", + "4 zimmerbiomet \n", + "... ... \n", + "2505341 NaN \n", + "2505342 NaN \n", + "2505343 NaN \n", + "2505344 NaN \n", + "2505345 NaN \n", + "\n", + "[2505346 rows x 9 columns]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "ind_df = pd.read_csv(\"../output/complete_individuals_table.csv\")\n", + "ind_df.sample(1000)\n", + "ind_df" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Doe, Jr, Jane'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "determine_comma_role(\"Doe, Jr, Jane\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -30,7 +449,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.11.7" }, "orig_nbformat": 4 }, diff --git a/utils/linkage.py b/utils/linkage.py index 0450fca..f501897 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -3,6 +3,7 @@ """ import textdistance as td import usaddress +import pandas as pd def calculate_string_similarity(string1: str, string2: str) -> float: @@ -215,3 +216,15 @@ def get_street_from_address_line_1(address_line_1: str) -> str: string.append(key) return " ".join(string) + +def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: + '''Given a dataframe, remove rows that have identical entry data beyond + UUIDs, and output a file mapping an entry to other the UUIDs of the + deduplicated rows + + Args: + a pandas dataframe containing contribution data + Returns: + a deduplicated pandas dataframe containing contribution data + ''' + pass \ No newline at end of file From 3fcbc5b6539edc5fdf1102c9ec9d3727552c57ee Mon Sep 17 00:00:00 2001 From: npashilkar Date: Wed, 31 Jan 2024 09:51:34 -0600 Subject: [PATCH 23/42] precommit checks --- utils/linkage.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 9866a9b..5788eb0 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -1,9 +1,8 @@ -from utils.constants import ( - COMPANY_TYPES -) import textdistance as td import usaddress +from utils.constants import COMPANY_TYPES + """ Module for performing record linkage on state campaign finance dataset """ @@ -121,4 +120,5 @@ def standardize_corp_names(company_name: str) -> str: new_company_name = " ".join(company_name_split) return new_company_name -print(standardize_corp_names('MI BEER WINE WHOLESALERS ASSOC')) + +print(standardize_corp_names("MI BEER WINE WHOLESALERS ASSOC")) From f07dae2a96ebc9ed00d7056721361d3684165b5c Mon Sep 17 00:00:00 2001 From: npashilkar Date: Wed, 31 Jan 2024 10:23:53 -0600 Subject: [PATCH 24/42] get address number from line 1 function --- utils/linkage.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/utils/linkage.py b/utils/linkage.py index a96b816..1333024 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -165,4 +165,32 @@ def standardize_corp_names(company_name: str) -> str: return new_company_name -print(standardize_corp_names("MI BEER WINE WHOLESALERS ASSOC")) +def get_address_number_from_address_line_1(address_line_1: str) -> str: + """Given an address line 1, return the building number or po box + + Args: + address_line_1: either street information or PO box + Returns: + address or po box number + + Sample Usage: + >>> get_building_from_address_line_1('6727 W. Corrine Dr. Peoria,AZ 85381') + '6727' + >>> get_building_from_address_line_1('P.O. Box 5456 Sun City West ,AZ 85375') + 'P.O. Box 5456' + >>> get_building_from_address_line_1('119 S 5th St Niles,MI 49120') + '119' + >>> get_building_from_address_line_1( + ... '1415 PARKER STREET APT 251 DETROIT MI 48214-0000' + ... ) + '1415' + """ + + address_line_1_components = usaddress.parse(address_line_1) + + for i in range(len(address_line_1_components)): + if address_line_1_components[i][1] == "AddressNumber": + return address_line_1_components[i][0] + elif address_line_1_components[i][1] == "USPSBoxID": + return address_line_1_components[i][0] + raise ValueError("Can not find Address Number") From 8849f462925bbc3064f5f5539513cb16cf7c20b7 Mon Sep 17 00:00:00 2001 From: npashilkar Date: Wed, 31 Jan 2024 10:29:06 -0600 Subject: [PATCH 25/42] get address number from line 1 function --- utils/linkage.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 1333024..379e6d4 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -174,13 +174,13 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str: address or po box number Sample Usage: - >>> get_building_from_address_line_1('6727 W. Corrine Dr. Peoria,AZ 85381') + >>> get_address_number_from_address_line_1('6727 W. Corrine Dr. Peoria,AZ 85381') '6727' - >>> get_building_from_address_line_1('P.O. Box 5456 Sun City West ,AZ 85375') + >>> get_address_number_from_address_line_1('P.O. Box 5456 Sun City West ,AZ 85375') 'P.O. Box 5456' - >>> get_building_from_address_line_1('119 S 5th St Niles,MI 49120') + >>> get_address_number_from_address_line_1('119 S 5th St Niles,MI 49120') '119' - >>> get_building_from_address_line_1( + >>> get_address_number_from_address_line_1( ... '1415 PARKER STREET APT 251 DETROIT MI 48214-0000' ... ) '1415' From d0086ef22db122a6e8bd6add3f7e2fdfcc9fb221 Mon Sep 17 00:00:00 2001 From: npashilkar Date: Wed, 31 Jan 2024 11:07:37 -0600 Subject: [PATCH 26/42] get address number from line 1 function --- utils/linkage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/linkage.py b/utils/linkage.py index 379e6d4..ac11a5a 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -177,7 +177,7 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str: >>> get_address_number_from_address_line_1('6727 W. Corrine Dr. Peoria,AZ 85381') '6727' >>> get_address_number_from_address_line_1('P.O. Box 5456 Sun City West ,AZ 85375') - 'P.O. Box 5456' + '5456' >>> get_address_number_from_address_line_1('119 S 5th St Niles,MI 49120') '119' >>> get_address_number_from_address_line_1( From 5f65159fbe7d8752755e814878486d8f50697b48 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Wed, 31 Jan 2024 23:48:08 -0600 Subject: [PATCH 27/42] attempt so far at dedup --- utils/linkage.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/utils/linkage.py b/utils/linkage.py index 6e8e6a5..f8ea7bb 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -271,4 +271,13 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: Returns: a deduplicated pandas dataframe containing contribution data ''' - pass \ No newline at end of file + #first remove all duplicate entries: + new_df = df.drop_duplicates() + + # now find the duplicates along all columns but the ID + cols = new_df.columns[1:] + duplicates = new_df[new_df.duplicated(cols)] + new_df = new_df.drop(index=duplicates.index.tolist()) + #for index in duplicates.index: + + return new_df \ No newline at end of file From 71a3174aabda2137f4980cb8df7952374f3ca7a5 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Thu, 1 Feb 2024 00:12:47 -0600 Subject: [PATCH 28/42] attempt so far at dedup --- utils/linkage.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index f8ea7bb..bc2f062 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -1,9 +1,9 @@ """ Module for performing record linkage on state campaign finance dataset """ +import pandas as pd import textdistance as td import usaddress -import pandas as pd def get_address_line_1_from_full_address(address: str) -> str: @@ -261,23 +261,24 @@ def get_street_from_address_line_1(address_line_1: str) -> str: return " ".join(string) + def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: - '''Given a dataframe, remove rows that have identical entry data beyond + """Given a dataframe, remove rows that have identical entry data beyond UUIDs, and output a file mapping an entry to other the UUIDs of the deduplicated rows - + Args: a pandas dataframe containing contribution data Returns: a deduplicated pandas dataframe containing contribution data - ''' - #first remove all duplicate entries: + """ + # first remove all duplicate entries: new_df = df.drop_duplicates() # now find the duplicates along all columns but the ID cols = new_df.columns[1:] - duplicates = new_df[new_df.duplicated(cols)] + duplicates = new_df[new_df.duplicated(cols)] new_df = new_df.drop(index=duplicates.index.tolist()) - #for index in duplicates.index: + # for index in duplicates.index: - return new_df \ No newline at end of file + return new_df From 56cde5f003a2e3a49817e3c04e2305252110ef96 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Thu, 1 Feb 2024 00:13:15 -0600 Subject: [PATCH 29/42] attempt so far at dedup --- utils/linkage.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/linkage.py b/utils/linkage.py index bc2f062..25e110d 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -279,6 +279,5 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: cols = new_df.columns[1:] duplicates = new_df[new_df.duplicated(cols)] new_df = new_df.drop(index=duplicates.index.tolist()) - # for index in duplicates.index: return new_df From 161a175c8f31bf79fea702d7b7497cb33218bd0b Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Thu, 1 Feb 2024 01:54:17 -0600 Subject: [PATCH 30/42] updates on linkage doc, ignore notebooks/Test.ipynb --- notebooks/Test.ipynb | 287 +++++++++++++++++++++++++++++++++++++------ utils/linkage.py | 27 +++- 2 files changed, 276 insertions(+), 38 deletions(-) diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb index cf4679f..e4cac62 100644 --- a/notebooks/Test.ipynb +++ b/notebooks/Test.ipynb @@ -191,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -403,7 +403,7 @@ "18 Paa Pac PA Organization " ] }, - "execution_count": 5, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -433,10 +433,29 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ + "from utils.constants import repo_root\n", + "def convert_duplicates_to_dict(df: pd.DataFrame)->pd.DataFrame:\n", + " '''Takes a dataframe whose indexes are UUIDs, and a column that is a list of\n", + " all other UUIDs that have duplicate values. The function then outputs a\n", + " dictionary file where the deduped UUIDs map to the dataframe main UUID\n", + " \n", + " Args:\n", + " A pandas dataframe with UUIDs as indexes and deduplicated UUIDs\n", + " matching up to the index in the same row\n", + " \n", + " Returns\n", + " None. However it outputs a dictionary\n", + " '''\n", + " #for index in df.index:\n", + " \n", + " #entities.to_csv(repo_root / \"output\" / \"deduplicated_UUIDs.csv\", index=False)\n", + " pass\n", + "\n", + "\n", "def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:\n", " '''Given a dataframe, remove rows that have identical entry data beyond\n", " UUIDs, and output a file mapping an entry to other the UUIDs of the\n", @@ -451,17 +470,20 @@ " new_df = df.drop_duplicates()\n", "\n", " # now find the duplicates along all columns but the ID\n", - " cols = new_df.columns[1:]\n", - " duplicates = new_df[new_df.duplicated(cols)] \n", - " new_df = new_df.drop(index=duplicates.index.tolist())\n", - " #for index in duplicates.index:\n", + " new_df=new_df.groupby(df.columns[1:].tolist())[\"id\"].agg(list).reset_index().rename(columns={\"id\": \"duplicated\"})\n", + " new_df.index=new_df[\"duplicated\"].str[0].tolist()\n", + " new_df[\"duplicated\"]=new_df[\"duplicated\"].str[1:]\n", "\n", + " # now convert the duplicated column into a dictionary that can will be\n", + " # an output\n", + " convert_duplicates_to_dict(new_df[['duplicated']])\n", + " #new_df = new_df.drop(['duplicated'], axis=1)\n", " return new_df" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -485,63 +507,151 @@ " \n", " \n", " \n", - " id\n", " name\n", " state\n", " entity_type\n", + " duplicated\n", " \n", " \n", " \n", " \n", - " 16\n", - " 1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe\n", - " MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC\n", + " d31df1ca-714e-4a82-9e88-1892c0451a71\n", + " COMMITTEE TO ELECT DR PATRICIA BERNARD\n", " MI\n", " committee\n", + " []\n", " \n", " \n", - " 17\n", - " 1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff\n", + " 910c4d36-b036-469e-aa2a-ea4ff8855a6c\n", + " Citizens For Kail\n", + " PA\n", + " Organization\n", + " []\n", + " \n", + " \n", + " 1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd\n", " MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC\n", " MI\n", " committee\n", + " [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc...\n", " \n", " \n", - " 18\n", - " 1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd\n", + " c875d7de-94be-42f1-b994-dd89b114d51e\n", + " Pa Fraternal Order Of Police Pac\n", + " PA\n", + " Organization\n", + " []\n", + " \n", + " \n", + " 60d454d1-3773-4d88-80e9-132c161da0f0\n", " Paa Pac\n", " PA\n", " Organization\n", + " [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd]\n", + " \n", + " \n", + " f71341d7-d27e-47eb-9b66-903af39d6cb5\n", + " Pabar Pac (Pa Bar Assn)\n", + " PA\n", + " Organization\n", + " []\n", + " \n", + " \n", + " 50c7d9a1-b448-46a5-8e2d-cd15b3097360\n", + " REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...\n", + " MI\n", + " committee\n", + " []\n", + " \n", + " \n", + " 62ea1e9c-ac12-400c-b3dc-519389c0f7d3\n", + " UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...\n", + " MI\n", + " committee\n", + " []\n", + " \n", + " \n", + " 4db76e6e-f0d5-40eb-82de-6dbcdb562dd7\n", + " Ugi Utilities Inc/Ugi Energy Services Llc Pac\n", + " PA\n", + " Organization\n", + " []\n", " \n", " \n", "\n", "" ], "text/plain": [ - " id \\\n", - "16 1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe \n", - "17 1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff \n", - "18 1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd \n", + " name \\\n", + "d31df1ca-714e-4a82-9e88-1892c0451a71 COMMITTEE TO ELECT DR PATRICIA BERNARD \n", + "910c4d36-b036-469e-aa2a-ea4ff8855a6c Citizens For Kail \n", + "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC \n", + "c875d7de-94be-42f1-b994-dd89b114d51e Pa Fraternal Order Of Police Pac \n", + "60d454d1-3773-4d88-80e9-132c161da0f0 Paa Pac \n", + "f71341d7-d27e-47eb-9b66-903af39d6cb5 Pabar Pac (Pa Bar Assn) \n", + "50c7d9a1-b448-46a5-8e2d-cd15b3097360 REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN... \n", + "62ea1e9c-ac12-400c-b3dc-519389c0f7d3 UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL... \n", + "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7 Ugi Utilities Inc/Ugi Energy Services Llc Pac \n", + "\n", + " state entity_type \\\n", + "d31df1ca-714e-4a82-9e88-1892c0451a71 MI committee \n", + "910c4d36-b036-469e-aa2a-ea4ff8855a6c PA Organization \n", + "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd MI committee \n", + "c875d7de-94be-42f1-b994-dd89b114d51e PA Organization \n", + "60d454d1-3773-4d88-80e9-132c161da0f0 PA Organization \n", + "f71341d7-d27e-47eb-9b66-903af39d6cb5 PA Organization \n", + "50c7d9a1-b448-46a5-8e2d-cd15b3097360 MI committee \n", + "62ea1e9c-ac12-400c-b3dc-519389c0f7d3 MI committee \n", + "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7 PA Organization \n", "\n", - " name state entity_type \n", - "16 MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC MI committee \n", - "17 MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC MI committee \n", - "18 Paa Pac PA Organization " + " duplicated \n", + "d31df1ca-714e-4a82-9e88-1892c0451a71 [] \n", + "910c4d36-b036-469e-aa2a-ea4ff8855a6c [] \n", + "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc... \n", + "c875d7de-94be-42f1-b994-dd89b114d51e [] \n", + "60d454d1-3773-4d88-80e9-132c161da0f0 [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd] \n", + "f71341d7-d27e-47eb-9b66-903af39d6cb5 [] \n", + "50c7d9a1-b448-46a5-8e2d-cd15b3097360 [] \n", + "62ea1e9c-ac12-400c-b3dc-519389c0f7d3 [] \n", + "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7 [] " ] }, - "execution_count": 7, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x = deduplicate_perfect_matches(sample_df)\n", - "for i in range(len(x)):\n", - " curr_row = x.loc[i]\n", - " sample_df.loc[(sample_df.name == 'MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC') &\n", - "# (sample_df.state == 'MI') &\n", - "# (sample_df.entity_type == 'committee')]\n", - "x\n" + "x#[['duplicated']]" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n", + "[]\n", + "['1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe', '1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff']\n", + "[]\n", + "['1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd']\n", + "[]\n", + "[]\n", + "[]\n", + "[]\n" + ] + } + ], + "source": [ + "y = x[['duplicated']]\n", + "for i in range(len(y)):\n", + " #print(y.iloc[i]['duplicated'])\n", + " print(y.iloc[i]['duplicated'])" ] }, { @@ -637,21 +747,128 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 41, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namestateentity_type
d31df1ca-714e-4a82-9e88-1892c0451a71COMMITTEE TO ELECT DR PATRICIA BERNARDMIcommittee
910c4d36-b036-469e-aa2a-ea4ff8855a6cCitizens For KailPAOrganization
1d2b5bc0-9385-4cd7-ac48-df43b3eca6fdMICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PACMIcommittee
c875d7de-94be-42f1-b994-dd89b114d51ePa Fraternal Order Of Police PacPAOrganization
60d454d1-3773-4d88-80e9-132c161da0f0Paa PacPAOrganization
f71341d7-d27e-47eb-9b66-903af39d6cb5Pabar Pac (Pa Bar Assn)PAOrganization
50c7d9a1-b448-46a5-8e2d-cd15b3097360REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...MIcommittee
62ea1e9c-ac12-400c-b3dc-519389c0f7d3UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...MIcommittee
4db76e6e-f0d5-40eb-82de-6dbcdb562dd7Ugi Utilities Inc/Ugi Energy Services Llc PacPAOrganization
\n", + "
" + ], "text/plain": [ - "['name', 'state', 'entity_type']" + " name \\\n", + "d31df1ca-714e-4a82-9e88-1892c0451a71 COMMITTEE TO ELECT DR PATRICIA BERNARD \n", + "910c4d36-b036-469e-aa2a-ea4ff8855a6c Citizens For Kail \n", + "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC \n", + "c875d7de-94be-42f1-b994-dd89b114d51e Pa Fraternal Order Of Police Pac \n", + "60d454d1-3773-4d88-80e9-132c161da0f0 Paa Pac \n", + "f71341d7-d27e-47eb-9b66-903af39d6cb5 Pabar Pac (Pa Bar Assn) \n", + "50c7d9a1-b448-46a5-8e2d-cd15b3097360 REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN... \n", + "62ea1e9c-ac12-400c-b3dc-519389c0f7d3 UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL... \n", + "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7 Ugi Utilities Inc/Ugi Energy Services Llc Pac \n", + "\n", + " state entity_type \n", + "d31df1ca-714e-4a82-9e88-1892c0451a71 MI committee \n", + "910c4d36-b036-469e-aa2a-ea4ff8855a6c PA Organization \n", + "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd MI committee \n", + "c875d7de-94be-42f1-b994-dd89b114d51e PA Organization \n", + "60d454d1-3773-4d88-80e9-132c161da0f0 PA Organization \n", + "f71341d7-d27e-47eb-9b66-903af39d6cb5 PA Organization \n", + "50c7d9a1-b448-46a5-8e2d-cd15b3097360 MI committee \n", + "62ea1e9c-ac12-400c-b3dc-519389c0f7d3 MI committee \n", + "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7 PA Organization " ] }, - "execution_count": 19, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], - "source": [] + "source": [ + "#for index in x.index:\n", + "# print(index)\n", + "x" + ] }, { "cell_type": "code", diff --git a/utils/linkage.py b/utils/linkage.py index 6ee0de0..01d05de 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -270,6 +270,22 @@ def get_street_from_address_line_1(address_line_1: str) -> str: return " ".join(string) +def convert_duplicates_to_dict(df: pd.DataFrame) -> pd.DataFrame: + """Takes a dataframe whose indexes are UUIDs, and a column that is a list of + all other UUIDs that have duplicate values. The function then outputs a + dictionary file where the deduped UUIDs map to the dataframe main UUID + + Args: + A pandas dataframe with UUIDs as indexes and deduplicated UUIDs + matching up to the index in the same row + + Returns + None. However it outputs a dictionary + """ + # df.to_csv(repo_root / "output" / "deduplicated_UUIDs.csv", index=False) + pass + + def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: """Given a dataframe, remove rows that have identical entry data beyond UUIDs, and output a file mapping an entry to other the UUIDs of the @@ -284,9 +300,14 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: new_df = df.drop_duplicates() # now find the duplicates along all columns but the ID - cols = new_df.columns[1:] - duplicates = new_df[new_df.duplicated(cols)] - new_df = new_df.drop(index=duplicates.index.tolist()) + new_df = ( + new_df.groupby(df.columns[1:].tolist())["id"] + .agg(list) + .reset_index() + .rename(columns={"id": "duplicated"}) + ) + new_df.index = new_df["duplicated"].str[0].tolist() + new_df["duplicated"] = new_df["duplicated"].str[1:] return new_df From b519fa164babf8498930abcddfcc0aa4abd8f135 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Thu, 1 Feb 2024 09:22:40 -0600 Subject: [PATCH 31/42] modifications to dedup function, not yet done, no need to review yet --- notebooks/Test.ipynb | 195 +++++++++++++++++++++++++------------------ utils/linkage.py | 9 ++ 2 files changed, 124 insertions(+), 80 deletions(-) diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb index e4cac62..bc73185 100644 --- a/notebooks/Test.ipynb +++ b/notebooks/Test.ipynb @@ -191,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -403,7 +403,7 @@ "18 Paa Pac PA Organization " ] }, - "execution_count": 45, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -433,7 +433,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -475,15 +475,16 @@ " new_df[\"duplicated\"]=new_df[\"duplicated\"].str[1:]\n", "\n", " # now convert the duplicated column into a dictionary that can will be\n", - " # an output\n", - " convert_duplicates_to_dict(new_df[['duplicated']])\n", - " #new_df = new_df.drop(['duplicated'], axis=1)\n", + " # an output by only feeding the entries with duplicates\n", + " new_df = new_df.reset_index().rename(columns = {'index':'id'})\n", + " convert_duplicates_to_dict(new_df[new_df['duplicated'].apply(lambda x: len(x))>0][['id','duplicated']])\n", + " new_df = new_df.drop(['duplicated'], axis=1)\n", " return new_df" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -507,74 +508,120 @@ " \n", " \n", " \n", - " name\n", - " state\n", - " entity_type\n", + " id\n", " duplicated\n", " \n", " \n", " \n", " \n", - " d31df1ca-714e-4a82-9e88-1892c0451a71\n", - " COMMITTEE TO ELECT DR PATRICIA BERNARD\n", - " MI\n", - " committee\n", + " 2\n", + " 1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd\n", + " [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc...\n", + " \n", + " \n", + " 4\n", + " 60d454d1-3773-4d88-80e9-132c161da0f0\n", + " [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd]\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " id \\\n", + "2 1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd \n", + "4 60d454d1-3773-4d88-80e9-132c161da0f0 \n", + "\n", + " duplicated \n", + "2 [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc... \n", + "4 [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd] " + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x = deduplicate_perfect_matches(sample_df)\n", + "#len(x.iloc[2]['duplicated'])\n", + "x[x['duplicated'].apply(lambda x: len(x)) > 0][['id','duplicated']]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -582,48 +629,36 @@ "" ], "text/plain": [ - " name \\\n", - "d31df1ca-714e-4a82-9e88-1892c0451a71 COMMITTEE TO ELECT DR PATRICIA BERNARD \n", - "910c4d36-b036-469e-aa2a-ea4ff8855a6c Citizens For Kail \n", - "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC \n", - "c875d7de-94be-42f1-b994-dd89b114d51e Pa Fraternal Order Of Police Pac \n", - "60d454d1-3773-4d88-80e9-132c161da0f0 Paa Pac \n", - "f71341d7-d27e-47eb-9b66-903af39d6cb5 Pabar Pac (Pa Bar Assn) \n", - "50c7d9a1-b448-46a5-8e2d-cd15b3097360 REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN... \n", - "62ea1e9c-ac12-400c-b3dc-519389c0f7d3 UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL... \n", - "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7 Ugi Utilities Inc/Ugi Energy Services Llc Pac \n", - "\n", - " state entity_type \\\n", - "d31df1ca-714e-4a82-9e88-1892c0451a71 MI committee \n", - "910c4d36-b036-469e-aa2a-ea4ff8855a6c PA Organization \n", - "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd MI committee \n", - "c875d7de-94be-42f1-b994-dd89b114d51e PA Organization \n", - "60d454d1-3773-4d88-80e9-132c161da0f0 PA Organization \n", - "f71341d7-d27e-47eb-9b66-903af39d6cb5 PA Organization \n", - "50c7d9a1-b448-46a5-8e2d-cd15b3097360 MI committee \n", - "62ea1e9c-ac12-400c-b3dc-519389c0f7d3 MI committee \n", - "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7 PA Organization \n", + " id \\\n", + "0 d31df1ca-714e-4a82-9e88-1892c0451a71 \n", + "1 910c4d36-b036-469e-aa2a-ea4ff8855a6c \n", + "2 1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd \n", + "3 c875d7de-94be-42f1-b994-dd89b114d51e \n", + "4 60d454d1-3773-4d88-80e9-132c161da0f0 \n", + "5 f71341d7-d27e-47eb-9b66-903af39d6cb5 \n", + "6 50c7d9a1-b448-46a5-8e2d-cd15b3097360 \n", + "7 62ea1e9c-ac12-400c-b3dc-519389c0f7d3 \n", + "8 4db76e6e-f0d5-40eb-82de-6dbcdb562dd7 \n", "\n", - " duplicated \n", - "d31df1ca-714e-4a82-9e88-1892c0451a71 [] \n", - "910c4d36-b036-469e-aa2a-ea4ff8855a6c [] \n", - "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc... \n", - "c875d7de-94be-42f1-b994-dd89b114d51e [] \n", - "60d454d1-3773-4d88-80e9-132c161da0f0 [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd] \n", - "f71341d7-d27e-47eb-9b66-903af39d6cb5 [] \n", - "50c7d9a1-b448-46a5-8e2d-cd15b3097360 [] \n", - "62ea1e9c-ac12-400c-b3dc-519389c0f7d3 [] \n", - "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7 [] " + " duplicated \n", + "0 [] \n", + "1 [] \n", + "2 [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc... \n", + "3 [] \n", + "4 [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd] \n", + "5 [] \n", + "6 [] \n", + "7 [] \n", + "8 [] " ] }, - "execution_count": 47, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "x = deduplicate_perfect_matches(sample_df)\n", - "x#[['duplicated']]" + "x[['id','duplicated']]" ] }, { diff --git a/utils/linkage.py b/utils/linkage.py index 01d05de..0d2ebf7 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -309,6 +309,15 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: new_df.index = new_df["duplicated"].str[0].tolist() new_df["duplicated"] = new_df["duplicated"].str[1:] + # now convert the duplicated column into a dictionary that can will be + # an output by only feeding the entries with duplicates + new_df = new_df.reset_index().rename(columns={"index": "id"}) + convert_duplicates_to_dict( + new_df[new_df["duplicated"].apply(lambda x: len(x)) > 0][ + ["id", "duplicated"] + ] + ) + new_df = new_df.drop(["duplicated"], axis=1) return new_df From 4ac551fa498be733717a7f50af2084cb28e6c321 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Fri, 2 Feb 2024 02:34:42 +0000 Subject: [PATCH 32/42] passing pre-commits and doctests --- utils/linkage.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 1dbf54b..d223617 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -70,8 +70,6 @@ def cleaning_company_column(company_entry: str) -> str: 'Unemployed' >>> cleaning_company_column("N/A") 'Unemployed' - >>> cleaning_company_column("nan") - 'Unemployed' """ if not company_entry: From 37dcbf76a638c0007ff0de1620b93f6ec2f24ec3 Mon Sep 17 00:00:00 2001 From: Avery Schoen <33437601+averyschoen@users.noreply.github.com> Date: Fri, 2 Feb 2024 14:02:34 -0600 Subject: [PATCH 33/42] Update linkage.py --- utils/linkage.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/linkage.py b/utils/linkage.py index c884238..74cbc93 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -252,4 +252,5 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str: return address_line_1_components[i][0] elif address_line_1_components[i][1] == "USPSBoxID": return address_line_1_components[i][0] - raise ValueError("Can not find Address Number") \ No newline at end of file + raise ValueError("Can not find Address Number") + From 7f9135f7acc77ee429557bc48d19d3d9a5f69cf6 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Sun, 4 Feb 2024 14:31:07 -0600 Subject: [PATCH 34/42] finished dedup function with helper function to output to a csv_file in the output directory --- utils/linkage.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index ad5589a..f2242da 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -5,7 +5,7 @@ import textdistance as td import usaddress -from utils.constants import COMPANY_TYPES +from utils.constants import COMPANY_TYPES, repo_root def get_address_line_1_from_full_address(address: str) -> str: @@ -280,10 +280,25 @@ def convert_duplicates_to_dict(df: pd.DataFrame) -> pd.DataFrame: matching up to the index in the same row Returns - None. However it outputs a dictionary + None. However it outputs a dictionary to the output directory, with 2 + columns. The first, which indicates the deduplicated UUIDs, is labeled + 'duplicated_uuids', and the 2nd, which shows the uuids to which the + deduplicated entries match two, is labeled 'mapped_uuids'. """ - # df.to_csv(repo_root / "output" / "deduplicated_UUIDs.csv", index=False) - pass + deduped_dict = {} + for i in range(len(df)): + deduped_uudis = df.iloc[i]["duplicated"] + for j in range(len(deduped_uudis)): + deduped_dict.update({deduped_uudis[j]: df.iloc[i]["id"]}) + + # now convert dictionary into a csv file + deduped_df = pd.DataFrame.from_dict(deduped_dict, "index") + deduped_df = deduped_df.reset_index().rename( + columns={"index": "duplicated_uuids", 0: "mapped_uuids"} + ) + deduped_df.to_csv( + repo_root / "output" / "deduplicated_UUIDs.csv", index=False + ) def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: From fb106545507614b4306c7652589eb3dbf93a7059 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 5 Feb 2024 01:13:27 +0000 Subject: [PATCH 35/42] updated function --- utils/linkage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 74cbc93..26fbd5b 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -1,6 +1,7 @@ +import re + import textdistance as td import usaddress -import re from utils.constants import COMPANY_TYPES @@ -253,4 +254,3 @@ def get_address_number_from_address_line_1(address_line_1: str) -> str: elif address_line_1_components[i][1] == "USPSBoxID": return address_line_1_components[i][0] raise ValueError("Can not find Address Number") - From 29ee6bb63e198256d83a22019f98561f303a764b Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Tue, 6 Feb 2024 09:57:53 -0600 Subject: [PATCH 36/42] made modifications to the deduplication function --- utils/linkage.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index f2242da..5db8745 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -271,19 +271,21 @@ def get_street_from_address_line_1(address_line_1: str) -> str: def convert_duplicates_to_dict(df: pd.DataFrame) -> pd.DataFrame: - """Takes a dataframe whose indexes are UUIDs, and a column that is a list of - all other UUIDs that have duplicate values. The function then outputs a - dictionary file where the deduped UUIDs map to the dataframe main UUID + """Saves to the "output" directory a file mapping multiple strings to one + string + + Given a dataframe where each row contains one string in a column and a list + of strings in another column, the function maps each string in the list to + the single string. Args: - A pandas dataframe with UUIDs as indexes and deduplicated UUIDs - matching up to the index in the same row + A pandas dataframe Returns - None. However it outputs a dictionary to the output directory, with 2 - columns. The first, which indicates the deduplicated UUIDs, is labeled + None. However it outputs a file to the output directory, with 2 + columns. The first, which indicates the duplicated UUIDs, is labeled 'duplicated_uuids', and the 2nd, which shows the uuids to which the - deduplicated entries match two, is labeled 'mapped_uuids'. + deduplicated entries match to, is labeled 'mapped_uuids'. """ deduped_dict = {} for i in range(len(df)): @@ -297,14 +299,17 @@ def convert_duplicates_to_dict(df: pd.DataFrame) -> pd.DataFrame: columns={"index": "duplicated_uuids", 0: "mapped_uuids"} ) deduped_df.to_csv( - repo_root / "output" / "deduplicated_UUIDs.csv", index=False + repo_root / "output" / "deduplicated_UUIDs.csv", index=False, mode="a" ) def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: - """Given a dataframe, remove rows that have identical entry data beyond - UUIDs, and output a file mapping an entry to other the UUIDs of the - deduplicated rows + """Return a dataframe with duplicated entries removed. + + Given a dataframe, combines rows that have identical data beyond their + UUIDs, keeps the first UUID amond the similarly grouped UUIDs, and saves the + rest of the UUIDS to a file in the "output" directory linking them to the + first selected UUID. Args: a pandas dataframe containing contribution data @@ -316,7 +321,7 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: # now find the duplicates along all columns but the ID new_df = ( - new_df.groupby(df.columns[1:].tolist())["id"] + new_df.groupby(df.columns[1:].tolist(), dropna=False)["id"] .agg(list) .reset_index() .rename(columns={"id": "duplicated"}) From cfa15d079459a30032a61325fa2f1dcf8a74e3f8 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Tue, 6 Feb 2024 10:07:25 -0600 Subject: [PATCH 37/42] received a git push error stating that the tip of my branch is behind its remote counterpart...commiting my changes before rebasing --- notebooks/Test.ipynb | 667 ++++++++++++++++++++++++++++--------------- 1 file changed, 431 insertions(+), 236 deletions(-) diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb index bc73185..188591d 100644 --- a/notebooks/Test.ipynb +++ b/notebooks/Test.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -66,27 +66,7 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "' Jane Jr Doe'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "determine_comma_role(\"DOe, Jane, Jr\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -169,29 +149,18 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "str" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import pandas as pd\n", - "orgs = pd.read_csv(\"../output/complete_organizations_table.csv\")\n", - "type(orgs.id.tolist()[1000])" + "orgs_sample = pd.read_csv(\"../output/complete_organizations_table.csv\",index_col=0).sample(10000)\n", + "inds_sample = pd.read_csv(\"../output/complete_individuals_table.csv\",index_col=0, low_memory=False).sample(10000)\n" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -403,7 +372,7 @@ "18 Paa Pac PA Organization " ] }, - "execution_count": 28, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -433,7 +402,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -448,12 +417,21 @@ " matching up to the index in the same row\n", " \n", " Returns\n", - " None. However it outputs a dictionary\n", + " None. However it outputs a dictionary to the output directory, with 2\n", + " columns. The first, which indicates the deduplicated UUIDs, is labeled\n", + " 'duplicated_uuids', and the 2nd, which shows the uuids to which the\n", + " deduplicated entries match two, is labeled 'mapped_uuids'.\n", " '''\n", - " #for index in df.index:\n", - " \n", - " #entities.to_csv(repo_root / \"output\" / \"deduplicated_UUIDs.csv\", index=False)\n", - " pass\n", + " deduped_dict = {}\n", + " for i in range(len(df)):\n", + " deduped_uudis = df.iloc[i]['duplicated']\n", + " for j in range(len(deduped_uudis)):\n", + " deduped_dict.update({deduped_uudis[j]:df.iloc[i]['id']})\n", + " \n", + " # now convert dictionary into a csv file\n", + " deduped_df = pd.DataFrame.from_dict(deduped_dict,'index') \n", + " deduped_df = deduped_df.reset_index().rename(columns={\"index\":\"duplicated_uuids\", 0:\"mapped_uuids\"})\n", + " deduped_df.to_csv(repo_root / \"output\" / \"deduplicated_UUIDs.csv\", index=False)\n", "\n", "\n", "def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:\n", @@ -470,7 +448,7 @@ " new_df = df.drop_duplicates()\n", "\n", " # now find the duplicates along all columns but the ID\n", - " new_df=new_df.groupby(df.columns[1:].tolist())[\"id\"].agg(list).reset_index().rename(columns={\"id\": \"duplicated\"})\n", + " new_df=new_df.groupby(df.columns[1:].tolist(),dropna=False)[\"id\"].agg(list).reset_index().rename(columns={\"id\": \"duplicated\"})\n", " new_df.index=new_df[\"duplicated\"].str[0].tolist()\n", " new_df[\"duplicated\"]=new_df[\"duplicated\"].str[1:]\n", "\n", @@ -484,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -509,48 +487,138 @@ " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
idduplicated
0d31df1ca-714e-4a82-9e88-1892c0451a71[]
910c4d36-b036-469e-aa2a-ea4ff8855a6cCitizens For KailPAOrganization1910c4d36-b036-469e-aa2a-ea4ff8855a6c[]
1d2b5bc0-9385-4cd7-ac48-df43b3eca6fdMICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PACMIcommittee21d2b5bc0-9385-4cd7-ac48-df43b3eca6fd[1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc...
c875d7de-94be-42f1-b994-dd89b114d51ePa Fraternal Order Of Police PacPAOrganization3c875d7de-94be-42f1-b994-dd89b114d51e[]
60d454d1-3773-4d88-80e9-132c161da0f0Paa PacPAOrganization460d454d1-3773-4d88-80e9-132c161da0f0[1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd]
f71341d7-d27e-47eb-9b66-903af39d6cb5Pabar Pac (Pa Bar Assn)PAOrganization5f71341d7-d27e-47eb-9b66-903af39d6cb5[]
50c7d9a1-b448-46a5-8e2d-cd15b3097360REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...MIcommittee650c7d9a1-b448-46a5-8e2d-cd15b3097360[]
62ea1e9c-ac12-400c-b3dc-519389c0f7d3UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...MIcommittee762ea1e9c-ac12-400c-b3dc-519389c0f7d3[]
4db76e6e-f0d5-40eb-82de-6dbcdb562dd7Ugi Utilities Inc/Ugi Energy Services Llc PacPAOrganization84db76e6e-f0d5-40eb-82de-6dbcdb562dd7[]
idduplicatednamestateentity_type
043a79b93-fed7-4f3c-a279-0441cdc7e72214TH DISTRICT DEMOCRATIC PARTYMIcorporation
1215f3104-2df0-4799-9a13-d0c5ec27d6f214TH DISTRICT DEMOCRATSMIcorporation
21d2b5bc0-9385-4cd7-ac48-df43b3eca6fd[1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc...022d2951-8fe9-42d6-a6ac-01e82d90fa6521ST CENTURY MEDIA - MICHIGANMIcorporation
3e1150dce-219c-4eef-995d-ee2759a92923360 TOUCHMIcorporation
460d454d1-3773-4d88-80e9-132c161da0f0[1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd]88c3b805-e0f1-42d5-8b77-536734731c4a50+1 STRATEGIES LLCMIcorporation
...............
2135f5fbf8f5-bd03-43f6-bfdd-42113bdd02a8ZoomPAOrganization
2136616c47f1-39cc-4b12-a93d-f7d3bdc88047Zoom Video CommunicationsPAOrganization
2137df101e29-4adf-4496-8d96-9732d9f7dbc8Zoom.UsPAOrganization
2138d02d1f6d-4a13-428e-a040-d35bd5cfcf9fZupancich, Andrea Senate CommitteeGACommittee
2139df42f2ec-9ee0-49d0-9020-d1a441ef8b42womenwinning State PACMNCommittee
\n", + "

2140 rows × 4 columns

\n", "
" ], "text/plain": [ - " id \\\n", - "2 1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd \n", - "4 60d454d1-3773-4d88-80e9-132c161da0f0 \n", + " id \\\n", + "0 43a79b93-fed7-4f3c-a279-0441cdc7e722 \n", + "1 215f3104-2df0-4799-9a13-d0c5ec27d6f2 \n", + "2 022d2951-8fe9-42d6-a6ac-01e82d90fa65 \n", + "3 e1150dce-219c-4eef-995d-ee2759a92923 \n", + "4 88c3b805-e0f1-42d5-8b77-536734731c4a \n", + "... ... \n", + "2135 f5fbf8f5-bd03-43f6-bfdd-42113bdd02a8 \n", + "2136 616c47f1-39cc-4b12-a93d-f7d3bdc88047 \n", + "2137 df101e29-4adf-4496-8d96-9732d9f7dbc8 \n", + "2138 d02d1f6d-4a13-428e-a040-d35bd5cfcf9f \n", + "2139 df42f2ec-9ee0-49d0-9020-d1a441ef8b42 \n", "\n", - " duplicated \n", - "2 [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc... \n", - "4 [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd] " + " name state entity_type \n", + "0 14TH DISTRICT DEMOCRATIC PARTY MI corporation \n", + "1 14TH DISTRICT DEMOCRATS MI corporation \n", + "2 21ST CENTURY MEDIA - MICHIGAN MI corporation \n", + "3 360 TOUCH MI corporation \n", + "4 50+1 STRATEGIES LLC MI corporation \n", + "... ... ... ... \n", + "2135 Zoom PA Organization \n", + "2136 Zoom Video Communications PA Organization \n", + "2137 Zoom.Us PA Organization \n", + "2138 Zupancich, Andrea Senate Committee GA Committee \n", + "2139 womenwinning State PAC MN Committee \n", + "\n", + "[2140 rows x 4 columns]" ] }, - "execution_count": 32, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "x = deduplicate_perfect_matches(sample_df)\n", + "x = deduplicate_perfect_matches(orgs_sample)\n", "#len(x.iloc[2]['duplicated'])\n", - "x[x['duplicated'].apply(lambda x: len(x)) > 0][['id','duplicated']]" + "x" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -575,123 +643,257 @@ " \n", " \n", " id\n", + " first_name\n", + " last_name\n", + " full_name\n", + " entity_type\n", + " state\n", + " party\n", + " company\n", " duplicated\n", " \n", " \n", " \n", " \n", " 0\n", - " d31df1ca-714e-4a82-9e88-1892c0451a71\n", + " 6c833843-2f4f-416c-9092-f1d95d9b27dc\n", + " 'JESSE' PHILIP\n", + " SHERMAN\n", + " 'JESSE' PHILIP SHERMAN ...\n", + " Individual\n", + " CA\n", + " NaN\n", + " NaN\n", " []\n", " \n", " \n", " 1\n", - " 910c4d36-b036-469e-aa2a-ea4ff8855a6c\n", + " cdbe7cd4-f57b-4b89-b85d-d0b812e76aa4\n", + " AARON\n", + " AEBIG\n", + " AARON AEBIG ...\n", + " Individual\n", + " MI\n", + " NaN\n", + " NaN\n", " []\n", " \n", " \n", " 2\n", - " 1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd\n", - " [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc...\n", + " a7304cd4-76ae-4223-86c3-f50da82a62aa\n", + " AARON\n", + " BATES\n", + " AARON BATES ...\n", + " Individual\n", + " MI\n", + " NaN\n", + " NaN\n", + " []\n", " \n", " \n", " 3\n", - " c875d7de-94be-42f1-b994-dd89b114d51e\n", + " cce5ccc0-cd28-4a6a-afdf-8a08ce31b94d\n", + " AARON\n", + " BIRD\n", + " AARON BIRD ...\n", + " Individual\n", + " WA\n", + " NaN\n", + " L0021\n", " []\n", " \n", " \n", " 4\n", - " 60d454d1-3773-4d88-80e9-132c161da0f0\n", - " [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd]\n", + " 1302bf1f-393b-43ed-a15d-8cf6e121223c\n", + " AARON\n", + " COHEN\n", + " AARON COHEN ...\n", + " Individual\n", + " IL\n", + " NaN\n", + " NaN\n", + " []\n", " \n", " \n", - " 5\n", - " f71341d7-d27e-47eb-9b66-903af39d6cb5\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 7182\n", + " 160a5c9e-d04a-40c9-a0fd-c28e21dd70dc\n", + " NaN\n", + " NaN\n", + " Wilkinson, James\n", + " Individual\n", + " MN\n", + " NaN\n", + " NaN\n", " []\n", " \n", " \n", - " 6\n", - " 50c7d9a1-b448-46a5-8e2d-cd15b3097360\n", + " 7183\n", + " 7a19cbb7-d681-46a5-8f9f-1e7be7071f06\n", + " NaN\n", + " NaN\n", + " Wolf, Linda\n", + " Individual\n", + " MN\n", + " NaN\n", + " NaN\n", " []\n", " \n", " \n", - " 7\n", - " 62ea1e9c-ac12-400c-b3dc-519389c0f7d3\n", + " 7184\n", + " ce5156f8-23d4-40e0-8711-f19bff942543\n", + " NaN\n", + " NaN\n", + " Wollenburg, George\n", + " Individual\n", + " MN\n", + " NaN\n", + " NaN\n", " []\n", " \n", " \n", - " 8\n", - " 4db76e6e-f0d5-40eb-82de-6dbcdb562dd7\n", + " 7185\n", + " 1948661\n", + " NaN\n", + " NaN\n", + " richard 3033 shoreham\n", + " individual\n", + " NaN\n", + " NaN\n", + " NaN\n", + " []\n", + " \n", + " \n", + " 7186\n", + " 69744565-e7e4-47e1-8555-ede565fca705\n", + " NaN\n", + " NaN\n", + " wark, david\n", + " Individual\n", + " MN\n", + " NaN\n", + " NaN\n", " []\n", " \n", " \n", "\n", + "

7187 rows × 9 columns

\n", "" ], "text/plain": [ - " id \\\n", - "0 d31df1ca-714e-4a82-9e88-1892c0451a71 \n", - "1 910c4d36-b036-469e-aa2a-ea4ff8855a6c \n", - "2 1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd \n", - "3 c875d7de-94be-42f1-b994-dd89b114d51e \n", - "4 60d454d1-3773-4d88-80e9-132c161da0f0 \n", - "5 f71341d7-d27e-47eb-9b66-903af39d6cb5 \n", - "6 50c7d9a1-b448-46a5-8e2d-cd15b3097360 \n", - "7 62ea1e9c-ac12-400c-b3dc-519389c0f7d3 \n", - "8 4db76e6e-f0d5-40eb-82de-6dbcdb562dd7 \n", + " id first_name \\\n", + "0 6c833843-2f4f-416c-9092-f1d95d9b27dc 'JESSE' PHILIP \n", + "1 cdbe7cd4-f57b-4b89-b85d-d0b812e76aa4 AARON \n", + "2 a7304cd4-76ae-4223-86c3-f50da82a62aa AARON \n", + "3 cce5ccc0-cd28-4a6a-afdf-8a08ce31b94d AARON \n", + "4 1302bf1f-393b-43ed-a15d-8cf6e121223c AARON \n", + "... ... ... \n", + "7182 160a5c9e-d04a-40c9-a0fd-c28e21dd70dc NaN \n", + "7183 7a19cbb7-d681-46a5-8f9f-1e7be7071f06 NaN \n", + "7184 ce5156f8-23d4-40e0-8711-f19bff942543 NaN \n", + "7185 1948661 NaN \n", + "7186 69744565-e7e4-47e1-8555-ede565fca705 NaN \n", + "\n", + " last_name \\\n", + "0 SHERMAN \n", + "1 AEBIG \n", + "2 BATES \n", + "3 BIRD \n", + "4 COHEN \n", + "... ... \n", + "7182 NaN \n", + "7183 NaN \n", + "7184 NaN \n", + "7185 NaN \n", + "7186 NaN \n", "\n", - " duplicated \n", - "0 [] \n", - "1 [] \n", - "2 [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe, 1d2b5bc... \n", - "3 [] \n", - "4 [1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd] \n", - "5 [] \n", - "6 [] \n", - "7 [] \n", - "8 [] " + " full_name entity_type state \\\n", + "0 'JESSE' PHILIP SHERMAN ... Individual CA \n", + "1 AARON AEBIG ... Individual MI \n", + "2 AARON BATES ... Individual MI \n", + "3 AARON BIRD ... Individual WA \n", + "4 AARON COHEN ... Individual IL \n", + "... ... ... ... \n", + "7182 Wilkinson, James Individual MN \n", + "7183 Wolf, Linda Individual MN \n", + "7184 Wollenburg, George Individual MN \n", + "7185 richard 3033 shoreham individual NaN \n", + "7186 wark, david Individual MN \n", + "\n", + " party company duplicated \n", + "0 NaN NaN [] \n", + "1 NaN NaN [] \n", + "2 NaN NaN [] \n", + "3 NaN L0021 [] \n", + "4 NaN NaN [] \n", + "... ... ... ... \n", + "7182 NaN NaN [] \n", + "7183 NaN NaN [] \n", + "7184 NaN NaN [] \n", + "7185 NaN NaN [] \n", + "7186 NaN NaN [] \n", + "\n", + "[7187 rows x 9 columns]" ] }, - "execution_count": 20, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "x[['id','duplicated']]" + "y = inds_sample.drop_duplicates()\n", + "\n", + "# now find the duplicates along all columns but the ID\n", + "y=y.groupby(inds_sample.columns[1:].tolist(),dropna=False)[\"id\"].agg(list).reset_index().rename(columns={\"id\": \"duplicated\"})\n", + "y.index=y[\"duplicated\"].str[0].tolist()\n", + "y[\"duplicated\"]=y[\"duplicated\"].str[1:]\n", + "\n", + "# now convert the duplicated column into a dictionary that can will be\n", + "# an output by only feeding the entries with duplicates\n", + "y = y.reset_index().rename(columns = {'index':'id'})\n", + "convert_duplicates_to_dict(y[y['duplicated'].apply(lambda x: len(x))>0][['id','duplicated']])\n", + "new_df = y.drop(['duplicated'], axis=1)\n", + "#return new_df\n", + "y" ] }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 9, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[]\n", - "[]\n", - "['1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe', '1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff']\n", - "[]\n", - "['1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd']\n", - "[]\n", - "[]\n", - "[]\n", - "[]\n" - ] + "data": { + "text/plain": [ + "Index(['first_name', 'last_name', 'full_name', 'entity_type', 'state', 'party',\n", + " 'company'],\n", + " dtype='object')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "y = x[['duplicated']]\n", - "for i in range(len(y)):\n", - " #print(y.iloc[i]['duplicated'])\n", - " print(y.iloc[i]['duplicated'])" + "inds_sample.columns[1:]" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -715,74 +917,92 @@ " \n", " \n", " \n", - " name\n", - " state\n", - " entity_type\n", - " id\n", + " Max Speed\n", + " Animal\n", + " Color\n", + " Age\n", " \n", " \n", " \n", " \n", " 0\n", - " COMMITTEE TO ELECT DR PATRICIA BERNARD\n", - " MI\n", - " committee\n", - " 2\n", + " 380.0\n", + " None\n", + " green\n", + " 2.0\n", " \n", " \n", - " 2\n", - " MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC\n", - " MI\n", - " committee\n", - " 4\n", - " \n", - " \n", - " 4\n", - " Paa Pac\n", - " PA\n", - " Organization\n", - " 2\n", + " 1\n", + " 370.0\n", + " Falcon\n", + " None\n", + " NaN\n", " \n", " \n", - " 6\n", - " REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...\n", - " MI\n", - " committee\n", - " 3\n", + " 2\n", + " NaN\n", + " None\n", + " yellow\n", + " 5.0\n", " \n", " \n", - " 7\n", - " UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...\n", - " MI\n", - " committee\n", - " 4\n", + " 3\n", + " NaN\n", + " Parrot\n", + " blue\n", + " 6.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " name state entity_type id\n", - "0 COMMITTEE TO ELECT DR PATRICIA BERNARD MI committee 2\n", - "2 MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC MI committee 4\n", - "4 Paa Pac PA Organization 2\n", - "6 REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN... MI committee 3\n", - "7 UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL... MI committee 4" + " Max Speed Animal Color Age\n", + "0 380.0 None green 2.0\n", + "1 370.0 Falcon None NaN\n", + "2 NaN None yellow 5.0\n", + "3 NaN Parrot blue 6.0" ] }, - "execution_count": 13, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "x = sample_df.groupby(sample_df.columns[1:].tolist()).count().reset_index()\n", - "x.loc[x.id >1]" + "import numpy as np\n", + "import pandas as pd\n", + "df = pd.DataFrame({'Max Speed': [380., 370., np.nan, np.nan],\n", + " 'Animal': ['None', 'Falcon', 'None', 'Parrot'],\n", + " 'Color':['green',None,'yellow','blue'],\n", + " 'Age':[2,np.nan,5,6]})\n", + "df" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df= df.groupby(df.columns[1:].tolist(), dropna=False)[\"Max Speed\"]#.agg(list)#.reset_index()\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -806,114 +1026,89 @@ " \n", " \n", " \n", - " name\n", - " state\n", - " entity_type\n", + " Age\n", + " Animal\n", + " Color\n", + " Max Speed\n", " \n", " \n", " \n", " \n", - " d31df1ca-714e-4a82-9e88-1892c0451a71\n", - " COMMITTEE TO ELECT DR PATRICIA BERNARD\n", - " MI\n", - " committee\n", - " \n", - " \n", - " 910c4d36-b036-469e-aa2a-ea4ff8855a6c\n", - " Citizens For Kail\n", - " PA\n", - " Organization\n", - " \n", - " \n", - " 1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd\n", - " MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC\n", - " MI\n", - " committee\n", - " \n", - " \n", - " c875d7de-94be-42f1-b994-dd89b114d51e\n", - " Pa Fraternal Order Of Police Pac\n", - " PA\n", - " Organization\n", - " \n", - " \n", - " 60d454d1-3773-4d88-80e9-132c161da0f0\n", - " Paa Pac\n", - " PA\n", - " Organization\n", - " \n", - " \n", - " f71341d7-d27e-47eb-9b66-903af39d6cb5\n", - " Pabar Pac (Pa Bar Assn)\n", - " PA\n", - " Organization\n", + " 0\n", + " 2.0\n", + " None\n", + " green\n", + " [380.0]\n", " \n", " \n", - " 50c7d9a1-b448-46a5-8e2d-cd15b3097360\n", - " REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...\n", - " MI\n", - " committee\n", + " 1\n", + " 5.0\n", + " None\n", + " yellow\n", + " [nan]\n", " \n", " \n", - " 62ea1e9c-ac12-400c-b3dc-519389c0f7d3\n", - " UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...\n", - " MI\n", - " committee\n", + " 2\n", + " 6.0\n", + " Parrot\n", + " blue\n", + " [nan]\n", " \n", " \n", - " 4db76e6e-f0d5-40eb-82de-6dbcdb562dd7\n", - " Ugi Utilities Inc/Ugi Energy Services Llc Pac\n", - " PA\n", - " Organization\n", + " 3\n", + " NaN\n", + " Falcon\n", + " NaN\n", + " [370.0]\n", " \n", " \n", "\n", "" ], "text/plain": [ - " name \\\n", - "d31df1ca-714e-4a82-9e88-1892c0451a71 COMMITTEE TO ELECT DR PATRICIA BERNARD \n", - "910c4d36-b036-469e-aa2a-ea4ff8855a6c Citizens For Kail \n", - "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC \n", - "c875d7de-94be-42f1-b994-dd89b114d51e Pa Fraternal Order Of Police Pac \n", - "60d454d1-3773-4d88-80e9-132c161da0f0 Paa Pac \n", - "f71341d7-d27e-47eb-9b66-903af39d6cb5 Pabar Pac (Pa Bar Assn) \n", - "50c7d9a1-b448-46a5-8e2d-cd15b3097360 REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN... \n", - "62ea1e9c-ac12-400c-b3dc-519389c0f7d3 UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL... \n", - "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7 Ugi Utilities Inc/Ugi Energy Services Llc Pac \n", - "\n", - " state entity_type \n", - "d31df1ca-714e-4a82-9e88-1892c0451a71 MI committee \n", - "910c4d36-b036-469e-aa2a-ea4ff8855a6c PA Organization \n", - "1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd MI committee \n", - "c875d7de-94be-42f1-b994-dd89b114d51e PA Organization \n", - "60d454d1-3773-4d88-80e9-132c161da0f0 PA Organization \n", - "f71341d7-d27e-47eb-9b66-903af39d6cb5 PA Organization \n", - "50c7d9a1-b448-46a5-8e2d-cd15b3097360 MI committee \n", - "62ea1e9c-ac12-400c-b3dc-519389c0f7d3 MI committee \n", - "4db76e6e-f0d5-40eb-82de-6dbcdb562dd7 PA Organization " + " Age Animal Color Max Speed\n", + "0 2.0 None green [380.0]\n", + "1 5.0 None yellow [nan]\n", + "2 6.0 Parrot blue [nan]\n", + "3 NaN Falcon NaN [370.0]" ] }, - "execution_count": 41, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "#for index in x.index:\n", - "# print(index)\n", - "x" + "df = df.groupby((df.columns.difference(['Max Speed'])).tolist(),dropna=False)['Max Speed'].agg(list).reset_index()\n", + "df" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "from utils.constants import repo_root\n", - "entities.to_csv(repo_root / \"output\" / \"deduplicated_UUIDs.csv\", index=False)\n" + "df" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 3d26fdef9d0d56459c36f61cb7b4d9fa309f7925 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Wed, 7 Feb 2024 09:47:25 -0600 Subject: [PATCH 38/42] trying to see what the git branch issues are...no need to review this commit --- notebooks/Test.ipynb | 333 ++++++++++++++++++++----------------------- 1 file changed, 154 insertions(+), 179 deletions(-) diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb index 188591d..26d98b5 100644 --- a/notebooks/Test.ipynb +++ b/notebooks/Test.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -66,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -149,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -160,7 +160,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -372,7 +372,7 @@ "18 Paa Pac PA Organization " ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -402,7 +402,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -431,7 +431,7 @@ " # now convert dictionary into a csv file\n", " deduped_df = pd.DataFrame.from_dict(deduped_dict,'index') \n", " deduped_df = deduped_df.reset_index().rename(columns={\"index\":\"duplicated_uuids\", 0:\"mapped_uuids\"})\n", - " deduped_df.to_csv(repo_root / \"output\" / \"deduplicated_UUIDs.csv\", index=False)\n", + " deduped_df.to_csv(repo_root / \"output\" / \"deduplicated_UUIDs.csv\", index=False, mode='a')\n", "\n", "\n", "def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:\n", @@ -462,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -495,36 +495,36 @@ " \n", " \n", " 0\n", - " 43a79b93-fed7-4f3c-a279-0441cdc7e722\n", - " 14TH DISTRICT DEMOCRATIC PARTY\n", + " 3246120d-45fc-4d19-adee-d2aa2c5be6db\n", + " 1 BOLD STEP\n", " MI\n", " corporation\n", " \n", " \n", " 1\n", - " 215f3104-2df0-4799-9a13-d0c5ec27d6f2\n", - " 14TH DISTRICT DEMOCRATS\n", + " 8fc7e5d5-558d-42ea-bd9a-8e48a4a9a4bd\n", + " 12CDRC\n", " MI\n", " corporation\n", " \n", " \n", " 2\n", - " 022d2951-8fe9-42d6-a6ac-01e82d90fa65\n", - " 21ST CENTURY MEDIA - MICHIGAN\n", + " a5379930-7324-4f1d-b216-84d9e9ddea40\n", + " 303 MANAGEMENT INC.\n", " MI\n", " corporation\n", " \n", " \n", " 3\n", - " e1150dce-219c-4eef-995d-ee2759a92923\n", - " 360 TOUCH\n", + " 9064112f-ef40-4690-9d0a-782a2375feb0\n", + " 314 ACTION FUND\n", " MI\n", " corporation\n", " \n", " \n", " 4\n", - " 88c3b805-e0f1-42d5-8b77-536734731c4a\n", - " 50+1 STRATEGIES LLC\n", + " 9e11e7ae-ee29-4a50-9720-41c6ac556a1f\n", + " A T AND T MICHIGAN PAC\n", " MI\n", " corporation\n", " \n", @@ -536,76 +536,76 @@ " ...\n", " \n", " \n", - " 2135\n", - " f5fbf8f5-bd03-43f6-bfdd-42113bdd02a8\n", - " Zoom\n", + " 2149\n", + " d79f9729-c9af-4347-868a-ae6e6814a295\n", + " Zach Kirk\n", " PA\n", " Organization\n", " \n", " \n", - " 2136\n", - " 616c47f1-39cc-4b12-a93d-f7d3bdc88047\n", - " Zoom Video Communications\n", + " 2150\n", + " fbfea472-e183-4479-b869-90eddfa5198c\n", + " Zest Kitchen\n", " PA\n", " Organization\n", " \n", " \n", - " 2137\n", - " df101e29-4adf-4496-8d96-9732d9f7dbc8\n", - " Zoom.Us\n", + " 2151\n", + " c105a4af-9fd4-4a5b-a7b8-1e8738ff39c6\n", + " Zoom Us\n", " PA\n", " Organization\n", " \n", " \n", - " 2138\n", - " d02d1f6d-4a13-428e-a040-d35bd5cfcf9f\n", - " Zupancich, Andrea Senate Committee\n", - " GA\n", - " Committee\n", + " 2152\n", + " 59cc8db9-607e-4e1b-ba41-0850b6019360\n", + " Zoom Video Communications Inc.\n", + " PA\n", + " Organization\n", " \n", " \n", - " 2139\n", - " df42f2ec-9ee0-49d0-9020-d1a441ef8b42\n", - " womenwinning State PAC\n", - " MN\n", - " Committee\n", + " 2153\n", + " NaN\n", + " NaN\n", + " MI\n", + " corporation\n", " \n", " \n", "\n", - "

2140 rows × 4 columns

\n", + "

2154 rows × 4 columns

\n", "" ], "text/plain": [ " id \\\n", - "0 43a79b93-fed7-4f3c-a279-0441cdc7e722 \n", - "1 215f3104-2df0-4799-9a13-d0c5ec27d6f2 \n", - "2 022d2951-8fe9-42d6-a6ac-01e82d90fa65 \n", - "3 e1150dce-219c-4eef-995d-ee2759a92923 \n", - "4 88c3b805-e0f1-42d5-8b77-536734731c4a \n", + "0 3246120d-45fc-4d19-adee-d2aa2c5be6db \n", + "1 8fc7e5d5-558d-42ea-bd9a-8e48a4a9a4bd \n", + "2 a5379930-7324-4f1d-b216-84d9e9ddea40 \n", + "3 9064112f-ef40-4690-9d0a-782a2375feb0 \n", + "4 9e11e7ae-ee29-4a50-9720-41c6ac556a1f \n", "... ... \n", - "2135 f5fbf8f5-bd03-43f6-bfdd-42113bdd02a8 \n", - "2136 616c47f1-39cc-4b12-a93d-f7d3bdc88047 \n", - "2137 df101e29-4adf-4496-8d96-9732d9f7dbc8 \n", - "2138 d02d1f6d-4a13-428e-a040-d35bd5cfcf9f \n", - "2139 df42f2ec-9ee0-49d0-9020-d1a441ef8b42 \n", + "2149 d79f9729-c9af-4347-868a-ae6e6814a295 \n", + "2150 fbfea472-e183-4479-b869-90eddfa5198c \n", + "2151 c105a4af-9fd4-4a5b-a7b8-1e8738ff39c6 \n", + "2152 59cc8db9-607e-4e1b-ba41-0850b6019360 \n", + "2153 NaN \n", "\n", " name state entity_type \n", - "0 14TH DISTRICT DEMOCRATIC PARTY MI corporation \n", - "1 14TH DISTRICT DEMOCRATS MI corporation \n", - "2 21ST CENTURY MEDIA - MICHIGAN MI corporation \n", - "3 360 TOUCH MI corporation \n", - "4 50+1 STRATEGIES LLC MI corporation \n", + "0 1 BOLD STEP MI corporation \n", + "1 12CDRC MI corporation \n", + "2 303 MANAGEMENT INC. MI corporation \n", + "3 314 ACTION FUND MI corporation \n", + "4 A T AND T MICHIGAN PAC MI corporation \n", "... ... ... ... \n", - "2135 Zoom PA Organization \n", - "2136 Zoom Video Communications PA Organization \n", - "2137 Zoom.Us PA Organization \n", - "2138 Zupancich, Andrea Senate Committee GA Committee \n", - "2139 womenwinning State PAC MN Committee \n", + "2149 Zach Kirk PA Organization \n", + "2150 Zest Kitchen PA Organization \n", + "2151 Zoom Us PA Organization \n", + "2152 Zoom Video Communications Inc. PA Organization \n", + "2153 NaN MI corporation \n", "\n", - "[2140 rows x 4 columns]" + "[2154 rows x 4 columns]" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -618,7 +618,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -650,69 +650,63 @@ " state\n", " party\n", " company\n", - " duplicated\n", " \n", " \n", " \n", " \n", " 0\n", - " 6c833843-2f4f-416c-9092-f1d95d9b27dc\n", - " 'JESSE' PHILIP\n", - " SHERMAN\n", - " 'JESSE' PHILIP SHERMAN ...\n", + " f6df631a-e626-4861-b62b-e09512887bd3\n", + " A SCOTT\n", + " PARIS\n", + " A SCOTT PARIS ...\n", " Individual\n", - " CA\n", - " NaN\n", + " MI\n", " NaN\n", - " []\n", + " NOT EMPLOYED\n", " \n", " \n", " 1\n", - " cdbe7cd4-f57b-4b89-b85d-d0b812e76aa4\n", - " AARON\n", - " AEBIG\n", - " AARON AEBIG ...\n", + " 075fb1c6-6c70-4ec6-a439-fcebb76c4e0a\n", + " A. MARK\n", + " GLICKSTEIN\n", + " A. MARK GLICKSTEIN ...\n", " Individual\n", - " MI\n", - " NaN\n", + " CA\n", " NaN\n", - " []\n", + " PARTNERSHIP HEALTH PLAN OF CA\n", " \n", " \n", " 2\n", - " a7304cd4-76ae-4223-86c3-f50da82a62aa\n", - " AARON\n", - " BATES\n", - " AARON BATES ...\n", + " 4a3968f5-7f5e-4ed1-8f39-bfc70bc67af8\n", + " A. MICHAEL\n", + " PALIZZI\n", + " A. MICHAEL PALIZZI ...\n", " Individual\n", " MI\n", " NaN\n", - " NaN\n", - " []\n", + " MILLER CANFIELD\n", " \n", " \n", " 3\n", - " cce5ccc0-cd28-4a6a-afdf-8a08ce31b94d\n", + " bb952efc-3dba-4449-9405-ea65202fbbea\n", " AARON\n", - " BIRD\n", - " AARON BIRD ...\n", + " ALDRICH\n", + " AARON ALDRICH ...\n", " Individual\n", - " WA\n", + " MI\n", " NaN\n", - " L0021\n", - " []\n", + " MILLER PIPELINE CORP.\n", " \n", " \n", " 4\n", - " 1302bf1f-393b-43ed-a15d-8cf6e121223c\n", + " 79ec4a73-f688-479a-a4e3-0b0a3813188a\n", " AARON\n", - " COHEN\n", - " AARON COHEN ...\n", + " BLAND\n", + " AARON BLAND ...\n", " Individual\n", - " IL\n", + " MI\n", " NaN\n", " NaN\n", - " []\n", " \n", " \n", " ...\n", @@ -724,59 +718,54 @@ " ...\n", " ...\n", " ...\n", - " ...\n", " \n", " \n", - " 7182\n", - " 160a5c9e-d04a-40c9-a0fd-c28e21dd70dc\n", + " 7122\n", + " a1a6ff3b-cfa4-4b84-bf8c-20984f9871f0\n", " NaN\n", " NaN\n", - " Wilkinson, James\n", + " Trone, Robert\n", " Individual\n", " MN\n", " NaN\n", " NaN\n", - " []\n", " \n", " \n", - " 7183\n", - " 7a19cbb7-d681-46a5-8f9f-1e7be7071f06\n", + " 7123\n", + " 37ab55f5-3613-469c-8b66-ac8888f5bcae\n", " NaN\n", " NaN\n", - " Wolf, Linda\n", + " Wark, Mary Ann\n", " Individual\n", " MN\n", " NaN\n", " NaN\n", - " []\n", " \n", " \n", - " 7184\n", - " ce5156f8-23d4-40e0-8711-f19bff942543\n", + " 7124\n", + " 92d5ac7c-4702-420c-97a7-656111677f5a\n", " NaN\n", " NaN\n", - " Wollenburg, George\n", + " Wenstrom, Gene\n", " Individual\n", " MN\n", " NaN\n", " NaN\n", - " []\n", " \n", " \n", - " 7185\n", - " 1948661\n", + " 7125\n", + " fa934bf1-f611-4cd3-9bff-451bdf2e5bd2\n", " NaN\n", " NaN\n", - " richard 3033 shoreham\n", - " individual\n", - " NaN\n", + " Wika, Kevin\n", + " Individual\n", + " MN\n", " NaN\n", " NaN\n", - " []\n", " \n", " \n", - " 7186\n", - " 69744565-e7e4-47e1-8555-ede565fca705\n", + " 7126\n", + " fb8bb833-7010-418a-9f24-1a29771e0b67\n", " NaN\n", " NaN\n", " wark, david\n", @@ -784,111 +773,97 @@ " MN\n", " NaN\n", " NaN\n", - " []\n", " \n", " \n", "\n", - "

7187 rows × 9 columns

\n", + "

7127 rows × 8 columns

\n", "" ], "text/plain": [ " id first_name \\\n", - "0 6c833843-2f4f-416c-9092-f1d95d9b27dc 'JESSE' PHILIP \n", - "1 cdbe7cd4-f57b-4b89-b85d-d0b812e76aa4 AARON \n", - "2 a7304cd4-76ae-4223-86c3-f50da82a62aa AARON \n", - "3 cce5ccc0-cd28-4a6a-afdf-8a08ce31b94d AARON \n", - "4 1302bf1f-393b-43ed-a15d-8cf6e121223c AARON \n", + "0 f6df631a-e626-4861-b62b-e09512887bd3 A SCOTT \n", + "1 075fb1c6-6c70-4ec6-a439-fcebb76c4e0a A. MARK \n", + "2 4a3968f5-7f5e-4ed1-8f39-bfc70bc67af8 A. MICHAEL \n", + "3 bb952efc-3dba-4449-9405-ea65202fbbea AARON \n", + "4 79ec4a73-f688-479a-a4e3-0b0a3813188a AARON \n", "... ... ... \n", - "7182 160a5c9e-d04a-40c9-a0fd-c28e21dd70dc NaN \n", - "7183 7a19cbb7-d681-46a5-8f9f-1e7be7071f06 NaN \n", - "7184 ce5156f8-23d4-40e0-8711-f19bff942543 NaN \n", - "7185 1948661 NaN \n", - "7186 69744565-e7e4-47e1-8555-ede565fca705 NaN \n", + "7122 a1a6ff3b-cfa4-4b84-bf8c-20984f9871f0 NaN \n", + "7123 37ab55f5-3613-469c-8b66-ac8888f5bcae NaN \n", + "7124 92d5ac7c-4702-420c-97a7-656111677f5a NaN \n", + "7125 fa934bf1-f611-4cd3-9bff-451bdf2e5bd2 NaN \n", + "7126 fb8bb833-7010-418a-9f24-1a29771e0b67 NaN \n", "\n", " last_name \\\n", - "0 SHERMAN \n", - "1 AEBIG \n", - "2 BATES \n", - "3 BIRD \n", - "4 COHEN \n", + "0 PARIS \n", + "1 GLICKSTEIN \n", + "2 PALIZZI \n", + "3 ALDRICH \n", + "4 BLAND \n", "... ... \n", - "7182 NaN \n", - "7183 NaN \n", - "7184 NaN \n", - "7185 NaN \n", - "7186 NaN \n", + "7122 NaN \n", + "7123 NaN \n", + "7124 NaN \n", + "7125 NaN \n", + "7126 NaN \n", "\n", " full_name entity_type state \\\n", - "0 'JESSE' PHILIP SHERMAN ... Individual CA \n", - "1 AARON AEBIG ... Individual MI \n", - "2 AARON BATES ... Individual MI \n", - "3 AARON BIRD ... Individual WA \n", - "4 AARON COHEN ... Individual IL \n", + "0 A SCOTT PARIS ... Individual MI \n", + "1 A. MARK GLICKSTEIN ... Individual CA \n", + "2 A. MICHAEL PALIZZI ... Individual MI \n", + "3 AARON ALDRICH ... Individual MI \n", + "4 AARON BLAND ... Individual MI \n", "... ... ... ... \n", - "7182 Wilkinson, James Individual MN \n", - "7183 Wolf, Linda Individual MN \n", - "7184 Wollenburg, George Individual MN \n", - "7185 richard 3033 shoreham individual NaN \n", - "7186 wark, david Individual MN \n", + "7122 Trone, Robert Individual MN \n", + "7123 Wark, Mary Ann Individual MN \n", + "7124 Wenstrom, Gene Individual MN \n", + "7125 Wika, Kevin Individual MN \n", + "7126 wark, david Individual MN \n", "\n", - " party company duplicated \n", - "0 NaN NaN [] \n", - "1 NaN NaN [] \n", - "2 NaN NaN [] \n", - "3 NaN L0021 [] \n", - "4 NaN NaN [] \n", - "... ... ... ... \n", - "7182 NaN NaN [] \n", - "7183 NaN NaN [] \n", - "7184 NaN NaN [] \n", - "7185 NaN NaN [] \n", - "7186 NaN NaN [] \n", + " party company \n", + "0 NaN NOT EMPLOYED \n", + "1 NaN PARTNERSHIP HEALTH PLAN OF CA \n", + "2 NaN MILLER CANFIELD \n", + "3 NaN MILLER PIPELINE CORP. \n", + "4 NaN NaN \n", + "... ... ... \n", + "7122 NaN NaN \n", + "7123 NaN NaN \n", + "7124 NaN NaN \n", + "7125 NaN NaN \n", + "7126 NaN NaN \n", "\n", - "[7187 rows x 9 columns]" + "[7127 rows x 8 columns]" ] }, - "execution_count": 13, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "y = inds_sample.drop_duplicates()\n", - "\n", - "# now find the duplicates along all columns but the ID\n", - "y=y.groupby(inds_sample.columns[1:].tolist(),dropna=False)[\"id\"].agg(list).reset_index().rename(columns={\"id\": \"duplicated\"})\n", - "y.index=y[\"duplicated\"].str[0].tolist()\n", - "y[\"duplicated\"]=y[\"duplicated\"].str[1:]\n", - "\n", - "# now convert the duplicated column into a dictionary that can will be\n", - "# an output by only feeding the entries with duplicates\n", - "y = y.reset_index().rename(columns = {'index':'id'})\n", - "convert_duplicates_to_dict(y[y['duplicated'].apply(lambda x: len(x))>0][['id','duplicated']])\n", - "new_df = y.drop(['duplicated'], axis=1)\n", - "#return new_df\n", + "y=deduplicate_perfect_matches(inds_sample)\n", "y" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Index(['first_name', 'last_name', 'full_name', 'entity_type', 'state', 'party',\n", - " 'company'],\n", - " dtype='object')" + "7207" ] }, - "execution_count": 9, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "inds_sample.columns[1:]" + "a = inds_sample.drop_duplicates()\n", + "len(a)" ] }, { From 5843485fbeb48f4adb4a20a86a79cece154e10c0 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Wed, 7 Feb 2024 23:50:38 -0600 Subject: [PATCH 39/42] implementing PR feedback --- utils/linkage.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 5db8745..1b27a84 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -4,6 +4,7 @@ import pandas as pd import textdistance as td import usaddress +import os.path from utils.constants import COMPANY_TYPES, repo_root @@ -270,9 +271,9 @@ def get_street_from_address_line_1(address_line_1: str) -> str: return " ".join(string) -def convert_duplicates_to_dict(df: pd.DataFrame) -> pd.DataFrame: - """Saves to the "output" directory a file mapping multiple strings to one - string +def convert_duplicates_to_dict(df: pd.DataFrame) -> None: + """Saves to the "output" directory a file where each row represents a string + matching to another string Given a dataframe where each row contains one string in a column and a list of strings in another column, the function maps each string in the list to @@ -296,11 +297,9 @@ def convert_duplicates_to_dict(df: pd.DataFrame) -> pd.DataFrame: # now convert dictionary into a csv file deduped_df = pd.DataFrame.from_dict(deduped_dict, "index") deduped_df = deduped_df.reset_index().rename( - columns={"index": "duplicated_uuids", 0: "mapped_uuids"} - ) + columns={"index": "duplicated_uuids", 0: "mapped_uuids"}) deduped_df.to_csv( - repo_root / "output" / "deduplicated_UUIDs.csv", index=False, mode="a" - ) + repo_root / "output" / "deduplicated_UUIDs.csv", index=False, mode="a", header= not os.path.exists('../output/deduplicated_UUIDs.csv')) def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: From 97b89dd7dba65a71b0c3ba31225e559d16c21617 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Wed, 7 Feb 2024 23:51:44 -0600 Subject: [PATCH 40/42] addressing linter tests failure due to formatting --- utils/linkage.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 1b27a84..0b8459d 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -1,10 +1,11 @@ """ Module for performing record linkage on state campaign finance dataset """ +import os.path + import pandas as pd import textdistance as td import usaddress -import os.path from utils.constants import COMPANY_TYPES, repo_root @@ -297,9 +298,14 @@ def convert_duplicates_to_dict(df: pd.DataFrame) -> None: # now convert dictionary into a csv file deduped_df = pd.DataFrame.from_dict(deduped_dict, "index") deduped_df = deduped_df.reset_index().rename( - columns={"index": "duplicated_uuids", 0: "mapped_uuids"}) + columns={"index": "duplicated_uuids", 0: "mapped_uuids"} + ) deduped_df.to_csv( - repo_root / "output" / "deduplicated_UUIDs.csv", index=False, mode="a", header= not os.path.exists('../output/deduplicated_UUIDs.csv')) + repo_root / "output" / "deduplicated_UUIDs.csv", + index=False, + mode="a", + header=not os.path.exists("../output/deduplicated_UUIDs.csv"), + ) def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: From 665519241ceaafdd5361d69bbfc6162226ea46e9 Mon Sep 17 00:00:00 2001 From: Alan Mburu Kagiri Date: Wed, 14 Feb 2024 02:27:03 -0600 Subject: [PATCH 41/42] updates to dedup file and beginning steps on netorkx --- requirements.txt | 1 + utils/linkage.py | 9 ++------- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/requirements.txt b/requirements.txt index db05b66..d28ae9f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ Requests==2.31.0 setuptools==68.0.0 textdistance==4.6.1 usaddress==0.5.4 +networkx~=3.1 \ No newline at end of file diff --git a/utils/linkage.py b/utils/linkage.py index f323188..ee8dcd6 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -190,7 +190,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: # if data is clean: if first_name + " " + last_name == full_name: - return full_name + return full_name.title() # some names have titles or professions associated with the name. We need to # remove those from the name. @@ -333,16 +333,11 @@ def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame: .rename(columns={"id": "duplicated"}) ) new_df.index = new_df["duplicated"].str[0].tolist() - new_df["duplicated"] = new_df["duplicated"].str[1:] # now convert the duplicated column into a dictionary that can will be # an output by only feeding the entries with duplicates new_df = new_df.reset_index().rename(columns={"index": "id"}) - convert_duplicates_to_dict( - new_df[new_df["duplicated"].apply(lambda x: len(x)) > 0][ - ["id", "duplicated"] - ] - ) + convert_duplicates_to_dict(new_df[["id", "duplicated"]]) new_df = new_df.drop(["duplicated"], axis=1) return new_df From b24041d9b532a1c3e363e3ab8c70d8a7fd2d9d79 Mon Sep 17 00:00:00 2001 From: Avery Schoen <33437601+averyschoen@users.noreply.github.com> Date: Wed, 14 Feb 2024 10:01:12 -0600 Subject: [PATCH 42/42] Delete notebooks/Test.ipynb --- notebooks/Test.ipynb | 1111 ------------------------------------------ 1 file changed, 1111 deletions(-) delete mode 100644 notebooks/Test.ipynb diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb deleted file mode 100644 index 26d98b5..0000000 --- a/notebooks/Test.ipynb +++ /dev/null @@ -1,1111 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Example Notebook file demonstrating how to use the file structure\n", - "from utils.preprocess_util_lib_example import save_random_dataframe\n", - "from pathlib import Path\n", - "\n", - "save_random_dataframe(Path(\"../output\"), Path(\"test.csv\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "def determine_comma_role(name: str) -> str:\n", - " \"\"\"Given a string (someone's name), attempts to determine the role of the\n", - " comma in the name and where it ought to belong.\n", - "\n", - " Some assumptions are made:\n", - " * If a suffix is included in the name and the name is not just the last\n", - " name(i.e \"Doe, Jr), the format is\n", - " (last_name suffix, first and middle name) i.e Doe iv, Jane Elisabeth\n", - "\n", - " * If a comma is used anywhere else, it is in the format of\n", - " (last_name, first and middle name) i.e Doe, Jane Elisabeth\n", - " Args:\n", - " name: a string representing a name/names of individuals\n", - " Returns:\n", - " the name with or without a comma based on some conditions\n", - " \"\"\"\n", - " suffixes = [\n", - " \"sr\",\n", - " \"jr\",\n", - " \"i\",\n", - " \"ii\",\n", - " \"iii\",\n", - " \"iv\",\n", - " \"v\",\n", - " \"vi\",\n", - " \"vii\",\n", - " \"viii\",\n", - " \"ix\",\n", - " \"x\",\n", - " ]\n", - " name_parts = name.lower().split(\",\")\n", - " # if the comma is just in the end as a typo:\n", - " if len(name_parts[1]) == 0:\n", - " return name_parts[0].title()\n", - " # if just the suffix in the end, leave the name as it is\n", - " if name_parts[1].strip() in suffixes:\n", - " return name.title()\n", - " # at this point either it's just poor name placement, or the suffix is\n", - " # in the beginning of the name. Either way, the first part of the list is\n", - " # the true last name.\n", - " last_part = name_parts.pop(0)\n", - " first_part = \" \".join(name_parts)\n", - " return first_part.title() + \" \" + last_part.title()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:\n", - " \"\"\"Given name related columns, return a person's likely name\n", - "\n", - " Given different formatting used accross states, errors in data entry\n", - " and missing data, it can be difficult to determine someone's actual\n", - " name. For example, some states have a last name column with values like\n", - " \"Doe, Jane\", where the person's first name appears to have been erroneously\n", - " included.\n", - "\n", - " Args:\n", - " first_name: raw value of first name column\n", - " last_name: raw value last name column\n", - " full_name: raw value of name or full_name column\n", - " Returns:\n", - " The most likely full name of the person listed\n", - "\n", - " Sample Usage:\n", - " >>> get_likely_name(\"Jane\", \"Doe\", \"\")\n", - " 'Jane Doe'\n", - " >>> get_likely_name(\"\", \"\", \"Jane Doe\")\n", - " 'Jane Doe'\n", - " >>> get_likely_name(\"\", \"Doe, Jane\", \"\")\n", - " 'Jane Doe'\n", - " >>> get_likely_name(\"Jane Doe\", \"Doe\", \"Jane Doe\")\n", - " 'Jane Doe'\n", - " >>> get_likely_name(\"Jane\",\"\",\"Doe, Sr\")\n", - " 'Jane Doe, Sr'\n", - " >>> get_likely_name(\"Jane Elisabeth Doe, IV\",\"Elisabeth\",\"Doe, IV\")\n", - " 'Jane Elisabeth Doe, Iv'\n", - " >>> get_likely_name(\"\",\"\",\"Jane Elisabeth Doe, IV\")\n", - " 'Jane Elisabeth Doe Iv'\n", - " \"\"\"\n", - " # first ensure clean input by deleting spaces:\n", - " first_name, last_name, full_name = list(\n", - " map(lambda x: x.lower().strip(), [first_name, last_name, full_name])\n", - " )\n", - "\n", - " # if data is clean:\n", - " if first_name + \" \" + last_name == full_name:\n", - " return full_name\n", - "\n", - " # some names have titles or professions associated with the name. We need to\n", - " # remove those from the name.\n", - " titles = [\n", - " \"mr\",\n", - " \"ms\",\n", - " \"mrs\",\n", - " \"miss\",\n", - " \"prof\",\n", - " \"dr\",\n", - " \"doctor\",\n", - " \"sir\",\n", - " \"madam\",\n", - " \"professor\",\n", - " ]\n", - " names = [first_name, last_name, full_name]\n", - "\n", - " for i in range(len(names)):\n", - " # if there is a ',' deal with it accordingly\n", - " if \",\" in names[i]:\n", - " names[i] = determine_comma_role(names[i])\n", - "\n", - " names[i] = names[i].replace(\".\", \"\").split(\" \")\n", - " names[i] = [\n", - " name_part for name_part in names[i] if name_part not in titles\n", - " ]\n", - " names[i] = \" \".join(names[i])\n", - "\n", - " # one last check to remove any pieces that might add extra whitespace\n", - " names = list(filter(lambda x: x != \"\", names))\n", - " names = \" \".join(names)\n", - " names = names.title().replace(\" \",\" \").split(\" \")\n", - " final_name = []\n", - " [final_name.append(x) for x in names if x not in final_name]\n", - " return \" \".join(final_name).strip()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "orgs_sample = pd.read_csv(\"../output/complete_organizations_table.csv\",index_col=0).sample(10000)\n", - "inds_sample = pd.read_csv(\"../output/complete_individuals_table.csv\",index_col=0, low_memory=False).sample(10000)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamestateentity_type
050c7d9a1-b448-46a5-8e2d-cd15b3097360REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...MIcommittee
150c7d9a1-b448-46a5-8e2d-cd15b3097360REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...MIcommittee
250c7d9a1-b448-46a5-8e2d-cd15b3097360REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...MIcommittee
362ea1e9c-ac12-400c-b3dc-519389c0f7d3UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...MIcommittee
462ea1e9c-ac12-400c-b3dc-519389c0f7d3UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...MIcommittee
562ea1e9c-ac12-400c-b3dc-519389c0f7d3UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...MIcommittee
6d31df1ca-714e-4a82-9e88-1892c0451a71COMMITTEE TO ELECT DR PATRICIA BERNARDMIcommittee
7d31df1ca-714e-4a82-9e88-1892c0451a71COMMITTEE TO ELECT DR PATRICIA BERNARDMIcommittee
862ea1e9c-ac12-400c-b3dc-519389c0f7d3UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...MIcommittee
94db76e6e-f0d5-40eb-82de-6dbcdb562dd7Ugi Utilities Inc/Ugi Energy Services Llc PacPAOrganization
10f71341d7-d27e-47eb-9b66-903af39d6cb5Pabar Pac (Pa Bar Assn)PAOrganization
11c875d7de-94be-42f1-b994-dd89b114d51ePa Fraternal Order Of Police PacPAOrganization
12910c4d36-b036-469e-aa2a-ea4ff8855a6cCitizens For KailPAOrganization
1360d454d1-3773-4d88-80e9-132c161da0f0Paa PacPAOrganization
141d2b5bc0-9385-4cd7-ac48-df43b3eca6fdMICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PACMIcommittee
151d2b5bc0-9385-4cd7-ac48-df43b3eca6fdMICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PACMIcommittee
161d2b5bc0-9385-4cd7-ac48-df43b3eca6feMICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PACMIcommittee
171d2b5bc0-9385-4cd7-ac48-df43b3eca6ffMICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PACMIcommittee
181d2b5bc0-9385-4cd7-ac48-df43b3eca6fdPaa PacPAOrganization
\n", - "
" - ], - "text/plain": [ - " id \\\n", - "0 50c7d9a1-b448-46a5-8e2d-cd15b3097360 \n", - "1 50c7d9a1-b448-46a5-8e2d-cd15b3097360 \n", - "2 50c7d9a1-b448-46a5-8e2d-cd15b3097360 \n", - "3 62ea1e9c-ac12-400c-b3dc-519389c0f7d3 \n", - "4 62ea1e9c-ac12-400c-b3dc-519389c0f7d3 \n", - "5 62ea1e9c-ac12-400c-b3dc-519389c0f7d3 \n", - "6 d31df1ca-714e-4a82-9e88-1892c0451a71 \n", - "7 d31df1ca-714e-4a82-9e88-1892c0451a71 \n", - "8 62ea1e9c-ac12-400c-b3dc-519389c0f7d3 \n", - "9 4db76e6e-f0d5-40eb-82de-6dbcdb562dd7 \n", - "10 f71341d7-d27e-47eb-9b66-903af39d6cb5 \n", - "11 c875d7de-94be-42f1-b994-dd89b114d51e \n", - "12 910c4d36-b036-469e-aa2a-ea4ff8855a6c \n", - "13 60d454d1-3773-4d88-80e9-132c161da0f0 \n", - "14 1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd \n", - "15 1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd \n", - "16 1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe \n", - "17 1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff \n", - "18 1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd \n", - "\n", - " name state entity_type \n", - "0 REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN... MI committee \n", - "1 REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN... MI committee \n", - "2 REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN... MI committee \n", - "3 UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL... MI committee \n", - "4 UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL... MI committee \n", - "5 UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL... MI committee \n", - "6 COMMITTEE TO ELECT DR PATRICIA BERNARD MI committee \n", - "7 COMMITTEE TO ELECT DR PATRICIA BERNARD MI committee \n", - "8 UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL... MI committee \n", - "9 Ugi Utilities Inc/Ugi Energy Services Llc Pac PA Organization \n", - "10 Pabar Pac (Pa Bar Assn) PA Organization \n", - "11 Pa Fraternal Order Of Police Pac PA Organization \n", - "12 Citizens For Kail PA Organization \n", - "13 Paa Pac PA Organization \n", - "14 MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC MI committee \n", - "15 MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC MI committee \n", - "16 MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC MI committee \n", - "17 MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC MI committee \n", - "18 Paa Pac PA Organization " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data = {'id':['50c7d9a1-b448-46a5-8e2d-cd15b3097360','50c7d9a1-b448-46a5-8e2d-cd15b3097360','50c7d9a1-b448-46a5-8e2d-cd15b3097360',\n", - " '62ea1e9c-ac12-400c-b3dc-519389c0f7d3','62ea1e9c-ac12-400c-b3dc-519389c0f7d3','62ea1e9c-ac12-400c-b3dc-519389c0f7d3',\n", - " 'd31df1ca-714e-4a82-9e88-1892c0451a71','d31df1ca-714e-4a82-9e88-1892c0451a71','62ea1e9c-ac12-400c-b3dc-519389c0f7d3',\n", - " '4db76e6e-f0d5-40eb-82de-6dbcdb562dd7','f71341d7-d27e-47eb-9b66-903af39d6cb5','c875d7de-94be-42f1-b994-dd89b114d51e',\n", - " '910c4d36-b036-469e-aa2a-ea4ff8855a6c','60d454d1-3773-4d88-80e9-132c161da0f0','1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd',\n", - " '1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd','1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe','1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff',\n", - " '1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd'],\n", - " 'name':['REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN PAC','REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN PAC',\n", - " 'REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN PAC','UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB',\n", - " 'UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB','UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB',\n", - " 'COMMITTEE TO ELECT DR PATRICIA BERNARD','COMMITTEE TO ELECT DR PATRICIA BERNARD','UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB',\n", - " 'Ugi Utilities Inc/Ugi Energy Services Llc Pac','Pabar Pac (Pa Bar Assn)','Pa Fraternal Order Of Police Pac','Citizens For Kail',\n", - " 'Paa Pac','MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC','MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC',\n", - " 'MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC','MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC','Paa Pac'],\n", - " 'state':['MI','MI','MI','MI','MI','MI','MI','MI','MI','PA','PA','PA','PA','PA','MI','MI','MI','MI','PA'],\n", - " 'entity_type':['committee','committee','committee','committee','committee','committee','committee','committee','committee',\n", - " 'Organization','Organization','Organization','Organization','Organization','committee','committee','committee','committee','Organization']}\n", - "\n", - "sample_df = pd.DataFrame(data)\n", - "sample_df" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from utils.constants import repo_root\n", - "def convert_duplicates_to_dict(df: pd.DataFrame)->pd.DataFrame:\n", - " '''Takes a dataframe whose indexes are UUIDs, and a column that is a list of\n", - " all other UUIDs that have duplicate values. The function then outputs a\n", - " dictionary file where the deduped UUIDs map to the dataframe main UUID\n", - " \n", - " Args:\n", - " A pandas dataframe with UUIDs as indexes and deduplicated UUIDs\n", - " matching up to the index in the same row\n", - " \n", - " Returns\n", - " None. However it outputs a dictionary to the output directory, with 2\n", - " columns. The first, which indicates the deduplicated UUIDs, is labeled\n", - " 'duplicated_uuids', and the 2nd, which shows the uuids to which the\n", - " deduplicated entries match two, is labeled 'mapped_uuids'.\n", - " '''\n", - " deduped_dict = {}\n", - " for i in range(len(df)):\n", - " deduped_uudis = df.iloc[i]['duplicated']\n", - " for j in range(len(deduped_uudis)):\n", - " deduped_dict.update({deduped_uudis[j]:df.iloc[i]['id']})\n", - " \n", - " # now convert dictionary into a csv file\n", - " deduped_df = pd.DataFrame.from_dict(deduped_dict,'index') \n", - " deduped_df = deduped_df.reset_index().rename(columns={\"index\":\"duplicated_uuids\", 0:\"mapped_uuids\"})\n", - " deduped_df.to_csv(repo_root / \"output\" / \"deduplicated_UUIDs.csv\", index=False, mode='a')\n", - "\n", - "\n", - "def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:\n", - " '''Given a dataframe, remove rows that have identical entry data beyond\n", - " UUIDs, and output a file mapping an entry to other the UUIDs of the\n", - " deduplicated rows\n", - " \n", - " Args:\n", - " a pandas dataframe containing contribution data\n", - " Returns:\n", - " a deduplicated pandas dataframe containing contribution data\n", - " '''\n", - " #first remove all duplicate entries:\n", - " new_df = df.drop_duplicates()\n", - "\n", - " # now find the duplicates along all columns but the ID\n", - " new_df=new_df.groupby(df.columns[1:].tolist(),dropna=False)[\"id\"].agg(list).reset_index().rename(columns={\"id\": \"duplicated\"})\n", - " new_df.index=new_df[\"duplicated\"].str[0].tolist()\n", - " new_df[\"duplicated\"]=new_df[\"duplicated\"].str[1:]\n", - "\n", - " # now convert the duplicated column into a dictionary that can will be\n", - " # an output by only feeding the entries with duplicates\n", - " new_df = new_df.reset_index().rename(columns = {'index':'id'})\n", - " convert_duplicates_to_dict(new_df[new_df['duplicated'].apply(lambda x: len(x))>0][['id','duplicated']])\n", - " new_df = new_df.drop(['duplicated'], axis=1)\n", - " return new_df" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamestateentity_type
03246120d-45fc-4d19-adee-d2aa2c5be6db1 BOLD STEPMIcorporation
18fc7e5d5-558d-42ea-bd9a-8e48a4a9a4bd12CDRCMIcorporation
2a5379930-7324-4f1d-b216-84d9e9ddea40303 MANAGEMENT INC.MIcorporation
39064112f-ef40-4690-9d0a-782a2375feb0314 ACTION FUNDMIcorporation
49e11e7ae-ee29-4a50-9720-41c6ac556a1fA T AND T MICHIGAN PACMIcorporation
...............
2149d79f9729-c9af-4347-868a-ae6e6814a295Zach KirkPAOrganization
2150fbfea472-e183-4479-b869-90eddfa5198cZest KitchenPAOrganization
2151c105a4af-9fd4-4a5b-a7b8-1e8738ff39c6Zoom UsPAOrganization
215259cc8db9-607e-4e1b-ba41-0850b6019360Zoom Video Communications Inc.PAOrganization
2153NaNNaNMIcorporation
\n", - "

2154 rows × 4 columns

\n", - "
" - ], - "text/plain": [ - " id \\\n", - "0 3246120d-45fc-4d19-adee-d2aa2c5be6db \n", - "1 8fc7e5d5-558d-42ea-bd9a-8e48a4a9a4bd \n", - "2 a5379930-7324-4f1d-b216-84d9e9ddea40 \n", - "3 9064112f-ef40-4690-9d0a-782a2375feb0 \n", - "4 9e11e7ae-ee29-4a50-9720-41c6ac556a1f \n", - "... ... \n", - "2149 d79f9729-c9af-4347-868a-ae6e6814a295 \n", - "2150 fbfea472-e183-4479-b869-90eddfa5198c \n", - "2151 c105a4af-9fd4-4a5b-a7b8-1e8738ff39c6 \n", - "2152 59cc8db9-607e-4e1b-ba41-0850b6019360 \n", - "2153 NaN \n", - "\n", - " name state entity_type \n", - "0 1 BOLD STEP MI corporation \n", - "1 12CDRC MI corporation \n", - "2 303 MANAGEMENT INC. MI corporation \n", - "3 314 ACTION FUND MI corporation \n", - "4 A T AND T MICHIGAN PAC MI corporation \n", - "... ... ... ... \n", - "2149 Zach Kirk PA Organization \n", - "2150 Zest Kitchen PA Organization \n", - "2151 Zoom Us PA Organization \n", - "2152 Zoom Video Communications Inc. PA Organization \n", - "2153 NaN MI corporation \n", - "\n", - "[2154 rows x 4 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x = deduplicate_perfect_matches(orgs_sample)\n", - "#len(x.iloc[2]['duplicated'])\n", - "x" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idfirst_namelast_namefull_nameentity_typestatepartycompany
0f6df631a-e626-4861-b62b-e09512887bd3A SCOTTPARISA SCOTT PARIS ...IndividualMINaNNOT EMPLOYED
1075fb1c6-6c70-4ec6-a439-fcebb76c4e0aA. MARKGLICKSTEINA. MARK GLICKSTEIN ...IndividualCANaNPARTNERSHIP HEALTH PLAN OF CA
24a3968f5-7f5e-4ed1-8f39-bfc70bc67af8A. MICHAELPALIZZIA. MICHAEL PALIZZI ...IndividualMINaNMILLER CANFIELD
3bb952efc-3dba-4449-9405-ea65202fbbeaAARONALDRICHAARON ALDRICH ...IndividualMINaNMILLER PIPELINE CORP.
479ec4a73-f688-479a-a4e3-0b0a3813188aAARONBLANDAARON BLAND ...IndividualMINaNNaN
...........................
7122a1a6ff3b-cfa4-4b84-bf8c-20984f9871f0NaNNaNTrone, RobertIndividualMNNaNNaN
712337ab55f5-3613-469c-8b66-ac8888f5bcaeNaNNaNWark, Mary AnnIndividualMNNaNNaN
712492d5ac7c-4702-420c-97a7-656111677f5aNaNNaNWenstrom, GeneIndividualMNNaNNaN
7125fa934bf1-f611-4cd3-9bff-451bdf2e5bd2NaNNaNWika, KevinIndividualMNNaNNaN
7126fb8bb833-7010-418a-9f24-1a29771e0b67NaNNaNwark, davidIndividualMNNaNNaN
\n", - "

7127 rows × 8 columns

\n", - "
" - ], - "text/plain": [ - " id first_name \\\n", - "0 f6df631a-e626-4861-b62b-e09512887bd3 A SCOTT \n", - "1 075fb1c6-6c70-4ec6-a439-fcebb76c4e0a A. MARK \n", - "2 4a3968f5-7f5e-4ed1-8f39-bfc70bc67af8 A. MICHAEL \n", - "3 bb952efc-3dba-4449-9405-ea65202fbbea AARON \n", - "4 79ec4a73-f688-479a-a4e3-0b0a3813188a AARON \n", - "... ... ... \n", - "7122 a1a6ff3b-cfa4-4b84-bf8c-20984f9871f0 NaN \n", - "7123 37ab55f5-3613-469c-8b66-ac8888f5bcae NaN \n", - "7124 92d5ac7c-4702-420c-97a7-656111677f5a NaN \n", - "7125 fa934bf1-f611-4cd3-9bff-451bdf2e5bd2 NaN \n", - "7126 fb8bb833-7010-418a-9f24-1a29771e0b67 NaN \n", - "\n", - " last_name \\\n", - "0 PARIS \n", - "1 GLICKSTEIN \n", - "2 PALIZZI \n", - "3 ALDRICH \n", - "4 BLAND \n", - "... ... \n", - "7122 NaN \n", - "7123 NaN \n", - "7124 NaN \n", - "7125 NaN \n", - "7126 NaN \n", - "\n", - " full_name entity_type state \\\n", - "0 A SCOTT PARIS ... Individual MI \n", - "1 A. MARK GLICKSTEIN ... Individual CA \n", - "2 A. MICHAEL PALIZZI ... Individual MI \n", - "3 AARON ALDRICH ... Individual MI \n", - "4 AARON BLAND ... Individual MI \n", - "... ... ... ... \n", - "7122 Trone, Robert Individual MN \n", - "7123 Wark, Mary Ann Individual MN \n", - "7124 Wenstrom, Gene Individual MN \n", - "7125 Wika, Kevin Individual MN \n", - "7126 wark, david Individual MN \n", - "\n", - " party company \n", - "0 NaN NOT EMPLOYED \n", - "1 NaN PARTNERSHIP HEALTH PLAN OF CA \n", - "2 NaN MILLER CANFIELD \n", - "3 NaN MILLER PIPELINE CORP. \n", - "4 NaN NaN \n", - "... ... ... \n", - "7122 NaN NaN \n", - "7123 NaN NaN \n", - "7124 NaN NaN \n", - "7125 NaN NaN \n", - "7126 NaN NaN \n", - "\n", - "[7127 rows x 8 columns]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y=deduplicate_perfect_matches(inds_sample)\n", - "y" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "7207" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a = inds_sample.drop_duplicates()\n", - "len(a)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Max SpeedAnimalColorAge
0380.0Nonegreen2.0
1370.0FalconNoneNaN
2NaNNoneyellow5.0
3NaNParrotblue6.0
\n", - "
" - ], - "text/plain": [ - " Max Speed Animal Color Age\n", - "0 380.0 None green 2.0\n", - "1 370.0 Falcon None NaN\n", - "2 NaN None yellow 5.0\n", - "3 NaN Parrot blue 6.0" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "df = pd.DataFrame({'Max Speed': [380., 370., np.nan, np.nan],\n", - " 'Animal': ['None', 'Falcon', 'None', 'Parrot'],\n", - " 'Color':['green',None,'yellow','blue'],\n", - " 'Age':[2,np.nan,5,6]})\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df= df.groupby(df.columns[1:].tolist(), dropna=False)[\"Max Speed\"]#.agg(list)#.reset_index()\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AgeAnimalColorMax Speed
02.0Nonegreen[380.0]
15.0Noneyellow[nan]
26.0Parrotblue[nan]
3NaNFalconNaN[370.0]
\n", - "
" - ], - "text/plain": [ - " Age Animal Color Max Speed\n", - "0 2.0 None green [380.0]\n", - "1 5.0 None yellow [nan]\n", - "2 6.0 Parrot blue [nan]\n", - "3 NaN Falcon NaN [370.0]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = df.groupby((df.columns.difference(['Max Speed'])).tolist(),dropna=False)['Max Speed'].agg(list).reset_index()\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -}