diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb deleted file mode 100644 index 5df942e..0000000 --- a/notebooks/Test.ipynb +++ /dev/null @@ -1,39 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Example Notebook file demonstrating how to use the file structure\n", - "from utils.preprocess_util_lib_example import save_random_dataframe\n", - "from pathlib import Path\n", - "\n", - "save_random_dataframe(Path(\"../output\"), Path(\"test.csv\"))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/requirements.txt b/requirements.txt index fa82b10..1dee38a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,3 +20,5 @@ setuptools==68.0.0 textdistance==4.6.1 usaddress==0.5.4 names-dataset==3.1.0 +networkx~=3.1 + diff --git a/setup.py b/setup.py index 63ef672..07404ac 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup setup( - name="2023-fall-clinic-climate-cabinet", + name="2024-winter-clinic-climate-cabinet", version="0.1.0", packages=find_packages( include=[ diff --git a/utils/constants.py b/utils/constants.py index b87d39d..f259db3 100644 --- a/utils/constants.py +++ b/utils/constants.py @@ -605,3 +605,39 @@ " WV ", " WY ", ] + +# utils/linkage.py constants + +COMPANY_TYPES = { + "CORP": "CORPORATION", + "CO": "CORPORATION", + "LLC": "LIMITED LIABILITY COMPANY", + "PTNR": "PARTNERSHIP", + "LP": "LIMITED PARTNERSHIP", + "LLP": "LIMITED LIABILITY PARTNERSHIP", + "SOLE PROP": "SOLE PROPRIETORSHIP", + "SP": "SOLE PROPRIETORSHIP", 
+ "NPO": "NONPROFIT ORGANIZATION", + "PC": "PROFESSIONAL CORPORATION", + "CO-OP": "COOPERATIVE", + "LTD": "LIMITED COMPANY", + "JSC": "JOINT STOCK COMPANY", + "HOLDCO": "HOLDING COMPANY", + "PLC": "PUBLIC LIMITED COMPANY", + "PVT LTD": "PRIVATE LIMITED COMPANY", + "INC": "INCORPORATED", + "ASSOC": "ASSOCIATION", + "FDN": "FOUNDATION", + "TR": "TRUST", + "SOC": "SOCIETY", + "CONSORT": "CONSORTIUM", + "SYND": "SYNDICATE", + "GRP": "GROUP", + "CORP SOLE": "CORPORATION SOLE", + "JV": "JOINT VENTURE", + "SUB": "SUBSIDIARY", + "FRANCHISE": "FRANCHISE", + "PA": "PROFESSIONAL ASSOCIATION", + "CIC": "COMMUNITY INTEREST COMPANY", + "PAC": "POLITICAL ACTION COMMITTEE", +} diff --git a/utils/linkage.py b/utils/linkage.py index 403ff16..2e1f9c9 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -2,12 +2,18 @@ import usaddress from names_dataset import NameDataset -# Initialize the NameDataset class, takes too long to initialize within the function -nd = NameDataset() """ Module for performing record linkage on state campaign finance dataset """ +import os.path +import re + +import pandas as pd +import textdistance as td +import usaddress + +from utils.constants import COMPANY_TYPES, repo_root def get_address_line_1_from_full_address(address: str) -> str: @@ -71,6 +77,7 @@ def calculate_string_similarity(string1: str, string2: str) -> float: 1. equivalent strings must return 1 2. strings with no similar characters must return 0 3. strings with higher intuitive similarity must return higher scores + similarity score Args: string1: any string @@ -92,6 +99,140 @@ def calculate_string_similarity(string1: str, string2: str) -> float: return float(td.jaro_winkler(string1.lower()[::-1], string2.lower()[::-1])) +def determine_comma_role(name: str) -> str: + """Given a string (someone's name), attempts to determine the role of the + comma in the name and where it ought to belong. 
+
+    Some assumptions are made:
+        * If a suffix is included in the name and the name is not just the last
+          name (i.e. "Doe, Jr"), the format is
+          (last_name suffix, first and middle name) i.e. Doe IV, Jane Elisabeth
+
+        * If a comma is used anywhere else, it is in the format of
+          (last_name, first and middle name) i.e. Doe, Jane Elisabeth
+    Args:
+        name: a string representing a name/names of individuals
+    Returns:
+        the name with or without a comma based on some conditions
+
+    Sample Usage:
+    >>> determine_comma_role("Jane Doe, Jr")
+    'Jane Doe, Jr'
+    >>> determine_comma_role("Doe, Jane Elisabeth")
+    ' Jane Elisabeth Doe'
+    >>> determine_comma_role("Jane Doe,")
+    'Jane Doe'
+    >>> determine_comma_role("DOe, Jane")
+    ' Jane Doe'
+    """
+    suffixes = [
+        "sr",
+        "jr",
+        "i",
+        "ii",
+        "iii",
+        "iv",
+        "v",
+        "vi",
+        "vii",
+        "viii",
+        "ix",
+        "x",
+    ]
+    name_parts = name.lower().split(",")
+    # if the comma is just in the end as a typo:
+    if len(name_parts[1]) == 0:
+        return name_parts[0].title()
+    # if just the suffix in the end, leave the name as it is
+    if name_parts[1].strip() in suffixes:
+        return name.title()
+    # at this point either it's just poor name placement, or the suffix is
+    # in the beginning of the name. Either way, the first part of the list is
+    # the true last name.
+    last_part = name_parts.pop(0)
+    first_part = " ".join(name_parts)
+    return first_part.title() + " " + last_part.title()
+
+
+def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
+    """Given name related columns, return a person's likely name
+
+    Given different formatting used across states, errors in data entry
+    and missing data, it can be difficult to determine someone's actual
+    name. For example, some states have a last name column with values like
+    "Doe, Jane", where the person's first name appears to have been erroneously
+    included.
+
+    Args:
+        first_name: raw value of first name column
+        last_name: raw value of last name column
+        full_name: raw value of name or full_name column
+    Returns:
+        The most likely full name of the person listed
+
+    Sample Usage:
+    >>> get_likely_name("Jane", "Doe", "")
+    'Jane Doe'
+    >>> get_likely_name("", "", "Jane Doe")
+    'Jane Doe'
+    >>> get_likely_name("", "Doe, Jane", "")
+    'Jane Doe'
+    >>> get_likely_name("Jane Doe", "Doe", "Jane Doe")
+    'Jane Doe'
+    >>> get_likely_name("Jane","","Doe, Sr")
+    'Jane Doe, Sr'
+    >>> get_likely_name("Jane Elisabeth Doe, IV","Elisabeth","Doe, IV")
+    'Jane Elisabeth Doe, Iv'
+    >>> get_likely_name("","","Jane Elisabeth Doe, IV")
+    'Jane Elisabeth Doe, Iv'
+    >>> get_likely_name("Jane","","Doe, Jane, Elisabeth")
+    'Jane Elisabeth Doe'
+    """
+    # first ensure clean input by deleting spaces:
+    first_name, last_name, full_name = list(
+        map(lambda x: x.lower().strip(), [first_name, last_name, full_name])
+    )
+
+    # if data is clean:
+    if first_name + " " + last_name == full_name:
+        return full_name.title()
+
+    # some names have titles or professions associated with the name. We need to
+    # remove those from the name.
+    titles = [
+        "mr",
+        "ms",
+        "mrs",
+        "miss",
+        "prof",
+        "dr",
+        "doctor",
+        "sir",
+        "madam",
+        "professor",
+    ]
+    names = [first_name, last_name, full_name]
+
+    for i in range(len(names)):
+        # if there is a ',' deal with it accordingly
+        if "," in names[i]:
+            names[i] = determine_comma_role(names[i])
+
+        names[i] = names[i].replace(".", "").split(" ")
+        names[i] = [
+            name_part for name_part in names[i] if name_part not in titles
+        ]
+        names[i] = " ".join(names[i])
+
+    # one last check to remove any pieces that might add extra whitespace
+    names = list(filter(lambda x: x != "", names))
+    names = " ".join(names)
+    names = names.title().replace("  ", " ").split(" ")
+    final_name = []
+    [final_name.append(x) for x in names if x not in final_name]
+    return " ".join(final_name).strip()
+
+
 def get_street_from_address_line_1(address_line_1: str) -> str:
     """Given an address line 1, return the street name
 
@@ -137,6 +278,7 @@ def get_street_from_address_line_1(address_line_1: str) -> str:
 
     return " ".join(string)
 
+
 def name_rank(first_name: str, last_name: str) -> list:
     """Returns a score for the rank of a given first name and last name
     https://github.com/philipperemy/name-dataset
@@ -158,6 +300,10 @@ def name_rank(first_name: str, last_name: str) -> list:
     >>> name_rank(None, 9)
     [None, None]
     """
+
+    # Initialize the NameDataset class
+    nd = NameDataset()
+
     first_name_rank = 0
     last_name_rank = 0
     if isinstance(first_name, str):
@@ -179,3 +325,189 @@ def name_rank(first_name: str, last_name: str) -> list:
     else:
         last_name_rank = None
     return [first_name_rank, last_name_rank]
+def convert_duplicates_to_dict(df: pd.DataFrame) -> None:
+    """Saves to the "output" directory a file where each row represents a string
+    matching to another string
+
+    Given a dataframe where each row contains one string in a column and a list
+    of strings in another column, the function maps each string in the list to
+    the single string.
+
+    Args:
+        A pandas dataframe
+
+    Returns:
+        None.
However it outputs a file to the output directory, with 2
+        columns. The first, which indicates the duplicated UUIDs, is labeled
+        'duplicated_uuids', and the 2nd, which shows the uuids to which the
+        deduplicated entries match, is labeled 'mapped_uuids'.
+    """
+    deduped_dict = {}
+    for i in range(len(df)):
+        deduped_uudis = df.iloc[i]["duplicated"]
+        for j in range(len(deduped_uudis)):
+            deduped_dict.update({deduped_uudis[j]: df.iloc[i]["id"]})
+
+    # now convert dictionary into a csv file
+    deduped_df = pd.DataFrame.from_dict(deduped_dict, "index")
+    deduped_df = deduped_df.reset_index().rename(
+        columns={"index": "duplicated_uuids", 0: "mapped_uuids"}
+    )
+    deduped_df.to_csv(
+        repo_root / "output" / "deduplicated_UUIDs.csv",
+        index=False,
+        mode="a",
+        header=not os.path.exists("../output/deduplicated_UUIDs.csv"),
+    )
+
+
+def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
+    """Return a dataframe with duplicated entries removed.
+
+    Given a dataframe, combines rows that have identical data beyond their
+    UUIDs, keeps the first UUID among the similarly grouped UUIDs, and saves the
+    rest of the UUIDs to a file in the "output" directory linking them to the
+    first selected UUID.
+
+    Args:
+        a pandas dataframe containing contribution data
+    Returns:
+        a deduplicated pandas dataframe containing contribution data
+    """
+    # first remove all duplicate entries:
+    new_df = df.drop_duplicates()
+
+    # now find the duplicates along all columns but the ID
+    new_df = (
+        new_df.groupby(df.columns[1:].tolist(), dropna=False)["id"]
+        .agg(list)
+        .reset_index()
+        .rename(columns={"id": "duplicated"})
+    )
+    new_df.index = new_df["duplicated"].str[0].tolist()
+
+    # now convert the duplicated column into a dictionary that will be
+    # an output by only feeding the entries with duplicates
+    new_df = new_df.reset_index().rename(columns={"index": "id"})
+    convert_duplicates_to_dict(new_df[["id", "duplicated"]])
+    new_df = new_df.drop(["duplicated"], axis=1)
+    return new_df
+
+
+def cleaning_company_column(company_entry: str) -> str:
+    """
+    Given a string, check if it contains a variation of self employed, unemployed,
+    or retired and return the standardized version.
+
+    Args:
+        company_entry: string of inputted company names
+    Returns:
+        standardized for retired, self employed, and unemployed,
+        or original string if no match or empty string
+
+    >>> cleaning_company_column("Retireed")
+    'Retired'
+    >>> cleaning_company_column("self")
+    'Self Employed'
+    >>> cleaning_company_column("None")
+    'Unemployed'
+    >>> cleaning_company_column("N/A")
+    'Unemployed'
+    """
+
+    if not company_entry:
+        return company_entry
+
+    company_edited = company_entry.lower()
+
+    if company_edited == "n/a":
+        return "Unemployed"
+
+    company_edited = re.sub(r"[^\w\s]", "", company_edited)
+
+    if (
+        company_edited == "retired"
+        or company_edited == "retiree"
+        or company_edited == "retire"
+        or "retiree" in company_edited
+    ):
+        return "Retired"
+
+    elif (
+        "self employe" in company_edited
+        or "freelance" in company_edited
+        or company_edited == "self"
+        or company_edited == "independent contractor"
+    ):
+        return "Self Employed"
+    elif (
+        "unemploye" in company_edited
+        or
company_edited == "none" + or company_edited == "not employed" + or company_edited == "nan" + ): + return "Unemployed" + + else: + return company_edited + + +def standardize_corp_names(company_name: str) -> str: + """Given an employer name, return the standardized version + + Args: + company_name: corporate name + Returns: + standardized company name + + >>> standardize_corp_names('MI BEER WINE WHOLESALERS ASSOC') + 'MI BEER WINE WHOLESALERS ASSOCIATION' + + >>> standardize_corp_names('MI COMMUNITY COLLEGE ASSOCIATION') + 'MI COMMUNITY COLLEGE ASSOCIATION' + + >>> standardize_corp_names('STEPHANIES CHANGEMAKER FUND') + 'STEPHANIES CHANGEMAKER FUND' + + """ + + company_name_split = company_name.upper().split(" ") + + for i in range(len(company_name_split)): + if company_name_split[i] in list(COMPANY_TYPES.keys()): + hold = company_name_split[i] + company_name_split[i] = COMPANY_TYPES[hold] + + new_company_name = " ".join(company_name_split) + return new_company_name + + +def get_address_number_from_address_line_1(address_line_1: str) -> str: + """Given an address line 1, return the building number or po box + + Args: + address_line_1: either street information or PO box + Returns: + address or po box number + + Sample Usage: + >>> get_address_number_from_address_line_1('6727 W. Corrine Dr. Peoria,AZ 85381') + '6727' + >>> get_address_number_from_address_line_1('P.O. Box 5456 Sun City West ,AZ 85375') + '5456' + >>> get_address_number_from_address_line_1('119 S 5th St Niles,MI 49120') + '119' + >>> get_address_number_from_address_line_1( + ... '1415 PARKER STREET APT 251 DETROIT MI 48214-0000' + ... 
) + '1415' + """ + + address_line_1_components = usaddress.parse(address_line_1) + + for i in range(len(address_line_1_components)): + if address_line_1_components[i][1] == "AddressNumber": + return address_line_1_components[i][0] + elif address_line_1_components[i][1] == "USPSBoxID": + return address_line_1_components[i][0] + raise ValueError("Can not find Address Number")