# Shared NameDataset instance, created lazily by _get_name_dataset().
# Kept at module level so the dataset is built once per process instead of
# once per name_rank() call.
_NAME_DATASET = None


def _get_name_dataset():
    """Return the shared NameDataset, constructing it on first use only."""
    global _NAME_DATASET
    if _NAME_DATASET is None:
        _NAME_DATASET = NameDataset()
    return _NAME_DATASET


def _us_name_rank(name, name_kind: str):
    """Return the United States rank of ``name`` for one kind of name.

    Args:
        name: value to look up; anything that is not a non-empty string
            is treated as missing.
        name_kind: key of the search result to read, either
            "first_name" or "last_name".
    Returns:
        None if ``name`` is not a string or is empty/whitespace-only,
        0 if the dataset has no United States rank for it,
        otherwise the rank stored in the dataset.
    """
    # Validate before touching the dataset so that invalid input never
    # triggers the (lazy) dataset construction.
    if not isinstance(name, str) or not name.strip():
        return None

    result = _get_name_dataset().search(name)
    if not isinstance(result, dict):
        return 0

    name_data = result.get(name_kind)
    if name_data and "rank" in name_data:
        return name_data["rank"].get("United States", 0)
    return 0


def name_rank(first_name: str, last_name: str) -> list:
    """Returns a score for the rank of a given first name and last name
    https://github.com/philipperemy/name-dataset

    The underlying NameDataset is created on the first lookup that needs it
    and reused by every later call.

    Args:
        first_name: any string
        last_name: any string
    Returns:
        name rank for first name and last names
        1 is the most common name, only for names in the United States
        First element in the list corresponds to the rank of the first name
        Second element in the list corresponds to the rank of the last name
        Empty (or whitespace-only) or non string values will return None
        Names that are not found in the dataset will return 0

    >>> name_rank("John", "Smith")
    [5, 7]
    >>> name_rank("Adil", "Kassim")
    [0, 7392]
    >>> name_rank(None, 9)
    [None, None]
    """
    return [
        _us_name_rank(first_name, "first_name"),
        _us_name_rank(last_name, "last_name"),
    ]