From 50537f9e620400630d1a92f1b4f7962a48955fa8 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 31 Jan 2024 15:43:13 +0000 Subject: [PATCH 1/6] updating requirements.txt to include names-dataset package --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index db05b66..fa82b10 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ Requests==2.31.0 setuptools==68.0.0 textdistance==4.6.1 usaddress==0.5.4 +names-dataset==3.1.0 From b21fd5299d3351f28a8f3896c11729fc14390629 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Wed, 31 Jan 2024 16:24:03 +0000 Subject: [PATCH 2/6] initial name_rank function --- utils/linkage.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/utils/linkage.py b/utils/linkage.py index d013bfd..d910587 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -1,5 +1,9 @@ import textdistance as td import usaddress +from names_dataset import NameDataset + +nd = NameDataset() +# 'The library takes time to initialize because the database is massive.' """ Module for performing record linkage on state campaign finance dataset @@ -131,3 +135,36 @@ def get_street_from_address_line_1(address_line_1: str) -> str: string.append(key) return " ".join(string) + + +def name_rank(first_name: str, last_name: str) -> list: + """Returns a score for the rank of a first name and last name in the US + https://github.com/philipperemy/name-dataset + + Args: + first_name: any string + last_name: any string + Returns: + name rank for first name and last names + 1 is the most common name, only for names in the 'United States' + first element is the element corresponds to the rank of the first name + second element is the element corresponds to the rank of the last name + """ + + first_name_result = nd.search(first_name) + last_name_result = nd.search(last_name) + first_name_rank = 0 + last_name_rank = 0 + try: + first_name_rank = first_name_result["first_name"]["rank"][ + "United States" + ] + except KeyError: + pass + + try: + last_name_rank = last_name_result["last_name"]["rank"]["United States"] + except KeyError: + pass + + return [first_name_rank, last_name_rank] From 28c003433545676a9f09827e29814d40543ff4c4 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Thu, 1 Feb 2024 06:02:46 +0000 Subject: [PATCH 3/6] edited function --- utils/linkage.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index d910587..5370b30 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -151,20 +151,25 @@ def name_rank(first_name: str, last_name: str) -> list: second element is the element corresponds to the rank of the last name """ + if first_name is None or last_name is None: + return [None, None] + + if not isinstance(first_name, str) or not isinstance(last_name, str): + return [None, None] + first_name_result = nd.search(first_name) last_name_result = nd.search(last_name) - first_name_rank = 0 - last_name_rank = 0 - try: - first_name_rank = first_name_result["first_name"]["rank"][ - "United States" - ] - except KeyError: - pass - - try: - last_name_rank = last_name_result["last_name"]["rank"]["United States"] - except KeyError: - pass + first_name_rank = None + last_name_rank = None + + if first_name_result and isinstance(first_name_result, dict): + first_name_data = first_name_result.get("first_name") + if first_name_data and "rank" in first_name_data: + first_name_rank = first_name_data["rank"].get("United States", None) + + if last_name_result and isinstance(last_name_result, dict): + last_name_data = last_name_result.get("last_name") + if last_name_data and "rank" in last_name_data: + last_name_rank = last_name_data["rank"].get("United States", None) return [first_name_rank, last_name_rank] From dbaad50d25540e680ffef004021f917fcf5b265d Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Thu, 15 Feb 2024 05:17:21 +0000 Subject: [PATCH 4/6] updated name_rank function --- utils/linkage.py | 62 ++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 5370b30..9f146da 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -2,8 +2,8 @@ import usaddress from names_dataset import NameDataset +# Initialize the NameDataset class, takes too long to initialize within the function nd = NameDataset() -# 'The library takes time to initialize because the database is massive.' """ Module for performing record linkage on state campaign finance dataset @@ -138,38 +138,44 @@ def get_street_from_address_line_1(address_line_1: str) -> str: def name_rank(first_name: str, last_name: str) -> list: - """Returns a score for the rank of a first name and last name in the US + """Returns a score for the rank of a given first name and last name https://github.com/philipperemy/name-dataset - Args: first_name: any string last_name: any string Returns: name rank for first name and last names - 1 is the most common name, only for names in the 'United States' - first element is the element corresponds to the rank of the first name - second element is the element corresponds to the rank of the last name + 1 is the most common name, only for names in the United States + First element in the list corresponds to the rank of the first name + Second element in the list corresponds to the rank of the last name + Empty or non string values will return None + Names that are not found in the dataset will return 0 + + >>> name_rank("John", "Smith") + [5, 7] + >>> name_rank("Adil", "Kassim") + [0, 7392] + >>> name_rank(None, 9) + [None, None """ - - if first_name is None or last_name is None: - return [None, None] - - if not isinstance(first_name, str) or not isinstance(last_name, str): - return [None, None] - - first_name_result = nd.search(first_name) - last_name_result = nd.search(last_name) - first_name_rank = None - last_name_rank = None - - if first_name_result and isinstance(first_name_result, dict): - first_name_data = first_name_result.get("first_name") - if first_name_data and "rank" in first_name_data: - first_name_rank = first_name_data["rank"].get("United States", None) - - if last_name_result and isinstance(last_name_result, dict): - last_name_data = last_name_result.get("last_name") - if last_name_data and "rank" in last_name_data: - last_name_rank = last_name_data["rank"].get("United States", None) - + first_name_rank = 0 + last_name_rank = 0 + if isinstance(first_name, str): + first_name_result = nd.search(first_name) + if first_name_result and isinstance(first_name_result, dict): + first_name_data = first_name_result.get("first_name") + if first_name_data and "rank" in first_name_data: + first_name_rank = first_name_data["rank"].get( + "United States", 0 + ) + else: + first_name_rank = None + if isinstance(last_name, str): + last_name_result = nd.search(last_name) + if last_name_result and isinstance(last_name_result, dict): + last_name_data = last_name_result.get("last_name") + if last_name_data and "rank" in last_name_data: + last_name_rank = last_name_data["rank"].get("United States", 0) + else: + last_name_rank = None return [first_name_rank, last_name_rank] From fbc579c0dfbd6a8e6b6d4a6a7a5ac418ff31d380 Mon Sep 17 00:00:00 2001 From: Avery Schoen <33437601+averyschoen@users.noreply.github.com> Date: Thu, 15 Feb 2024 08:37:58 -0600 Subject: [PATCH 5/6] Update linkage.py --- utils/linkage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/linkage.py b/utils/linkage.py index 9f146da..403ff16 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -156,7 +156,7 @@ def name_rank(first_name: str, last_name: str) -> list: >>> name_rank("Adil", "Kassim") [0, 7392] >>> name_rank(None, 9) - [None, None + [None, None] """ first_name_rank = 0 last_name_rank = 0 From 4e353273bd7535d96dec2d932d3b769baff8cc52 Mon Sep 17 00:00:00 2001 From: Adil Kassim Date: Mon, 19 Feb 2024 15:46:27 +0000 Subject: [PATCH 6/6] slight formatting changes --- utils/linkage.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/utils/linkage.py b/utils/linkage.py index 2e1f9c9..8810877 100644 --- a/utils/linkage.py +++ b/utils/linkage.py @@ -2,7 +2,6 @@ import usaddress from names_dataset import NameDataset - """ Module for performing record linkage on state campaign finance dataset """ @@ -10,8 +9,6 @@ import re import pandas as pd -import textdistance as td -import usaddress from utils.constants import COMPANY_TYPES, repo_root @@ -278,7 +275,6 @@ def get_street_from_address_line_1(address_line_1: str) -> str: return " ".join(string) - def name_rank(first_name: str, last_name: str) -> list: """Returns a score for the rank of a given first name and last name https://github.com/philipperemy/name-dataset @@ -300,10 +296,10 @@ def name_rank(first_name: str, last_name: str) -> list: >>> name_rank(None, 9) [None, None] """ - + # Initialize the NameDataset class nd = NameDataset() - + first_name_rank = 0 last_name_rank = 0 if isinstance(first_name, str): @@ -325,7 +321,8 @@ def name_rank(first_name: str, last_name: str) -> list: else: last_name_rank = None return [first_name_rank, last_name_rank] -======= + + def convert_duplicates_to_dict(df: pd.DataFrame) -> None: """Saves to the "output" directory a file where each row represents a string matching to another string