From 50537f9e620400630d1a92f1b4f7962a48955fa8 Mon Sep 17 00:00:00 2001
From: Adil Kassim <adilk@uchicago.edu>
Date: Wed, 31 Jan 2024 15:43:13 +0000
Subject: [PATCH 1/6] updating requirements.txt to include names-dataset
 package

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index db05b66..fa82b10 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,4 @@ Requests==2.31.0
 setuptools==68.0.0
 textdistance==4.6.1
 usaddress==0.5.4
+names-dataset==3.1.0

From b21fd5299d3351f28a8f3896c11729fc14390629 Mon Sep 17 00:00:00 2001
From: Adil Kassim <adilk@uchicago.edu>
Date: Wed, 31 Jan 2024 16:24:03 +0000
Subject: [PATCH 2/6] initial name_rank function

---
 utils/linkage.py | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/utils/linkage.py b/utils/linkage.py
index d013bfd..d910587 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -1,5 +1,9 @@
 import textdistance as td
 import usaddress
+from names_dataset import NameDataset
+
+nd = NameDataset()
+# 'The library takes time to initialize because the database is massive.'
 
 """
 Module for performing record linkage on state campaign finance dataset
@@ -131,3 +135,36 @@ def get_street_from_address_line_1(address_line_1: str) -> str:
             string.append(key)
 
     return " ".join(string)
+
+
+def name_rank(first_name: str, last_name: str) -> list:
+    """Returns a score for the rank of a first name and last name in the US
+    https://github.com/philipperemy/name-dataset
+
+    Args:
+        first_name: any string
+        last_name: any string
+    Returns:
+        name rank for first name and last names
+        1 is the most common name, only for names in the 'United States'
+        first element is the element corresponds to the rank of the first name
+        second element is the element corresponds to the rank of the last name
+    """
+
+    first_name_result = nd.search(first_name)
+    last_name_result = nd.search(last_name)
+    first_name_rank = 0
+    last_name_rank = 0
+    try:
+        first_name_rank = first_name_result["first_name"]["rank"][
+            "United States"
+        ]
+    except KeyError:
+        pass
+
+    try:
+        last_name_rank = last_name_result["last_name"]["rank"]["United States"]
+    except KeyError:
+        pass
+
+    return [first_name_rank, last_name_rank]

From 28c003433545676a9f09827e29814d40543ff4c4 Mon Sep 17 00:00:00 2001
From: Adil Kassim <adilk@uchicago.edu>
Date: Thu, 1 Feb 2024 06:02:46 +0000
Subject: [PATCH 3/6] name_rank: return None for invalid input or missing rank

---
 utils/linkage.py | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index d910587..5370b30 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -151,20 +151,25 @@ def name_rank(first_name: str, last_name: str) -> list:
         second element is the element corresponds to the rank of the last name
     """
 
+    if first_name is None or last_name is None:
+        return [None, None]
+
+    if not isinstance(first_name, str) or not isinstance(last_name, str):
+        return [None, None]
+
     first_name_result = nd.search(first_name)
     last_name_result = nd.search(last_name)
-    first_name_rank = 0
-    last_name_rank = 0
-    try:
-        first_name_rank = first_name_result["first_name"]["rank"][
-            "United States"
-        ]
-    except KeyError:
-        pass
-
-    try:
-        last_name_rank = last_name_result["last_name"]["rank"]["United States"]
-    except KeyError:
-        pass
+    first_name_rank = None
+    last_name_rank = None
+
+    if first_name_result and isinstance(first_name_result, dict):
+        first_name_data = first_name_result.get("first_name")
+        if first_name_data and "rank" in first_name_data:
+            first_name_rank = first_name_data["rank"].get("United States", None)
+
+    if last_name_result and isinstance(last_name_result, dict):
+        last_name_data = last_name_result.get("last_name")
+        if last_name_data and "rank" in last_name_data:
+            last_name_rank = last_name_data["rank"].get("United States", None)
 
     return [first_name_rank, last_name_rank]

From dbaad50d25540e680ffef004021f917fcf5b265d Mon Sep 17 00:00:00 2001
From: Adil Kassim <adilk@uchicago.edu>
Date: Thu, 15 Feb 2024 05:17:21 +0000
Subject: [PATCH 4/6] name_rank: per-name type checks, 0 when not found, doctests

---
 utils/linkage.py | 62 ++++++++++++++++++++++++++----------------------
 1 file changed, 34 insertions(+), 28 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 5370b30..9f146da 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -2,8 +2,8 @@
 import usaddress
 from names_dataset import NameDataset
 
+# Initialize the NameDataset class, takes too long to initialize within the function
 nd = NameDataset()
-# 'The library takes time to initialize because the database is massive.'
 
 """
 Module for performing record linkage on state campaign finance dataset
@@ -138,38 +138,44 @@ def get_street_from_address_line_1(address_line_1: str) -> str:
 
 
 def name_rank(first_name: str, last_name: str) -> list:
-    """Returns a score for the rank of a first name and last name in the US
+    """Returns a score for the rank of a given first name and last name
     https://github.com/philipperemy/name-dataset
-
     Args:
         first_name: any string
         last_name: any string
     Returns:
         name rank for first name and last names
-        1 is the most common name, only for names in the 'United States'
-        first element is the element corresponds to the rank of the first name
-        second element is the element corresponds to the rank of the last name
+        1 is the most common name, only for names in the United States
+        First element in the list corresponds to the rank of the first name
+        Second element in the list corresponds to the rank of the last name
+        Empty or non string values will return None
+        Names that are not found in the dataset will return 0
+
+    >>> name_rank("John", "Smith")
+    [5, 7]
+    >>> name_rank("Adil", "Kassim")
+    [0, 7392]
+    >>> name_rank(None, 9)
+    [None, None
     """
-
-    if first_name is None or last_name is None:
-        return [None, None]
-
-    if not isinstance(first_name, str) or not isinstance(last_name, str):
-        return [None, None]
-
-    first_name_result = nd.search(first_name)
-    last_name_result = nd.search(last_name)
-    first_name_rank = None
-    last_name_rank = None
-
-    if first_name_result and isinstance(first_name_result, dict):
-        first_name_data = first_name_result.get("first_name")
-        if first_name_data and "rank" in first_name_data:
-            first_name_rank = first_name_data["rank"].get("United States", None)
-
-    if last_name_result and isinstance(last_name_result, dict):
-        last_name_data = last_name_result.get("last_name")
-        if last_name_data and "rank" in last_name_data:
-            last_name_rank = last_name_data["rank"].get("United States", None)
-
+    first_name_rank = 0
+    last_name_rank = 0
+    if isinstance(first_name, str):
+        first_name_result = nd.search(first_name)
+        if first_name_result and isinstance(first_name_result, dict):
+            first_name_data = first_name_result.get("first_name")
+            if first_name_data and "rank" in first_name_data:
+                first_name_rank = first_name_data["rank"].get(
+                    "United States", 0
+                )
+    else:
+        first_name_rank = None
+    if isinstance(last_name, str):
+        last_name_result = nd.search(last_name)
+        if last_name_result and isinstance(last_name_result, dict):
+            last_name_data = last_name_result.get("last_name")
+            if last_name_data and "rank" in last_name_data:
+                last_name_rank = last_name_data["rank"].get("United States", 0)
+    else:
+        last_name_rank = None
     return [first_name_rank, last_name_rank]

From fbc579c0dfbd6a8e6b6d4a6a7a5ac418ff31d380 Mon Sep 17 00:00:00 2001
From: Avery Schoen <33437601+averyschoen@users.noreply.github.com>
Date: Thu, 15 Feb 2024 08:37:58 -0600
Subject: [PATCH 5/6] fix unclosed list in name_rank doctest output

---
 utils/linkage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 9f146da..403ff16 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -156,7 +156,7 @@ def name_rank(first_name: str, last_name: str) -> list:
     >>> name_rank("Adil", "Kassim")
     [0, 7392]
     >>> name_rank(None, 9)
-    [None, None
+    [None, None]
     """
     first_name_rank = 0
     last_name_rank = 0

From 4e353273bd7535d96dec2d932d3b769baff8cc52 Mon Sep 17 00:00:00 2001
From: Adil Kassim <adilk@uchicago.edu>
Date: Mon, 19 Feb 2024 15:46:27 +0000
Subject: [PATCH 6/6] remove duplicate imports and merge marker, tidy whitespace

---
 utils/linkage.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/utils/linkage.py b/utils/linkage.py
index 2e1f9c9..8810877 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -2,7 +2,6 @@
 import usaddress
 from names_dataset import NameDataset
 
-
 """
 Module for performing record linkage on state campaign finance dataset
 """
@@ -10,8 +9,6 @@
 import re
 
 import pandas as pd
-import textdistance as td
-import usaddress
 
 from utils.constants import COMPANY_TYPES, repo_root
 
@@ -278,7 +275,6 @@ def get_street_from_address_line_1(address_line_1: str) -> str:
     return " ".join(string)
 
 
-
 def name_rank(first_name: str, last_name: str) -> list:
     """Returns a score for the rank of a given first name and last name
     https://github.com/philipperemy/name-dataset
@@ -300,10 +296,10 @@ def name_rank(first_name: str, last_name: str) -> list:
     >>> name_rank(None, 9)
     [None, None]
     """
-    
+
     # Initialize the NameDataset class
     nd = NameDataset()
-    
+
     first_name_rank = 0
     last_name_rank = 0
     if isinstance(first_name, str):
@@ -325,7 +321,8 @@ def name_rank(first_name: str, last_name: str) -> list:
     else:
         last_name_rank = None
     return [first_name_rank, last_name_rank]
-=======
+
+
 def convert_duplicates_to_dict(df: pd.DataFrame) -> None:
     """Saves to the "output" directory a file where each row represents a string
     matching to another string