
Commit

Merge pull request #15 from dsi-clinic/row_similarity
Row similarity
averyschoen authored Feb 19, 2024
2 parents cbe4d1e + ff02e3d commit caa3f99
Showing 3 changed files with 306 additions and 0 deletions.
75 changes: 75 additions & 0 deletions utils/classify.py
@@ -0,0 +1,75 @@
import pandas as pd

# We want to run through a list of people (and, ideally, their addresses),
# plus a list of corporations, groups, etc., and classify them by looking
# for string matches.

# One approach: take all the names (there are fewer than 200 of them), give
# each a string-similarity match score, and extract the top ten for manual
# review. That should give us a feel for where to set the threshold. Once
# we have all the data, we could also buckle down and classify some of them
# manually.

# A list of individual names
inds_list = []


def similarity_calculator(
df: pd.DataFrame, subject: str, n: int, comparison_func
) -> pd.DataFrame:
"""Find best matches to a subject name in a pandas dataframe
For a given individual or organization, the subject, we search through the
'name'column of a dataframe, select the n highest matches according to a
selected comparison function, and return those as a dataframe. This is meant
to be used manually to search for matches. For quick automated processing, see
automated_classifier().
Note that the comparison function must take in two inputs, both strings, and
output a percentage match
"""

similarities_df = df.copy()

similarities = similarities_df["name"].apply(
lambda x: comparison_func(x, subject)
)

similarities_df["similarities"] = similarities

top_n_matches = similarities_df.sort_values(
by=["similarities"], ascending=False
)[0:n]

return top_n_matches
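
A minimal usage sketch (not from the PR; the sample names and the choice of textdistance's jaro_winkler as the comparison function are illustrative assumptions):

import pandas as pd
import textdistance as td

from utils.classify import similarity_calculator

# Hypothetical data: two spellings of one company, plus an unrelated name
names = pd.DataFrame({"name": ["exxon mobil corp", "exon mobil", "sierra club"]})
# Pull the two closest matches to the subject, best match first
top_two = similarity_calculator(names, "exxon mobil", 2, td.jaro_winkler)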


def automated_classifier(
    df: pd.DataFrame, subjects_dict: dict, threshold: float, comparison_func
) -> pd.DataFrame:
    """Using similarity_calculator, classify entities automatically.

    Given a dictionary mapping subject names to their statuses, we compare
    the string matches and, if they exceed a certain threshold, classify
    them as belonging to the group specified in the subjects dictionary.
    """

    similarities_df = df.copy()
    # Start everything at "neutral" so that one subject's loop iteration
    # does not overwrite labels assigned for an earlier subject
    similarities_df["classification"] = "neutral"

    for subject, status in subjects_dict.items():
        similarities = similarities_df["name"].apply(
            lambda x, sub=subject: comparison_func(x, sub)
        )
        matches = similarities >= threshold
        similarities_df.loc[matches, "classification"] = status

    return similarities_df

# We can use the indices and/or select manually: just add a new column to
# the subjects table that marks fossil fuels, green energy, or neither.
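
And a sketch of the automated path under the same assumptions (the subjects dictionary maps a canonical name to the label applied whenever the match score clears the threshold; the 0.85 threshold is arbitrary):

import pandas as pd
import textdistance as td

from utils.classify import automated_classifier

names = pd.DataFrame({"name": ["exxon mobil corp", "sunrun inc", "sierra club"]})
subjects = {"exxon mobil": "fossil fuel", "sunrun": "green energy"}
# Rows clearing the threshold take the subject's label; the rest stay "neutral"
classified = automated_classifier(names, subjects, 0.85, td.jaro_winkler)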
124 changes: 124 additions & 0 deletions utils/linkage.py
@@ -1,9 +1,11 @@
"""
Module for performing record linkage on state campaign finance dataset
"""
import math
import os.path
import re

import numpy as np
import pandas as pd
import textdistance as td
import usaddress
@@ -94,6 +96,128 @@ def calculate_string_similarity(string1: str, string2: str) -> float:
return float(td.jaro_winkler(string1.lower()[::-1], string2.lower()[::-1]))


def calculate_row_similarity(
    row1: pd.DataFrame, row2: pd.DataFrame, weights: np.ndarray, comparison_func
) -> float:
    """Find the weighted similarity of two rows in a dataframe.

    The length of the weights vector must equal the number of selected
    columns. This version is slow and not optimized, and will be revised
    to make it more efficient; it exists to provide basic functionality.
    Once the comparison function is locked in, using .apply will likely
    be easier and more efficient.
    """

    row_length = len(weights)
    if not (row1.shape[1] == row2.shape[1] == row_length):
        raise ValueError("Number of columns and weights must be the same")

    similarity = np.zeros(row_length)

    for i in range(row_length):
        # Compare the i-th column's value in each (single-row) dataframe
        similarity[i] = comparison_func(row1.iloc[0, i], row2.iloc[0, i])

    return sum(similarity * weights)
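
A quick sketch of a call (the frame mirrors the test data in utils/tests/test_linkage.py; the 0.8/0.2 weights emphasize the name column):

import numpy as np
import pandas as pd

from utils.linkage import calculate_row_similarity, calculate_string_similarity

df = pd.DataFrame(
    {
        "name": ["bob von rosevich", "anantarya smith", "bob j vonrosevich"],
        "address": [
            "3 Blue Drive, Chicago",
            "4 Blue Drive, Chicago",
            "8 Fancy Way, Chicago",
        ],
    }
)
# Rows 0 and 2 are plausibly the same person at different addresses
score = calculate_row_similarity(
    df.iloc[[0]], df.iloc[[2]], np.array([0.8, 0.2]), calculate_string_similarity
)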


def row_matches(
    df: pd.DataFrame, weights: np.ndarray, threshold: float, comparison_func
) -> dict:
    """Group rows of a dataframe whose weighted similarity exceeds a threshold.

    Run through the rows using indices: if two rows have a comparison score
    greater than the threshold, we assign the later row to the former. Any
    row that is matched to another row is not examined again. Matches are
    stored in a dictionary, with each index appearing no more than once.
    This is not optimized.
    """

    all_indices = np.array(list(df.index))

    index_dict = {x: [] for x in all_indices}

    discard_indices = []

    end = max(all_indices)
    for i in all_indices:
        # Skip indices that have already been matched to an earlier row
        if i in discard_indices:
            continue

        # Iterate through the remaining rows (end + 1 so the last index
        # is included in the comparison)
        for j in range(i + 1, end + 1):
            if j in discard_indices:
                continue

            # If the weighted similarity clears the threshold, fold row j
            # into row i and mark it for skipping in future iterations
            if (
                calculate_row_similarity(
                    df.iloc[[i]], df.iloc[[j]], weights, comparison_func
                )
                > threshold
            ):
                discard_indices.append(j)
                index_dict[i].append(j)

    return index_dict
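
Continuing the sketch above with the same df and imports, the grouping should mirror the unit test at the bottom of this diff:

from utils.linkage import row_matches

matches = row_matches(df, np.array([0.8, 0.2]), 0.9, calculate_string_similarity)
# Row 2 folds into row 0: {0: [2], 1: [], 2: []}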


def match_confidence(
    confidences: np.ndarray, weights: np.ndarray, weights_toggle: bool
) -> float:
    """Combine row-match confidences into a single final confidence.

    This is a weighted log-odds based combination of row-match confidences
    originating from various record linkage methods. Weights are applied
    to the linkage methods in order and must have the same length as the
    confidences. weights_toggle allows one to turn weights on and off when
    calling the function; False disables the weights.

    Since log-odds behave badly at 0 and 1, we truncate them at ±5, which
    corresponds to a probability of roughly 0.007, or 1 minus the same.

    >>> match_confidence(np.array([.6, .9, .0001]), np.array([2,5.7,8]), True)
    2.627759082143462e-12
    >>> match_confidence(np.array([.6, .9, .0001]), np.array([2,5.7,8]), False)
    0.08337802853594725
    """

    if (min(confidences) < 0) or (max(confidences) > 1):
        raise ValueError("Probabilities must be bounded on [0, 1]")

    log_odds = []

    for c in confidences:
        l_o = np.log(c / (1 - c))

        # Truncate to [-5, 5] to avoid the degenerate behavior at 0 and 1
        if l_o > 5:
            l_o = 5
        elif l_o < -5:
            l_o = -5

        log_odds.append(l_o)

    if weights_toggle:
        log_odds = log_odds * weights

    l_o_sum = np.sum(log_odds)

    # Convert the summed log-odds back to a probability (logistic function)
    return math.e**l_o_sum / (1 + math.e**l_o_sum)
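
For intuition, the unweighted doctest can be replayed by hand with numpy (a sketch, not part of the module):

import numpy as np

p = np.array([0.6, 0.9, 0.0001])
# Log-odds truncated to [-5, 5]: roughly [0.405, 2.197, -5.0]
lo = np.clip(np.log(p / (1 - p)), -5, 5)
# Logistic transform of the sum gives ~0.0834, matching the second doctest
print(1 / (1 + np.exp(-lo.sum())))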


def determine_comma_role(name: str) -> str:
"""Given a string (someone's name), attempts to determine the role of the
comma in the name and where it ought to belong.
107 changes: 107 additions & 0 deletions utils/tests/test_linkage.py
@@ -0,0 +1,107 @@
import numpy as np
import pandas as pd
import pytest

from utils.linkage import (
calculate_row_similarity,
calculate_string_similarity,
row_matches,
)

# Tests for calculate_row_similarity and row_matches

# Test data:
d = {
"name": ["bob von rosevich", "anantarya smith", "bob j vonrosevich"],
"address": [
"3 Blue Drive, Chicago",
"4 Blue Drive, Chicago",
"8 Fancy Way, Chicago",
],
}
test_df = pd.DataFrame(data=d)


@pytest.fixture
def row_similarity_scen_1():
return test_df


@pytest.fixture
def row_similarity_scen_2():
return test_df


def test_row_similarity_scen_1(row_similarity_scen_1):
wrong = calculate_row_similarity(
row_similarity_scen_1.iloc[[0]],
row_similarity_scen_1.iloc[[1]],
np.array([0.8, 0.2]),
calculate_string_similarity,
)
right = calculate_row_similarity(
row_similarity_scen_1.iloc[[0]],
row_similarity_scen_1.iloc[[2]],
np.array([0.8, 0.2]),
calculate_string_similarity,
)

assert right > wrong


def test_row_similarity_scen_2(row_similarity_scen_2):
wrong = calculate_row_similarity(
row_similarity_scen_2.iloc[[0]],
row_similarity_scen_2.iloc[[1]],
np.array([0.2, 0.8]),
calculate_string_similarity,
)
right = calculate_row_similarity(
row_similarity_scen_2.iloc[[0]],
row_similarity_scen_2.iloc[[2]],
np.array([0.2, 0.8]),
calculate_string_similarity,
)

assert right < wrong


d2 = {
"name": [
"bob von rosevich",
"anantarya smith",
"bob j vonrosevich",
"missy elliot",
"mr johnson",
"quarantin directino",
"missy eliot",
"joseph johnson",
],
"address": [
"3 Blue Drive, Chicago",
"4 Blue Drive, Chicago",
"8 Fancy Way, Chicago",
"8 Fancy Way, Evanston",
"17 Regular Road, Chicago",
"42 Hollywood Boulevard, Chicago",
"8 Fancy Way, Evanston",
"17 Regular Road, Chicago",
],
}
test_df2 = pd.DataFrame(data=d2)


@pytest.fixture
def row_match_scen1():
return test_df2


def test_row_matches(row_match_scen1):
res = row_matches(
row_match_scen1, np.array([0.8, 0.2]), 0.9, calculate_string_similarity
)

assert res == {0: [2], 1: [], 2: [], 3: [6], 4: [], 5: [], 6: [], 7: []}
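
Assuming pytest is installed and the repository root is on the import path, the suite should run with:

pytest utils/tests/test_linkage.py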
