
Commit

Merge pull request #15 from dsi-clinic/row_similarity
Row similarity
averyschoen authored Feb 19, 2024
2 parents cbe4d1e + ff02e3d commit caa3f99
Showing 3 changed files with 306 additions and 0 deletions.
75 changes: 75 additions & 0 deletions utils/classify.py
@@ -0,0 +1,75 @@
import pandas as pd

# We want to run through a list of people (and, ideally, their addresses),
# plus a list of corporations, groups, etc., and classify them by looking
# for string matches.

# One approach: take all the names (there are fewer than 200 of them), give
# each a string-similarity match score, and extract the top ten for manual
# review. That should give us a feel for where to set the threshold. Once
# we have all the data, we could also buckle down and classify some of them
# manually.

# A list of individual names
inds_list = []


def similarity_calculator(
df: pd.DataFrame, subject: str, n: int, comparison_func
) -> pd.DataFrame:
"""Find best matches to a subject name in a pandas dataframe
For a given individual or organization, the subject, we search through the
'name'column of a dataframe, select the n highest matches according to a
selected comparison function, and return those as a dataframe. This is meant
to be used manually to search for matches. For quick automated processing, see
automated_classifier().
Note that the comparison function must take in two inputs, both strings, and
output a percentage match
"""

similarities_df = df.copy()

similarities = similarities_df["name"].apply(
lambda x: comparison_func(x, subject)
)

similarities_df["similarities"] = similarities

top_n_matches = similarities_df.sort_values(
by=["similarities"], ascending=False
)[0:n]

return top_n_matches
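
A minimal usage sketch (not from the PR; the sample names and the choice of textdistance's jaro_winkler as the comparison function are illustrative assumptions):

import pandas as pd
import textdistance as td

from utils.classify import similarity_calculator

# Hypothetical data: two spellings of one company, plus an unrelated name
names = pd.DataFrame({"name": ["exxon mobil corp", "exon mobil", "sierra club"]})
# Pull the two closest matches to the subject, best match first
top_two = similarity_calculator(names, "exxon mobil", 2, td.jaro_winkler)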


def automated_classifier(
    df: pd.DataFrame, subjects_dict: dict, threshold: float, comparison_func
) -> pd.DataFrame:
    """Using similarity_calculator, classify entities automatically.

    Given a dictionary mapping subject names to their statuses, we compare
    the string matches and, if they exceed a certain threshold, classify
    them as belonging to the group specified in the subjects dictionary.
    """

    similarities_df = df.copy()
    # Start everything at "neutral" so that one subject's loop iteration
    # does not overwrite labels assigned for an earlier subject
    similarities_df["classification"] = "neutral"

    for subject, status in subjects_dict.items():
        similarities = similarities_df["name"].apply(
            lambda x, sub=subject: comparison_func(x, sub)
        )
        matches = similarities >= threshold
        similarities_df.loc[matches, "classification"] = status

    return similarities_df

# We can use the indices and/or select manually: just add a new column to
# the subjects table that marks fossil fuels, green energy, or neither.
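
And a sketch of the automated path under the same assumptions (the subjects dictionary maps a canonical name to the label applied whenever the match score clears the threshold; the 0.85 threshold is arbitrary):

import pandas as pd
import textdistance as td

from utils.classify import automated_classifier

names = pd.DataFrame({"name": ["exxon mobil corp", "sunrun inc", "sierra club"]})
subjects = {"exxon mobil": "fossil fuel", "sunrun": "green energy"}
# Rows clearing the threshold take the subject's label; the rest stay "neutral"
classified = automated_classifier(names, subjects, 0.85, td.jaro_winkler)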
124 changes: 124 additions & 0 deletions utils/linkage.py
@@ -1,9 +1,11 @@
"""
Module for performing record linkage on state campaign finance dataset
"""
import math
import os.path
import re

import numpy as np
import pandas as pd
import textdistance as td
import usaddress
@@ -94,6 +96,128 @@ def calculate_string_similarity(string1: str, string2: str) -> float:
return float(td.jaro_winkler(string1.lower()[::-1], string2.lower()[::-1]))


def calculate_row_similarity(
    row1: pd.DataFrame, row2: pd.DataFrame, weights: np.ndarray, comparison_func
) -> float:
    """Find the weighted similarity of two rows in a dataframe.

    The length of the weights vector must equal the number of selected
    columns. This version is slow and not optimized, and will be revised
    to make it more efficient; it exists to provide basic functionality.
    Once the comparison function is locked in, using .apply will likely
    be easier and more efficient.
    """

    row_length = len(weights)
    if not (row1.shape[1] == row2.shape[1] == row_length):
        raise ValueError("Number of columns and weights must be the same")

    similarity = np.zeros(row_length)

    for i in range(row_length):
        # Compare the i-th column's value in each (single-row) dataframe
        similarity[i] = comparison_func(row1.iloc[0, i], row2.iloc[0, i])

    return sum(similarity * weights)
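
A quick sketch of a call (the frame mirrors the test data in utils/tests/test_linkage.py; the 0.8/0.2 weights emphasize the name column):

import numpy as np
import pandas as pd

from utils.linkage import calculate_row_similarity, calculate_string_similarity

df = pd.DataFrame(
    {
        "name": ["bob von rosevich", "anantarya smith", "bob j vonrosevich"],
        "address": [
            "3 Blue Drive, Chicago",
            "4 Blue Drive, Chicago",
            "8 Fancy Way, Chicago",
        ],
    }
)
# Rows 0 and 2 are plausibly the same person at different addresses
score = calculate_row_similarity(
    df.iloc[[0]], df.iloc[[2]], np.array([0.8, 0.2]), calculate_string_similarity
)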


def row_matches(
    df: pd.DataFrame, weights: np.ndarray, threshold: float, comparison_func
) -> dict:
    """Group rows of a dataframe whose weighted similarity exceeds a threshold.

    Run through the rows using indices: if two rows have a comparison score
    greater than the threshold, we assign the later row to the former. Any
    row that is matched to another row is not examined again. Matches are
    stored in a dictionary, with each index appearing no more than once.
    This is not optimized.
    """

    all_indices = np.array(list(df.index))

    index_dict = {x: [] for x in all_indices}

    discard_indices = []

    end = max(all_indices)
    for i in all_indices:
        # Skip indices that have already been matched to an earlier row
        if i in discard_indices:
            continue

        # Iterate through the remaining rows (end + 1 so the last index
        # is included in the comparison)
        for j in range(i + 1, end + 1):
            if j in discard_indices:
                continue

            # If the weighted similarity clears the threshold, fold row j
            # into row i and mark it for skipping in future iterations
            if (
                calculate_row_similarity(
                    df.iloc[[i]], df.iloc[[j]], weights, comparison_func
                )
                > threshold
            ):
                discard_indices.append(j)
                index_dict[i].append(j)

    return index_dict
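
Continuing the sketch above with the same df and imports, the grouping should mirror the unit test at the bottom of this diff:

from utils.linkage import row_matches

matches = row_matches(df, np.array([0.8, 0.2]), 0.9, calculate_string_similarity)
# Row 2 folds into row 0: {0: [2], 1: [], 2: []}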


def match_confidence(
    confidences: np.ndarray, weights: np.ndarray, weights_toggle: bool
) -> float:
    """Combine row-match confidences into a single final confidence.

    This is a weighted log-odds based combination of row-match confidences
    originating from various record linkage methods. Weights are applied
    to the linkage methods in order and must have the same length as the
    confidences. weights_toggle allows one to turn weights on and off when
    calling the function; False disables the weights.

    Since log-odds behave badly at 0 and 1, we truncate them at ±5, which
    corresponds to a probability of roughly 0.007, or 1 minus the same.

    >>> match_confidence(np.array([.6, .9, .0001]), np.array([2,5.7,8]), True)
    2.627759082143462e-12
    >>> match_confidence(np.array([.6, .9, .0001]), np.array([2,5.7,8]), False)
    0.08337802853594725
    """

    if (min(confidences) < 0) or (max(confidences) > 1):
        raise ValueError("Probabilities must be bounded on [0, 1]")

    log_odds = []

    for c in confidences:
        l_o = np.log(c / (1 - c))

        # Truncate to [-5, 5] to avoid the degenerate behavior at 0 and 1
        if l_o > 5:
            l_o = 5
        elif l_o < -5:
            l_o = -5

        log_odds.append(l_o)

    if weights_toggle:
        log_odds = log_odds * weights

    l_o_sum = np.sum(log_odds)

    # Convert the summed log-odds back to a probability (logistic function)
    return math.e**l_o_sum / (1 + math.e**l_o_sum)
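
For intuition, the unweighted doctest can be replayed by hand with numpy (a sketch, not part of the module):

import numpy as np

p = np.array([0.6, 0.9, 0.0001])
# Log-odds truncated to [-5, 5]: roughly [0.405, 2.197, -5.0]
lo = np.clip(np.log(p / (1 - p)), -5, 5)
# Logistic transform of the sum gives ~0.0834, matching the second doctest
print(1 / (1 + np.exp(-lo.sum())))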


def determine_comma_role(name: str) -> str:
"""Given a string (someone's name), attempts to determine the role of the
comma in the name and where it ought to belong.
107 changes: 107 additions & 0 deletions utils/tests/test_linkage.py
@@ -0,0 +1,107 @@
import numpy as np
import pandas as pd
import pytest

from utils.linkage import (
calculate_row_similarity,
calculate_string_similarity,
row_matches,
)

# Tests for calculate_row_similarity and row_matches

# Test data:
d = {
"name": ["bob von rosevich", "anantarya smith", "bob j vonrosevich"],
"address": [
"3 Blue Drive, Chicago",
"4 Blue Drive, Chicago",
"8 Fancy Way, Chicago",
],
}
test_df = pd.DataFrame(data=d)


@pytest.fixture
def row_similarity_scen_1():
return test_df


@pytest.fixture
def row_similarity_scen_2():
return test_df


def test_row_similarity_scen_1(row_similarity_scen_1):
wrong = calculate_row_similarity(
row_similarity_scen_1.iloc[[0]],
row_similarity_scen_1.iloc[[1]],
np.array([0.8, 0.2]),
calculate_string_similarity,
)
right = calculate_row_similarity(
row_similarity_scen_1.iloc[[0]],
row_similarity_scen_1.iloc[[2]],
np.array([0.8, 0.2]),
calculate_string_similarity,
)

assert right > wrong


def test_row_similarity_scen_2(row_similarity_scen_2):
wrong = calculate_row_similarity(
row_similarity_scen_2.iloc[[0]],
row_similarity_scen_2.iloc[[1]],
np.array([0.2, 0.8]),
calculate_string_similarity,
)
right = calculate_row_similarity(
row_similarity_scen_2.iloc[[0]],
row_similarity_scen_2.iloc[[2]],
np.array([0.2, 0.8]),
calculate_string_similarity,
)

assert right < wrong


d2 = {
"name": [
"bob von rosevich",
"anantarya smith",
"bob j vonrosevich",
"missy elliot",
"mr johnson",
"quarantin directino",
"missy eliot",
"joseph johnson",
],
"address": [
"3 Blue Drive, Chicago",
"4 Blue Drive, Chicago",
"8 Fancy Way, Chicago",
"8 Fancy Way, Evanston",
"17 Regular Road, Chicago",
"42 Hollywood Boulevard, Chicago",
"8 Fancy Way, Evanston",
"17 Regular Road, Chicago",
],
}
test_df2 = pd.DataFrame(data=d2)


@pytest.fixture
def row_match_scen1():
return test_df2


def test_row_matches(row_match_scen1):
res = row_matches(
row_match_scen1, np.array([0.8, 0.2]), 0.9, calculate_string_similarity
)

assert res == {0: [2], 1: [], 2: [], 3: [6], 4: [], 5: [], 6: [], 7: []}
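
Assuming pytest is installed and the repository root is on the import path, the suite should run with:

pytest utils/tests/test_linkage.py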
