-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #15 from dsi-clinic/row_similarity
Row similarity
- Loading branch information
Showing
3 changed files
with
306 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import pandas as pd | ||
|
||
# we want to run down a list of people and, hopefully, their addresses, plus a list of | ||
# corporations, groups, etc., and classify them, basically just looking for matches | ||
|
||
# do we want to just input all the names/people (there's not many, less than 200 | ||
# for sure), give a string similarity match score, and extract the top ten for | ||
# manual review? This should give us a feeling for how to set our threshold. | ||
# We might also, once we have all the data, buckle down and just classify | ||
# some of them manually. | ||
|
||
# A list of individual names. It starts out empty and is not referenced by
# the functions visible in this module.
inds_list = []
|
||
|
||
def similarity_calculator(
    df: pd.DataFrame, subject: str, n: int, comparison_func
) -> pd.DataFrame:
    """Return the ``n`` rows of ``df`` whose 'name' best matches ``subject``.

    Every value in the 'name' column is scored against ``subject`` with
    ``comparison_func``; the scores are stored in a new 'similarities' column
    on a copy of ``df`` (the caller's frame is left untouched), the copy is
    sorted from highest to lowest score, and the first ``n`` rows are
    returned. This is intended for manual inspection of candidate matches;
    for bulk labelling see automated_classifier().

    Args:
        df: frame containing a 'name' column.
        subject: the individual or organization name to search for.
        n: how many of the best-scoring rows to keep.
        comparison_func: callable taking two strings (the row's name first,
            then ``subject``) and returning a match score, higher meaning
            more similar.

    Returns:
        A dataframe with the columns of ``df`` plus 'similarities', holding
        the ``n`` highest-scoring rows in descending score order.
    """

    def score_against_subject(candidate):
        return comparison_func(candidate, subject)

    scored_df = df.copy()
    scored_df["similarities"] = scored_df["name"].apply(score_against_subject)

    ranked_df = scored_df.sort_values(by=["similarities"], ascending=False)
    return ranked_df[:n]
|
||
|
||
def automated_classifier(
    df: pd.DataFrame, subjects_dict: dict, threshold: float, comparison_func
) -> pd.DataFrame:
    """Label each row of ``df`` by matching its name against known subjects.

    Every value in the 'name' column is scored against every key of
    ``subjects_dict`` with ``comparison_func``. A row receives the status of
    the subject it matches best, provided that score is at least
    ``threshold``; rows that match no subject are labelled "neutral". When
    two subjects tie for the best score, the one that appears first in
    ``subjects_dict`` wins.

    Args:
        df: frame containing a 'name' column. It is not modified.
        subjects_dict: maps a subject name to the status assigned to rows
            that match it.
        threshold: minimum score for a row to count as a match.
        comparison_func: callable taking two strings (the row's name first,
            then the subject) and returning a match score, higher meaning
            more similar.

    Returns:
        A copy of ``df`` with an added 'classification' column.
    """
    classified_df = df.copy()
    names = classified_df["name"]

    # Accumulate labels across subjects. Reassigning the whole column on each
    # pass would let the last subject wipe out every earlier match.
    labels = pd.Series("neutral", index=classified_df.index, dtype=object)
    best_scores = pd.Series(float("-inf"), index=classified_df.index)

    for subject, status in subjects_dict.items():
        scores = names.apply(
            lambda name, sub=subject: comparison_func(name, sub)
        )
        # Positional boolean mask: avoids index alignment, so frames with
        # duplicate index labels are handled too.
        wins = ((scores >= threshold) & (scores > best_scores)).to_numpy()
        labels.loc[wins] = status
        best_scores.loc[wins] = scores.to_numpy()[wins]

    classified_df["classification"] = labels
    return classified_df
|
||
# we can use the indices and/or select manually, just add a new | ||
# column to the subjects table | ||
# that marks fossil fuels, green energy, or neither |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import pytest | ||
|
||
from utils.linkage import ( | ||
calculate_row_similarity, | ||
calculate_string_similarity, | ||
row_matches, | ||
) | ||
|
||
# import pytest | ||
|
||
|
||
# creating a test for calculate_row_similarity and row_matches | ||
|
||
# to put in data: | ||
# Rows 0 and 2 share a near-identical name but live at different addresses;
# rows 0 and 1 have unrelated names but near-identical addresses.
d = dict(
    name=["bob von rosevich", "anantarya smith", "bob j vonrosevich"],
    address=[
        "3 Blue Drive, Chicago",
        "4 Blue Drive, Chicago",
        "8 Fancy Way, Chicago",
    ],
)
test_df = pd.DataFrame(d)
|
||
|
||
@pytest.fixture
def row_similarity_scen_1():
    """Provide a private copy of the three-row name/address frame.

    A copy is returned instead of the module-level ``test_df`` itself so that
    a test which mutates its input cannot leak changes into other tests.
    """
    return test_df.copy()
|
||
|
||
@pytest.fixture
def row_similarity_scen_2():
    """Provide a private copy of the three-row name/address frame.

    A copy is returned instead of the module-level ``test_df`` itself so that
    a test which mutates its input cannot leak changes into other tests.
    """
    return test_df.copy()
|
||
|
||
def test_row_similarity_scen_1(row_similarity_scen_1):
    """With weights [0.8, 0.2], row 2 must score higher than row 1 against row 0.

    Presumably the weights follow the column order (name, address), making
    this the name-dominated scenario -- confirm against
    calculate_row_similarity.
    """

    def score_against_first_row(position):
        # Fresh row slices and a fresh weight array on every call.
        return calculate_row_similarity(
            row_similarity_scen_1.iloc[[0]],
            row_similarity_scen_1.iloc[[position]],
            np.array([0.8, 0.2]),
            calculate_string_similarity,
        )

    other_person_score = score_against_first_row(1)
    same_person_score = score_against_first_row(2)

    assert same_person_score > other_person_score
|
||
|
||
def test_row_similarity_scen_2(row_similarity_scen_2):
    """With weights [0.2, 0.8], row 2 must score lower than row 1 against row 0.

    Presumably the weights follow the column order (name, address), making
    this the address-dominated scenario -- confirm against
    calculate_row_similarity.
    """

    def score_against_first_row(position):
        # Fresh row slices and a fresh weight array on every call.
        return calculate_row_similarity(
            row_similarity_scen_2.iloc[[0]],
            row_similarity_scen_2.iloc[[position]],
            np.array([0.2, 0.8]),
            calculate_string_similarity,
        )

    near_address_score = score_against_first_row(1)
    near_name_score = score_against_first_row(2)

    assert near_name_score < near_address_score
|
||
|
||
# Eight rows for the row_matches test; several rows share an address and
# some names are near-duplicates of each other.
d2 = dict(
    name=[
        "bob von rosevich",
        "anantarya smith",
        "bob j vonrosevich",
        "missy elliot",
        "mr johnson",
        "quarantin directino",
        "missy eliot",
        "joseph johnson",
    ],
    address=[
        "3 Blue Drive, Chicago",
        "4 Blue Drive, Chicago",
        "8 Fancy Way, Chicago",
        "8 Fancy Way, Evanston",
        "17 Regular Road, Chicago",
        "42 Hollywood Boulevard, Chicago",
        "8 Fancy Way, Evanston",
        "17 Regular Road, Chicago",
    ],
)
test_df2 = pd.DataFrame(d2)
|
||
|
||
@pytest.fixture
def row_match_scen1():
    """Provide a private copy of the eight-row name/address frame.

    A copy is returned instead of the module-level ``test_df2`` itself so that
    a test which mutates its input cannot leak changes into other tests.
    """
    return test_df2.copy()
|
||
|
||
def test_row_matches(row_match_scen1):
    """row_matches with weights [0.8, 0.2] and threshold 0.9 pairs rows 0-2 and 3-6.

    The expected mapping has one key per row. A match is listed only under
    the earlier row of the pair, so rows 2 and 6 map to empty lists like
    every unmatched row.
    """
    found = row_matches(
        row_match_scen1,
        np.array([0.8, 0.2]),
        0.9,
        calculate_string_similarity,
    )

    expected = {row: [] for row in range(8)}
    expected[0] = [2]
    expected[3] = [6]

    assert found == expected