Skip to content

Commit

Permalink
Merge pull request #36 from dsi-clinic/update_classify
Browse files Browse the repository at this point in the history
Update classify
  • Loading branch information
averyschoen authored Mar 5, 2024
2 parents ecd73d0 + feda102 commit 863cfab
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 78 deletions.
124 changes: 46 additions & 78 deletions utils/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,27 @@
from utils.constants import c_org_names, f_companies, f_org_names


def classify_wrapper(individuals_df, organizations_df):
"""Wrapper for classificaiton in linkage pipeline
def classify_wrapper(
individuals_df: pd.DataFrame, organizations_df: pd.DataFrame
):
"""Wrapper for classification in linkage pipeline
Initialize the classify column in both dataframes and
call sub-functions classifying individuals and organizations
Args: individuals_df: cleaned and deduplicated dataframe of individuals
organizations_df: cleaned and deduplicated dataframe of organizations
Args:
individuals_df: cleaned and deduplicated dataframe of individuals
organizations_df: cleaned and deduplicated dataframe of organizations
Returns:
individuals and organizations datfarames with a new
'classification' column containing 'neutral', 'f', or 'c'.
'neutral' status is the default for all entities, and those tagged
as 'neutral' are entities which we could not confidently identify as
either fossil fuel or clean energy organizations or affiliates.
Classification is very conservative, and we are very confident that
entities classified as one group or another are related to them.
Returns: individuals and organizations datfarames with a new
'classification' column containing 'neutral', 'f', or 'c'
"""

individuals_df["classification"] = "neutral"
Expand All @@ -25,14 +35,24 @@ def classify_wrapper(individuals_df, organizations_df):
return classified_individuals, classified_orgs


def matcher(df, substring, column, category):
def matcher(df: pd.DataFrame, substring: str, column: str, category: str):
"""Applies a label to the classification column based on substrings
We run through a given column containing strings in the dataframe. We
seek out rows containing substrings, and apply a certain label to
the classification column. We initialize using the 'neutral' label and
use the 'f' and 'c' labels to denote fossil fuel and clean energy
entities respectively.
Args:
df: a pandas dataframe
substring: the string to search for
column: the column name in which to search
category: the category to assign the row, such as 'f' 'c' or 'neutral'
Returns:
A pandas dataframe in which rows matching the substring conditions in
a certain column are marked with the appropriate category
"""

bool_series = df[column].str.contains(substring, na=False)
Expand All @@ -42,12 +62,18 @@ def matcher(df, substring, column, category):
return df


def classify_individuals(individuals_df):
def classify_individuals(individuals_df: pd.DataFrame):
"""Part of the classification pipeline
We apply the matcher function to the individuals dataframe
repeatedly, using a variety of substrings to identify the
employees of fossil fuel companies.
We check if individuals work for a known fossil fuel company
and categorize them using the matcher() function.
Args:
individuals_df: a dataframe containing deduplicated
standardized individuals data
Returns:
an individuals dataframe updated with the fossil fuels category
"""

for i in f_companies:
Expand All @@ -56,12 +82,20 @@ def classify_individuals(individuals_df):
return individuals_df


def classify_orgs(organizations_df):
def classify_orgs(organizations_df: pd.DataFrame):
"""Part of the classification pipeline
We apply the matcher function to the organizations dataframe
repeatedly, using a variety of substrings to identify fossil
fuel and clean energy companies.
Args:
organizations_df: a dataframe containing deduplicated
standardized organizations data
Returns:
an organizations dataframe updated with the fossil fuels
and clean energy category
"""

for i in f_org_names:
Expand All @@ -71,69 +105,3 @@ def classify_orgs(organizations_df):
organizations_df = matcher(organizations_df, i, "name", "c")

return organizations_df


# Names of individuals; starts out empty.
inds_list: list = []


def similarity_calculator(
    df: pd.DataFrame, subject: str, n: int, comparison_func
) -> pd.DataFrame:
    """Return the rows of df whose 'name' best matches a subject

    Each entry of the 'name' column is scored against the subject (an
    individual or organization name) with comparison_func. The scores are
    stored in a 'similarities' column on a copy of df, the copy is sorted
    from highest to lowest score, and the first n rows are returned. The
    input dataframe itself is left untouched.

    This helper is meant for manual searching; for quick automated
    processing, see automated_classifier().

    Args:
        df: dataframe with a 'name' column to search through
        subject: the name to look for
        n: how many of the top-scoring rows to return
        comparison_func: callable taking two strings and returning a match
            score (e.g. a percentage); higher scores rank first

    Returns:
        A dataframe holding the n highest-scoring rows, with the extra
        'similarities' column
    """

    def score_against_subject(candidate):
        return comparison_func(candidate, subject)

    scored_df = df.copy()
    scored_df["similarities"] = scored_df["name"].apply(score_against_subject)

    ranked_df = scored_df.sort_values(by=["similarities"], ascending=False)

    return ranked_df[0:n]


def automated_classifier(
    df: pd.DataFrame, subjects_dict: dict, threshold: float, comparison_func
):
    """Classify entities automatically by name similarity

    Every value in the 'name' column is compared with each subject name in
    subjects_dict using comparison_func. Rows whose score reaches the
    threshold receive the status associated with that subject in the
    'classification' column; rows matching no subject are 'neutral'.

    Labels accumulate across subjects. (Previously the whole column was
    rebuilt on every iteration, so only matches for the last subject
    survived.) If a row matches several subjects, the subject that comes
    later in subjects_dict wins.

    Args:
        df: dataframe with a 'name' column; it is not modified
        subjects_dict: maps a subject name to the status to assign to
            rows that match it
        threshold: minimum score (inclusive) that counts as a match
        comparison_func: callable taking two strings and returning a
            numeric match score

    Returns:
        A copy of df with the 'classification' column filled in. When
        subjects_dict is empty the copy is returned unchanged.
    """

    similarities_df = df.copy()

    # Nothing to classify against: leave the copy exactly as it came in.
    if not subjects_dict:
        return similarities_df

    # Start everyone at 'neutral' once, then layer the labels on top.
    similarities_df["classification"] = "neutral"

    for subject, status in subjects_dict.items():
        similarities = similarities_df["name"].apply(
            lambda x, sub=subject: comparison_func(x, sub)
        )
        # Positional mask so that a non-unique index cannot break alignment.
        matches = (similarities >= threshold).to_numpy()

        similarities_df.loc[matches, "classification"] = status

    return similarities_df

# we can use the indices and/or select manually, just add a new
# column to the subjects table
# that marks fossil fuels, green energy, or neither
46 changes: 46 additions & 0 deletions utils/tests/test_classify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import numpy as np
import pandas as pd
import pytest

from utils.classify import matcher

# One (name, address) pair per test record, kept side by side so each
# person's address is easy to read off.
_records = [
    ("bob von rosevich", "3 Blue Drive, Chicago"),
    ("anantarya smith", "4 Blue Drive, Chicago"),
    ("bob j vonrosevich", "8 Fancy Way, Chicago"),
    ("missy elliot", "8 Fancy Way, Evanston"),
    ("mr johnson", "17 Regular Road, Chicago"),
    ("quarantin directino", "42 Hollywood Boulevard, Chicago"),
    ("missy eliot", "8 Fancy Way, Evanston"),
    ("joseph johnson", "17 Regular Road, Chicago"),
]

# Column-oriented view of the records, used to build the dataframe.
d = {
    "name": [name for name, _ in _records],
    "address": [address for _, address in _records],
}

# Every row starts out with the default 'neutral' classification.
test_df = pd.DataFrame(data=d).assign(classification="neutral")


@pytest.fixture
def matcher_scen_1():
    """Provide a fresh copy of the shared test dataframe.

    Returning a copy keeps labels written by one test from leaking into
    the module-level test_df and into any other test that uses it.
    """
    return test_df.copy()


def test_matcher_scen_1(matcher_scen_1):
    """Rows whose address contains 'Fancy' are labelled 'f' by matcher."""
    # Assert on the dataframe matcher returns rather than on a global.
    result = matcher(matcher_scen_1, "Fancy", "address", "f")
    res = result[result["classification"] == "f"]["name"].values

    expected = np.array(["bob j vonrosevich", "missy elliot", "missy eliot"])

    # array_equal also fails cleanly when the number of matches differs,
    # which an elementwise == comparison does not guarantee.
    assert np.array_equal(res, expected)

0 comments on commit 863cfab

Please sign in to comment.