diff --git a/utils/classify.py b/utils/classify.py index 3c24f94..5475894 100644 --- a/utils/classify.py +++ b/utils/classify.py @@ -3,17 +3,27 @@ from utils.constants import c_org_names, f_companies, f_org_names -def classify_wrapper(individuals_df, organizations_df): - """Wrapper for classificaiton in linkage pipeline +def classify_wrapper( + individuals_df: pd.DataFrame, organizations_df: pd.DataFrame +): + """Wrapper for classification in linkage pipeline Initialize the classify column in both dataframes and call sub-functions classifying individuals and organizations - Args: individuals_df: cleaned and deduplicated dataframe of individuals - organizations_df: cleaned and deduplicated dataframe of organizations + Args: + individuals_df: cleaned and deduplicated dataframe of individuals + organizations_df: cleaned and deduplicated dataframe of organizations + + Returns: + individuals and organizations dataframes with a new + 'classification' column containing 'neutral', 'f', or 'c'. + 'neutral' status is the default for all entities, and those tagged + as 'neutral' are entities which we could not confidently identify as + either fossil fuel or clean energy organizations or affiliates. + Classification is very conservative, and we are very confident that + entities classified as one group or another are related to them. - Returns: individuals and organizations datfarames with a new - 'classification' column containing 'neutral', 'f', or 'c' """ individuals_df["classification"] = "neutral" @@ -25,7 +35,7 @@ def classify_wrapper(individuals_df, organizations_df): return classified_individuals, classified_orgs -def matcher(df, substring, column, category): +def matcher(df: pd.DataFrame, substring: str, column: str, category: str): """Applies a label to the classification column based on substrings We run through a given column containing strings in the dataframe. We @@ -33,6 +43,16 @@ def matcher(df, substring, column, category): the classification column. 
We initialize using the 'neutral' label and use the 'f' and 'c' labels to denote fossil fuel and clean energy entities respectively. + + Args: + df: a pandas dataframe + substring: the string to search for + column: the column name in which to search + category: the category to assign the row, such as 'f' 'c' or 'neutral' + + Returns: + A pandas dataframe in which rows matching the substring conditions in + a certain column are marked with the appropriate category """ bool_series = df[column].str.contains(substring, na=False) @@ -42,12 +62,18 @@ def matcher(df, substring, column, category): return df -def classify_individuals(individuals_df): +def classify_individuals(individuals_df: pd.DataFrame): """Part of the classification pipeline - We apply the matcher function to the individuals dataframe - repeatedly, using a variety of substrings to identify the - employees of fossil fuel companies. + We check if individuals work for a known fossil fuel company + and categorize them using the matcher() function. + + Args: + individuals_df: a dataframe containing deduplicated + standardized individuals data + + Returns: + an individuals dataframe updated with the fossil fuels category """ for i in f_companies: @@ -56,12 +82,20 @@ def classify_individuals(individuals_df): return individuals_df -def classify_orgs(organizations_df): +def classify_orgs(organizations_df: pd.DataFrame): """Part of the classification pipeline We apply the matcher function to the organizations dataframe repeatedly, using a variety of substrings to identify fossil fuel and clean energy companies. 
+ + Args: + organizations_df: a dataframe containing deduplicated + standardized organizations data + + Returns: + an organizations dataframe updated with the fossil fuels + and clean energy category """ for i in f_org_names: @@ -71,69 +105,3 @@ def classify_orgs(organizations_df): organizations_df = matcher(organizations_df, i, "name", "c") return organizations_df - - -inds_list = [] - -# a list of individual names - - -def similarity_calculator( - df: pd.DataFrame, subject: str, n: int, comparison_func -) -> pd.DataFrame: - """Find best matches to a subject name in a pandas dataframe - - For a given individual or organization, the subject, we search through the - 'name'column of a dataframe, select the n highest matches according to a - selected comparison function, and return those as a dataframe. This is meant - to be used manually to search for matches. For quick automated processing, see - automated_classifier(). - - Note that the comparison function must take in two inputs, both strings, and - output a percentage match - """ - - similarities_df = df.copy() - - similarities = similarities_df["name"].apply( - lambda x: comparison_func(x, subject) - ) - - similarities_df["similarities"] = similarities - - top_n_matches = similarities_df.sort_values( - by=["similarities"], ascending=False - )[0:n] - - return top_n_matches - - -def automated_classifier( - df: pd.DataFrame, subjects_dict: dict, threshold: float, comparison_func -): - """Using similarity_calculator, classify entities automatically - - Feeding a dictionary of names and the associated statuses, we compare - the string matches and, if they exceed a certain threshold, classify - them as belonging to some group specified in the subjects dictionary. 
- """ - - similarities_df = df.copy() - - for subject in subjects_dict: - similarities = similarities_df["name"].apply( - lambda x, sub=subject: comparison_func(x, sub) - ) - matches = similarities >= threshold - - status = subjects_dict[subject] - - similarities_df["classification"] = pd.Series(matches).apply( - lambda x, stat=status: stat if x else "neutral" - ) - - return similarities_df - - # we can use the indices and/or select manually, just add a new - # column to the subjects table - # that marks fossil fuels, green energy, or neither diff --git a/utils/tests/test_classify.py b/utils/tests/test_classify.py new file mode 100644 index 0000000..b6bce88 --- /dev/null +++ b/utils/tests/test_classify.py @@ -0,0 +1,46 @@ +import numpy as np +import pandas as pd +import pytest + +from utils.classify import matcher + +d = { + "name": [ + "bob von rosevich", + "anantarya smith", + "bob j vonrosevich", + "missy elliot", + "mr johnson", + "quarantin directino", + "missy eliot", + "joseph johnson", + ], + "address": [ + "3 Blue Drive, Chicago", + "4 Blue Drive, Chicago", + "8 Fancy Way, Chicago", + "8 Fancy Way, Evanston", + "17 Regular Road, Chicago", + "42 Hollywood Boulevard, Chicago", + "8 Fancy Way, Evanston", + "17 Regular Road, Chicago", + ], +} + +test_df = pd.DataFrame(data=d) + +test_df["classification"] = "neutral" + + +@pytest.fixture +def matcher_scen_1(): + return test_df + + +def test_matcher_scen_1(matcher_scen_1): + matcher(matcher_scen_1, "Fancy", "address", "f") + res = test_df[test_df["classification"] == "f"]["name"].values + + assert np.all( + res == np.array(["bob j vonrosevich", "missy elliot", "missy eliot"]) + )