diff --git a/utils/README.md b/utils/README.md new file mode 100644 index 0000000..ec020f4 --- /dev/null +++ b/utils/README.md @@ -0,0 +1,22 @@ +## Minnesota Util: +#### MN_util.py + +Util functions for MN EDA +1. datasets_col_consistent (deprecated) +2. preprocess_candidate_df (deprecated) +3. preprocess_noncandidate_df (deprecated) +4. preprocess_contribution_df (deprecated) +5. drop_nonclassifiable (deprecated) +6. preprocess_expenditure (deprecated) +7. drop_nonclassifiable_expenditure (deprecated) + +#### minnesota.py +1. entity_name_dictionary +2. preprocess_candidate_contribution +3. preprocess_noncandidate_contribution +4. preprocess_expenditure +5. preprocess +6. clean +7. standardize +8. create_tables +9. clean_state diff --git a/utils/constants.py b/utils/constants.py index ec6fd41..1bf573e 100644 --- a/utils/constants.py +++ b/utils/constants.py @@ -190,6 +190,7 @@ "fundraiser", ] + PA_MAIN_URL = "https://www.dos.pa.gov" PA_ZIPPED_URL = ( "/VotingElections/CandidatesCommittees/CampaignFinance/Resources/Documents/" diff --git a/utils/minnesota.py b/utils/minnesota.py index 32d27db..94edd75 100644 --- a/utils/minnesota.py +++ b/utils/minnesota.py @@ -3,8 +3,8 @@ import numpy as np import pandas as pd -from clean import StateCleaner -from constants import ( +from utils.clean import StateCleaner +from utils.constants import ( MN_CANDIDATE_CONTRIBUTION_COL, MN_CANDIDATE_CONTRIBUTION_MAP, MN_INDEPENDENT_EXPENDITURE_COL, @@ -242,11 +242,19 @@ def standardize(self, data: list[pd.DataFrame]) -> list[pd.DataFrame]: Returns: A list of 1 standarized DataFrame matching database schema """ - df = data[0] + + df = data[0].copy() # Create a copy to avoid modifying the original DataFrame df["company"] = None # MN dataset has no company information df["party"] = None # MN dataset has no party information df["transaction_id"] = None df["office_sought"] = df["office_sought"].replace(MN_RACE_MAP) + + # Standardize entity names to match other states in the database schema + entity_map = self.entity_name_dictionary() + df["recipient_type"] = df["recipient_type"].map(entity_map) + df["donor_type"] = df["donor_type"].map(entity_map) + id_mapping = {} + # Standardize entity names to match othe states in database schema df["recipient_type"] = df["recipient_type"].replace(self.entity_name_dictionary) df["donor_type"] = df["donor_type"].replace(self.entity_name_dictionary) @@ -255,6 +263,7 @@ def standardize(self, data: list[pd.DataFrame]) -> list[pd.DataFrame]: recipient_uuid = str(uuid.uuid4()) donor_uuid = str(uuid.uuid4()) transaction_uuid = str(uuid.uuid4()) + # MN has partial recipient id, generate uuid, map them to original id if row["recipient_id"]: if ( @@ -271,7 +280,9 @@ def standardize(self, data: list[pd.DataFrame]) -> list[pd.DataFrame]: row["recipient_id"], recipient_uuid, ) - df["recipient_id"].iloc[index] = recipient_uuid + + df.at[index, "recipient_id"] = recipient_uuid + # MN has partial donor id, generate uuid, map them to original id if row["donor_id"]: if row["donor_type"] == "Individual" or row["donor_type"] == "Lobbyist": @@ -285,8 +296,11 @@ def standardize(self, data: list[pd.DataFrame]) -> list[pd.DataFrame]: row["donor_id"], donor_uuid, ) - df["donor_id"].iloc[index] = donor_uuid - df["transaction_id"].iloc[index] = transaction_uuid + + df.at[index, "donor_id"] = donor_uuid + + df.at[index, "transaction_id"] = transaction_uuid + # Convert id_mapping to DataFrame and save to CSV id_mapping_df = pd.DataFrame.from_dict(