Merge pull request #51 from dsi-clinic/MN_abstract_class

Mn abstract class
uchicago-dsi · Dec 5, 2023 · 57390b3 · 57390b3
2 parents 261fa60 + 40f92a1
commit 57390b3
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 6 deletions.
diff --git a/utils/README.md b/utils/README.md
@@ -0,0 +1,22 @@
+## Minnesota Util:
+#### MN_util.py
+
+Utility functions for Minnesota (MN) exploratory data analysis (EDA):
+1. datasets_col_consistent (deprecated)
+2. preprocess_candidate_df (deprecated)
+3. preprocess_noncandidate_df (deprecated)
+4. preprocess_contribution_df (deprecated)
+5. drop_nonclassifiable (deprecated)
+6. preprocess_expenditure (deprecated)
+7. drop_nonclassifiable_expenditure (deprecated)
+
+#### minnesota.py
+1. entity_name_dictionary
+2. preprocess_candidate_contribution
+3. preprocess_noncandidate_contribution
+4. preprocess_expenditure
+5. preprocess
+6. clean
+7. standardize
+8. create_tables
+9. clean_state
diff --git a/utils/constants.py b/utils/constants.py
@@ -190,6 +190,7 @@
     "fundraiser",
 ]
 
+
 PA_MAIN_URL = "https://www.dos.pa.gov"
 PA_ZIPPED_URL = (
     "/VotingElections/CandidatesCommittees/CampaignFinance/Resources/Documents/"

diff --git a/utils/minnesota.py b/utils/minnesota.py
@@ -3,8 +3,8 @@
 
 import numpy as np
 import pandas as pd
-from clean import StateCleaner
-from constants import (
+from utils.clean import StateCleaner
+from utils.constants import (
     MN_CANDIDATE_CONTRIBUTION_COL,
     MN_CANDIDATE_CONTRIBUTION_MAP,
     MN_INDEPENDENT_EXPENDITURE_COL,
@@ -242,11 +242,19 @@ def standardize(self, data: list[pd.DataFrame]) -> list[pd.DataFrame]:
         Returns: A list of 1 standardized DataFrame matching database schema
         """
 
-        df = data[0]
+
+        df = data[0].copy()  # Create a copy to avoid modifying the original DataFrame
         df["company"] = None  # MN dataset has no company information
         df["party"] = None  # MN dataset has no party information
         df["transaction_id"] = None
         df["office_sought"] = df["office_sought"].replace(MN_RACE_MAP)
+
+        # Standardize entity names to match other states in the database schema
+        entity_map = self.entity_name_dictionary()
+        df["recipient_type"] = df["recipient_type"].map(entity_map)
+        df["donor_type"] = df["donor_type"].map(entity_map)
+        id_mapping = {}
+
         # Standardize entity names to match other states in the database schema
         df["recipient_type"] = df["recipient_type"].replace(self.entity_name_dictionary)
         df["donor_type"] = df["donor_type"].replace(self.entity_name_dictionary)
@@ -255,6 +263,7 @@ def standardize(self, data: list[pd.DataFrame]) -> list[pd.DataFrame]:
             recipient_uuid = str(uuid.uuid4())
             donor_uuid = str(uuid.uuid4())
             transaction_uuid = str(uuid.uuid4())
+
             # MN has partial recipient id, generate uuid, map them to original id
             if row["recipient_id"]:
                 if (
@@ -271,7 +280,9 @@ def standardize(self, data: list[pd.DataFrame]) -> list[pd.DataFrame]:
                     row["recipient_id"],
                     recipient_uuid,
                 )
-            df["recipient_id"].iloc[index] = recipient_uuid
+
+                df.at[index, "recipient_id"] = recipient_uuid
+
             # MN has partial donor id, generate uuid, map them to original id
             if row["donor_id"]:
                 if row["donor_type"] == "Individual" or row["donor_type"] == "Lobbyist":
@@ -285,8 +296,11 @@ def standardize(self, data: list[pd.DataFrame]) -> list[pd.DataFrame]:
                     row["donor_id"],
                     donor_uuid,
                 )
-            df["donor_id"].iloc[index] = donor_uuid
-            df["transaction_id"].iloc[index] = transaction_uuid
+
+                df.at[index, "donor_id"] = donor_uuid
+
+            df.at[index, "transaction_id"] = transaction_uuid
+
 
         # Convert id_mapping to DataFrame and save to CSV
         id_mapping_df = pd.DataFrame.from_dict(