diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index a809dfb..68f0fe4 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -5,4 +5,6 @@ # migrate code style to ruff 7313695f4c091a3943d17a0abea351987cc02eb6 # ruff format src/utils/classify_infogroup_data.py -4e4336ea0ff4af1ec6a84d309f042073b7eea25e \ No newline at end of file +4e4336ea0ff4af1ec6a84d309f042073b7eea25e +# fix code style of `collect_harvard_data` branch +4f978d2082a440f31479ca5cfbec90e8b7683b80 diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..217655e --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,26 @@ +Copyright (c) 2024 University of Chicago. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index 8632527..2bbc73f 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,9 @@ 1. Collect: Gather key states' political campaign finance report data which should include recipient information, donor information, and transaction information. 2. Transform: Define database schema for storing transaction and entity information and write code to transform and validate raw data to fit appropriate schema. 3. Clean: Perform record linkage and fix likely data entry errors. -4. Classify: Label all entities as fossil fuel, clean energy, or other -5. Graph: Construct a network graph of campaign finance contributions -6. Analyze: Perform analysis on network data and join with other relevant dataset +4. Classify: Label all entities as fossil fuel, clean energy, or other. +5. Graph: Construct a network graph of campaign finance contributions with micro-level and macro-level views. +6. Analyze: Perform analysis on network data and join with other relevant datasets. ## Setup @@ -33,24 +33,19 @@ For developing, please use either a Docker dev container or slurm computer clust ### Network Visualization -# TODO: #101 document what we want to see in the visualization and decide how many types of visual are needed - +The network visualizations and their associated metrics are housed in the `/output` directory.
Specifically, [this](https://github.com/dsi-clinic/2024-winter-climate-cabinet-campaign-finance-tracker/tree/main/output/network_graphs) folder. Details about the approaches adopted for these visuals are present in [this](https://github.com/dsi-clinic/2024-winter-climate-cabinet-campaign-finance-tracker/blob/main/output/network_graphs/README.md) document. ## Repository Structure ### utils -Project python code +Project python code. ### notebooks -Contains short, clean notebooks to demonstrate analysis. +Contains short, clean notebooks to demonstrate analysis. This is a dynamic folder with notebooks added/removed as per current working processes. ### data -Contains details of acquiring all raw data used in repository. If data is small (<50MB) then it is okay to save it to the repo, making sure to clearly document how to the data is obtained. - -If the data is larger than 50MB than you should not add it to the repo and instead document how to get the data in the README.md file in the data directory. - -This [README.md file](/data/README.md) should be kept up to date. +Contains details of acquiring all raw data used in repository. ### output This folder is empty by default. The final outputs of make commands will be placed here by default. @@ -74,7 +69,7 @@ Student Email: npashilkar@uchicago.edu Student Name: Yangge Xu Student Email: yanggexu@uchicago.edu -Student Name: Bhavya Pandey +Student Name: Bhavya Pandey Student Email: bhavyapandey@uchicago.edu Student Name: Kaya Lee diff --git a/data/README.md b/data/README.md index 7f56201..6240106 100644 --- a/data/README.md +++ b/data/README.md @@ -198,6 +198,8 @@ These companies were listed on the website in a table, which was copy and pasted ### How to access: This file is called FFF_oil_companies.csv and can be downloaded from the climate cabinet drive in 2024-spring-clinic folder. +-Limitation: companies are global companies, so they may not all be applicable for our U.S. based analysis. + This file should be saved in the path data/raw_classification/FFF_oil_companies.csv ### Features @@ -250,3 +252,32 @@ This file is called SIC_codes and and should be downloaded as a csv from the cli - SIC_code: the SIC code associated with the company. If the SIC code is shorter than 6 numbers, the code represents the first n numbers of an SIC code - SIC_code_description: description associated with the SIC code - classification: if the company is fossil fuel (f), clean energy (c), maybe fossil fuel (uf), maybe clean energy (uc) + +## State Legislative Election Returns (1967-2016) + +### Overview +This [dataset](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/3WZFK9) accompanies the State Legislative Election Returns dataset, which chronicles detailed election outcomes for individual candidates in state legislative races across the United States, covering the period from 1967 to 2016. This extensive dataset allows for historical analysis of electoral trends, candidate performance, and legislative turnover. + +### Data Source +The dataset aggregates data from multiple authoritative sources, including state election boards and historical archives, to ensure comprehensive coverage and accuracy. It provides an invaluable resource for researchers focusing on political science, electoral behavior, and governance. 
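For orientation only (an editor's sketch, not part of the documented pipeline), the raw returns file might be loaded and trimmed to the key variables listed below roughly as follows. The file name, the `data/raw/HV/` location (inferred from where this PR stores the codebook), and the renaming to the column names that later appear in `election_results_table.csv` are all assumptions.

```python
import pandas as pd

# Assumed location and name: the codebook added in this PR lives under
# data/raw/HV/, so the returns CSV is presumed to sit alongside it.
RAW_PATH = "data/raw/HV/SLERs1967to2016.csv"

raw = pd.read_csv(RAW_PATH, low_memory=False)

# Keep the key variables and map them to the pipeline-style names that show up
# later in output/transformed/election_results_table.csv (mapping is assumed).
rename_map = {
    "sab": "state",       # state abbreviation
    "cname": "county",    # county name
    "cand": "full_name",  # candidate name as recorded
    "sen": "senate",      # whether the race was for the state senate
    "partyt": "party",    # party affiliation
    "termz": "term",      # actual term length served
    "last": "last_name",
    "first": "first_name",
}
keep = ["caseid", "year", "month", "day", "candid", "vote", "outcome"] + list(rename_map)
returns = raw[keep].rename(columns=rename_map)
print(returns.head())
```

The codebook shipped in `data/raw/HV/` documents each of these variables in full.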
+# TODO: #106 add a link to where this came from and where it is expected to be saved to run the pipeline + +### Features +- **Temporal Coverage:** Includes data from 1967 to 2016, capturing a broad spectrum of political and historical contexts. +- **Utility:** Designed to support a wide range of analyses, from simple descriptive statistics to complex longitudinal studies. + +### Key Variables +The dataset comprises several critical variables that capture the essentials of each election: +- **caseid:** A unique identifier for each election entry. +- **year, month, day:** The date on which the election was held. +- **sab:** State abbreviation, indicating the state in which the election took place. +- **cname:** Name of the county for localized analysis. +- **candid:** A unique identifier for each candidate. +- **vote:** The number of votes received by the candidate. +- **termz:** The actual length of term the elected candidate served. +- **cand:** Name of the candidate. +- **sen:** Indicates whether the election was for the state senate. +- **partyt:** The political party affiliation of the candidate. +- **outcome:** The result of the election for the candidate (e.g., won, lost). +- **last, first:** Last and first names of the candidate. +- **v19_20171211:** A standardized candidate name variable, updated as of December 11, 2017. diff --git a/data/raw/HV/SLERs1967to2016_20180927_Codebook.docx b/data/raw/HV/SLERs1967to2016_20180927_Codebook.docx new file mode 100644 index 0000000..a24f567 Binary files /dev/null and b/data/raw/HV/SLERs1967to2016_20180927_Codebook.docx differ diff --git a/notebooks/README.md b/notebooks/README.md index e65e1c8..12ba02c 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -7,3 +7,6 @@ * `MN_EDA.ipynb` : Notebook containing the EDA and visualizations for Minnesota contribution and expenditure data * `PA_EDA.ipynb` : This notebook contains the EDA for Pennsylvania datasets on contributions, filer information, and expenditure data from 2018-2023. + +* `harvard_eda.ipynb`: This notebook contains the EDA for the Harvard datasets on election results from 1967 - 2016 + diff --git a/notebooks/election_dedupe.ipynb b/notebooks/election_dedupe.ipynb new file mode 100644 index 0000000..f29554d --- /dev/null +++ b/notebooks/election_dedupe.ipynb @@ -0,0 +1,960 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Election records splink demonstration" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from splink.duckdb.linker import DuckDBLinker\n", + "\n", + "pd.options.display.max_rows = 1000\n", + "election_df = pd.read_csv(\"/project/output/transformed/election_results_table.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "election_df = election_df.head(1000)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink.duckdb.blocking_rule_library import block_on\n", + "\n", + "settings = {\n", + " \"link_type\": \"dedupe_only\",\n", + " \"blocking_rules_to_generate_predictions\": [\n", + " block_on([\"first_name\", \"last_name\", \"year\",\"month\",\"state\"]),\n", + " ],\n", + "}\n", + "linker = DuckDBLinker(election_df, settings)\n", + "\n", + "linker.profile_columns(\n", + " [\"first_name\", \"last_name\", \"state\"], top_n=10, bottom_n=5\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "import splink.duckdb.comparison_library as cl\n", + "import splink.duckdb.comparison_template_library as ctl\n", + "\n", + "settings = {\n", + " \"link_type\": \"dedupe_only\",\n", + " \"blocking_rules_to_generate_predictions\": [\n", + " block_on([\"first_name\", \"last_name\", \"year\",\"month\",\"county\"]),\n", + " ],\n", + " \"comparisons\": [\n", + " ctl.name_comparison(\"first_name\", term_frequency_adjustments=True),\n", + " ctl.name_comparison(\"last_name\", term_frequency_adjustments=True),\n", + " cl.exact_match(\"year\", term_frequency_adjustments=True),\n", + " cl.exact_match(\"month\", term_frequency_adjustments=True),\n", + " cl.exact_match(\"county\", term_frequency_adjustments=True),\n", + " ],\n", + " \"retain_matching_columns\": True,\n", + " \"retain_intermediate_calculation_columns\": True,\n", + " \"max_iterations\": 10,\n", + " \"em_convergence\": 0.01\n", + "}\n", + "\n", + "linker = DuckDBLinker(election_df, settings)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: Deterministic matching rules led to no observed matches! This means that no possible record pairs are matches, and no records are linked to one another.\n", + "If this is truly the case then you do not need to run the linkage model.\n", + "However this is usually in error; expected rules to have recall of 60%. Consider revising rules as they may have an error.\n", + "Probability two random records match is estimated to be 0.\n", + "This means that amongst all possible pairwise record comparisons, one in Infinity are expected to match. With 499,500 total possible comparisons, we expect a total of around 0.00 matching pairs\n" + ] + } + ], + "source": [ + "linker.estimate_probability_two_random_records_match(\n", + " [\n", + " \"l.first_name = r.first_name and l.last_name = r.last_name and l.county = r.county and l.year = r.year and l.month = r.month\",\n", + " ],\n", + " recall=0.6,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n", + "\n", + "Estimated u probabilities using random sampling\n", + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - first_name (no m values are trained).\n", + " - last_name (no m values are trained).\n", + " - year (no m values are trained).\n", + " - month (no m values are trained).\n", + " - county (no m values are trained).\n" + ] + } + ], + "source": [ + "linker.estimate_u_using_random_sampling(max_pairs = 5e6)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n", + "Estimating the m probabilities of the model by blocking on:\n", + "(l.\"first_name\" = r.\"first_name\") AND (l.\"last_name\" = r.\"last_name\")\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - year\n", + " - month\n", + " - county\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - first_name\n", + " - last_name\n", + "\n" + ] + }, + { + "ename": "SplinkException", + "evalue": "Error executing the following sql for table `__splink__m_u_counts`(__splink__m_u_counts_45ad164c2):\nCREATE TABLE __splink__m_u_counts_45ad164c2 AS\n(\n WITH __splink__df_comparison_vectors AS (\n SELECT\n *\n FROM __splink__df_comparison_vectors_f7d241f7d\n ), __splink__df_match_weight_parts AS (\n SELECT\n \"unique_id_l\",\n \"unique_id_r\",\n gamma_year,\n CASE\n WHEN gamma_year = -1\n THEN CAST(1.0 AS DOUBLE)\n WHEN gamma_year = 1\n THEN CAST(1.0477455337921533 AS DOUBLE)\n WHEN gamma_year = 0\n THEN CAST(0.5359557072040178 AS DOUBLE)\n END AS bf_year,\n CASE\n WHEN gamma_year = -1\n THEN CAST(1 AS DOUBLE)\n WHEN gamma_year = 1\n THEN (\n CASE\n WHEN NOT COALESCE(\"tf_year_l\", \"tf_year_r\") IS NULL\n THEN POWER(\n CAST(0.9067087087087087 AS DOUBLE) / (\n CASE\n WHEN COALESCE(\"tf_year_l\", \"tf_year_r\") >= COALESCE(\"tf_year_r\", \"tf_year_l\")\n THEN COALESCE(\"tf_year_l\", \"tf_year_r\")\n ELSE COALESCE(\"tf_year_r\", \"tf_year_l\")\n END\n ),\n CAST(1.0 AS DOUBLE)\n )\n ELSE CAST(1 AS DOUBLE)\n END\n )\n WHEN gamma_year = 0\n THEN CAST(1 AS DOUBLE)\n END AS bf_tf_adj_year,\n gamma_month,\n CASE\n WHEN gamma_month = -1\n THEN CAST(1.0 AS DOUBLE)\n WHEN gamma_month = 1\n THEN CAST(2.0196764431732572 AS DOUBLE)\n WHEN gamma_month = 0\n THEN CAST(0.09440595126044711 AS DOUBLE)\n END AS bf_month,\n CASE\n WHEN gamma_month = -1\n THEN CAST(1 AS DOUBLE)\n WHEN gamma_month = 1\n THEN (\n CASE\n WHEN NOT COALESCE(\"tf_month_l\", \"tf_month_r\") IS NULL\n THEN POWER(\n CAST(0.4703723723723724 AS DOUBLE) / (\n CASE\n WHEN COALESCE(\"tf_month_l\", \"tf_month_r\") >= COALESCE(\"tf_month_r\", \"tf_month_l\")\n THEN COALESCE(\"tf_month_l\", \"tf_month_r\")\n ELSE COALESCE(\"tf_month_r\", \"tf_month_l\")\n END\n ),\n CAST(1.0 AS DOUBLE)\n )\n ELSE CAST(1 AS DOUBLE)\n END\n )\n WHEN gamma_month = 0\n THEN CAST(1 AS DOUBLE)\n END AS bf_tf_adj_month,\n gamma_county,\n CASE\n WHEN gamma_county = -1\n THEN CAST(1.0 AS DOUBLE)\n WHEN gamma_county = 1\n THEN CAST(48.3456647398844 AS DOUBLE)\n WHEN gamma_county = 0\n THEN CAST(0.051002201367164914 AS DOUBLE)\n END AS bf_county,\n CASE\n WHEN gamma_county = -1\n THEN CAST(1 AS DOUBLE)\n WHEN gamma_county = 1\n THEN (\n CASE\n WHEN NOT COALESCE(\"tf_county_l\", \"tf_county_r\") IS NULL\n THEN POWER(\n CAST(0.019650159018627895 AS DOUBLE) / (\n CASE\n WHEN COALESCE(\"tf_county_l\", \"tf_county_r\") >= COALESCE(\"tf_county_r\", \"tf_county_l\")\n THEN COALESCE(\"tf_county_l\", \"tf_county_r\")\n ELSE 
COALESCE(\"tf_county_r\", \"tf_county_l\")\n END\n ),\n CAST(1.0 AS DOUBLE)\n )\n ELSE CAST(1 AS DOUBLE)\n END\n )\n WHEN gamma_county = 0\n THEN CAST(1 AS DOUBLE)\n END AS bf_tf_adj_county\n FROM __splink__df_comparison_vectors\n ), __splink__df_predict AS (\n SELECT\n LOG2(\n CAST(0.0 AS DOUBLE) * bf_year * bf_tf_adj_year * bf_month * bf_tf_adj_month * bf_county * bf_tf_adj_county\n ) AS match_weight,\n CASE\n WHEN bf_year = CAST('infinity' AS DOUBLE)\n OR bf_tf_adj_year = CAST('infinity' AS DOUBLE)\n OR bf_month = CAST('infinity' AS DOUBLE)\n OR bf_tf_adj_month = CAST('infinity' AS DOUBLE)\n OR bf_county = CAST('infinity' AS DOUBLE)\n OR bf_tf_adj_county = CAST('infinity' AS DOUBLE)\n THEN 1.0\n ELSE (\n CAST(0.0 AS DOUBLE) * bf_year * bf_tf_adj_year * bf_month * bf_tf_adj_month * bf_county * bf_tf_adj_county\n ) / (\n 1 + (\n CAST(0.0 AS DOUBLE) * bf_year * bf_tf_adj_year * bf_month * bf_tf_adj_month * bf_county * bf_tf_adj_county\n )\n )\n END AS match_probability,\n \"unique_id_l\",\n \"unique_id_r\",\n gamma_year,\n gamma_month,\n gamma_county\n FROM __splink__df_match_weight_parts\n ORDER BY\n 1 NULLS LAST\n )\n SELECT\n gamma_year AS comparison_vector_value,\n SUM(match_probability * 1) AS m_count,\n SUM((\n 1 - match_probability\n ) * 1) AS u_count,\n 'year' AS output_column_name\n FROM __splink__df_predict\n GROUP BY\n gamma_year\n UNION ALL\n SELECT\n gamma_month AS comparison_vector_value,\n SUM(match_probability * 1) AS m_count,\n SUM((\n 1 - match_probability\n ) * 1) AS u_count,\n 'month' AS output_column_name\n FROM __splink__df_predict\n GROUP BY\n gamma_month\n UNION ALL\n SELECT\n gamma_county AS comparison_vector_value,\n SUM(match_probability * 1) AS m_count,\n SUM((\n 1 - match_probability\n ) * 1) AS u_count,\n 'county' AS output_column_name\n FROM __splink__df_predict\n GROUP BY\n gamma_county\n UNION ALL\n SELECT\n 0 AS comparison_vector_value,\n SUM(match_probability * 1) / SUM(1) AS m_count,\n SUM((\n 1 - match_probability\n ) * 1) / SUM(1) AS u_count,\n '_probability_two_random_records_match' AS output_column_name\n FROM __splink__df_predict\n)\n\nError was: Out of Range Error: cannot take logarithm of zero", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOutOfRangeException\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:713\u001b[0m, in \u001b[0;36mLinker._log_and_run_sql_execution\u001b[0;34m(self, final_sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m 712\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 713\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_sql_execution\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfinal_sql\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemplated_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mphysical_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 715\u001b[0m \u001b[38;5;66;03m# Parse our SQL through sqlglot to pretty print\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/duckdb/linker.py:225\u001b[0m, in \u001b[0;36mDuckDBLinker._run_sql_execution\u001b[0;34m(self, final_sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m 
224\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_run_sql_execution\u001b[39m(\u001b[38;5;28mself\u001b[39m, final_sql, templated_name, physical_name):\n\u001b[0;32m--> 225\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_con\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfinal_sql\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mOutOfRangeException\u001b[0m: Out of Range Error: cannot take logarithm of zero", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mSplinkException\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[22], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m training_blocking_rule \u001b[38;5;241m=\u001b[39m block_on([\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfirst_name\u001b[39m\u001b[38;5;124m\"\u001b[39m,\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlast_name\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[0;32m----> 2\u001b[0m training_session_names \u001b[38;5;241m=\u001b[39m \u001b[43mlinker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mestimate_parameters_using_expectation_maximisation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtraining_blocking_rule\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:1706\u001b[0m, in \u001b[0;36mLinker.estimate_parameters_using_expectation_maximisation\u001b[0;34m(self, blocking_rule, comparisons_to_deactivate, comparison_levels_to_reverse_blocking_rule, estimate_without_term_frequencies, fix_probability_two_random_records_match, fix_m_probabilities, fix_u_probabilities, populate_probability_two_random_records_match_from_trained_values)\u001b[0m\n\u001b[1;32m 1684\u001b[0m logger\u001b[38;5;241m.\u001b[39mwarning(\n\u001b[1;32m 1685\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mWARNING: \u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1686\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYou have provided comparisons_to_deactivate but not \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1692\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas an exact match.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1693\u001b[0m )\n\u001b[1;32m 1695\u001b[0m em_training_session \u001b[38;5;241m=\u001b[39m EMTrainingSession(\n\u001b[1;32m 1696\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 1697\u001b[0m blocking_rule,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1703\u001b[0m estimate_without_term_frequencies\u001b[38;5;241m=\u001b[39mestimate_without_term_frequencies,\n\u001b[1;32m 1704\u001b[0m )\n\u001b[0;32m-> 1706\u001b[0m \u001b[43mem_training_session\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_train\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1708\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_populate_m_u_from_trained_values()\n\u001b[1;32m 1710\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m populate_probability_two_random_records_match_from_trained_values:\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/em_training_session.py:197\u001b[0m, in \u001b[0;36mEMTrainingSession._train\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m EMTrainingException(\n\u001b[1;32m 182\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTraining rule 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbr_sql\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m resulted in no record pairs. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 183\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis means that in the supplied data set \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 191\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthe number of comparisons that will be generated by a blocking rule.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 192\u001b[0m )\n\u001b[1;32m 194\u001b[0m \u001b[38;5;66;03m# Compute the new params, populating the paramters in the copied settings object\u001b[39;00m\n\u001b[1;32m 195\u001b[0m \u001b[38;5;66;03m# At this stage, we do not overwrite any of the parameters\u001b[39;00m\n\u001b[1;32m 196\u001b[0m \u001b[38;5;66;03m# in the original (main) setting object\u001b[39;00m\n\u001b[0;32m--> 197\u001b[0m \u001b[43mexpectation_maximisation\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcvv\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 199\u001b[0m rule \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_blocking_rule_for_training\u001b[38;5;241m.\u001b[39mblocking_rule_sql\n\u001b[1;32m 200\u001b[0m training_desc \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEM, blocked on: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrule\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/expectation_maximisation.py:256\u001b[0m, in \u001b[0;36mexpectation_maximisation\u001b[0;34m(em_training_session, df_comparison_vector_values)\u001b[0m\n\u001b[1;32m 254\u001b[0m df_params \u001b[38;5;241m=\u001b[39m linker\u001b[38;5;241m.\u001b[39m_execute_sql_pipeline([agreement_pattern_counts])\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 256\u001b[0m df_params \u001b[38;5;241m=\u001b[39m \u001b[43mlinker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_sql_pipeline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mdf_comparison_vector_values\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 257\u001b[0m param_records \u001b[38;5;241m=\u001b[39m df_params\u001b[38;5;241m.\u001b[39mas_pandas_dataframe()\n\u001b[1;32m 258\u001b[0m param_records \u001b[38;5;241m=\u001b[39m compute_proportions_for_new_parameters(param_records)\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:651\u001b[0m, in \u001b[0;36mLinker._execute_sql_pipeline\u001b[0;34m(self, input_dataframes, use_cache)\u001b[0m\n\u001b[1;32m 645\u001b[0m dataframe \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sql_to_splink_dataframe_checking_cache(\n\u001b[1;32m 646\u001b[0m sql_gen,\n\u001b[1;32m 647\u001b[0m output_tablename_templated,\n\u001b[1;32m 648\u001b[0m use_cache,\n\u001b[1;32m 649\u001b[0m )\n\u001b[1;32m 650\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m--> 651\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 652\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 653\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pipeline\u001b[38;5;241m.\u001b[39mreset()\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:645\u001b[0m, in \u001b[0;36mLinker._execute_sql_pipeline\u001b[0;34m(self, input_dataframes, use_cache)\u001b[0m\n\u001b[1;32m 642\u001b[0m output_tablename_templated \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pipeline\u001b[38;5;241m.\u001b[39mqueue[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39moutput_table_name\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 645\u001b[0m dataframe \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sql_to_splink_dataframe_checking_cache\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 646\u001b[0m \u001b[43m \u001b[49m\u001b[43msql_gen\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 647\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_tablename_templated\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 648\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 649\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 650\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 651\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:911\u001b[0m, in \u001b[0;36mLinker._sql_to_splink_dataframe_checking_cache\u001b[0;34m(self, sql, output_tablename_templated, use_cache)\u001b[0m\n\u001b[1;32m 908\u001b[0m \u001b[38;5;28mprint\u001b[39m(df_pd) \u001b[38;5;66;03m# noqa: T201\u001b[39;00m\n\u001b[1;32m 910\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 911\u001b[0m splink_dataframe \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_sql_against_backend\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 912\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_tablename_templated\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtable_name_hash\u001b[49m\n\u001b[1;32m 913\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 914\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_intermediate_table_cache\u001b[38;5;241m.\u001b[39mexecuted_queries\u001b[38;5;241m.\u001b[39mappend(splink_dataframe)\n\u001b[1;32m 916\u001b[0m splink_dataframe\u001b[38;5;241m.\u001b[39mcreated_by_splink \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/duckdb/linker.py:220\u001b[0m, in \u001b[0;36mDuckDBLinker._execute_sql_against_backend\u001b[0;34m(self, sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_delete_table_from_database(physical_name)\n\u001b[1;32m 215\u001b[0m sql \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;124mCREATE TABLE \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mphysical_name\u001b[38;5;132;01m}\u001b[39;00m\n\u001b[1;32m 217\u001b[0m \u001b[38;5;124mAS\u001b[39m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;124m(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msql\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\n\u001b[1;32m 219\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[0;32m--> 220\u001b[0m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_and_run_sql_execution\u001b[49m\u001b[43m(\u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemplated_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mphysical_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 222\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DuckDBDataFrame(templated_name, physical_name, \u001b[38;5;28mself\u001b[39m)\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:725\u001b[0m, in \u001b[0;36mLinker._log_and_run_sql_execution\u001b[0;34m(self, final_sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m 722\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 723\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[0;32m--> 725\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m SplinkException(\n\u001b[1;32m 726\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError executing the following sql for table \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 727\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m`\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtemplated_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m`(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mphysical_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m):\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mfinal_sql\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 728\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mError was: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 729\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n", + "\u001b[0;31mSplinkException\u001b[0m: Error executing the following sql for table `__splink__m_u_counts`(__splink__m_u_counts_45ad164c2):\nCREATE TABLE __splink__m_u_counts_45ad164c2 AS\n(\n WITH __splink__df_comparison_vectors AS (\n SELECT\n *\n FROM __splink__df_comparison_vectors_f7d241f7d\n ), __splink__df_match_weight_parts AS (\n SELECT\n \"unique_id_l\",\n \"unique_id_r\",\n gamma_year,\n CASE\n WHEN gamma_year = -1\n THEN CAST(1.0 AS DOUBLE)\n WHEN gamma_year = 1\n THEN CAST(1.0477455337921533 AS DOUBLE)\n WHEN gamma_year = 0\n THEN CAST(0.5359557072040178 AS DOUBLE)\n END AS bf_year,\n CASE\n WHEN gamma_year = -1\n THEN CAST(1 AS DOUBLE)\n WHEN gamma_year = 1\n THEN (\n CASE\n WHEN NOT COALESCE(\"tf_year_l\", \"tf_year_r\") IS NULL\n THEN POWER(\n CAST(0.9067087087087087 AS DOUBLE) / (\n CASE\n WHEN COALESCE(\"tf_year_l\", \"tf_year_r\") >= COALESCE(\"tf_year_r\", \"tf_year_l\")\n THEN COALESCE(\"tf_year_l\", \"tf_year_r\")\n ELSE COALESCE(\"tf_year_r\", \"tf_year_l\")\n END\n ),\n CAST(1.0 AS DOUBLE)\n )\n ELSE CAST(1 AS DOUBLE)\n END\n )\n WHEN gamma_year = 0\n THEN CAST(1 AS DOUBLE)\n END AS bf_tf_adj_year,\n gamma_month,\n CASE\n WHEN gamma_month = -1\n THEN CAST(1.0 AS DOUBLE)\n WHEN gamma_month = 1\n THEN CAST(2.0196764431732572 AS DOUBLE)\n WHEN gamma_month = 0\n THEN CAST(0.09440595126044711 AS DOUBLE)\n END AS bf_month,\n CASE\n WHEN gamma_month = -1\n THEN CAST(1 AS DOUBLE)\n WHEN gamma_month = 1\n THEN (\n CASE\n WHEN NOT COALESCE(\"tf_month_l\", \"tf_month_r\") IS NULL\n THEN POWER(\n CAST(0.4703723723723724 AS DOUBLE) / (\n 
CASE\n WHEN COALESCE(\"tf_month_l\", \"tf_month_r\") >= COALESCE(\"tf_month_r\", \"tf_month_l\")\n THEN COALESCE(\"tf_month_l\", \"tf_month_r\")\n ELSE COALESCE(\"tf_month_r\", \"tf_month_l\")\n END\n ),\n CAST(1.0 AS DOUBLE)\n )\n ELSE CAST(1 AS DOUBLE)\n END\n )\n WHEN gamma_month = 0\n THEN CAST(1 AS DOUBLE)\n END AS bf_tf_adj_month,\n gamma_county,\n CASE\n WHEN gamma_county = -1\n THEN CAST(1.0 AS DOUBLE)\n WHEN gamma_county = 1\n THEN CAST(48.3456647398844 AS DOUBLE)\n WHEN gamma_county = 0\n THEN CAST(0.051002201367164914 AS DOUBLE)\n END AS bf_county,\n CASE\n WHEN gamma_county = -1\n THEN CAST(1 AS DOUBLE)\n WHEN gamma_county = 1\n THEN (\n CASE\n WHEN NOT COALESCE(\"tf_county_l\", \"tf_county_r\") IS NULL\n THEN POWER(\n CAST(0.019650159018627895 AS DOUBLE) / (\n CASE\n WHEN COALESCE(\"tf_county_l\", \"tf_county_r\") >= COALESCE(\"tf_county_r\", \"tf_county_l\")\n THEN COALESCE(\"tf_county_l\", \"tf_county_r\")\n ELSE COALESCE(\"tf_county_r\", \"tf_county_l\")\n END\n ),\n CAST(1.0 AS DOUBLE)\n )\n ELSE CAST(1 AS DOUBLE)\n END\n )\n WHEN gamma_county = 0\n THEN CAST(1 AS DOUBLE)\n END AS bf_tf_adj_county\n FROM __splink__df_comparison_vectors\n ), __splink__df_predict AS (\n SELECT\n LOG2(\n CAST(0.0 AS DOUBLE) * bf_year * bf_tf_adj_year * bf_month * bf_tf_adj_month * bf_county * bf_tf_adj_county\n ) AS match_weight,\n CASE\n WHEN bf_year = CAST('infinity' AS DOUBLE)\n OR bf_tf_adj_year = CAST('infinity' AS DOUBLE)\n OR bf_month = CAST('infinity' AS DOUBLE)\n OR bf_tf_adj_month = CAST('infinity' AS DOUBLE)\n OR bf_county = CAST('infinity' AS DOUBLE)\n OR bf_tf_adj_county = CAST('infinity' AS DOUBLE)\n THEN 1.0\n ELSE (\n CAST(0.0 AS DOUBLE) * bf_year * bf_tf_adj_year * bf_month * bf_tf_adj_month * bf_county * bf_tf_adj_county\n ) / (\n 1 + (\n CAST(0.0 AS DOUBLE) * bf_year * bf_tf_adj_year * bf_month * bf_tf_adj_month * bf_county * bf_tf_adj_county\n )\n )\n END AS match_probability,\n \"unique_id_l\",\n \"unique_id_r\",\n gamma_year,\n gamma_month,\n gamma_county\n FROM __splink__df_match_weight_parts\n ORDER BY\n 1 NULLS LAST\n )\n SELECT\n gamma_year AS comparison_vector_value,\n SUM(match_probability * 1) AS m_count,\n SUM((\n 1 - match_probability\n ) * 1) AS u_count,\n 'year' AS output_column_name\n FROM __splink__df_predict\n GROUP BY\n gamma_year\n UNION ALL\n SELECT\n gamma_month AS comparison_vector_value,\n SUM(match_probability * 1) AS m_count,\n SUM((\n 1 - match_probability\n ) * 1) AS u_count,\n 'month' AS output_column_name\n FROM __splink__df_predict\n GROUP BY\n gamma_month\n UNION ALL\n SELECT\n gamma_county AS comparison_vector_value,\n SUM(match_probability * 1) AS m_count,\n SUM((\n 1 - match_probability\n ) * 1) AS u_count,\n 'county' AS output_column_name\n FROM __splink__df_predict\n GROUP BY\n gamma_county\n UNION ALL\n SELECT\n 0 AS comparison_vector_value,\n SUM(match_probability * 1) / SUM(1) AS m_count,\n SUM((\n 1 - match_probability\n ) * 1) / SUM(1) AS u_count,\n '_probability_two_random_records_match' AS output_column_name\n FROM __splink__df_predict\n)\n\nError was: Out of Range Error: cannot take logarithm of zero" + ] + } + ], + "source": [ + "training_blocking_rule = block_on([\"first_name\",\"last_name\"])\n", + "training_session_names = linker.estimate_parameters_using_expectation_maximisation(training_blocking_rule)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n", + 
"Estimating the m probabilities of the model by blocking on:\n", + "(l.\"year\" = r.\"year\") AND (l.\"month\" = r.\"month\")\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - first_name\n", + " - last_name\n", + " - state\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - year\n", + " - month\n", + "\n", + "Iteration 1: Largest change in params was -0.0499 in the m_probability of state, level `All other comparisons`\n", + "Iteration 2: Largest change in params was 0.0101 in the m_probability of first_name, level `Damerau_levenshtein <= 1`\n", + "Iteration 3: Largest change in params was 0.00206 in the m_probability of first_name, level `Damerau_levenshtein <= 1`\n", + "\n", + "EM converged after 3 iterations\n", + "\n", + "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n" + ] + } + ], + "source": [ + "training_blocking_rule = block_on([\"year\",\"month\"])\n", + "training_session_date = linker.estimate_parameters_using_expectation_maximisation(training_blocking_rule)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.match_weights_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_name_llast_name_l
0gregalbritton
1gregalbritton
2gregalbritton
3gregalbritton
4gregalbritton
\n", + "
" + ], + "text/plain": [ + " first_name_l last_name_l\n", + "0 greg albritton\n", + "1 greg albritton\n", + "2 greg albritton\n", + "3 greg albritton\n", + "4 greg albritton" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_predict = linker.predict()\n", + "df_e = df_predict.as_pandas_dataframe(limit = 5)\n", + "df_e[[\"first_name_l\",\"last_name_l\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# district in ind_table -> the office they're running for\n", + "# try splink function \n", + "# " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0yearmonthdaystatecountydistrict_designation_ballotdistrictdistrict_numbergeographic_post...votetermfull_namesenatepartydistrict_seat_numberoutcomelast_namefirst_nameunique_id
71556782014114ALmobile22NaN220...6554.0albritton, greg1republican1walbrittongregca8befe5-7ffc-4563-8cf4-18adefe4f5ea
71656792014114ALchoctaw22NaN220...7004.0albritton, greg1republican1walbrittongreg11fdbc1a-eeaa-4227-8feb-36633881de50
71756802014114ALconecuh22NaN220...10034.0albritton, greg1republican1walbrittongreg518c68d3-d0d9-496a-902c-a3271b0d9320
71856812014114ALmonroe22NaN220...21794.0albritton, greg1republican1walbrittongreg532ce6cd-633f-490a-8500-60868f227477
71956822014114ALwashington22NaN220...27264.0albritton, greg1republican1walbrittongrega5ff357f-5c45-4eed-97ed-c67ef2f23829
72056832014114ALclarke22NaN220...32814.0albritton, greg1republican1walbrittongregeb1ae14c-6826-4aa1-8fd3-bbbd0dcd6e6f
72156842014114ALescambia22NaN220...52334.0albritton, greg1republican1walbrittongreg629766fd-365a-403f-a1f5-02914f3c0f37
72256852014114ALbaldwin22NaN220...73854.0albritton, greg1republican1walbrittongreg0b59008a-3c9e-4497-8482-c8b156d823d5
\n", + "

8 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 year month day state county \\\n", + "715 5678 2014 11 4 AL mobile \n", + "716 5679 2014 11 4 AL choctaw \n", + "717 5680 2014 11 4 AL conecuh \n", + "718 5681 2014 11 4 AL monroe \n", + "719 5682 2014 11 4 AL washington \n", + "720 5683 2014 11 4 AL clarke \n", + "721 5684 2014 11 4 AL escambia \n", + "722 5685 2014 11 4 AL baldwin \n", + "\n", + " district_designation_ballot district district_number geographic_post \\\n", + "715 22 NaN 22 0 \n", + "716 22 NaN 22 0 \n", + "717 22 NaN 22 0 \n", + "718 22 NaN 22 0 \n", + "719 22 NaN 22 0 \n", + "720 22 NaN 22 0 \n", + "721 22 NaN 22 0 \n", + "722 22 NaN 22 0 \n", + "\n", + " ... vote term full_name senate party \\\n", + "715 ... 655 4.0 albritton, greg 1 republican \n", + "716 ... 700 4.0 albritton, greg 1 republican \n", + "717 ... 1003 4.0 albritton, greg 1 republican \n", + "718 ... 2179 4.0 albritton, greg 1 republican \n", + "719 ... 2726 4.0 albritton, greg 1 republican \n", + "720 ... 3281 4.0 albritton, greg 1 republican \n", + "721 ... 5233 4.0 albritton, greg 1 republican \n", + "722 ... 7385 4.0 albritton, greg 1 republican \n", + "\n", + " district_seat_number outcome last_name first_name \\\n", + "715 1 w albritton greg \n", + "716 1 w albritton greg \n", + "717 1 w albritton greg \n", + "718 1 w albritton greg \n", + "719 1 w albritton greg \n", + "720 1 w albritton greg \n", + "721 1 w albritton greg \n", + "722 1 w albritton greg \n", + "\n", + " unique_id \n", + "715 ca8befe5-7ffc-4563-8cf4-18adefe4f5ea \n", + "716 11fdbc1a-eeaa-4227-8feb-36633881de50 \n", + "717 518c68d3-d0d9-496a-902c-a3271b0d9320 \n", + "718 532ce6cd-633f-490a-8500-60868f227477 \n", + "719 a5ff357f-5c45-4eed-97ed-c67ef2f23829 \n", + "720 eb1ae14c-6826-4aa1-8fd3-bbbd0dcd6e6f \n", + "721 629766fd-365a-403f-a1f5-02914f3c0f37 \n", + "722 0b59008a-3c9e-4497-8482-c8b156d823d5 \n", + "\n", + "[8 rows x 22 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "election_df[(election_df[\"first_name\"] == \"greg\") & (election_df[\"last_name\"] == \"albritton\")]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def splink_dedupe(df: pd.DataFrame, settings: dict, blocking: list) -> pd.DataFrame:\n", + " \"\"\"Use splink to deduplicate dataframe based on settings\n", + " \n", + " Configuration settings and blocking can be found in constants.py as\n", + " individuals_settings, individuals_blocking, organizations_settings,\n", + " organizations_blocking\n", + "\n", + " Uses the splink library which employs probabilistic matching for\n", + " record linkage\n", + " https://moj-analytical-services.github.io/splink/index.html\n", + "\n", + " Args:\n", + " df: dataframe\n", + " settings: configuration settings\n", + " (based on splink documentation and dataframe columns)\n", + " blocking: list of columns to block on for the table\n", + " (cuts dataframe into parts based on columns labeled blocks)\n", + "\n", + " Returns:\n", + " deduplicated version of initial dataframe with column 'matching_id'\n", + " that holds list of matching unique_ids\n", + " \"\"\"\n", + " # Initialize the linker object\n", + " linker = DuckDBLinker(df, settings)\n", + "\n", + " # Estimate probability that two random records match\n", + " linker.estimate_probability_two_random_records_match(\n", + " blocking, recall=0.80\n", + " )\n", + "\n", + " # Estimate the parameter u using random sampling\n", + " 
linker.estimate_u_using_random_sampling(max_pairs=5e6)\n", + "\n", + " # Run expectation maximisation on each block\n", + " for block in blocking:\n", + " linker.estimate_parameters_using_expectation_maximisation(block)\n", + "\n", + " # Predict matches\n", + " df_predict = linker.predict()\n", + "\n", + " # Cluster predictions and threshold\n", + " clusters = linker.cluster_pairwise_predictions_at_threshold(\n", + " df_predict, threshold_match_probability=0.7\n", + " )\n", + " clusters_df = clusters.as_pandas_dataframe()\n", + "\n", + " match_list_df = (\n", + " clusters_df.groupby(\"cluster_id\")[\"unique_id\"].agg(list).reset_index()\n", + " )\n", + " match_list_df = match_list_df.rename(columns={\"unique_id\": \"duplicated\"})\n", + "\n", + " deduped_df = df.merge(\n", + " match_list_df,\n", + " left_on=\"unique_id\",\n", + " right_on=\"duplicated\",\n", + " how=\"left\"\n", + " )\n", + "\n", + " deduped_df[\"matching_id\"] = deduped_df[\"cluster_id\"]\n", + "\n", + "\n", + " deduped_df = deduped_df.drop(columns=[\"duplicated\", \"cluster_id\"])\n", + "\n", + " return deduped_df" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated to be 4.13e-05.\n", + "This means that amongst all possible pairwise record comparisons, one in 24,186.40 are expected to match. With 1,343,977,935 total possible comparisons, we expect a total of around 55,567.50 matching pairs\n", + "----- Estimating u probabilities using random sampling -----\n", + "\n", + "Estimated u probabilities using random sampling\n", + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - first_name (no m values are trained).\n", + " - last_name (no m values are trained).\n", + " - year (no m values are trained).\n", + " - month (no m values are trained).\n", + " - state (no m values are trained).\n", + "\n", + "----- Starting EM training session -----\n", + "\n", + "Estimating the m probabilities of the model by blocking on:\n", + "l.first_name = r.first_name and l.last_name = r.last_name and l.state = r.state and l.year = r.year and l.month = r.month\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - first_name\n", + " - last_name\n", + " - year\n", + " - month\n", + " - state\n", + "\n" + ] + }, + { + "ename": "SplinkException", + "evalue": "Error executing the following sql for table `__splink__m_u_counts`(__splink__m_u_counts_cae27a405):\n\n CREATE TABLE __splink__m_u_counts_cae27a405\n AS\n (WITH __splink__df_comparison_vectors as (select * from __splink__df_comparison_vectors_8e81f7a75), \n__splink__df_match_weight_parts as (\n select \"unique_id_l\",\"unique_id_r\" \n from __splink__df_comparison_vectors\n ), \n__splink__df_predict as (\n select\n log2(cast(1717.6657519007895 as float8) * ) as match_weight,\n CASE WHEN THEN 1.0 ELSE (cast(1717.6657519007895 as float8) * )/(1+(cast(1717.6657519007895 as float8) * )) END as match_probability,\n \"unique_id_l\",\"unique_id_r\" \n from __splink__df_match_weight_parts\n \n order by 1\n ) \n select 0 as comparison_vector_value,\n sum(match_probability * 1) /\n sum(1) as m_count,\n sum((1-match_probability) * 1) /\n sum(1) as u_count,\n '_probability_two_random_records_match' as output_column_name\n from __splink__df_predict\n )\n 
\n\nError was: Parser Error: syntax error at or near \")\"", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mParserException\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:713\u001b[0m, in \u001b[0;36mLinker._log_and_run_sql_execution\u001b[0;34m(self, final_sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m 712\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 713\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_sql_execution\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfinal_sql\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemplated_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mphysical_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 715\u001b[0m \u001b[38;5;66;03m# Parse our SQL through sqlglot to pretty print\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/duckdb/linker.py:225\u001b[0m, in \u001b[0;36mDuckDBLinker._run_sql_execution\u001b[0;34m(self, final_sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_run_sql_execution\u001b[39m(\u001b[38;5;28mself\u001b[39m, final_sql, templated_name, physical_name):\n\u001b[0;32m--> 225\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_con\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfinal_sql\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mParserException\u001b[0m: Parser Error: syntax error at or near \")\"", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mSplinkException\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msplink_dedupe\u001b[49m\u001b[43m(\u001b[49m\u001b[43melection_df\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msettings\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblocking\u001b[49m\u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[8], line 36\u001b[0m, in \u001b[0;36msplink_dedupe\u001b[0;34m(df, settings, blocking)\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[38;5;66;03m# Run expectation maximisation on each block\u001b[39;00m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m block \u001b[38;5;129;01min\u001b[39;00m blocking:\n\u001b[0;32m---> 36\u001b[0m \u001b[43mlinker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mestimate_parameters_using_expectation_maximisation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mblock\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 38\u001b[0m \u001b[38;5;66;03m# Predict matches\u001b[39;00m\n\u001b[1;32m 39\u001b[0m df_predict \u001b[38;5;241m=\u001b[39m linker\u001b[38;5;241m.\u001b[39mpredict()\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:1706\u001b[0m, in \u001b[0;36mLinker.estimate_parameters_using_expectation_maximisation\u001b[0;34m(self, blocking_rule, comparisons_to_deactivate, comparison_levels_to_reverse_blocking_rule, estimate_without_term_frequencies, fix_probability_two_random_records_match, 
fix_m_probabilities, fix_u_probabilities, populate_probability_two_random_records_match_from_trained_values)\u001b[0m\n\u001b[1;32m 1684\u001b[0m logger\u001b[38;5;241m.\u001b[39mwarning(\n\u001b[1;32m 1685\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mWARNING: \u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1686\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYou have provided comparisons_to_deactivate but not \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1692\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas an exact match.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1693\u001b[0m )\n\u001b[1;32m 1695\u001b[0m em_training_session \u001b[38;5;241m=\u001b[39m EMTrainingSession(\n\u001b[1;32m 1696\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 1697\u001b[0m blocking_rule,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1703\u001b[0m estimate_without_term_frequencies\u001b[38;5;241m=\u001b[39mestimate_without_term_frequencies,\n\u001b[1;32m 1704\u001b[0m )\n\u001b[0;32m-> 1706\u001b[0m \u001b[43mem_training_session\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_train\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1708\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_populate_m_u_from_trained_values()\n\u001b[1;32m 1710\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m populate_probability_two_random_records_match_from_trained_values:\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/em_training_session.py:197\u001b[0m, in \u001b[0;36mEMTrainingSession._train\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m EMTrainingException(\n\u001b[1;32m 182\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTraining rule \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbr_sql\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m resulted in no record pairs. 
\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 183\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis means that in the supplied data set \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 191\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthe number of comparisons that will be generated by a blocking rule.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 192\u001b[0m )\n\u001b[1;32m 194\u001b[0m \u001b[38;5;66;03m# Compute the new params, populating the paramters in the copied settings object\u001b[39;00m\n\u001b[1;32m 195\u001b[0m \u001b[38;5;66;03m# At this stage, we do not overwrite any of the parameters\u001b[39;00m\n\u001b[1;32m 196\u001b[0m \u001b[38;5;66;03m# in the original (main) setting object\u001b[39;00m\n\u001b[0;32m--> 197\u001b[0m \u001b[43mexpectation_maximisation\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcvv\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 199\u001b[0m rule \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_blocking_rule_for_training\u001b[38;5;241m.\u001b[39mblocking_rule_sql\n\u001b[1;32m 200\u001b[0m training_desc \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEM, blocked on: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrule\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/expectation_maximisation.py:256\u001b[0m, in \u001b[0;36mexpectation_maximisation\u001b[0;34m(em_training_session, df_comparison_vector_values)\u001b[0m\n\u001b[1;32m 254\u001b[0m df_params \u001b[38;5;241m=\u001b[39m linker\u001b[38;5;241m.\u001b[39m_execute_sql_pipeline([agreement_pattern_counts])\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 256\u001b[0m df_params \u001b[38;5;241m=\u001b[39m \u001b[43mlinker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_sql_pipeline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mdf_comparison_vector_values\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 257\u001b[0m param_records \u001b[38;5;241m=\u001b[39m df_params\u001b[38;5;241m.\u001b[39mas_pandas_dataframe()\n\u001b[1;32m 258\u001b[0m param_records \u001b[38;5;241m=\u001b[39m compute_proportions_for_new_parameters(param_records)\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:651\u001b[0m, in \u001b[0;36mLinker._execute_sql_pipeline\u001b[0;34m(self, input_dataframes, use_cache)\u001b[0m\n\u001b[1;32m 645\u001b[0m dataframe \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sql_to_splink_dataframe_checking_cache(\n\u001b[1;32m 646\u001b[0m sql_gen,\n\u001b[1;32m 647\u001b[0m output_tablename_templated,\n\u001b[1;32m 648\u001b[0m use_cache,\n\u001b[1;32m 649\u001b[0m )\n\u001b[1;32m 650\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m--> 651\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 652\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 653\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pipeline\u001b[38;5;241m.\u001b[39mreset()\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:645\u001b[0m, in \u001b[0;36mLinker._execute_sql_pipeline\u001b[0;34m(self, input_dataframes, 
use_cache)\u001b[0m\n\u001b[1;32m 642\u001b[0m output_tablename_templated \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pipeline\u001b[38;5;241m.\u001b[39mqueue[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39moutput_table_name\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 645\u001b[0m dataframe \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sql_to_splink_dataframe_checking_cache\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 646\u001b[0m \u001b[43m \u001b[49m\u001b[43msql_gen\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 647\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_tablename_templated\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 648\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 649\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 650\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 651\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:911\u001b[0m, in \u001b[0;36mLinker._sql_to_splink_dataframe_checking_cache\u001b[0;34m(self, sql, output_tablename_templated, use_cache)\u001b[0m\n\u001b[1;32m 908\u001b[0m \u001b[38;5;28mprint\u001b[39m(df_pd) \u001b[38;5;66;03m# noqa: T201\u001b[39;00m\n\u001b[1;32m 910\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 911\u001b[0m splink_dataframe \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_sql_against_backend\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 912\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_tablename_templated\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtable_name_hash\u001b[49m\n\u001b[1;32m 913\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 914\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_intermediate_table_cache\u001b[38;5;241m.\u001b[39mexecuted_queries\u001b[38;5;241m.\u001b[39mappend(splink_dataframe)\n\u001b[1;32m 916\u001b[0m splink_dataframe\u001b[38;5;241m.\u001b[39mcreated_by_splink \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/duckdb/linker.py:220\u001b[0m, in \u001b[0;36mDuckDBLinker._execute_sql_against_backend\u001b[0;34m(self, sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_delete_table_from_database(physical_name)\n\u001b[1;32m 215\u001b[0m sql \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;124mCREATE TABLE \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mphysical_name\u001b[38;5;132;01m}\u001b[39;00m\n\u001b[1;32m 217\u001b[0m \u001b[38;5;124mAS\u001b[39m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;124m(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msql\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\n\u001b[1;32m 219\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[0;32m--> 220\u001b[0m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_and_run_sql_execution\u001b[49m\u001b[43m(\u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemplated_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mphysical_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 222\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DuckDBDataFrame(templated_name, physical_name, \u001b[38;5;28mself\u001b[39m)\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:725\u001b[0m, in \u001b[0;36mLinker._log_and_run_sql_execution\u001b[0;34m(self, final_sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m 722\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 723\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[0;32m--> 725\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m SplinkException(\n\u001b[1;32m 726\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError executing the following sql for table \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 727\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m`\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtemplated_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m`(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mphysical_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m):\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mfinal_sql\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 728\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mError was: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 729\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n", + "\u001b[0;31mSplinkException\u001b[0m: Error executing the following sql for table `__splink__m_u_counts`(__splink__m_u_counts_cae27a405):\n\n CREATE TABLE __splink__m_u_counts_cae27a405\n AS\n (WITH __splink__df_comparison_vectors as (select * from __splink__df_comparison_vectors_8e81f7a75), \n__splink__df_match_weight_parts as (\n select \"unique_id_l\",\"unique_id_r\" \n from __splink__df_comparison_vectors\n ), \n__splink__df_predict as (\n select\n log2(cast(1717.6657519007895 as float8) * ) as match_weight,\n CASE WHEN THEN 1.0 ELSE (cast(1717.6657519007895 as float8) * )/(1+(cast(1717.6657519007895 as float8) * )) END as match_probability,\n \"unique_id_l\",\"unique_id_r\" \n from __splink__df_match_weight_parts\n \n order by 1\n ) \n select 0 as comparison_vector_value,\n sum(match_probability * 1) /\n sum(1) as m_count,\n sum((1-match_probability) * 1) /\n sum(1) as u_count,\n '_probability_two_random_records_match' as output_column_name\n from __splink__df_predict\n )\n \n\nError was: Parser Error: syntax error at or near \")\"" + ] + } + ], + "source": [ + "splink_dedupe(election_df, settings, blocking )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git 
a/notebooks/harvard_eda.ipynb b/notebooks/harvard_eda.ipynb new file mode 100644 index 0000000..5bd55a0 --- /dev/null +++ b/notebooks/harvard_eda.ipynb @@ -0,0 +1,1430 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Harvard Dataset EDA\n", + "The Harvard dataset, which contains information on individual candidates, aims to enrich the exiting individual dataset (`individuals_table.csv`) from various states. (However, as Sarah suggested, it's still uncertain whether a new table for candidates should be created. If so, it could include more variables.) For now, I've selected variables that could 1) help match individuals within the two datasets and 2) provide additional information in addition to the individual's table, such as election outcomes and candidate party affiliations.\n", + "\n", + "This Markdown document is designed to explain the rationale behind the selection of variables within the Harvard dataset and to describe the basic nature of these variables." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from utils.election.constants import HV_FILEPATH\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_38218/4064994762.py:1: UnicodeWarning: \n", + "One or more strings in the dta file could not be decoded using utf-8, and\n", + "so the fallback encoding of latin-1 is being used. This can happen when a file\n", + "has been incorrectly encoded by Stata or some other software. You should verify\n", + "the string values returned are correct.\n", + " hv_df = pd.read_stata(HV_FILEPATH)\n", + "/tmp/ipykernel_38218/4064994762.py:1: UnicodeWarning: \n", + "One or more strings in the dta file could not be decoded using utf-8, and\n", + "so the fallback encoding of latin-1 is being used. This can happen when a file\n", + "has been incorrectly encoded by Stata or some other software. You should verify\n", + "the string values returned are correct.\n", + " hv_df = pd.read_stata(HV_FILEPATH)\n" + ] + } + ], + "source": [ + "hv_df = pd.read_stata(HV_FILEPATH)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "HV_INDIVIDUAL_COLS = [\n", + " \"caseid\",\n", + " \"year\",\n", + " \"month\",\n", + " \"day\",\n", + " \"sab\",\n", + " \"cname\",\n", + " \"candid\",\n", + " \"cand\",\n", + " \"sen\",\n", + " \"partyz\",\n", + " \"partyt\",\n", + " \"outcome\",\n", + " \"vote\",\n", + " \"termz\",\n", + " \"last\",\n", + " \"first\",\n", + " \"v19_20171211\",\n", + " \"v19_20160217\"\n", + "\n", + "]\n", + "raw_df = hv_df[HV_INDIVIDUAL_COLS]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. Name columns (`v19_20171211`,`v19_20160217`,`cand`,\n", + "`last` and `first`) \n", + "`cand`\tStandardized Candidate Name \n", + "`v19_20171211`\tStandardized Candidate Name from December 11, 2017 \n", + "`v19_20160217`\tStandardized Candidate Name from February 17, 2016 \n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " v19_20171211 v19_20160217\n", + "8074 KAWASAKI, SCOTT \n", + "8075 SCATTERING \n", + "8076 HOLDAWAY, TRUNO N. L. \n", + "8077 THOMPSON, STEVE M. \n", + "8078 SCATTERING \n", + "... ... ...\n", + "378340 ANDERSON, JAMES LEE \n", + "378341 SCATTERING \n", + "378342 FORD, ROBERT \n", + "378343 SCOTT, CHARLES K. \n", + "378344 SCATTERING \n", + "\n", + "[20643 rows x 2 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# if two standardized candidate columns match\n", + "# What is the pattern for update, any correction of just update from NaN.\n", + "raw_df[\n", + " (raw_df[\"v19_20171211\"].str.strip() != raw_df[\"v19_20160217\"].str.strip()) &\n", + " ~raw_df[\"v19_20160217\"].isna()\n", + "][[\"v19_20171211\", \"v19_20160217\"]]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We thus know that v19_20171211 is a update of v19_20160217, there is no additional information within the v19_20160217 volumn. " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_38218/3528258281.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " raw_df[\"v19_20171211\"] = raw_df[\"v19_20171211\"].str.lower()\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [v19_20171211, cand, first, last]\n", + "Index: []" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_df[\"v19_20171211\"] = raw_df[\"v19_20171211\"].str.lower()\n", + "raw_df[\n", + " (raw_df[\"cand\"].str.strip() != raw_df[\"v19_20171211\"].str.strip()) & \n", + " raw_df[\"v19_20171211\"].isna()\n", + "][[\"v19_20171211\", \"cand\",\"first\",\"last\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So, there are two things to do 1) update the `cand` column that use the non-empty v19 values to replace the cand (full_name) column; 2) delete all rows with missingnames. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [caseid, year, month, day, sab, cname, candid, cand, sen, partyz, partyt, outcome, vote, termz, last, first, v19_20171211, v19_20160217]\n", + "Index: []" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_df[raw_df[\"cand\"].isna()]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_38218/2829064622.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " raw_df[\"cand\"] = np.where(raw_df[\"v19_20171211\"].notna(),raw_df[\"v19_20171211\"],raw_df[\"cand\"])\n" + ] + } + ], + "source": [ + "raw_df[\"cand\"] = np.where(raw_df[\"v19_20171211\"].notna(),raw_df[\"v19_20171211\"],raw_df[\"cand\"])\n", + "raw_df = raw_df[~raw_df[\"cand\"].str.startswith(\"namemissing\")]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [caseid, year, month, day, sab, cname, candid, cand, sen, partyz, partyt, outcome, vote, termz, last, first, v19_20171211, v19_20160217]\n", + "Index: []" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_df[(raw_df[\"last\"].isna() & raw_df[\"first\"].isna())]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. `sab` column (state) \n", + "We only need to uppercase every value here." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['AL' 'AK' 'AZ' 'AR' 'CA' 'CO' 'CT' 'DE' 'FL' 'GA' 'HI' 'ID' 'IL' 'IN'\n", + " 'IA' 'KS' 'KY' 'LA' 'ME' 'MD' 'MA' 'MI' 'MN' 'MS' 'MO' 'MT' 'NE' 'NV'\n", + " 'NH' 'NJ' 'NM' 'NY' 'NC' 'ND' 'OH' 'OK' 'OR' 'PA' 'RI' 'SC' 'SD' 'TN'\n", + " 'TX' 'UT' 'VT' 'VA' 'WA' 'WV' 'WI' 'WY']\n" + ] + } + ], + "source": [ + "raw_df[\"sab\"] = raw_df[\"sab\"].str.upper()\n", + "print(raw_df[\"sab\"].unique())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3. Candidate uniqueness?\n", + "In order to merge the datasets, we need to ensure the uniqueness of each individual. That is, name/id of each observation is unique. " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "candid\n", + "167521 100\n", + "168980 77\n", + "167044 73\n", + "165078 72\n", + "166643 66\n", + " ... \n", + "4421 1\n", + "4423 1\n", + "4603 1\n", + "4979 1\n", + "4978 1\n", + "Name: count, Length: 160405, dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# not unique\n", + "raw_df[\"candid\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " candid unique_partyt_count\n", + "7 8 2\n", + "20 21 2\n", + "29 30 2\n", + "52 53 2\n", + "58 60 2\n", + "... ... ...\n", + "153480 354159 2\n", + "153637 354938 2\n", + "155466 363848 2\n", + "155473 363876 2\n", + "155525 364001 2\n", + "\n", + "[4635 rows x 2 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unique_party_counts = raw_df.groupby(\"candid\")[\"partyt\"].nunique().reset_index(name=\"unique_partyt_count\")\n", + "\n", + "unique_party_counts[unique_party_counts[\"unique_partyt_count\"] > 1]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " caseid year month day sab cname candid cand sen partyz \\\n", + "6730 779 1986 11.0 4.0 AK 8 donley, dave 0 d \n", + "6841 783 1988 11.0 8.0 AK 8 donley, dave 0 d \n", + "6946 732 1990 11.0 6.0 AK 8 donley, dave 0 d \n", + "8544 90 1992 11.0 3.0 AK 8 donley, dave 1 d \n", + "8580 160 1994 11.0 8.0 AK 8 donley, dave 1 d \n", + "8620 161 1998 11.0 3.0 AK 8 donley, dave 1 r \n", + "8685 8 2002 11.0 5.0 AK 8 donley, dave 1 r \n", + "\n", + " partyt outcome vote termz last first v19_20171211 v19_20160217 \n", + "6730 d w 2985.0 2.0 donley dave donley, dave DONLEY, DAVE \n", + "6841 d w 4234.0 2.0 donley dave donley, dave DONLEY, DAVE \n", + "6946 d w 4081.0 2.0 donley dave donley, dave DONLEY, DAVE \n", + "8544 d w 5731.0 2.0 donley dave donley, dave DONLEY, DAVE \n", + "8580 d w 5209.0 4.0 donley dave donley, dave DONLEY, DAVE \n", + "8620 r w 8003.0 4.0 donley dave donley, dave DONLEY, DAVE \n", + "8685 r l 4666.0 2.0 donley dave donley, dave DONLEY, DAVE " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_df[raw_df[\"candid\"] == 8]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For one candidate, they may have several election records. They may have several election results at different counties. Throughout their career, they may have different parties." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4. Other variables" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`partyt` Assigns just one party to a candidate in one election season (i.e., the primary and general election in one year), using the same seven codes used in partyz. \n", + "\tFor example, a candidate running in NY with fusion as a Democrat and a Republican is assigned the party they are expected to caucus with in the state legislature, measured by how the end up caucusing in the state legislature. \n", + "\tFor example, a candidate who files in a state primary as a Democrat, and then is written in by voters in the Republican primary, has “d” designated as their “true” party. \n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "partyt\n", + "d 183902\n", + "r 151981\n", + "nonmaj 22189\n", + "writein 17574\n", + "nonpart 2431\n", + "partymiss 268\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_df[\"partyt\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [caseid, year, month, day, sab, cname, candid, cand, sen, partyz, partyt, outcome, vote, termz, last, first, v19_20171211, v19_20160217]\n", + "Index: []" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_df[raw_df[\"last\"].isna()]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "raw_df = raw_df[(raw_df[\"year\"] <= 2017) & (raw_df[\"year\"] >= 2014)]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "mini_df = pd.read_csv(\"/project/data/transformed/inds_mini.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "party\n", + "republican 2\n", + "democratic 1\n", + "DELETE 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mini_df[\"party\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We rename the party categories to align with the individual file." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " caseid year month day sab cname candid cand \\\n", + "0 321972.0 2014.0 8.0 26.0 AZ 10358.0 alston, lela \n", + "1 336363.0 2014.0 11.0 4.0 AZ maricopa 10358.0 alston, lela \n", + "2 361592.0 2016.0 11.0 8.0 AZ 10358.0 alston, lela \n", + "2537 361500.0 2016.0 11.0 8.0 AZ 361500.0 schmuck, frank \n", + "3239 321937.0 2014.0 8.0 26.0 AZ 295389.0 carter, heather \n", + "3240 336333.0 2014.0 11.0 4.0 AZ maricopa 295389.0 carter, heather \n", + "3241 361565.0 2016.0 11.0 8.0 AZ 295389.0 carter, heather \n", + "\n", + " sen partyz ... last_name full_name entity_type state \\\n", + "0 0.0 d ... NaN alston, lela candidate AZ \n", + "1 0.0 d ... NaN alston, lela candidate AZ \n", + "2 0.0 d ... NaN alston, lela candidate AZ \n", + "2537 1.0 r ... NaN schmuck, frank candidate AZ \n", + "3239 0.0 r ... NaN carter, heather candidate AZ \n", + "3240 0.0 r ... NaN carter, heather candidate AZ \n", + "3241 0.0 r ... NaN carter, heather candidate AZ \n", + "\n", + " party company occupation address zip city \n", + "0 democratic none (is a candidate) NaN NaN NaN NaN \n", + "1 democratic none (is a candidate) NaN NaN NaN NaN \n", + "2 democratic none (is a candidate) NaN NaN NaN NaN \n", + "2537 republican none (is a candidate) NaN NaN NaN NaN \n", + "3239 republican none (is a candidate) NaN NaN NaN NaN \n", + "3240 republican none (is a candidate) NaN NaN NaN NaN \n", + "3241 republican none (is a candidate) NaN NaN NaN NaN \n", + "\n", + "[7 rows x 31 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_df = raw_df.merge(mini_df, left_on=\"cand\", right_on=\"full_name\",how = \"right\")\n", + "test_df[test_df[\"cand\"].isna() == False]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2015 -2017 data\n", + "create a new table for elections who was runnning, foreign key to ind's table \n", + "what race they are \n", + "district, vote, results + # of votes -> how many more votes etc.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_38218/3301807390.py:1: DtypeWarning: Columns (7,8) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " ind_df = pd.read_csv(\"/project/output/transformed/individuals_table.csv\")\n" + ] + } + ], + "source": [ + "ind_df = pd.read_csv(\"/project/output/transformed/individuals_table.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [Unnamed: 0, id, first_name, last_name, full_name, entity_type, state, party, company]\n", + "Index: []" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ind_df[ind_df[\"full_name\"].isna()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "linkage -> file, to find match individual, for the election results -> look for potential match -> include in the election results. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "statefinancetransformer -> new class -> election result transformer \n", + "\n", + "1. cleaning the data \n", + "2. doing matching -> reuse the funciton in linkage \n", + "3. ONLY LAST NAME" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/output/README.md b/output/README.md index 42aa382..d25c32c 100644 --- a/output/README.md +++ b/output/README.md @@ -1,5 +1,7 @@ -# Output README +# Output --- -'deduplicated_UUIDs.csv' : Following record linkage work in the record_linkage pipeline, this file stores all the original uuids, and indicates the uuids to which the deduplicated uuids have been matched to. +`deduplicated_UUIDs.csv` : Following record linkage work in the record_linkage pipeline, this file stores all the original uuids, and indicates the uuids to which the deduplicated uuids have been matched to. -'network_metrics.txt' : Following the network graph creation, this file stores some summarizing metrics about the netowork including: 50 nodes of highest centrality (in-degree, out-degree, eigenvector, and betweenness), density, assortativity based on classification, and clustering. \ No newline at end of file +`network_metrics.txt` : Following the network graph creation, this file stores some summarizing metrics about the netowork including: 50 nodes of highest centrality (in-degree, out-degree, eigenvector, and betweenness), density, assortativity based on classification, and clustering. + +This folder gets populated with output files upon running the `make` commands. The final network visualization graph outputs and metrics are housed here. 
\ No newline at end of file diff --git a/output/network_graphs/macro_level.png b/output/network_graphs/macro_level.png new file mode 100644 index 0000000..aa1a3a8 Binary files /dev/null and b/output/network_graphs/macro_level.png differ diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..ebe0af8 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,7 @@ +### Scripts directory +* `election_linkage.py`: Script to run preprocessing, cleaning pipelin to build linkage for election data +* `transform_election_pipeline.py`: Script for running cleaning pipeline for election results +* `tx_transform_pipeline.py`: Script for running cleaning pipeline for Texas StateTransformer +* `tx_election_linkage_pipeline.py`: Script to run preprocessing, cleaning pipeline to build linkage for Texas election data +* `company_classification_pipeline.py`: Script to clean FFF data and InfoGroup data and merge to create a dataset of confidently classified companies. Deduping is performed on these datasets separately then they are merged. Creates a csv of the merged dataset: data/classification/merged_cleaned_company_classification.py +* `company_linkage_pipeline.py`: Script to perform record linkage on the confidently classified company dataset and a campaign finance dataset (currently using a testing subset with the Texas organizations dataset). Transforms the campaign finance data by creating a reference to the matching UUIDs in the classified company dataset. Output csv can be found in output/linked diff --git a/scripts/clean_classify_graph_pipeline.py b/scripts/clean_classify_graph_pipeline.py index bf4d29f..6b8112b 100644 --- a/scripts/clean_classify_graph_pipeline.py +++ b/scripts/clean_classify_graph_pipeline.py @@ -4,10 +4,10 @@ from utils.constants import BASE_FILEPATH from utils.linkage_and_network_pipeline import clean_data_and_build_network -transformed_data = BASE_FILEPATH / "data" / "transformed" +transformed_data = BASE_FILEPATH / "output" / "transformed" -organizations_table = pd.read_csv(transformed_data / "orgs_mini.csv") -individuals_table = pd.read_csv(transformed_data / "inds_mini.csv") -transactions_table = pd.read_csv(transformed_data / "trans_mini.csv") +organizations_table = pd.read_csv(transformed_data / "organizations_table.csv") +individuals_table = pd.read_csv(transformed_data / "individuals_table.csv") +transactions_table = pd.read_csv(transformed_data / "transactions_table.csv") clean_data_and_build_network(individuals_table, organizations_table, transactions_table) diff --git a/scripts/election_linkage.py b/scripts/election_linkage.py new file mode 100644 index 0000000..7fe1d09 --- /dev/null +++ b/scripts/election_linkage.py @@ -0,0 +1,13 @@ +"""Script to run preprocessing, cleaning pipelin to build linkage for election data""" + +import pandas as pd +from utils.constants import BASE_FILEPATH +from utils.election.linkage_pipeline import preprocess_data_and_create_table + +transformed_data = BASE_FILEPATH / "output" / "transformed" +cleaned_data = BASE_FILEPATH / "output" / "cleaned" + +ind_df = pd.read_csv(cleaned_data / "individuals_table.csv") +election_table = pd.read_csv(transformed_data / "election_results_table.csv") + +preprocess_data_and_create_table(election_table, ind_df) diff --git a/scripts/transform_election_pipeline.py b/scripts/transform_election_pipeline.py new file mode 100644 index 0000000..972b22a --- /dev/null +++ b/scripts/transform_election_pipeline.py @@ -0,0 +1,40 @@ +"""Script for running cleaning pipeline for election results""" 
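+# Usage sketch (for orientation only): the two optional flags below are the ones
+# this script defines, and the paths shown are simply its documented defaults.
+#
+#   python scripts/transform_election_pipeline.py \
+#       --input-directory data/raw \
+#       --output-directory output/transformed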
+ +import argparse + +from utils.constants import BASE_FILEPATH +from utils.election.election_pipeline import transform_and_merge + +parser = argparse.ArgumentParser() + +parser.add_argument( + "-i", + "--input-directory", + default=None, + help="Path to raw data directory. Default is 'data/raw' in repo root", +) +parser.add_argument( + "-o", + "--output-directory", + default=None, + help="Path to directory to save output. Default is 'output/transformed'", +) +args = parser.parse_args() + +if args.output_directory is None: + output_directory = BASE_FILEPATH / "output" / "transformed" +else: + output_directory = args.output_directory +if args.input_directory is None: + input_directory = BASE_FILEPATH / "data" / "raw" +else: + input_directory = args.input_directory +input_directory.mkdir(parents=True, exist_ok=True) +output_directory.mkdir(parents=True, exist_ok=True) + +election_results_output_path = output_directory / "election_results_table.csv" + +complete_election_resutls_table = transform_and_merge() + +complete_election_resutls_table.to_csv(election_results_output_path) +print(complete_election_resutls_table.dtypes) diff --git a/scripts/tx_election/tx_election_linkage_pipeline.py b/scripts/tx_election/tx_election_linkage_pipeline.py new file mode 100644 index 0000000..a24f44e --- /dev/null +++ b/scripts/tx_election/tx_election_linkage_pipeline.py @@ -0,0 +1,14 @@ +"""Script to run preprocessing, cleaning pipelin to build linkage for Texas election data""" + +import pandas as pd +from utils.constants import BASE_FILEPATH +from utils.election.tx_linkage_pipeline import preprocess_data_and_create_table + +transformed_data = BASE_FILEPATH / "output" / "transformed" + +ind_df = pd.read_csv(transformed_data / "tx_individuals_table.csv") +election_table = pd.read_csv(transformed_data / "election_results_table.csv") + +tx_election_table = election_table[election_table["state"] == "TX"] + +preprocess_data_and_create_table(tx_election_table, ind_df) diff --git a/scripts/tx_election/tx_transform_pipeline.py b/scripts/tx_election/tx_transform_pipeline.py new file mode 100644 index 0000000..4442c66 --- /dev/null +++ b/scripts/tx_election/tx_transform_pipeline.py @@ -0,0 +1,45 @@ +"""Script for running cleaning pipeline for Texas StateTransformer""" + +import argparse + +from utils.constants import BASE_FILEPATH +from utils.transform.tx_pipeline import transform_and_merge + +parser = argparse.ArgumentParser() + +parser.add_argument( + "-i", + "--input-directory", + default=None, + help="Path to raw data directory. Default is 'data/raw' in repo root", +) +parser.add_argument( + "-o", + "--output-directory", + default=None, + help="Path to directory to save output. 
Default is 'output/transformed'", +) +args = parser.parse_args() + +if args.output_directory is None: + output_directory = BASE_FILEPATH / "output" / "transformed" +else: + output_directory = args.output_directory +if args.input_directory is None: + input_directory = BASE_FILEPATH / "data" / "raw" +else: + input_directory = args.input_directory +input_directory.mkdir(parents=True, exist_ok=True) +output_directory.mkdir(parents=True, exist_ok=True) + +individuals_output_path = output_directory / "tx_individuals_table.csv" +organizations_output_path = output_directory / "tx_organizations_table.csv" +transactions_output_path = output_directory / "tx_transactions_table.csv" +( + complete_individuals_table, + complete_organizations_table, + complete_transactions_table, +) = transform_and_merge() +complete_individuals_table.to_csv(individuals_output_path) +complete_organizations_table.to_csv(organizations_output_path) +complete_transactions_table.to_csv(transactions_output_path) diff --git a/src/utils/README.md b/src/utils/README.md index ee64ef8..3c2efef 100644 --- a/src/utils/README.md +++ b/src/utils/README.md @@ -1,5 +1,25 @@ # Utils README --- +#### classify.py +1. These functions take in the deduplicated and cleaned individuals and organizations +dataframes from the deduplication and linkage pipeline. +2. We classify based on substrings known to indicate clean energy or fossil fuels groups. +In particular, individuals are classified based on their employment by fossil fuels companies, +and organizations are classified by their names, prioritizing high profile corporations/PACs +and those which were found by a manual search of the largest donors/recipients in the dataset + +#### constants.py +Declares constants to be used in various parts of the project. Specifies relative file paths and other static information to be used +uniformly across all code scripts. + +#### linkage.py +Performs record linkage across the different datasets, deduplicates records. + +#### network.py +Writes the code for building, visualizing, and analyzing network visualizations (both micro and macro level) as the final outputs. + +### linkage_and_network_pipeline.py +The module for running the final network visualization pipeline. Writes functions to call other relevant functions to build the networks from cleaned, transformed, and classified data. ## Michigan Utils: #### preprocess_mi_campaign_data.py @@ -91,4 +111,22 @@ These functions clean, standardize, and merge InfoGroup data. It cleans the larg These functions merge the FFF and Infogroup data and transforms the df to create a reference to InfoGroup's parent company UUID if found. +## Election Util: +#### Util function for harvard.py +1. extract_first_name + +#### harvard.py +1. preprocess +2. clean +3. standardize +4. create_table +5. create_election_result_uuid +6. clean_state +## Texas Util: +#### texas.py +1. preprocess +2. clean +3. standardize +4. create_tables +5. 
clean_state diff --git a/src/utils/election/clean.py b/src/utils/election/clean.py new file mode 100644 index 0000000..01e3f94 --- /dev/null +++ b/src/utils/election/clean.py @@ -0,0 +1,86 @@ +"""Abstract base class for transforming election into standard schema""" + +from abc import ABC, abstractmethod + +import pandas as pd + + +class ElectionResultTransformer(ABC): + """This abstract class is the one that all the election result cleaners will be built on + + Given a path to a directory with raw data from dataset, this class provides + the interface for: + - reading the data into pandas DatFrames + - deleting empty or clearly erroneous rows + - renaming / reshaping data to fit a single schema + - validating data to fit schema + - adding uuids based on the Individual table uuids + + The methods in this class are meant to be very conservative. Raw data should + not be modified, only transformed. Rows cannot be changed, only deleted in + obviously erroneous cases. + """ + + @abstractmethod + def preprocess(self, directory: str = None) -> pd.DataFrame: + """Preprocesses the election data and returns a dataframe + + Reads in the election data, makes any necessary bug fixes, and + combines the data into a list of DataFrames, discards data not in schema + + Inputs: + directory: absolute path to a directory with relevant election data. + defined per dataframe. + + Returns: + One dataframe with all relevant information + """ + pass + + @abstractmethod + def clean(self, data: pd.DataFrame) -> pd.DataFrame: + """Cleans the dataframe as needed and returns the dataframe + + Cleans the columns, converts dtypes to match database schema, and drops + rows not representing minimal viable transactions + + Inputs: + data: Dataframe as output from preprocess method. + + Returns: Dataframe + """ + pass + + @abstractmethod + def standardize(self, data: pd.DataFrame) -> pd.DataFrame: + """Standardizes the dataframe into the necessary format for the schema + + Maps [] types and column names as defined in schema, adjust + and add UUIDs as necessary + + Inputs: + data: dataframe as outputted from clean method. + + Returns: Dataframe + """ + pass + + @abstractmethod + def create_table(self, data: pd.DataFrame) -> pd.DataFrame: + """Creates the election result table that has matched uuid with individual dataset + + Inputs: + data: Dataframe as output from standardize method. + + Returns: a table as defined in database schema + """ + pass + + @abstractmethod + def clean_state(self) -> pd.DataFrame: + """Runs the ElectionResultCleaner pipeline returning a cleaned dataframes + + Returns: cleans the state and returns the standardized table showing + the election results. 
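+
+        A typical concrete implementation simply chains the abstract steps above,
+        for example (a sketch mirroring the order used by HarvardTransformer
+        elsewhere in this branch):
+
+            raw_df = self.preprocess()
+            return self.create_table(self.standardize(self.clean(raw_df)))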
+ """ + pass diff --git a/src/utils/election/constants.py b/src/utils/election/constants.py new file mode 100644 index 0000000..d8675d2 --- /dev/null +++ b/src/utils/election/constants.py @@ -0,0 +1,80 @@ +"""Constants to be used in building up election pipeline.""" + +import numpy as np +import splink.duckdb.comparison_library as cl +import splink.duckdb.comparison_template_library as ctl +from splink.duckdb.blocking_rule_library import block_on +from utils.constants import BASE_FILEPATH + +HV_FILEPATH = BASE_FILEPATH / "data" / "raw" / "HV" / "196slers1967to2016_20180908.dta" + +INDIVIDUALS_FILEPATH = BASE_FILEPATH / "output" / "cleaned" / "individuals_table.csv" + +HV_INDIVIDUAL_COLS = [ + "year", + "month", + "day", + "sab", + "cname", + "ddez", + "dname", + "dno", + "geopost", + "mmdpost", + "candid", + "vote", + "termz", + "cand", + "sen", + "partyt", + "dseats", + "outcome", + "last", + "first", + "v19_20171211", +] + +type_mapping = { + "year": "int", + "month": "int", + "day": "int", + "state": "string", + "county": "string", + "district_designation_ballot": "string", + "district": "string", + "district_number": "int", + "geographic_post": "int", + "mmd_post": "int", + "candidate_id": "int", + "vote": "int", + "term": "float", + "full_name": "string", + "senate": "int", + "party": "string", + "district_seat_number": "int", + "outcome": "string", + "last_name": "string", + "first_name": "string", +} +party_map = {"d": "democratic", "r": "republican", "partymiss": np.nan} + +settings = { + "link_type": "dedupe_only", + "blocking_rules_to_generate_predictions": [ + block_on(["first_name", "last_name", "year", "month", "county"]), + ], + "comparisons": [ + ctl.name_comparison("first_name", term_frequency_adjustments=True), + ctl.name_comparison("last_name", term_frequency_adjustments=True), + cl.exact_match("year", term_frequency_adjustments=True), + cl.exact_match("month", term_frequency_adjustments=True), + cl.exact_match("county", term_frequency_adjustments=True), + ], + "retain_matching_columns": True, + "retain_intermediate_calculation_columns": True, + "max_iterations": 10, + "em_convergence": 0.01, +} +blocking = [ + "l.first_name = r.first_name and l.last_name = r.last_name and l.county = r.county and l.year = r.year and l.month = r.month" +] diff --git a/src/utils/election/election_linkage.py b/src/utils/election/election_linkage.py new file mode 100644 index 0000000..3ae08b6 --- /dev/null +++ b/src/utils/election/election_linkage.py @@ -0,0 +1,202 @@ +"""Module for performing record linakge on individual record and election results""" + +import warnings + +import pandas as pd +from splink.duckdb.linker import DuckDBLinker + + +def contains_custom_special_characters(s: str) -> bool: + """Check if a string contains special characters + + Inputs: Any input, expected to handle any type. + + Returns: bool + """ + if pd.isna(s): + return False + special_chars = "!@#$%^&*()" + s = str(s) + return any(char in special_chars for char in s) + + +def create_single_last_name(row: pd.Series) -> pd.Series: + """Create single_last_name column based on last name or full name columns + + Datasources present full names and last names in different ways. + Some contains middle names and some last names may have several words that they may contain words like "van" or "Mr." 
+    The get_likely_name function also fails to identify the last name from full name columns.
+    For more efficient and accurate matching, we create a column that holds the last word
+    of the full name / last name column.
+
+    Inputs: Row
+
+    Returns: Row with single_last_name column
+    """
+    last_name = str(row["last_name"])
+    if contains_custom_special_characters(last_name):
+        full_name = str(row["full_name"])
+        row["single_last_name"] = full_name.lower().strip().split()[-1]
+    else:
+        row["single_last_name"] = last_name.lower().strip().split()[-1]
+    return row
+
+
+def extract_first_name(full_name: str) -> str:
+    """Extracts and standardizes the first name from a full name string.
+
+    Assumes format: "LastName, FirstName (Nickname)" or "LastName, FirstName".
+    The result is returned in lower case.
+    The function is designed for the Harvard election result data.
+
+    Args:
+        full_name (str): A string containing the full name.
+
+    Returns:
+        str: The standardized first name in lower case.
+    """
+    full_name = str(full_name)
+    parts = full_name.split(",")
+    if len(parts) > 1:
+        first_name_part = parts[1].strip()
+        first_name = first_name_part.split("(")[0].strip()
+        return first_name.lower()
+    return ""
+
+
+def decide_foreign_key(
+    election_data: pd.DataFrame, ind_df: pd.DataFrame
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Keep only election records whose candidates exist in the individuals table and attach their ids
+
+    Inputs:
+        election_data: election result cleaned data
+        ind_df: cleaned individual table
+
+    Returns:
+        a table of election results with a new column of individual uuids,
+        and another table of duplicated individual records that are susceptible to inaccuracy
+    """
+    transform_ind_df = ind_df.copy()[["first_name", "single_last_name", "id", "state"]]
+    merged_data = election_data.merge(
+        transform_ind_df, on=["first_name", "single_last_name", "state"], how="inner"
+    ).rename(columns={"id": "candidate_uuid", "unique_id": "case_id"})
+    merged_data = merged_data.drop(["single_last_name"], axis=1)
+    transform_ind_df["is_duplicate"] = transform_ind_df.duplicated(
+        subset=["first_name", "single_last_name", "state"], keep=False
+    )
+    duplicated_id = transform_ind_df[transform_ind_df["is_duplicate"]]
+
+    duplicated_id = duplicated_id.merge(
+        merged_data[["candidate_uuid"]],
+        left_on="id",
+        right_on="candidate_uuid",
+        how="left",
+    )
+    duplicated_id["in_merged_data"] = duplicated_id["candidate_uuid"].notna()
+
+    # Drop the temporary 'candidate_uuid' column; drop is not in-place, so assign the result
+    duplicated_id = duplicated_id.drop(columns=["candidate_uuid"])
+    return merged_data, duplicated_id
+
+
+def manual_dedupe(tx_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Delete potentially duplicated records.
+
+    This function identifies and removes duplicated records based on the
+    combination of the 'first_name' and 'single_last_name' columns.
+
+    Inputs:
+        tx_df: pd.DataFrame - The input dataframe of Texas records.
+
+    Returns:
+        (pd.DataFrame, pd.DataFrame) - Two dataframes, one with deduplicated records
+        and one with the duplicated records.
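+
+    Example (illustrative sketch; names and ids are made up):
+
+        sample = pd.DataFrame(
+            {
+                "id": ["a1", "a2"],
+                "first_name": ["dave", "dave"],
+                "single_last_name": ["donley", "donley"],
+            }
+        )
+        deduped_df, dupes_df = manual_dedupe(sample)
+        # deduped_df keeps the first row; dupes_df holds the second row plus the
+        # id of the record it duplicates.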
+    """
+    duplicates = tx_df.duplicated(
+        subset=["first_name", "single_last_name"], keep="first"
+    )
+
+    deduped_df = tx_df[~duplicates]
+
+    duplicated_records_df = tx_df[duplicates].copy()
+
+    original_ids = tx_df[~duplicates][["first_name", "single_last_name", "id"]]
+
+    duplicated_records_df = duplicated_records_df.merge(
+        original_ids, on=["first_name", "single_last_name"], suffixes=("", "_original")
+    )
+
+    # the merge above adds the original record's id as 'id_original'; rename it
+    # and assign the result, since rename is not in-place
+    duplicated_records_df = duplicated_records_df.rename(
+        columns={"id_original": "original_unique_id"}
+    )
+
+    return deduped_df, duplicated_records_df
+
+
+# Note: splink_dedupe is not used in the current pipeline because it was too slow
+# and the existing data contains few duplicates
+def splink_dedupe(df: pd.DataFrame, settings: dict, blocking: list) -> pd.DataFrame:
+    """Use splink to deduplicate dataframe based on settings
+
+    Configuration settings and blocking rules can be found in constants.py as
+    settings and blocking
+
+    Uses the splink library which employs probabilistic matching for
+    record linkage
+    https://moj-analytical-services.github.io/splink/index.html
+
+    Args:
+        df: dataframe
+        settings: configuration settings
+            (based on splink documentation and dataframe columns)
+        blocking: list of columns to block on for the table
+            (cuts dataframe into parts based on columns labeled blocks)
+
+    Returns:
+        deduplicated version of initial dataframe with column 'matching_id'
+        that holds list of matching unique_ids
+    """
+    # Initialize the linker object
+    linker = DuckDBLinker(df, settings)
+
+    # Use warnings to manage any potential issues during processing
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")  # Capture all warnings
+
+        # Estimate probability that two random records match
+        linker.estimate_probability_two_random_records_match(blocking, recall=0.6)
+
+        # Check for warnings and stop if any are raised
+        if len(w) > 0:
+            print(
+                "Warning detected, stopping the process and returning original dataframe."
+ ) + return df + + # Estimate the parameter u using random sampling + linker.estimate_u_using_random_sampling(max_pairs=5e6) + + # Run expectation maximisation on each block + for block in blocking: + linker.estimate_parameters_using_expectation_maximisation(block) + + # Predict matches + df_predict = linker.predict() + + # Cluster predictions and threshold + clusters = linker.cluster_pairwise_predictions_at_threshold( + df_predict, threshold_match_probability=0.7 + ) + clusters_df = clusters.as_pandas_dataframe() + + match_list_df = ( + clusters_df.groupby("cluster_id")["unique_id"].agg(list).reset_index() + ) + match_list_df = match_list_df.rename(columns={"unique_id": "duplicated"}) + + deduped_df = df.merge( + match_list_df, left_on="unique_id", right_on="duplicated", how="left" + ) + + deduped_df["matching_id"] = deduped_df["cluster_id"] + + deduped_df = deduped_df.drop(columns=["duplicated", "cluster_id"]) + + return deduped_df diff --git a/src/utils/election/election_pipeline.py b/src/utils/election/election_pipeline.py new file mode 100644 index 0000000..0481fa9 --- /dev/null +++ b/src/utils/election/election_pipeline.py @@ -0,0 +1,34 @@ +"""Merge raw election data into standardized schema""" + +import pandas as pd +from utils.election.clean import ElectionResultTransformer +from utils.election.harvard import HarvardTransformer + +ALL_ELECTION_CLEANERS = [ + HarvardTransformer(), +] + + +def transform_and_merge( + election_cleaners: list[ElectionResultTransformer] = None, +) -> pd.DataFrame: + """From raw datafiles, clean, merge, and reformat election result data . + + Args: + election_cleaners: List of election cleaners to merge data from. If None, + will default to all election_cleaners + + Returns: + election result table + """ + if election_cleaners is None: + election_cleaners = ALL_ELECTION_CLEANERS + + single_source_election_tables = [] + for election_cleaner in election_cleaners: + print("Cleaning...") + (election_result_table) = election_cleaner.clean_state() + single_source_election_tables.append(election_result_table) + complete_election_result_table = pd.concat(single_source_election_tables) + + return complete_election_result_table diff --git a/src/utils/election/harvard.py b/src/utils/election/harvard.py new file mode 100644 index 0000000..a32ae26 --- /dev/null +++ b/src/utils/election/harvard.py @@ -0,0 +1,166 @@ +"""Election result transformer implementation for Harvard dataset""" + +import uuid + +import numpy as np +import pandas as pd +from utils.election.clean import ( + ElectionResultTransformer, +) +from utils.election.constants import ( + HV_FILEPATH, + HV_INDIVIDUAL_COLS, + party_map, + type_mapping, +) +from utils.election.utils import extract_first_name + + +class HarvardTransformer(ElectionResultTransformer): + """Based on the StateTransformer abstract class and cleans Harvard data""" + + def preprocess(self) -> pd.DataFrame: + """Turns filepath into a dataframe + + The raw dataverse file is in .dta format and we need to + turn it into a pandas readable file + + returns: a dataframe + """ + raw_df = pd.read_stata(HV_FILEPATH) + + return raw_df + + def clean(self, data: pd.DataFrame) -> pd.DataFrame: + """Cleans the dataframe as needed and returns the dataframe + + Cleans the columns, converts dtypes to match database schema, and drops + rows not representing minimal viable transactions + + Inputs: + data: Dataframe as output from preprocess method. 
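+
+        Note (summary of the filters applied below): rows outside the 2014-2016
+        window, "scattering" placeholder candidates, and rows missing a first or
+        last name are dropped.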
+
+        Returns: Dataframe
+        """
+        clean_df = data.copy(deep=True)
+        clean_df = clean_df[HV_INDIVIDUAL_COLS]
+        year_start = 2014
+        year_end = 2016
+        clean_df = clean_df[
+            (clean_df["year"] <= year_end) & (clean_df["year"] >= year_start)
+        ]
+
+        clean_df = clean_df[~(clean_df["last"] == "scattering")]
+        # the data is cleaned in the original dataset -- if the last or first name is missing,
+        # the full name is incomplete as well
+        # records could in principle still match on last/first name alone,
+        # but for simplicity and efficiency, rows with incomplete full names are dropped
+        clean_df = clean_df[~(clean_df["last"].isna())]
+        clean_df = clean_df[~(clean_df["first"].isna())]
+
+        clean_df.loc[clean_df["first"] == "", "first"] = clean_df["cand"].apply(
+            extract_first_name
+        )
+
+        clean_df["cand"] = np.where(
+            clean_df["v19_20171211"].notna(), clean_df["v19_20171211"], clean_df["cand"]
+        )
+        clean_df = clean_df.drop(["v19_20171211"], axis=1)
+
+        return clean_df
+
+    def standardize(self, data: pd.DataFrame) -> pd.DataFrame:
+        """Standardizes the dataframe into the necessary format for the schema
+
+        Maps data types and column names as defined in schema
+
+        Inputs:
+            data: dataframe as outputted from clean method.
+
+        Returns: Dataframe
+        """
+        # re-standardize the name and state columns
+        data["sab"] = data["sab"].str.upper()
+        data["last"] = data["last"].str.lower()
+        data["first"] = data["first"].str.lower()
+        data["cand"] = data["cand"].str.lower()
+        data["cand"] = data["cand"].astype(str)[data["cand"].notna()]
+
+        data["partyt"] = data["partyt"].map(party_map)
+
+        data = data.rename(
+            columns={
+                "ddez": "district_designation_ballot",
+                "dname": "district",
+                "dno": "district_number",
+                "geopost": "geographic_post",
+                "mmdpost": "mmd_post",
+                "cname": "county",
+                "sen": "senate",
+                "candid": "candidate_id",
+                "sab": "state",
+                "last": "last_name",
+                "first": "first_name",
+                "cand": "full_name",
+                "partyt": "party",
+                "termz": "term",
+                "dseats": "district_seat_number",
+            }
+        )
+
+        data["day"] = data["day"].fillna(0)
+        data["county"] = data["county"].fillna("Unknown")
+        data["district"] = data["district"].fillna("Unknown")
+        data["district_number"] = data["district_number"].fillna(0)
+        data["geographic_post"] = data["geographic_post"].fillna(0)
+        data["mmd_post"] = data["mmd_post"].fillna(0)
+        data["vote"] = data["vote"].fillna(0)
+        data["full_name"] = data["full_name"].fillna("Unknown")
+        data["party"] = data["party"].fillna("Unknown")
+
+        data = data.astype(type_mapping)
+        print("standardize result", data.dtypes)
+        return data
+
+    def create_table(self, data: pd.DataFrame) -> pd.DataFrame:
+        """Creates the election result table and creates uuids
+
+        Inputs:
+            data: Dataframe as output from standardize method.
+
+        Returns:
+            a table as defined in database schema
+        """
+        final_table = data.copy()
+
+        final_table = self.create_election_result_uuid(final_table)
+
+        return final_table
+
+    def create_election_result_uuid(self, data: pd.DataFrame) -> pd.DataFrame:
+        """Add uuid to each election result record
+
+        Inputs:
+            data: standardized dataframe
+
+        Returns:
+            A dataframe with a unique_id column added
+        """
+        data["unique_id"] = [uuid.uuid4() for _ in range(len(data))]
+
+        return data
+
+    def clean_state(self) -> pd.DataFrame:
+        """Runs the ElectionResultCleaner pipeline, returning a cleaned dataframe
+
+        Returns: the standardized table of election results for this state.
+        """
+        raw_df = self.preprocess()
+        clean_df = self.clean(raw_df)
+        standardized_df = self.standardize(clean_df)
+        return self.create_table(standardized_df)
diff --git a/src/utils/election/linkage_pipeline.py b/src/utils/election/linkage_pipeline.py
new file mode 100644
index 0000000..a7ddc91
--- /dev/null
+++ b/src/utils/election/linkage_pipeline.py
@@ -0,0 +1,67 @@
+"""Module for running election linkage pipeline"""
+
+import pandas as pd
+from utils.constants import BASE_FILEPATH
+from utils.election.election_linkage import (
+    create_single_last_name,
+    decide_foreign_key,
+    extract_first_name,
+)
+
+
+def preprocess_election_results(election_df: pd.DataFrame) -> pd.DataFrame:
+    """Preprocess and clean a dataframe of election results
+
+    Args:
+        election_df: dataframe of election results
+
+    Returns:
+        cleaned dataframe of election results ready for linkage
+    """
+    election_df = election_df.apply(create_single_last_name, axis=1)
+    election_df.loc[election_df["first_name"] == "", "first_name"] = election_df[
+        "full_name"
+    ].apply(extract_first_name)
+
+    return election_df
+
+
+def preprocess_cleaned_individuals(ind_df: pd.DataFrame) -> pd.DataFrame:
+    """Preprocess and clean the individuals table
+
+    Inputs:
+        ind_df: dataframe of the cleaned individuals table
+
+    Returns:
+        cleaned individuals dataframe ready for linkage
+    """
+    ind_df = ind_df.apply(create_single_last_name, axis=1)
+
+    return ind_df
+
+
+def preprocess_data_and_create_table(
+    election_df: pd.DataFrame, ind_df: pd.DataFrame
+) -> None:
+    """Clean data, link election results to individuals, and write the linked tables
+
+    Args:
+        election_df: election data
+        ind_df: individual table
+    """
+    election_df = preprocess_election_results(election_df)
+    ind_df = preprocess_cleaned_individuals(ind_df)
+
+    final_df, duplicated_id = decide_foreign_key(election_df, ind_df)
+
+    output_path = BASE_FILEPATH / "output" / "cleaned"
+    output_path.mkdir(exist_ok=True)
+    cleaned_election_output_path = (
+        BASE_FILEPATH / "output" / "cleaned" / "election_table.csv"
+    )
+    duplicated_output_path = (
+        BASE_FILEPATH / "output" / "cleaned" / "duplicated_ind_table.csv"
+    )
+
+    final_df.to_csv(cleaned_election_output_path, index=False)
+    duplicated_id.to_csv(duplicated_output_path, index=False)
diff --git a/src/utils/election/tx_linkage_pipeline.py b/src/utils/election/tx_linkage_pipeline.py
new file mode 100644
index 0000000..1fca8c4
--- /dev/null
+++ b/src/utils/election/tx_linkage_pipeline.py
@@ -0,0 +1,80 @@
+"""Module for running TX election linkage pipeline"""
+
+import pandas as pd
+from utils.constants import BASE_FILEPATH
+from utils.election.election_linkage import manual_dedupe
+from utils.election.linkage_pipeline import (
+    preprocess_election_results,
+)
+
+
+def decide_foreign_key(
+    election_data: pd.DataFrame, ind_df: pd.DataFrame
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Keep only election results whose individuals exist in the individuals table and attach their ids
+
+    Inputs:
+        election_data: election result cleaned data
+        ind_df: cleaned individual table
+
+    Returns:
+        a table of election results with a new column of individual uuids,
+        and another table of duplicated records susceptible to inaccuracy
+    """
+    transform_ind_df = ind_df.copy()[["first_name", "single_last_name", "id", "state"]]
+    merged_data = election_data.merge(
+        transform_ind_df, on=["first_name", "single_last_name", "state"], how="inner"
+    ).rename(columns={"id": "candidate_uuid", "unique_id": "case_id"})
+    merged_data = merged_data.drop(["single_last_name"], axis=1)
+    transform_ind_df["is_duplicate"] = transform_ind_df.duplicated(
+        subset=["first_name", "single_last_name", "state"], keep=False
+    )
+    duplicated_id = transform_ind_df[transform_ind_df["is_duplicate"]]
+
+    duplicated_id = duplicated_id.merge(
+        merged_data[["candidate_uuid"]],
+        left_on="id",
+        right_on="candidate_uuid",
+        how="left",
+    )
+    duplicated_id["in_merged_data"] = duplicated_id["candidate_uuid"].notna()
+
+    # Drop the temporary 'candidate_uuid' column from the duplicated_id DataFrame
+    duplicated_id = duplicated_id.drop(columns=["candidate_uuid"])
+    return merged_data, duplicated_id
+
+
+def preprocess_data_and_create_table(
+    election_df: pd.DataFrame, ind_df: pd.DataFrame
+) -> None:
+    """Clean data, link TX election results to individuals, and write the linked tables
+
+    Args:
+        election_df: election data
+        ind_df: individual table
+    """
+    ind_df, duplicated_tx_record = manual_dedupe(ind_df)
+
+    election_df = preprocess_election_results(election_df)
+    # ind_df would normally be preprocessed here, but since the last_name column contains
+    # no middle names, the last name is copied directly.
+    # ind_df = preprocess_cleaned_individuals(ind_df)
+    ind_df["single_last_name"] = ind_df["last_name"]
+
+    final_df, duplicated_id = decide_foreign_key(election_df, ind_df)
+
+    output_path = BASE_FILEPATH / "output" / "cleaned"
+    output_path.mkdir(exist_ok=True)
+    cleaned_election_output_path = (
+        BASE_FILEPATH / "output" / "cleaned" / "tx_election_table.csv"
+    )
+    duplicated_output_path = (
+        BASE_FILEPATH / "output" / "cleaned" / "tx_duplicated_ind_table.csv"
+    )
+
+    duplicated_election_record_path = (
+        BASE_FILEPATH / "output" / "cleaned" / "tx_duplicated_election_table.csv"
+    )
+
+    final_df.to_csv(cleaned_election_output_path, index=False)
+    duplicated_id.to_csv(duplicated_output_path, index=False)
+    duplicated_tx_record.to_csv(duplicated_election_record_path, index=False)
diff --git a/src/utils/election/utils.py b/src/utils/election/utils.py
new file mode 100644
index 0000000..ac4b156
--- /dev/null
+++ b/src/utils/election/utils.py
@@ -0,0 +1,38 @@
+"""Utilities for cleaning election results data"""
+
+import pandas as pd
+
+
+def create_single_last_name(row: pd.Series) -> pd.Series:
+    """Create single_last_name column for election result table
+
+    For more efficient matching, create a column that holds the last word of the full name
+
+    Inputs: Row
+
+    Returns: Row with single_last_name column
+    """
+    last_name = row["last"]
+    row["single_last_name"] = last_name.lower().strip().split()[-1]
+    return row
+
+
+def extract_first_name(full_name: str) -> str:
+    """Extracts and standardizes the first name from a full name string.
+
+    Assumes format: "LastName, FirstName (Nickname)" or "LastName, FirstName".
+    The result is returned in lower case.
+    The function is designed for the Harvard election result data
+
+    Args:
+        full_name (str): A string containing the full name.
+
+    Returns:
+        str: The standardized first name in lower case.
+    """
+    parts = full_name.split(",")
+    if len(parts) > 1:
+        first_name_part = parts[1].strip()
+        first_name = first_name_part.split("(")[0].strip()
+        return first_name.lower()
+    return ""
diff --git a/src/utils/ind_transform.py b/src/utils/ind_transform.py
new file mode 100644
index 0000000..f4ced52
--- /dev/null
+++ b/src/utils/ind_transform.py
@@ -0,0 +1,133 @@
+"""Module for standardizing transform_pipeline results"""
+
+import pandas as pd
+from nameparser import HumanName
+
+from utils.linkage import get_likely_name
+
+
+# this function needs to be optimized -- it currently takes too long to run
+def standardize_individual_names(row: pd.Series) -> pd.Series:
+    """Standardizes the name-related columns for individuals
+
+    Create/standardize name-related columns including first, last and full names.
+
+    Inputs: Row
+
+    Returns: Row (the function is used to "apply" to each row)
+    """
+    if pd.isna(row["first_name"]) and pd.notna(row["full_name"]):
+        name = HumanName(row["full_name"])
+        row["first_name"] = name.first.strip().upper()
+    if pd.isna(row["last_name"]) and pd.notna(row["full_name"]):
+        name = HumanName(row["full_name"])
+        row["last_name"] = name.last.strip().upper()
+
+    # Update full name based on first and last name
+    row["full_name"] = get_likely_name(
+        row["first_name"], row["last_name"], row["full_name"]
+    )
+    return row
+
+
+def standardize_last_name(row: pd.Series) -> pd.Series:
+    """Standardizes the last_name col for individuals based on name-related columns
+
+    Inputs: Row
+
+    Returns: Row (the function is used to "apply" to each row)
+    """
+    # there is no row with both last name and full name columns empty
+    if pd.isna(row["last_name"]):
+        name = HumanName(row["full_name"])
+        row["last_name"] = name.last.strip().lower()
+    else:
+        last_name = row["last_name"].strip().lower()
+        row["last_name"] = last_name
+    return row
+
+
+def contains_custom_special_characters(s: str) -> bool:
+    """Check if a string contains special characters
+
+    Inputs: String
+
+    Returns: bool
+    """
+    special_chars = "!@#$%^&*()"
+    return any(char in special_chars for char in s)
+
+
+def ind_table_create_single_last_name(row: pd.Series) -> pd.Series:
+    """Create single_last_name column for individual table
+
+    For more efficient matching, create a column that holds the last word of the full name
+
+    Inputs: Row
+
+    Returns: Row with single_last_name column
+    """
+    last_name = row["last_name"]
+    if contains_custom_special_characters(last_name):
+        full_name = row["full_name"]
+        row["single_last_name"] = full_name.lower().strip().split()[-1]
+    else:
+        row["single_last_name"] = last_name.lower().strip().split()[-1]
+    return row
+
+
+def election_table_create_single_last_name(row: pd.Series) -> pd.Series:
+    """Create single_last_name column for election result table
+
+    For more efficient matching, create a column that holds the last word of the full name
+
+    Inputs: Row
+
+    Returns: Row with single_last_name column
+    """
+    last_name = row["last"]
+    row["single_last_name"] = last_name.lower().strip().split()[-1]
+    return row
+
+
+# helper to flag rows whose single_last_name is unique
+def add_unique_lastname_column(
+    df_with_names: pd.DataFrame, lastname_col: str
+) -> pd.DataFrame:
+    """Adds a column to the DataFrame indicating if the last name is unique.
+ + Inputs: + dataframe (pd.DataFrame): The DataFrame containing the personal information. + lastname_col (str): The column name where the last names are stored. + + Returns: + pd.DataFrame: The original DataFrame with an additional 'is_unique_lastname' column. + """ + # Check the uniqueness of each last name + lastname_counts = df_with_names[lastname_col].value_counts() + # Map the counts back to the original dataframe to check if each lastname appears only once + df_with_names["is_unique_lastname"] = ( + df_with_names[lastname_col].map(lastname_counts) == 1 + ) + return df_with_names + + +def extract_first_name(full_name: str) -> str: + """Extracts and standardizes the first name from a full name string. + + Assumes format: "LastName, FirstName (Nickname)" or "LastName, FirstName". + The result is returned in lower case. + The function is designed for the harvard election result data + + Args: + full_name (str): A string containing the full name. + + Returns: + str: The standardized first name in lower case. + """ + parts = full_name.split(",") + if len(parts) > 1: + first_name_part = parts[1].strip() + first_name = first_name_part.split("(")[0].strip() + return first_name.lower() + return "" diff --git a/src/utils/linkage.py b/src/utils/linkage.py index d14f57c..11901e6 100644 --- a/src/utils/linkage.py +++ b/src/utils/linkage.py @@ -303,7 +303,7 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str: names = names.title().replace(" ", " ").split(" ") final_name = [] [final_name.append(x) for x in names if x not in final_name] - return " ".join(final_name).strip() + return " ".join(final_name).strip().strip().upper() def get_street_from_address_line_1(address_line_1: str) -> str: diff --git a/src/utils/linkage_and_network_pipeline.py b/src/utils/linkage_and_network_pipeline.py index 860cdb2..f4581f4 100644 --- a/src/utils/linkage_and_network_pipeline.py +++ b/src/utils/linkage_and_network_pipeline.py @@ -25,6 +25,8 @@ from utils.network import ( combine_datasets_for_network_graph, create_network_graph, + network_metrics, + plot_macro_level_graph, run_network_graph_pipeline, ) @@ -217,5 +219,14 @@ def clean_data_and_build_network( g = create_network_graph(aggreg_df) g_output_path = BASE_FILEPATH / "output" / "g.gml" nx.write_graphml(g, g_output_path) + centrality_metrics, communities = network_metrics(g) - run_network_graph_pipeline(2018, 2022, [individuals, organizations, transactions]) + # this creates the micro-level visualization which is + # stored in the output/network_graphs location + run_network_graph_pipeline(2018, 2023, [individuals, organizations, transactions]) + + # this creates the macro-level visualization - run this file in an interactive window in + # case the output figure is not displayed + plot_macro_level_graph( + g, communities, {"betweenness": nx.betweenness_centrality(g, weight="amount")} + ) diff --git a/src/utils/network.py b/src/utils/network.py index 467fbb4..570cb0f 100644 --- a/src/utils/network.py +++ b/src/utils/network.py @@ -1,12 +1,11 @@ -"""Buidling, visualizing, and analyzing networks (micro-level)""" +"""Buidling, visualizing, and analyzing networks""" import itertools from pathlib import Path import matplotlib.pyplot as plt import networkx as nx - -# import numpy as np +import numpy as np import pandas as pd import plotly.graph_objects as go @@ -92,7 +91,6 @@ def combine_datasets_for_network_graph(dfs: list[pd.DataFrame]) -> pd.DataFrame: return aggreg_df -# RETAINED def create_network_graph(df: pd.DataFrame) -> 
nx.MultiDiGraph: """Creates network with entities as nodes, transactions as edges @@ -130,8 +128,20 @@ def create_network_graph(df: pd.DataFrame) -> nx.MultiDiGraph: return G +# Note: dict calls retained due to conventions in visualization package + + def plot_network_graph(G: nx.MultiDiGraph, start_year: int, end_year: int) -> None: - """Creates a plotly visualization of the nodes and edges with arrows indicating direction, and colors indicating classification.""" + """Creates a plotly visualization of the nodes and edges with arrows indicating direction, and colors indicating classification. + + Args: + G: A Networkx MultiDiGraph with nodes and edges + start_year: starting year to begin subsetting the data to be used for the visualization + end_year: end year to finish subsetting the data to be used for the visualization + + Returns: + A Networkx MultiDiGraph with nodes and edges + """ pos = nx.spring_layout( G ) # position nodes using the spring layout - retained from original code @@ -288,17 +298,17 @@ def network_metrics(net_graph: nx.Graph) -> None: "density": density, } - # with Path("output/network_metrics.txt").open("w") as file: - # file.write(f"in degree centrality: {in_degree}\n") - # file.write(f"out degree centrality: {out_degree}\n") - # file.write(f"eigenvector centrality: {eigenvector}\n") - # file.write(f"betweenness centrality: {betweenness}\n\n") + with Path("output/network_metrics.txt").open("w") as file: + file.write(f"in degree centrality: {in_degree}\n") + file.write(f"out degree centrality: {out_degree}\n") + file.write(f"eigenvector centrality: {eigenvector}\n") + file.write(f"betweenness centrality: {betweenness}\n\n") - # file.write(f"assortativity based on 'classification': {assortativity}\n\n") + file.write(f"assortativity based on 'classification': {assortativity}\n\n") - # file.write(f"density': {density}\n\n") + file.write(f"density': {density}\n\n") - # file.write(f"communities where k = 5': {communities}\n\n") + file.write(f"communities where k = 5': {communities}\n\n") return metrics, communities @@ -328,14 +338,14 @@ def run_network_graph_pipeline( plot_network_graph(G, start_year, end_year) -# added for macro-level viz - Work in Progress +# added function for macro-level cluster viz def additional_network_metrics(G: nx.Graph) -> None: """Calculate and print additional network metrics Args: - G: network graph created + G: network graph created with edges and nodes Returns: - some metrics requried for clustering viz + prints some additional metrics that may be requried for clustering viz """ # switch the MultiDiGraph to DiGraph for computing simple_graph = nx.DiGraph(G) @@ -351,13 +361,6 @@ def additional_network_metrics(G: nx.Graph) -> None: print("Average Clustering Coefficient:", clustering_coeff) -# for testing -individuals = pd.read_csv("output/cleaned/individuals_table.csv") -organizations = pd.read_csv("output/cleaned/organizations_table.csv") -transactions = pd.read_csv("output/cleaned/transactions_table.csv") -run_network_graph_pipeline(2018, 2021, [individuals, organizations, transactions]) - - def plot_macro_level_graph( net_graph: nx.Graph, communities: list, centrality_metrics: list ) -> None: @@ -367,16 +370,19 @@ def plot_macro_level_graph( net_graph (nx.Graph): The networkx graph object. communities (list of lists): Each sublist contains nodes that form a community. centrality_metrics (dict): Dictionary containing various centrality measures. 
+ + Returns: + None, creates visualization """ pos = nx.spring_layout(net_graph) - plt.figure(figsize=(12, 8)) + plt.figure(figsize=(15, 8)) # mapping each node to its community - # community_map = { - # node: idx for idx, community in enumerate(communities) for node in community - # } - # obtaining colors for each community - # community_colors = np.array([community_map[node] for node in net_graph.nodes()]) + community_map = { + node: idx for idx, community in enumerate(communities) for node in community + } + # obtaining colors for each community for coloring of nodes + community_colors = np.array([community_map[node] for node in net_graph.nodes()]) # putting down nodes node_sizes = [ @@ -385,51 +391,44 @@ def plot_macro_level_graph( nx.draw_networkx_nodes( net_graph, pos, - # node_color=community_colors, + node_color=community_colors, node_size=node_sizes, - cmap=plt.cm.jet, + cmap=plt.get_cmap("viridis"), + ax=plt.gca(), alpha=0.7, ) # drawing edges nx.draw_networkx_edges(net_graph, pos, alpha=0.5) - # labels for high centrality nodes + # adding labels for high centrality nodes high_centrality_nodes = [ node for node in centrality_metrics["betweenness"] if centrality_metrics["betweenness"][node] > sorted(centrality_metrics["betweenness"].values())[-10] - ] # have to adjust threshold + ] # can adjust threshold here to display labels nx.draw_networkx_labels( net_graph, pos, labels={node: node for node in high_centrality_nodes}, font_size=10, ) + mapper = plt.cm.ScalarMappable(cmap=plt.cm.viridis) + ax = plt.gca() + plt.colorbar( + mapper, + ax=ax, + orientation="horizontal", + label="Community ID", + fraction=0.036, + pad=0.04, + ) - plt.title("Macro-Level Clustering View of Network Graph") - # plt.colorbar( - # plt.cm.ScalarMappable(cmap=plt.cm.jet), - # orientation="horizontal", - # label="Community ID", - # ) + plt.title("Macro-Level Clustering View of Network Graph", fontsize=16) plt.axis("off") + graphs_directory = Path("output/network_graphs") + graphs_directory.mkdir(parents=True, exist_ok=True) + filename = graphs_directory / f"macro_level_{centrality_metrics[0]}.png" + plt.savefig(str(filename)) plt.show() - - -# testing usage of macro level viz function - change paths if needed and RUN IN AN INTERACTIVE WINDOW TO DISPLAY GRAPH -# TODO: make default paths more robust -# TODO: move script to scripts directory -individuals = pd.read_csv("/project/output/cleaned/individuals_table.csv") -organizations = pd.read_csv("/project/output/cleaned/organizations_table.csv") -transactions = pd.read_csv("/project/output/cleaned/transactions_table.csv") - -aggreg_df = combine_datasets_for_network_graph( - [individuals, organizations, transactions] -) -G = create_network_graph(aggreg_df) -centrality_metrics, communities = network_metrics(G) -plot_macro_level_graph( - G, communities, {"betweenness": nx.betweenness_centrality(G, weight="amount")} -) diff --git a/src/utils/transform/constants.py b/src/utils/transform/constants.py index 8a12645..6ebedb9 100644 --- a/src/utils/transform/constants.py +++ b/src/utils/transform/constants.py @@ -192,7 +192,6 @@ "fundraiser", ] - # PA EDA constants: PA_SCHEMA_CHANGE_YEAR = 2022 @@ -493,6 +492,13 @@ "aggregate", ] +TX_IND_FILEPATH = BASE_FILEPATH / "data" / "TX_full" / "individuals_table_TX_14-16.csv" + +TX_ORG_FILEPATH = ( + BASE_FILEPATH / "data" / "TX_full" / "organizations_table_TX_14-16.csv" +) + +TX_TRA_FILEPATH = BASE_FILEPATH / "data" / "TX_full" / "transactions_table_TX_14-16.csv" state_abbreviations = [ " AK ", diff --git 
a/src/utils/transform/michigan.py b/src/utils/transform/michigan.py index c023c3c..0e3b0b1 100644 --- a/src/utils/transform/michigan.py +++ b/src/utils/transform/michigan.py @@ -6,6 +6,7 @@ import pandas as pd from utils.constants import BASE_FILEPATH +from utils.ind_transform import standardize_individual_names from utils.transform.clean import StateTransformer from utils.transform.constants import ( MI_CON_FILEPATH, @@ -639,6 +640,7 @@ def standardize_and_concatenate_individuals( "candidate_full_name": "full_name", } ) + individuals = individuals.apply(standardize_individual_names, axis=1) individuals_df = pd.concat( [ individuals, diff --git a/src/utils/transform/minnesota.py b/src/utils/transform/minnesota.py index 26eb2a7..71c7b66 100644 --- a/src/utils/transform/minnesota.py +++ b/src/utils/transform/minnesota.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +from utils.ind_transform import standardize_individual_names from utils.transform.clean import StateTransformer from utils.transform.constants import ( MN_CANDIDATE_CONTRIBUTION_COL, @@ -369,6 +370,7 @@ def create_tables( } ) ind_df = pd.concat([ind_recipient_df, ind_donor_df], ignore_index=True) + ind_df = ind_df.apply(standardize_individual_names, axis=1) # Create organization table from both recipient and donor entries org_recipient_df = pd.DataFrame( diff --git a/src/utils/transform/pennsylvania.py b/src/utils/transform/pennsylvania.py index 893d32e..872adb1 100644 --- a/src/utils/transform/pennsylvania.py +++ b/src/utils/transform/pennsylvania.py @@ -6,6 +6,7 @@ import pandas as pd from utils.constants import BASE_FILEPATH +from utils.ind_transform import standardize_individual_names from utils.transform import clean from utils.transform import constants as const @@ -227,6 +228,7 @@ def make_individuals_table(self, df: pd.DataFrame) -> pd.DataFrame: new_cols = ["first_name", "last_name", "company"] all_individuals = all_individuals.assign(**{col: None for col in new_cols}) all_individuals["state"] = "PA" + all_individuals = all_individuals.apply(standardize_individual_names, axis=1) return all_individuals diff --git a/src/utils/transform/pipeline.py b/src/utils/transform/pipeline.py index fe7c81b..4bc8e34 100644 --- a/src/utils/transform/pipeline.py +++ b/src/utils/transform/pipeline.py @@ -30,6 +30,7 @@ def transform_and_merge( """ if state_cleaners is None: state_cleaners = ALL_STATE_CLEANERS + single_state_individuals_tables = [] single_state_organizations_tables = [] single_state_transactions_tables = [] @@ -43,7 +44,6 @@ def transform_and_merge( single_state_individuals_tables.append(individuals_table) single_state_organizations_tables.append(organizations_table) single_state_transactions_tables.append(transactions_table) - complete_individuals_table = pd.concat(single_state_individuals_tables) complete_organizations_table = pd.concat(single_state_organizations_tables) complete_transactions_table = pd.concat(single_state_transactions_tables) diff --git a/src/utils/transform/texas.py b/src/utils/transform/texas.py new file mode 100644 index 0000000..454cda7 --- /dev/null +++ b/src/utils/transform/texas.py @@ -0,0 +1,162 @@ +"""State transformer implementation for Texas (for data in TX_full folder only)""" + +import numpy as np +import pandas as pd + +from utils.transform.clean import StateTransformer +from utils.transform.constants import TX_IND_FILEPATH + +# TX_ORG_FILEPATH, TX_TRA_FILEPATH +from utils.transform.utils import get_full_name + + +class TexasTransformer(StateTransformer): + """State transformer 
implementation for Texas in tx_full folder"""
+
+    name = "Texas"
+    stable_id_across_years = True
+    entity_name_dictrionary = {}
+
+    def preprocess(self) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+        """Turns filepath into dataframes
+
+        returns: dataframes of Texas
+        """
+        raw_ind_df = pd.read_csv(TX_IND_FILEPATH)
+        # raw_org_df = pd.read_csv(TX_ORG_FILEPATH)
+        # raw_tra_df = pd.read_csv(TX_TRA_FILEPATH)
+        # to save space, the actual org and transaction csv files are not read here
+        raw_org_df = pd.DataFrame()
+        raw_tra_df = pd.DataFrame()
+
+        return raw_ind_df, raw_org_df, raw_tra_df
+
+    def clean(
+        self,
+        raw_ind_df: pd.DataFrame,
+        raw_org_df: pd.DataFrame,
+        raw_tra_df: pd.DataFrame,
+    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+        """Cleans the dataframes as needed and returns the dataframes
+
+        Cleans the columns, converts dtypes to match database schema, and drops
+        rows not representing minimal viable transactions
+
+        Inputs:
+            raw_ind_df, raw_org_df, raw_tra_df: Dataframes as outputs from preprocess method.
+
+        Returns: Dataframes
+        """
+        clean_ind_df = raw_ind_df.copy(deep=True)
+
+        clean_ind_df["FULL_NAME"] = clean_ind_df.apply(
+            lambda row: get_full_name(
+                row["FIRST_NAME"], row["LAST_NAME"], row["FULL_NAME"]
+            ),
+            axis=1,
+        )
+        clean_ind_df = clean_ind_df.fillna("Unknown")
+
+        clean_ind_df["LAST_NAME"] = clean_ind_df["LAST_NAME"].str.lower().str.strip()
+        clean_ind_df["FIRST_NAME"] = (
+            clean_ind_df["FIRST_NAME"].str.lower().str.strip().str.split().str[0]
+        )
+        clean_ind_df["FULL_NAME"] = clean_ind_df["FULL_NAME"].str.lower().str.strip()
+        clean_ind_df["CITY"] = clean_ind_df["CITY"].str.lower().str.strip()
+
+        return clean_ind_df, raw_org_df, raw_tra_df
+
+    def standardize(
+        self, ind_df: pd.DataFrame, org_df: pd.DataFrame, tra_df: pd.DataFrame
+    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+        """Standardizes the dataframes into the necessary format for the schema
+
+        Inputs:
+            ind_df, org_df, tra_df: dataframes as outputted from the clean method.
+
+        Returns: a list of dataframes.
+        """
+        ind_df["address"] = np.nan
+        ind_df["party"] = np.nan
+        # needed for data merging; this belongs in the clean step, but the Texas data is not otherwise cleaned here
+        ind_df["single_last_name"] = ind_df["LAST_NAME"]
+
+        ind_df["address"] = ind_df.apply(
+            lambda row: get_full_name(
+                row["ADDRESS_LINE_1"], row["ADDRESS_LINE_2"], row["address"]
+            ),
+            axis=1,
+        )
+        ind_df = ind_df.drop(
+            [
+                "Unnamed: 0.1",
+                "Unnamed: 0",
+                "ADDRESS_LINE_1",
+                "ADDRESS_LINE_2",
+                "ENTITY_TYPE_SPECIFIC",
+                "ENTITY_TYPE",
+                "ORIGINAL_ID",
+            ],
+            axis=1,
+        )
+
+        ind_df = ind_df.rename(
+            columns={
+                "ID": "id",
+                "ENTITY_TYPE_GENERAL": "entity_type",
+                "FULL_NAME": "full_name",
+                "LAST_NAME": "last_name",
+                "FIRST_NAME": "first_name",
+                "CITY": "city",
+                "STATE": "state",
+                "ZIP_CODE": "zip",
+                "EMPLOYER": "company",
+                "OCCUPATION": "occupation",
+                ## these are the columns that are not in the current ind_table format
+                "PHONE_NUMBER": "phone_number",
+                "OFFICE_SOUGHT": "office_sought",
+                "DISTRICT": "district",
+            }
+        )
+
+        return ind_df, org_df, tra_df
+
+    def create_tables(
+        self, ind_df: pd.DataFrame, org_df: pd.DataFrame, tra_df: pd.DataFrame
+    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+        """Creates the Individuals, Organizations, and Transactions tables
+
+        Inputs:
+            data: a list of 1 or 3 dataframes as output from standardize method.
+ + Returns: (individuals_table, organizations_table, transactions_table) + tuple containing the tables as defined in database schema + """ + return ind_df, org_df, tra_df + + def clean_state(self) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """Runs the StateCleaner pipeline returning a tuple of cleaned dataframes + + Returns: use preprocess, clean, standardize, and create_tables methods + to output (individuals_table, organizations_table, transactions_table) + as defined in database schema + + Inputs: + filepaths_list: list of absolute filepaths to relevant state data. + required naming conventions, order, and extensions + defined per state. + + Returns: cleans the state and returns the standardized Inidividuals, + Organizations, and list of Transactions tables in the order: + [ind->ind, ind->org, org->ind, org->org] tables in a tuple + """ + raw_ind_df, raw_org_df, raw_tra_df = self.preprocess() + clean_ind_df, clean_org_df, clean_tra_df = self.clean( + raw_ind_df, raw_org_df, raw_tra_df + ) + sd_ind_df, sd_org_df, sd_tra_df = self.standardize( + clean_ind_df, clean_org_df, clean_tra_df + ) + ind_df, org_df, tra_df = self.create_tables(sd_ind_df, sd_org_df, sd_tra_df) + + return ind_df, org_df, tra_df diff --git a/src/utils/transform/tx_pipeline.py b/src/utils/transform/tx_pipeline.py new file mode 100644 index 0000000..260e84d --- /dev/null +++ b/src/utils/transform/tx_pipeline.py @@ -0,0 +1,48 @@ +"""Merge raw state campaign finance into standardized schema""" + +import pandas as pd + +from utils.transform.clean import StateTransformer +from utils.transform.texas import TexasTransformer + +ALL_STATE_CLEANERS = [ + TexasTransformer(), +] + + +def transform_and_merge( + state_cleaners: list[StateTransformer] = None, +) -> list[pd.DataFrame]: + """From raw datafiles, clean, merge, and reformat data from specified states. + + Args: + state_cleaners: List of state cleaners to merge data from. 
If None, + will default to all state_cleaners + + Returns: + list of individuals, organizations, and transactions tables + """ + if state_cleaners is None: + state_cleaners = ALL_STATE_CLEANERS + + single_state_individuals_tables = [] + single_state_organizations_tables = [] + single_state_transactions_tables = [] + for state_cleaner in state_cleaners: + print("Cleaning...") + ( + individuals_table, + organizations_table, + transactions_table, + ) = state_cleaner.clean_state() + single_state_individuals_tables.append(individuals_table) + single_state_organizations_tables.append(organizations_table) + single_state_transactions_tables.append(transactions_table) + complete_individuals_table = pd.concat(single_state_individuals_tables) + complete_organizations_table = pd.concat(single_state_organizations_tables) + complete_transactions_table = pd.concat(single_state_transactions_tables) + return ( + complete_individuals_table, + complete_organizations_table, + complete_transactions_table, + ) diff --git a/src/utils/transform/utils.py b/src/utils/transform/utils.py index 262cf5e..fc6447e 100644 --- a/src/utils/transform/utils.py +++ b/src/utils/transform/utils.py @@ -43,3 +43,15 @@ def remove_nonstandard(col: pd.Series) -> pd.Series: # turns oversized whitespace to single space return col + + +def get_full_name(first_name: str, last_name: str, full_name: str) -> str: + """Returns potential full name based on first_name and last_name column + + Input: First_name, last_name: strings of first name and last name + + Return: String of full name + """ + if pd.isna(full_name) or full_name.strip() == "": + return f"{first_name} {last_name}" + return full_name
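Usage sketch (illustrative only, not part of this patch): assuming the TX_full CSVs referenced in `constants.py` are in place, the new TX transform and linkage pipelines added above could be driven roughly as follows; the driver script itself is hypothetical and is not included in this diff.

# hypothetical driver script; module paths follow the files added in this patch
from utils.election.harvard import HarvardTransformer
from utils.election.tx_linkage_pipeline import preprocess_data_and_create_table
from utils.transform.tx_pipeline import transform_and_merge

# build the TX individuals table (org/transaction tables are empty placeholders here)
individuals, _organizations, _transactions = transform_and_merge()

# build the standardized 2014-2016 Harvard election result table
election_results = HarvardTransformer().clean_state()

# link election results to TX individuals and write the tx_*.csv files to output/cleaned
preprocess_data_and_create_table(election_results, individuals)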