From 3ac77b14bc348d52cee47db18bb91bb86ca03033 Mon Sep 17 00:00:00 2001 From: Kaya Lee Date: Mon, 6 May 2024 22:05:44 -0500 Subject: [PATCH] fixed linkage testing notebook and added docstring to classify_InfoGroup_data.py --- EDA_and_tests/linkage_test.ipynb | 58 +++------------------------- src/utils/classify_InfoGroup_data.py | 5 ++- 2 files changed, 9 insertions(+), 54 deletions(-) diff --git a/EDA_and_tests/linkage_test.ipynb b/EDA_and_tests/linkage_test.ipynb index 1255abc..355962d 100644 --- a/EDA_and_tests/linkage_test.ipynb +++ b/EDA_and_tests/linkage_test.ipynb @@ -564,60 +564,12 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Probability two random records match is estimated to be 0.0155.\n", - "This means that amongst all possible pairwise record comparisons, one in 64.67 are expected to match. With 1,212,903 total possible comparisons, we expect a total of around 18,756.67 matching pairs\n", - "----- Estimating u probabilities using random sampling -----\n", - "\n", - "Estimated u probabilities using random sampling\n", - "\n", - "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n", - "\n", - "----- Starting EM training session -----\n", - "\n", - "Estimating the m probabilities of the model by blocking on:\n", - "l.company_name = r.company_name\n", - "\n", - "Parameter estimates will be made for the following comparison(s):\n", - "\n", - "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - "\n" - ] - }, - { - "ename": "SplinkException", - "evalue": "Error executing the following sql for table `__splink__m_u_counts`(__splink__m_u_counts_d3d7043ef):\n\n CREATE TABLE __splink__m_u_counts_d3d7043ef\n AS\n (WITH __splink__df_comparison_vectors as (select * from __splink__df_comparison_vectors_db8087bba), \n__splink__df_match_weight_parts as (\n select \"unique_id_l\",\"unique_id_r\",\"zipcode_l\",\"zipcode_r\",\"company_name_l\",\"company_name_r\",match_key \n from __splink__df_comparison_vectors\n ), \n__splink__df_predict as (\n select\n log2(cast(0.015707176032864763 as float8) * ) as match_weight,\n CASE WHEN THEN 1.0 ELSE (cast(0.015707176032864763 as float8) * )/(1+(cast(0.015707176032864763 as float8) * )) END as match_probability,\n \"unique_id_l\",\"unique_id_r\",\"zipcode_l\",\"zipcode_r\",\"company_name_l\",\"company_name_r\",match_key \n from __splink__df_match_weight_parts\n \n order by 1\n ) \n select 0 as comparison_vector_value,\n sum(match_probability * 1) /\n sum(1) as m_count,\n sum((1-match_probability) * 1) /\n sum(1) as u_count,\n '_probability_two_random_records_match' as output_column_name\n from __splink__df_predict\n )\n \n\nError was: Parser Error: syntax error at or near \")\"", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mParserException\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:713\u001b[0m, in \u001b[0;36mLinker._log_and_run_sql_execution\u001b[0;34m(self, final_sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m 712\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 713\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_sql_execution\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfinal_sql\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemplated_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mphysical_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 715\u001b[0m \u001b[38;5;66;03m# Parse our SQL through sqlglot to pretty print\u001b[39;00m\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/duckdb/linker.py:225\u001b[0m, in \u001b[0;36mDuckDBLinker._run_sql_execution\u001b[0;34m(self, final_sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_run_sql_execution\u001b[39m(\u001b[38;5;28mself\u001b[39m, final_sql, templated_name, physical_name):\n\u001b[0;32m--> 225\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_con\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfinal_sql\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[0;31mParserException\u001b[0m: Parser Error: syntax error at or near \")\"", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mSplinkException\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[35], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msplink_dedupe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msettings\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43morganizations_settings\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblocking\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43morganizations_blocking\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/project/src/utils/linkage.py:578\u001b[0m, in \u001b[0;36msplink_dedupe\u001b[0;34m(df, settings, blocking)\u001b[0m\n\u001b[1;32m 575\u001b[0m linker\u001b[38;5;241m.\u001b[39mestimate_u_using_random_sampling(max_pairs\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5e6\u001b[39m)\n\u001b[1;32m 577\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m blocking:\n\u001b[0;32m--> 578\u001b[0m \u001b[43mlinker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mestimate_parameters_using_expectation_maximisation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 580\u001b[0m df_predict \u001b[38;5;241m=\u001b[39m linker\u001b[38;5;241m.\u001b[39mpredict()\n\u001b[1;32m 581\u001b[0m clusters \u001b[38;5;241m=\u001b[39m linker\u001b[38;5;241m.\u001b[39mcluster_pairwise_predictions_at_threshold(\n\u001b[1;32m 582\u001b[0m df_predict, threshold_match_probability\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.7\u001b[39m\n\u001b[1;32m 583\u001b[0m ) \u001b[38;5;66;03m# default\u001b[39;00m\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:1706\u001b[0m, in \u001b[0;36mLinker.estimate_parameters_using_expectation_maximisation\u001b[0;34m(self, blocking_rule, comparisons_to_deactivate, comparison_levels_to_reverse_blocking_rule, estimate_without_term_frequencies, fix_probability_two_random_records_match, fix_m_probabilities, fix_u_probabilities, populate_probability_two_random_records_match_from_trained_values)\u001b[0m\n\u001b[1;32m 1684\u001b[0m logger\u001b[38;5;241m.\u001b[39mwarning(\n\u001b[1;32m 1685\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mWARNING: \u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1686\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYou have provided comparisons_to_deactivate but not \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1692\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas an exact match.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1693\u001b[0m )\n\u001b[1;32m 1695\u001b[0m em_training_session \u001b[38;5;241m=\u001b[39m EMTrainingSession(\n\u001b[1;32m 1696\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 1697\u001b[0m blocking_rule,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1703\u001b[0m estimate_without_term_frequencies\u001b[38;5;241m=\u001b[39mestimate_without_term_frequencies,\n\u001b[1;32m 1704\u001b[0m )\n\u001b[0;32m-> 1706\u001b[0m \u001b[43mem_training_session\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_train\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1708\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_populate_m_u_from_trained_values()\n\u001b[1;32m 1710\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m populate_probability_two_random_records_match_from_trained_values:\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/em_training_session.py:197\u001b[0m, in \u001b[0;36mEMTrainingSession._train\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m EMTrainingException(\n\u001b[1;32m 182\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTraining rule \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbr_sql\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m resulted in no record pairs. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 183\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis means that in the supplied data set \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 191\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthe number of comparisons that will be generated by a blocking rule.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 192\u001b[0m )\n\u001b[1;32m 194\u001b[0m \u001b[38;5;66;03m# Compute the new params, populating the paramters in the copied settings object\u001b[39;00m\n\u001b[1;32m 195\u001b[0m \u001b[38;5;66;03m# At this stage, we do not overwrite any of the parameters\u001b[39;00m\n\u001b[1;32m 196\u001b[0m \u001b[38;5;66;03m# in the original (main) setting object\u001b[39;00m\n\u001b[0;32m--> 197\u001b[0m \u001b[43mexpectation_maximisation\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcvv\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 199\u001b[0m rule \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_blocking_rule_for_training\u001b[38;5;241m.\u001b[39mblocking_rule_sql\n\u001b[1;32m 200\u001b[0m training_desc \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEM, blocked on: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrule\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/expectation_maximisation.py:256\u001b[0m, in \u001b[0;36mexpectation_maximisation\u001b[0;34m(em_training_session, df_comparison_vector_values)\u001b[0m\n\u001b[1;32m 254\u001b[0m df_params \u001b[38;5;241m=\u001b[39m linker\u001b[38;5;241m.\u001b[39m_execute_sql_pipeline([agreement_pattern_counts])\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 256\u001b[0m df_params \u001b[38;5;241m=\u001b[39m \u001b[43mlinker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_sql_pipeline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mdf_comparison_vector_values\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 257\u001b[0m param_records \u001b[38;5;241m=\u001b[39m df_params\u001b[38;5;241m.\u001b[39mas_pandas_dataframe()\n\u001b[1;32m 258\u001b[0m param_records \u001b[38;5;241m=\u001b[39m compute_proportions_for_new_parameters(param_records)\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:651\u001b[0m, in \u001b[0;36mLinker._execute_sql_pipeline\u001b[0;34m(self, input_dataframes, use_cache)\u001b[0m\n\u001b[1;32m 645\u001b[0m dataframe \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sql_to_splink_dataframe_checking_cache(\n\u001b[1;32m 646\u001b[0m sql_gen,\n\u001b[1;32m 647\u001b[0m output_tablename_templated,\n\u001b[1;32m 648\u001b[0m use_cache,\n\u001b[1;32m 649\u001b[0m )\n\u001b[1;32m 650\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m--> 651\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 652\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 653\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pipeline\u001b[38;5;241m.\u001b[39mreset()\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:645\u001b[0m, in \u001b[0;36mLinker._execute_sql_pipeline\u001b[0;34m(self, input_dataframes, use_cache)\u001b[0m\n\u001b[1;32m 642\u001b[0m output_tablename_templated \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pipeline\u001b[38;5;241m.\u001b[39mqueue[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39moutput_table_name\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 645\u001b[0m dataframe \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sql_to_splink_dataframe_checking_cache\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 646\u001b[0m \u001b[43m \u001b[49m\u001b[43msql_gen\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 647\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_tablename_templated\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 648\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 649\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 650\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 651\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:911\u001b[0m, in \u001b[0;36mLinker._sql_to_splink_dataframe_checking_cache\u001b[0;34m(self, sql, output_tablename_templated, use_cache)\u001b[0m\n\u001b[1;32m 908\u001b[0m \u001b[38;5;28mprint\u001b[39m(df_pd) \u001b[38;5;66;03m# noqa: T201\u001b[39;00m\n\u001b[1;32m 910\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 911\u001b[0m splink_dataframe \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_sql_against_backend\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 912\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_tablename_templated\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtable_name_hash\u001b[49m\n\u001b[1;32m 913\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 914\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_intermediate_table_cache\u001b[38;5;241m.\u001b[39mexecuted_queries\u001b[38;5;241m.\u001b[39mappend(splink_dataframe)\n\u001b[1;32m 916\u001b[0m splink_dataframe\u001b[38;5;241m.\u001b[39mcreated_by_splink \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/duckdb/linker.py:220\u001b[0m, in \u001b[0;36mDuckDBLinker._execute_sql_against_backend\u001b[0;34m(self, sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_delete_table_from_database(physical_name)\n\u001b[1;32m 215\u001b[0m sql \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;124mCREATE TABLE \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mphysical_name\u001b[38;5;132;01m}\u001b[39;00m\n\u001b[1;32m 217\u001b[0m \u001b[38;5;124mAS\u001b[39m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;124m(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msql\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\n\u001b[1;32m 219\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[0;32m--> 220\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_and_run_sql_execution\u001b[49m\u001b[43m(\u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemplated_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mphysical_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 222\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DuckDBDataFrame(templated_name, physical_name, \u001b[38;5;28mself\u001b[39m)\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:725\u001b[0m, in \u001b[0;36mLinker._log_and_run_sql_execution\u001b[0;34m(self, final_sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m 722\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 723\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[0;32m--> 725\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m SplinkException(\n\u001b[1;32m 726\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError executing the following sql for table \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 727\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m`\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtemplated_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m`(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mphysical_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m):\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mfinal_sql\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 728\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mError was: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 729\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n", - "\u001b[0;31mSplinkException\u001b[0m: Error executing the following sql for table `__splink__m_u_counts`(__splink__m_u_counts_d3d7043ef):\n\n CREATE TABLE __splink__m_u_counts_d3d7043ef\n AS\n (WITH __splink__df_comparison_vectors as (select * from __splink__df_comparison_vectors_db8087bba), \n__splink__df_match_weight_parts as (\n select \"unique_id_l\",\"unique_id_r\",\"zipcode_l\",\"zipcode_r\",\"company_name_l\",\"company_name_r\",match_key \n from __splink__df_comparison_vectors\n ), \n__splink__df_predict as (\n select\n log2(cast(0.015707176032864763 as float8) * ) as match_weight,\n CASE WHEN THEN 1.0 ELSE (cast(0.015707176032864763 as float8) * )/(1+(cast(0.015707176032864763 as float8) * )) END as match_probability,\n \"unique_id_l\",\"unique_id_r\",\"zipcode_l\",\"zipcode_r\",\"company_name_l\",\"company_name_r\",match_key \n from __splink__df_match_weight_parts\n \n order by 1\n ) \n select 0 as comparison_vector_value,\n sum(match_probability * 1) /\n sum(1) as m_count,\n sum((1-match_probability) * 1) /\n sum(1) as u_count,\n '_probability_two_random_records_match' as output_column_name\n from __splink__df_predict\n )\n \n\nError was: Parser Error: syntax error at or near \")\"" - ] - } - ], + "outputs": [], "source": [ - "splink_dedupe(data, settings=organizations_settings, blocking = organizations_blocking)" + "# currently throwing an error \n", + "#splink_dedupe(data, settings=organizations_settings, blocking = organizations_blocking)" ] }, { @@ -644,7 +596,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.6" + "version": "3.11.2" } }, "nbformat": 4, diff --git a/src/utils/classify_InfoGroup_data.py b/src/utils/classify_InfoGroup_data.py index de6551d..62bfa75 100644 --- a/src/utils/classify_InfoGroup_data.py +++ b/src/utils/classify_InfoGroup_data.py @@ -1,3 +1,5 @@ +"""Script to get relevant InfoGroup data for classification in the pipeline""" + import re # TODO: #92 Make orgs classification script into more well-defined pipeline @@ -100,10 +102,11 @@ def prepare_infogroup_data( Args: infogroup_csv: the InfoGroup csv file SIC6_codes_df: DataFrame of the relevant SIC6 codes w/ corresponding regex codes and descriptions + output_file_path: the resulting df will be written as a csv to this file path location testing: Boolean - True if code is being tested on only several chunks, False if whole InfoGroup csv should be used chunksize: the number of rows per chunk in the IG dataset (default 10,000 but can be changed for testing purposes) num_testing_chunks: number of chunks to iterate through when testing = True - output_file_path: the resulting df will be written as a csv to this file path location + Returns: a DataFrame with information for only the relevant companies from the InfoGroup dataset that is formatted in the same schema as the aggregated company df for downstream