Skip to content

Commit

Permalink
fixed linkage testing notebook and added docstring to classify_InfoGr…
Browse files Browse the repository at this point in the history
…oup_data.py
  • Loading branch information
klee2020 committed May 7, 2024
1 parent 961f3f8 commit 3ac77b1
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 54 deletions.
58 changes: 5 additions & 53 deletions EDA_and_tests/linkage_test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -564,60 +564,12 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Probability two random records match is estimated to be 0.0155.\n",
"This means that amongst all possible pairwise record comparisons, one in 64.67 are expected to match. With 1,212,903 total possible comparisons, we expect a total of around 18,756.67 matching pairs\n",
"----- Estimating u probabilities using random sampling -----\n",
"\n",
"Estimated u probabilities using random sampling\n",
"\n",
"Your model is fully trained. All comparisons have at least one estimate for their m and u values\n",
"\n",
"----- Starting EM training session -----\n",
"\n",
"Estimating the m probabilities of the model by blocking on:\n",
"l.company_name = r.company_name\n",
"\n",
"Parameter estimates will be made for the following comparison(s):\n",
"\n",
"Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
"\n"
]
},
{
"ename": "SplinkException",
"evalue": "Error executing the following sql for table `__splink__m_u_counts`(__splink__m_u_counts_d3d7043ef):\n\n CREATE TABLE __splink__m_u_counts_d3d7043ef\n AS\n (WITH __splink__df_comparison_vectors as (select * from __splink__df_comparison_vectors_db8087bba), \n__splink__df_match_weight_parts as (\n select \"unique_id_l\",\"unique_id_r\",\"zipcode_l\",\"zipcode_r\",\"company_name_l\",\"company_name_r\",match_key \n from __splink__df_comparison_vectors\n ), \n__splink__df_predict as (\n select\n log2(cast(0.015707176032864763 as float8) * ) as match_weight,\n CASE WHEN THEN 1.0 ELSE (cast(0.015707176032864763 as float8) * )/(1+(cast(0.015707176032864763 as float8) * )) END as match_probability,\n \"unique_id_l\",\"unique_id_r\",\"zipcode_l\",\"zipcode_r\",\"company_name_l\",\"company_name_r\",match_key \n from __splink__df_match_weight_parts\n \n order by 1\n ) \n select 0 as comparison_vector_value,\n sum(match_probability * 1) /\n sum(1) as m_count,\n sum((1-match_probability) * 1) /\n sum(1) as u_count,\n '_probability_two_random_records_match' as output_column_name\n from __splink__df_predict\n )\n \n\nError was: Parser Error: syntax error at or near \")\"",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mParserException\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:713\u001b[0m, in \u001b[0;36mLinker._log_and_run_sql_execution\u001b[0;34m(self, final_sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m 712\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 713\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_sql_execution\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfinal_sql\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemplated_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mphysical_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 715\u001b[0m \u001b[38;5;66;03m# Parse our SQL through sqlglot to pretty print\u001b[39;00m\n",
"File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/duckdb/linker.py:225\u001b[0m, in \u001b[0;36mDuckDBLinker._run_sql_execution\u001b[0;34m(self, final_sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_run_sql_execution\u001b[39m(\u001b[38;5;28mself\u001b[39m, final_sql, templated_name, physical_name):\n\u001b[0;32m--> 225\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_con\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfinal_sql\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mParserException\u001b[0m: Parser Error: syntax error at or near \")\"",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mSplinkException\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[35], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msplink_dedupe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msettings\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43morganizations_settings\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblocking\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43morganizations_blocking\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/project/src/utils/linkage.py:578\u001b[0m, in \u001b[0;36msplink_dedupe\u001b[0;34m(df, settings, blocking)\u001b[0m\n\u001b[1;32m 575\u001b[0m linker\u001b[38;5;241m.\u001b[39mestimate_u_using_random_sampling(max_pairs\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5e6\u001b[39m)\n\u001b[1;32m 577\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m blocking:\n\u001b[0;32m--> 578\u001b[0m \u001b[43mlinker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mestimate_parameters_using_expectation_maximisation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 580\u001b[0m df_predict \u001b[38;5;241m=\u001b[39m linker\u001b[38;5;241m.\u001b[39mpredict()\n\u001b[1;32m 581\u001b[0m clusters \u001b[38;5;241m=\u001b[39m linker\u001b[38;5;241m.\u001b[39mcluster_pairwise_predictions_at_threshold(\n\u001b[1;32m 582\u001b[0m df_predict, threshold_match_probability\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.7\u001b[39m\n\u001b[1;32m 583\u001b[0m ) \u001b[38;5;66;03m# default\u001b[39;00m\n",
"File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:1706\u001b[0m, in \u001b[0;36mLinker.estimate_parameters_using_expectation_maximisation\u001b[0;34m(self, blocking_rule, comparisons_to_deactivate, comparison_levels_to_reverse_blocking_rule, estimate_without_term_frequencies, fix_probability_two_random_records_match, fix_m_probabilities, fix_u_probabilities, populate_probability_two_random_records_match_from_trained_values)\u001b[0m\n\u001b[1;32m 1684\u001b[0m logger\u001b[38;5;241m.\u001b[39mwarning(\n\u001b[1;32m 1685\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mWARNING: \u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1686\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYou have provided comparisons_to_deactivate but not \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1692\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas an exact match.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1693\u001b[0m )\n\u001b[1;32m 1695\u001b[0m em_training_session \u001b[38;5;241m=\u001b[39m EMTrainingSession(\n\u001b[1;32m 1696\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 1697\u001b[0m blocking_rule,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1703\u001b[0m estimate_without_term_frequencies\u001b[38;5;241m=\u001b[39mestimate_without_term_frequencies,\n\u001b[1;32m 1704\u001b[0m )\n\u001b[0;32m-> 1706\u001b[0m \u001b[43mem_training_session\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_train\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1708\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_populate_m_u_from_trained_values()\n\u001b[1;32m 1710\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m populate_probability_two_random_records_match_from_trained_values:\n",
"File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/em_training_session.py:197\u001b[0m, in \u001b[0;36mEMTrainingSession._train\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m EMTrainingException(\n\u001b[1;32m 182\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTraining rule \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbr_sql\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m resulted in no record pairs. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 183\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis means that in the supplied data set \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 191\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthe number of comparisons that will be generated by a blocking rule.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 192\u001b[0m )\n\u001b[1;32m 194\u001b[0m \u001b[38;5;66;03m# Compute the new params, populating the paramters in the copied settings object\u001b[39;00m\n\u001b[1;32m 195\u001b[0m \u001b[38;5;66;03m# At this stage, we do not overwrite any of the parameters\u001b[39;00m\n\u001b[1;32m 196\u001b[0m \u001b[38;5;66;03m# in the original (main) setting object\u001b[39;00m\n\u001b[0;32m--> 197\u001b[0m \u001b[43mexpectation_maximisation\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcvv\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 199\u001b[0m rule \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_blocking_rule_for_training\u001b[38;5;241m.\u001b[39mblocking_rule_sql\n\u001b[1;32m 200\u001b[0m training_desc \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEM, blocked on: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrule\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n",
"File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/expectation_maximisation.py:256\u001b[0m, in \u001b[0;36mexpectation_maximisation\u001b[0;34m(em_training_session, df_comparison_vector_values)\u001b[0m\n\u001b[1;32m 254\u001b[0m df_params \u001b[38;5;241m=\u001b[39m linker\u001b[38;5;241m.\u001b[39m_execute_sql_pipeline([agreement_pattern_counts])\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 256\u001b[0m df_params \u001b[38;5;241m=\u001b[39m \u001b[43mlinker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_sql_pipeline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mdf_comparison_vector_values\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 257\u001b[0m param_records \u001b[38;5;241m=\u001b[39m df_params\u001b[38;5;241m.\u001b[39mas_pandas_dataframe()\n\u001b[1;32m 258\u001b[0m param_records \u001b[38;5;241m=\u001b[39m compute_proportions_for_new_parameters(param_records)\n",
"File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:651\u001b[0m, in \u001b[0;36mLinker._execute_sql_pipeline\u001b[0;34m(self, input_dataframes, use_cache)\u001b[0m\n\u001b[1;32m 645\u001b[0m dataframe \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sql_to_splink_dataframe_checking_cache(\n\u001b[1;32m 646\u001b[0m sql_gen,\n\u001b[1;32m 647\u001b[0m output_tablename_templated,\n\u001b[1;32m 648\u001b[0m use_cache,\n\u001b[1;32m 649\u001b[0m )\n\u001b[1;32m 650\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m--> 651\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 652\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 653\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pipeline\u001b[38;5;241m.\u001b[39mreset()\n",
"File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:645\u001b[0m, in \u001b[0;36mLinker._execute_sql_pipeline\u001b[0;34m(self, input_dataframes, use_cache)\u001b[0m\n\u001b[1;32m 642\u001b[0m output_tablename_templated \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pipeline\u001b[38;5;241m.\u001b[39mqueue[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39moutput_table_name\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 645\u001b[0m dataframe \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sql_to_splink_dataframe_checking_cache\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 646\u001b[0m \u001b[43m \u001b[49m\u001b[43msql_gen\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 647\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_tablename_templated\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 648\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 649\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 650\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 651\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
"File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:911\u001b[0m, in \u001b[0;36mLinker._sql_to_splink_dataframe_checking_cache\u001b[0;34m(self, sql, output_tablename_templated, use_cache)\u001b[0m\n\u001b[1;32m 908\u001b[0m \u001b[38;5;28mprint\u001b[39m(df_pd) \u001b[38;5;66;03m# noqa: T201\u001b[39;00m\n\u001b[1;32m 910\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 911\u001b[0m splink_dataframe \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_sql_against_backend\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 912\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_tablename_templated\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtable_name_hash\u001b[49m\n\u001b[1;32m 913\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 914\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_intermediate_table_cache\u001b[38;5;241m.\u001b[39mexecuted_queries\u001b[38;5;241m.\u001b[39mappend(splink_dataframe)\n\u001b[1;32m 916\u001b[0m splink_dataframe\u001b[38;5;241m.\u001b[39mcreated_by_splink \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
"File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/duckdb/linker.py:220\u001b[0m, in \u001b[0;36mDuckDBLinker._execute_sql_against_backend\u001b[0;34m(self, sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_delete_table_from_database(physical_name)\n\u001b[1;32m 215\u001b[0m sql \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;124mCREATE TABLE \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mphysical_name\u001b[38;5;132;01m}\u001b[39;00m\n\u001b[1;32m 217\u001b[0m \u001b[38;5;124mAS\u001b[39m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;124m(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msql\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\n\u001b[1;32m 219\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[0;32m--> 220\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_and_run_sql_execution\u001b[49m\u001b[43m(\u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemplated_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mphysical_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 222\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DuckDBDataFrame(templated_name, physical_name, \u001b[38;5;28mself\u001b[39m)\n",
"File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:725\u001b[0m, in \u001b[0;36mLinker._log_and_run_sql_execution\u001b[0;34m(self, final_sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m 722\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 723\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[0;32m--> 725\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m SplinkException(\n\u001b[1;32m 726\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError executing the following sql for table \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 727\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m`\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtemplated_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m`(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mphysical_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m):\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mfinal_sql\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 728\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mError was: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 729\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n",
"\u001b[0;31mSplinkException\u001b[0m: Error executing the following sql for table `__splink__m_u_counts`(__splink__m_u_counts_d3d7043ef):\n\n CREATE TABLE __splink__m_u_counts_d3d7043ef\n AS\n (WITH __splink__df_comparison_vectors as (select * from __splink__df_comparison_vectors_db8087bba), \n__splink__df_match_weight_parts as (\n select \"unique_id_l\",\"unique_id_r\",\"zipcode_l\",\"zipcode_r\",\"company_name_l\",\"company_name_r\",match_key \n from __splink__df_comparison_vectors\n ), \n__splink__df_predict as (\n select\n log2(cast(0.015707176032864763 as float8) * ) as match_weight,\n CASE WHEN THEN 1.0 ELSE (cast(0.015707176032864763 as float8) * )/(1+(cast(0.015707176032864763 as float8) * )) END as match_probability,\n \"unique_id_l\",\"unique_id_r\",\"zipcode_l\",\"zipcode_r\",\"company_name_l\",\"company_name_r\",match_key \n from __splink__df_match_weight_parts\n \n order by 1\n ) \n select 0 as comparison_vector_value,\n sum(match_probability * 1) /\n sum(1) as m_count,\n sum((1-match_probability) * 1) /\n sum(1) as u_count,\n '_probability_two_random_records_match' as output_column_name\n from __splink__df_predict\n )\n \n\nError was: Parser Error: syntax error at or near \")\""
]
}
],
"outputs": [],
"source": [
"splink_dedupe(data, settings=organizations_settings, blocking = organizations_blocking)"
"# currently throwing an error \n",
"#splink_dedupe(data, settings=organizations_settings, blocking = organizations_blocking)"
]
},
{
Expand All @@ -644,7 +596,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
"version": "3.11.2"
}
},
"nbformat": 4,
Expand Down
Loading

0 comments on commit 3ac77b1

Please sign in to comment.