diff --git a/EDA_and_tests/linkage_test.ipynb b/EDA_and_tests/linkage_test.ipynb
index 1255abc..355962d 100644
--- a/EDA_and_tests/linkage_test.ipynb
+++ b/EDA_and_tests/linkage_test.ipynb
@@ -564,60 +564,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Probability two random records match is estimated to be  0.0155.\n",
-      "This means that amongst all possible pairwise record comparisons, one in 64.67 are expected to match.  With 1,212,903 total possible comparisons, we expect a total of around 18,756.67 matching pairs\n",
-      "----- Estimating u probabilities using random sampling -----\n",
-      "\n",
-      "Estimated u probabilities using random sampling\n",
-      "\n",
-      "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n",
-      "\n",
-      "----- Starting EM training session -----\n",
-      "\n",
-      "Estimating the m probabilities of the model by blocking on:\n",
-      "l.company_name = r.company_name\n",
-      "\n",
-      "Parameter estimates will be made for the following comparison(s):\n",
-      "\n",
-      "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
-      "\n"
-     ]
-    },
-    {
-     "ename": "SplinkException",
-     "evalue": "Error executing the following sql for table `__splink__m_u_counts`(__splink__m_u_counts_d3d7043ef):\n\n        CREATE TABLE __splink__m_u_counts_d3d7043ef\n        AS\n        (WITH __splink__df_comparison_vectors as (select * from __splink__df_comparison_vectors_db8087bba), \n__splink__df_match_weight_parts as (\n    select \"unique_id_l\",\"unique_id_r\",\"zipcode_l\",\"zipcode_r\",\"company_name_l\",\"company_name_r\",match_key \n    from __splink__df_comparison_vectors\n    ), \n__splink__df_predict as (\n    select\n    log2(cast(0.015707176032864763 as float8) * ) as match_weight,\n    CASE WHEN  THEN 1.0 ELSE (cast(0.015707176032864763 as float8) * )/(1+(cast(0.015707176032864763 as float8) * )) END as match_probability,\n    \"unique_id_l\",\"unique_id_r\",\"zipcode_l\",\"zipcode_r\",\"company_name_l\",\"company_name_r\",match_key \n    from __splink__df_match_weight_parts\n    \n    order by 1\n    ) \n    select 0 as comparison_vector_value,\n           sum(match_probability * 1) /\n               sum(1) as m_count,\n           sum((1-match_probability) * 1) /\n               sum(1) as u_count,\n           '_probability_two_random_records_match' as output_column_name\n    from __splink__df_predict\n    )\n        \n\nError was: Parser Error: syntax error at or near \")\"",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mParserException\u001b[0m                           Traceback (most recent call last)",
-      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:713\u001b[0m, in \u001b[0;36mLinker._log_and_run_sql_execution\u001b[0;34m(self, final_sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m    712\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 713\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_sql_execution\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfinal_sql\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemplated_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mphysical_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    714\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m    715\u001b[0m     \u001b[38;5;66;03m# Parse our SQL through sqlglot to pretty print\u001b[39;00m\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/duckdb/linker.py:225\u001b[0m, in \u001b[0;36mDuckDBLinker._run_sql_execution\u001b[0;34m(self, final_sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m    224\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_run_sql_execution\u001b[39m(\u001b[38;5;28mself\u001b[39m, final_sql, templated_name, physical_name):\n\u001b[0;32m--> 225\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_con\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfinal_sql\u001b[49m\u001b[43m)\u001b[49m\n",
-      "\u001b[0;31mParserException\u001b[0m: Parser Error: syntax error at or near \")\"",
-      "\nThe above exception was the direct cause of the following exception:\n",
-      "\u001b[0;31mSplinkException\u001b[0m                           Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[35], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msplink_dedupe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msettings\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43morganizations_settings\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblocking\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43morganizations_blocking\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m/project/src/utils/linkage.py:578\u001b[0m, in \u001b[0;36msplink_dedupe\u001b[0;34m(df, settings, blocking)\u001b[0m\n\u001b[1;32m    575\u001b[0m linker\u001b[38;5;241m.\u001b[39mestimate_u_using_random_sampling(max_pairs\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5e6\u001b[39m)\n\u001b[1;32m    577\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m blocking:\n\u001b[0;32m--> 578\u001b[0m     \u001b[43mlinker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mestimate_parameters_using_expectation_maximisation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    580\u001b[0m df_predict \u001b[38;5;241m=\u001b[39m linker\u001b[38;5;241m.\u001b[39mpredict()\n\u001b[1;32m    581\u001b[0m clusters \u001b[38;5;241m=\u001b[39m linker\u001b[38;5;241m.\u001b[39mcluster_pairwise_predictions_at_threshold(\n\u001b[1;32m    582\u001b[0m     df_predict, threshold_match_probability\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.7\u001b[39m\n\u001b[1;32m    583\u001b[0m )  \u001b[38;5;66;03m# default\u001b[39;00m\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:1706\u001b[0m, in \u001b[0;36mLinker.estimate_parameters_using_expectation_maximisation\u001b[0;34m(self, blocking_rule, comparisons_to_deactivate, comparison_levels_to_reverse_blocking_rule, estimate_without_term_frequencies, fix_probability_two_random_records_match, fix_m_probabilities, fix_u_probabilities, populate_probability_two_random_records_match_from_trained_values)\u001b[0m\n\u001b[1;32m   1684\u001b[0m         logger\u001b[38;5;241m.\u001b[39mwarning(\n\u001b[1;32m   1685\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mWARNING: \u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m   1686\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYou have provided comparisons_to_deactivate but not \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1692\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas an exact match.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m   1693\u001b[0m         )\n\u001b[1;32m   1695\u001b[0m em_training_session \u001b[38;5;241m=\u001b[39m EMTrainingSession(\n\u001b[1;32m   1696\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m   1697\u001b[0m     blocking_rule,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1703\u001b[0m     estimate_without_term_frequencies\u001b[38;5;241m=\u001b[39mestimate_without_term_frequencies,\n\u001b[1;32m   1704\u001b[0m )\n\u001b[0;32m-> 1706\u001b[0m \u001b[43mem_training_session\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_train\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1708\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_populate_m_u_from_trained_values()\n\u001b[1;32m   1710\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m populate_probability_two_random_records_match_from_trained_values:\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/em_training_session.py:197\u001b[0m, in \u001b[0;36mEMTrainingSession._train\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    181\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m EMTrainingException(\n\u001b[1;32m    182\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTraining rule \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbr_sql\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m resulted in no record pairs.  \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    183\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis means that in the supplied data set \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    191\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthe number of comparisons that will be generated by a blocking rule.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    192\u001b[0m     )\n\u001b[1;32m    194\u001b[0m \u001b[38;5;66;03m# Compute the new params, populating the paramters in the copied settings object\u001b[39;00m\n\u001b[1;32m    195\u001b[0m \u001b[38;5;66;03m# At this stage, we do not overwrite any of the parameters\u001b[39;00m\n\u001b[1;32m    196\u001b[0m \u001b[38;5;66;03m# in the original (main) setting object\u001b[39;00m\n\u001b[0;32m--> 197\u001b[0m \u001b[43mexpectation_maximisation\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcvv\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    199\u001b[0m rule \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_blocking_rule_for_training\u001b[38;5;241m.\u001b[39mblocking_rule_sql\n\u001b[1;32m    200\u001b[0m training_desc \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEM, blocked on: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrule\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/expectation_maximisation.py:256\u001b[0m, in \u001b[0;36mexpectation_maximisation\u001b[0;34m(em_training_session, df_comparison_vector_values)\u001b[0m\n\u001b[1;32m    254\u001b[0m     df_params \u001b[38;5;241m=\u001b[39m linker\u001b[38;5;241m.\u001b[39m_execute_sql_pipeline([agreement_pattern_counts])\n\u001b[1;32m    255\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 256\u001b[0m     df_params \u001b[38;5;241m=\u001b[39m \u001b[43mlinker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_sql_pipeline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mdf_comparison_vector_values\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    257\u001b[0m param_records \u001b[38;5;241m=\u001b[39m df_params\u001b[38;5;241m.\u001b[39mas_pandas_dataframe()\n\u001b[1;32m    258\u001b[0m param_records \u001b[38;5;241m=\u001b[39m compute_proportions_for_new_parameters(param_records)\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:651\u001b[0m, in \u001b[0;36mLinker._execute_sql_pipeline\u001b[0;34m(self, input_dataframes, use_cache)\u001b[0m\n\u001b[1;32m    645\u001b[0m     dataframe \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sql_to_splink_dataframe_checking_cache(\n\u001b[1;32m    646\u001b[0m         sql_gen,\n\u001b[1;32m    647\u001b[0m         output_tablename_templated,\n\u001b[1;32m    648\u001b[0m         use_cache,\n\u001b[1;32m    649\u001b[0m     )\n\u001b[1;32m    650\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m--> 651\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m    652\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m    653\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pipeline\u001b[38;5;241m.\u001b[39mreset()\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:645\u001b[0m, in \u001b[0;36mLinker._execute_sql_pipeline\u001b[0;34m(self, input_dataframes, use_cache)\u001b[0m\n\u001b[1;32m    642\u001b[0m output_tablename_templated \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pipeline\u001b[38;5;241m.\u001b[39mqueue[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39moutput_table_name\n\u001b[1;32m    644\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 645\u001b[0m     dataframe \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sql_to_splink_dataframe_checking_cache\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    646\u001b[0m \u001b[43m        \u001b[49m\u001b[43msql_gen\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    647\u001b[0m \u001b[43m        \u001b[49m\u001b[43moutput_tablename_templated\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    648\u001b[0m \u001b[43m        \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    649\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    650\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m    651\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m e\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:911\u001b[0m, in \u001b[0;36mLinker._sql_to_splink_dataframe_checking_cache\u001b[0;34m(self, sql, output_tablename_templated, use_cache)\u001b[0m\n\u001b[1;32m    908\u001b[0m         \u001b[38;5;28mprint\u001b[39m(df_pd)  \u001b[38;5;66;03m# noqa: T201\u001b[39;00m\n\u001b[1;32m    910\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 911\u001b[0m     splink_dataframe \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_sql_against_backend\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    912\u001b[0m \u001b[43m        \u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_tablename_templated\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtable_name_hash\u001b[49m\n\u001b[1;32m    913\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    914\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_intermediate_table_cache\u001b[38;5;241m.\u001b[39mexecuted_queries\u001b[38;5;241m.\u001b[39mappend(splink_dataframe)\n\u001b[1;32m    916\u001b[0m splink_dataframe\u001b[38;5;241m.\u001b[39mcreated_by_splink \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/duckdb/linker.py:220\u001b[0m, in \u001b[0;36mDuckDBLinker._execute_sql_against_backend\u001b[0;34m(self, sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m    213\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_delete_table_from_database(physical_name)\n\u001b[1;32m    215\u001b[0m sql \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m    216\u001b[0m \u001b[38;5;124mCREATE TABLE \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mphysical_name\u001b[38;5;132;01m}\u001b[39;00m\n\u001b[1;32m    217\u001b[0m \u001b[38;5;124mAS\u001b[39m\n\u001b[1;32m    218\u001b[0m \u001b[38;5;124m(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msql\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\n\u001b[1;32m    219\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[0;32m--> 220\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_and_run_sql_execution\u001b[49m\u001b[43m(\u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemplated_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mphysical_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    222\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DuckDBDataFrame(templated_name, physical_name, \u001b[38;5;28mself\u001b[39m)\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/splink/linker.py:725\u001b[0m, in \u001b[0;36mLinker._log_and_run_sql_execution\u001b[0;34m(self, final_sql, templated_name, physical_name)\u001b[0m\n\u001b[1;32m    722\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m    723\u001b[0m     \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[0;32m--> 725\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m SplinkException(\n\u001b[1;32m    726\u001b[0m     \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError executing the following sql for table \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    727\u001b[0m     \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m`\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtemplated_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m`(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mphysical_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m):\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mfinal_sql\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    728\u001b[0m     \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mError was: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    729\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n",
-      "\u001b[0;31mSplinkException\u001b[0m: Error executing the following sql for table `__splink__m_u_counts`(__splink__m_u_counts_d3d7043ef):\n\n        CREATE TABLE __splink__m_u_counts_d3d7043ef\n        AS\n        (WITH __splink__df_comparison_vectors as (select * from __splink__df_comparison_vectors_db8087bba), \n__splink__df_match_weight_parts as (\n    select \"unique_id_l\",\"unique_id_r\",\"zipcode_l\",\"zipcode_r\",\"company_name_l\",\"company_name_r\",match_key \n    from __splink__df_comparison_vectors\n    ), \n__splink__df_predict as (\n    select\n    log2(cast(0.015707176032864763 as float8) * ) as match_weight,\n    CASE WHEN  THEN 1.0 ELSE (cast(0.015707176032864763 as float8) * )/(1+(cast(0.015707176032864763 as float8) * )) END as match_probability,\n    \"unique_id_l\",\"unique_id_r\",\"zipcode_l\",\"zipcode_r\",\"company_name_l\",\"company_name_r\",match_key \n    from __splink__df_match_weight_parts\n    \n    order by 1\n    ) \n    select 0 as comparison_vector_value,\n           sum(match_probability * 1) /\n               sum(1) as m_count,\n           sum((1-match_probability) * 1) /\n               sum(1) as u_count,\n           '_probability_two_random_records_match' as output_column_name\n    from __splink__df_predict\n    )\n        \n\nError was: Parser Error: syntax error at or near \")\""
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "splink_dedupe(data, settings=organizations_settings, blocking = organizations_blocking)"
+    "# currently throwing an error  \n",
+    "#splink_dedupe(data, settings=organizations_settings, blocking = organizations_blocking)"
    ]
   },
   {
@@ -644,7 +596,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.6"
+   "version": "3.11.2"
   }
  },
  "nbformat": 4,
diff --git a/src/utils/classify_InfoGroup_data.py b/src/utils/classify_InfoGroup_data.py
index de6551d..62bfa75 100644
--- a/src/utils/classify_InfoGroup_data.py
+++ b/src/utils/classify_InfoGroup_data.py
@@ -1,3 +1,5 @@
+"""Script to get relevant InfoGroup data for classification in the pipeline"""
+
 import re
 
 # TODO: #92 Make orgs classification script into more well-defined pipeline
@@ -100,10 +102,11 @@ def prepare_infogroup_data(
     Args:
         infogroup_csv: the InfoGroup csv file
         SIC6_codes_df: DataFrame of the relevant SIC6 codes w/ corresponding regex codes and descriptions
+        output_file_path: the resulting df will be written as a csv to this file path location
         testing: Boolean - True if code is being tested on only several chunks, False if whole InfoGroup csv should be used
         chunksize: the number of rows per chunk in the IG dataset (default 10,000 but can be changed for testing purposes)
         num_testing_chunks: number of chunks to iterate through when testing = True
-        output_file_path: the resulting df will be written as a csv to this file path location
+
     Returns:
         a DataFrame with information for only the relevant companies from the InfoGroup
         dataset that is formatted in the same schema as the aggregated company df for downstream