Added a parallel notebook

DeNederlandscheBank · Apr 12, 2023 · 507b409 · 507b409
1 parent 76095df
commit 507b409
Showing 1 changed file with 210 additions and 0 deletions.
diff --git a/notebook/name_matching parallel.ipynb b/notebook/name_matching parallel.ipynb
@@ -0,0 +1,210 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import ray\n",
+    "from name_matching.name_matcher import NameMatcher"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Initiate ray"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ray.init()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load some dummy data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "adjusted_names = pd.read_csv('../test/adjusted_test_names.csv', index_col=0)\n",
+    "test_names = pd.read_csv('../test/test_names.csv', index_col=0)\n",
+    "display(adjusted_names.head(5))\n",
+    "display(test_names.head(5))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Initiate the NameMatcher object on the company_name column"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "matcher = NameMatcher(  ngrams=(2, 5),\n",
+    "                        top_n=10,\n",
+    "                        number_of_rows=500,\n",
+    "                        number_of_matches=3,\n",
+    "                        lowercase=True,\n",
+    "                        punctuations=True,\n",
+    "                        remove_ascii=True,\n",
+    "                        legal_suffixes=False,\n",
+    "                        common_words=False,\n",
+    "                        preprocess_split=False,\n",
+    "                        verbose=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Set the desired string matching metrics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "matcher.set_distance_metrics(['iterative_sub_string', 'pearson_ii', 'bag', 'fuzzy_wuzzy_partial_string', 'editex'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load the main part of the data that should be matched"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "matcher.load_and_process_master_data('company_name', test_names, transform=True)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Define a function for the running of the name matching"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@ray.remote\n",
+    "def match_name_parallel(adjusted_names, matcher):\n",
+    "    results = matcher.match_names(to_be_matched=adjusted_names, column_matching='company_name')\n",
+    "    return results"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Split the names in instances of 100 and add it to a list of remote function calls"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results = []\n",
+    "for i in range(0, len(adjusted_names), 100):\n",
+    "    results.append(match_name_parallel.remote(adjusted_names[i:i+100], matcher))"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Get the results once all of the workers finished their work"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "matches = pd.concat(ray.get(results))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "recombine the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "complete_matched_data = pd.merge(pd.merge(test_names, matches, how='left', right_index=True, left_index=True), adjusted_names, how='left', left_on='match_index_0', right_index=True, suffixes=['', '_matched'])\n",
+    "complete_matched_data"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.4"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "8db41bd6fef7744b08932e845d8249a243b3ddc44edf3dd1c68df61a819e8ad4"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}