diff --git a/notebook/name_matching parallel.ipynb b/notebook/name_matching parallel.ipynb new file mode 100644 index 0000000..70d1820 --- /dev/null +++ b/notebook/name_matching parallel.ipynb @@ -0,0 +1,210 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import ray\n", + "from name_matching.name_matcher import NameMatcher" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initiate ray" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ray.init()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load some dummy data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "adjusted_names = pd.read_csv('../test/adjusted_test_names.csv', index_col=0)\n", + "test_names = pd.read_csv('../test/test_names.csv', index_col=0)\n", + "display(adjusted_names.head(5))\n", + "display(test_names.head(5))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initiate the NameMatcher object on the company_name column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "matcher = NameMatcher( ngrams=(2, 5),\n", + " top_n=10,\n", + " number_of_rows=500,\n", + " number_of_matches=3,\n", + " lowercase=True,\n", + " punctuations=True,\n", + " remove_ascii=True,\n", + " legal_suffixes=False,\n", + " common_words=False,\n", + " preprocess_split=False,\n", + " verbose=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the desired string matching metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "matcher.set_distance_metrics(['iterative_sub_string', 'pearson_ii', 'bag', 'fuzzy_wuzzy_partial_string', 'editex'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the main part of the data that should be matched" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "matcher.load_and_process_master_data('company_name', test_names, transform=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define a function for the running of the name matching" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@ray.remote\n", + "def match_name_parallel(adjusted_names, matcher):\n", + " results = matcher.match_names(to_be_matched=adjusted_names, column_matching='company_name')\n", + " return results" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Split the names in instances of 100 and add it to a list of remote function calls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = []\n", + "for i in range(0, len(adjusted_names), 100):\n", + " results.append(match_name_parallel.remote(adjusted_names[i:i+100], matcher))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the results once all of the workers finished their work" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "matches = pd.concat(ray.get(results))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "recombine the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "complete_matched_data = pd.merge(pd.merge(test_names, matches, how='left', right_index=True, left_index=True), adjusted_names, how='left', left_on='match_index_0', right_index=True, suffixes=['', '_matched'])\n", + "complete_matched_data" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "vscode": { + "interpreter": { + "hash": "8db41bd6fef7744b08932e845d8249a243b3ddc44edf3dd1c68df61a819e8ad4" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}