Skip to content

Commit

Permalink
Added a parallel notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
mnijhuis-dnb committed Apr 12, 2023
1 parent 76095df commit 507b409
Showing 1 changed file with 210 additions and 0 deletions.
210 changes: 210 additions & 0 deletions notebook/name_matching parallel.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import ray\n",
"from name_matching.name_matcher import NameMatcher"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Initiate ray"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ray.init()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load some dummy data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"adjusted_names = pd.read_csv('../test/adjusted_test_names.csv', index_col=0)\n",
"test_names = pd.read_csv('../test/test_names.csv', index_col=0)\n",
"display(adjusted_names.head(5))\n",
"display(test_names.head(5))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Initiate the NameMatcher object on the company_name column"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"matcher = NameMatcher( ngrams=(2, 5),\n",
" top_n=10,\n",
" number_of_rows=500,\n",
" number_of_matches=3,\n",
" lowercase=True,\n",
" punctuations=True,\n",
" remove_ascii=True,\n",
" legal_suffixes=False,\n",
" common_words=False,\n",
" preprocess_split=False,\n",
" verbose=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set the desired string matching metrics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"matcher.set_distance_metrics(['iterative_sub_string', 'pearson_ii', 'bag', 'fuzzy_wuzzy_partial_string', 'editex'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load the main part of the data that should be matched"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"matcher.load_and_process_master_data('company_name', test_names, transform=True)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Define a function for the running of the name matching"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@ray.remote\n",
"def match_name_parallel(adjusted_names, matcher):\n",
" results = matcher.match_names(to_be_matched=adjusted_names, column_matching='company_name')\n",
" return results"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Split the names in instances of 100 and add it to a list of remote function calls"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results = []\n",
"for i in range(0, len(adjusted_names), 100):\n",
" results.append(match_name_parallel.remote(adjusted_names[i:i+100], matcher))"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the results once all of the workers finished their work"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"matches = pd.concat(ray.get(results))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"recombine the data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"complete_matched_data = pd.merge(pd.merge(test_names, matches, how='left', right_index=True, left_index=True), adjusted_names, how='left', left_on='match_index_0', right_index=True, suffixes=['', '_matched'])\n",
"complete_matched_data"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"vscode": {
"interpreter": {
"hash": "8db41bd6fef7744b08932e845d8249a243b3ddc44edf3dd1c68df61a819e8ad4"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 507b409

Please sign in to comment.