Skip to content

Commit

Permalink
Added plot of SequenceMatcher ratios averaged over emails and samples
Browse files Browse the repository at this point in the history
  • Loading branch information
fexfl committed Jan 3, 2025
1 parent 9f34add commit 5cb790c
Showing 1 changed file with 52 additions and 4 deletions.
56 changes: 52 additions & 4 deletions notebook/batching_performance.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@
"plt.ylabel(\"Average Email Time [s]\")\n",
"plt.ylim(bottom=0)\n",
"plt.title(\"Average email time for different batch sizes\")\n",
"plt.vlines(0, 0, (max(average_email_times_for_batches) + max(std_email_times_for_batches) + 10), colors=\"gray\", linestyles=\"--\")\n",
"plt.grid(which='major', color='#666666', linestyle='--', alpha = 0.8)\n",
"plt.grid(which='minor', color='#666666', linestyle='--', alpha = 0.3)\n",
"plt.minorticks_on()"
Expand All @@ -264,24 +265,71 @@
"outputs": [],
"source": [
"# declare a result as standard to compare the other results to\n",
"standard_ps_texts = [email_dict[\"pseudo_content\"] for email_dict in next(output[\"email_outputs\"] for output in outputs if output[\"batch_size\"] == 1)]\n",
"standard_batch_size = 1 # batch size with best qualitative results\n",
"standard_ps_texts = [email_dict[\"pseudo_content\"] for email_dict in next(output[\"email_outputs\"] for output in outputs if output[\"batch_size\"] == standard_batch_size)]\n",
"\n",
"# iterate over other results and print diffs\n",
"for output in outputs:\n",
" #print(f\"----- Comparing batch size {output['batch_size']} sample {output['sample']} to standard: -----\")\n",
" print(f\"----- Comparing batch size {output['batch_size']} sample {output['sample']} to standard {standard_batch_size}: -----\")\n",
" ps_texts = [email_dict[\"pseudo_content\"] for email_dict in output[\"email_outputs\"]]\n",
" # diff to standard\n",
" average_sqm_ratio = 0. # SequenceMatcherRatio averaged over all emails for this result\n",
" for idx, (text, stdtext) in enumerate(zip(ps_texts, standard_ps_texts)):\n",
" #print(f\"--- Comparing email text {idx} ---\")\n",
" print(f\"--- Comparing email text {idx} ---\")\n",
" diff = difflib.ndiff(stdtext.splitlines(keepends=True), text.splitlines(keepends=True))\n",
" for line in diff:\n",
" if line.startswith('+ ') or line.startswith('- '):\n",
" print(f\"Delta in batch size {output['batch_size']} at sample {output['sample']}:\")\n",
" print(line, end='')\n",
" # also test the matching ratio\n",
" rt = difflib.SequenceMatcher(None, stdtext, text).ratio()\n",
" average_sqm_ratio += rt\n",
" if not rt == 1.0:\n",
" print(f\"Delta in batch size {output['batch_size']} at sample {output['sample']}: Matching ratio is not 1!\")"
" print(f\"Delta in batch size {output['batch_size']} at sample {output['sample']}: Matching ratio is {rt}\")\n",
"\n",
" average_sqm_ratio = average_sqm_ratio / len(ps_texts)\n",
" output[\"average_sqm_ratio\"] = average_sqm_ratio"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# average the SequenceMatcher ratios over n_samples for all batching sizes\n",
"average_sqm_ratio_for_batches = [0.]*len(batching_sizes)\n",
"for output in outputs:\n",
"    average_sqm_ratio_for_batches[batching_sizes.index(output[\"batch_size\"])] += output[\"average_sqm_ratio\"]\n",
"average_sqm_ratio_for_batches = [asr/n_samples for asr in average_sqm_ratio_for_batches]\n",
"\n",
"# if n_samples > 1, calculate the sample standard deviation per batch size:\n",
"# sqrt(sum of squared deviations from the mean / (n_samples - 1)).\n",
"# For a single sample the list stays at 0, so no error bar is drawn.\n",
"std_sqm_ratio_for_batches = [0.]*len(batching_sizes)\n",
"if n_samples > 1:\n",
"    for output in outputs:\n",
"        ix = batching_sizes.index(output[\"batch_size\"])\n",
"        # accumulate the squared deviation of this sample from its batch-size mean\n",
"        std_sqm_ratio_for_batches[ix] += (output[\"average_sqm_ratio\"] - average_sqm_ratio_for_batches[ix])**2\n",
"    # divide by (n_samples - 1) BEFORE taking the square root (Bessel-corrected std)\n",
"    std_sqm_ratio_for_batches = [(sq_dev/(n_samples-1))**0.5 for sq_dev in std_sqm_ratio_for_batches]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot the averaged SequenceMatcher ratio against the batch sizes;\n",
"# the error bars are taken from std_sqm_ratio_for_batches.\n",
"plt.errorbar(\n",
"    batching_sizes,\n",
"    average_sqm_ratio_for_batches,\n",
"    yerr=std_sqm_ratio_for_batches,\n",
"    linestyle='None',\n",
"    marker='.',\n",
"    capsize=2,\n",
"    elinewidth=1,\n",
")\n",
"plt.xlabel(\"n batches\")\n",
"plt.ylabel(\"Average SequenceMatcher Ratio\")\n",
"plt.title(\"Average SequenceMatcher Ratio compared to Standard for different batch sizes\")\n",
"# reference lines: horizontal at ratio 1, dashed vertical at x = 0\n",
"plt.hlines(y=1, xmin=-2, xmax=batching_sizes[-1]+1, colors=\"black\")\n",
"plt.vlines(x=0, ymin=1.1, ymax=0, colors=\"gray\", linestyles=\"--\")\n",
"# fix the visible window after drawing the reference lines\n",
"plt.xlim(left=-1.5, right=batching_sizes[-1]+0.5)\n",
"plt.ylim(bottom=0, top=1.1)\n",
"plt.grid(which='major', color='#666666', linestyle='--', alpha=0.8)\n",
"plt.grid(which='minor', color='#666666', linestyle='--', alpha=0.3)\n",
"plt.minorticks_on()"
]
},
{
Expand Down

0 comments on commit 5cb790c

Please sign in to comment.