From 16210316172d99a1a5d1c0c23e6ee3acbbca2f07 Mon Sep 17 00:00:00 2001 From: Mykola Melnyk Date: Tue, 11 Aug 2020 13:38:10 +0300 Subject: [PATCH] Updated UpdateTextPosition notebook --- jupyter/SparkOcrUpdateTextPosition.ipynb | 63 ++++++++++++++++-------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/jupyter/SparkOcrUpdateTextPosition.ipynb b/jupyter/SparkOcrUpdateTextPosition.ipynb index 09a78fd..b9d0f7a 100644 --- a/jupyter/SparkOcrUpdateTextPosition.ipynb +++ b/jupyter/SparkOcrUpdateTextPosition.ipynb @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -65,9 +65,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: spark-nlp==2.5.5 in /usr/local/lib/python3.7/site-packages (2.5.5)\n", + "\u001b[33mWARNING: You are using pip version 19.3.1; however, version 20.2.1 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "# install from PYPI using secret\n", "%pip install spark-nlp==2.5.5\n", @@ -93,14 +104,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "SparkConf Configured, Starting to listen on port: 59744\n", + "SparkConf Configured, Starting to listen on port: 53378\n", "JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar\n" ] }, @@ -114,11 +125,11 @@ "
\n", "

SparkContext

\n", "\n", - "

Spark UI

\n", + "

Spark UI

\n", "\n", "
\n", "
Version
\n", - "
v2.4.4
\n", + "
v2.3.2
\n", "
Master
\n", "
local[*]
\n", "
AppName
\n", @@ -130,10 +141,10 @@ " " ], "text/plain": [ - "" + "" ] }, - "execution_count": 2, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -150,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -170,7 +181,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -193,7 +204,7 @@ " .setOutputCol(\"spell\")\n", " \n", " tokenAssem = TokenAssembler() \\\n", - " .setInputCols(\"spell\") \\\n", + " .setInputCols([\"spell\", \"document\"]) \\\n", " .setOutputCol(\"newDocs\")\n", "\n", " updatedText = UpdateTextPosition() \\\n", @@ -248,7 +259,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -266,9 +277,19 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 20, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "spellcheck_norvig download started this may take some time.\n", + "Approximate size to download 4.2 MB\n", + "[OK!]\n" + ] + } + ], "source": [ "ocr_result = ocr_pipeline().fit(pdf_example_df).transform(pdf_example_df)\n", "updated_result= update_text_pipeline().fit(ocr_result).transform(ocr_result)\n", @@ -288,7 +309,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 21, "metadata": { "pycharm": { "name": "#%%\n" @@ -298,10 +319,10 @@ { "data": { "text/plain": [ - "72914" + "1671" ] }, - "execution_count": 9, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -344,4 +365,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +}