GH140_Configure_auto_build_spark
Adapt notebook to use a spark-nlp version compatible with Spark 2.3

Spark NLP    Apache Spark 2.3.x    Apache Spark 2.4.x
2.5.5        YES                   YES

Upgrade spark-nlp-jsl to 2.5.5
546075910 committed Aug 12, 2020
1 parent 82db63f commit 1bf7bf2
Showing 1 changed file with 24 additions and 43 deletions.
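
For context, the substance of the change is the version pinning visible in the diff below: the install cell and the start() call are moved to the same spark-nlp version. A minimal sketch of the resulting setup cells, assuming the notebook's start() helper comes from the spark-ocr package and that secret, version, and spark_ocr_jar_path are defined in earlier cells:

    # Install a spark-nlp release that matches the Spark runtime in use, plus
    # spark-ocr from John Snow Labs' licensed PyPI index (requires a secret).
    %pip install spark-nlp==2.4.5
    %pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

    # Start the Spark session; nlp_version should match the installed spark-nlp
    # package so the JVM jars and the Python bindings stay in sync.
    spark = start(secret=secret, jar_path=spark_ocr_jar_path, nlp_version="2.4.5")

Both the %pip install line and the start() call carry the same version string, so the diff updates them together.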
67 changes: 24 additions & 43 deletions jupyter/SparkOcrUpdateTextPosition.ipynb
@@ -24,7 +24,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -36,7 +36,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -65,23 +65,12 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: spark-nlp==2.5.5 in /usr/local/lib/python3.7/site-packages (2.5.5)\n",
"\u001B[33mWARNING: You are using pip version 19.3.1; however, version 20.2.1 is available.\n",
"You should consider upgrading via the 'pip install --upgrade pip' command.\u001B[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"outputs": [],
"source": [
"# install from PYPI using secret\n",
"%pip install spark-nlp==2.5.5\n",
"%pip install spark-nlp==2.4.5\n",
"%pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade"
]
},
@@ -104,14 +93,14 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SparkConf Configured, Starting to listen on port: 53378\n",
"SparkConf Configured, Starting to listen on port: 59744\n",
"JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar\n"
]
},
@@ -125,11 +114,11 @@
" <div>\n",
" <p><b>SparkContext</b></p>\n",
"\n",
" <p><a href=\"http://kolia-mbp.dlink:4041\">Spark UI</a></p>\n",
" <p><a href=\"http://melnyks-mbp:4043\">Spark UI</a></p>\n",
"\n",
" <dl>\n",
" <dt>Version</dt>\n",
" <dd><code>v2.3.2</code></dd>\n",
" <dd><code>v2.4.4</code></dd>\n",
" <dt>Master</dt>\n",
" <dd><code>local[*]</code></dd>\n",
" <dt>AppName</dt>\n",
@@ -141,10 +130,10 @@
" "
],
"text/plain": [
"<pyspark.sql.session.SparkSession at 0x1195bb510>"
"<pyspark.sql.session.SparkSession at 0x10c27d2d0>"
]
},
"execution_count": 13,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -155,17 +144,18 @@
"if license:\n",
" os.environ['JSL_OCR_LICENSE'] = license\n",
"\n",
"spark = start(secret=secret, jar_path=spark_ocr_jar_path, nlp_version=\"2.5.5\")\n",
"spark = start(secret=secret, jar_path=spark_ocr_jar_path, nlp_version=\"2.4.5\")\n",
"spark"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.ml import Pipeline\n",
"from pyspark.ml import PipelineModel\n",
"from sparkocr.transformers import *\n",
"from sparknlp.annotator import *\n",
"from sparknlp.base import *\n",
@@ -181,7 +171,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -204,7 +194,7 @@
" .setOutputCol(\"spell\")\n",
" \n",
" tokenAssem = TokenAssembler() \\\n",
" .setInputCols([\"spell\", \"document\"]) \\\n",
" .setInputCols(\"spell\") \\\n",
" .setOutputCol(\"newDocs\")\n",
"\n",
" updatedText = UpdateTextPosition() \\\n",
@@ -223,6 +213,7 @@
" \n",
" return pipeline\n",
"\n",
"\n",
"def ocr_pipeline():\n",
" # Transforrm PDF document to images per page\n",
" pdf_to_image = PdfToImage() \\\n",
@@ -259,7 +250,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -277,19 +268,9 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"spellcheck_norvig download started this may take some time.\n",
"Approximate size to download 4.2 MB\n",
"[OK!]\n"
]
}
],
"outputs": [],
"source": [
"ocr_result = ocr_pipeline().fit(pdf_example_df).transform(pdf_example_df)\n",
"updated_result= update_text_pipeline().fit(ocr_result).transform(ocr_result)\n",
@@ -309,7 +290,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 9,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -319,10 +300,10 @@
{
"data": {
"text/plain": [
"1671"
"72914"
]
},
"execution_count": 21,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}

1 comment on commit 1bf7bf2

@review-notebook-app
Review Jupyter notebook diffs for this commit on ReviewNB. You can open a pull request to discuss changes and offer feedback.

Powered by ReviewNB