From fc857fc34e22c212e2902d5f9e600c6fee2feddb Mon Sep 17 00:00:00 2001 From: Mykola Melnyk Date: Thu, 14 May 2020 14:28:25 +0300 Subject: [PATCH] Fixed notebooks --- jupyter/SparkOCRS3AccesExample.ipynb | 14 +- jupyter/SparkOCRWritePdfToS3.ipynb | 2 +- jupyter/SparkOcrSavedLoadedPipeline.ipynb | 210 ++++++++++++++-------- 3 files changed, 146 insertions(+), 80 deletions(-) diff --git a/jupyter/SparkOCRS3AccesExample.ipynb b/jupyter/SparkOCRS3AccesExample.ipynb index a6dffa6..0741166 100644 --- a/jupyter/SparkOCRS3AccesExample.ipynb +++ b/jupyter/SparkOCRS3AccesExample.ipynb @@ -75,7 +75,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "SparkConf Configured, Starting to listen on port: 50980\n", + "SparkConf Configured, Starting to listen on port: 50635\n", "JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar\n" ] }, @@ -89,7 +89,7 @@ "
\n", "

SparkContext

\n", "\n", - "

Spark UI

\n", + "

Spark UI

\n", "\n", "
\n", "
Version
\n", @@ -105,7 +105,7 @@ " " ], "text/plain": [ - "" + "" ] }, "execution_count": 4, @@ -124,6 +124,8 @@ "# you can set AWS API Keys to env variables \n", "# os.environ['AWS_ACCESS_KEY_ID'] = \"your key\"\n", "# os.environ['AWS_SECRET_ACCESS_KEY'] = \"your secret\"\n", + "os.environ['AWS_ACCESS_KEY_ID'] = \"AKIASRWSDKBGBUGEZJ3G\"\n", + "os.environ['AWS_SECRET_ACCESS_KEY'] = \"BEJUdxKC9H/Zn/qgDUV6ScAyGhPay68lMX9m096P\"\n", "\n", "# set additinal dependensies for read data from S3\n", "conf = SparkConf() \\\n", @@ -208,7 +210,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -238,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -904,4 +906,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/jupyter/SparkOCRWritePdfToS3.ipynb b/jupyter/SparkOCRWritePdfToS3.ipynb index 5528b0a..12944a4 100644 --- a/jupyter/SparkOCRWritePdfToS3.ipynb +++ b/jupyter/SparkOCRWritePdfToS3.ipynb @@ -897,7 +897,7 @@ } ], "source": [ - "results.write\n", + "results.write \\\n", " .format(\"binaryFormat\") \\\n", " .option(\"type\", \"pdf\") \\\n", " .option(\"field\", \"pdf\") \\\n", diff --git a/jupyter/SparkOcrSavedLoadedPipeline.ipynb b/jupyter/SparkOcrSavedLoadedPipeline.ipynb index 759fb1d..3ba1a66 100644 --- a/jupyter/SparkOcrSavedLoadedPipeline.ipynb +++ b/jupyter/SparkOcrSavedLoadedPipeline.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "pycharm": { "name": "#%%\n" @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -63,48 +63,80 @@ "%pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade" ] }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Initialization of spark session" - ] - }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing /Users/nmelnik/IdeaProjects/spark-ocr/python/dist/spark-ocr-1.3.0rc1.tar.gz\n", + "Requirement already satisfied: numpy==1.17.4 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from spark-ocr==1.3.0rc1) (1.17.4)\n", + "Requirement already satisfied: pillow==6.2.1 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from spark-ocr==1.3.0rc1) (6.2.1)\n", + "Requirement already satisfied: py4j==0.10.7 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from spark-ocr==1.3.0rc1) (0.10.7)\n", + "Requirement already satisfied: pyspark==2.4.4 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from spark-ocr==1.3.0rc1) (2.4.4)\n", + "Requirement already satisfied: python-levenshtein==0.12.0 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from spark-ocr==1.3.0rc1) (0.12.0)\n", + "Requirement already satisfied: scikit-image==0.16.2 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from spark-ocr==1.3.0rc1) (0.16.2)\n", + "Requirement already 
satisfied: implicits==1.0.2 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from spark-ocr==1.3.0rc1) (1.0.2)\n", + "Requirement already satisfied: setuptools in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from python-levenshtein==0.12.0->spark-ocr==1.3.0rc1) (46.0.0)\n", + "Requirement already satisfied: imageio>=2.3.0 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from scikit-image==0.16.2->spark-ocr==1.3.0rc1) (2.8.0)\n", + "Requirement already satisfied: networkx>=2.0 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from scikit-image==0.16.2->spark-ocr==1.3.0rc1) (2.4)\n", + "Requirement already satisfied: PyWavelets>=0.4.0 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from scikit-image==0.16.2->spark-ocr==1.3.0rc1) (1.1.1)\n", + "Requirement already satisfied: scipy>=0.19.0 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from scikit-image==0.16.2->spark-ocr==1.3.0rc1) (1.4.1)\n", + "Requirement already satisfied: matplotlib!=3.0.0,>=2.0.0 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from scikit-image==0.16.2->spark-ocr==1.3.0rc1) (3.2.0)\n", + "Requirement already satisfied: decorator>=4.3.0 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from networkx>=2.0->scikit-image==0.16.2->spark-ocr==1.3.0rc1) (4.4.2)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image==0.16.2->spark-ocr==1.3.0rc1) (1.1.0)\n", + "Requirement already satisfied: python-dateutil>=2.1 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image==0.16.2->spark-ocr==1.3.0rc1) (2.8.1)\n", + "Requirement already satisfied: cycler>=0.10 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image==0.16.2->spark-ocr==1.3.0rc1) (0.10.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image==0.16.2->spark-ocr==1.3.0rc1) (2.4.6)\n", + "Requirement already satisfied: six>=1.5 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from python-dateutil>=2.1->matplotlib!=3.0.0,>=2.0.0->scikit-image==0.16.2->spark-ocr==1.3.0rc1) (1.14.0)\n", + "Building wheels for collected packages: spark-ocr\n", + " Building wheel for spark-ocr (setup.py) ... 
\u001b[?25ldone\n", + "\u001b[?25h Created wheel for spark-ocr: filename=spark_ocr-1.3.0rc1-cp37-none-any.whl size=5015774 sha256=1892d1f304f47ac249a064d3f1ad114a83689b6a18c370f762fbe5f1908cdc63\n", + " Stored in directory: /Users/nmelnik/Library/Caches/pip/wheels/79/ee/b7/b1d6d10a6be137d65bd31f7d0159dcc1d704587c685a48fb4e\n", + "Successfully built spark-ocr\n", + "Installing collected packages: spark-ocr\n", + " Found existing installation: spark-ocr 1.3.0rc1\n", + " Uninstalling spark-ocr-1.3.0rc1:\n", + " Successfully uninstalled spark-ocr-1.3.0rc1\n", + "Successfully installed spark-ocr-1.3.0rc1\n", + "\u001b[33mWARNING: You are using pip version 19.3.1; however, version 20.1 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ - "from pyspark import SparkConf\n", - "from sparkocr import start\n", - "\n", - "if license:\n", - " os.environ['JSL_OCR_LICENSE'] = license\n", - " \n", - "conf = SparkConf() \\\n", - " .set(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:2.7.3\")\n", - "\n", - "spark = start(secret=secret, jar_path=spark_ocr_jar_path, extra_conf=conf)\n", - "spark" + "# or install from local path\n", + "# %pip install --user ../../python/dist/spark-ocr-1.3.0rc1.tar.gz" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ - "## Imports" + "## Initialization of spark session" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 6, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SparkConf Configured, Starting to listen on port: 54494\n", + "JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar\n" + ] + }, { "data": { "text/html": [ @@ -115,7 +147,7 @@ "
\n", "

SparkContext

\n", "\n", - "

Spark UI

\n", + "

Spark UI

\n", "\n", "
\n", "
Version
\n", @@ -131,14 +163,40 @@ " " ], "text/plain": [ - "" + "" ] }, - "execution_count": 55, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], + "source": [ + "from pyspark import SparkConf\n", + "from sparkocr import start\n", + "\n", + "if license:\n", + " os.environ['JSL_OCR_LICENSE'] = license\n", + " \n", + "conf = SparkConf() \\\n", + " .set(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:2.7.3\")\n", + "\n", + "spark = start(secret=secret, jar_path=spark_ocr_jar_path, extra_conf=conf)\n", + "spark" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], "source": [ "from pyspark.ml import PipelineModel\n", "\n", @@ -154,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -170,9 +228,20 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "images = spark.read.format(\"binaryFile\").load(images_path).cache()\n", "images.count()" @@ -187,7 +256,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -209,22 +278,6 @@ "])" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Fit the pipeline to training images." - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [], - "source": [ - "model = pipeline.fit(images)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -234,7 +287,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 11, "metadata": { "pycharm": { "name": "#%%\n" @@ -248,28 +301,28 @@ "+-------+--------------------+-----------------+\n", "|pagenum| text| confidence|\n", "+-------+--------------------+-----------------+\n", - "| 0|FOREWORD\n", - "\n", - "Electro...|95.88622707790799|\n", + "| 0|> Confidential Cl...|84.30319298638238|\n", "+-------+--------------------+-----------------+\n", "\n" ] } ], "source": [ - "results=model.transform(images)" + "pipeline.transform(images) \\\n", + " .select(\"pagenum\",\"text\", \"confidence\") \\\n", + " .show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## save the fitted pipeline to disk" + "## Save the pipeline to disk" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "pycharm": { "name": "#%%\n" @@ -277,19 +330,19 @@ }, "outputs": [], "source": [ - "model.write().overwrite().save(\"ocr_model\")" + "pipeline.write().overwrite().save(\"ocr_model\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## save the unfit pipeline to disk" + "## Load back the model pipeline" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "pycharm": { "name": "#%%\n" @@ -297,27 +350,38 @@ }, "outputs": [], "source": [ - "pipeline.write().overwrite().save(\"unfit_ocr_model\")" + "stored_pipeline = PipelineModel.load(\"ocr_model\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## load back the model pipeline" + "## Run loaded pipeline" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "+-------+--------------------+-----------------+\n", + "|pagenum| text| confidence|\n", + "+-------+--------------------+-----------------+\n", + "| 0|> Confidential Cl...|84.30319298638238|\n", + "+-------+--------------------+-----------------+\n", + "\n" + ] } - }, - "outputs": [], + ], "source": [ - "sameModel = PipelineModel.load(\"ocr_model\")\n" + "stored_pipeline.transform(images) \\\n", + " .select(\"pagenum\",\"text\", \"confidence\") \\\n", + " .show()" ] } ],