Updated notebooks

JohnSnowLabs · May 11, 2020 · 28ca8de · 28ca8de
1 parent c2ed3d3
commit 28ca8de
Show file tree

Hide file tree

Showing 4 changed files with 59 additions and 69 deletions.
diff --git a/jupyter/SparkOCRWriteImageToS3.ipynb b/jupyter/SparkOCRWriteImageToS3.ipynb
@@ -217,8 +217,8 @@
     "binary_to_image.setInputCol(\"content\")\n",
     "binary_to_image.setOutputCol(\"image\")\n",
     "\n",
-    "# Run tesseract OCR for each region\n",
-    "ocr = TesseractOcr()\n",
+    "# Run OCR for each region\n",
+    "ocr = ImageToText()\n",
     "ocr.setInputCol(\"image\")\n",
     "ocr.setOutputCol(\"text\")\n",
     "ocr.setIgnoreResolution(False)\n",
@@ -880,8 +880,13 @@
     }
    ],
    "source": [
-    "results.write.format(\"binaryFormat\").option(\"type\", \"text\").option(\"field\", \"text\")\\\n",
-    "  .option(\"extension\", \"txt\").mode(\"overwrite\").save(\"s3a://dev.johnsnowlabs.com/ocr/datasets/output/texts/\")"
+    "results.write \\\n",
+    "  .format(\"binaryFormat\") \\\n",
+    "  .option(\"type\", \"text\") \\\n",
+    "  .option(\"field\", \"text\") \\\n",
+    "  .option(\"extension\", \"txt\") \\\n",
+    "  .mode(\"overwrite\") \\\n",
+    "  .save(\"s3a://dev.johnsnowlabs.com/ocr/datasets/output/texts/\")"
    ]
   }
  ],
@@ -906,13 +911,13 @@
   "pycharm": {
    "stem_cell": {
     "cell_type": "raw",
-    "source": [],
     "metadata": {
      "collapsed": false
-    }
+    },
+    "source": []
    }
   }
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
diff --git a/jupyter/SparkOCRWritePdfToS3.ipynb b/jupyter/SparkOCRWritePdfToS3.ipynb
@@ -173,12 +173,10 @@
   },
   {
    "cell_type": "markdown",
+   "metadata": {},
    "source": [
     "## Read pdf objects"
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
   },
   {
    "cell_type": "code",
@@ -225,8 +223,8 @@
     "            .setInputCol(\"image_raw\") \\\n",
     "            .setOutputCol(\"image\") \\\n",
     "            .setThreshold(130)\n",
-    "# Run tesseract OCR for each region\n",
-    "ocr = TesseractOcr() \\\n",
+    "# Run OCR for each region\n",
+    "ocr = ImageToText() \\\n",
     "            .setInputCol(\"image\") \\\n",
     "            .setOutputCol(\"text\") \\\n",
     "            .setIgnoreResolution(False) \\\n",
@@ -899,8 +897,13 @@
     }
    ],
    "source": [
-    "results.write.format(\"binaryFormat\").option(\"type\", \"pdf\").option(\"field\", \"pdf\")\\\n",
-    "  .option(\"extension\", \"pdf\").mode(\"overwrite\").save(\"s3a://dev.johnsnowlabs.com/ocr/datasets/output/pdfs/\")"
+    "results.write \\\n",
+    "  .format(\"binaryFormat\") \\\n",
+    "  .option(\"type\", \"pdf\") \\\n",
+    "  .option(\"field\", \"pdf\") \\\n",
+    "  .option(\"extension\", \"pdf\") \\\n",
+    "  .mode(\"overwrite\") \\\n",
+    "  .save(\"s3a://dev.johnsnowlabs.com/ocr/datasets/output/pdfs/\")"
    ]
   }
  ],
@@ -925,13 +928,13 @@
   "pycharm": {
    "stem_cell": {
     "cell_type": "raw",
-    "source": [],
     "metadata": {
      "collapsed": false
-    }
+    },
+    "source": []
    }
   }
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
diff --git a/jupyter/SparkOcrSavedLoadedPipeline.ipynb b/jupyter/SparkOcrSavedLoadedPipeline.ipynb
@@ -4,25 +4,25 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Save  Images Objects to S3 using  Spark OCR\n",
+    "# Save/Load Spark OCR pipeline\n",
     "## Initialize spark session"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
     }
    },
+   "outputs": [],
    "source": [
     "secret = \"\"\n",
     "license = \"\"\n",
     "version = secret.split(\"-\")[0]\n",
     "spark_ocr_jar_path = \"../../target/scala-2.11\""
-   ],
-   "execution_count": null,
-   "outputs": []
+   ]
   },
   {
    "cell_type": "code",
@@ -80,23 +80,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "\n",
     "from pyspark import SparkConf\n",
     "from sparkocr import start\n",
     "\n",
     "if license:\n",
     "    os.environ['JSL_OCR_LICENSE'] = license\n",
     "    \n",
-    "# you can set AWS API Keys to env variables  \n",
-    "# os.environ['AWS_ACCESS_KEY_ID'] = \"your key\"\n",
-    "# os.environ['AWS_SECRET_ACCESS_KEY'] = \"your secret\"\n",
-    "\n",
-    "# set additinal dependensies for read data from S3\n",
     "conf = SparkConf() \\\n",
     "    .set(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:2.7.3\")\n",
-    "# or you can set AWS API Keys here\n",
-    "#    .set('spark.hadoop.fs.s3a.access.key', \"your key\" ) \\\n",
-    "#    .set('spark.hadoop.fs.s3a.secret.key', \"your secret\")\n",
     "\n",
     "spark = start(secret=secret, jar_path=spark_ocr_jar_path, extra_conf=conf)\n",
     "spark"
@@ -205,8 +196,8 @@
     "binary_to_image.setInputCol(\"content\")\n",
     "binary_to_image.setOutputCol(\"image\")\n",
     "\n",
-    "# Run tesseract OCR for each region\n",
-    "ocr = TesseractOcr()\n",
+    "# Run OCR for each region\n",
+    "ocr = ImageToText()\n",
     "ocr.setInputCol(\"image\")\n",
     "ocr.setOutputCol(\"text\")\n",
     "ocr.setIgnoreResolution(False)\n",
@@ -271,72 +262,63 @@
   },
   {
    "cell_type": "markdown",
+   "metadata": {},
    "source": [
     "## save the fitted pipeline to disk"
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "outputs": [],
-   "source": [
-    "model.write().overwrite().save(\"ocr_model\")"
-   ],
    "metadata": {
-    "collapsed": false,
     "pycharm": {
      "name": "#%%\n"
     }
-   }
+   },
+   "outputs": [],
+   "source": [
+    "model.write().overwrite().save(\"ocr_model\")"
+   ]
   },
   {
    "cell_type": "markdown",
+   "metadata": {},
    "source": [
     "## save the unfit  pipeline to disk"
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "outputs": [],
-   "source": [
-    "pipeline.write().overwrite().save(\"unfit_ocr_model\")"
-   ],
    "metadata": {
-    "collapsed": false,
     "pycharm": {
      "name": "#%%\n"
     }
-   }
+   },
+   "outputs": [],
+   "source": [
+    "pipeline.write().overwrite().save(\"unfit_ocr_model\")"
+   ]
   },
   {
    "cell_type": "markdown",
+   "metadata": {},
    "source": [
     "## load back the model pipeline"
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "outputs": [],
-   "source": [
-    "sameModel = PipelineModel.load(\"ocr_model\")\n"
-   ],
    "metadata": {
-    "collapsed": false,
     "pycharm": {
      "name": "#%%\n"
     }
-   }
+   },
+   "outputs": [],
+   "source": [
+    "sameModel = PipelineModel.load(\"ocr_model\")\n"
+   ]
   }
  ],
  "metadata": {
@@ -355,18 +337,18 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.6"
+   "version": "3.7.7"
   },
   "pycharm": {
    "stem_cell": {
     "cell_type": "raw",
-    "source": [],
     "metadata": {
      "collapsed": false
-    }
+    },
+    "source": []
    }
   }
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
diff --git a/jupyter/SparkOcrUpdateTextPosition.ipynb b/jupyter/SparkOcrUpdateTextPosition.ipynb
@@ -226,7 +226,7 @@
     "            .setOutputCol(\"image\") \\\n",
     "            .setThreshold(130)\n",
     "\n",
-    "        ocr = TesseractOcr() \\\n",
+    "        ocr = ImageToText() \\\n",
     "            .setInputCol(\"image\") \\\n",
     "            .setOutputCol(\"text\") \\\n",
     "            .setIgnoreResolution(False) \\\n",
@@ -336,4 +336,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}