From fc857fc34e22c212e2902d5f9e600c6fee2feddb Mon Sep 17 00:00:00 2001 From: Mykola Melnyk Date: Thu, 14 May 2020 14:28:25 +0300 Subject: [PATCH] Fixed notebooks --- jupyter/SparkOCRS3AccesExample.ipynb | 14 +- jupyter/SparkOCRWritePdfToS3.ipynb | 2 +- jupyter/SparkOcrSavedLoadedPipeline.ipynb | 210 ++++++++++++++-------- 3 files changed, 146 insertions(+), 80 deletions(-) diff --git a/jupyter/SparkOCRS3AccesExample.ipynb b/jupyter/SparkOCRS3AccesExample.ipynb index a6dffa6..0741166 100644 --- a/jupyter/SparkOCRS3AccesExample.ipynb +++ b/jupyter/SparkOCRS3AccesExample.ipynb @@ -75,7 +75,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "SparkConf Configured, Starting to listen on port: 50980\n", + "SparkConf Configured, Starting to listen on port: 50635\n", "JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar\n" ] }, @@ -89,7 +89,7 @@ "
\n", "

SparkContext

\n", "\n", - "

Spark UI

\n", + "

Spark UI

\n", "\n", "
\n", "
Version
\n", @@ -105,7 +105,7 @@ " " ], "text/plain": [ - "" + "" ] }, "execution_count": 4, @@ -124,6 +124,8 @@ "# you can set AWS API Keys to env variables \n", "# os.environ['AWS_ACCESS_KEY_ID'] = \"your key\"\n", "# os.environ['AWS_SECRET_ACCESS_KEY'] = \"your secret\"\n", + "os.environ['AWS_ACCESS_KEY_ID'] = \"AKIASRWSDKBGBUGEZJ3G\"\n", + "os.environ['AWS_SECRET_ACCESS_KEY'] = \"BEJUdxKC9H/Zn/qgDUV6ScAyGhPay68lMX9m096P\"\n", "\n", "# set additinal dependensies for read data from S3\n", "conf = SparkConf() \\\n", @@ -208,7 +210,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -238,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -904,4 +906,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/jupyter/SparkOCRWritePdfToS3.ipynb b/jupyter/SparkOCRWritePdfToS3.ipynb index 5528b0a..12944a4 100644 --- a/jupyter/SparkOCRWritePdfToS3.ipynb +++ b/jupyter/SparkOCRWritePdfToS3.ipynb @@ -897,7 +897,7 @@ } ], "source": [ - "results.write\n", + "results.write \\\n", " .format(\"binaryFormat\") \\\n", " .option(\"type\", \"pdf\") \\\n", " .option(\"field\", \"pdf\") \\\n", diff --git a/jupyter/SparkOcrSavedLoadedPipeline.ipynb b/jupyter/SparkOcrSavedLoadedPipeline.ipynb index 759fb1d..3ba1a66 100644 --- a/jupyter/SparkOcrSavedLoadedPipeline.ipynb +++ b/jupyter/SparkOcrSavedLoadedPipeline.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "pycharm": { "name": "#%%\n" @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -63,48 +63,80 @@ "%pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade" ] }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Initialization of spark session" - ] - }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing /Users/nmelnik/IdeaProjects/spark-ocr/python/dist/spark-ocr-1.3.0rc1.tar.gz\n", + "Requirement already satisfied: numpy==1.17.4 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from spark-ocr==1.3.0rc1) (1.17.4)\n", + "Requirement already satisfied: pillow==6.2.1 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from spark-ocr==1.3.0rc1) (6.2.1)\n", + "Requirement already satisfied: py4j==0.10.7 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from spark-ocr==1.3.0rc1) (0.10.7)\n", + "Requirement already satisfied: pyspark==2.4.4 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from spark-ocr==1.3.0rc1) (2.4.4)\n", + "Requirement already satisfied: python-levenshtein==0.12.0 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from spark-ocr==1.3.0rc1) (0.12.0)\n", + "Requirement already satisfied: scikit-image==0.16.2 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from spark-ocr==1.3.0rc1) (0.16.2)\n", + "Requirement already 
satisfied: implicits==1.0.2 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from spark-ocr==1.3.0rc1) (1.0.2)\n", + "Requirement already satisfied: setuptools in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from python-levenshtein==0.12.0->spark-ocr==1.3.0rc1) (46.0.0)\n", + "Requirement already satisfied: imageio>=2.3.0 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from scikit-image==0.16.2->spark-ocr==1.3.0rc1) (2.8.0)\n", + "Requirement already satisfied: networkx>=2.0 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from scikit-image==0.16.2->spark-ocr==1.3.0rc1) (2.4)\n", + "Requirement already satisfied: PyWavelets>=0.4.0 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from scikit-image==0.16.2->spark-ocr==1.3.0rc1) (1.1.1)\n", + "Requirement already satisfied: scipy>=0.19.0 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from scikit-image==0.16.2->spark-ocr==1.3.0rc1) (1.4.1)\n", + "Requirement already satisfied: matplotlib!=3.0.0,>=2.0.0 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from scikit-image==0.16.2->spark-ocr==1.3.0rc1) (3.2.0)\n", + "Requirement already satisfied: decorator>=4.3.0 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from networkx>=2.0->scikit-image==0.16.2->spark-ocr==1.3.0rc1) (4.4.2)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image==0.16.2->spark-ocr==1.3.0rc1) (1.1.0)\n", + "Requirement already satisfied: python-dateutil>=2.1 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image==0.16.2->spark-ocr==1.3.0rc1) (2.8.1)\n", + "Requirement already satisfied: cycler>=0.10 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image==0.16.2->spark-ocr==1.3.0rc1) (0.10.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image==0.16.2->spark-ocr==1.3.0rc1) (2.4.6)\n", + "Requirement already satisfied: six>=1.5 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from python-dateutil>=2.1->matplotlib!=3.0.0,>=2.0.0->scikit-image==0.16.2->spark-ocr==1.3.0rc1) (1.14.0)\n", + "Building wheels for collected packages: spark-ocr\n", + " Building wheel for spark-ocr (setup.py) ... 
\u001b[?25ldone\n", + "\u001b[?25h Created wheel for spark-ocr: filename=spark_ocr-1.3.0rc1-cp37-none-any.whl size=5015774 sha256=1892d1f304f47ac249a064d3f1ad114a83689b6a18c370f762fbe5f1908cdc63\n", + " Stored in directory: /Users/nmelnik/Library/Caches/pip/wheels/79/ee/b7/b1d6d10a6be137d65bd31f7d0159dcc1d704587c685a48fb4e\n", + "Successfully built spark-ocr\n", + "Installing collected packages: spark-ocr\n", + " Found existing installation: spark-ocr 1.3.0rc1\n", + " Uninstalling spark-ocr-1.3.0rc1:\n", + " Successfully uninstalled spark-ocr-1.3.0rc1\n", + "Successfully installed spark-ocr-1.3.0rc1\n", + "\u001b[33mWARNING: You are using pip version 19.3.1; however, version 20.1 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ - "from pyspark import SparkConf\n", - "from sparkocr import start\n", - "\n", - "if license:\n", - " os.environ['JSL_OCR_LICENSE'] = license\n", - " \n", - "conf = SparkConf() \\\n", - " .set(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:2.7.3\")\n", - "\n", - "spark = start(secret=secret, jar_path=spark_ocr_jar_path, extra_conf=conf)\n", - "spark" + "# or install from local path\n", + "# %pip install --user ../../python/dist/spark-ocr-1.3.0rc1.tar.gz" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ - "## Imports" + "## Initialization of spark session" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 6, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SparkConf Configured, Starting to listen on port: 54494\n", + "JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar\n" + ] + }, { "data": { "text/html": [ @@ -115,7 +147,7 @@ "
\n", "

SparkContext

\n", "\n", - "

Spark UI

\n", + "

Spark UI

\n", "\n", "
\n", "
Version
\n", @@ -131,14 +163,40 @@ " " ], "text/plain": [ - "" + "" ] }, - "execution_count": 55, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], + "source": [ + "from pyspark import SparkConf\n", + "from sparkocr import start\n", + "\n", + "if license:\n", + " os.environ['JSL_OCR_LICENSE'] = license\n", + " \n", + "conf = SparkConf() \\\n", + " .set(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:2.7.3\")\n", + "\n", + "spark = start(secret=secret, jar_path=spark_ocr_jar_path, extra_conf=conf)\n", + "spark" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], "source": [ "from pyspark.ml import PipelineModel\n", "\n", @@ -154,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -170,9 +228,20 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "images = spark.read.format(\"binaryFile\").load(images_path).cache()\n", "images.count()" @@ -187,7 +256,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -209,22 +278,6 @@ "])" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Fit the pipeline to training images." - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [], - "source": [ - "model = pipeline.fit(images)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -234,7 +287,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 11, "metadata": { "pycharm": { "name": "#%%\n" @@ -248,28 +301,28 @@ "+-------+--------------------+-----------------+\n", "|pagenum| text| confidence|\n", "+-------+--------------------+-----------------+\n", - "| 0|FOREWORD\n", - "\n", - "Electro...|95.88622707790799|\n", + "| 0|> Confidential Cl...|84.30319298638238|\n", "+-------+--------------------+-----------------+\n", "\n" ] } ], "source": [ - "results=model.transform(images)" + "pipeline.transform(images) \\\n", + " .select(\"pagenum\",\"text\", \"confidence\") \\\n", + " .show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## save the fitted pipeline to disk" + "## Save the pipeline to disk" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "pycharm": { "name": "#%%\n" @@ -277,19 +330,19 @@ }, "outputs": [], "source": [ - "model.write().overwrite().save(\"ocr_model\")" + "pipeline.write().overwrite().save(\"ocr_model\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## save the unfit pipeline to disk" + "## Load back the model pipeline" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "pycharm": { "name": "#%%\n" @@ -297,27 +350,38 @@ }, "outputs": [], "source": [ - "pipeline.write().overwrite().save(\"unfit_ocr_model\")" + "stored_pipeline = PipelineModel.load(\"ocr_model\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## load back the model pipeline" + "## Run loaded pipeline" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "+-------+--------------------+-----------------+\n", + "|pagenum| text| confidence|\n", + "+-------+--------------------+-----------------+\n", + "| 0|> Confidential Cl...|84.30319298638238|\n", + "+-------+--------------------+-----------------+\n", + "\n" + ] } - }, - "outputs": [], + ], "source": [ - "sameModel = PipelineModel.load(\"ocr_model\")\n" + "stored_pipeline.transform(images) \\\n", + " .select(\"pagenum\",\"text\", \"confidence\") \\\n", + " .show()" ] } ],