Skip to content

Commit

Permalink
Updated notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
mykolamelnykml committed May 11, 2020
1 parent c2ed3d3 commit 28ca8de
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 69 deletions.
19 changes: 12 additions & 7 deletions jupyter/SparkOCRWriteImageToS3.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -217,8 +217,8 @@
"binary_to_image.setInputCol(\"content\")\n",
"binary_to_image.setOutputCol(\"image\")\n",
"\n",
"# Run tesseract OCR for each region\n",
"ocr = TesseractOcr()\n",
"# Run OCR for each region\n",
"ocr = ImageToText()\n",
"ocr.setInputCol(\"image\")\n",
"ocr.setOutputCol(\"text\")\n",
"ocr.setIgnoreResolution(False)\n",
Expand Down Expand Up @@ -880,8 +880,13 @@
}
],
"source": [
"results.write.format(\"binaryFormat\").option(\"type\", \"text\").option(\"field\", \"text\")\\\n",
" .option(\"extension\", \"txt\").mode(\"overwrite\").save(\"s3a://dev.johnsnowlabs.com/ocr/datasets/output/texts/\")"
"results.write \\\n",
" .format(\"binaryFormat\") \\\n",
" .option(\"type\", \"text\") \\\n",
" .option(\"field\", \"text\") \\\n",
" .option(\"extension\", \"txt\") \\\n",
" .mode(\"overwrite\") \\\n",
" .save(\"s3a://dev.johnsnowlabs.com/ocr/datasets/output/texts/\")"
]
}
],
Expand All @@ -906,13 +911,13 @@
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}
25 changes: 14 additions & 11 deletions jupyter/SparkOCRWritePdfToS3.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -173,12 +173,10 @@
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read pdf objects"
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
Expand Down Expand Up @@ -225,8 +223,8 @@
" .setInputCol(\"image_raw\") \\\n",
" .setOutputCol(\"image\") \\\n",
" .setThreshold(130)\n",
"# Run tesseract OCR for each region\n",
"ocr = TesseractOcr() \\\n",
"# Run OCR for each region\n",
"ocr = ImageToText() \\\n",
" .setInputCol(\"image\") \\\n",
" .setOutputCol(\"text\") \\\n",
" .setIgnoreResolution(False) \\\n",
Expand Down Expand Up @@ -899,8 +897,13 @@
}
],
"source": [
"results.write.format(\"binaryFormat\").option(\"type\", \"pdf\").option(\"field\", \"pdf\")\\\n",
" .option(\"extension\", \"pdf\").mode(\"overwrite\").save(\"s3a://dev.johnsnowlabs.com/ocr/datasets/output/pdfs/\")"
"results.write \\\n",
" .format(\"binaryFormat\") \\\n",
" .option(\"type\", \"pdf\") \\\n",
" .option(\"field\", \"pdf\") \\\n",
" .option(\"extension\", \"pdf\") \\\n",
" .mode(\"overwrite\") \\\n",
" .save(\"s3a://dev.johnsnowlabs.com/ocr/datasets/output/pdfs/\")"
]
}
],
Expand All @@ -925,13 +928,13 @@
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}
80 changes: 31 additions & 49 deletions jupyter/SparkOcrSavedLoadedPipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,25 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Save Images Objects to S3 using Spark OCR\n",
"# Save/Load Spark OCR pipeline\n",
"## Initialize spark session"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"secret = \"\"\n",
"license = \"\"\n",
"version = secret.split(\"-\")[0]\n",
"spark_ocr_jar_path = \"../../target/scala-2.11\""
],
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
Expand Down Expand Up @@ -80,23 +80,14 @@
"metadata": {},
"outputs": [],
"source": [
"\n",
"from pyspark import SparkConf\n",
"from sparkocr import start\n",
"\n",
"if license:\n",
" os.environ['JSL_OCR_LICENSE'] = license\n",
" \n",
"# you can set AWS API Keys to env variables \n",
"# os.environ['AWS_ACCESS_KEY_ID'] = \"your key\"\n",
"# os.environ['AWS_SECRET_ACCESS_KEY'] = \"your secret\"\n",
"\n",
"# set additional dependencies for reading data from S3\n",
"conf = SparkConf() \\\n",
" .set(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:2.7.3\")\n",
"# or you can set AWS API Keys here\n",
"# .set('spark.hadoop.fs.s3a.access.key', \"your key\" ) \\\n",
"# .set('spark.hadoop.fs.s3a.secret.key', \"your secret\")\n",
"\n",
"spark = start(secret=secret, jar_path=spark_ocr_jar_path, extra_conf=conf)\n",
"spark"
Expand Down Expand Up @@ -205,8 +196,8 @@
"binary_to_image.setInputCol(\"content\")\n",
"binary_to_image.setOutputCol(\"image\")\n",
"\n",
"# Run tesseract OCR for each region\n",
"ocr = TesseractOcr()\n",
"# Run OCR for each region\n",
"ocr = ImageToText()\n",
"ocr.setInputCol(\"image\")\n",
"ocr.setOutputCol(\"text\")\n",
"ocr.setIgnoreResolution(False)\n",
Expand Down Expand Up @@ -271,72 +262,63 @@
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## save the fitted pipeline to disk"
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"model.write().overwrite().save(\"ocr_model\")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
"outputs": [],
"source": [
"model.write().overwrite().save(\"ocr_model\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## save the unfit pipeline to disk"
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"pipeline.write().overwrite().save(\"unfit_ocr_model\")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
"outputs": [],
"source": [
"pipeline.write().overwrite().save(\"unfit_ocr_model\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## load back the model pipeline"
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"sameModel = PipelineModel.load(\"ocr_model\")\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
"outputs": [],
"source": [
"sameModel = PipelineModel.load(\"ocr_model\")\n"
]
}
],
"metadata": {
Expand All @@ -355,18 +337,18 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.7.7"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}
4 changes: 2 additions & 2 deletions jupyter/SparkOcrUpdateTextPosition.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@
" .setOutputCol(\"image\") \\\n",
" .setThreshold(130)\n",
"\n",
" ocr = TesseractOcr() \\\n",
" ocr = ImageToText() \\\n",
" .setInputCol(\"image\") \\\n",
" .setOutputCol(\"text\") \\\n",
" .setIgnoreResolution(False) \\\n",
Expand Down Expand Up @@ -336,4 +336,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

0 comments on commit 28ca8de

Please sign in to comment.