Skip to content

Commit

Permalink
Merge branch 'master1' into 130-release-candidate
Browse files Browse the repository at this point in the history
  • Loading branch information
mykolamelnykml committed May 27, 2020
2 parents 289c296 + 8c58c40 commit cfe25a0
Show file tree
Hide file tree
Showing 10 changed files with 39 additions and 39 deletions.
8 changes: 4 additions & 4 deletions jupyter/SparkOCRGreyBackground.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -205,14 +205,14 @@
"remove_objects.setMaxSizeObject(1000)\n",
"remove_objects.setMinSizeObject(None)\n",
"\n",
"# Run OCR for each region\n",
"ocr_corrected = ImageToText()\n",
"# Run tesseract OCR for each region\n",
"ocr_corrected = TesseractOcr()\n",
"ocr_corrected.setInputCol(\"corrected_image\")\n",
"ocr_corrected.setOutputCol(\"text_corrected\")\n",
"ocr_corrected.setPositionsCol(\"positions_corrected\")\n",
"ocr_corrected.setConfidenceThreshold(75)\n",
"\n",
"ocr = ImageToText()\n",
"ocr = TesseractOcr()\n",
"ocr.setInputCol(\"image\")\n",
"ocr.setOutputCol(\"text\")\n",
"\n",
Expand Down Expand Up @@ -535,4 +535,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
4 changes: 2 additions & 2 deletions jupyter/SparkOCRS3AccesExample.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,8 @@
"binary_to_image = BinaryToImage()\n",
"binary_to_image.setOutputCol(\"image\")\n",
"\n",
"# Run OCR for each region\n",
"ocr = ImageToText()\n",
"# Run tesseract OCR for each region\n",
"ocr = TesseractOcr()\n",
"ocr.setInputCol(\"image\")\n",
"ocr.setOutputCol(\"text\")\n",
"ocr.setIgnoreResolution(False)\n",
Expand Down
6 changes: 3 additions & 3 deletions jupyter/SparkOCRremoveBackgroundNoise.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,8 @@
"remove_objects.setOutputCol(\"corrected_image\")\n",
"remove_objects.setMinSizeFont(30)\n",
"\n",
"# Run OCR for each region\n",
"ocr = ImageToText()\n",
"# Run tesseract OCR for each region\n",
"ocr = TesseractOcr()\n",
"ocr.setInputCol(\"corrected_image\")\n",
"ocr.setOutputCol(\"text\")\n",
"# Path to the tessdata related to the OS and version\n",
Expand Down Expand Up @@ -647,4 +647,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
16 changes: 8 additions & 8 deletions jupyter/SparkOCRremoveRackgroundNoiseAndDrawRegions.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -202,15 +202,15 @@
"draw_regions.setInputRegionsCol(\"region\")\n",
"draw_regions.setOutputCol(\"image_with_regions\")\n",
"\n",
"# Run OCR for corrected image\n",
"ocr_corrected = ImageToText()\n",
"# Run tesseract OCR for corrected image\n",
"ocr_corrected = TesseractOcr()\n",
"ocr_corrected.setInputCol(\"corrected_image\")\n",
"ocr_corrected.setOutputCol(\"corrected_text\")\n",
"ocr_corrected.setPositionsCol(\"corrected_positions\")\n",
"ocr_corrected.setConfidenceThreshold(65)\n",
"\n",
"# Run tesseract OCR for original image\n",
"ocr = ImageToText()\n",
"ocr = TesseractOcr()\n",
"ocr.setInputCol(\"image\")\n",
"ocr.setOutputCol(\"text\")\n",
"\n",
Expand Down Expand Up @@ -293,8 +293,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[31mFilename:\n",
"file:/Users/nmelnik/IdeaProjects/spark-ocr/workshop/jupyter/data/pdfs/noised.pdf , page: 0\u001B[0m\n",
"\u001b[31mFilename:\n",
"file:/Users/nmelnik/IdeaProjects/spark-ocr/workshop/jupyter/data/pdfs/noised.pdf , page: 0\u001b[0m\n",
"Recognized text:\n",
" \n",
"\n",
Expand Down Expand Up @@ -376,8 +376,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[31mFilename:\n",
"file:/Users/nmelnik/IdeaProjects/spark-ocr/workshop/jupyter/data/pdfs/noised.pdf , page: 0\u001B[0m\n",
"\u001b[31mFilename:\n",
"file:/Users/nmelnik/IdeaProjects/spark-ocr/workshop/jupyter/data/pdfs/noised.pdf , page: 0\u001b[0m\n",
"Recognized text:\n",
"° Date 7/16/68\n",
"Sanple No 5031\n",
Expand Down Expand Up @@ -583,4 +583,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
8 changes: 4 additions & 4 deletions jupyter/SparkOcrHttpSource.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,8 @@
" pdf_to_image.setInputCol(\"content\")\n",
" pdf_to_image.setOutputCol(\"image\")\n",
"\n",
" # Run OCR\n",
" ocr = ImageToText()\n",
" # Run tesseract OCR\n",
" ocr = TesseractOcr()\n",
" ocr.setInputCol(\"image\")\n",
" ocr.setOutputCol(\"text\")\n",
" ocr.setConfidenceThreshold(65)\n",
Expand Down Expand Up @@ -438,7 +438,7 @@
" binary_to_image.setOutputCol(\"image\")\n",
"\n",
" # Run tesseract OCR\n",
" ocr = ImageToText()\n",
" ocr = TesseractOcr()\n",
" ocr.setInputCol(\"image\")\n",
" ocr.setOutputCol(\"text\")\n",
" ocr.setConfidenceThreshold(65)\n",
Expand Down Expand Up @@ -606,4 +606,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
6 changes: 3 additions & 3 deletions jupyter/SparkOcrSimpleExample.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,8 @@
" pdf_to_image.setInputCol(\"content\")\n",
" pdf_to_image.setOutputCol(\"image\")\n",
"\n",
" # Run OCR\n",
" ocr = ImageToText()\n",
" # Run tesseract OCR\n",
" ocr = TesseractOcr()\n",
" ocr.setInputCol(\"image\")\n",
" ocr.setOutputCol(\"text\")\n",
" ocr.setConfidenceThreshold(65)\n",
Expand Down Expand Up @@ -543,4 +543,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
4 changes: 2 additions & 2 deletions jupyter/SparkOcrStoreResultsToPdfWithTextLayout.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@
" .setKeepInput(True)\n",
" \n",
" # Run OCR\n",
" ocr = ImageToText() \\\n",
" ocr = TesseractOcr() \\\n",
" .setInputCol(\"image\") \\\n",
" .setOutputCol(\"text\") \\\n",
" .setConfidenceThreshold(60) \\\n",
Expand Down Expand Up @@ -372,4 +372,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@
"Requirement already satisfied: kiwisolver>=1.0.1 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image==0.16.2->spark-ocr==1.2.0) (1.1.0)\n",
"Requirement already satisfied: six>=1.5 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from python-dateutil>=2.1->matplotlib!=3.0.0,>=2.0.0->scikit-image==0.16.2->spark-ocr==1.2.0) (1.14.0)\n",
"Building wheels for collected packages: spark-ocr\n",
" Building wheel for spark-ocr (setup.py) ... \u001B[?25ldone\n",
"\u001B[?25h Created wheel for spark-ocr: filename=spark_ocr-1.2.0-py3-none-any.whl size=5012116 sha256=b79c63e97b4235bbb3c7e061d6a42840bb1886c0351fa6e52262964bfe8333f3\n",
" Building wheel for spark-ocr (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for spark-ocr: filename=spark_ocr-1.2.0-py3-none-any.whl size=5012116 sha256=b79c63e97b4235bbb3c7e061d6a42840bb1886c0351fa6e52262964bfe8333f3\n",
" Stored in directory: /Users/nmelnik/Library/Caches/pip/wheels/8f/18/a8/6a746cb146272537dd3c50b17baa2711dab0a33acc5ed77549\n",
"Successfully built spark-ocr\n",
"Installing collected packages: spark-ocr\n",
Expand All @@ -112,7 +112,7 @@
],
"source": [
"# or install from local path\n",
"#%pip install --user ../../python/dist/spark-ocr-1.3.0rc1.tar.gz"
"%pip install --user ../../python/dist/spark-ocr-1.2.0.tar.gz"
]
},
{
Expand Down Expand Up @@ -227,7 +227,7 @@
" .setKeepInput(True)\n",
" \n",
" # Run OCR\n",
" ocr = ImageToText() \\\n",
" ocr = TesseractOcr() \\\n",
" .setInputCol(\"image\") \\\n",
" .setOutputCol(\"text\") \\\n",
" .setConfidenceThreshold(60) \\\n",
Expand Down Expand Up @@ -503,4 +503,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
10 changes: 5 additions & 5 deletions jupyter/SparkOcrStreamingPDF.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,8 @@
"pdf_to_image = PdfToImage()\n",
"pdf_to_image.setOutputCol(\"image\")\n",
"\n",
"# Run OCR for each region\n",
"ocr = ImageToText()\n",
"# Run tesseract OCR for each region\n",
"ocr = TesseractOcr()\n",
"ocr.setInputCol(\"image\")\n",
"ocr.setOutputCol(\"text\")\n",
"ocr.setConfidenceThreshold(60)\n",
Expand Down Expand Up @@ -264,7 +264,7 @@
}
],
"source": [
"# get progress of streaming job\n",
"# get progress of streaming job\n",
"query.lastProgress"
]
},
Expand All @@ -274,7 +274,7 @@
"metadata": {},
"outputs": [],
"source": [
"# need to run for stop streaming job\n",
"# need to run for stop streaming job\n",
"query.stop()"
]
},
Expand Down Expand Up @@ -489,4 +489,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
6 changes: 3 additions & 3 deletions jupyter/SparkOcrWithSkewCorrection.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
"outputs": [],
"source": [
"# or install from local path\n",
"# %pip install --user ../../python/dist/spark-ocr-1.3.0rc1.tar.gz"
"# %pip install --user ../../python/dist/spark-ocr-1.1.0rc1.tar.gz"
]
},
{
Expand Down Expand Up @@ -178,8 +178,8 @@
" skew_corrector.setOutputCol(\"corrected_image\")\n",
" skew_corrector.setAutomaticSkewCorrection(skew_correction)\n",
"\n",
" # Run OCR\n",
" ocr = ImageToText()\n",
" # Run tesseract OCR\n",
" ocr = TesseractOcr()\n",
" ocr.setInputCol(\"corrected_image\")\n",
" ocr.setOutputCol(\"text\")\n",
" \n",
Expand Down

0 comments on commit cfe25a0

Please sign in to comment.