GH140_Configure_auto_build_spark
Adapt notebook to use a spark-nlp version compatible with Spark 2.3

Spark NLP    Apache Spark 2.3.x    Apache Spark 2.4.x
2.5.5        YES                   YES

Upgrade spark-nlp-jsl to 2.5.5
546075910 committed Aug 12, 2020
1 parent 82db63f commit 1bf7bf2
Showing 1 changed file with 24 additions and 43 deletions.
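
For context, the substance of the change is the version pinning visible in the diff below: the install cell and the start() call are moved to the same spark-nlp version. A minimal sketch of the resulting setup cells, assuming the notebook's start() helper comes from the spark-ocr package and that secret, version, and spark_ocr_jar_path are defined in earlier cells:

    # Install a spark-nlp release that matches the Spark runtime in use, plus
    # spark-ocr from John Snow Labs' licensed PyPI index (requires a secret).
    %pip install spark-nlp==2.4.5
    %pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

    # Start the Spark session; nlp_version should match the installed spark-nlp
    # package so the JVM jars and the Python bindings stay in sync.
    spark = start(secret=secret, jar_path=spark_ocr_jar_path, nlp_version="2.4.5")

Both the %pip install line and the start() call carry the same version string, so the diff updates them together.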
67 changes: 24 additions & 43 deletions jupyter/SparkOcrUpdateTextPosition.ipynb
@@ -24,7 +24,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -36,7 +36,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -65,23 +65,12 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: spark-nlp==2.5.5 in /usr/local/lib/python3.7/site-packages (2.5.5)\n",
"\u001B[33mWARNING: You are using pip version 19.3.1; however, version 20.2.1 is available.\n",
"You should consider upgrading via the 'pip install --upgrade pip' command.\u001B[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"outputs": [],
"source": [
"# install from PYPI using secret\n",
"%pip install spark-nlp==2.5.5\n",
"%pip install spark-nlp==2.4.5\n",
"%pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade"
]
},
@@ -104,14 +93,14 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SparkConf Configured, Starting to listen on port: 53378\n",
"SparkConf Configured, Starting to listen on port: 59744\n",
"JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar\n"
]
},
@@ -125,11 +114,11 @@
" <div>\n",
" <p><b>SparkContext</b></p>\n",
"\n",
" <p><a href=\"http://kolia-mbp.dlink:4041\">Spark UI</a></p>\n",
" <p><a href=\"http://melnyks-mbp:4043\">Spark UI</a></p>\n",
"\n",
" <dl>\n",
" <dt>Version</dt>\n",
" <dd><code>v2.3.2</code></dd>\n",
" <dd><code>v2.4.4</code></dd>\n",
" <dt>Master</dt>\n",
" <dd><code>local[*]</code></dd>\n",
" <dt>AppName</dt>\n",
@@ -141,10 +130,10 @@
" "
],
"text/plain": [
"<pyspark.sql.session.SparkSession at 0x1195bb510>"
"<pyspark.sql.session.SparkSession at 0x10c27d2d0>"
]
},
"execution_count": 13,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -155,17 +144,18 @@
"if license:\n",
" os.environ['JSL_OCR_LICENSE'] = license\n",
"\n",
"spark = start(secret=secret, jar_path=spark_ocr_jar_path, nlp_version=\"2.5.5\")\n",
"spark = start(secret=secret, jar_path=spark_ocr_jar_path, nlp_version=\"2.4.5\")\n",
"spark"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.ml import Pipeline\n",
"from pyspark.ml import PipelineModel\n",
"from sparkocr.transformers import *\n",
"from sparknlp.annotator import *\n",
"from sparknlp.base import *\n",
@@ -181,7 +171,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -204,7 +194,7 @@
" .setOutputCol(\"spell\")\n",
" \n",
" tokenAssem = TokenAssembler() \\\n",
" .setInputCols([\"spell\", \"document\"]) \\\n",
" .setInputCols(\"spell\") \\\n",
" .setOutputCol(\"newDocs\")\n",
"\n",
" updatedText = UpdateTextPosition() \\\n",
@@ -223,6 +213,7 @@
" \n",
" return pipeline\n",
"\n",
"\n",
"def ocr_pipeline():\n",
" # Transforrm PDF document to images per page\n",
" pdf_to_image = PdfToImage() \\\n",
@@ -259,7 +250,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -277,19 +268,9 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"spellcheck_norvig download started this may take some time.\n",
"Approximate size to download 4.2 MB\n",
"[OK!]\n"
]
}
],
"outputs": [],
"source": [
"ocr_result = ocr_pipeline().fit(pdf_example_df).transform(pdf_example_df)\n",
"updated_result= update_text_pipeline().fit(ocr_result).transform(ocr_result)\n",
@@ -309,7 +290,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 9,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -319,10 +300,10 @@
{
"data": {
"text/plain": [
"1671"
"72914"
]
},
"execution_count": 21,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}

1 comment on commit 1bf7bf2

@review-notebook-app
Review Jupyter notebook diffs for this commit on ReviewNB. You can open a pull request to discuss changes and offer feedback.

Powered by ReviewNB