FEATURE MYJSL-370: Fixed tests on johnsnowlabs repo #416

Draft · wants to merge 1 commit into base: feature/KshitizGIT-InstallCompatibleSecrets-MYJSL-369
4 changes: 3 additions & 1 deletion johnsnowlabs/auto_install/health_checks/report.py
@@ -7,7 +7,8 @@
 from johnsnowlabs.auto_install.softwares import Software
 from johnsnowlabs.py_models.jsl_secrets import LicenseInfos
 from johnsnowlabs.utils.enums import ProductName
-from johnsnowlabs.utils.my_jsl_api import get_access_key_from_browser, get_user_licenses
+from johnsnowlabs.utils.my_jsl_api import (get_access_key_from_browser,
+                                           get_user_licenses)


 def check_health(check_install=True):
@@ -49,6 +50,7 @@ def check_health(check_install=True):
         if health_check:
             health_check[product] = product.health_check()

+    return install_status

 def list_remote_licenses():
     access_token = get_access_key_from_browser()
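
The behavioral fix in this file is that check_health() now returns its install_status mapping instead of discarding it. A minimal sketch of consuming that return value, assuming, as the new tests below do, that it is a dict keyed by Software subclasses:

from johnsnowlabs import nlp
from johnsnowlabs.auto_install.softwares import SparkHcSoftware, SparkNlpSoftware

# check_health() returns the install-status mapping after this PR; keying by
# Software subclasses follows the assertions in the tests added below.
installed = nlp.check_health()
if not installed[SparkNlpSoftware]:
    raise RuntimeError("Spark NLP is missing from this environment")
print("Healthcare NLP installed:", installed[SparkHcSoftware])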
44 changes: 44 additions & 0 deletions tests/installations/test_auto_install.py
@@ -0,0 +1,44 @@
import os
import shutil
import unittest

from johnsnowlabs import nlp, settings
from johnsnowlabs.auto_install.softwares import (Software, SparkHcSoftware,
                                                 SparkNlpSoftware,
                                                 SparkOcrSoftware)
from johnsnowlabs.utils.enums import ProductName
from johnsnowlabs.utils.venv_utils import VenvWrapper


class AutoInstallationTestCases(unittest.TestCase):
    def setUp(self) -> None:
        shutil.rmtree(settings.root_dir, ignore_errors=True)
        import pip

        for product in ProductName:
            software = Software.for_name(product)
            if software and software.pypi_name:
                pip.main(["uninstall", "-y", software.pypi_name])

    def test_only_spark_nlp_should_be_installed_if_secrets_are_empty(self):
        nlp.install(browser_login=False)
        installed_products = nlp.check_health()

        self.assertTrue(installed_products[SparkNlpSoftware])
        self.assertFalse(installed_products[SparkHcSoftware])
        self.assertFalse(installed_products[SparkOcrSoftware])

    def test_spark_hc_is_installed_if_license_provided(self):
        nlp.install(med_license=os.environ.get("VALID_LICENSE"))
        installed_products = nlp.check_health()

        self.assertTrue(installed_products[SparkNlpSoftware])
        self.assertTrue(installed_products[SparkHcSoftware])
        self.assertFalse(installed_products[SparkOcrSoftware])

    def test_spark_ocr_is_installed_if_visual_is_true(self):
        nlp.install(med_license=os.environ.get("VALID_LICENSE"), visual=True)
        installed_products = nlp.check_health()

        self.assertTrue(installed_products[SparkNlpSoftware])
        self.assertTrue(installed_products[SparkHcSoftware])
        self.assertTrue(installed_products[SparkOcrSoftware])
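
A minimal sketch for running this new suite locally, assuming the repo root as working directory and a real license exported as VALID_LICENSE (both assumptions, not part of the diff):

import unittest

# Discover the installation tests added in this PR and run them verbosely;
# unittest's default pattern test*.py matches test_auto_install.py.
suite = unittest.defaultTestLoader.discover("tests/installations")
unittest.TextTestRunner(verbosity=2).run(suite)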
44 changes: 44 additions & 0 deletions tests/sessions/cross_libs.py
@@ -0,0 +1,44 @@
import os
import sys
import unittest

from johnsnowlabs import nlp
from johnsnowlabs.auto_install.softwares import (SparkHcSoftware,
                                                 SparkNlpSoftware,
                                                 SparkOcrSoftware)
from tests.utils import clear_installed_jsl_installation, get_cross_lib_pipe

os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable


def setUpModule():
    nlp.install(browser_login=False, spark_nlp=True, nlp=True, visual=True,
                med_license=os.environ.get("VALID_LICENSE"),
                ocr_license=os.environ.get("VALID_LICENSE"),
                aws_access_key="",
                aws_key_id="")


def tearDownModule():
    clear_installed_jsl_installation()


class InstallationTestCase(unittest.TestCase):
    def test_all_libs_are_installed(self):
        installed_products = nlp.check_health()
        self.assertTrue(installed_products[SparkNlpSoftware])
        self.assertTrue(installed_products[SparkHcSoftware])
        self.assertTrue(installed_products[SparkOcrSoftware])


class SparkSessionTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.spark = nlp.start(visual=True)

    def test_simple_cross_library_session(self):
        import pkg_resources

        doc_example = pkg_resources.resource_filename(
            "sparkocr", "resources/ocr/docs/doc2.docx"
        )
        df = self.spark.read.format("binaryFile").load(doc_example).cache()
        get_cross_lib_pipe().fit(df).transform(df).show()
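
clear_installed_jsl_installation and get_cross_lib_pipe live in tests/utils.py, which is not part of this diff. A hypothetical sketch of what the cleanup helper likely does, mirroring AutoInstallationTestCases.setUp above (the real implementation may differ):

import shutil

import pip
from johnsnowlabs import settings
from johnsnowlabs.auto_install.softwares import Software
from johnsnowlabs.utils.enums import ProductName


def clear_installed_jsl_installation():
    # Hypothetical reconstruction: wipe the johnsnowlabs home directory and
    # uninstall every John Snow Labs pypi package, as setUp does above.
    shutil.rmtree(settings.root_dir, ignore_errors=True)
    for product in ProductName:
        software = Software.for_name(product)
        if software and software.pypi_name:
            pip.main(["uninstall", "-y", software.pypi_name])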
61 changes: 61 additions & 0 deletions tests/sessions/healthcare_lib.py
@@ -0,0 +1,61 @@
import os
import sys
import unittest

from johnsnowlabs import medical, nlp
from johnsnowlabs.auto_install.softwares import (SparkHcSoftware,
                                                 SparkNlpSoftware,
                                                 SparkOcrSoftware)
from tests.utils import (clear_installed_jsl_installation,
                         get_finance_pipeline, get_legal_pipeline)

os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable


def setUpModule():
    nlp.install(browser_login=False, spark_nlp=True, nlp=True, visual=False,
                med_license=os.environ.get("VALID_LICENSE"),
                aws_access_key="",
                aws_key_id="")


def tearDownModule():
    clear_installed_jsl_installation()


class InstallationTestCase(unittest.TestCase):
    def test_spark_nlp_jsl_is_installed(self):
        installed_products = nlp.check_health()
        self.assertTrue(installed_products[SparkNlpSoftware])
        self.assertTrue(installed_products[SparkHcSoftware])
        self.assertFalse(installed_products[SparkOcrSoftware])


class SparkSessionTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.spark = nlp.start()

    def test_healthcare_session(self):
        print("Testing Healthcare session ...")
        d = nlp.DocumentAssembler().setInputCol("text").setOutputCol("doc")
        t = nlp.Tokenizer().setInputCols("doc").setOutputCol("tok")
        c = (
            medical.BertForTokenClassification()
            .pretrained()
            .setInputCols(["tok", "doc"])
            .setOutputCol("class")
        )
        p = nlp.Pipeline(stages=[d, t, c])
        p = nlp.to_nlu_pipe(p)
        print(p.predict("Hello from John Snow Labs"))

    def test_finance_session(self):
        print("Testing Finance session ...")
        nlp.Pipeline(get_finance_pipeline()).fullAnnotate("unit")

    def test_legal_session(self):
        print("Testing Legal session ...")
        nlp.Pipeline(get_legal_pipeline()).fullAnnotate("Shwrm")
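
These session modules pass os.environ.get("VALID_LICENSE") straight into nlp.install, which silently becomes None when the variable is unset. A minimal sketch of guarding against that in setUpModule; the guard relies on standard unittest behavior and is not part of this PR:

import os
import unittest


def setUpModule():
    # A SkipTest raised in setUpModule marks every test in the module as
    # skipped, instead of failing later inside nlp.install with a None license.
    if not os.environ.get("VALID_LICENSE"):
        raise unittest.SkipTest("VALID_LICENSE not set")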
64 changes: 64 additions & 0 deletions tests/sessions/spark_nlp_lib.py
@@ -0,0 +1,64 @@
import os
import sys
import unittest

from johnsnowlabs import nlp
from johnsnowlabs.auto_install.softwares import (SparkHcSoftware,
                                                 SparkNlpSoftware,
                                                 SparkOcrSoftware)
from tests.utils import clear_installed_jsl_installation

os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable


def setUpModule():
    nlp.install(browser_login=False, spark_nlp=True, nlp=False, visual=False)


def tearDownModule():
    clear_installed_jsl_installation()


class InstallationTestCase(unittest.TestCase):
    def test_only_spark_nlp_should_be_installed_if_secrets_are_empty(self):
        installed_products = nlp.check_health()

        self.assertTrue(installed_products[SparkNlpSoftware])
        self.assertFalse(installed_products[SparkHcSoftware])
        self.assertFalse(installed_products[SparkOcrSoftware])


class SparkSessionTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.spark = nlp.start()

    def test_sparknlp_session(self):
        print("Start test_sparknlp_session")
        d = nlp.DocumentAssembler().setInputCol("text").setOutputCol("doc")
        t = nlp.Tokenizer().setInputCols("doc").setOutputCol("tok")
        c = (
            nlp.DeBertaForTokenClassification()
            .pretrained()
            .setInputCols(["tok", "doc"])
            .setOutputCol("class")
        )
        p = nlp.Pipeline(stages=[d, t, c])
        p = nlp.to_nlu_pipe(p)
        print(p.predict("Hello World"))

    def test_sparknlp_gpu_session(self):
        print("Start test_sparknlp_gpu_session")
        self.spark = nlp.start(hardware_target="gpu")
        d = nlp.DocumentAssembler().setInputCol("text").setOutputCol("doc")
        t = nlp.Tokenizer().setInputCols("doc").setOutputCol("tok")
        c = (
            nlp.DeBertaForTokenClassification()
            .pretrained()
            .setInputCols(["tok", "doc"])
            .setOutputCol("class")
        )
        p = nlp.Pipeline(stages=[d, t, c])
        p = nlp.to_nlu_pipe(p)
        print(p.predict("Hello from John Snow Labs"))
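
One caveat for the gpu test above: within a single Python process Spark usually hands back the already-running session, so the hardware_target="gpu" request may silently reuse the CPU session created in setUpClass. A minimal sketch of forcing a fresh session first (how nlp.start builds sessions internally is an assumption):

from pyspark.sql import SparkSession

# Stop any active session so the next nlp.start() call builds a new one;
# getActiveSession() is available from PySpark 3.0 onwards.
existing = SparkSession.getActiveSession()
if existing is not None:
    existing.stop()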

98 changes: 98 additions & 0 deletions tests/sessions/visual_lib.py
@@ -0,0 +1,98 @@
import os
import sys
import unittest

from johnsnowlabs import nlp, visual
from johnsnowlabs.auto_install.softwares import (SparkHcSoftware,
                                                 SparkNlpSoftware,
                                                 SparkOcrSoftware)
from tests.utils import (clear_installed_jsl_installation,
                         get_finance_pipeline, get_legal_pipeline)

os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable


def setUpModule():
    nlp.install(browser_login=False, spark_nlp=True, nlp=False, visual=True,
                ocr_license=os.environ.get("VALID_LICENSE"),
                aws_key_id="",
                aws_access_key="")


def tearDownModule():
    clear_installed_jsl_installation()


class InstallationTestCase(unittest.TestCase):
    def test_spark_ocr_is_installed(self):
        installed_products = nlp.check_health()
        self.assertTrue(installed_products[SparkNlpSoftware])
        self.assertFalse(installed_products[SparkHcSoftware])
        self.assertTrue(installed_products[SparkOcrSoftware])


class SparkSessionTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.spark = nlp.start(visual=True)

    def test_ocr_session(self):
        print("Testing OCR session ...")
        pdf_to_image = visual.PdfToImage()
        pdf_to_image.setImageType(visual.ImageType.TYPE_3BYTE_BGR)

        # Detect tables on the page using a pretrained model.
        # It can be fine-tuned for more accurate results on specific document types.
        table_detector = visual.ImageTableDetector.pretrained(
            "general_model_table_detection_v2", "en", "clinical/ocr"
        )
        table_detector.setInputCol("image")
        table_detector.setOutputCol("region")

        # Draw the detected table regions onto the page
        draw_regions = visual.ImageDrawRegions()
        draw_regions.setInputCol("image")
        draw_regions.setInputRegionsCol("region")
        draw_regions.setOutputCol("image_with_regions")
        draw_regions.setRectColor(visual.Color.red)

        # Extract table regions to separate images
        splitter = visual.ImageSplitRegions()
        splitter.setInputCol("image")
        splitter.setInputRegionsCol("region")
        splitter.setOutputCol("table_image")
        splitter.setDropCols("image")

        # Detect cells on the table image
        cell_detector = visual.ImageTableCellDetector()
        cell_detector.setInputCol("table_image")
        cell_detector.setOutputCol("cells")
        cell_detector.setAlgoType("morphops")

        # Extract text from the detected cells
        table_recognition = visual.ImageCellsToTextTable()
        table_recognition.setInputCol("table_image")
        table_recognition.setCellsCol("cells")
        table_recognition.setMargin(3)
        table_recognition.setStrip(True)
        table_recognition.setOutputCol("table")

        pipeline = nlp.PipelineModel(
            stages=[
                pdf_to_image,
                table_detector,
                draw_regions,
                splitter,
                cell_detector,
                table_recognition,
            ]
        )

        import pkg_resources

        pdf_example = pkg_resources.resource_filename(
            "sparkocr", "resources/ocr/pdfs/tabular-pdf/data.pdf"
        )
        pdf_example_df = self.spark.read.format("binaryFile").load(pdf_example).cache()
        pipeline.transform(pdf_example_df).show()
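
As a usage follow-up, the recognized tables land in the table column configured via setOutputCol("table") above. Continuing from the objects defined in the test:

# Show only the recognized table content instead of every intermediate column;
# "table" matches table_recognition.setOutputCol("table") above.
result = pipeline.transform(pdf_example_df)
result.select("table").show(truncate=False)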
