FEATURE MYJSL-370: Fixed tests on johnsnowlabs repo #416

Draft · wants to merge 1 commit into base: feature/KshitizGIT-InstallCompatibleSecrets-MYJSL-369
4 changes: 3 additions & 1 deletion johnsnowlabs/auto_install/health_checks/report.py
@@ -7,7 +7,8 @@
 from johnsnowlabs.auto_install.softwares import Software
 from johnsnowlabs.py_models.jsl_secrets import LicenseInfos
 from johnsnowlabs.utils.enums import ProductName
-from johnsnowlabs.utils.my_jsl_api import get_access_key_from_browser, get_user_licenses
+from johnsnowlabs.utils.my_jsl_api import (get_access_key_from_browser,
+                                           get_user_licenses)


 def check_health(check_install=True):
@@ -49,6 +50,7 @@ def check_health(check_install=True):
         if health_check:
             health_check[product] = product.health_check()

+    return install_status

 def list_remote_licenses():
     access_token = get_access_key_from_browser()
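
The behavioral fix in this file is that check_health() now returns its install_status mapping instead of discarding it. A minimal sketch of consuming that return value, assuming, as the new tests below do, that it is a dict keyed by Software subclasses:

from johnsnowlabs import nlp
from johnsnowlabs.auto_install.softwares import SparkHcSoftware, SparkNlpSoftware

# check_health() returns the install-status mapping after this PR; keying by
# Software subclasses follows the assertions in the tests added below.
installed = nlp.check_health()
if not installed[SparkNlpSoftware]:
    raise RuntimeError("Spark NLP is missing from this environment")
print("Healthcare NLP installed:", installed[SparkHcSoftware])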
44 changes: 44 additions & 0 deletions tests/installations/test_auto_install.py
@@ -0,0 +1,44 @@
import os
import shutil
import unittest

from johnsnowlabs import nlp, settings
from johnsnowlabs.auto_install.softwares import (Software, SparkHcSoftware,
                                                 SparkNlpSoftware,
                                                 SparkOcrSoftware)
from johnsnowlabs.utils.enums import ProductName
from johnsnowlabs.utils.venv_utils import VenvWrapper


class AutoInstallationTestCases(unittest.TestCase):
    def setUp(self) -> None:
        shutil.rmtree(settings.root_dir, ignore_errors=True)
        import pip

        for product in ProductName:
            software = Software.for_name(product)
            if software and software.pypi_name:
                pip.main(["uninstall", "-y", software.pypi_name])

    def test_only_spark_nlp_should_be_installed_if_secrets_are_empty(self):
        nlp.install(browser_login=False)
        installed_products = nlp.check_health()

        self.assertTrue(installed_products[SparkNlpSoftware])
        self.assertFalse(installed_products[SparkHcSoftware])
        self.assertFalse(installed_products[SparkOcrSoftware])

    def test_spark_hc_is_installed_if_license_provided(self):
        nlp.install(med_license=os.environ.get("VALID_LICENSE"))
        installed_products = nlp.check_health()

        self.assertTrue(installed_products[SparkNlpSoftware])
        self.assertTrue(installed_products[SparkHcSoftware])
        self.assertFalse(installed_products[SparkOcrSoftware])

    def test_spark_ocr_is_installed_if_visual_is_true(self):
        nlp.install(med_license=os.environ.get("VALID_LICENSE"), visual=True)
        installed_products = nlp.check_health()

        self.assertTrue(installed_products[SparkNlpSoftware])
        self.assertTrue(installed_products[SparkHcSoftware])
        self.assertTrue(installed_products[SparkOcrSoftware])
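
A minimal sketch for running this new suite locally, assuming the repo root as working directory and a real license exported as VALID_LICENSE (both assumptions, not part of the diff):

import unittest

# Discover the installation tests added in this PR and run them verbosely;
# unittest's default pattern test*.py matches test_auto_install.py.
suite = unittest.defaultTestLoader.discover("tests/installations")
unittest.TextTestRunner(verbosity=2).run(suite)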
44 changes: 44 additions & 0 deletions tests/sessions/cross_libs.py
@@ -0,0 +1,44 @@
import os
import sys
import unittest

from johnsnowlabs import nlp
from johnsnowlabs.auto_install.softwares import (SparkHcSoftware,
                                                 SparkNlpSoftware,
                                                 SparkOcrSoftware)
from tests.utils import clear_installed_jsl_installation, get_cross_lib_pipe

os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable


def setUpModule():
    nlp.install(browser_login=False, spark_nlp=True, nlp=True, visual=True,
                med_license=os.environ.get("VALID_LICENSE"),
                ocr_license=os.environ.get("VALID_LICENSE"),
                aws_access_key="",
                aws_key_id="")


def tearDownModule():
    clear_installed_jsl_installation()


class InstallationTestCase(unittest.TestCase):
    def test_all_libs_are_installed(self):
        installed_products = nlp.check_health()
        self.assertTrue(installed_products[SparkNlpSoftware])
        self.assertTrue(installed_products[SparkHcSoftware])
        self.assertTrue(installed_products[SparkOcrSoftware])


class SparkSessionTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.spark = nlp.start(visual=True)

    def test_simple_cross_library_session(self):
        import pkg_resources

        doc_example = pkg_resources.resource_filename(
            "sparkocr", "resources/ocr/docs/doc2.docx"
        )
        df = self.spark.read.format("binaryFile").load(doc_example).cache()
        get_cross_lib_pipe().fit(df).transform(df).show()
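
clear_installed_jsl_installation and get_cross_lib_pipe live in tests/utils.py, which is not part of this diff. A hypothetical sketch of what the cleanup helper likely does, mirroring AutoInstallationTestCases.setUp above (the real implementation may differ):

import shutil

import pip
from johnsnowlabs import settings
from johnsnowlabs.auto_install.softwares import Software
from johnsnowlabs.utils.enums import ProductName


def clear_installed_jsl_installation():
    # Hypothetical reconstruction: wipe the johnsnowlabs home directory and
    # uninstall every John Snow Labs pypi package, as setUp does above.
    shutil.rmtree(settings.root_dir, ignore_errors=True)
    for product in ProductName:
        software = Software.for_name(product)
        if software and software.pypi_name:
            pip.main(["uninstall", "-y", software.pypi_name])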
61 changes: 61 additions & 0 deletions tests/sessions/healthcare_lib.py
@@ -0,0 +1,61 @@
import os
import sys
import unittest

from johnsnowlabs import medical, nlp
from johnsnowlabs.auto_install.softwares import (SparkHcSoftware,
                                                 SparkNlpSoftware,
                                                 SparkOcrSoftware)
from tests.utils import (clear_installed_jsl_installation,
                         get_finance_pipeline, get_legal_pipeline)

os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable


def setUpModule():
    nlp.install(browser_login=False, spark_nlp=True, nlp=True, visual=False,
                med_license=os.environ.get("VALID_LICENSE"),
                aws_access_key="",
                aws_key_id="")


def tearDownModule():
    clear_installed_jsl_installation()


class InstallationTestCase(unittest.TestCase):
    def test_spark_nlp_jsl_is_installed(self):
        installed_products = nlp.check_health()
        self.assertTrue(installed_products[SparkNlpSoftware])
        self.assertTrue(installed_products[SparkHcSoftware])
        self.assertFalse(installed_products[SparkOcrSoftware])


class SparkSessionTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.spark = nlp.start()

    def test_healthcare_session(self):
        print("Testing Healthcare session ...")
        d = nlp.DocumentAssembler().setInputCol("text").setOutputCol("doc")
        t = nlp.Tokenizer().setInputCols("doc").setOutputCol("tok")
        c = (
            medical.BertForTokenClassification()
            .pretrained()
            .setInputCols(["tok", "doc"])
            .setOutputCol("class")
        )
        p = nlp.Pipeline(stages=[d, t, c])
        p = nlp.to_nlu_pipe(p)
        print(p.predict("Hello from John Snow Labs"))

    def test_finance_session(self):
        print("Testing Finance session ...")
        nlp.Pipeline(get_finance_pipeline()).fullAnnotate("unit")

    def test_legal_session(self):
        print("Testing Legal session ...")
        nlp.Pipeline(get_legal_pipeline()).fullAnnotate("Shwrm")
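
These session modules pass os.environ.get("VALID_LICENSE") straight into nlp.install, which silently becomes None when the variable is unset. A minimal sketch of guarding against that in setUpModule; the guard relies on standard unittest behavior and is not part of this PR:

import os
import unittest


def setUpModule():
    # A SkipTest raised in setUpModule marks every test in the module as
    # skipped, instead of failing later inside nlp.install with a None license.
    if not os.environ.get("VALID_LICENSE"):
        raise unittest.SkipTest("VALID_LICENSE not set")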
64 changes: 64 additions & 0 deletions tests/sessions/spark_nlp_lib.py
@@ -0,0 +1,64 @@
import os
import sys
import unittest

from johnsnowlabs import nlp
from johnsnowlabs.auto_install.softwares import (SparkHcSoftware,
                                                 SparkNlpSoftware,
                                                 SparkOcrSoftware)
from tests.utils import clear_installed_jsl_installation

os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable


def setUpModule():
    nlp.install(browser_login=False, spark_nlp=True, nlp=False, visual=False)


def tearDownModule():
    clear_installed_jsl_installation()


class InstallationTestCase(unittest.TestCase):
    def test_only_spark_nlp_should_be_installed_if_secrets_are_empty(self):
        installed_products = nlp.check_health()

        self.assertTrue(installed_products[SparkNlpSoftware])
        self.assertFalse(installed_products[SparkHcSoftware])
        self.assertFalse(installed_products[SparkOcrSoftware])


class SparkSessionTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.spark = nlp.start()

    def test_sparknlp_session(self):
        print("Start test_sparknlp_session")
        d = nlp.DocumentAssembler().setInputCol("text").setOutputCol("doc")
        t = nlp.Tokenizer().setInputCols("doc").setOutputCol("tok")
        c = (
            nlp.DeBertaForTokenClassification()
            .pretrained()
            .setInputCols(["tok", "doc"])
            .setOutputCol("class")
        )
        p = nlp.Pipeline(stages=[d, t, c])
        p = nlp.to_nlu_pipe(p)
        print(p.predict("Hello World"))

    def test_sparknlp_gpu_session(self):
        print("Start test_sparknlp_gpu_session")
        self.spark = nlp.start(hardware_target="gpu")
        d = nlp.DocumentAssembler().setInputCol("text").setOutputCol("doc")
        t = nlp.Tokenizer().setInputCols("doc").setOutputCol("tok")
        c = (
            nlp.DeBertaForTokenClassification()
            .pretrained()
            .setInputCols(["tok", "doc"])
            .setOutputCol("class")
        )
        p = nlp.Pipeline(stages=[d, t, c])
        p = nlp.to_nlu_pipe(p)
        print(p.predict("Hello from John Snow Labs"))
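
One caveat for the gpu test above: within a single Python process Spark usually hands back the already-running session, so the hardware_target="gpu" request may silently reuse the CPU session created in setUpClass. A minimal sketch of forcing a fresh session first (how nlp.start builds sessions internally is an assumption):

from pyspark.sql import SparkSession

# Stop any active session so the next nlp.start() call builds a new one;
# getActiveSession() is available from PySpark 3.0 onwards.
existing = SparkSession.getActiveSession()
if existing is not None:
    existing.stop()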

98 changes: 98 additions & 0 deletions tests/sessions/visual_lib.py
@@ -0,0 +1,98 @@
import os
import sys
import unittest

from johnsnowlabs import nlp, visual
from johnsnowlabs.auto_install.softwares import (SparkHcSoftware,
                                                 SparkNlpSoftware,
                                                 SparkOcrSoftware)
from tests.utils import (clear_installed_jsl_installation,
                         get_finance_pipeline, get_legal_pipeline)

os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable


def setUpModule():
    nlp.install(browser_login=False, spark_nlp=True, nlp=False, visual=True,
                ocr_license=os.environ.get("VALID_LICENSE"),
                aws_key_id="",
                aws_access_key="")


def tearDownModule():
    clear_installed_jsl_installation()


class InstallationTestCase(unittest.TestCase):
    def test_spark_ocr_is_installed(self):
        installed_products = nlp.check_health()
        self.assertTrue(installed_products[SparkNlpSoftware])
        self.assertFalse(installed_products[SparkHcSoftware])
        self.assertTrue(installed_products[SparkOcrSoftware])


class SparkSessionTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.spark = nlp.start(visual=True)

    def test_ocr_session(self):
        print("Testing OCR session ...")
        pdf_to_image = visual.PdfToImage()
        pdf_to_image.setImageType(visual.ImageType.TYPE_3BYTE_BGR)

        # Detect tables on the page using a pretrained model.
        # It can be fine-tuned for more accurate results on specific document types.
        table_detector = visual.ImageTableDetector.pretrained(
            "general_model_table_detection_v2", "en", "clinical/ocr"
        )
        table_detector.setInputCol("image")
        table_detector.setOutputCol("region")

        # Draw the detected table regions onto the page
        draw_regions = visual.ImageDrawRegions()
        draw_regions.setInputCol("image")
        draw_regions.setInputRegionsCol("region")
        draw_regions.setOutputCol("image_with_regions")
        draw_regions.setRectColor(visual.Color.red)

        # Extract table regions to separate images
        splitter = visual.ImageSplitRegions()
        splitter.setInputCol("image")
        splitter.setInputRegionsCol("region")
        splitter.setOutputCol("table_image")
        splitter.setDropCols("image")

        # Detect cells on the table image
        cell_detector = visual.ImageTableCellDetector()
        cell_detector.setInputCol("table_image")
        cell_detector.setOutputCol("cells")
        cell_detector.setAlgoType("morphops")

        # Extract text from the detected cells
        table_recognition = visual.ImageCellsToTextTable()
        table_recognition.setInputCol("table_image")
        table_recognition.setCellsCol("cells")
        table_recognition.setMargin(3)
        table_recognition.setStrip(True)
        table_recognition.setOutputCol("table")

        pipeline = nlp.PipelineModel(
            stages=[
                pdf_to_image,
                table_detector,
                draw_regions,
                splitter,
                cell_detector,
                table_recognition,
            ]
        )

        import pkg_resources

        pdf_example = pkg_resources.resource_filename(
            "sparkocr", "resources/ocr/pdfs/tabular-pdf/data.pdf"
        )
        pdf_example_df = self.spark.read.format("binaryFile").load(pdf_example).cache()
        pipeline.transform(pdf_example_df).show()
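
As a usage follow-up, the recognized tables land in the table column configured via setOutputCol("table") above. Continuing from the objects defined in the test:

# Show only the recognized table content instead of every intermediate column;
# "table" matches table_recognition.setOutputCol("table") above.
result = pipeline.transform(pdf_example_df)
result.select("table").show(truncate=False)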
