diff --git a/johnsnowlabs/auto_install/health_checks/report.py b/johnsnowlabs/auto_install/health_checks/report.py
index 69cb81a5d2..20f0bc7926 100644
--- a/johnsnowlabs/auto_install/health_checks/report.py
+++ b/johnsnowlabs/auto_install/health_checks/report.py
@@ -7,7 +7,8 @@
 from johnsnowlabs.auto_install.softwares import Software
 from johnsnowlabs.py_models.jsl_secrets import LicenseInfos
 from johnsnowlabs.utils.enums import ProductName
-from johnsnowlabs.utils.my_jsl_api import get_access_key_from_browser, get_user_licenses
+from johnsnowlabs.utils.my_jsl_api import (get_access_key_from_browser,
+                                           get_user_licenses)
 
 
 def check_health(check_install=True):
@@ -49,6 +50,7 @@ def check_health(check_install=True):
         if health_check:
             health_check[product] = product.health_check()
+    return install_status
 
 
 def list_remote_licenses():
     access_token = get_access_key_from_browser()
diff --git a/tests/installations/test_auto_install.py b/tests/installations/test_auto_install.py
new file mode 100644
index 0000000000..ad185ed6bf
--- /dev/null
+++ b/tests/installations/test_auto_install.py
@@ -0,0 +1,44 @@
+import os
+import shutil
+import unittest
+
+from johnsnowlabs import nlp, settings
+from johnsnowlabs.auto_install.softwares import (Software, SparkHcSoftware,
+                                                 SparkNlpSoftware,
+                                                 SparkOcrSoftware)
+from johnsnowlabs.utils.enums import ProductName
+from johnsnowlabs.utils.venv_utils import VenvWrapper
+
+
+class AutoInstallationTestCases(unittest.TestCase):
+    def setUp(self) -> None:
+        shutil.rmtree(settings.root_dir, ignore_errors=True)
+        import pip
+        for product in ProductName:
+            software = Software.for_name(product)
+            if software and software.pypi_name:
+                pip.main(["uninstall", "-y", software.pypi_name])
+
+    def test_only_spark_nlp_should_be_installed_if_secrets_are_empty(self):
+        nlp.install(browser_login=False)
+        installed_products = nlp.check_health()
+
+        self.assertTrue(installed_products[SparkNlpSoftware])
+        self.assertFalse(installed_products[SparkHcSoftware])
+        self.assertFalse(installed_products[SparkOcrSoftware])
+
+    def test_spark_hc_is_installed_if_licensed_provided(self):
+        nlp.install(med_license=os.environ.get("VALID_LICENSE"))
+        installed_products = nlp.check_health()
+
+        self.assertTrue(installed_products[SparkNlpSoftware])
+        self.assertTrue(installed_products[SparkHcSoftware])
+        self.assertFalse(installed_products[SparkOcrSoftware])
+
+
+    def test_spark_ocr_is_installed_if_visual_is_true(self):
+        nlp.install(med_license=os.environ.get("VALID_LICENSE"), visual=True)
+        installed_products = nlp.check_health()
+        self.assertTrue(installed_products[SparkNlpSoftware])
+        self.assertTrue(installed_products[SparkHcSoftware])
+        self.assertTrue(installed_products[SparkOcrSoftware])
diff --git a/tests/sessions/cross_libs.py b/tests/sessions/cross_libs.py
new file mode 100644
index 0000000000..6ea7301f57
--- /dev/null
+++ b/tests/sessions/cross_libs.py
@@ -0,0 +1,44 @@
+import os
+import sys
+import unittest
+
+from johnsnowlabs import nlp
+from johnsnowlabs.auto_install.softwares import (SparkHcSoftware,
+                                                 SparkNlpSoftware,
+                                                 SparkOcrSoftware)
+from tests.utils import clear_installed_jsl_installation, get_cross_lib_pipe
+
+os.environ["PYSPARK_PYTHON"] = sys.executable
+os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable
+
+def setUpModule():
+    nlp.install(browser_login=False, spark_nlp=True, nlp=True, visual=True,
+                med_license=os.environ.get("VALID_LICENSE"), ocr_license=os.environ.get("VALID_LICENSE"),
+                aws_access_key="",
+                aws_key_id=""
+                )
+
+
+def tearDownModule():
+    clear_installed_jsl_installation()
+
+
+class InstallationTestCase(unittest.TestCase):
+    def test_all_libs_are_installed(self):
+        installed_products = nlp.check_health()
+        self.assertTrue(installed_products[SparkNlpSoftware])
+        self.assertTrue(installed_products[SparkHcSoftware])
+        self.assertTrue(installed_products[SparkOcrSoftware])
+
+class SparkSessionTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.spark = nlp.start(visual=True)
+
+    def test_simple_cross_library_session(self):
+        import pkg_resources
+        doc_example = pkg_resources.resource_filename(
+            "sparkocr", "resources/ocr/docs/doc2.docx"
+        )
+        df = self.spark.read.format("binaryFile").load(doc_example).cache()
+        get_cross_lib_pipe().fit(df).transform(df).show()
diff --git a/tests/sessions/healthcare_lib.py b/tests/sessions/healthcare_lib.py
new file mode 100644
index 0000000000..3ae6253efc
--- /dev/null
+++ b/tests/sessions/healthcare_lib.py
@@ -0,0 +1,61 @@
+import os
+import sys
+import unittest
+
+from johnsnowlabs import medical, nlp
+from johnsnowlabs.auto_install.softwares import (SparkHcSoftware,
+                                                 SparkNlpSoftware,
+                                                 SparkOcrSoftware)
+from tests.utils import (clear_installed_jsl_installation,
+                         get_finance_pipeline, get_legal_pipeline)
+
+os.environ["PYSPARK_PYTHON"] = sys.executable
+os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable
+
+def setUpModule():
+    nlp.install(browser_login=False, spark_nlp=True, nlp=True, visual=False,
+                med_license=os.environ.get("VALID_LICENSE"),
+                aws_access_key="",
+                aws_key_id=""
+                )
+
+
+def tearDownModule():
+    clear_installed_jsl_installation()
+
+
+class InstallationTestCase(unittest.TestCase):
+    def test_spark_nlp_jsl_is_installed(self):
+        installed_products = nlp.check_health()
+        self.assertTrue(installed_products[SparkNlpSoftware])
+        self.assertTrue(installed_products[SparkHcSoftware])
+        self.assertFalse(installed_products[SparkOcrSoftware])
+
+class SparkSessionTestCase(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.spark = nlp.start()
+
+    def test_healthcare_session(self):
+        print("Test Healthcare session ...")
+        d = nlp.DocumentAssembler().setInputCol("text").setOutputCol("doc")
+        t = nlp.Tokenizer().setInputCols("doc").setOutputCol("tok")
+        c = (
+            medical.BertForTokenClassification()
+            .pretrained()
+            .setInputCols(["tok", "doc"])
+            .setOutputCol("class")
+        )
+        p = nlp.Pipeline(stages=[d, t, c])
+        p = nlp.to_nlu_pipe(p)
+        print(p.predict("Hello from John Snow Labs"))
+
+    def test_finance_session(self):
+        print("Testing Finance Session ...")
+        nlp.Pipeline(get_finance_pipeline()).fullAnnotate("unit")
+
+
+    def test_legal_session(self):
+        print("Testing Legal Session ...")
+        nlp.Pipeline(get_legal_pipeline()).fullAnnotate("Shwrm")
diff --git a/tests/sessions/spark_nlp_lib.py b/tests/sessions/spark_nlp_lib.py
new file mode 100644
index 0000000000..8439f9a309
--- /dev/null
+++ b/tests/sessions/spark_nlp_lib.py
@@ -0,0 +1,64 @@
+import os
+import sys
+import unittest
+
+from johnsnowlabs import nlp
+from johnsnowlabs.auto_install.softwares import (SparkHcSoftware,
+                                                 SparkNlpSoftware,
+                                                 SparkOcrSoftware)
+from tests.utils import clear_installed_jsl_installation
+
+os.environ["PYSPARK_PYTHON"] = sys.executable
+os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable
+
+
+
+def setUpModule():
+    nlp.install(browser_login=False, spark_nlp=True, nlp=False, visual=False)
+
+
+def tearDownModule():
+    clear_installed_jsl_installation()
+
+
+class InstallationTestCase(unittest.TestCase):
+    def test_only_spark_nlp_should_be_installed_if_secrets_are_empty(self):
+
+        installed_products = nlp.check_health()
+
+        self.assertTrue(installed_products[SparkNlpSoftware])
+        self.assertFalse(installed_products[SparkHcSoftware])
+        self.assertFalse(installed_products[SparkOcrSoftware])
+
+class SparkSessionTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.spark = nlp.start()
+
+    def test_sparknlp_session(self):
+        print("Start test_spark_nlp_session")
+        d = nlp.DocumentAssembler().setInputCol("text").setOutputCol("doc")
+        t = nlp.Tokenizer().setInputCols("doc").setOutputCol("tok")
+        c = (
+            nlp.DeBertaForTokenClassification()
+            .setInputCols(["tok", "doc"])
+            .setOutputCol("class")
+        )
+        p = nlp.Pipeline(stages=[d, t])
+        p = nlp.to_nlu_pipe(p)
+        print(p.predict("Hello World"))
+
+    def test_sparknlp_gpu_session(self):
+        print("Start test_spark_nlp_gpu_session")
+        self.spark = nlp.start(hardware_target="gpu")
+        d = nlp.DocumentAssembler().setInputCol("text").setOutputCol("doc")
+        t = nlp.Tokenizer().setInputCols("doc").setOutputCol("tok")
+        c = (
+            nlp.DeBertaForTokenClassification()
+            .setInputCols(["tok", "doc"])
+            .setOutputCol("class")
+        )
+        p = nlp.Pipeline(stages=[d, t])
+        p = nlp.to_nlu_pipe(p)
+        print(p.predict("Hello from John Snow Labs"))
+
diff --git a/tests/sessions/visual_lib.py b/tests/sessions/visual_lib.py
new file mode 100644
index 0000000000..9f749dadc1
--- /dev/null
+++ b/tests/sessions/visual_lib.py
@@ -0,0 +1,98 @@
+import os
+import sys
+import unittest
+
+from johnsnowlabs import nlp, visual
+from johnsnowlabs.auto_install.softwares import (SparkHcSoftware,
+                                                 SparkNlpSoftware,
+                                                 SparkOcrSoftware)
+from tests.utils import (clear_installed_jsl_installation,
+                         get_finance_pipeline, get_legal_pipeline)
+
+os.environ["PYSPARK_PYTHON"] = sys.executable
+os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable
+
+def setUpModule():
+    nlp.install(browser_login=False, spark_nlp=True, nlp=False, visual=True,
+                ocr_license=os.environ.get("VALID_LICENSE"),
+                aws_key_id="",
+                aws_access_key=""
+                )
+
+
+def tearDownModule():
+    clear_installed_jsl_installation()
+
+
+class InstallationTestCase(unittest.TestCase):
+    def test_spark_ocr_is_installed(self):
+        installed_products = nlp.check_health()
+        self.assertTrue(installed_products[SparkNlpSoftware])
+        self.assertFalse(installed_products[SparkHcSoftware])
+        self.assertTrue(installed_products[SparkOcrSoftware])
+
+class SparkSessionTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.spark = nlp.start(visual=True)
+
+    def test_ocr_session(self):
+        print("Test OCR session ...")
+        pdf_to_image = visual.PdfToImage()
+        pdf_to_image.setImageType(visual.ImageType.TYPE_3BYTE_BGR)
+
+        # Detect tables on the page using a pretrained model
+        # It can be fine-tuned to get more accurate results on more specific documents
+        table_detector = visual.ImageTableDetector.pretrained(
+            "general_model_table_detection_v2", "en", "clinical/ocr"
+        )
+        table_detector.setInputCol("image")
+        table_detector.setOutputCol("region")
+
+        # Draw the detected table regions on the page
+        draw_regions = visual.ImageDrawRegions()
+        draw_regions.setInputCol("image")
+        draw_regions.setInputRegionsCol("region")
+        draw_regions.setOutputCol("image_with_regions")
+        draw_regions.setRectColor(visual.Color.red)
+
+        # Extract table regions to separate images
+        splitter = visual.ImageSplitRegions()
+        splitter.setInputCol("image")
+        splitter.setInputRegionsCol("region")
+        splitter.setOutputCol("table_image")
+        splitter.setDropCols("image")
+
+        # Detect cells on the table image
+        cell_detector = visual.ImageTableCellDetector()
+        cell_detector.setInputCol("table_image")
+        cell_detector.setOutputCol("cells")
+        cell_detector.setAlgoType("morphops")
+
+        # Extract text from the detected cells
+        table_recognition = visual.ImageCellsToTextTable()
+        table_recognition.setInputCol("table_image")
+        table_recognition.setCellsCol("cells")
+        table_recognition.setMargin(3)
+        table_recognition.setStrip(True)
+        table_recognition.setOutputCol("table")
+
+        pipeline = nlp.PipelineModel(
+            stages=[
+                pdf_to_image,
+                table_detector,
+                draw_regions,
+                splitter,
+                cell_detector,
+                table_recognition,
+            ]
+        )
+
+        import pkg_resources
+
+        pdf_example = pkg_resources.resource_filename(
+            "sparkocr", "resources/ocr/pdfs/tabular-pdf/data.pdf"
+        )
+        pdf_example_df = self.spark.read.format("binaryFile").load(pdf_example).cache()
+        pipeline.transform(pdf_example_df).show()
+
diff --git a/tests/spark_session.py b/tests/spark_session.py
deleted file mode 100644
index 44463332e9..0000000000
--- a/tests/spark_session.py
+++ /dev/null
@@ -1,267 +0,0 @@
-import sys
-
-from johnsnowlabs import *
-import unittest
-import pkg_resources
-
-
-import os
-
-os.environ["PYSPARK_PYTHON"] = sys.executable
-os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable
-
-
-# finance.ClassifierDLApproach()
-class ImportTestCase(unittest.TestCase):
-    def test_sparknlp_session(self):
-        nlp.start()
-        d = nlp.DocumentAssembler().setInputCol("text").setOutputCol("doc")
-        t = nlp.Tokenizer().setInputCols("doc").setOutputCol("tok")
-        c = (
-            nlp.DeBertaForTokenClassification()
-            .setInputCols(["tok", "doc"])
-            .setOutputCol("class")
-        )
-        p = nlp.Pipeline(stages=[d, t])
-        p = nlp.to_nlu_pipe(p)
-        print(p.predict("Hello World"))
-
-    def test_sparknlp_gpu_session(self):
-        nlp.start(hardware_target="gpu")
-        d = nlp.DocumentAssembler().setInputCol("text").setOutputCol("doc")
-        t = nlp.Tokenizer().setInputCols("doc").setOutputCol("tok")
-        c = (
-            nlp.DeBertaForTokenClassification()
-            .setInputCols(["tok", "doc"])
-            .setOutputCol("class")
-        )
-        p = nlp.Pipeline(stages=[d, t])
-        p = nlp.to_nlu_pipe(p)
-        print(p.predict("Hello form John SNow labs"))
-
-    def test_sparknlp_m1_session(self):
-        import os
-
-        nlp.start(hardware_target="m1")
-        d = nlp.DocumentAssembler().setInputCol("text").setOutputCol("doc")
-        t = nlp.Tokenizer().setInputCols("doc").setOutputCol("tok")
-        c = (
-            nlp.DeBertaForTokenClassification()
-            .pretrained()
-            .setInputCols(["tok", "doc"])
-            .setOutputCol("class")
-        )
-        nlp.UniversalSentenceEncoder.pretrained()
-        p = nlp.Pipeline(stages=[d, t])
-        p = nlp.to_nlu_pipe(p)
-        print(p.predict("Hello form John SNow labs"))
-
-    def test_healthcare_session(self):
-        nlp.start()
-        d = nlp.DocumentAssembler().setInputCol("text").setOutputCol("doc")
-        t = nlp.Tokenizer().setInputCols("doc").setOutputCol("tok")
-        c = (
-            medical.BertForTokenClassification()
-            .pretrained()
-            .setInputCols(["tok", "doc"])
-            .setOutputCol("class")
-        )
-        p = nlp.Pipeline(stages=[d, t, c])
-        p = nlp.to_nlu_pipe(p)
-        print(p.predict("Hello form John SNow labs"))
-
-    def test_ocr_session(self):
-        # Convert pdf to image
-        p = "/home/ckl/old_home/ckl/Documents/freelance/johnsnowlabs_lib/tmp/licenses/4_1_LATEST_OCR_HC_BCK.json"
-        spark = nlp.start(visual=True)
-
-        pdf_to_image = visual.PdfToImage()
-        pdf_to_image.setImageType(visual.ImageType.TYPE_3BYTE_BGR)
-
-        # Detect tables on the page using pretrained model
-        # It can be finetuned for have more accurate results for more specific documents
-        table_detector = visual.ImageTableDetector.pretrained(
-            "general_model_table_detection_v2", "en", "clinical/ocr"
-        )
-        table_detector.setInputCol("image")
-        table_detector.setOutputCol("region")
-
-        # Draw detected region's with table to the page
-        draw_regions = visual.ImageDrawRegions()
-        draw_regions.setInputCol("image")
-        draw_regions.setInputRegionsCol("region")
-        draw_regions.setOutputCol("image_with_regions")
-        draw_regions.setRectColor(visual.Color.red)
-
-        # Extract table regions to separate images
-        splitter = visual.ImageSplitRegions()
-        splitter.setInputCol("image")
-        splitter.setInputRegionsCol("region")
-        splitter.setOutputCol("table_image")
-        splitter.setDropCols("image")
-
-        # Detect cells on the table image
-        cell_detector = visual.ImageTableCellDetector()
-        cell_detector.setInputCol("table_image")
-        cell_detector.setOutputCol("cells")
-        cell_detector.setAlgoType("morphops")
-
-        # Extract text from the detected cells
-        table_recognition = visual.ImageCellsToTextTable()
-        table_recognition.setInputCol("table_image")
-        table_recognition.setCellsCol("cells")
-        table_recognition.setMargin(3)
-        table_recognition.setStrip(True)
-        table_recognition.setOutputCol("table")
-
-        pipeline = nlp.PipelineModel(
-            stages=[
-                pdf_to_image,
-                table_detector,
-                draw_regions,
-                splitter,
-                cell_detector,
-                table_recognition,
-            ]
-        )
-
-        import pkg_resources
-
-        pdf_example = pkg_resources.resource_filename(
-            "sparkocr", "resources/ocr/pdfs/tabular-pdf/data.pdf"
-        )
-        pdf_example_df = spark.read.format("binaryFile").load(pdf_example).cache()
-        pipeline.transform(pdf_example_df).show()
-
-    def test_legal_session(self):
-        nlp.start()
-
-        nlp.Pipeline(self.get_legal_pipe()).fullAnnotate("Shwrm")
-
-    def test_finance_session(self):
-        nlp.start()
-        nlp.Pipeline(self.get_finance_pipe()).fullAnnotate("unit")
-
-    @staticmethod
-    def get_finance_pipe() -> nlp.PipelineModel:
-        documentAssembler = (
-            nlp.DocumentAssembler().setInputCol("text").setOutputCol("ner_chunk")
-        )
-
-        embeddings = (
-            nlp.UniversalSentenceEncoder.pretrained("tfhub_use", "en")
-            .setInputCols("ner_chunk")
-            .setOutputCol("sentence_embeddings")
-        )
-
-        resolver = (
-            finance.SentenceEntityResolverModel.pretrained(
-                "finel_tickers2names", "en", "finance/models"
-            )
-            .setInputCols(["ner_chunk", "sentence_embeddings"])
-            .setOutputCol("name")
-            .setDistanceFunction("EUCLIDEAN")
-        )
-
-        return nlp.PipelineModel(stages=[documentAssembler, embeddings, resolver])
-
-    @staticmethod
-    def get_legal_pipe() -> nlp.PipelineModel:
-        z = legal.ZeroShotRelationExtractionModel.pretrained(
-            "finre_zero_shot", "en", "finance/models"
-        )
-        documentAssembler = (
-            nlp.DocumentAssembler().setInputCol("text").setOutputCol("ner_chunk")
-        )
-
-        embeddings = (
-            nlp.UniversalSentenceEncoder.pretrained("tfhub_use", "en")
-            .setInputCols("ner_chunk")
-            .setOutputCol("sentence_embeddings")
-        )
-
-        resolver = (
-            legal.SentenceEntityResolverModel.pretrained(
-                "legel_crunchbase_companynames", "en", "legal/models"
-            )
-            .setInputCols(["ner_chunk", "sentence_embeddings"])
-            .setOutputCol("name")
-            .setDistanceFunction("EUCLIDEAN")
-        )
-
-        return nlp.PipelineModel(stages=[documentAssembler, embeddings, resolver])
-
-    @staticmethod
-    def get_cross_lib_pipe() -> nlp.PipelineModel:
-        # Returns pipe with one anno per lib
-        # TODO add some fancy OCR DL models?
-        doc2text = visual.DocToText().setInputCol("content").setOutputCol("text")
-        d = nlp.DocumentAssembler().setInputCol("text").setOutputCol("doc")
-        t = nlp.Tokenizer().setInputCols("doc").setOutputCol("tok")
-        # One classifier per NLP lib
-
-        c1 = (
-            medical.BertForTokenClassifier()
-            .pretrained()
-            .setInputCols(["tok", "doc"])
-            .setOutputCol("medical")
-        )
-
-        c2 = (
-            nlp.DeBertaForTokenClassification()
-            .setInputCols(["tok", "doc"])
-            .setOutputCol("opene_source")
-        )
-
-        c3 = (
-            finance.BertForSequenceClassification.pretrained(
-                "finclf_augmented_esg", "en", "finance/models"
-            )
-            .setInputCols(["tok", "doc"])
-            .setOutputCol("finance")
-        )
-
-        c4 = (
-            legal.BertForSequenceClassification.pretrained(
-                "legclf_bert_judgements_agent", "en", "legal/models"
-            )
-            .setInputCols(["tok", "doc"])
-            .setOutputCol("legal")
-        )
-
-        return nlp.Pipeline(stages=[doc2text, d, t, c1, c2, c3, c4])
-
-    def test_simple_cross_lib(self):
-        spark = nlp.start()
-        doc_example = pkg_resources.resource_filename(
-            "sparkocr", "resources/ocr/docs/doc2.docx"
-        )
-        df = spark.read.format("binaryFile").load(doc_example).cache()
-        self.get_cross_lib_pipe().fit(df).transform(df).show()
-
-    def test_simple_cross_lib_gpu(self):
-        spark = nlp.start(hardware_target="gpu")
-        doc_example = pkg_resources.resource_filename(
-            "sparkocr", "resources/ocr/docs/doc2.docx"
-        )
-        df = spark.read.format("binaryFile").load(doc_example).cache()
-        self.get_cross_lib_pipe().fit(df).transform(df).show()
-
-    def test_cross_engine_session(self):
-        import itertools
-
-        # Test every combination of jars with CPU jars
-        for c in range(3):
-            p = itertools.combinations(["nlp-cpu", "ocr", "hc"], c)
-            for pp in p:
-                print(pp)
-
-        # Test every combination of jars with GPU jars
-        for c in range(3):
-            p = itertools.combinations(["nlp-gpu", "ocr", "hc"], c)
-            for pp in p:
-                print(pp)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 0000000000..1e78e80b32
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,98 @@
+import shutil
+
+import pip
+
+from johnsnowlabs import finance, legal, medical, nlp, settings, visual
+
+
+def clear_installed_jsl_installation():
+    shutil.rmtree(settings.root_dir, ignore_errors=True)
+    pip.main(["uninstall", "-y", "johnsnowlabs"])
+    pip.main(["uninstall", "-y", "nlu"])
+    pip.main(["uninstall", "-y", "spark-nlp"])
+    pip.main(["uninstall", "-y", "spark-nlp-jsl"])
+    pip.main(["uninstall", "-y", "spark-ocr"])
+
+def get_finance_pipeline():
+    documentAssembler = (
+        nlp.DocumentAssembler().setInputCol("text").setOutputCol("ner_chunk")
+    )
+
+    embeddings = (
+        nlp.UniversalSentenceEncoder.pretrained("tfhub_use", "en")
+        .setInputCols("ner_chunk")
+        .setOutputCol("sentence_embeddings")
+    )
+
+    resolver = (
+        finance.SentenceEntityResolverModel.pretrained(
+            "finel_tickers2names", "en", "finance/models"
+        )
+        .setInputCols(["ner_chunk", "sentence_embeddings"])
+        .setOutputCol("name")
+        .setDistanceFunction("EUCLIDEAN")
+    )
+
+    return nlp.PipelineModel(stages=[documentAssembler, embeddings, resolver])
+
+def get_legal_pipeline() -> nlp.PipelineModel:
+
+    documentAssembler = (
+        nlp.DocumentAssembler().setInputCol("text").setOutputCol("ner_chunk")
+    )
+
+    embeddings = (
+        nlp.UniversalSentenceEncoder.pretrained("tfhub_use", "en")
+        .setInputCols("ner_chunk")
+        .setOutputCol("sentence_embeddings")
+    )
+
+    resolver = (
+        legal.SentenceEntityResolverModel.pretrained(
+            "legel_crunchbase_companynames", "en", "legal/models"
+        )
"sentence_embeddings"]) + .setOutputCol("name") + .setDistanceFunction("EUCLIDEAN") + ) + + return nlp.PipelineModel(stages=[documentAssembler, embeddings, resolver]) + +def get_cross_lib_pipe() -> nlp.PipelineModel: + # Returns pipe with one anno per lib + # TODO add some fancy OCR DL models? + doc2text = visual.DocToText().setInputCol("content").setOutputCol("text") + d = nlp.DocumentAssembler().setInputCol("text").setOutputCol("doc") + t = nlp.Tokenizer().setInputCols("doc").setOutputCol("tok") + # One classifier per NLP lib + + c1 = ( + medical.BertForTokenClassifier() + .pretrained() + .setInputCols(["tok", "doc"]) + .setOutputCol("medical") + ) + + c2 = ( + nlp.DeBertaForTokenClassification() + .setInputCols(["tok", "doc"]) + .setOutputCol("opene_source") + ) + + c3 = ( + finance.BertForSequenceClassification.pretrained( + "finclf_augmented_esg", "en", "finance/models" + ) + .setInputCols(["tok", "doc"]) + .setOutputCol("finance") + ) + + c4 = ( + legal.BertForSequenceClassification.pretrained( + "legclf_bert_judgements_agent", "en", "legal/models" + ) + .setInputCols(["tok", "doc"]) + .setOutputCol("legal") + ) + + return nlp.Pipeline(stages=[doc2text, d, t, c1, c2, c3, c4])