diff --git a/docs/en/licensed_annotator_entries/AssertionDL.md b/docs/en/licensed_annotator_entries/AssertionDL.md
index 8043eea2a3..26e1c7754a 100644
--- a/docs/en/licensed_annotator_entries/AssertionDL.md
+++ b/docs/en/licensed_annotator_entries/AssertionDL.md
@@ -33,27 +33,39 @@ ASSERTION
 {%- endcapture -%}
 
 {%- capture model_python_medical -%}
+from johnsnowlabs import *
+
 # Define pipeline stages to extract NER chunks first
-data = spark.createDataFrame([
-    ["Patient with severe fever and sore throat"],
-    ["Patient shows no stomach pain"],
-    ["She was maintained on an epidural and PCA for pain control."]]).toDF("text")
-documentAssembler = nlp.DocumentAssembler().setInputCol("text").setOutputCol("document")
-sentenceDetector = nlp.SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
-tokenizer = nlp.Tokenizer().setInputCols(["sentence"]).setOutputCol("token")
+documentAssembler = nlp.DocumentAssembler()\
+    .setInputCol("text")\
+    .setOutputCol("document")
+
+sentenceDetector = nlp.SentenceDetector()\
+    .setInputCols(["document"])\
+    .setOutputCol("sentence")
+
+tokenizer = nlp.Tokenizer()\
+    .setInputCols(["sentence"])\
+    .setOutputCol("token")
+
 embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \
+    .setInputCols(["sentence", "token"])\
     .setOutputCol("embeddings")
+
 nerModel = medical.NerModel.pretrained("ner_clinical", "en", "clinical/models") \
-    .setInputCols(["sentence", "token", "embeddings"]).setOutputCol("ner")
-nerConverter = nlp.NerConverter().setInputCols(["sentence", "token", "ner"]).setOutputCol("ner_chunk")
+    .setInputCols(["sentence", "token", "embeddings"])\
+    .setOutputCol("ner")
+
+nerConverter = nlp.NerConverter()\
+    .setInputCols(["sentence", "token", "ner"])\
+    .setOutputCol("ner_chunk")
 
 # Then a pretrained AssertionDLModel is used to extract the assertion status
 clinicalAssertion = medical.AssertionDLModel.pretrained("assertion_dl", "en", "clinical/models") \
     .setInputCols(["sentence", "ner_chunk", "embeddings"]) \
     .setOutputCol("assertion")
 
-assertionPipeline = Pipeline(stages=[
+assertionPipeline = nlp.Pipeline(stages=[
     documentAssembler,
     sentenceDetector,
     tokenizer,
@@ -63,25 +75,29 @@ assertionPipeline = Pipeline(stages=[
     clinicalAssertion
 ])
 
-assertionModel = assertionPipeline.fit(data)
+data = spark.createDataFrame([
+    ["Patient with severe fever and sore throat"],
+    ["Patient shows no stomach pain"],
+    ["She was maintained on an epidural and PCA for pain control."]]).toDF("text")
+
 # Show results
-result = assertionModel.transform(data)
-result.selectExpr("ner_chunk.result", "assertion.result").show(3, truncate=False)
+result = assertionPipeline.fit(data).transform(data)
+result.selectExpr("ner_chunk.result as chunk_result", "assertion.result as assertion_result").show(3, truncate=False)
+
 +--------------------------------+--------------------------------+
-|result                          |result                          |
+|chunk_result                    |assertion_result                |
 +--------------------------------+--------------------------------+
 |[severe fever, sore throat]     |[present, present]              |
 |[stomach pain]                  |[absent]                        |
 |[an epidural, PCA, pain control]|[present, present, hypothetical]|
 +--------------------------------+--------------------------------+
+
 {%- endcapture -%}
 
 {%- capture model_python_finance -%}
 from johnsnowlabs import *
 
-data = spark.createDataFrame([["Our competitors include the following by general category: legacy antivirus product providers, such as McAfee LLC and Broadcom Inc."]]).toDF("text")
-
 document_assembler = nlp.DocumentAssembler()\
.setInputCol("text")\ .setOutputCol("document") @@ -110,7 +126,7 @@ assertion = finance.AssertionDLModel.pretrained("finassertion_competitors", "en" .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ .setOutputCol("assertion") -pipeline = Pipeline(stages=[ +pipeline = nlp.Pipeline(stages=[ document_assembler, sentence_detector, tokenizer, @@ -120,15 +136,17 @@ pipeline = Pipeline(stages=[ assertion ]) -assertionModel = pipeline.fit(data) +data = spark.createDataFrame([["Our competitors include the following by general category: legacy antivirus product providers, such as McAfee LLC and Broadcom Inc."]]).toDF("text") + # Show results -result = assertionModel.transform(data) +result = pipeline.fit(data).transform(data) result.select(F.explode(F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata, result.assertion.result)).alias("cols"))\ .select(F.expr("cols['1']['sentence']").alias("sent_id"), F.expr("cols['0']").alias("chunk"), F.expr("cols['1']['entity']").alias("ner_label"), F.expr("cols['2']").alias("assertion")).show(truncate=False) + +-------+------------+---------+----------+ |sent_id|chunk |ner_label|assertion | +-------+------------+---------+----------+ @@ -139,9 +157,6 @@ result.select(F.explode(F.arrays_zip(result.ner_chunk.result, result.ner_chunk.m {%- capture model_python_legal -%} from johnsnowlabs import * - -data = spark.createDataFrame([["This is an Intellectual Property Agreement between Amazon Inc. and Atlantic Inc."]]).toDF("text") - document_assembler = nlp.DocumentAssembler()\ .setInputCol("text")\ .setOutputCol("document") @@ -175,8 +190,7 @@ assertion = legal.AssertionDLModel.pretrained("legassertion_time", "en", "legal/ .setInputCols(["sentence", "ner_chunk", "embeddings_ass"]) \ .setOutputCol("assertion") - -nlpPipeline = Pipeline(stages=[ +nlpPipeline = nlp.Pipeline(stages=[ document_assembler, sentence_detector, tokenizer, @@ -187,10 +201,11 @@ nlpPipeline = Pipeline(stages=[ assertion ]) -assertionModel = nlpPipeline.fit(data) +data = spark.createDataFrame([["This is an Intellectual Property Agreement between Amazon Inc. 
and Atlantic Inc."]]).toDF("text") + # Show results -result = assertionModel.transform(data) +result = nlpPipeline.fit(data).transform(data) result.select(F.explode(F.arrays_zip(result.ner_chunk.result, result.ner_chunk.begin, result.ner_chunk.end, @@ -201,6 +216,7 @@ result.select(F.explode(F.arrays_zip(result.ner_chunk.result, F.expr("cols['2']").alias("end"), F.expr("cols['3']['entity']").alias("ner_label"), F.expr("cols['4']").alias("assertion")).show(truncate=False) + +-------------------------------+-----+---+---------+---------+ |chunk |begin|end|ner_label|assertion| +-------------------------------+-----+---+---------+---------+ @@ -208,28 +224,41 @@ result.select(F.explode(F.arrays_zip(result.ner_chunk.result, |Amazon Inc |51 |60 |PARTY |PRESENT | |Atlantic Inc |67 |78 |PARTY |PRESENT | +-------------------------------+-----+---+---------+---------+ + {%- endcapture -%} {%- capture model_scala_medical -%} -from johnsnowlabs import * // Define pipeline stages to extract NER chunks first -val data = Seq( - "Patient with severe fever and sore throat", - "Patient shows no stomach pain", - "She was maintained on an epidural and PCA for pain control.").toDF("text") -val documentAssembler = new nlp.DocumentAssembler().setInputCol("text").setOutputCol("document") -val sentenceDetector = new nlp.SentenceDetector().setInputCols("document").setOutputCol("sentence") -val tokenizer = new nlp.Tokenizer().setInputCols("sentence").setOutputCol("token") -val embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models").setInputCols(Array("sentence", "token")).setOutputCol("embeddings") -val nerModel = medical.NerModel.pretrained("ner_clinical", "en", "clinical/models") - .setInputCols(Array("sentence", "token", "embeddings")).setOutputCol("ner") -val nerConverter = new nlp.NerConverter().setInputCols(Array("sentence", "token", "ner")).setOutputCol("ner_chunk") + +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetector = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val nerModel = MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val nerConverter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") // Then a pretrained AssertionDLModel is used to extract the assertion status -val clinicalAssertion = medical.AssertionDLModel.pretrained("assertion_dl", "en", "clinical/models") - .setInputCols(Array("sentence", "ner_chunk", "embeddings")) - .setOutputCol("assertion") +val clinicalAssertion = AssertionDLModel.pretrained("assertion_dl", "en", "clinical/models") + .setInputCols(Array("sentence", "ner_chunk", "embeddings")) + .setOutputCol("assertion") val assertionPipeline = new Pipeline().setStages(Array( documentAssembler, @@ -241,50 +270,53 @@ val assertionPipeline = new Pipeline().setStages(Array( clinicalAssertion )) -val assertionModel = assertionPipeline.fit(data) +val data = Seq( + "Patient with severe fever and sore throat", + "Patient shows no stomach pain", + "She was maintained on an epidural and PCA for pain control.").toDF("text") + // Show results -val 
 {%- endcapture -%}
 
 {%- capture model_scala_finance -%}
-from johnsnowlabs import *
-val data = Seq("Our competitors include the following by general category: legacy antivirus product providers, such as McAfee LLC and Broadcom Inc.").toDF("text")
-
-val document_assembler = new nlp.DocumentAssembler()
+val document_assembler = new DocumentAssembler()
   .setInputCol("text")
   .setOutputCol("document")
 
-val sentence_detector = new nlp.SentenceDetector()
+val sentence_detector = new SentenceDetector()
   .setInputCols("document")
   .setOutputCol("sentence")
 
-val tokenizer = new nlp.Tokenizer()
+val tokenizer = new Tokenizer()
   .setInputCols("sentence")
   .setOutputCol("token")
 
-val embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en")
+val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en")
   .setInputCols(Array("sentence", "token"))
   .setOutputCol("embeddings")
 
-val ner_model = finance.NerModel.pretrained("finner_orgs_prods_alias","en","finance/models")
+val ner_model = FinanceNerModel.pretrained("finner_orgs_prods_alias","en","finance/models")
   .setInputCols(Array("sentence", "token", "embeddings"))
   .setOutputCol("ner")
 
-val ner_converter = new finance.NerConverterInternal()
+val ner_converter = new NerConverter()
   .setInputCols(Array("sentence", "token", "ner"))
   .setOutputCol("ner_chunk")
 
-val assertion = finance.AssertionDLModel.pretrained("finassertion_competitors", "en", "finance/models")
+val assertion = AssertionDLModel.pretrained("finassertion_competitors", "en", "finance/models")
   .setInputCols(Array("sentence", "ner_chunk", "embeddings"))
   .setOutputCol("assertion")
 
@@ -296,47 +328,56 @@ val pipeline = new Pipeline().setStages(Array(
     ner_model,
     ner_converter,
     assertion
-  )
+  ))
+
+val data = Seq("Our competitors include the following by general category: legacy antivirus product providers, such as McAfee LLC and Broadcom Inc.").toDF("text")
+
+// Show results
+val result = pipeline.fit(data).transform(data)
+
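+// The flattened table below can be produced the same way as in the Python
+// example above (a sketch mirroring that snippet; it assumes the same
+// column layout):
+import org.apache.spark.sql.functions.{arrays_zip, col, explode, expr}
+
+result.select(explode(arrays_zip(col("ner_chunk.result"),
+                                 col("ner_chunk.metadata"),
+                                 col("assertion.result"))).alias("cols"))
+  .select(expr("cols['1']['sentence']").alias("sent_id"),
+          expr("cols['0']").alias("chunk"),
+          expr("cols['1']['entity']").alias("ner_label"),
+          expr("cols['2']").alias("assertion")).show(truncate = false)
+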
and Atlantic Inc.").toDF("text") -val document_assembler = new nlp.DocumentAssembler() +val document_assembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") -val sentence_detector = nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx") +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx") .setInputCols("document") .setOutputCol("sentence") -val tokenizer = new nlp.Tokenizer() +val tokenizer = new Tokenizer() .setInputCols("sentence") .setOutputCol("token") -val embeddings_ner = nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en") +val embeddings_ner = RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en") .setInputCols(Array("sentence", "token")) .setOutputCol("embeddings_ner") -val ner_model = legal.NerModel.pretrained('legner_contract_doc_parties', 'en', 'legal/models') +val ner_model = LegalNerModel.pretrained('legner_contract_doc_parties', 'en', 'legal/models') .setInputCols(Array("sentence", "token", "embeddings_ner")) .setOutputCol("ner") -val ner_converter = new nlp.NerConverter() +val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") .setWhiteList(Array("DOC", "EFFDATE", "PARTY")) -val embeddings_ass = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") +val embeddings_ass = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") .setInputCols(Array("sentence", "token")) .setOutputCol("embeddings_ass") -val assertion = legal.AssertionDLModel.pretrained("legassertion_time", "en", "legal/models") +val assertion = AssertionDLModel.pretrained("legassertion_time", "en", "legal/models") .setInputCols(Array("sentence", "ner_chunk", "embeddings_ass")) .setOutputCol("assertion") @@ -349,9 +390,22 @@ val pipeline = new Pipeline().setStages(Array( ner_converter, embeddings_ass, assertion - ) + )) + +val data = Seq("This is an Intellectual Property Agreement between Amazon Inc. 
and Atlantic Inc.").toDF("text") + + +// Show results +val result = pipeline.fit(data).transform(data) + ++-------------------------------+-----+---+---------+---------+ +|chunk |begin|end|ner_label|assertion| ++-------------------------------+-----+---+---------+---------+ +|Intellectual Property Agreement|11 |41 |DOC |PRESENT | +|Amazon Inc |51 |60 |PARTY |PRESENT | +|Atlantic Inc |67 |78 |PARTY |PRESENT | ++-------------------------------+-----+---+---------+---------+ -val assertionModel = pipeline.fit(data) {%- endcapture -%} @@ -387,12 +441,20 @@ from johnsnowlabs import * document = nlp.DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") + chunk = nlp.Doc2Chunk() \ .setInputCols(["document"]) \ - .setOutputCol("chunk") + .setOutputCol("chunk") \ + .setChunkCol("target")\ + .setStartCol("start")\ + .setStartColByTokenIndex(True)\ + .setFailOnMissing(False)\ + .setLowerCase(True) + token = nlp.Tokenizer() \ .setInputCols(["document"]) \ .setOutputCol("token") + embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \ .setInputCols(["document", "token"]) \ .setOutputCol("embeddings") @@ -410,7 +472,7 @@ assertionStatus = medical.AssertionDLApproach() \ .setEndCol("end") \ .setMaxSentLen(250) -trainingPipeline = Pipeline().setStages([ +trainingPipeline = nlp.Pipeline().setStages([ document, chunk, token, @@ -418,8 +480,7 @@ trainingPipeline = Pipeline().setStages([ assertionStatus ]) -assertionModel = trainingPipeline.fit(data) -assertionResults = assertionModel.transform(data).cache() +assertionResults = trainingPipeline.fit(data).transform(data).cache() {%- endcapture -%} {%- capture approach_python_legal -%} @@ -428,12 +489,15 @@ from johnsnowlabs import * document = nlp.DocumentAssembler()\ .setInputCol("sentence")\ .setOutputCol("document") + chunk = nlp.Doc2Chunk()\ .setInputCols("document")\ .setOutputCol("doc_chunk") + token = nlp.Tokenizer()\ .setInputCols(['document'])\ .setOutputCol('token') + roberta_embeddings = nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base","en") \ .setInputCols(["document", "token"]) \ .setOutputCol("embeddings") \ @@ -442,7 +506,7 @@ roberta_embeddings = nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_ # Define AssertionDLApproach with parameters and start training assertionStatus = legal.AssertionDLApproach()\ .setLabelCol("assertion_label")\ - .setInputCols("document", "doc_chunk", "embeddings")\ + .setInputCols(["document", "doc_chunk", "embeddings"])\ .setOutputCol("assertion")\ .setBatchSize(128)\ .setLearningRate(0.001)\ @@ -459,7 +523,7 @@ assertionStatus = legal.AssertionDLApproach()\ #.setValidationSplit(0.2)\ #.setDropout(0.1)\ -trainingPipeline = Pipeline().setStages([ +trainingPipeline = nlp.Pipeline().setStages([ document, chunk, token, @@ -467,8 +531,7 @@ trainingPipeline = Pipeline().setStages([ assertionStatus ]) -assertionModel = trainingPipeline.fit(data) -assertionResults = assertionModel.transform(data).cache() +assertionResults = trainingPipeline.fit(data).transform(data).cache() {%- endcapture -%} {%- capture approach_python_finance -%} @@ -477,12 +540,15 @@ from johnsnowlabs import * document = nlp.DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") + chunk = nlp.Doc2Chunk() \ .setInputCols(["document"]) \ .setOutputCol("chunk") + token = nlp.Tokenizer() \ .setInputCols(["document"]) \ .setOutputCol("token") + embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \ 
.setInputCols(["document", "token"]) \ .setOutputCol("embeddings") @@ -500,7 +566,7 @@ assertionStatus = finance.AssertionDLApproach() \ .setEndCol("end") \ .setMaxSentLen(250) -trainingPipeline = Pipeline().setStages([ +trainingPipeline = nlp.Pipeline().setStages([ document, chunk, token, @@ -508,35 +574,36 @@ trainingPipeline = Pipeline().setStages([ assertionStatus ]) -assertionModel = trainingPipeline.fit(data) -assertionResults = assertionModel.transform(data).cache() +assertionResults = trainingPipeline.fit(data).transform(data).cache() {%- endcapture -%} {%- capture approach_scala_medical -%} -from johnsnowlabs import * // First, pipeline stages for pre-processing the dataset (containing columns for text and label) are defined. -val document = new nlp.DocumentAssembler() +val document = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") -val chunk = new nlp.Doc2Chunk() - .setInputCols("document") + +val chunk = new Doc2Chunk() + .setInputCols(Array("document")) .setOutputCol("chunk") -val token = new nlp.Tokenizer() + +val token = new Tokenizer() .setInputCols("document") .setOutputCol("token") -val embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") - .setInputCols("document", "token") + +val embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("document", "token")) .setOutputCol("embeddings") // Define AssertionDLApproach with parameters and start training -val assertionStatus = new medical.AssertionDLApproach() +val assertionStatus = new AssertionDLApproach() .setLabelCol("label") - .setInputCols("document", "chunk", "embeddings") + .setInputCols(Array("document", "chunk", "embeddings")) .setOutputCol("assertion") .setBatchSize(128) - .setDropout(0.012f) - .setLearningRate(0.015f) + .setDropout(0.012) + .setLearningRate(0.015) .setEpochs(1) .setStartCol("start") .setEndCol("end") @@ -550,51 +617,52 @@ val trainingPipeline = new Pipeline().setStages(Array( assertionStatus )) -val assertionModel = trainingPipeline.fit(data) -val assertionResults = assertionModel.transform(data).cache() +val assertionResults = trainingPipeline.fit(data).transform(data).cache() {%- endcapture -%} {%- capture approach_scala_legal -%} -from johnsnowlabs import * -val document = new nlp.DocumentAssembler()\ - .setInputCol("sentence")\ +val document = new DocumentAssembler() + .setInputCol("sentence") .setOutputCol("document") -val chunk = new nlp.Doc2Chunk()\ - .setInputCols("document")\ - .setOutputCol("doc_chunk")\ - .setChunkCol("chunk")\ - .setStartCol("tkn_start")\ - .setStartColByTokenIndex(True)\ - .setFailOnMissing(False)\ - .setLowerCase(False) -val token = new nlp.Tokenizer()\ - .setInputCols(['document'])\ + +val chunk = new Doc2Chunk() + .setInputCols(Array("document")) + .setOutputCol("doc_chunk") + .setChunkCol("chunk") + .setStartCol("tkn_start") + .setStartColByTokenIndex(true) + .setFailOnMissing(false) + .setLowerCase(false) + +val token = new Tokenizer() + .setInputCols(Array('document')) .setOutputCol('token') -val roberta_embeddings = nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base","en") \ - .setInputCols(["document", "token"]) \ - .setOutputCol("embeddings") \ + +val roberta_embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") .setMaxSentenceLength(512) # Define AssertionDLApproach with parameters and start training -val 
-val assertionModel = trainingPipeline.fit(data)
-val assertionResults = assertionModel.transform(data).cache()
+val assertionResults = trainingPipeline.fit(data).transform(data).cache()
 {%- endcapture -%}
 
 {%- capture approach_scala_finance -%}
-from johnsnowlabs import *
 // First, pipeline stages for pre-processing the dataset (containing columns for text and label) are defined.
-val document = new nlp.DocumentAssembler()
+val document = new DocumentAssembler()
   .setInputCol("text")
   .setOutputCol("document")
 
-val chunk = new nlp.Doc2Chunk()
-  .setInputCols("document")
+val chunk = new Doc2Chunk()
  .setInputCols(Array("document"))
   .setOutputCol("chunk")
 
-val token = new nlp.Tokenizer()
+val token = new Tokenizer()
   .setInputCols("document")
   .setOutputCol("token")
 
-val embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
-  .setInputCols("document", "token")
+val embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
+  .setInputCols(Array("document", "token"))
   .setOutputCol("embeddings")
 
 // Define AssertionDLApproach with parameters and start training
-val assertionStatus = new finance.AssertionDLApproach()
+val assertionStatus = new AssertionDLApproach()
   .setLabelCol("label")
-  .setInputCols("document", "chunk", "embeddings")
+  .setInputCols(Array("document", "chunk", "embeddings"))
   .setOutputCol("assertion")
   .setBatchSize(128)
   .setDropout(0.012f)
   .setLearningRate(0.015f)
   .setEpochs(1)
   .setStartCol("start")
   .setEndCol("end")
@@ -646,8 +716,7 @@ val trainingPipeline = new Pipeline().setStages(Array(
   assertionStatus
 ))
 
-val assertionModel = trainingPipeline.fit(data)
-val assertionResults = assertionModel.transform(data).cache()
+val assertionResults = trainingPipeline.fit(data).transform(data).cache()
 {%- endcapture -%}