From 998dbdff955256b4154ca4005aea9380326233d9 Mon Sep 17 00:00:00 2001 From: Akar <67700732+akrztrk@users.noreply.github.com> Date: Thu, 28 Dec 2023 08:33:42 +0100 Subject: [PATCH] updated annotators (#841) --- .../AssertionChunkConverter.md | 56 +- .../licensed_annotator_entries/AssertionDL.md | 56 +- .../AssertionFilterer.md | 26 +- .../AssertionLogReg.md | 42 +- .../AverageEmbeddings.md | 8 + ...on.md => BertForSequenceClassification.md} | 17 +- ...lassifier.md => BertForTokenClassifier.md} | 8 +- .../BertSentenceChunkEmbeddings.md | 24 +- .../licensed_annotator_entries/Chunk2Token.md | 46 +- .../ChunkConverter.md | 65 +- .../ChunkEntityResolver.md | 42 +- .../ChunkFilterer.md | 54 +- .../ChunkKeyPhraseExtraction.md | 51 +- .../licensed_annotator_entries/ChunkMapper.md | 92 +-- .../licensed_annotator_entries/ChunkMerge.md | 44 +- .../DateNormalizer.md | 11 +- ...=> DistilBertForSequenceClassification.md} | 18 +- .../Doc2ChunkInternal.md | 5 +- .../DocumentHashCoder.md | 2 +- .../DocumentLogRegClassifier.md | 13 +- .../DrugNormalizer.md | 2 +- .../EntityChunkEmbeddings.md | 3 +- .../FeaturesAssembler.md | 3 +- .../GenericClassifier.md | 142 ++++- .../licensed_annotator_entries/NerChunker.md | 2 +- .../NerConverterInternal.md | 108 ++-- .../NerDisambiguator.md | 471 +++++--------- .../en/licensed_annotator_entries/NerModel.md | 569 +++++++++++------ .../NerQuestionGenerator.md | 4 + .../QuestionAnswering.md | 16 +- .../RelationExtraction.md | 356 ++++++----- .../ResolverMerger.md | 1 + docs/en/licensed_annotator_entries/Router.md | 4 + .../SentenceEntityResolver.md | 593 ++++++++---------- .../WindowedSentenceModel.md | 5 + docs/en/licensed_annotators.md | 35 +- 36 files changed, 1611 insertions(+), 1383 deletions(-) rename docs/en/licensed_annotator_entries/{MedicalBertForSequenceClassification.md => BertForSequenceClassification.md} (79%) rename docs/en/licensed_annotator_entries/{MedicalBertForTokenClassifier.md => BertForTokenClassifier.md} (95%) rename docs/en/licensed_annotator_entries/{MedicalDistilBertForSequenceClassification.md => DistilBertForSequenceClassification.md} (78%) diff --git a/docs/en/licensed_annotator_entries/AssertionChunkConverter.md b/docs/en/licensed_annotator_entries/AssertionChunkConverter.md index e4a17d897f..3eaf429f91 100644 --- a/docs/en/licensed_annotator_entries/AssertionChunkConverter.md +++ b/docs/en/licensed_annotator_entries/AssertionChunkConverter.md @@ -14,7 +14,7 @@ In some cases, there may be issues while creating the chunk column when using to The `AssertionChunkConverter` annotator uses both the begin and end indices of the tokens as input to add more robust metadata to the chunk column in a way that improves the reliability of the indices and avoids loss of data. -AssertionChunkConverter Parameters: +Parameters: - `chunkBeginCol`: (Str) The column containing the start index of the chunk. 
@@ -39,6 +39,7 @@ CHUNK {%- endcapture -%} {%- capture model_python_medical -%} +from johnsnowlabs import nlp, medical document_assembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -98,7 +99,8 @@ results.selectExpr( {%- endcapture -%} {%- capture model_scala_medical -%} - +import spark.implicits._ + val document_assembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") @@ -129,19 +131,6 @@ val data = Seq(Array( val results = pipeline.fit(data).transform(data) -results.selectExpr( - "target", - "char_begin", - "char_end", - "token_begin", - "token_end", - "chunk.begin", - "chunk.end", - "tokens[token_begin].result as begin_result", - "tokens[token_end].result as end_result", - "chunk.result" -).show(false) - +------+----------+--------+-----------+---------+--------------------------+------------------------+------+----------------------------------------------+ |target|char_begin|char_end|token_begin|token_end|tokens[token_begin].result|tokens[token_end].result|target|chunk | +------+----------+--------+-----------+---------+--------------------------+------------------------+------+----------------------------------------------+ @@ -152,6 +141,7 @@ results.selectExpr( {%- endcapture -%} {%- capture model_python_finance -%} +from johnsnowlabs import nlp, finance document_assembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -211,7 +201,8 @@ results.selectExpr( {%- endcapture -%} {%- capture model_scala_finance -%} - +import spark.implicits._ + val document_assembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") @@ -242,19 +233,6 @@ val data = Seq(Array( val results = pipeline.fit(data).transform(data) -results.selectExpr( - "target", - "char_begin", - "char_end", - "token_begin", - "token_end", - "chunk.begin", - "chunk.end", - "tokens[token_begin].result as begin_result", - "tokens[token_end].result as end_result", - "chunk.result" -).show(false) - +-----------------+----------+--------+-----------+---------+-----+----+------------+----------+-------------------+ |target |char_begin|char_end|token_begin|token_end|begin|end |begin_result|end_result|result | +-----------------+----------+--------+-----------+---------+-----+----+------------+----------+-------------------+ @@ -265,6 +243,7 @@ results.selectExpr( {%- endcapture -%} {%- capture model_python_legal -%} +from johnsnowlabs import nlp, legal document_assembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -324,6 +303,7 @@ results.selectExpr( {%- endcapture -%} {%- capture model_scala_legal -%} +import spark.implicits._ val document_assembler = new DocumentAssembler() .setInputCol("text") @@ -355,19 +335,6 @@ val data = Seq(Array( val results = pipeline.fit(data).transform(data) -results.selectExpr( - "target", - "char_begin", - "char_end", - "token_begin", - "token_end", - "chunk.begin", - "chunk.end", - "tokens[token_begin].result as begin_result", - "tokens[token_end].result as end_result", - "chunk.result" -).show(false) - +-------+----------+--------+-----------+---------+--------------------------+------------------------+-------+-----------------------------------------------+ |target |char_begin|char_end|token_begin|token_end|tokens[token_begin].result|tokens[token_end].result|target |chunk | +-------+----------+--------+-----------+---------+--------------------------+------------------------+-------+-----------------------------------------------+ @@ -385,11 +352,13 @@ results.selectExpr( 
[AssertionChunkConverter](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/assertion/assertion_chunk_converter/index.html#sparknlp_jsl.annotator.assertion.assertion_chunk_converter.AssertionChunkConverter)
{%- endcapture -%}
+{%- capture model_notebook_link -%}
+[AssertionChunkConverterNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/AssertionChunkConverter.ipynb)
+{%- endcapture -%}

{% include templates/licensed_approach_model_medical_fin_leg_template.md
title=title
model=model
-approach=approach
model_description=model_description
model_input_anno=model_input_anno
model_output_anno=model_output_anno
@@ -401,4 +370,5 @@
model_python_legal=model_python_legal
model_scala_legal=model_scala_legal
model_api_link=model_api_link
model_python_api_link=model_python_api_link
+model_notebook_link=model_notebook_link
%}
diff --git a/docs/en/licensed_annotator_entries/AssertionDL.md b/docs/en/licensed_annotator_entries/AssertionDL.md
index 26e1c7754a..bddc98b479 100644
--- a/docs/en/licensed_annotator_entries/AssertionDL.md
+++ b/docs/en/licensed_annotator_entries/AssertionDL.md
@@ -20,6 +20,15 @@ and [WordEmbeddingsModel](/docs/en/annotators#wordembeddings).
The result is an assertion status annotation for each recognized entity.
Possible values include `“present”, “absent”, “hypothetical”, “conditional”, “associated_with_other_person”` etc.

+Parameters:
+- `inputCols`: Gets current column names of input annotations.
+
+- `outputCol`: Gets output column name of annotations.
+
+- `scopeWindow`: Sets the scope of the window of the assertion expression.
+
+- `entityAssertionCaseSensitive`: Sets the case sensitivity of entities and assertion labels.
+
For pretrained models please see the
[Models Hub](https://nlp.johnsnowlabs.com/models?task=Assertion+Status) for available models.
{%- endcapture -%} @@ -34,7 +43,7 @@ ASSERTION {%- capture model_python_medical -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, medical # Define pipeline stages to extract NER chunks first documentAssembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -96,7 +105,7 @@ result.selectExpr("ner_chunk.result as chunk_result", "assertion.result as asser {%- endcapture -%} {%- capture model_python_finance -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, finance document_assembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -156,7 +165,8 @@ result.select(F.explode(F.arrays_zip(result.ner_chunk.result, result.ner_chunk.m {%- endcapture -%} {%- capture model_python_legal -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, legal + document_assembler = nlp.DocumentAssembler()\ .setInputCol("text")\ .setOutputCol("document") @@ -229,6 +239,7 @@ result.select(F.explode(F.arrays_zip(result.ner_chunk.result, {%- capture model_scala_medical -%} +import spark.implicits._ // Define pipeline stages to extract NER chunks first val documentAssembler = new DocumentAssembler() @@ -278,7 +289,6 @@ val data = Seq( // Show results val result = assertionPipeline.fit(data).transform(data) -result.selectExpr("ner_chunk.result as chunk_result", "assertion.result as assertion_result").show(3, truncate=false) +--------------------------------+--------------------------------+ |chunk_result |assertion_result | @@ -291,6 +301,7 @@ result.selectExpr("ner_chunk.result as chunk_result", "assertion.result as asser {%- endcapture -%} {%- capture model_scala_finance -%} +import spark.implicits._ val document_assembler = new DocumentAssembler() .setInputCol("text") @@ -347,6 +358,7 @@ val result = pipeline.fit(data).transform(data) {%- capture model_scala_legal -%} +import spark.implicits._ val document_assembler = new DocumentAssembler() .setInputCol("text") @@ -419,12 +431,27 @@ val result = pipeline.fit(data).transform(data) [AssertionDLModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/assertion/assertionDL/index.html#sparknlp_jsl.annotator.assertion.assertionDL.AssertionDLModel) {%- endcapture -%} +{%- capture model_notebook_link -%} +[AssertionDLModelNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/AssertionDLModel.ipynb) +{%- endcapture -%} + {%- capture approach_description -%} Trains AssertionDL, a deep Learning based approach used to extract Assertion Status from extracted entities and text. Contains all the methods for training an AssertionDLModel. For pretrained models please use AssertionDLModel and see the [Models Hub](https://nlp.johnsnowlabs.com/models?task=Assertion+Status) for available models. + +Parameters: + +- `inputCols`: Gets current column names of input annotations. + +- `outputCol`: Gets output column name of annotations. + +- `ScopeWindow`: Sets the scope of the window of the assertion expression. + +- `StartCol`: Set a column that contains the token number for the start of the target. + {%- endcapture -%} {%- capture approach_input_anno -%} @@ -436,7 +463,8 @@ ASSERTION {%- endcapture -%} {%- capture approach_python_medical -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, medical + # First, pipeline stages for pre-processing the dataset (containing columns for text and label) are defined. 
document = nlp.DocumentAssembler() \ .setInputCol("text") \ @@ -484,7 +512,8 @@ assertionResults = trainingPipeline.fit(data).transform(data).cache() {%- endcapture -%} {%- capture approach_python_legal -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, legal + # First, pipeline stages for pre-processing the dataset (containing columns for text and label) are defined. document = nlp.DocumentAssembler()\ .setInputCol("sentence")\ @@ -535,7 +564,8 @@ assertionResults = trainingPipeline.fit(data).transform(data).cache() {%- endcapture -%} {%- capture approach_python_finance -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, finance + # First, pipeline stages for pre-processing the dataset (containing columns for text and label) are defined. document = nlp.DocumentAssembler() \ .setInputCol("text") \ @@ -578,6 +608,7 @@ assertionResults = trainingPipeline.fit(data).transform(data).cache() {%- endcapture -%} {%- capture approach_scala_medical -%} +import spark.implicits._ // First, pipeline stages for pre-processing the dataset (containing columns for text and label) are defined. val document = new DocumentAssembler() @@ -621,6 +652,7 @@ val assertionResults = trainingPipeline.fit(data).transform(data).cache() {%- endcapture -%} {%- capture approach_scala_legal -%} +import spark.implicits._ val document = new DocumentAssembler() .setInputCol("sentence") @@ -676,9 +708,9 @@ val assertionResults = trainingPipeline.fit(data).transform(data).cache() {%- endcapture -%} {%- capture approach_scala_finance -%} -from johnsnowlabs import * - +import spark.implicits._ // First, pipeline stages for pre-processing the dataset (containing columns for text and label) are defined. + val document = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") @@ -728,6 +760,10 @@ val assertionResults = trainingPipeline.fit(data).transform(data).cache() [AssertionDLApproach](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/assertion/assertionDL/index.html#sparknlp_jsl.annotator.assertion.assertionDL.AssertionDLApproach) {%- endcapture -%} +{%- capture approach_notebook_link -%} +[AssertionDLApproachNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/AssertionDLApproach.ipynb) +{%- endcapture -%} + {% include templates/licensed_approach_model_medical_fin_leg_template.md title=title model=model @@ -743,6 +779,7 @@ model_scala_finance=model_scala_finance model_scala_legal=model_scala_legal model_api_link=model_api_link model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link approach_description=approach_description approach_input_anno=approach_input_anno approach_output_anno=approach_output_anno @@ -754,4 +791,5 @@ approach_scala_legal=approach_scala_legal approach_scala_finance=approach_scala_finance approach_api_link=approach_api_link approach_python_api_link=approach_python_api_link +approach_notebook_link=approach_notebook_link %} diff --git a/docs/en/licensed_annotator_entries/AssertionFilterer.md b/docs/en/licensed_annotator_entries/AssertionFilterer.md index fd262cd953..0993005676 100644 --- a/docs/en/licensed_annotator_entries/AssertionFilterer.md +++ b/docs/en/licensed_annotator_entries/AssertionFilterer.md @@ -12,7 +12,7 @@ Filters can be set via a white list on the extracted chunk, the assertion or a r White list for assertion is enabled by default. To use chunk white list, `criteria` has to be set to `"isin"`. 
For regex, `criteria` has to be set to `"regex"`. -AssertionFilterer Parameters; +Parameters: - `whiteList`: (list) If defined, list of entities to process. The rest will be ignored. @@ -20,7 +20,7 @@ AssertionFilterer Parameters; - `regex`: (list) List of dash-separated pairs of named entities. -- `criteria`: (list) Set tag representing what is the criteria to filter the chunks. possibles values (assertion|isIn|regex). *assertion*: Filter by the assertion *isIn* : Filter by the chunk *regex* : Filter using a regex +- `criteria`: (list) Set tag representing what is the criteria to filter the chunks. possibles values (assertion,isIn,regex). *assertion*: Filter by the assertion, *isIn* : Filter by the chunk, *regex* : Filter using a regex. - `entitiesConfidence`: (Str) Entity pairs to remove based on the confidence level. {%- endcapture -%} @@ -35,7 +35,7 @@ CHUNK {%- capture model_python_medical -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, medical # Annotator that transforms a text column from dataframe into an Annotation ready for NLP documentAssembler = nlp.DocumentAssembler()\ @@ -113,7 +113,7 @@ result.select("filtered.result").show(3, truncate=False) {%- capture model_python_legal -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, legal document_assembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -190,7 +190,7 @@ result.select("assertion_filtered.result").show(3, truncate=False) {%- capture model_python_finance -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, finance document_assembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -259,7 +259,7 @@ result.select("assertion_filtered.result").show(3, truncate=False) {%- endcapture -%} {%- capture model_scala_medical -%} -from johnsnowlabs import * +import spark.implicits._ // Annotator that transforms a text column from dataframe into an Annotation ready for NLP val documentAssembler = new DocumentAssembler() @@ -322,7 +322,6 @@ val result = nlpPipeline.fit(data).transform(data) // Show results: -result.selectExpr("ner_chunk.result as ner_chunk", "assertion.result as assertion").show(3, truncate=false) +------------------------------------------------+--------------------------------------------------+ |ner_chunk |assertion | +------------------------------------------------+--------------------------------------------------+ @@ -339,7 +338,7 @@ result.select("filtered.result").show(3, truncate=false) {%- endcapture -%} {%- capture model_scala_legal -%} -from johnsnowlabs import * +import spark.implicits._ // Annotator that transforms a text column from dataframe into an Annotation ready for NLP val document_assembler = new DocumentAssembler() @@ -406,8 +405,6 @@ val data = Seq(text).toDF("text") val result = nlpPipeline.fit(data).transform(data) // Show results: - -result.selectExpr("ner_chunk.result as ner_chunk", "assertion.result as assertion").show(3, truncate=false) +-----------------------------------------------------------+---------------------------+ |ner_chunk |assertion | +-----------------------------------------------------------+---------------------------+ @@ -424,7 +421,7 @@ result.select("filtered.result").show(3, truncate=false) {%- endcapture -%} {%- capture model_scala_finance -%} -from johnsnowlabs import * +import spark.implicits._ // Annotator that transforms a text column from dataframe into an Annotation ready for NLP val document_assembler = new DocumentAssembler() @@ -484,8 +481,6 @@ val data = Seq(text).toDF("text") val result = 
nlpPipeline.fit(data).transform(data) // Show results: - -result.selectExpr("ner_chunk.result as ner_chunk", "assertion.result as assertion").show(3, truncate=false) +--------------------------+------------------------+ |ner_chunk |assertion | +--------------------------+------------------------+ @@ -509,6 +504,10 @@ result.select("filtered.result").show(3, truncate=false) [AssertionFilterer](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/chunker/assertion_filterer/index.html#sparknlp_jsl.annotator.chunker.assertion_filterer.AssertionFilterer) {%- endcapture -%} +{%- capture model_notebook_link -%} +[AssertionFiltererNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/AssertionFilterer.ipynb) +{%- endcapture -%} + {% include templates/licensed_approach_model_medical_fin_leg_template.md title=title model=model @@ -523,4 +522,5 @@ model_scala_legal=model_scala_legal model_scala_finance=model_scala_finance model_api_link=model_api_link model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link %} diff --git a/docs/en/licensed_annotator_entries/AssertionLogReg.md b/docs/en/licensed_annotator_entries/AssertionLogReg.md index 21436fe7c6..20319b2c01 100644 --- a/docs/en/licensed_annotator_entries/AssertionLogReg.md +++ b/docs/en/licensed_annotator_entries/AssertionLogReg.md @@ -18,7 +18,7 @@ Unlike the DL Model, this class does not extend AnnotatorModel. Instead it exten At the moment there are no pretrained models available for this class. Please refer to AssertionLogRegApproach to train your own model. -AssertionLogReg Parametres: +Parametres: - `setAfter(after: Int)`: AssertionLogRegModel.this.type Length of the context after the target (Default: 13) - `setBefore(before: Int)`: AssertionLogRegModel.this.type @@ -45,10 +45,32 @@ ASSERTION [AssertionLogRegModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/assertion/assertion_dl_reg/index.html#sparknlp_jsl.annotator.assertion.assertion_dl_reg.AssertionLogRegModel) {%- endcapture -%} +{%- capture model_notebook_link -%} +[AssertionLogRegModelNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/AssertionLogRegModel.ipynb) +{%- endcapture -%} + {%- capture approach_description -%} Trains a classification method, which uses the Logarithmic Regression Algorithm. It is used to extract Assertion Status from extracted entities and text. Contains all the methods for training a AssertionLogRegModel, together with trainWithChunk, trainWithStartEnd. + +Parameters: + +- `label` : Column with label per each token + +- `maxIter`: This specifies the maximum number of iterations to be performed in the model's training, default: 26 + +- `regParam` : This specifies the regularization parameter. Regularization helps to control the complexity of the model, aiding in preventing the issue of overfitting. 
+ +- `eNetParam` : Elastic net parameter + +- `beforeParam` : Length of the context before the target + +- `afterParam` : Length of the context after the target + +- `startCol` : Column that contains the token number for the start of the target + +- `endCol` : Column that contains the token number for the end of the target {%- endcapture -%} {%- capture approach_input_anno -%} @@ -60,7 +82,7 @@ ASSERTION {%- endcapture -%} {%- capture approach_python_medical -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, medical # First define pipeline stages to extract embeddings and text chunks documentAssembler = nlp.DocumentAssembler() \ @@ -106,7 +128,7 @@ assertionModel = assertionPipeline.fit(dataset) {%- endcapture -%} {%- capture approach_python_legal -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, legal # First define pipeline stages to extract embeddings and text chunks documentAssembler = nlp.DocumentAssembler() \ @@ -152,7 +174,7 @@ assertionModel = assertionPipeline.fit(dataset) {%- endcapture -%} {%- capture approach_python_finance -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, finance # First define pipeline stages to extract embeddings and text chunks documentAssembler = nlp.DocumentAssembler() \ @@ -198,7 +220,7 @@ assertionModel = assertionPipeline.fit(dataset) {%- endcapture -%} {%- capture approach_scala_medical -%} -from johnsnowlabs import * + import spark.implicits._ // First define pipeline stages to extract embeddings and text chunks @@ -245,7 +267,7 @@ val assertionModel = assertionPipeline.fit(dataset) {%- endcapture -%} {%- capture approach_scala_legal -%} -from johnsnowlabs import * + import spark.implicits._ // First define pipeline stages to extract embeddings and text chunks @@ -292,7 +314,7 @@ val assertionModel = assertionPipeline.fit(dataset) {%- endcapture -%} {%- capture approach_scala_finance -%} -from johnsnowlabs import * + import spark.implicits._ // First define pipeline stages to extract embeddings and text chunks @@ -346,6 +368,10 @@ val assertionModel = assertionPipeline.fit(dataset) [AssertionLogRegApproach](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/assertion/assertion_dl_reg/index.html#sparknlp_jsl.annotator.assertion.assertion_dl_reg.AssertionLogRegApproach) {%- endcapture -%} +{%- capture approach_notebook_link -%} +[AssertionLogRegApproachNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/AssertionLogRegApproach.ipynb) +{%- endcapture -%} + {% include templates/licensed_approach_model_medical_fin_leg_template.md title=title approach=approach @@ -355,6 +381,7 @@ model_input_anno=model_input_anno model_output_anno=model_output_anno model_api_link=model_api_link model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link approach_description=approach_description approach_input_anno=approach_input_anno approach_output_anno=approach_output_anno @@ -366,4 +393,5 @@ approach_scala_legal=approach_scala_legal approach_scala_finance=approach_scala_finance approach_api_link=approach_api_link approach_python_api_link=approach_python_api_link +approach_notebook_link=approach_notebook_link %} diff --git a/docs/en/licensed_annotator_entries/AverageEmbeddings.md b/docs/en/licensed_annotator_entries/AverageEmbeddings.md index 2e804e78ca..9281434899 100644 --- a/docs/en/licensed_annotator_entries/AverageEmbeddings.md +++ b/docs/en/licensed_annotator_entries/AverageEmbeddings.md @@ -8,6 
+8,14 @@ model
{%- capture model_description -%}
`AverageEmbeddings` computes the mean of vector embeddings for two sentences of equal size, producing a unified representation.
+
+Parameters:
+
+- `inputCols`: The name of the columns containing the input annotations. It can read either a String column or an Array.
+
+- `outputCol`: The name of the column in Document type that is generated. We can specify only one column here.
+
+All the parameters can be set using the corresponding set method in camel case. For example, `.setInputCols()`.
{%- endcapture -%}

{%- capture model_input_anno -%}
diff --git a/docs/en/licensed_annotator_entries/MedicalBertForSequenceClassification.md b/docs/en/licensed_annotator_entries/BertForSequenceClassification.md
similarity index 79%
rename from docs/en/licensed_annotator_entries/MedicalBertForSequenceClassification.md
rename to docs/en/licensed_annotator_entries/BertForSequenceClassification.md
index 1592bdc78d..9bb041e3a3 100644
--- a/docs/en/licensed_annotator_entries/MedicalBertForSequenceClassification.md
+++ b/docs/en/licensed_annotator_entries/BertForSequenceClassification.md
@@ -1,5 +1,5 @@
{%- capture title -%}
-MedicalBertForSequenceClassification
+BertForSequenceClassification
{%- endcapture -%}

{%- capture model -%}
@@ -7,8 +7,17 @@ model
{%- endcapture -%}

{%- capture model_description -%}
- `MedicalBertForSequenceClassification` can load Bert Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.
+ `BertForSequenceClassification` can load Bert Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.

+Parameters:
+
+- `batchSize`: Size of every batch. Default: 8.
+
+- `coalesceSentences`: Instead of 1 class per sentence (if `inputCols` is `"sentence"`), output 1 class per document by averaging probabilities in all sentences. Default: False.
+
+- `maxSentenceLength`: Max sentence length to process. Default: 128.
+
+- `caseSensitive`: Whether to ignore case in tokens for embeddings matching. Default: True.
{%- endcapture -%}

{%- capture model_input_anno -%}
@@ -96,11 +105,11 @@ val result = pipeline.fit(data).transform(data)


{%- capture model_python_api_link -%}
-[MedicalBertForSequenceClassification](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/classification/medical_bert_for_sequence_classification/index.html#)
+[BertForSequenceClassification](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/classification/medical_bert_for_sequence_classification/index.html#)
{%- endcapture -%}

{%- capture model_scala_api_link -%}
-[MedicalBertForSequenceClassification](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/classification/MedicalBertForSequenceClassification.html)
+[BertForSequenceClassification](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/classification/MedicalBertForSequenceClassification.html)
{%- endcapture -%}


diff --git a/docs/en/licensed_annotator_entries/MedicalBertForTokenClassifier.md b/docs/en/licensed_annotator_entries/BertForTokenClassifier.md
similarity index 95%
rename from docs/en/licensed_annotator_entries/MedicalBertForTokenClassifier.md
rename to docs/en/licensed_annotator_entries/BertForTokenClassifier.md
index 03fd34c9bc..c92c9fbd19 100644
--- a/docs/en/licensed_annotator_entries/MedicalBertForTokenClassifier.md
+++ b/docs/en/licensed_annotator_entries/BertForTokenClassifier.md
@@ -1,5 +1,5 @@
{%- capture title -%}
-MedicalBertForTokenClassifier
+BertForTokenClassifier
{%- endcapture -%}

{%- capture model -%}
@@ -8,7 +8,7 @@ model

{%- capture model_description -%}
-`MedicalBertForTokenClassifier` can load Bert Models with a token classification head on top (a linear layer on top of the hidden-states output) for Named-Entity-Recognition (NER) tasks.\
+`BertForTokenClassifier` can load Bert Models with a token classification head on top (a linear layer on top of the hidden-states output) for Named-Entity-Recognition (NER) tasks.\

Parameters:

@@ -191,11 +191,11 @@ val result = pipeline.fit(data).transform(data)
{%- endcapture -%}

{%- capture model_python_api_link -%}
-[MedicalBertForTokenClassifier](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/classification/medical_bert_for_token_classifier/index.html)
+[BertForTokenClassifier](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/classification/medical_bert_for_token_classifier/index.html)
{%- endcapture -%}

{%- capture model_scala_api_link -%}
-[MedicalBertForTokenClassifier](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/classification/MedicalBertForTokenClassifier.html)
+[BertForTokenClassifier](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/classification/MedicalBertForTokenClassifier.html)
{%- endcapture -%}

{% include templates/licensed_approach_model_medical_fin_leg_template.md
diff --git a/docs/en/licensed_annotator_entries/BertSentenceChunkEmbeddings.md b/docs/en/licensed_annotator_entries/BertSentenceChunkEmbeddings.md
index 8aa17224ec..b51121b033 100644
--- a/docs/en/licensed_annotator_entries/BertSentenceChunkEmbeddings.md
+++ b/docs/en/licensed_annotator_entries/BertSentenceChunkEmbeddings.md
@@ -9,9 +9,19 @@ model

{%- capture model_description -%}
This annotator allows aggregating sentence embeddings with ner chunk embeddings to get specific and more accurate resolution codes. It works by averaging sentence and chunk embeddings to add contextual information to the embedding value. Input to this annotator is the context (sentence) and ner chunks, while the output is embedding for each chunk that can be fed to the resolver model.

-BertSentenceChunkEmbeddings Parametres:
+Parameters:

-- `setChunkWeight(value: Float)`: BertSentenceChunkEmbeddings.this.type Sets the wieght of the chunk embeddings relative to the sentence embeddings.The `setChunkWeight` parameter can be used to control the influence of surrounding context.
+- `inputCols`: The name of the columns containing the input annotations. It can read either a String column or an Array.
+
+- `outputCol`: The name of the column in Document type that is generated. We can specify only one column here.
+
+- `chunkWeight`: Relative weight of chunk embeddings in comparison to sentence embeddings. The value should be between 0 and 1. The default is 0.5, which means the chunk and sentence embeddings are given equal weight.
+
+- `maxSentenceLength`: Max sentence length to process, by default 128.
+
+- `caseSensitive`: Determines whether the definitions of the white listed entities are case sensitive.
+
+All the parameters can be set using the corresponding set method in camel case. For example, `.setInputCols()`.

> For more information and examples of `BertSentenceChunkEmbeddings` annotator, you can check the [Spark NLP Workshop](https://github.com/JohnSnowLabs/spark-nlp-workshop), and in special, the notebook [24.1.Improved_Entity_Resolution_with_SentenceChunkEmbeddings.ipynb](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/24.1.Improved_Entity_Resolution_with_SentenceChunkEmbeddings.ipynb).
@@ -26,7 +36,7 @@ SENTENCE_EMBEDDINGS
{%- endcapture -%}

{%- capture model_python_medical -%}
-
+from johnsnowlabs import nlp, medical
# Define the pipeline
document_assembler = nlp.DocumentAssembler()\
    .setInputCol("text")\
@@ -140,10 +150,6 @@ Blood Type: AB positive. Rubella: Immune. VDRL: Nonreactive.
Hepatitis C surface val data = Seq(sampleText).toDF("sampleText") val result = pipeline.fit(data).transform(data) -result.selectExpr("explode(sentence_embeddings) AS s") - .selectExpr("s.result", "slice(s.embeddings, 1, 5) AS averageEmbedding") - .show(truncate=false) - +------+--------------------------------------------------------------+ |result|averageEmbedding | +------+--------------------------------------------------------------+ @@ -165,6 +171,9 @@ result.selectExpr("explode(sentence_embeddings) AS s") [BertSentenceChunkEmbeddings](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/embeddings/bert_sentence_embeddings/index.html#sparknlp_jsl.annotator.embeddings.bert_sentence_embeddings.BertSentenceChunkEmbeddings) {%- endcapture -%} +{%- capture model_notebook_link -%} +[BertSentenceChunkEmbeddingsNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/BertSentenceChunkEmbeddings.ipynb) +{%- endcapture -%} {% include templates/licensed_approach_model_medical_fin_leg_template.md title=title @@ -176,4 +185,5 @@ model_python_medical=model_python_medical model_scala_medical=model_scala_medical model_api_link=model_api_link model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link %} diff --git a/docs/en/licensed_annotator_entries/Chunk2Token.md b/docs/en/licensed_annotator_entries/Chunk2Token.md index 6fbb1d045b..4059d328dd 100644 --- a/docs/en/licensed_annotator_entries/Chunk2Token.md +++ b/docs/en/licensed_annotator_entries/Chunk2Token.md @@ -12,7 +12,15 @@ array of chunk-based tokens (annotatorType TOKEN). When the input is empty, an empty array is returned. -This Annotator is specially convenient when using NGramGenerator annotations as inputs to WordEmbeddingsModels +This Annotator is specially convenient when using NGramGenerator annotations as inputs to WordEmbeddingsModels. + +Parameters: + +- `inputCols`: The name of the columns containing the input annotations. It can read either a String column or an Array. + +- `outputCol`: The name of the column in Document type that is generated. We can specify only one column here. + +All the parameters can be set using the corresponding set method in camel case. For example, `.setInputcols()`. 
{%- endcapture -%} {%- capture model_input_anno -%} @@ -24,7 +32,7 @@ TOKEN {%- endcapture -%} {%- capture model_python_medical -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, medical # Define a pipeline for generating n-grams document = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -74,7 +82,7 @@ result.selectExpr("explode(ngram_tokens)").show(5, False) {%- capture model_python_legal -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, legal # Define a pipeline for generating n-grams document = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -122,9 +130,7 @@ result.selectExpr("explode(ngram_tokens)").show(5, False) {%- endcapture -%} {%- capture model_python_finance -%} -from johnsnowlabs import * -# Define a pipeline for generating n-grams -from johnsnowlabs import * +from johnsnowlabs import nlp, finance # Define a pipeline for generating n-grams document = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -173,7 +179,7 @@ result.selectExpr("explode(ngram_tokens)").show(5, False) {%- endcapture -%} {%- capture model_scala_medical -%} -from johnsnowlabs import * + import spark.implicits._ // Define a pipeline for generating n-grams @@ -210,8 +216,7 @@ val trainingPipeline = new Pipeline().setStages(Array( val data = Seq(("A 63-year-old man presents to the hospital ...")).toDF("text") -val result = trainingPipeline.fit(data).transform(data).cache() -result.selectExpr("explode(ngram_tokens)").show(5, false) +val result = trainingPipeline.fit(data).transform(data) +----------------------------------------------------------------+ |col | @@ -226,7 +231,7 @@ result.selectExpr("explode(ngram_tokens)").show(5, false) {%- endcapture -%} {%- capture model_scala_legal -%} -from johnsnowlabs import * + import spark.implicits._ // Define a pipeline for generating n-grams @@ -262,8 +267,7 @@ val trainingPipeline = new Pipeline().setStages(Array( val data = Seq(("This is an Intellectual Property Agreement between Amazon Inc. 
and Atlantic Inc.")).toDF("text") -val result = trainingPipeline.fit(data).transform(data).cache() -result.selectExpr("explode(ngram_tokens)").show(5, false) +val result = trainingPipeline.fit(data).transform(data) +-----------------------------------------------------------------------+ |col | @@ -277,7 +281,7 @@ result.selectExpr("explode(ngram_tokens)").show(5, false) {%- endcapture -%} {%- capture model_scala_finance -%} -from johnsnowlabs import * + import spark.implicits._ // Define a pipeline for generating n-grams @@ -313,8 +317,7 @@ val trainingPipeline = new Pipeline().setStages(Array( val data = Seq(("Our competitors include the following by general category: legacy antivirus product providers, such as McAfee LLC and Broadcom Inc.")).toDF("text") -val result = trainingPipeline.fit(data).transform(data).cache() -result.selectExpr("explode(ngram_tokens)").show(5, false) +val result = trainingPipeline.fit(data).transform(data) +--------------------------------------------------------------------+ |col | @@ -331,6 +334,14 @@ result.selectExpr("explode(ngram_tokens)").show(5, false) [Chunk2Token](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/Chunk2Token.html) {%- endcapture -%} +{%- capture model_python_api_link -%} +[Chunk2Token](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/chunk2_token/index.html) +{%- endcapture -%} + +{%- capture model_notebook_link -%} +[Chunk2TokenNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/Chunk2Token.ipynb) +{%- endcapture -%} + {% include templates/licensed_approach_model_medical_fin_leg_template.md title=title model=model @@ -343,4 +354,7 @@ model_python_finance=model_python_finance model_scala_medical=model_scala_medical model_scala_legal=model_scala_legal model_scala_finance=model_scala_finance -model_api_link=model_api_link%} \ No newline at end of file +model_api_link=model_api_link +model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link +%} diff --git a/docs/en/licensed_annotator_entries/ChunkConverter.md b/docs/en/licensed_annotator_entries/ChunkConverter.md index b95b738714..c7b909e9a4 100644 --- a/docs/en/licensed_annotator_entries/ChunkConverter.md +++ b/docs/en/licensed_annotator_entries/ChunkConverter.md @@ -11,6 +11,15 @@ Convert chunks from [RegexMatcher](https://nlp.johnsnowlabs.com/docs/en/annotato This annotator is important when the user wants to merge entities identified by NER models together with rules-based matching used by the RegexMathcer annotator. In the following steps of the pipeline, all the identified entities can be treated in a unified field. +Parameters: + +- `inputCols`: The name of the columns containing the input annotations. It can read either a String column or an Array. + +- `outputCol`: The name of the column in Document type that is generated. We can specify only one column here. + + +All the parameters can be set using the corresponding set method in camel case. For example, `.setInputcols()`. 
+ {%- endcapture -%} {%- capture model_input_anno -%} @@ -22,6 +31,8 @@ CHUNK {%- endcapture -%} {%- capture model_python_medical -%} +from johnsnowlabs import nlp, medical + # Creating the pipeline rules = ''' \b[A-Z]+(\s+[A-Z]+)*:\b, SECTION_HEADER @@ -128,11 +139,13 @@ result.select(F.explode(F.arrays_zip(result.merged_chunks.result, {%- endcapture -%} {%- capture model_scala_medical -%} -val rules = """\b[A-Z]+(\s+[A-Z]+)*:\b, SECTION_HEADER""" -with open("regex_rules.txt","w") as f: - f.write(rules) +// val rules = """\b[A-Z]+(\s+[A-Z]+)*:\b, SECTION_HEADER""" +// with open("regex_rules.txt","w") as f: +// f.write(rules) + import spark.implicits._ + val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") @@ -186,12 +199,7 @@ val pipeline= new Pipeline().setStages(Array( val data = Seq(("POSTOPERATIVE DIAGNOSIS: Cervical lymphadenopathy. PROCEDURE: Excisional biopsy of right cervical lymph node. ANESTHESIA: General endotracheal anesthesia. Specimen: Right cervical lymph node. EBL: 10 cc. COMPLICATIONS: None. FINDINGS: Enlarged level 2 lymph node was identified and removed and sent for pathologic examination. FLUIDS: Please see anesthesia report. URINE OUTPUT: None recorded during the case. INDICATIONS FOR PROCEDURE: This is a 43-year-old female with a several-year history of persistent cervical lymphadenopathy. She reports that it is painful to palpation on the right and has had multiple CT scans as well as an FNA which were all nondiagnostic. After risks and benefits of surgery were discussed with the patient,an informed consent was obtained. She was scheduled for an excisional biopsy of the right cervical lymph node. PROCEDURE IN DETAIL: The patient was taken to the operating room and placed in the supine position. She was anesthetized with general endotracheal anesthesia. The neck was then prepped and draped in the sterile fashion. Again,noted on palpation there was an enlarged level 2 cervical lymph node.A 3-cm horizontal incision was made over this lymph node. Dissection was carried down until the sternocleidomastoid muscle was identified. The enlarged lymph node that measured approximately 2 cm in diameter was identified and was removed and sent to Pathology for touch prep evaluation. The area was then explored for any other enlarged lymph nodes. None were identified,and hemostasis was achieved with electrocautery. A quarter-inch Penrose drain was placed in the wound.The wound was then irrigated and closed with 3-0 interrupted Vicryl sutures for a deep closure followed by a running 4-0 Prolene subcuticular suture. Mastisol and Steri-Strip were placed over the incision,and sterile bandage was applied. The patient tolerated this procedure well and was extubated without complications and transported to the recovery room in stable condition. 
She will return to the office tomorrow in followup to have the Penrose drain removed.")).toDF("text") -val result = pipeline.fit(data).transform(data).cache() - -result.select(F.explode(F.arrays_zip(result.merged_chunks.result, - result.merged_chunks.metadata)).alias("cols"))\ - .select(F.expr("cols['0']").alias("chunk"), - F.expr("cols['1']['entity']").alias("merged_entity")).show(15, truncate=100) +val result = pipeline.fit(data).transform(data) +----------------------------------------------+--------------+ | chunk| merged_entity| @@ -215,6 +223,8 @@ result.select(F.explode(F.arrays_zip(result.merged_chunks.result, {%- endcapture -%} {%- capture model_python_finance -%} +from johnsnowlabs import nlp, finance + # Creating the pipeline rules = ''' \b[A-Z]+(\s+[A-Z]+)*:\b, SECTION_HEADER @@ -297,12 +307,12 @@ result.select(F.explode(F.arrays_zip(result.merged_chunks.result, {%- endcapture -%} {%- capture model_scala_finance -%} -val rules = """\b[A-Z]+(\s+[A-Z]+)*:\b, SECTION_HEADER""" - -with open("regex_rules.txt","w") as f: - f.write(rules) +// val rules = """\b[A-Z]+(\s+[A-Z]+)*:\b, SECTION_HEADER""" +// with open("regex_rules.txt","w") as f: +// f.write(rules) import spark.implicits._ + val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") @@ -358,12 +368,7 @@ val pipeline= new Pipeline().setStages(Array( val data = Seq(("AWA Group LP intends to pay dividends on the Common Units on a quarterly basis at an annual rate of 8.00% of the Offering Price.")).toDF("text") -val result = pipeline.fit(data).transform(data).cache() - -result.select(F.explode(F.arrays_zip(result.merged_chunks.result, - result.merged_chunks.metadata)).alias("cols"))\ - .select(F.expr("cols['0']").alias("chunk"), - F.expr("cols['1']['entity']").alias("merged_entity")).show(15, truncate=100) +val result = pipeline.fit(data).transform(data) +--------+-------------+ | chunk|merged_entity| @@ -374,6 +379,8 @@ result.select(F.explode(F.arrays_zip(result.merged_chunks.result, {%- endcapture -%} {%- capture model_python_legal -%} +from johnsnowlabs import nlp, legal + # Creating the pipeline rules = ''' \b[A-Z]+(\s+[A-Z]+)*:\b, SECTION_HEADER @@ -456,12 +463,12 @@ result.select(F.explode(F.arrays_zip(result.merged_chunks.result, {%- endcapture -%} {%- capture model_scala_legal -%} -val rules = """[A-Z]+[\s+[A-Z]+]*,SECTION_HEADER """ - -with open("regex_rules.txt","w") as f: - f.write(rules) +// val rules = """[A-Z]+[\s+[A-Z]+]*,SECTION_HEADER """ +// with open("regex_rules.txt","w") as f: +// f.write(rules) import spark.implicits._ + val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") @@ -517,14 +524,8 @@ val pipeline= new Pipeline().setStages(Array( val data = Seq(("AWA Group LP intends to pay dividends on the Common Units on a quarterly basis at an annual rate of 8.00% of the Offering Price.")).toDF("text") -val result = pipeline.fit(data).transform(data).cache() - -result.select(F.explode(F.arrays_zip(result.merged_chunks.result, - result.merged_chunks.metadata)).alias("cols"))\ - .select(F.expr("cols['0']").alias("chunk"), - F.expr("cols['1']['entity']").alias("merged_entity")).show(15, truncate=100) +val result = pipeline.fit(data).transform(data) - +--------+-------------+ | chunk|merged_entity| +--------+-------------+ @@ -540,6 +541,9 @@ result.select(F.explode(F.arrays_zip(result.merged_chunks.result, 
[ChunkConverter](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/chunker/chunk_converter/index.html#sparknlp_jsl.annotator.chunker.chunk_converter.ChunkConverter.html) {%- endcapture -%} +{%- capture model_notebook_link -%} +[ChunkConverterNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/ChunkConverter.ipynb) +{%- endcapture -%} {% include templates/licensed_approach_model_medical_fin_leg_template.md title=title @@ -555,4 +559,5 @@ model_python_legal=model_python_legal model_scala_legal=model_scala_legal model_api_link=model_api_link model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link %} diff --git a/docs/en/licensed_annotator_entries/ChunkEntityResolver.md b/docs/en/licensed_annotator_entries/ChunkEntityResolver.md index 27bfe4f345..97ee9dd6b3 100644 --- a/docs/en/licensed_annotator_entries/ChunkEntityResolver.md +++ b/docs/en/licensed_annotator_entries/ChunkEntityResolver.md @@ -11,7 +11,7 @@ model {%- endcapture -%} {%- capture model_description -%} -Returns a normalized entity for a particular trained ontology / curated dataset (e.g. ICD-10, RxNorm, SNOMED etc). +The ChunkEntityResolverModel encompasses the functionality to produce a normalized entity from a specialized ontology or curated dataset (such as ICD-10, RxNorm, SNOMED, etc.). This model includes comprehensive parameters and methods essential for its training. It operates by transforming a dataset that incorporates two Input Annotations: TOKEN and WORD_EMBEDDINGS, sourced from tools like ChunkTokenizer and ChunkEmbeddings Annotators. Ultimately, it generates the normalized entity relevant to the specified trained ontology or curated dataset, ensuring accurate entity resolution within the given context. For available pretrained models please see the [Models Hub](https://nlp.johnsnowlabs.com/models?task=Entity+Resolution). @@ -27,7 +27,7 @@ ENTITY {%- endcapture -%} {%- capture model_python_medical -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, medical # Using pretrained models for SNOMED # First the prior steps of the pipeline are defined. # Output of types TOKEN and WORD_EMBEDDINGS are needed. @@ -130,7 +130,7 @@ result.selectExpr("explode(snomed_result)") {%- endcapture -%} {%- capture model_scala_medical -%} -from johnsnowlabs import * +import spark.implicits._ // Using pretrained models for SNOMED // First the prior steps of the pipeline are defined. // Output of types TOKEN and WORD_EMBEDDINGS are needed. 
@@ -156,37 +156,37 @@ val icdo_ner = MedicalNerModel.pretrained("ner_bionlp", "en", "clinical/models") .setInputCols(Array("sentence", "token", "word_embeddings")) .setOutputCol("icdo_ner") -val icdo_chunk = new nlp.NerConverter() +val icdo_chunk = new NerConverter() .setInputCols(Array("sentence","token","icdo_ner")) .setOutputCol("icdo_chunk") .setWhiteList("Cancer") -val icdo_chunk_embeddings = new nlp.ChunkEmbeddings() +val icdo_chunk_embeddings = new ChunkEmbeddings() .setInputCols(Array("icdo_chunk", "word_embeddings")) .setOutputCol("icdo_chunk_embeddings") -val icdo_chunk_resolver = medical.ChunkEntityResolverModel.pretrained("chunkresolve_icdo_clinical", "en", "clinical/models") +val icdo_chunk_resolver = ChunkEntityResolverModel.pretrained("chunkresolve_icdo_clinical", "en", "clinical/models") .setInputCols(Array("token","icdo_chunk_embeddings")) .setOutputCol("tm_icdo_code") -val clinical_ner = medical.NerModel.pretrained("ner_clinical", "en", "clinical/models") +val clinical_ner = MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models") .setInputCols(Array("sentence", "token", "word_embeddings")) .setOutputCol("ner") -val ner_converter = new nlp.NerConverter() +val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val ner_chunk_tokenizer = new nlp.ChunkTokenizer() +val ner_chunk_tokenizer = new ChunkTokenizer() .setInputCols("ner_chunk") .setOutputCol("ner_token") -val ner_chunk_embeddings = new nlp.ChunkEmbeddings() +val ner_chunk_embeddings = new ChunkEmbeddings() .setInputCols(Array("ner_chunk", "word_embeddings")) .setOutputCol("ner_chunk_embeddings") // Definition of the SNOMED Resolution -val ner_snomed_resolver = medical.ChunkEntityResolverModel\ +val ner_snomed_resolver = ChunkEntityResolverModel\ .pretrained("chunkresolve_snomed_findings_clinical","en","clinical/models") .setInputCols(Array("ner_token","ner_chunk_embeddings"))\ .setOutputCol("snomed_result") @@ -256,7 +256,7 @@ ENTITY {%- endcapture -%} {%- capture approach_python_medical -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, medical # Training a SNOMED model # Define pre-processing pipeline for training data. It needs consists of columns for the normalized training data and their labels. document = nlp.DocumentAssembler() \ @@ -277,8 +277,8 @@ embeddings = nlp.WordEmbeddingsModel\ .setOutputCol("embeddings") chunkEmb = nlp.ChunkEmbeddings() \ - .setInputCols(["chunk", "embeddings"]) \ - .setOutputCol("chunk_embeddings") + .setInputCols(["chunk", "embeddings"]) \ + .setOutputCol("chunk_embeddings") snomedTrainingPipeline = Pipeline().setStages([ document, @@ -310,27 +310,27 @@ model = snomedExtractor.fit(snomedData) {%- endcapture -%} {%- capture approach_scala_medical -%} -from johnsnowlabs import * +import spark.implicits._ // Training a SNOMED model // Define pre-processing pipeline for training data. It needs consists of columns for the normalized training data and their labels. 
-val document = new nlp.DocumentAssembler() +val document = new DocumentAssembler() .setInputCol("normalized_text") .setOutputCol("document") -val chunk = new nlp.Doc2Chunk() +val chunk = new Doc2Chunk() .setInputCols("document") .setOutputCol("chunk") -val token = new nlp.Tokenizer() +val token = new Tokenizer() .setInputCols("document") .setOutputCol("token") -val embeddings = nlp.WordEmbeddingsModel\ +val embeddings = WordEmbeddingsModel\ .pretrained("embeddings_healthcare_100d", "en", "clinical/models") .setInputCols(Array("document", "token")) .setOutputCol("embeddings") -val chunkEmb = new nlp.ChunkEmbeddings() +val chunkEmb = new ChunkEmbeddings() .setInputCols(Array("chunk", "embeddings")) .setOutputCol("chunk_embeddings") @@ -347,7 +347,7 @@ val snomedTrainingModel = snomedTrainingPipeline.fit(data) val snomedData = snomedTrainingModel.transform(data).cache() // Then the Resolver can be trained with -val snomedExtractor = new medical.ChunkEntityResolverApproach() +val snomedExtractor = new ChunkEntityResolverApproach() .setInputCols(Array("token", "chunk_embeddings")) .setOutputCol("recognized") .setNeighbours(1000) diff --git a/docs/en/licensed_annotator_entries/ChunkFilterer.md b/docs/en/licensed_annotator_entries/ChunkFilterer.md index d0cbf45ae9..b1a4abf09a 100644 --- a/docs/en/licensed_annotator_entries/ChunkFilterer.md +++ b/docs/en/licensed_annotator_entries/ChunkFilterer.md @@ -12,20 +12,23 @@ White list criteria is enabled by default. To use regex, `criteria` has to be se Parametres: -- `setBlackList(list: Array[String])`: ChunkFilterer.this.type -If defined, list of entities to ignore. -- `setCaseSensitive(value: Boolean)`: ChunkFilterer.this.type -Determines whether the definitions of the white listed and black listed entities are case sensitive or not. -- `setCriteria(s: String)`: ChunkFilterer.this.type -Sets criteria for how to compare black and white listed values with the result of the Annotation. -- `setEntitiesConfidence(value: HashMap[String, Double])`: ChunkFilterer.this.type -Sets Pairs (entity,confidenceThreshold) to filter the chunks with entities which have confidence lower than the confidence threshold. -- `setFilterEntity(v: String)`: ChunkFilterer.this.type -Possible values are 'result' and 'entity'. -- `setRegex(list: String*)`: ChunkFilterer.this.type -Sets the list of regexes to process the chunks. -- `setWhiteList(list: Array[String])`: ChunkFilterer.this.type -Sets the list of entities to process. +- `inputCols`: The name of the columns containing the input annotations. It can read either a String column or an Array. + +- `outputCol`: The name of the column in Document type that is generated. We can specify only one column here. + +- `criteria`: Tag representing what is the criteria to filter the chunks. Possibles values are: - isIn: Filter by the chunk - regex: Filter using a regex + +- `whiteList`: If defined, list of entities to process. The rest will be ignored. + +- `blackList`: If defined, list of entities to ignore. The rest will be processed. + +- `regex`: If defined, list of regex to process the chunks (Default: []). + +- `filterEntity`: If equal to “entity”, use the ner label to filter. If set to “result”, use the result attribute of the annotation to filter. + +- `entitiesConfidence`: Path to csv with pairs (entity,confidenceThreshold). Filter the chunks with entities which have confidence lower than the confidence threshold. + +All the parameters can be set using the corresponding set method in camel case. For example, `.setInputcols()`. 
{%- endcapture -%} {%- capture model_input_anno -%} @@ -37,7 +40,7 @@ CHUNK {%- endcapture -%} {%- capture model_python_medical -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, medical # Filtering POS tags # First pipeline stages to extract the POS tags are defined @@ -103,7 +106,7 @@ result.selectExpr("explode(filtered)").show(truncate=False) {%- endcapture -%} {%- capture model_python_legal -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, legal # Filtering POS tags # First pipeline stages to extract the POS tags are defined @@ -165,7 +168,7 @@ result.selectExpr("explode(filtered)").show(truncate=False) {%- endcapture -%} {%- capture model_python_finance -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, finance # Filtering POS tags # First pipeline stages to extract the POS tags are defined @@ -272,7 +275,7 @@ val text ="""Has a past history of gastroenteritis and stomach pain, however pat val data = Seq(text).toDF("text") val result = pipeline.fit(data).transform(data) -result.selectExpr("explode(chunk)").show(truncate=false) +// result.selectExpr("explode(chunk)").show(truncate=false) +---------------------------------------------------------------------------------+ |col | +---------------------------------------------------------------------------------+ @@ -284,7 +287,7 @@ result.selectExpr("explode(chunk)").show(truncate=false) |{chunk, 118, 132, gastroenteritis, {sentence -> 0, chunk -> 5}, []} | +---------------------------------------------------------------------------------+ -result.selectExpr("explode(filtered)").show(truncate=false) +// result.selectExpr("explode(filtered)").show(truncate=false) +-------------------------------------------------------------------+ |col | +-------------------------------------------------------------------+ @@ -336,7 +339,7 @@ val text ="""AWA Group LP intends to pay dividends on the Common Units on a quar val data = Seq(text).toDF("text") val result = pipeline.fit(data).transform(data) -result.selectExpr("explode(chunk)").show(truncate=false) +// result.selectExpr("explode(chunk)").show(truncate=false) +-------------------------------------------------------+ |col | +-------------------------------------------------------+ @@ -344,7 +347,7 @@ result.selectExpr("explode(chunk)").show(truncate=false) |{chunk, 92, 95, rate, {sentence -> 0, chunk -> 1}, []} | +-------------------------------------------------------+ -result.selectExpr("explode(filtered)").show(truncate=False) +// result.selectExpr("explode(filtered)").show(truncate=False) +-------------------------------------------------------+ |col | +-------------------------------------------------------+ @@ -394,7 +397,7 @@ val text ="""AWA Group LP intends to pay dividends on the Common Units on a quar val data = Seq(text).toDF("text") val result = pipeline.fit(data).transform(data) -result.selectExpr("explode(chunk)").show(truncate=false) +// result.selectExpr("explode(chunk)").show(truncate=false) +-------------------------------------------------------+ |col | +-------------------------------------------------------+ @@ -402,7 +405,7 @@ result.selectExpr("explode(chunk)").show(truncate=false) |{chunk, 92, 95, rate, {sentence -> 0, chunk -> 1}, []} | +-------------------------------------------------------+ -result.selectExpr("explode(filtered)").show(truncate=False) +// result.selectExpr("explode(filtered)").show(truncate=False) +-------------------------------------------------------+ |col | 
+-------------------------------------------------------+ @@ -418,6 +421,10 @@ result.selectExpr("explode(filtered)").show(truncate=False) [ChunkFilterer](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/chunker/chunker_filterer/index.html#sparknlp_jsl.annotator.chunker.chunker_filterer.ChunkFilterer) {%- endcapture -%} +{%- capture model_notebook_link -%} +[ChunkFiltererNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/ChunkFilterer.ipynb) +{%- endcapture -%} + {% include templates/licensed_approach_model_medical_fin_leg_template.md title=title model=model @@ -432,4 +439,5 @@ model_scala_legal=model_scala_legal model_scala_finance=model_scala_finance model_api_link=model_api_link model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link %} diff --git a/docs/en/licensed_annotator_entries/ChunkKeyPhraseExtraction.md b/docs/en/licensed_annotator_entries/ChunkKeyPhraseExtraction.md index 1c353cc7f2..49f3f4662b 100644 --- a/docs/en/licensed_annotator_entries/ChunkKeyPhraseExtraction.md +++ b/docs/en/licensed_annotator_entries/ChunkKeyPhraseExtraction.md @@ -12,18 +12,17 @@ The input to the model consists of chunk annotations and sentence or document an Parametres: -- `setConcatenateSentences(value: Boolean)`: ChunkKeyPhraseExtraction.this.type -Concatenate the input sentence/documentation annotations before computing their embedding Default value is 'true'. -- `setDivergence(value: Float)`: ChunkKeyPhraseExtraction.this.type -Set the level of divergence of the extracted key phrases. -- `setDocumentLevelProcessing(value: Boolean)`: ChunkKeyPhraseExtraction.this.type -Extract key phrases from the whole document (true) or from particular sentences which the chunks refer to (false) Default value is 'true'. -- `setDropPunctuation(value: Boolean)`: ChunkKeyPhraseExtraction.this.type -Remove punctuation marks from input chunks. -- `setSelectMostDifferent(value: Boolean)`: ChunkKeyPhraseExtraction.this.type -Let the model return the top N key phrases which are the most different from each other -- `setTopN(value: Int)`: ChunkKeyPhraseExtraction.this.type -Set the number of key phrases to extract +- `setConcatenateSentences(value: Boolean)`: Concatenate the input sentence/documentation annotations before computing their embedding Default value is 'true'. + +- `setDivergence(value: Float)`: Set the level of divergence of the extracted key phrases. + +- `setDocumentLevelProcessing(value: Boolean)`: Extract key phrases from the whole document (true) or from particular sentences which the chunks refer to (false) Default value is 'true'. + +- `setDropPunctuation(value: Boolean)`: Remove punctuation marks from input chunks. + +- `setSelectMostDifferent(value: Boolean)`: Let the model return the top N key phrases which are the most different from each other. + +- `setTopN(value: Int)`: Set the number of key phrases to extract. This model is a subclass of [[BertSentenceEmbeddings]] and shares all parameters with it. It can load any pretrained BertSentenceEmbeddings model. Available models can be found at the [Models Hub](https://nlp.johnsnowlabs.com/models?task=Sentence+Embeddings). 
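For instance, a minimal configuration following the parameters above could look like the sketch below. It assumes a pretrained ChunkKeyPhraseExtraction model is available via `pretrained()` and that upstream stages already produce `sentences` and `ner_chunks` columns; those names are placeholders rather than part of this entry's pipelines.

```python
from johnsnowlabs import medical

# Hypothetical sketch: extract the 3 most relevant key phrases per document
# from previously detected NER chunks, with a moderate level of divergence.
key_phrase_extractor = medical.ChunkKeyPhraseExtraction.pretrained() \
    .setInputCols(["sentences", "ner_chunks"]) \
    .setOutputCol("ner_chunk_key_phrases") \
    .setTopN(3) \
    .setDivergence(0.4) \
    .setDocumentLevelProcessing(True)
```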
@@ -38,7 +37,7 @@ CHUNK {%- endcapture -%} {%- capture model_python_medical -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, medical documenter = nlp.DocumentAssembler() \ .setInputCol("text") \ @@ -97,7 +96,7 @@ results.selectExpr("explode(ner_chunk_key_phrases) AS key_phrase")\ {%- endcapture -%} {%- capture model_python_legal -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, legal documentAssembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -160,7 +159,7 @@ result.selectExpr("explode(ner_chunk_key_phrases) AS key_phrase")\ {%- endcapture -%} {%- capture model_python_finance -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, finance documentAssembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -269,9 +268,7 @@ val pipeline = new Pipeline().setStages(Array( val text ="""Her Diabetes has become type 2 in the last year with her Diabetes.He complains of swelling in his right forearm.""" val data = Seq(text).toDF("text") -val results = pipeline.fit(data).transform(data) - -results.selectExpr("explode(ner_chunk_key_phrases) AS key_phrase") .selectExpr("key_phrase.result", "key_phrase.metadata.entity", "key_phrase.metadata.DocumentSimilarity", "key_phrase.metadata.MMRScore") .show(truncate=false) +val results = pipeline.fit(data).transform(data) +--------+-------------------------+------------------+-----------------+ |result |entity |DocumentSimilarity|MMRScore | @@ -326,13 +323,7 @@ val nlpPipeline = new Pipeline().setStages(Array( val text ="""This INTELLECTUAL PROPERTY AGREEMENT (this "Agreement"), dated as of December 31, 2018 (the "Effective Date") is entered into by and between Armstrong Flooring, Inc., a Delaware corporation ("Seller") and AFI Licensing LLC, a Delaware limited liability company ("Licensing" and together with Seller, "Arizona") and AHF Holding, Inc. (formerly known as Tarzan HoldCo, Inc.), a Delaware corporation ("Buyer") and Armstrong Hardwood Flooring Company, a Tennessee corporation (the "Company" and together with Buyer the "Buyer Entities") (each of Arizona on the one hand and the Buyer Entities on the other hand, a "Party" and collectively, the "Parties").""" val data = Seq(text).toDF("text") -val result = nlpPipeline.fit(data).transform(data) - -result.selectExpr("explode(ner_chunk_key_phrases) AS key_phrase") - .selectExpr("key_phrase.result", - "key_phrase.metadata.entity", - "key_phrase.metadata.DocumentSimilarity", - "key_phrase.metadata.MMRScore") .show(truncate=false) +val result = nlpPipeline.fit(data).transform(data) +--------------+------+------------------+-------------------+ |result |entity|DocumentSimilarity|MMRScore | @@ -387,10 +378,7 @@ val nlpPipeline = new Pipeline().setStages(Array( val text ="""In 2020, we acquired certain assets of Spell Security Private Limited (also known as "Spell Security"). 
More specifically, their Compliance product - Policy Compliance (PC).""" val data = Seq(text).toDF("text") -val result = nlpPipeline.fit(data).transform(data) - -result.selectExpr("explode(ner_chunk_key_phrases) AS key_phrase") -.selectExpr("key_phrase.result", "key_phrase.metadata.entity", "key_phrase.metadata.DocumentSimilarity", "key_phrase.metadata.MMRScore") .show(truncate=false) +val result = nlpPipeline.fit(data).transform(data) +------------------------------+-------+------------------+-------------------+ |result |entity |DocumentSimilarity|MMRScore | @@ -408,6 +396,10 @@ result.selectExpr("explode(ner_chunk_key_phrases) AS key_phrase") [ChunkKeyPhraseExtraction](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/chunker/chunk_key_phrase_extraction/index.html#sparknlp_jsl.annotator.chunker.chunk_key_phrase_extraction.ChunkKeyPhraseExtraction) {%- endcapture -%} +{%- capture model_notebook_link -%} +[ChunkKeyPhraseExtractionNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/ChunkKeyPhraseExtraction.ipynb) +{%- endcapture -%} + {% include templates/licensed_approach_model_medical_fin_leg_template.md title=title model=model @@ -422,4 +414,5 @@ model_scala_legal=model_scala_legal model_scala_finance=model_scala_finance model_api_link=model_api_link model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link %} diff --git a/docs/en/licensed_annotator_entries/ChunkMapper.md b/docs/en/licensed_annotator_entries/ChunkMapper.md index eb907c2c4c..25860ab136 100644 --- a/docs/en/licensed_annotator_entries/ChunkMapper.md +++ b/docs/en/licensed_annotator_entries/ChunkMapper.md @@ -21,8 +21,11 @@ The annotator also allows using fuzzy matching, which can take into consideratio Parametres: - `setRels` *(List[str])*: Relations that we are going to use to map the chunk + - `setLowerCase` *(Boolean)*: Set if we want to map the chunks in lower case or not (Default: True) + - `setAllowMultiTokenChunk` *(Boolean)*: Whether to skip relations with multitokens (Default: True) + - `setMultivaluesRelations` *(Boolean)*: Whether to decide to return all values in a relation together or separately (Default: False) @@ -39,6 +42,7 @@ LABEL_DEPENDENCY {%- endcapture -%} {%- capture model_python_medical -%} +from johnsnowlabs import nlp, medical documenter = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -83,7 +87,6 @@ rxnorm_resolver = medical.SentenceEntityResolverModel\ .setOutputCol("rxnorm_code")\ .setDistanceFunction("EUCLIDEAN")\ - resolver2chunk = medical.Resolution2Chunk()\ .setInputCols(["rxnorm_code"]) \ .setOutputCol("rxnorm_chunk")\ @@ -93,7 +96,6 @@ chunkerMapper = medical.ChunkMapperModel.pretrained("rxnorm_drug_brandname_mappe .setOutputCol("rxnorm_drug_brandname_mapper")\ .setRels(["rxnorm_brandname"]) - pipeline = nlp.Pipeline( stages = [ documenter, @@ -128,6 +130,7 @@ result.select(F.explode(F.arrays_zip(result.ner_chunks.result, {%- endcapture -%} {%- capture model_python_finance -%} +from johnsnowlabs import nlp, finance document_assembler = nlp.DocumentAssembler()\ .setInputCol('text')\ @@ -167,8 +170,6 @@ data = spark.createDataFrame([text]).toDF("text") result = pipeline.fit(data).transform(data) -result.select('ner_chunk.result', 'mappings.result').show(truncate=False) - +------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 
|result|result | +------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -177,6 +178,7 @@ result.select('ner_chunk.result', 'mappings.result').show(truncate=False) {%- endcapture -%} {%- capture model_python_legal -%} +from johnsnowlabs import nlp, legal document_assembler = nlp.DocumentAssembler()\ .setInputCol('text')\ @@ -216,7 +218,6 @@ text = ["""873474341 is an American multinational corporation that is engaged in data = spark.createDataFrame([text]).toDF("text") result= pipeline.fit(data).transform(data) -result.select('ner_chunk.result', 'mappings.result').show(truncate=False) +-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |result |result | @@ -234,11 +235,11 @@ val documenter = new DocumentAssembler() .setOutputCol("document") val sentencer = new SentenceDetector() - .setInputCols(Array("document")) + .setInputCols("document") .setOutputCol("sentences") val tokenizer = new Tokenizer() - .setInputCols(Array("sentences")) + .setInputCols("sentences") .setOutputCol("tokens") val words_embedder = WordEmbeddingsModel @@ -248,13 +249,13 @@ val words_embedder = WordEmbeddingsModel val ner_tagger = MedicalNerModel .pretrained("ner_posology","en","clinical/models") - .setInputCols("sentences","tokens","embeddings") + .setInputCols(Array("sentences","tokens","embeddings")) .setOutputCol("ner_tags") val ner_converter = new NerConverterInternal() .setInputCols(Array("sentences","tokens","ner_tags")) .setOutputCol("ner_chunks") - .setWhiteList(Array("DRUG")) + .setWhiteList("DRUG") val chunkToDoc = new Chunk2Doc() .setInputCols("ner_chunks") @@ -262,22 +263,22 @@ val chunkToDoc = new Chunk2Doc() val sbert_embedder = BertSentenceEmbeddings .pretrained("sbiobert_base_cased_mli","en","clinical/models") - .setInputCols(Array("ner_chunks_doc")) + .setInputCols("ner_chunks_doc") .setOutputCol("sbert_embeddings") .setCaseSensitive(false) val rxnorm_resolver = SentenceEntityResolverModel .pretrained("sbiobertresolve_rxnorm_augmented","en","clinical/models") - .setInputCols(Array("sbert_embeddings")) + .setInputCols("sbert_embeddings") .setOutputCol("rxnorm_code") .setDistanceFunction("EUCLIDEAN") val resolver2chunk = new Resolution2Chunk() - .setInputCols(Array("rxnorm_code")) + .setInputCols("rxnorm_code") .setOutputCol("rxnorm_chunk") val chunkerMapper = ChunkMapperModel.pretrained("rxnorm_drug_brandname_mapper","en","clinical/models") - .setInputCols(Array("rxnorm_chunk")) + .setInputCols("rxnorm_chunk") .setOutputCol("rxnorm_drug_brandname_mapper") .setRels(Array("rxnorm_brandname")) @@ -297,14 +298,7 @@ val pipeline = new Pipeline().setStages(Array( val text ="""The doctor prescribed Sinequan 150 MG for depression and Zonalon 50 mg for managing skin itching""" val data = Seq(text).toDF("text") -val result= mapper_pipeline.fit(data).transform(data) - -result.select(F.explode(F.arrays_zip(result.ner_chunks.result, - result.rxnorm_code.result, - result.rxnorm_drug_brandname_mapper.result)).alias("cols"))\ - .select(F.expr("cols['0']").alias("ner_chunks"), - F.expr("cols['1']").alias("rxnorm_code"), - F.expr("cols['2']").alias("rxnorm_drug_brandname_mapper")).show(15, truncate=100) +val result= mapper_pipeline.fit(data).transform(data) +----------+-----------+----------------------------+ |ner_chunks|rxnorm_code|rxnorm_drug_brandname_mapper| @@ -339,7 
+333,7 @@ val ner_converter = new NerConverter() .setOutputCol("ner_chunk") val CM = ChunkMapperModel.pretrained("finmapper_nasdaq_ticker_stock_screener","en","finance/models") - .setInputCols(Array("ner_chunk")) + .setInputCols("ner_chunk") .setOutputCol("mappings") val pipeline = new Pipeline().setStages(Array( @@ -353,9 +347,7 @@ val pipeline = new Pipeline().setStages(Array( val text ="""There are some serious purchases and sales of AMZN stock today.""" val data = Seq(text).toDF("text") -val result = pipeline.fit(data).transform(data) - -result.select("ner_chunk.result","mappings.result") .show(truncate=false) +val result = pipeline.fit(data).transform(data) +------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |result|result | @@ -390,7 +382,7 @@ val ner_converter = new NerConverter() .setWhiteList(Array("CARDINAL")) val CM = ChunkMapperModel.pretrained("legmapper_edgar_irs","en","legal/models") -.setInputCols(Array("ner_chunk")) +.setInputCols("ner_chunk") .setOutputCol("mappings") val pipeline = new Pipeline().setStages(Array( @@ -404,9 +396,7 @@ val pipeline = new Pipeline().setStages(Array( val text ="""873474341 is an American multinational corporation that is engaged in the design,development,manufacturing,and worldwide marketing and sales of footwear,apparel,equipment,accessories,and services""" val data = Seq(text).toDF("text") -val result= pipeline.fit(data).transform(data) - -result.select("ner_chunk.result","mappings.result") .show(truncate=false) +val result= pipeline.fit(data).transform(data) +-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |result |result | @@ -424,6 +414,9 @@ result.select("ner_chunk.result","mappings.result") .show(truncate=false) [ChunkMapperModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/chunker/chunkmapper/index.html#sparknlp_jsl.annotator.chunker.chunkmapper.ChunkMapperModel) {%- endcapture -%} +{%- capture model_notebook_link -%} +[ChunkMapperModelNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/ChunkMapperModel.ipynb) +{%- endcapture -%} {%- capture approach_description -%} @@ -446,6 +439,7 @@ LABEL_DEPENDENCY {%- endcapture -%} {%- capture approach_python_medical -%} +from johnsnowlabs import nlp, medical # First, create a dictionay in JSON format following this schema: import json @@ -505,13 +499,14 @@ chunkerMapper = medical.ChunkMapperApproach()\ .setDictionary("/content/sample_drug.json")\ .setRels(["action"]) #or treatment -pipeline = nlp.Pipeline().setStages([document_assembler, - sentence_detector, - tokenizer, - word_embeddings, - clinical_ner, - ner_converter, - chunkerMapper]) +pipeline = nlp.Pipeline().setStages([ + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + clinical_ner, + ner_converter, + chunkerMapper]) text = ["The patient was given 1 unit of metformin daily."] @@ -525,6 +520,7 @@ model.stages[-1].write().save("models/drug_mapper") {%- endcapture -%} {%- capture approach_python_finance -%} +from johnsnowlabs import nlp, finance # First, create a dictionay in JSON format following this schema: import json @@ -601,6 +597,7 @@ model.stages[-1].write().save("models/finance_mapper") {%- endcapture -%} {%- capture approach_python_legal -%} 
+from johnsnowlabs import nlp, legal # First, create a dictionay in JSON format following this schema: import json @@ -680,13 +677,12 @@ model.stages[-1].write().save("models/legal_mapper") import spark.implicits._ - val document_assembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val sentence_detector = new SentenceDetector() - .setInputCols(Array("document")) + .setInputCols("document") .setOutputCol("sentence") val tokenizer = new Tokenizer() @@ -708,7 +704,7 @@ val ner_converter = new NerConverterInternal() .setWhiteList(Array("DRUG")) val chunkerMapper = new ChunkMapperApproach() - .setInputCols(Array("ner_chunk")) + .setInputCols("ner_chunk") .setOutputCol("mappings") .setDictionary("/content/sample_drug.json") .setRels(Array("action") ) //or treatment @@ -741,7 +737,7 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx") - .setInputCols(Array("document")) + .setInputCols("document") .setOutputCol("sentence") val tokenizer = new Tokenizer() @@ -762,7 +758,7 @@ val ner_converter = new NerConverter() .setWhiteList(Array("ORG") ) // Return only ORG entities val chunkerMapper = new ChunkMapperApproach() - .setInputCols(Array("ner_chunk") ) + .setInputCols("ner_chunk") .setOutputCol("mappings") .setDictionary("/content/sample_json") .setRels(all_rels) @@ -779,7 +775,7 @@ val pipeline = new Pipeline() val text = new Array("AWA Group LP intends to pay dividends on the Common Units on a quarterly basis at an annual rate of 8.00% of the Offering Price. ") -val test_data = seq(Array(text) ).toDF("text") +val test_data = seq(Array(text)).toDF("text") val model = pipeline.fit(test_data) res= model.transform(test_data) @@ -797,7 +793,7 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx") - .setInputCols(Array("document")) + .setInputCols("document") .setOutputCol("sentence") val tokenizer = new Tokenizer() @@ -815,10 +811,10 @@ val legal_ner = LegalNerModel.pretrained("legner_org_per_role_date","en","legal/ val ner_converter = new NerConverter() .setInputCols(Array("sentence","token","ner")) .setOutputCol("ner_chunk") - .setWhiteList(Array("ORG") ) // Return only ORG entities + .setWhiteList("ORG") // Return only ORG entities val chunkerMapper = new ChunkMapperApproach() - .setInputCols(Array("ner_chunk")) + .setInputCols("ner_chunk") .setOutputCol("mappings") .setDictionary("/content/sample_json") .setRels(all_rels) @@ -849,6 +845,10 @@ model.stagesArray(-1) .write() .save("models/legal_mapper") [ChunkMapperApproach](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/chunker/chunkmapper/index.html#sparknlp_jsl.annotator.chunker.chunkmapper.ChunkMapperApproach) {%- endcapture -%} +{%- capture approach_notebook_link -%} +[ChunkMapperApproachModel](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/ChunkMapperApproach.ipynb) +{%- endcapture -%} + {% include templates/licensed_approach_model_medical_fin_leg_template.md title=title model=model @@ -864,6 +864,7 @@ model_scala_finance=model_scala_finance model_scala_legal=model_scala_legal model_api_link=model_api_link model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link approach_description=approach_description approach_input_anno=approach_input_anno 
approach_output_anno=approach_output_anno @@ -875,4 +876,5 @@ approach_scala_finance=approach_scala_finance approach_scala_legal=approach_scala_legal approach_api_link=approach_api_link approach_python_api_link=approach_python_api_link +approach_notebook_link=approach_notebook_link %} diff --git a/docs/en/licensed_annotator_entries/ChunkMerge.md b/docs/en/licensed_annotator_entries/ChunkMerge.md index 104f5087b2..da0def9316 100644 --- a/docs/en/licensed_annotator_entries/ChunkMerge.md +++ b/docs/en/licensed_annotator_entries/ChunkMerge.md @@ -15,6 +15,8 @@ The decision on which chunk to select is made according to the chunk indices in (chunks with longer lengths and highest information will be kept from each source) Labels can be changed by setReplaceDictResource. +Parameters: + - `inputCols`: The name of the columns containing the input annotations. It can read either a String column or an Array. - `outputCol`: The name of the column in Document type that is generated. We can specify only one column here. - `mergeOverlapping`: (Boolean) Sets whether to merge overlapping matched chunks. Default `True`. @@ -41,7 +43,8 @@ CHUNK {%- capture approach_python_medical -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, medical + # Annotator that transforms a text column from dataframe into an Annotation ready for NLP documentAssembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -62,7 +65,6 @@ word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en" .setInputCols(["sentence", "token"])\ .setOutputCol("embeddings") - # 1- ner_clinical model clinical_ner = medical.NerModel.pretrained("ner_clinical", "en", "clinical/models") \ .setInputCols(["sentence", "token", "embeddings"]) \ @@ -140,7 +142,7 @@ model.selectExpr("explode(merged_ner_chunk) as a") \ {%- endcapture -%} {%- capture approach_python_finance -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, finance documentAssembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -186,7 +188,7 @@ chunk_merge = finance.ChunkMergeApproach()\ .setInputCols("ner_finner_chunk", "ner_chunk")\ .setOutputCol("deid_merged_chunk") -nlpPipeline = Pipeline(stages=[ +nlpPipeline = nlp.Pipeline(stages=[ documentAssembler, sentenceDetector, tokenizer, @@ -217,7 +219,7 @@ result.select(F.explode(F.arrays_zip(result.deid_merged_chunk.result, {%- endcapture -%} {%- capture approach_python_legal -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, legal documentAssembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -258,7 +260,7 @@ chunk_merge = legal.ChunkMergeApproach()\ .setInputCols("ner_signer_chunk", "ner_chunk")\ .setOutputCol("deid_merged_chunk") -nlpPipeline = Pipeline(stages=[ +nlpPipeline = nlp.Pipeline(stages=[ documentAssembler, sentenceDetector, tokenizer, @@ -304,12 +306,12 @@ val documentAssembler = new DocumentAssembler() // Sentence Detector annotator,processes various sentences per line val sentenceDetector = new SentenceDetector() - .setInputCols(Array("document")) + .setInputCols("document") .setOutputCol("sentence") // Tokenizer splits words in a relevant format for NLP val tokenizer = new Tokenizer() - .setInputCols(Array("sentence")) + .setInputCols("sentence") .setOutputCol("token") // Clinical word embeddings trained on PubMED dataset @@ -352,7 +354,7 @@ val female_entity_extractor = new TextMatcher() // Chunk Merge annotator is used to merge columns val chunk_merger = new ChunkMergeApproach() - .setInputCols("posology_ner_chunk","clinical_ner_chunk","female_entities") + 
.setInputCols(Array("posology_ner_chunk","clinical_ner_chunk","female_entities")) .setOutputCol("merged_ner_chunk") val nlpPipeline = new Pipeline().setStages(Array( @@ -371,11 +373,7 @@ val text ="""The lady was treated with a five-day course of amoxicillin for a re She was on metformin , glipizide , and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG .""" val data = Seq(text).toDF("text") -val model = nlpPipeline.fit(data).transform(data) - -model.selectExpr("explode(merged_ner_chunk) as a") \ - .selectExpr("a.begin","a.end","a.result as chunk","a.metadata.entity as entity") \ - .show(10, False) +val model = nlpPipeline.fit(data).transform(data) +-----+---+-----------------------------+-------------+ |begin|end|chunk |entity | @@ -423,7 +421,7 @@ val fin_ner = FinanceNerModel.pretrained('finner_deid', "en", "finance/models") val ner_converter = new NerConverterInternal() .setInputCols(Array("sentence", "token", "ner")) - .setOutputCol("ner_chunk")\ + .setOutputCol("ner_chunk") .setReplaceLabels({"ORG": "PARTY"}) # Replace "ORG" entity as "PARTY" val ner_finner = FinanceNerModel.pretrained("finner_org_per_role_date", "en", "finance/models")\ @@ -455,11 +453,8 @@ val nlpPipeline = new Pipeline().setStages(Array( val data = Seq(("Jeffrey Preston Bezos is an American entrepreneur, founder and CEO of Amazon")).toDF("text") # Show results -result = nlpPipeline.fit(data).transform(data).cache() -result.select(F.explode(F.arrays_zip(result.deid_merged_chunk.result, - result.deid_merged_chunk.metadata)).alias("cols")) \ - .select(F.expr("cols['0']").alias("chunk"), - F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False) +result = nlpPipeline.fit(data).transform(data) + +---------------------+---------+ |chunk |ner_label| +---------------------+---------+ @@ -527,11 +522,8 @@ val data = Seq(("ENTIRE AGREEMENT. This Agreement contains the entire understan 2THEMART.COM, INC.: I-ESCROW, INC.: By:Dominic J. Magliarditi By:Sanjay Bajaj Name: Dominic J. Magliarditi Name: Sanjay Bajaj Title: President Title: VP Business Development Date: 6/21/99 Date: 6/11/99 ")).toDF("text") # Show results -result = nlpPipeline.fit(data).transform(data).cache() -result.select(F.explode(F.arrays_zip(result.deid_merged_chunk.result, - result.deid_merged_chunk.metadata)).alias("cols")) \ - .select(F.expr("cols['0']").alias("chunk"), - F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False) +result = nlpPipeline.fit(data).transform(data) + +-----------------------+--------------+ |chunk |ner_label | +-----------------------+--------------+ @@ -569,4 +561,4 @@ approach_scala_finance=approach_scala_finance approach_scala_legal=approach_scala_legal approach_api_link=approach_api_link approach_python_api_link=approach_python_api_link -%} \ No newline at end of file +%} diff --git a/docs/en/licensed_annotator_entries/DateNormalizer.md b/docs/en/licensed_annotator_entries/DateNormalizer.md index 89822abae5..faf8ac4fc5 100644 --- a/docs/en/licensed_annotator_entries/DateNormalizer.md +++ b/docs/en/licensed_annotator_entries/DateNormalizer.md @@ -14,7 +14,7 @@ For the relative dates (next year, past month, etc.), you can define an achor da The resultant chunk date will contain a metada indicating whether the normalization was successful or not (True / False). -Parametres; +Parametres: - `anchorDateYear`: (Int) Sets an anchor year for the relative dates such as a day after tomorrow. If not set it will use the current year. 
@@ -177,11 +177,7 @@ dates = [ df = spark.createDataFrame(dates, StringType()).toDF("original_date") result = pipeline.fit(df).transform(df) -result.selectExpr( - "date.result as normalized_date", - "original_date", - "date.metadata[0].normalized as metadata", -).show() + +---------------+-------------+--------+ |normalized_date|original_date|metadata| @@ -202,6 +198,7 @@ result.selectExpr( {%- capture model_scala_legal -%} +import spark.implicits._ val document_assembler = new DocumentAssembler() .setInputCol("original_date") @@ -359,7 +356,7 @@ val result = pipeline.fit(df).transform(df) {%- endcapture -%} {%- capture model_notebook_link -%} -[Notebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/DateNormalizer.ipynb) +[DateNormalizerNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/DateNormalizer.ipynb) {%- endcapture -%} {% include templates/licensed_approach_model_medical_fin_leg_template.md diff --git a/docs/en/licensed_annotator_entries/MedicalDistilBertForSequenceClassification.md b/docs/en/licensed_annotator_entries/DistilBertForSequenceClassification.md similarity index 78% rename from docs/en/licensed_annotator_entries/MedicalDistilBertForSequenceClassification.md rename to docs/en/licensed_annotator_entries/DistilBertForSequenceClassification.md index 0bd02a0bfd..ec26cffea6 100644 --- a/docs/en/licensed_annotator_entries/MedicalDistilBertForSequenceClassification.md +++ b/docs/en/licensed_annotator_entries/DistilBertForSequenceClassification.md @@ -1,5 +1,5 @@ {%- capture title -%} -MedicalDistilBertForSequenceClassification +DistilBertForSequenceClassification {%- endcapture -%} {%- capture model -%} @@ -8,7 +8,17 @@ model {%- capture model_description -%} - `MedicalDistilBertForSequenceClassification` can load DistilBERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks. + `DistilBertForSequenceClassification` can load DistilBERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks. + +Parameters: + +- `batchSize`', 'Size of every batch': default: 8, + +- `coalesceSentences`': "Instead of 1 class per sentence (if inputCols is '''sentence''' output 1 class per document by averaging probabilities in all sentences." 
default: False, + +- `maxSentenceLength`', 'Max sentence length to process', default: 128 + +- `caseSensitive`', 'whether to ignore case in tokens for embeddings matching',default: True, {%- endcapture -%} @@ -93,11 +103,11 @@ val result = pipeline.fit(data).transform(data) {%- capture model_python_api_link -%} -[MedicalDistilBertForSequenceClassification](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/classification/medical_distilbert_for_sequence_classification/index.html) +[DistilBertForSequenceClassification](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/classification/medical_distilbert_for_sequence_classification/index.html) {%- endcapture -%} {%- capture model_scala_api_link -%} -[MedicalDistilBertForSequenceClassification](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/classification/MedicalDistilBertForSequenceClassification.html) +[DistilBertForSequenceClassification](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/classification/MedicalDistilBertForSequenceClassification.html) {%- endcapture -%} diff --git a/docs/en/licensed_annotator_entries/Doc2ChunkInternal.md b/docs/en/licensed_annotator_entries/Doc2ChunkInternal.md index f25b10f057..fd83de93cb 100644 --- a/docs/en/licensed_annotator_entries/Doc2ChunkInternal.md +++ b/docs/en/licensed_annotator_entries/Doc2ChunkInternal.md @@ -10,9 +10,10 @@ model Converts `DOCUMENT`, `TOKEN` typed annotations into `CHUNK` type with the contents of a `chunkCol`. Chunk text must be contained within input `DOCUMENT`. May be either `StringType` or `ArrayType[StringType]` (using `setIsArray`). Useful for annotators that require a CHUNK type input. -Parameters; +Parameters: - `inputCols`: The name of the columns containing the input annotations. It can read either a String column or an Array. + - `outputCol`: The name of the column in Document type that is generated. We can specify only one column here. @@ -270,7 +271,7 @@ val result = pipeline.fit(data).transform(data) {%- endcapture -%} {%- capture model_notebook_link -%} -[Notebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/Doc2ChunkInternal.ipynb) +[Doc2ChunkInternalNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/Doc2ChunkInternal.ipynb) {%- endcapture -%} {% include templates/licensed_approach_model_medical_fin_leg_template.md diff --git a/docs/en/licensed_annotator_entries/DocumentHashCoder.md b/docs/en/licensed_annotator_entries/DocumentHashCoder.md index e939682db2..4b32b8ad87 100644 --- a/docs/en/licensed_annotator_entries/DocumentHashCoder.md +++ b/docs/en/licensed_annotator_entries/DocumentHashCoder.md @@ -12,7 +12,7 @@ This annotator can replace dates in a column of `DOCUMENT` type according with t If the specified column contains strings that can be parsed to integers, use those numbers to make the shift in the data accordingly. -Parametres; +Parametres: - `PatientIdColumn` *(String)*: Name of the column containing patient ID. 
diff --git a/docs/en/licensed_annotator_entries/DocumentLogRegClassifier.md b/docs/en/licensed_annotator_entries/DocumentLogRegClassifier.md index 4ed57d1a58..33d0d593ca 100644 --- a/docs/en/licensed_annotator_entries/DocumentLogRegClassifier.md +++ b/docs/en/licensed_annotator_entries/DocumentLogRegClassifier.md @@ -15,8 +15,17 @@ Classifies documents with a Logarithmic Regression algorithm. Currently there are no pretrained models available. Please see DocumentLogRegClassifierApproach to train your own model. -Please check out the -[Models Hub](https://nlp.johnsnowlabs.com/models) for available models in the future. +Parameters: + +- `setMergeChunks(merge)`: Sets whether to merge all chunks in a document or not (Default: false). + +- `setLabels(value)`: Sets array to output the label in the original form. + +- `setVectorizationModel(model)`: Sets a path to the classification model if it has been already trained. + +- `setClassificationModel(model)`: Sets a path to the the classification model if it has been already trained. + +Please check out the [Models Hub](https://nlp.johnsnowlabs.com/models) for available models in the future. {%- endcapture -%} {%- capture model_input_anno -%} diff --git a/docs/en/licensed_annotator_entries/DrugNormalizer.md b/docs/en/licensed_annotator_entries/DrugNormalizer.md index 1b88a031e1..66f3f81dda 100644 --- a/docs/en/licensed_annotator_entries/DrugNormalizer.md +++ b/docs/en/licensed_annotator_entries/DrugNormalizer.md @@ -12,7 +12,7 @@ Removes all dirty characters from text following one or more input regex pattern Can apply non wanted character removal which a specific policy. Can apply lower case normalization. -Parametres; +Parametres: - `lowercase`: (boolean) whether to convert strings to lowercase. Default is False. diff --git a/docs/en/licensed_annotator_entries/EntityChunkEmbeddings.md b/docs/en/licensed_annotator_entries/EntityChunkEmbeddings.md index 0da9b2f80a..2f824c5ef5 100644 --- a/docs/en/licensed_annotator_entries/EntityChunkEmbeddings.md +++ b/docs/en/licensed_annotator_entries/EntityChunkEmbeddings.md @@ -22,7 +22,8 @@ An entity can be defined both as target a entity and as a related entity for som This model is a subclass of `BertSentenceEmbeddings` and shares all parameters with it. It can load any pretrained `BertSentenceEmbeddings` model. -Parametres; +Parametres: + - `targetEntities`: (dict) The target entities mapped to lists of their related entities. A target entity with an empty list of related entities means all other entities are assumed to be related to it. Entity names are case insensitive. *Mandatory to set at least one entity* - `entityWeights`: (dict) The relative weights of drug related entities. If not set, all entities have equal weights. If the list is non-empty and some entity is not in it, then its weight is set to 0. The notation TARGET_ENTITY:RELATED_ENTITY can be used to specify the weight of a entity which is related to specific target entity (e.g. "DRUG:SYMPTOM" -> 0.3f). Entity names are case insensitive. diff --git a/docs/en/licensed_annotator_entries/FeaturesAssembler.md b/docs/en/licensed_annotator_entries/FeaturesAssembler.md index 1b7239e96a..5cc83b3eed 100644 --- a/docs/en/licensed_annotator_entries/FeaturesAssembler.md +++ b/docs/en/licensed_annotator_entries/FeaturesAssembler.md @@ -13,9 +13,10 @@ SparkNLP annotations (if the annotation is an embedding, it takes the embedding, `result` field). 
The output of the transformer is a `FEATURE_VECTOR` annotation (the numeric vector is in the `embeddings` field). -The parameters below are used for `FeaturesAssembler`. +Parameters: - `inputCols`: The name of the columns containing the input annotations. It can read either a String column name or an Array of strings (column names). + - `outputCol`: The name of the column in Document type that is generated. We can specify only one column here. diff --git a/docs/en/licensed_annotator_entries/GenericClassifier.md b/docs/en/licensed_annotator_entries/GenericClassifier.md index 0d14a34bae..7dd1b16a30 100644 --- a/docs/en/licensed_annotator_entries/GenericClassifier.md +++ b/docs/en/licensed_annotator_entries/GenericClassifier.md @@ -14,6 +14,11 @@ model Creates a generic single-label classifier which uses pre-generated Tensorflow graphs. The model operates on FEATURE_VECTOR annotations which can be produced using FeatureAssembler. Requires the FeaturesAssembler to create the input. + +Parametres: + +- `multiClass` *(Boolean)*: Whether to return all clases or only the one with highest score (Default: False) + {%- endcapture -%} {%- capture model_input_anno -%} @@ -24,6 +29,86 @@ FEATURE_VECTOR CATEGORY {%- endcapture -%} +{%- capture model_python_medical -%} +from johnsnowlabs import nlp, medical + +document_assembler = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_embeddings = nlp.BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", 'en','clinical/models')\ + .setInputCols(["document"])\ + .setOutputCol("sentence_embeddings") + +features_asm = medical.FeaturesAssembler()\ + .setInputCols(["sentence_embeddings"])\ + .setOutputCol("features") + +generic_classifier = medical.GenericClassifierModel.pretrained("genericclassifier_sdoh_economics_binary_sbiobert_cased_mli", 'en', 'clinical/models')\ + .setInputCols(["features"])\ + .setOutputCol("classes") + +pipeline = nlp.Pipeline( + stages=[ + document_assembler, + sentence_embeddings, + features_asm, + generic_classifier +]) + +text = """Patient works as a building inspector and remodeler. Married with 2 children. He is a current smoker, 1PPD for 25years. He drinks to beers/night, but has not had any alcohol in past 4 days. No IVDU.""" + +df = spark.createDataFrame([[text]]).toDF("text") + +result = pipeline.fit(df).transform(df) +result.select("text", "classes.result").show(truncate=False) + ++------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+ +|text |result| ++------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+ +|Patient works as a building inspector and remodeler. Married with 2 children. He is a current smoker, 1PPD for 25years. He drinks to beers/night, but has not had any alcohol in past 4 days. 
No IVDU.|[True]| ++------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+ +{%- endcapture -%} + +{%- capture model_scala_medical -%} + +import spark.implicits._ + +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_embeddings = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence_embeddings") + +val features_asm = new FeaturesAssembler() + .setInputCols("sentence_embeddings") + .setOutputCol("features") + +val generic_classifier = GenericClassifierModel.pretrained("genericclassifier_sdoh_economics_binary_sbiobert_cased_mli","en","clinical/models") + .setInputCols(Array("features")) + .setOutputCol("classes") + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_embeddings, + features_asm, + generic_classifier )) + +val text = "Patient works as a building inspector and remodeler. Married with 2 children. He is a current smoker,1PPD for 25years. He drinks to beers/night,but has not had any alcohol in past 4 days. No IVDU." + +val df = Seq(text) .toDF("text") +val result = pipeline.fit(df) .transform(df) result.select("text","classes.result") .show(truncate=false) + ++------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+ +|text |result| ++------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+ +|Patient works as a building inspector and remodeler. Married with 2 children. He is a current smoker, 1PPD for 25years. He drinks to beers/night, but has not had any alcohol in past 4 days. No IVDU.|[True]| ++------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+ + +{%- endcapture -%} + {%- capture model_api_link -%} [GenericClassifierModel](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/generic_classifier/GenericClassifierModel.html) {%- endcapture -%} @@ -32,11 +117,39 @@ CATEGORY [GenericClassifierModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/generic_classifier/generic_classifier/index.html#sparknlp_jsl.annotator.generic_classifier.generic_classifier.GenericClassifierModel) {%- endcapture -%} +{%- capture model_notebook_link -%} +[GenericClassifierModelNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/GenericClassifierModel.ipynb) +{%- endcapture -%} + {%- capture approach_description -%} Trains a TensorFlow model for generic classification of feature vectors. It takes FEATURE_VECTOR annotations from `FeaturesAssembler` as input, classifies them and outputs CATEGORY annotations. Please see the Parameters section for required training parameters. +Parametres: + +- `batchSize`: (int) Batch size + +- `dropout`: (float) Dropout coefficient + +- `epochsN`: (int) Maximum number of epochs to train + +- `featureScaling`: (str) Feature scaling method. 
Possible values are 'zscore', 'minmax' or empty (no scaling) + +- `fixImbalance`: (boolean) Fix the imbalance in the training set by replicating examples of under represented categories + +- `labelColumn`: (str) Column with label per each document + +- `learningRate`: (float) Learning Rate + +- `modelFile`: (str) Location of file of the model used for classification + +- `multiClass`: (boolean) If multiClass is set, the model will return all the labels with corresponding scores. By default, multiClass is false. + +- `outputLogsPath`: (str) Folder path to save training logs. If no path is specified, the logs won't be stored in disk. The path can be a local file path, a distributed file path (HDFS, DBFS), or a cloud storage (S3). + +- `validationSplit`: (float) The proportion of training dataset to be used as validation set.The model will be validated against this dataset on each Epoch and will not be used for training. The value should be between 0.0 and 1.0. + For a more extensive example please see the [Spark NLP Workshop](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/8.Generic_Classifier.ipynb). {%- endcapture -%} @@ -69,7 +182,7 @@ gen_clf = medical.GenericClassifierApproach() \ .setOutputLogsPath("logs") \ .setValidationSplit(0.2) # keep 20% of the data for validation purposes -pipeline = Pipeline().setStages([ +pipeline = nlp.Pipeline().setStages([ features_asm, gen_clf ]) @@ -98,7 +211,7 @@ gen_clf = legal.GenericClassifierApproach() \ .setOutputLogsPath("logs") \ .setValidationSplit(0.2) # keep 20% of the data for validation purposes -pipeline = Pipeline().setStages([ +pipeline = nlp.Pipeline().setStages([ features_asm, gen_clf ]) @@ -128,7 +241,7 @@ gen_clf = finance.GenericClassifierApproach() \ .setOutputLogsPath("logs") \ .setValidationSplit(0.2) # keep 20% of the data for validation purposes -pipeline = Pipeline().setStages([ +pipeline = nlp.Pipeline().setStages([ features_asm, gen_clf ]) @@ -138,12 +251,13 @@ clf_model = pipeline.fit(data) {%- endcapture -%} {%- capture approach_scala_medical -%} +import spark.implicits._ -val features_asm = new medical.FeaturesAssembler() +val features_asm = new FeaturesAssembler() .setInputCols(Array("feature_1", "feature_2", "...", "feature_n")) .setOutputCol("features") -val gen_clf = new medical.GenericClassifierApproach() +val gen_clf = new GenericClassifierApproach() .setLabelColumn("target") .setInputCols("features") .setOutputCol("prediction") @@ -167,12 +281,13 @@ val clf_model = pipeline.fit(data) {%- capture approach_scala_legal -%} +import spark.implicits._ -val features_asm = new legal.FeaturesAssembler() +val features_asm = new FeaturesAssembler() .setInputCols(Array("feature_1", "feature_2", "...", "feature_n")) .setOutputCol("features") -val gen_clf = new legal.GenericClassifierApproach() +val gen_clf = new GenericClassifierApproach() .setLabelColumn("target") .setInputCols("features") .setOutputCol("prediction") @@ -196,12 +311,13 @@ val clf_model = pipeline.fit(data) {%- capture approach_scala_finance -%} +import spark.implicits._ -val features_asm = new finance.FeaturesAssembler() +val features_asm = new FeaturesAssembler() .setInputCols(Array("feature_1", "feature_2", "...", "feature_n")) .setOutputCol("features") -val gen_clf = new finance.GenericClassifierApproach() +val gen_clf = new GenericClassifierApproach() .setLabelColumn("target") .setInputCols("features") .setOutputCol("prediction") @@ -231,6 +347,10 @@ val clf_model = pipeline.fit(data) 
[GenericClassifierApproach](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/generic_classifier/generic_classifier/index.html#sparknlp_jsl.annotator.generic_classifier.generic_classifier.GenericClassifierApproach) {%- endcapture -%} +{%- capture approach_notebook_link -%} +[GenericClassifierApproachNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/GenericClassifierApproach.ipynb) +{%- endcapture -%} + {% include templates/licensed_approach_model_medical_fin_leg_template.md title=title approach=approach @@ -238,8 +358,11 @@ model=model model_description=model_description model_input_anno=model_input_anno model_output_anno=model_output_anno +model_python_medical=model_python_medical +model_scala_medical=model_scala_medical model_api_link=model_api_link model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link approach_description=approach_description approach_input_anno=approach_input_anno approach_output_anno=approach_output_anno @@ -251,4 +374,5 @@ approach_scala_legal=approach_scala_legal approach_scala_finance=approach_scala_finance approach_api_link=approach_api_link approach_python_api_link=approach_python_api_link +approach_notebook_link=approach_notebook_link %} diff --git a/docs/en/licensed_annotator_entries/NerChunker.md b/docs/en/licensed_annotator_entries/NerChunker.md index 9128c79497..550960ba61 100644 --- a/docs/en/licensed_annotator_entries/NerChunker.md +++ b/docs/en/licensed_annotator_entries/NerChunker.md @@ -11,7 +11,7 @@ Extracts phrases that fits into a known pattern using the NER tags. Useful for e when there is no pretrained NER model to address certain issues. A Regex needs to be provided to extract the tokens between entities. -Parametres; +Parametres: - `setRegexParsers`: Array of grammar based chunk parsers. {%- endcapture -%} diff --git a/docs/en/licensed_annotator_entries/NerConverterInternal.md b/docs/en/licensed_annotator_entries/NerConverterInternal.md index cf1ed77a9f..ef6135a52f 100644 --- a/docs/en/licensed_annotator_entries/NerConverterInternal.md +++ b/docs/en/licensed_annotator_entries/NerConverterInternal.md @@ -14,6 +14,25 @@ Chunks with no associated entity (tagged "O") are filtered out. This licensed annotator adds extra functionality to the open-source version by adding the following parameters: `blackList`, `greedyMode`, `threshold`, and `ignoreStopWords` that are not available in the [NerConverter](https://nlp.johnsnowlabs.com/docs/en/annotators#nerconverter) annotator. See also [Inside–outside–beginning (tagging)](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) for more information. + +Parameters: + +- `setThreshold`: Confidence threshold. + +- `setWhiteList`: If defined, list of entities to process. + +- `setBlackList`: If defined, list of entities to ignore. + +- `setReplaceLabels`: If defined, contains a dictionary for entity replacement. + +- `setPreservePosition`: Whether to preserve the original position of the tokens in the original document or use the modified tokens. + +- `setReplaceDictResource`: If defined, path to the file containing a dictionary for entity replacement. + +- `setIgnoreStopWords`: If defined, list of stop words to ignore. + +- `setGreedyMode`: (Boolean) Whether to ignore B tags for contiguous tokens of same entity same . 
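Put together, a minimal NerConverterInternal configuration using a few of these setters might look like the sketch below. The entity labels and the replacement mapping are purely illustrative (they assume an upstream NER model that emits such labels), and the input column names are placeholders for the outputs of sentence, token and NER stages.

```python
from johnsnowlabs import medical

# Hypothetical sketch: build entity chunks from IOB tags, keep only two labels,
# rename one of them, and drop chunks below a confidence threshold.
ner_converter = medical.NerConverterInternal() \
    .setInputCols(["sentence", "token", "ner"]) \
    .setOutputCol("ner_chunk") \
    .setWhiteList(["PROBLEM", "TEST"]) \
    .setReplaceLabels({"TEST": "LAB_TEST"}) \
    .setThreshold(0.7)
```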
+ {%- endcapture -%} {%- capture model_input_anno -%} @@ -25,7 +44,7 @@ CHUNK {%- endcapture -%} {%- capture model_python_medical -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, medical documentAssembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -56,7 +75,7 @@ jsl_ner_converter_internal = medical.NerConverterInternal()\ .setOutputCol("replaced_ner_chunk")\ .setReplaceDictResource("replace_dict.csv","text", {"delimiter":","}) -nlpPipeline = Pipeline(stages=[ +nlpPipeline = nlp.Pipeline(stages=[ documentAssembler, sentenceDetector, tokenizer, @@ -71,44 +90,40 @@ result = nlpPipeline.fit(data).transform(data) {%- endcapture -%} {%- capture model_scala_medical -%} -from johnsnowlabs import * -val documentAssembler = new nlp.DocumentAssembler() +import spark.implicits._ + +val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") -val sentenceDetector = nlp.SentenceDetectorDLModel +val sentenceDetector = SentenceDetectorDLModel .pretrained("sentence_detector_dl_healthcare","en","clinical/models") .setInputCols("document") .setOutputCol("sentence") - -val tokenizer = new nlp.Tokenizer() +val tokenizer = new Tokenizer() .setInputCols("sentence") .setOutputCol("token") - -val word_embeddings = nlp.WordEmbeddingsModel +val word_embeddings = WordEmbeddingsModel .pretrained("embeddings_clinical", "en","clinical/models") .setInputCols(Array("sentence", "token")) .setOutputCol("embeddings") - -val jsl_ner = medical.NerModel +val jsl_ner = MedicalNerModel .pretrained("ner_jsl", "en", "clinical/models") .setInputCols(Array("sentence", "token","embeddings")) .setOutputCol("jsl_ner") - -val jsl_ner_converter = new nlp.NerConverter() +val jsl_ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "jsl_ner")) .setOutputCol("jsl_ner_chunk") -val jsl_ner_converter_internal = new medical.NerConverterInternal() +val jsl_ner_converter_internal = new NerConverterInternal() .setInputCols(Array("sentence", "token", "jsl_ner")) .setOutputCol("replaced_ner_chunk") .setReplaceDictResource("replace_dict.csv","text", {"delimiter":","}) - val pipeline = new Pipeline().setStages(Array( documentAssembler, sentenceDetector, @@ -125,7 +140,7 @@ val result = pipeline.fit(data).transform(data) {%- capture model_python_legal -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, legal documentAssembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -154,7 +169,7 @@ ner_converter = legal.NerConverterInternal() \ .setOutputCol("ner_chunk")\ .setReplaceLabels({"ALIAS": "PARTY"}) # "ALIAS" are secondary names of companies, so let's extract them also as PARTY -nlpPipeline = Pipeline(stages=[ +nlpPipeline = nlp.Pipeline(stages=[ documentAssembler, sentenceDetector, tokenizer, @@ -168,40 +183,36 @@ result = nlpPipeline.fit(data).transform(data) {%- capture model_scala_legal -%} -from johnsnowlabs import * -val documentAssembler = new nlp.DocumentAssembler() +import spark.implicits._ + +val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") -val sentenceDetector = nlp.SentenceDetectorDLModel +val sentenceDetector = SentenceDetectorDLModel .pretrained("sentence_detector_dl","xx") .setInputCols("document") .setOutputCol("sentence") - -val tokenizer = new nlp.Tokenizer() +val tokenizer = new Tokenizer() .setInputCols("sentence") .setOutputCol("token") - -val embeddings = nlp.RoBertaEmbeddings +val embeddings = RoBertaEmbeddings .pretrained("roberta_embeddings_legal_roberta_base", "en") 
.setInputCols(Array("sentence", "token")) .setOutputCol("embeddings") - -val legal_ner = legal.NerModel +val legal_ner = LegalNerModel .pretrained("legner_contract_doc_parties", "en", "legal/models") .setInputCols(Array("sentence", "token","embeddings")) .setOutputCol("ner") - -val ner_converter = new legal.NerConverterInternal() +val ner_converter = new NerConverterInternal() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") .setReplaceLabels({"ALIAS": "PARTY"}) - val pipeline = new Pipeline().setStages(Array( documentAssembler, sentenceDetector, @@ -219,7 +230,7 @@ val result = pipeline.fit(data).transform(data) {%- capture model_python_finance -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, finance documentAssembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -248,7 +259,7 @@ ner_converter = finance.NerConverterInternal() \ .setOutputCol("ner_chunk")\ .setReplaceLabels({"ORG": "PARTY"}) # Replace "ORG" entity as "PARTY" -nlpPipeline = Pipeline(stages=[ +nlpPipeline = nlp.Pipeline(stages=[ documentAssembler, sentenceDetector, tokenizer, @@ -257,46 +268,40 @@ nlpPipeline = Pipeline(stages=[ ner_converter]) result = nlpPipeline.fit(data).transform(data) - - {%- endcapture -%} {%- capture model_scala_finance -%} -from johnsnowlabs import * -val documentAssembler = new nlp.DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") +import spark.implicits._ + +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") -val sentenceDetector = nlp.SentenceDetectorDLModel +val sentenceDetector = SentenceDetectorDLModel .pretrained("sentence_detector_dl","xx") .setInputCols("document") .setOutputCol("sentence") +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") -val tokenizer = new nlp.Tokenizer() - .setInputCols("sentence") - .setOutputCol("token") - - -val embeddings = nlp.RoBertaEmbeddings +val embeddings = RoBertaEmbeddings .pretrained("roberta_embeddings_legal_roberta_base", "en") .setInputCols(Array("sentence", "token")) .setOutputCol("embeddings") - -val fin_ner = finance.NerModel +val fin_ner = FinanceNerModel .pretrained("finner_deid", "en", "finance/models") .setInputCols(Array("sentence", "token","embeddings")) .setOutputCol("ner") - -val ner_converter = new finance.NerConverterInternal() +val ner_converter = new NerConverterInternal() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") .setReplaceLabels({"ORG": "PARTY"}) - val pipeline = new Pipeline().setStages(Array( documentAssembler, sentenceDetector, @@ -318,6 +323,10 @@ val result = pipeline.fit(data).transform(data) [NerConverterInternal](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/ner/ner_converter_internal/index.html#sparknlp_jsl.annotator.ner.ner_converter_internal.NerConverterInternal) {%- endcapture -%} +{%- capture model_notebook_link -%} +[NerConverterInternalNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/NerConverterInternal.ipynb) +{%- endcapture -%} + {% include templates/licensed_approach_model_medical_fin_leg_template.md title=title model=model @@ -332,4 +341,5 @@ model_python_finance=model_python_finance model_scala_finance=model_scala_finance model_api_link=model_api_link model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link %} diff --git a/docs/en/licensed_annotator_entries/NerDisambiguator.md 
b/docs/en/licensed_annotator_entries/NerDisambiguator.md
index df130d1b6d..f18b2fd039 100644
--- a/docs/en/licensed_annotator_entries/NerDisambiguator.md
+++ b/docs/en/licensed_annotator_entries/NerDisambiguator.md
@@ -18,365 +18,195 @@ Instantiated / pretrained model of the NerDisambiguator.
Links words of interest, such as names of persons, locations and companies, from an input text document to a corresponding unique entity in a target Knowledge Base (KB). Words of interest are called Named Entities (NEs), mentions, or surface forms.
-{%- endcapture -%}
-{%- capture model_input_anno -%}
-CHUNK, SENTENCE_EMBEDDINGS
-{%- endcapture -%}
+Parameters:
-{%- capture model_output_anno -%}
-DISAMBIGUATION
-{%- endcapture -%}
+- `embeddingTypeParam`: (String) ‘bow’ for word embeddings or ‘sentence’ for sentence embeddings.
-{%- capture model_api_link -%}
-[NerDisambiguatorModel](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/disambiguation/NerDisambiguatorModel.html)
-{%- endcapture -%}
+- `numFirstChars`: (Int) number of characters to be considered for the initial prefix search in the knowledge base.
-{%- capture model_python_api_link -%}
-[NerDisambiguatorModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/disambiguation/ner_disambiguator/index.html#sparknlp_jsl.annotator.disambiguation.ner_disambiguator.NerDisambiguatorModel)
-{%- endcapture -%}
+- `tokenSearch`: (BooleanParam) search mechanism in the knowledge base, by token or by chunk (token search is recommended; Default value: True).
+
+- `narrowWithApproximateMatching`: (BooleanParam) narrow down the prefix search results with Levenshtein distance-based matching (True is recommended).
+
+- `levenshteinDistanceThresholdParam`: (Float) value of the Levenshtein distance threshold used to narrow results from the prefix search (default value: 0.1).
+
+- `nearMatchingGapParam`: (Int) defines a limit on the string length (by trimming the candidate chunks) during Levenshtein distance-based narrowing, {len(candidate) - len(entity chunk) > nearMatchingGap} (default value: 4).
+
+- `predictionsLimit`: (BooleanParam) limits the number of predictions to the top N predictions.
+
+- `s3KnowledgeBaseName`: (String) name of the Knowledge Base in S3.
-{%- capture approach_description -%}
-Links words of interest, such as names of persons, locations and companies, from an input text document to
-a corresponding unique entity in a target Knowledge Base (KB). Words of interest are called Named Entities (NEs),
-mentions, or surface forms.
-The model needs extracted CHUNKS and SENTENCE_EMBEDDINGS type input from e.g.
-[SentenceEmbeddings](/docs/en/annotators#sentenceembeddings) and
-[NerConverter](/docs/en/annotators#nerconverter).
{%- endcapture -%}
-{%- capture approach_input_anno -%}
+{%- capture model_input_anno -%}
CHUNK, SENTENCE_EMBEDDINGS
{%- endcapture -%}
-{%- capture approach_output_anno -%}
+{%- capture model_output_anno -%}
DISAMBIGUATION
{%- endcapture -%}
-{%- capture approach_python_medical -%}
-from johnsnowlabs import *
-# Extracting Person identities
-# First define pipeline stages that extract entities and embeddings. Entities are filtered for PER type entities.
-# Extracting Person identities
-# First define pipeline stages that extract entities and embeddings. Entities are filtered for PER type entities.
-data = spark.createDataFrame([["The show also had a contestant named Donald Trump who later defeated Christina Aguilera ..."]]) \ - .toDF("text") +{%- capture model_python_medical -%} + +from johnsnowlabs import nlp, medical + documentAssembler = nlp.DocumentAssembler() \ - .setInputCol("text") \ - .setOutputCol("document") + .setInputCol("text") \ + .setOutputCol("document") + sentenceDetector = nlp.SentenceDetector() \ - .setInputCols(["document"]) \ - .setOutputCol("sentence") + .setInputCols(["document"]) \ + .setOutputCol("sentence") + tokenizer = nlp.Tokenizer() \ - .setInputCols(["sentence"]) \ - .setOutputCol("token") + .setInputCols(["sentence"]) \ + .setOutputCol("token") + word_embeddings = nlp.WordEmbeddingsModel.pretrained() \ - .setInputCols(["sentence", "token"]) \ - .setOutputCol("embeddings") + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + sentence_embeddings = nlp.SentenceEmbeddings() \ - .setInputCols(["sentence","embeddings"]) \ - .setOutputCol("sentence_embeddings") -ner_model = nlp.NerDLModel.pretrained() \ - .setInputCols(["sentence", "token", "embeddings"]) \ - .setOutputCol("ner") + .setInputCols(["sentence","embeddings"]) \ + .setOutputCol("sentence_embeddings") + +ner_model = medical.NerModel.pretrained() \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + ner_converter = nlp.NerConverter() \ - .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") \ - .setWhiteList(["PER"]) + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") \ + .setWhiteList(["PER"]) -# Then the extracted entities can be disambiguated. disambiguator = medical.NerDisambiguator() \ - .setS3KnowledgeBaseName("i-per") \ - .setInputCols(["ner_chunk", "sentence_embeddings"]) \ - .setOutputCol("disambiguation") \ - .setNumFirstChars(5) - -nlpPipeline = Pipeline(stages=[ - documentAssembler, - sentenceDetector, - tokenizer, - word_embeddings, - sentence_embeddings, - ner_model, - ner_converter, - disambiguator]) - -model = nlpPipeline.fit(data) -result = model.transform(data) - -# Show results -result.selectExpr("explode(disambiguation)") - .selectExpr("col.metadata.chunk as chunk", "col.result as result").show(5, False) -+------------------+------------------------------------------------------------------------------------------------------------------------+ -|chunk |result | -+------------------+------------------------------------------------------------------------------------------------------------------------+ -|Donald Trump |http:#en.wikipedia.org/?curid=4848272, http:#en.wikipedia.org/?curid=31698421, http:#en.wikipedia.org/?curid=55907961 | -|Christina Aguilera|http:#en.wikipedia.org/?curid=144171, http:#en.wikipedia.org/?curid=6636454 | -+------------------+------------------------------------------------------------------------------------------------------------------------+ -{%- endcapture -%} + .setS3KnowledgeBaseName("i-per") \ + .setInputCols(["ner_chunk", "sentence_embeddings"]) \ + .setOutputCol("disambiguation") \ + .setTokenSearch(False) -{%- capture approach_python_legal -%} -from johnsnowlabs import * -# Extracting Person identities -# First define pipeline stages that extract entities and embeddings. Entities are filtered for PER type entities. -# Extracting Person identities -# First define pipeline stages that extract entities and embeddings. Entities are filtered for PER type entities. 
+pipeline = nlp.Pipeline(stages=[ + documentAssembler, + sentenceDetector, + tokenizer, + word_embeddings, + sentence_embeddings, + ner_model, + ner_converter, + disambiguator]) -documentAssembler = nlp.DocumentAssembler() \ - .setInputCol("text") \ - .setOutputCol("document") -sentenceDetector = nlp.SentenceDetector() \ - .setInputCols(["document"]) \ - .setOutputCol("sentence") -tokenizer = nlp.Tokenizer() \ - .setInputCols(["sentence"]) \ - .setOutputCol("token") -word_embeddings = nlp.WordEmbeddingsModel.pretrained() \ - .setInputCols(["sentence", "token"]) \ - .setOutputCol("embeddings") -sentence_embeddings = nlp.SentenceEmbeddings() \ - .setInputCols(["sentence","embeddings"]) \ - .setOutputCol("sentence_embeddings") -ner_model = legal.NerModel.pretrained("legner_orgs_prods_alias", "en", "legal/models")\ - .setInputCols(["sentence", "token", "embeddings"])\ - .setOutputCol("ner") -ner_converter = nlp.NerConverter() \ - .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") \ - .setWhiteList(["PER"]) - -# Then the extracted entities can be disambiguated. -disambiguator = legal.NerDisambiguator() \ - #.setS3KnowledgeBaseName("i-per") \ - .setInputCols(["ner_chunk", "sentence_embeddings"]) \ - .setOutputCol("disambiguation") \ - .setNumFirstChars(5) - -nlpPipeline = Pipeline(stages=[ - documentAssembler, - sentenceDetector, - tokenizer, - word_embeddings, - sentence_embeddings, - ner_model, - ner_converter, - disambiguator]) - -{%- endcapture -%} +text = """The show also had a contestant named Donald Trump who later defeated Christina Aguilera ...""" +df = spark.createDataFrame([[text]]).toDF("text") +result = pipeline.fit(df).transform(df) -{%- capture approach_python_finance -%} -from johnsnowlabs import * -# Extracting Person identities -# First define pipeline stages that extract entities and embeddings. Entities are filtered for PER type entities. -# Extracting Person identities -# First define pipeline stages that extract entities and embeddings. Entities are filtered for PER type entities. -documentAssembler = nlp.DocumentAssembler() \ - .setInputCol("text") \ - .setOutputCol("document") -sentenceDetector = nlp.SentenceDetector() \ - .setInputCols(["document"]) \ - .setOutputCol("sentence") -tokenizer = nlp.Tokenizer() \ - .setInputCols(["sentence"]) \ - .setOutputCol("token") -word_embeddings = nlp.WordEmbeddingsModel.pretrained() \ - .setInputCols(["sentence", "token"]) \ - .setOutputCol("embeddings") -sentence_embeddings = nlp.SentenceEmbeddings() \ - .setInputCols(["sentence","embeddings"]) \ - .setOutputCol("sentence_embeddings") -ner_model = finance.NerModel.pretrained("finner_orgs_prods_alias","en","finance/models")\ - .setInputCols(["sentence", "token", "embeddings"]) \ - .setOutputCol("ner") -ner_converter = nlp.NerConverter() \ - .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") \ - .setWhiteList(["PER"]) - -# Then the extracted entities can be disambiguated. 
-disambiguator = finance.NerDisambiguator() \ - #.setS3KnowledgeBaseName("i-per") \ - .setInputCols(["ner_chunk", "sentence_embeddings"]) \ - .setOutputCol("disambiguation") \ - .setNumFirstChars(5) - -nlpPipeline = Pipeline(stages=[ - documentAssembler, - sentenceDetector, - tokenizer, - word_embeddings, - sentence_embeddings, - ner_model, - ner_converter, - disambiguator]) +# Result +result.selectExpr("explode(disambiguation)") \ + .selectExpr("col.metadata.chunk as chunk", "col.result as result").show(5, truncate=False) ++------------------+------------------------------------------------------------------------------------------------------------------------+ +|chunk |result | ++------------------+------------------------------------------------------------------------------------------------------------------------+ +|Donald Trump |http://en.wikipedia.org/?curid=55907961, http://en.wikipedia.org/?curid=31698421, http://en.wikipedia.org/?curid=4848272| +|Christina Aguilera|http://en.wikipedia.org/?curid=6636454, http://en.wikipedia.org/?curid=144171 | ++------------------+------------------------------------------------------------------------------------------------------------------------+ {%- endcapture -%} +{%- capture model_scala_medical -%} + +import spark.implicits._ +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") -{%- capture approach_scala_medical -%} -from johnsnowlabs import * -// Extracting Person identities -// First define pipeline stages that extract entities and embeddings. Entities are filtered for PER type entities. -val data = Seq("The show also had a contestant named Donald Trump who later defeated Christina Aguilera ...") - .toDF("text") -val documentAssembler = new nlp.DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") -val sentenceDetector = new nlp.SentenceDetector() +val sentenceDetector = new SentenceDetector() .setInputCols("document") .setOutputCol("sentence") -val tokenizer = new nlp.Tokenizer() - .setInputCols("sentence") - .setOutputCol("token") -val word_embeddings = nlp.WordEmbeddingsModel.pretrained() - .setInputCols(Array("sentence", "token")) - .setOutputCol("embeddings") -val sentence_embeddings = new nlp.SentenceEmbeddings() - .setInputCols(Array("sentence","embeddings")) - .setOutputCol("sentence_embeddings") -val ner_model = nlp.NerDLModel.pretrained() - .setInputCols(Array("sentence", "token", "embeddings")) - .setOutputCol("ner") -val ner_converter = new nlp.NerConverter() - .setInputCols(Array("sentence", "token", "ner")) - .setOutputCol("ner_chunk") - .setWhiteList("PER") - -// Then the extracted entities can be disambiguated. 
-val disambiguator = new medical.NerDisambiguator() - .setS3KnowledgeBaseName("i-per") - .setInputCols(Array("ner_chunk", "sentence_embeddings")) - .setOutputCol("disambiguation") - .setNumFirstChars(5) - -val nlpPipeline = new Pipeline().setStages(Array( - documentAssembler, - sentenceDetector, - tokenizer, - word_embeddings, - sentence_embeddings, - ner_model, - ner_converter, - disambiguator)) - -val model = nlpPipeline.fit(data) -val result = model.transform(data) - -// Show results -// -// result.selectExpr("explode(disambiguation)") -// .selectExpr("col.metadata.chunk as chunk", "col.result as result").show(5, false) -// +------------------+------------------------------------------------------------------------------------------------------------------------+ -// |chunk |result | -// +------------------+------------------------------------------------------------------------------------------------------------------------+ -// |Donald Trump |https://en.wikipedia.org/?curid=4848272, https://en.wikipedia.org/?curid=31698421, https://en.wikipedia.org/?curid=55907961| -// |Christina Aguilera|https://en.wikipedia.org/?curid=144171, https://en.wikipedia.org/?curid=6636454 | -// +------------------+------------------------------------------------------------------------------------------------------------------------+ -// -{%- endcapture -%} +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained() + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") + +val sentence_embeddings = new SentenceEmbeddings() + .setInputCols(Array("sentence","embeddings")) + .setOutputCol("sentence_embeddings") + +val ner_model = NerDLModel.pretrained() + .setInputCols(Array("sentence","token","embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence","token","ner")) + .setOutputCol("ner_chunk") + .setWhiteList(Array("PER")) + +val disambiguator = new NerDisambiguator() + .setS3KnowledgeBaseName("i-per") + .setInputCols(Array("ner_chunk","sentence_embeddings")) + .setOutputCol("disambiguation") + .setTokenSearch(false) + +val pipeline = new Pipeline().setStages(Array( + documentAssembler, + sentenceDetector, + tokenizer, + word_embeddings, + sentence_embeddings, + ner_model, + ner_converter, + disambiguator)) + +val text = "The show also had a contestant named Donald Trump who later defeated Christina Aguilera ..." + +val df = Seq(text) .toDF("text") +val result = pipeline.fit(df) .transform(df) + +// Result -{%- capture approach_scala_legal -%} -from johnsnowlabs import * -// Extracting Person identities -// First define pipeline stages that extract entities and embeddings. Entities are filtered for PER type entities. 
-val data = Seq("The show also had a contestant named Donald Trump who later defeated Christina Aguilera ...") - .toDF("text") -val documentAssembler = new nlp.DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") -val sentenceDetector = new nlp.SentenceDetector() - .setInputCols("document") - .setOutputCol("sentence") -val tokenizer = new nlp.Tokenizer() - .setInputCols("sentence") - .setOutputCol("token") -val word_embeddings = nlp.WordEmbeddingsModel.pretrained() - .setInputCols(Array("sentence", "token")) - .setOutputCol("embeddings") -val sentence_embeddings = new nlp.SentenceEmbeddings() - .setInputCols(Array("sentence","embeddings")) - .setOutputCol("sentence_embeddings") -val ner_model = legal.NerModel.pretrained("legner_orgs_prods_alias", "en", "legal/models")\ - .setInputCols(Array("sentence", "token", "embeddings"))\ - .setOutputCol("ner") -val ner_converter = new nlp.NerConverter() - .setInputCols(Array("sentence", "token", "ner")) - .setOutputCol("ner_chunk") - .setWhiteList("PER") - -// Then the extracted entities can be disambiguated. -val disambiguator = new legal.NerDisambiguator() - #.setS3KnowledgeBaseName("i-per") - .setInputCols(Array("ner_chunk", "sentence_embeddings")) - .setOutputCol("disambiguation") - .setNumFirstChars(5) - -val nlpPipeline = new Pipeline().setStages(Array( - documentAssembler, - sentenceDetector, - tokenizer, - word_embeddings, - sentence_embeddings, - ner_model, - ner_converter, - disambiguator)) ++------------------+------------------------------------------------------------------------------------------------------------------------+ +|chunk |result | ++------------------+------------------------------------------------------------------------------------------------------------------------+ +|Donald Trump |http://en.wikipedia.org/?curid=55907961, http://en.wikipedia.org/?curid=31698421, http://en.wikipedia.org/?curid=4848272| +|Christina Aguilera|http://en.wikipedia.org/?curid=6636454, http://en.wikipedia.org/?curid=144171 | ++------------------+------------------------------------------------------------------------------------------------------------------------+ +{%- endcapture -%} +{%- capture model_api_link -%} +[NerDisambiguatorModel](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/disambiguation/NerDisambiguatorModel.html) {%- endcapture -%} -{%- capture approach_scala_finance -%} -from johnsnowlabs import * -// Extracting Person identities -// First define pipeline stages that extract entities and embeddings. Entities are filtered for PER type entities. 
-val data = Seq("The show also had a contestant named Donald Trump who later defeated Christina Aguilera ...") - .toDF("text") -val documentAssembler = new nlp.DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") -val sentenceDetector = new nlp.SentenceDetector() - .setInputCols("document") - .setOutputCol("sentence") -val tokenizer = new nlp.Tokenizer() - .setInputCols("sentence") - .setOutputCol("token") -val word_embeddings = nlp.WordEmbeddingsModel.pretrained() - .setInputCols(Array("sentence", "token")) - .setOutputCol("embeddings") -val sentence_embeddings = new nlp.SentenceEmbeddings() - .setInputCols(Array("sentence","embeddings")) - .setOutputCol("sentence_embeddings") -val ner_model = finance.NerModel.pretrained("finner_orgs_prods_alias","en","finance/models")\ - .setInputCols(Array("sentence", "token", "embeddings")) \ - .setOutputCol("ner") -val ner_converter = new nlp.NerConverter() - .setInputCols(Array("sentence", "token", "ner")) - .setOutputCol("ner_chunk") - .setWhiteList("PER") - -// Then the extracted entities can be disambiguated. -val disambiguator = new finance.NerDisambiguator() - #.setS3KnowledgeBaseName("i-per") - .setInputCols(Array("ner_chunk", "sentence_embeddings")) - .setOutputCol("disambiguation") - .setNumFirstChars(5) - -val nlpPipeline = new Pipeline().setStages(Array( - documentAssembler, - sentenceDetector, - tokenizer, - word_embeddings, - sentence_embeddings, - ner_model, - ner_converter, - disambiguator)) +{%- capture model_python_api_link -%} +[NerDisambiguatorModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/disambiguation/ner_disambiguator/index.html#sparknlp_jsl.annotator.disambiguation.ner_disambiguator.NerDisambiguatorModel) +{%- endcapture -%} +{%- capture model_notebook_link -%} +[NerDisambiguatorModelNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/NerDisambiguatorModel.ipynb) {%- endcapture -%} +{%- capture approach_description -%} +Links words of interest, such as names of persons, locations and companies, from an input text document to +a corresponding unique entity in a target Knowledge Base (KB). Words of interest are called Named Entities (NEs), +mentions, or surface forms. +The model needs extracted CHUNKS and SENTENCE_EMBEDDINGS type input from e.g. +[SentenceEmbeddings](/docs/en/annotators#sentenceembeddings) and +[NerConverter](/docs/en/annotators#nerconverter). 
+{%- endcapture -%}

{%- capture approach_input_anno -%}
CHUNK, SENTENCE_EMBEDDINGS
{%- endcapture -%}

{%- capture approach_output_anno -%}
DISAMBIGUATION
{%- endcapture -%}

{%- capture approach_api_link -%}
[NerDisambiguator](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/disambiguation/NerDisambiguator.html)
@@ -393,17 +223,14 @@ model=model
model_description=model_description
model_input_anno=model_input_anno
model_output_anno=model_output_anno
+model_python_medical=model_python_medical
+model_scala_medical=model_scala_medical
model_api_link=model_api_link
model_python_api_link=model_python_api_link
+model_notebook_link=model_notebook_link
approach_description=approach_description
approach_input_anno=approach_input_anno
approach_output_anno=approach_output_anno
-approach_python_medical=approach_python_medical
-approach_python_legal=approach_python_legal
-approach_python_finance=approach_python_finance
-approach_scala_medical=approach_scala_medical
-approach_scala_legal=approach_scala_legal
-approach_scala_finance=approach_scala_finance
approach_api_link=approach_api_link
approach_python_api_link=approach_python_api_link
%}
diff --git a/docs/en/licensed_annotator_entries/NerModel.md b/docs/en/licensed_annotator_entries/NerModel.md
index 7cadbb7104..9e45552ac9 100644
--- a/docs/en/licensed_annotator_entries/NerModel.md
+++ b/docs/en/licensed_annotator_entries/NerModel.md
@@ -11,28 +11,25 @@ model
{%- endcapture -%}

{%- capture model_description -%}
-This Named Entity recognition annotator is a generic NER model based on Neural Networks.
+`NerModel` is the Named Entity Recognition (NER) annotator that allows training a generic NER model based on Neural Networks.
The architecture of the neural network is a Char CNNs - BiLSTM - CRF that achieves state-of-the-art in most datasets.
+Note that some pretrained models require specific types of embeddings, depending on the embeddings they were trained with.

-Pretrained models can be loaded with `pretrained` of the companion object:
-```
-val nerModel = nlp.NerDLModel.pretrained()
-  .setInputCols("sentence", "token", "embeddings")
-  .setOutputCol("ner")
-```
-The default model is `"ner_clinical"`, if no name is provided.
+Parameters:

-For available pretrained models please see the [Models Hub](https://nlp.johnsnowlabs.com/models?task=Named+Entity+Recognition).
-Additionally, pretrained pipelines are available for this module, see [Pipelines](https://nlp.johnsnowlabs.com/docs/en/pipelines).
+- `setBatchSize`: (int) number of samples used in one iteration of training (Default: `32`).

-Note that some pretrained models require specific types of embeddings, depending on which they were trained on.
-For example, the default model `"ner_dl"` requires the
-[WordEmbeddings](/docs/en/annotators#wordembeddings) `"ner_clinical"`.
+- `setIncludeConfidence`: (Boolean) whether to include confidence scores in annotation metadata (Default: False).

+- `setConfigProtoBytes`: (int) ConfigProto from TensorFlow, serialized into a byte array.

-For extended examples of usage, see the [Spark NLP Workshop](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.Clinical_Named_Entity_Recognition_Model.ipynb)
-(sections starting with `Training a Clinical NER`)
+- `setIncludeAllConfidenceScores`: (Boolean) whether to include confidence scores for all tags rather than just for the predicted one.
+
+- `setMinProbability`: (Float) defines the minimum probability value.
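+
+For example, a pretrained model can be loaded and configured as follows (a minimal sketch; the model name and parameter values are illustrative):
+
+```python
+from johnsnowlabs import medical
+
+# Load a pretrained clinical NER model and expose confidence scores
+# for the predicted tags in the annotation metadata.
+ner_model = medical.NerModel.pretrained("ner_jsl", "en", "clinical/models") \
+    .setInputCols(["sentence", "token", "embeddings"]) \
+    .setOutputCol("ner") \
+    .setIncludeConfidence(True) \
+    .setIncludeAllConfidenceScores(False)
+```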
+ +For available pretrained models please see the [Models Hub](https://nlp.johnsnowlabs.com/models?task=Named+Entity+Recognition). +Additionally, pretrained pipelines are available for this module, see the [Pipelines](https://nlp.johnsnowlabs.com/docs/en/pipelines). +For extended examples of usage, see the [Spark NLP Workshop](https://github.com/JohnSnowLabs/spark-nlp-workshop/tree/master) {%- endcapture -%} {%- capture model_input_anno -%} @@ -44,7 +41,7 @@ NAMED_ENTITY {%- endcapture -%} {%- capture model_python_medical -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, medical documentAssembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -52,7 +49,7 @@ documentAssembler = nlp.DocumentAssembler()\ sentenceDetector = nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") \ .setInputCols(["document"]) \ - .setOutputCol("sentence") + .setOutputCol("sentence") tokenizer = nlp.Tokenizer()\ .setInputCols(["sentence"])\ @@ -62,254 +59,407 @@ word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en" .setInputCols(["sentence", "token"])\ .setOutputCol("embeddings") - jsl_ner = medical.NerModel.pretrained("ner_jsl", "en", "clinical/models") \ .setInputCols(["sentence", "token", "embeddings"]) \ .setOutputCol("jsl_ner") - -jsl_ner_converter = nlp.NerConverter() \ + +jsl_ner_converter = medical.NerConverterInternal() \ .setInputCols(["sentence", "token", "jsl_ner"]) \ - .setOutputCol("jsl_ner_chunk") + .setOutputCol("ner_chunk") -jsl_ner_pipeline = Pipeline(stages=[ - documentAssembler, +jsl_ner_pipeline = nlp.Pipeline(stages=[ + documentAssembler, sentenceDetector, tokenizer, word_embeddings, jsl_ner, jsl_ner_converter]) -result = jsl_ner_pipeline.fit(data).transform(data) +text = ''' +A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus (T2DM), one prior episode of HTG-induced pancreatitis three years prior to presentation, and associated with an acute hepatitis, presented with a one-week history of polyuria, poor appetite, and vomiting. +She was on metformin, glipizide, and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG. She had been on dapagliflozin for six months at the time of presentation. +Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness, guarding, or rigidity. Pertinent laboratory findings on admission were: serum glucose 111 mg/dl, creatinine 0.4 mg/dL, triglycerides 508 mg/dL, total cholesterol 122 mg/dL, and venous pH 7.27. 
+''' +data = spark.createDataFrame([[text]]).toDF("text") +result = jsl_ner_pipeline.fit(data).transform(data) +result.select(F.explode(F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)).alias("cols"))\ + .select(F.expr("cols['0']").alias("chunk"), + F.expr("cols['1']['entity']").alias("ner_label")).show(100, truncate=False) + ++-----------------------------+----------------------------+ +|chunk |ner_label | ++-----------------------------+----------------------------+ +|28-year-old |Age | +|female |Gender | +|gestational diabetes mellitus|Diabetes | +|eight years prior |RelativeDate | +|type two diabetes mellitus |Diabetes | +|T2DM |Diabetes | +|HTG-induced pancreatitis |Disease_Syndrome_Disorder | +|three years prior |RelativeDate | +|acute |Modifier | +|hepatitis |Disease_Syndrome_Disorder | +|one-week |Duration | +|polyuria |Symptom | +|poor appetite |Symptom | +|vomiting |Symptom | +|She |Gender | +|metformin |Drug_Ingredient | +|glipizide |Drug_Ingredient | +|dapagliflozin |Drug_Ingredient | +|T2DM |Diabetes | +|atorvastatin |Drug_Ingredient | +|gemfibrozil |Drug_Ingredient | +|HTG |Hyperlipidemia | +|She |Gender | +|dapagliflozin |Drug_Ingredient | +|for six months |Duration | +|dry oral mucosa |Symptom | +|her |Gender | +|abdominal |External_body_part_or_region| +|tenderness |Symptom | +|guarding |Symptom | +|rigidity |Symptom | +|admission |Admission_Discharge | +|serum glucose |Test | +|111 mg/dl |Test_Result | +|creatinine |Test | +|0.4 mg/dL |Test_Result | +|triglycerides |Triglycerides | +|508 mg/dL |Test_Result | +|total cholesterol 122 mg/dL |Total_Cholesterol | +|venous pH |Test | +|7.27 |Test_Result | ++-----------------------------+----------------------------+ {%- endcapture -%} {%- capture model_scala_medical -%} -from johnsnowlabs import * -val documentAssembler = new nlp.DocumentAssembler() +import spark.implicits._ + +val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") -val sentenceDetector = nlp.SentenceDetectorDLModel - .pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") - .setOutputCol("sentence") +val sentenceDetector = SentenceDetector.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") + .setInputCols(Array("document")) + .setOutputCol("sentence") -val tokenizer = new nlp.Tokenizer() - .setInputCols("sentence") +val tokenizer = new Tokenizer() + .setInputCols(Array("sentence")) .setOutputCol("token") -val word_embeddings = nlp.WordEmbeddingsModel - .pretrained("embeddings_clinical", "en", "clinical/models") - .setInputCols(Array("sentence", "token")) - .setOutputCol("embeddings") - -val jsl_ner = medical.NerModel - .pretrained("ner_jsl", "en", "clinical/models") - .setInputCols(Array("sentence", "token","embeddings")) - .setOutputCol("jsl_ner") - -val jsl_ner_converter = new nlp.NerConverter() - .setInputCols(Array("sentence", "token", "jsl_ner")) - .setOutputCol("jsl_ner_chunk") - -val pipeline = new Pipeline().setStages(Array( - documentAssembler, - sentenceDetector, - tokenizer, - word_embeddings, - jsl_ner, - jsl_ner_converter -)) - -val result = pipeline.fit(data).transform(data) - +val wordEmbeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") +val jslNer = NerModel.pretrained("ner_jsl", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("jsl_ner") + +val jslNerConverter = new 
NerConverter() + .setInputCols(Array("sentence", "token", "jsl_ner")) + .setOutputCol("ner_chunk") + +val jslNerPipeline = new Pipeline() + .setStages(Array(documentAssembler, + sentenceDetector, + tokenizer, + wordEmbeddings, + jslNer, + jslNerConverter)) + +val text = "A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus (T2DM), one prior episode of HTG-induced pancreatitis three years prior to presentation, and associated with an acute hepatitis, presented with a one-week history of polyuria, poor appetite, and vomiting. +She was on metformin, glipizide, and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG. She had been on dapagliflozin for six months at the time of presentation. +Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness, guarding, or rigidity. Pertinent laboratory findings on admission were: serum glucose 111 mg/dl, creatinine 0.4 mg/dL, triglycerides 508 mg/dL, total cholesterol 122 mg/dL, and venous pH 7.27." + +val data = Seq(text).toDF("text") + +val result = jslNerPipeline.fit(data).transform(data) + ++-----------------------------+----------------------------+ +|chunk |ner_label | ++-----------------------------+----------------------------+ +|28-year-old |Age | +|female |Gender | +|gestational diabetes mellitus|Diabetes | +|eight years prior |RelativeDate | +|type two diabetes mellitus |Diabetes | +|T2DM |Diabetes | +|HTG-induced pancreatitis |Disease_Syndrome_Disorder | +|three years prior |RelativeDate | +|acute |Modifier | +|hepatitis |Disease_Syndrome_Disorder | +|one-week |Duration | +|polyuria |Symptom | +|poor appetite |Symptom | +|vomiting |Symptom | +|She |Gender | +|metformin |Drug_Ingredient | +|glipizide |Drug_Ingredient | +|dapagliflozin |Drug_Ingredient | +|T2DM |Diabetes | +|atorvastatin |Drug_Ingredient | +|gemfibrozil |Drug_Ingredient | +|HTG |Hyperlipidemia | +|She |Gender | +|dapagliflozin |Drug_Ingredient | +|for six months |Duration | +|dry oral mucosa |Symptom | +|her |Gender | +|abdominal |External_body_part_or_region| +|tenderness |Symptom | +|guarding |Symptom | +|rigidity |Symptom | +|admission |Admission_Discharge | +|serum glucose |Test | +|111 mg/dl |Test_Result | +|creatinine |Test | +|0.4 mg/dL |Test_Result | +|triglycerides |Triglycerides | +|508 mg/dL |Test_Result | +|total cholesterol 122 mg/dL |Total_Cholesterol | +|venous pH |Test | +|7.27 |Test_Result | ++-----------------------------+----------------------------+ {%- endcapture -%} - {%- capture model_python_legal -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, legal documentAssembler = nlp.DocumentAssembler()\ - .setInputCol("text")\ - .setOutputCol("document") - + .setInputCol("text")\ + .setOutputCol("document") + sentenceDetector = nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx")\ - .setInputCols(["document"])\ - .setOutputCol("sentence") + .setInputCols(["document"])\ + .setOutputCol("sentence") tokenizer = nlp.Tokenizer()\ - .setInputCols(["sentence"])\ - .setOutputCol("token") + .setInputCols(["sentence"])\ + .setOutputCol("token") -embeddings = nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base","en") \ - .setInputCols(["sentence", "token"]) \ - .setOutputCol("embeddings") +embeddings = nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en") \ + .setInputCols("sentence", "token") \ + 
.setOutputCol("embeddings")\ -ner_model = legal.NerModel.pretrained("legner_headers", "en", "legal/models")\ - .setInputCols(["sentence", "token", "embeddings"])\ - .setOutputCol("ner") +ner_model = legal.NerModel.pretrained("legner_contract_doc_parties", "en", "legal/models")\ + .setInputCols(["sentence", "token", "embeddings"])\ + .setOutputCol("ner") ner_converter = nlp.NerConverter()\ - .setInputCols(["sentence","token","ner"])\ - .setOutputCol("ner_chunk") - -nlpPipeline = Pipeline(stages=[ - documentAssembler, - sentenceDetector, - tokenizer, - embeddings, - ner_model, - ner_converter]) - - - -result = nlpPipeline.fit(data).transform(data) - + .setInputCols(["sentence","token","ner"])\ + .setOutputCol("ner_chunk") +pipeline = nlp.Pipeline(stages=[ + documentAssembler, + sentenceDetector, + tokenizer, + embeddings, + ner_model, + ner_converter]) + +text = """EXCLUSIVE DISTRIBUTOR AGREEMENT (" Agreement ") dated as April 15, 1994 by and between IMRS OPERATIONS INC., a Delaware corporation with its principal place of business at 777 Long Ridge Road, Stamford, Connecticut 06902, U.S.A. (hereinafter referred to as " Developer ") and Delteq Pte Ltd, a Singapore company (and a subsidiary of Wuthelam Industries (S) Pte LTD ) with its principal place of business at 215 Henderson Road , #101-03 Henderson Industrial Park , Singapore 0315 ( hereinafter referred to as " Distributor ").""" + +data = spark.createDataFrame([[text]]).toDF("text") + +result = pipeline.fit(data).transform(data) + +result.select(F.explode(F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)).alias("cols"))\ + .select(F.expr("cols['0']").alias("chunk"), + F.expr("cols['1']['entity']").alias("ner_label")).show(100, truncate=False) + ++-------------------------------+---------+ +|chunk |ner_label| ++-------------------------------+---------+ +|EXCLUSIVE DISTRIBUTOR AGREEMENT|DOC | +|April 15, 1994 |EFFDATE | +|IMRS OPERATIONS INC |PARTY | +|Developer |ALIAS | +|Delteq Pte Ltd |PARTY | +|Distributor |ALIAS | ++-------------------------------+---------+ {%- endcapture -%} {%- capture model_scala_legal -%} -from johnsnowlabs import * -val documentAssembler = new nlp.DocumentAssembler() +import spark.implicits._ + +val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") -val sentenceDetector = nlp.SentenceDetectorDLModel - .pretrained("sentence_detector_dl","xx") - .setInputCols("document") - .setOutputCol("sentence") - +val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") + .setInputCols(Array("document")) + .setOutputCol("sentence") -val tokenizer = new nlp.Tokenizer() - .setInputCols("sentence") +val tokenizer = new Tokenizer() + .setInputCols(Array("sentence")) .setOutputCol("token") - -val embeddings = nlp.RoBertaEmbeddings - .pretrained("roberta_embeddings_legal_roberta_base", "en") - .setInputCols(Array("sentence", "token")) - .setOutputCol("embeddings") - - -val ner_model = legal.NerModel - .pretrained("legner_headers", "en", "legal/models") - .setInputCols(Array("sentence", "token","embeddings")) - .setOutputCol("ner") - - -val ner_converter = new nlp.NerConverter() - .setInputCols(Array("sentence", "token", "ner")) - .setOutputCol("ner_chunk") - +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") +val nerModel = NerModel.pretrained("legner_contract_doc_parties", "en", "legal/models") + .setInputCols(Array("sentence", 
"token", "embeddings")) + .setOutputCol("ner") +val nerConverter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") val pipeline = new Pipeline().setStages(Array( documentAssembler, sentenceDetector, tokenizer, embeddings, - ner_model, - ner_converter -)) - -val result = pipeline.fit(data).transform(data) - - -{%- endcapture -%} + nerModel, + nerConverter)) +val text = """EXCLUSIVE DISTRIBUTOR AGREEMENT ("Agreement") dated as April 15, 1994 by and between IMRS OPERATIONS INC., a Delaware corporation with its principal place of business at 777 Long Ridge Road, Stamford, Connecticut 06902, U.S.A. (hereinafter referred to as "Developer") and Delteq Pte Ltd, a Singapore company (and a subsidiary of Wuthelam Industries (S) Pte LTD) with its principal place of business at 215 Henderson Road, #101-03 Henderson Industrial Park, Singapore 0315 (hereinafter referred to as "Distributor").""" +val data = Seq(text).toDF("text") +val result = pipeline.fit(data).transform(data) ++-------------------------------+---------+ +|chunk |ner_label| ++-------------------------------+---------+ +|EXCLUSIVE DISTRIBUTOR AGREEMENT|DOC | +|April 15, 1994 |EFFDATE | +|IMRS OPERATIONS INC |PARTY | +|Developer |ALIAS | +|Delteq Pte Ltd |PARTY | +|Distributor |ALIAS | ++-------------------------------+---------+ +{%- endcapture -%} {%- capture model_python_finance -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, finance documentAssembler = nlp.DocumentAssembler()\ - .setInputCol("text")\ - .setOutputCol("document") - -sentenceDetector = nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx")\ - .setInputCols(["document"])\ - .setOutputCol("sentence") + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") tokenizer = nlp.Tokenizer()\ - .setInputCols(["sentence"])\ - .setOutputCol("token") + .setInputCols(["sentence"])\ + .setOutputCol("token") -embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \ - .setInputCols(["sentence", "token"]) \ +embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_legal_bert_base_uncased","en")\ + .setInputCols(["sentence", "token"])\ .setOutputCol("embeddings") -ner_model = finance.NerModel.pretrained("finner_headers", "en", "finance/models")\ - .setInputCols(["sentence", "token", "embeddings"])\ - .setOutputCol("ner") - -ner_converter = nlp.NerConverter()\ - .setInputCols(["sentence","token","ner"])\ - .setOutputCol("ner_chunk") - -nlpPipeline = Pipeline(stages=[ - documentAssembler, - sentenceDetector, - tokenizer, - embeddings, - ner_model, - ner_converter]) - - +ner_model = finance.NerModel.pretrained("finner_sec_conll", "en", "finance/models") \ + .setInputCols(["sentence", "token", "embeddings"])\ + .setOutputCol("ner") -result = nlpPipeline.fit(data).transform(data) +ner_converter = finance.NerConverterInternal()\ + .setInputCols(["sentence","token","ner"])\ + .setOutputCol("ner_chunk") +pipeline = nlp.Pipeline(stages=[ + documentAssembler, + sentence_detector, + tokenizer, + embeddings, + ner_model, + ner_converter]) + +text = '''December 2007 SUBORDINATED LOAN AGREEMENT. 
THIS LOAN AGREEMENT is made on 7th December, 2007 BETWEEN: (1) SILICIUM DE PROVENCE S.A.S., a private company with limited liability, incorporated under the laws of France, whose registered office is situated at Usine de Saint Auban, France, represented by Mr.Frank Wouters, hereinafter referred to as the "Borrower", and ( 2 ) EVERGREEN SOLAR INC., a company incorporated in Delaware, U.S.A., with registered number 2426798, whose registered office is situated at Bartlett Street, Marlboro, Massachusetts, U.S.A. represented by Richard Chleboski, hereinafter referred to as "Lender" ''' + +data = spark.createDataFrame([[text]]).toDF("text") + +result = pipeline.fit(data).transform(data) + +result.select(F.explode(F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)).alias("cols"))\ + .select(F.expr("cols['0']").alias("chunk"), + F.expr("cols['1']['entity']").alias("ner_label")).show(100, truncate=False) ++--------------------------+---------+ +|chunk |ner_label| ++--------------------------+---------+ +|SILICIUM DE PROVENCE S.A.S|ORG | +|France |LOC | +|Usine de Saint Auban |LOC | +|France |LOC | +|Mr.Frank Wouters |PER | +|Borrower |PER | +|EVERGREEN SOLAR INC |ORG | +|Delaware |LOC | +|U.S.A |LOC | +|Bartlett Street |LOC | +|Marlboro |LOC | +|Massachusetts |LOC | +|U.S.A |LOC | +|Richard Chleboski |PER | +|Lender |PER | ++--------------------------+---------+ {%- endcapture -%} + {%- capture model_scala_finance -%} -from johnsnowlabs import * -val documentAssembler = new nlp.DocumentAssembler() +import spark.implicits._ + +val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") -val sentenceDetector = nlp.SentenceDetectorDLModel - .pretrained("sentence_detector_dl","xx") - .setInputCols("document") - .setOutputCol("sentence") - +val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") + .setInputCols(Array("document")) + .setOutputCol("sentence") -val tokenizer = new nlp.Tokenizer() - .setInputCols("sentence") +val tokenizer = new Tokenizer() + .setInputCols(Array("sentence")) .setOutputCol("token") - -val embeddings = nlp.BertEmbeddings - .pretrained("bert_embeddings_sec_bert_base", "en") - .setInputCols(Array("sentence", "token")) - .setOutputCol("embeddings") - - -val ner_model = finance.NerModel - .pretrained("finner_headers", "en", "finance/models") - .setInputCols(Array("sentence", "token","embeddings")) - .setOutputCol("ner") - +val embeddings = BertEmbeddings.pretrained("bert_embeddings_legal_bert_base_uncased", "en") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") -val ner_converter = new nlp.NerConverter() - .setInputCols(Array("sentence", "token", "ner")) - .setOutputCol("ner_chunk") +val nerModel = NerModel.pretrained("finner_sec_conll", "en", "finance/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") +val nerConverter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") val pipeline = new Pipeline().setStages(Array( documentAssembler, sentenceDetector, tokenizer, embeddings, - ner_model, - ner_converter -)) + nerModel, + nerConverter)) -val result = pipeline.fit(data).transform(data) +val text = '''December 2007 SUBORDINATED LOAN AGREEMENT. 
THIS LOAN AGREEMENT is made on 7th December, 2007 BETWEEN: (1) SILICIUM DE PROVENCE S.A.S., a private company with limited liability, incorporated under the laws of France, whose registered office is situated at Usine de Saint Auban, France, represented by Mr.Frank Wouters, hereinafter referred to as the "Borrower", and ( 2 ) EVERGREEN SOLAR INC., a company incorporated in Delaware, U.S.A., with registered number 2426798, whose registered office is situated at Bartlett Street, Marlboro, Massachusetts, U.S.A. represented by Richard Chleboski, hereinafter referred to as "Lender" ''' +val data = Seq((text)).toDF("text") + +val result = pipeline.fit(data).transform(data) ++--------------------------+---------+ +|chunk |ner_label| ++--------------------------+---------+ +|SILICIUM DE PROVENCE S.A.S|ORG | +|France |LOC | +|Usine de Saint Auban |LOC | +|France |LOC | +|Mr.Frank Wouters |PER | +|Borrower |PER | +|EVERGREEN SOLAR INC |ORG | +|Delaware |LOC | +|U.S.A |LOC | +|Bartlett Street |LOC | +|Marlboro |LOC | +|Massachusetts |LOC | +|U.S.A |LOC | +|Richard Chleboski |PER | +|Lender |PER | ++--------------------------+---------+ {%- endcapture -%} {%- capture model_api_link -%} @@ -320,6 +470,10 @@ val result = pipeline.fit(data).transform(data) [MedicalNerModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/ner/medical_ner/index.html#sparknlp_jsl.annotator.ner.medical_ner.MedicalNerModel) {%- endcapture -%} +{%- capture model_notebook_link -%} +[MedicalNerModelNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/MedicalNerModel.ipynb) +{%- endcapture -%} + {%- capture approach_description -%} This Named Entity recognition annotator allows to train generic NER model based on Neural Networks. 
@@ -341,8 +495,6 @@ For extended examples of usage, see the [Spark NLP Workshop](https://github.com/ {%- endcapture -%} - - {%- capture approach_input_anno -%} DOCUMENT, TOKEN, WORD_EMBEDDINGS {%- endcapture -%} @@ -352,7 +504,7 @@ NAMED_ENTITY {%- endcapture -%} {%- capture approach_python_medical -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, medical # First extract the prerequisites for the NerDLApproach documentAssembler = nlp.DocumentAssembler() \ @@ -388,7 +540,7 @@ nerTagger = medical.NerApproach()\ .setGraphFolder('medical_ner_graphs')\ .setEnableMemoryOptimizer(True) #>> if you have a limited memory and a large conll file, you can set this True to train batch by batch -pipeline = Pipeline().setStages([ +pipeline = nlp.Pipeline().setStages([ documentAssembler, sentence, tokenizer, @@ -405,7 +557,7 @@ pipelineModel = pipeline.fit(trainingData) {%- endcapture -%} {%- capture approach_python_legal -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, legal # First extract the prerequisites for the NerDLApproach documentAssembler = nlp.DocumentAssembler() \ @@ -441,7 +593,7 @@ nerTagger = legal.NerApproach()\ .setGraphFolder('medical_ner_graphs')\ .setEnableMemoryOptimizer(True) #>> if you have a limited memory and a large conll file, you can set this True to train batch by batch -pipeline = Pipeline().setStages([ +pipeline = nlp.Pipeline().setStages([ documentAssembler, sentence, tokenizer, @@ -451,7 +603,7 @@ nerTagger {%- endcapture -%} {%- capture approach_python_finance -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, finance # First extract the prerequisites for the NerDLApproach documentAssembler = nlp.DocumentAssembler() \ @@ -487,7 +639,7 @@ nerTagger = finance.NerApproach()\ .setGraphFolder('medical_ner_graphs')\ .setEnableMemoryOptimizer(True) #>> if you have a limited memory and a large conll file, you can set this True to train batch by batch -pipeline = Pipeline().setStages([ +pipeline = nlp.Pipeline().setStages([ documentAssembler, sentence, tokenizer, @@ -499,27 +651,28 @@ nerTagger {%- capture approach_scala_medical -%} -from johnsnowlabs import * +import spark.implicits._ + // First extract the prerequisites for the NerDLApproach -val documentAssembler = new nlp.DocumentAssembler() +val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") -val sentence = new nlp.SentenceDetector() +val sentence = new SentenceDetector() .setInputCols("document") .setOutputCol("sentence") -val tokenizer = new nlp.Tokenizer() +val tokenizer = new Tokenizer() .setInputCols("sentence") .setOutputCol("token") -val embeddings = nlp.WordEmbeddingsModel +val embeddings = WordEmbeddingsModel .pretrained('embeddings_clinical', "en", "clinical/models") .setInputCols(Array("sentence", "token")) .setOutputCol("embeddings") // Then the training can start -val nerTagger =new medical.NerApproach() +val nerTagger =new MedicalNerApproach() .setInputCols(Array("sentence", "token", "embeddings")) .setLabelColumn("label") .setOutputCol("ner") @@ -550,27 +703,28 @@ val pipelineModel = pipeline.fit(trainingData) {%- capture approach_scala_legal -%} -from johnsnowlabs import * +import spark.implicits._ + // First extract the prerequisites for the NerDLApproach -val documentAssembler = new nlp.DocumentAssembler() +val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") -val sentence = new nlp.SentenceDetector() +val sentence = new SentenceDetector() .setInputCols("document") 
.setOutputCol("sentence") -val tokenizer = new nlp.Tokenizer() +val tokenizer = new Tokenizer() .setInputCols("sentence") .setOutputCol("token") -val embeddings = nlp.WordEmbeddingsModel +val embeddings = WordEmbeddingsModel .pretrained('embeddings_clinical', "en", "clinical/models") .setInputCols(Array("sentence", "token")) .setOutputCol("embeddings") // Then the training can start -val nerTagger =new legal.NerApproach() +val nerTagger =new LegalNerApproach() .setInputCols(Array("sentence", "token", "embeddings")) .setLabelColumn("label") .setOutputCol("ner") @@ -593,27 +747,28 @@ val pipeline = new Pipeline().setStages(Array( {%- endcapture -%} {%- capture approach_scala_finance -%} -from johnsnowlabs import * +import spark.implicits._ + // First extract the prerequisites for the NerDLApproach -val documentAssembler = new nlp.DocumentAssembler() +val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") -val sentence = new nlp.SentenceDetector() +val sentence = new SentenceDetector() .setInputCols("document") .setOutputCol("sentence") -val tokenizer = new nlp.Tokenizer() +val tokenizer = new Tokenizer() .setInputCols("sentence") .setOutputCol("token") -val embeddings = nlp.WordEmbeddingsModel +val embeddings = WordEmbeddingsModel .pretrained('embeddings_clinical', "en", "clinical/models") .setInputCols(Array("sentence", "token")) .setOutputCol("embeddings") // Then the training can start -val nerTagger =new finance.NerApproach() +val nerTagger =new FinanceNerApproach() .setInputCols(Array("sentence", "token", "embeddings")) .setLabelColumn("label") .setOutputCol("ner") @@ -644,6 +799,10 @@ val pipeline = new Pipeline().setStages(Array( [MedicalNerApproach](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/ner/medical_ner/index.html#sparknlp_jsl.annotator.ner.medical_ner.MedicalNerApproach) {%- endcapture -%} +{%- capture approach_notebook_link -%} +[MedicalNerApproachNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/MedicalNerApproach.ipynb) +{%- endcapture -%} + {% include templates/licensed_approach_model_medical_fin_leg_template.md title=title model=model @@ -659,6 +818,7 @@ model_python_finance=model_python_finance model_scala_finance=model_scala_finance model_api_link=model_api_link model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link approach_description=approach_description approach_input_anno=approach_input_anno approach_output_anno=approach_output_anno @@ -670,4 +830,5 @@ approach_scala_legal=approach_scala_legal approach_scala_finance=approach_scala_finance approach_api_link=approach_api_link approach_python_api_link=approach_python_api_link +approach_notebook_link=approach_notebook_link %} diff --git a/docs/en/licensed_annotator_entries/NerQuestionGenerator.md b/docs/en/licensed_annotator_entries/NerQuestionGenerator.md index 95b8f94c9f..8ec21b7c7d 100644 --- a/docs/en/licensed_annotator_entries/NerQuestionGenerator.md +++ b/docs/en/licensed_annotator_entries/NerQuestionGenerator.md @@ -15,9 +15,13 @@ The question is generated in the form of `[QUESTIONPRONOUN] [ENTITY1] [ENTITY2] Parametres: - `questionPronoun`: Pronoun to be used in the question. E.g., 'When', 'Where', 'Why', 'How', 'Who', 'What'. + - `strategyType`: Strategy for the proccess, either `Paired` (default) or `Combined`. + - `questionMark`: Whether to add a question mark at the end of the question. 
+ - `entities1`: List with the entity types of entities that appear first in the question. + - `entities2`: List with the entity types of entities that appear second in the question. diff --git a/docs/en/licensed_annotator_entries/QuestionAnswering.md b/docs/en/licensed_annotator_entries/QuestionAnswering.md index eebeab5dc9..f0a8ec30d9 100644 --- a/docs/en/licensed_annotator_entries/QuestionAnswering.md +++ b/docs/en/licensed_annotator_entries/QuestionAnswering.md @@ -10,6 +10,7 @@ model QuestionAnswering is a GPT-based model for answering questions given a context. Unlike span-based models, it generates the answers to the questions, rather than selecting phrases from the given context. The model is capable of answering various types of questions, including yes-no or full-text ones. Types of questions are supported: `"short"` (producing yes/no/maybe) answers and `"long"` (full answers). Parameters: + - `questionType`: Question type, e.g. “short” or “long”. The question types depend on the model. - `maxNewTokens`: Maximum number of of new tokens to generate, by default 30 @@ -106,8 +107,6 @@ val question = "Is there an optimal time of acid suppression for maximal healing val data = Seq(paperAbstract, question).toDF("context", "question") -data.show() - +------------------------------------------------------------+------------------------------------------------------------+ | context| question| +------------------------------------------------------------+------------------------------------------------------------+ @@ -116,7 +115,6 @@ data.show() val result = pipeline.fit(data).transform(data) -result.selectExpr("document_question.result as Question", "answer.result as Long_Answer").show(false) +-------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ |Question |Long_Answer | @@ -215,8 +213,6 @@ val pipeline = new Pipeline().setStages(Array(documentAssembler, legQA)) val result = pipeline.fit(data).transform(data) -result.selectExpr("document_question.result as Question", "answer.result as Answer").show(false) - +----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+ |Question |Answer | +----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+ @@ -303,8 +299,6 @@ val questions = Seq( val data = questions.map(q => (context, q)).toDF("context", "question") -data.show(false) - +------------------------------------------------------------------+--------------------------------------------------------------------------------+ | question| context| +------------------------------------------------------------------+--------------------------------------------------------------------------------+ @@ -315,8 +309,6 @@ data.show(false) val result = pipeline.fit(data).transform(data) -result.selectExpr("question.result", "answer.result").show(false) - 
+------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|question |result |
+------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
@@ -334,10 +326,6 @@ result.selectExpr("question.result", "answer.result").show(false)
[MedicalQuestionAnswering](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/qa/medical_qa/index.html#sparknlp_jsl.annotator.qa.medical_qa.MedicalQuestionAnswering)
{%- endcapture -%}

-{%- capture model_notebook_link -%}
-
-{%- endcapture -%}
-
{% include templates/licensed_approach_model_medical_fin_leg_template.md
title=title
model=model
@@ -352,4 +340,4 @@ model_python_finance=model_python_finance
model_scala_finance=model_scala_finance
model_api_link=model_api_link
model_python_api_link=model_python_api_link
-model_notebook_link=model_notebook_link%}
+%}
diff --git a/docs/en/licensed_annotator_entries/RelationExtraction.md b/docs/en/licensed_annotator_entries/RelationExtraction.md
index 44d98abb49..6819b9c796 100644
--- a/docs/en/licensed_annotator_entries/RelationExtraction.md
+++ b/docs/en/licensed_annotator_entries/RelationExtraction.md
@@ -13,6 +13,22 @@ model
{%- endcapture -%}

{%- capture model_description -%}
Extracts and classifies instances of relations between named entities.
+Parameters:
+
+- `predictionThreshold` *(Float)*: Sets the minimal activation of the target unit required to encode a new relation instance.
+
+- `relationPairs` *(List[Str])*: List of dash-separated pairs of named entities. For example, [“Biomarker-RelativeDay”] will process all relations between entities of type “Biomarker” and “RelativeDay”.
+
+- `relationPairsCaseSensitive` *(Bool)*: Determines whether relation pairs are case sensitive.
+
+- `relationTypePerPair` *(dict[str, list[str]])*: List of entity pairs per relation type, limiting which entities can form each relation. For example, {“CAUSE”: [“PROBLEM”, “SYMPTOM”]} only allows a “CAUSE” relation to hold between a problem (“PROBLEM”) and a symptom (“SYMPTOM”).
+
+- `maxSyntacticDistance` *(Int)*: Maximal syntactic distance threshold (Default: 0). Determines how far the “from entity” can be from the “to entity” in the text. Increasing this value increases recall, but also increases the number of false positives.
+
+- `customLabels` *(dict[str, str])*: Custom relation labels.
+
+- `multiClass` *(Bool)*: If multiClass is set, the model will return all the labels with corresponding scores (Default: False).
+
 For pretrained models please see the [Models Hub](https://nlp.johnsnowlabs.com/models?task=Relation+Extraction) for available models.
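+
+For example, a pretrained clinical relation extraction model can be restricted to specific entity pairs (a minimal sketch; the model name, relation pairs, and threshold values are illustrative):
+
+```python
+from johnsnowlabs import medical
+
+# Only consider Direction <-> body-part relations, within a limited
+# syntactic distance and above a minimum prediction confidence.
+re_model = medical.RelationExtractionModel.pretrained("re_bodypart_directions", "en", "clinical/models") \
+    .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"]) \
+    .setOutputCol("relations") \
+    .setRelationPairs(["direction-external_body_part_or_region"]) \
+    .setMaxSyntacticDistance(4) \
+    .setPredictionThreshold(0.9)
+```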
@@ -28,186 +44,162 @@ CATEGORY {%- capture model_python_medical -%} from johnsnowlabs import nlp, medical -# Relation Extraction between body parts -# Define pipeline stages to extract entities -documenter = nlp.DocumentAssembler() \ - .setInputCol("text") \ + +documenter = nlp.DocumentAssembler()\ + .setInputCol("text")\ .setOutputCol("document") -sentencer = nlp.SentenceDetector() \ - .setInputCols(["document"]) \ +sentencer = nlp.SentenceDetector()\ + .setInputCols(["document"])\ .setOutputCol("sentences") -tokenizer = nlp.Tokenizer() \ - .setInputCols(["sentences"]) \ +tokenizer = nlp.Tokenizer()\ + .setInputCols(["sentences"])\ .setOutputCol("tokens") -words_embedder = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \ - .setInputCols(["sentences", "tokens"]) \ +words_embedder = nlp.WordEmbeddingsModel()\ + .pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentences", "tokens"])\ .setOutputCol("embeddings") -pos_tagger = nlp.PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") \ - .setInputCols(["sentences", "tokens"]) \ +pos_tagger = nlp.PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ .setOutputCol("pos_tags") -dependency_parser = nlp.DependencyParserModel.pretrained("dependency_conllu", "en") \ - .setInputCols(["sentences", "pos_tags", "tokens"]) \ - .setOutputCol("dependencies") - -clinical_ner_tagger = medical.NerModel.pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models") \ - .setInputCols(["sentences", "tokens", "embeddings"]) \ +ner_tagger = medical.NerModel()\ + .pretrained("ner_posology", "en", "clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ .setOutputCol("ner_tags") -ner_chunker = nlp.NerConverter() \ - .setInputCols(["sentences", "tokens", "ner_tags"]) \ +ner_chunker = medical.NerConverterInternal()\ + .setInputCols(["sentences", "tokens", "ner_tags"])\ .setOutputCol("ner_chunks") -# Define the relations that are to be extracted -relationPairs = [ - "direction-external_body_part_or_region", - "external_body_part_or_region-direction", - "direction-internal_organ_or_component", - "internal_organ_or_component-direction" -] - -re_model = medical.RelationExtractionModel.pretrained("re_bodypart_directions", "en", "clinical/models") \ - .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"]) \ - .setOutputCol("relations") \ - .setRelationPairs(relationPairs) \ - .setMaxSyntacticDistance(4) \ - .setPredictionThreshold(0.9) - -pipeline = Pipeline().setStages([ +dependency_parser = nlp.DependencyParserModel()\ + .pretrained("dependency_conllu", "en")\ + .setInputCols(["sentences", "pos_tags", "tokens"])\ + .setOutputCol("dependencies") + +reModel = medical.RelationExtractionModel()\ + .pretrained("posology_re")\ + .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"])\ + .setOutputCol("relations")\ + .setMaxSyntacticDistance(4) + +pipeline = nlp.Pipeline(stages=[ documenter, sentencer, tokenizer, words_embedder, pos_tagger, - clinical_ner_tagger, + ner_tagger, ner_chunker, dependency_parser, - re_model + reModel ]) -data = spark.createDataFrame([["MRI demonstrated infarction in the upper brain stem , left cerebellum and right basil ganglia"]]).toDF("text") -result = pipeline.fit(data).transform(data) +text = """ +The patient was prescribed 1 unit of Advil for 5 days after meals. The patient was also +given 1 unit of Metformin daily. 
+He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , +12 units of insulin lispro with meals , and metformin 1000 mg two times a day. +""" +df = spark.createDataFrame([[text]]).toDF("text") +result = pipeline.fit(df).transform(df) # Show results -# -result.selectExpr("explode(relations) as relations") - .select( - "relations.metadata.chunk1", - "relations.metadata.entity1", - "relations.metadata.chunk2", - "relations.metadata.entity2", - "relations.result" - ) - .where("result != 0") - .show(truncate=False) +result.select(F.explode(F.arrays_zip( + result.relations.result, + result.relations.metadata)).alias("cols"))\ +.select( + F.expr("cols['1']['chunk1']").alias("chunk1"), + F.expr("cols['1']['chunk2']").alias("chunk2"), + F.expr("cols['1']['entity1']").alias("entity1"), + F.expr("cols['1']['entity2']").alias("entity2"), + F.expr("cols['0']").alias("relations"), + F.expr("cols['1']['confidence']").alias("confidence")).show(5, truncate=False) + ++---------+----------------+-------+---------+--------------+----------+ +|chunk1 |chunk2 |entity1|entity2 |relations |confidence| ++---------+----------------+-------+---------+--------------+----------+ +|1 unit |Advil |DOSAGE |DRUG |DOSAGE-DRUG |1.0 | +|Advil |for 5 days |DRUG |DURATION |DRUG-DURATION |1.0 | +|1 unit |Metformin |DOSAGE |DRUG |DOSAGE-DRUG |1.0 | +|Metformin|daily |DRUG |FREQUENCY|DRUG-FREQUENCY|1.0 | +|40 units |insulin glargine|DOSAGE |DRUG |DOSAGE-DRUG |1.0 | ++---------+----------------+-------+---------+--------------+----------+ -# Show results -result.selectExpr("explode(relations) as relations") \ - .select( - "relations.metadata.chunk1", - "relations.metadata.entity1", - "relations.metadata.chunk2", - "relations.metadata.entity2", - "relations.result" - ).where("result != 0") \ - .show(truncate=False) -+------+---------+-------------+---------------------------+------+ -|chunk1|entity1 |chunk2 |entity2 |result| -+------+---------+-------------+---------------------------+------+ -|upper |Direction|brain stem |Internal_organ_or_component|1 | -|left |Direction|cerebellum |Internal_organ_or_component|1 | -|right |Direction|basil ganglia|Internal_organ_or_component|1 | -+------+---------+-------------+---------------------------+------+ {%- endcapture -%} {%- capture model_scala_medical -%} -from johnsnowlabs import * -// Relation Extraction between body parts -// Define pipeline stages to extract entities -val documenter = new nlp.DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") +import spark.implicits._ -val sentencer = new nlp.SentenceDetector() - .setInputCols("document") - .setOutputCol("sentences") +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") -val tokenizer = new nlp.Tokenizer() - .setInputCols("sentences") - .setOutputCol("tokens") +val sentencer = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentences") -val words_embedder = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") - .setInputCols(Array("sentences", "tokens")) - .setOutputCol("embeddings") +val tokenizer = new Tokenizer() + .setInputCols("sentences") + .setOutputCol("tokens") -val pos_tagger = nlp.PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") - .setInputCols(Array("sentences", "tokens")) - .setOutputCol("pos_tags") +val words_embedder = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models") + .setInputCols(Array("sentences","tokens")) + 
.setOutputCol("embeddings") -val dependency_parser = nlp.DependencyParserModel.pretrained("dependency_conllu", "en") - .setInputCols(Array("sentences", "pos_tags", "tokens")) - .setOutputCol("dependencies") +val pos_tagger = PerceptronModel.pretrained("pos_clinical","en","clinical/models") + .setInputCols(Array("sentences","tokens")) + .setOutputCol("pos_tags") -val clinical_ner_tagger = medical.NerModel.pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models") - .setInputCols(Array("sentences", "tokens", "embeddings")) - .setOutputCol("ner_tags") +val ner_tagger = MedicalNerModel.pretrained("ner_posology","en","clinical/models") + .setInputCols("sentences","tokens","embeddings") + .setOutputCol("ner_tags") -val ner_chunker = new nlp.NerConverter() - .setInputCols(Array("sentences", "tokens", "ner_tags")) - .setOutputCol("ner_chunks") +val ner_chunker = new NerConverterInternal() + .setInputCols(Array("sentences","tokens","ner_tags")) + .setOutputCol("ner_chunks") -// Define the relations that are to be extracted -val relationPairs = Array("direction-external_body_part_or_region", - "external_body_part_or_region-direction", - "direction-internal_organ_or_component", - "internal_organ_or_component-direction") +val dependency_parser = DependencyParserModel.pretrained("dependency_conllu","en") + .setInputCols(Array("sentences","pos_tags","tokens")) + .setOutputCol("dependencies") -val re_model = medical.RelationExtractionModel.pretrained("re_bodypart_directions", "en", "clinical/models") - .setInputCols(Array("embeddings", "pos_tags", "ner_chunks", "dependencies")) - .setOutputCol("relations") - .setRelationPairs(relationPairs) - .setMaxSyntacticDistance(4) - .setPredictionThreshold(0.9f) +val reModel = RelationExtractionModel.pretrained("posology_re") + .setInputCols(Array("embeddings","pos_tags","ner_chunks","dependencies")) + .setOutputCol("relations") + .setMaxSyntacticDistance(4) val pipeline = new Pipeline().setStages(Array( - documenter, - sentencer, - tokenizer, - words_embedder, - pos_tagger, - clinical_ner_tagger, - ner_chunker, - dependency_parser, - re_model -)) - -val data = Seq("MRI demonstrated infarction in the upper brain stem , left cerebellum and right basil ganglia").toDF("text") -val result = pipeline.fit(data).transform(data) + documenter, + sentencer, + tokenizer, + words_embedder, + pos_tagger, + ner_tagger, + ner_chunker, + dependency_parser, + reModel )) + +val text = " The patient was prescribed 1 unit of Advil for 5 days after meals. The patient was also given 1 unit of Metformin daily. He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals ,and metformin 1000 mg two times a day. 
" + +val df = Seq(text) .toDF("text") +val result = pipeline.fit(df) .transform(df) // Show results -// -// result.selectExpr("explode(relations) as relations") -// .select( -// "relations.metadata.chunk1", -// "relations.metadata.entity1", -// "relations.metadata.chunk2", -// "relations.metadata.entity2", -// "relations.result" -// ) -// .where("result != 0") -// .show(truncate=false) -// +------+---------+-------------+---------------------------+------+ -// |chunk1|entity1 |chunk2 |entity2 |result| -// +------+---------+-------------+---------------------------+------+ -// |upper |Direction|brain stem |Internal_organ_or_component|1 | -// |left |Direction|cerebellum |Internal_organ_or_component|1 | -// |right |Direction|basil ganglia|Internal_organ_or_component|1 | -// +------+---------+-------------+---------------------------+------+ -// + ++---------+----------------+-------+---------+--------------+----------+ +|chunk1 |chunk2 |entity1|entity2 |relations |confidence| ++---------+----------------+-------+---------+--------------+----------+ +|1 unit |Advil |DOSAGE |DRUG |DOSAGE-DRUG |1.0 | +|Advil |for 5 days |DRUG |DURATION |DRUG-DURATION |1.0 | +|1 unit |Metformin |DOSAGE |DRUG |DOSAGE-DRUG |1.0 | +|Metformin|daily |DRUG |FREQUENCY|DRUG-FREQUENCY|1.0 | +|40 units |insulin glargine|DOSAGE |DRUG |DOSAGE-DRUG |1.0 | ++---------+----------------+-------+---------+--------------+----------+ + {%- endcapture -%} @@ -219,6 +211,10 @@ val result = pipeline.fit(data).transform(data) [RelationExtractionModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/re/relation_extraction/index.html#sparknlp_jsl.annotator.re.relation_extraction.RelationExtractionModel) {%- endcapture -%} +{%- capture model_notebook_link -%} +[RelationExtractionModelNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/RelationExtractionModel.ipynb) +{%- endcapture -%} + {%- capture approach_description -%} Trains a TensorFlow model for relation extraction. @@ -227,6 +223,60 @@ To train a custom relation extraction model, you need to first creat a Tensorflo If the parameter `relationDirectionCol` is set, the model will be trained using the direction information (see the parameter decription for details). Otherwise, the model won't have direction between the relation of the entities. After training a model (using the `.fit()` method), the resulting object is of class `RelationExtractionModel`. + +Parameters: + +- `FromEntity`: (begin_col: str, end_col: str, label_col: str) Sets from entity + +- `begin_col` Column that has a reference of where the chunk begins + +- `end_col`: Column that has a reference of where the chunk ends + +- `label_col`: Column that has a reference what are the type of chunk + +- `ToEntity`: (begin_col: str, end_col: str, label_col: str) Sets to entity + +- `begin_col` Column that has a reference of where the chunk begins + +- `end_col`: Column that has a reference of where the chunk ends + +- `label_col`: Column that has a reference what are the type of chunk + +- `CustomLabels`: (labels: dict[str, str]) Sets custom relation labels + +- `labels`: Dictionary which maps old to new labels + +- `RelationDirectionCol`: (col: str) Relation direction column (possible values are: "none", "left" or "right"). 
If this parameter is not set, the relations between the entities will not have a direction
+
+- `col`: Column that contains the relation direction values
+
+- `PretrainedModelPath` (value: str) Path to an already trained model saved to disk, which is used as a starting point for training the new model
+
+- `OverrideExistingLabels` (bool) Whether to override already learned labels when using a pretrained model to initialize the new model. Default is ‘true’
+
+- `batchSize`: (Int) Size for each batch in the optimization process
+
+- `EpochsNumber` (Int) Maximum number of epochs to train
+
+- `Dropout`: (Float) Dropout at the output of each layer
+
+- `LearningRate`: (Float) Learning rate for the optimization process
+
+- `OutputLogsPath`: (Str) Folder path to save training logs. If no path is specified, the logs won't be stored on disk. The path can be a local file path, a distributed file path (HDFS, DBFS), or a cloud storage (S3).
+
+- `ModelFile`: (Str) The path to the Tensorflow graph
+
+- `FixImbalance` (Float) Fix the imbalance in the training set by replicating examples of underrepresented categories
+
+- `ValidationSplit` (Float) The proportion of the training dataset to be used as the validation set
+
+- `OverrideExistingLabels` (Boolean) Controls whether to override already learned labels when using a pretrained model to initialize the new model. A value of true will override existing labels
+
+- `MultiClass` (Boolean) If multiClass is set, the model will return all the labels with corresponding scores. By default, multiClass is false.
+
+- `ModelFile` (Str) Location of the model file used for classification
+
+- `MaxSyntacticDistance` (Int) Maximal syntactic distance, as threshold (Default: 0)
{%- endcapture -%}

{%- capture approach_input_anno -%}
@@ -238,7 +288,7 @@ NONE
{%- endcapture -%}

{%- capture approach_python_medical -%}
-from johnsnowlabs import *
+from johnsnowlabs import nlp, medical
# Defining pipeline stages to extract entities first
documentAssembler = nlp.DocumentAssembler() \
    .setInputCol("text") \
@@ -295,7 +345,7 @@ finisher = nlp.Finisher() \
    .setOutputAsArray(False)

# Define complete pipeline and start training
-pipeline = Pipeline(stages=[
+pipeline = nlp.Pipeline(stages=[
    documentAssembler,
    tokenizer,
    embedder,
@@ -311,41 +361,43 @@ model = pipeline.fit(trainData)
{%- endcapture -%}

{%- capture approach_scala_medical -%}
+import spark.implicits._
+
// Defining pipeline stages to extract entities first
-val documentAssembler = new nlp.DocumentAssembler()
+val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

-val tokenizer = new nlp.Tokenizer()
+val tokenizer = new Tokenizer()
  .setInputCols("document")
  .setOutputCol("tokens")

-val embedder = nlp.WordEmbeddingsModel
+val embedder = WordEmbeddingsModel
  .pretrained("embeddings_clinical", "en", "clinical/models")
  .setInputCols(Array("document", "tokens"))
  .setOutputCol("embeddings")

-val posTagger = nlp.PerceptronModel
+val posTagger = PerceptronModel
  .pretrained("pos_clinical", "en", "clinical/models")
  .setInputCols(Array("document", "tokens"))
  .setOutputCol("posTags")

-val nerTagger = medical.NerModel
+val nerTagger = MedicalNerModel
  .pretrained("ner_events_clinical", "en", "clinical/models")
  .setInputCols(Array("document", "tokens", "embeddings"))
  .setOutputCol("ner_tags")

-val nerConverter = new nlp.NerConverter()
+val nerConverter = new NerConverter()
  .setInputCols(Array("document", "tokens", "ner_tags"))
  .setOutputCol("nerChunks")

-val depencyParser = 
nlp.DependencyParserModel +val depencyParser = DependencyParserModel .pretrained("dependency_conllu", "en") .setInputCols(Array("document", "posTags", "tokens")) .setOutputCol("dependencies") // Then define `RelationExtractionApproach` and training parameters -val re = new medical.RelationExtractionApproach() +val re = new RelationExtractionApproach() .setInputCols(Array("embeddings", "posTags", "train_ner_chunks", "dependencies")) .setOutputCol("relations_t") .setLabelColumn("target_rel") @@ -358,7 +410,7 @@ val re = new medical.RelationExtractionApproach() .setFromEntity("from_begin", "from_end", "from_label") .setToEntity("to_begin", "to_end", "to_label") -val finisher = new nlp.Finisher() +val finisher = new Finisher() .setInputCols(Array("relations_t")) .setOutputCols(Array("relations")) .setCleanAnnotations(false) @@ -392,6 +444,10 @@ val model = pipeline.fit(trainData) [RelationExtractionApproach](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/re/relation_extraction/index.html#sparknlp_jsl.annotator.re.relation_extraction.RelationExtractionApproach) {%- endcapture -%} +{%- capture approach_notebook_link -%} +[RelationExtractionApproachNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/RelationExtractionApproach.ipynb) +{%- endcapture -%} + {% include templates/licensed_approach_model_medical_fin_leg_template.md title=title model=model @@ -403,6 +459,7 @@ model_python_medical=model_python_medical model_scala_medical=model_scala_medical model_api_link=model_api_link model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link approach_description=approach_description approach_input_anno=approach_input_anno approach_output_anno=approach_output_anno @@ -410,4 +467,5 @@ approach_python_medical=approach_python_medical approach_scala_medical=approach_scala_medical approach_api_link=approach_api_link approach_python_api_link=approach_python_api_link +approach_notebook_link=approach_notebook_link %} diff --git a/docs/en/licensed_annotator_entries/ResolverMerger.md b/docs/en/licensed_annotator_entries/ResolverMerger.md index 3f42ee3dc4..3fae90aa7a 100644 --- a/docs/en/licensed_annotator_entries/ResolverMerger.md +++ b/docs/en/licensed_annotator_entries/ResolverMerger.md @@ -18,6 +18,7 @@ However, for a more straightforward approach, we can use a chunk mapper method t Parametres: - `inputCols`: The name of the columns containing the input annotations. It can read an Array of strings. + - `outputCol`: The name of the column in Document type that is generated. We can specify only one column here. All the parameters can be set using the corresponding set method in camel case. For example, `.setInputcols()`. diff --git a/docs/en/licensed_annotator_entries/Router.md b/docs/en/licensed_annotator_entries/Router.md index 4056e94f20..4be9fad88a 100644 --- a/docs/en/licensed_annotator_entries/Router.md +++ b/docs/en/licensed_annotator_entries/Router.md @@ -18,9 +18,13 @@ This solution eliminates the need to run `BertSentenceEmbeddings` multiple times Parametres: - `inputCols`: The name of the columns containing the input annotations. It can read an Array of strings. + - `outputCol`: The name of the column in the Document type that is generated. We can specify only one column here. + - `inputType`: The type of entity that you want to filter (by default `sentence_embeddings`). 
Possible values: `document|token|wordpiece|word_embeddings|sentence_embeddings|category|date|sentiment|pos|chunk|named_entity|regex|dependency|labeled_dependency|language|keyword`
+
- `metadataField`: The key in the metadata dictionary that you want to filter (by default `entity`)
+
- `filterFieldsElements`: The `filterFieldsElements` are the allowed values for the metadata field that is being used.

All the parameters can be set using the corresponding set method in camel case. For example, `.setInputCols()`.
diff --git a/docs/en/licensed_annotator_entries/SentenceEntityResolver.md b/docs/en/licensed_annotator_entries/SentenceEntityResolver.md
index 8e7675a8e2..06729394b6 100644
--- a/docs/en/licensed_annotator_entries/SentenceEntityResolver.md
+++ b/docs/en/licensed_annotator_entries/SentenceEntityResolver.md
@@ -15,6 +15,18 @@ The model transforms a dataset with Input Annotation type SENTENCE_EMBEDDINGS, c
[BertSentenceEmbeddings](/docs/en/transformers#bertsentenceembeddings) and returns the normalized entity for a particular trained ontology / curated dataset (e.g. ICD-10, RxNorm, SNOMED etc.).

+Parameters:
+
+- `distanceFunction`: Determines how the distance between different entities will be calculated. Either `COSINE` or `EUCLIDEAN`.
+
+- `neighbours`: The number of neighbours to consider when computing the distances.
+
+- `caseSensitive`: Whether to consider text casing or not.
+
+- `threshold`: Threshold of the distance between nodes to consider.
+
+All the parameters can be set using the corresponding set method in camel case. For example, `.setInputCols()`.
+
For a list of pretrained models, please see the [Models Hub](https://nlp.johnsnowlabs.com/models?task=Entity+Resolution).
{%- endcapture -%}
@@ -28,42 +40,49 @@ ENTITY
{%- endcapture -%}

{%- capture model_python_medical -%}
-from johnsnowlabs import *
-# Resolving CPT
-# First define pipeline stages to extract entities
-documentAssembler = nlp.DocumentAssembler() \
-    .setInputCol("text") \
+from johnsnowlabs import nlp, medical
+
+documentAssembler = nlp.DocumentAssembler()\
+    .setInputCol("text")\
    .setOutputCol("document")
-sentenceDetector = nlp.SentenceDetectorDLModel.pretrained() \
-    .setInputCols(["document"]) \
+
+sentenceDetector = nlp.SentenceDetectorDLModel.pretrained()\
+    .setInputCols(["document"])\
    .setOutputCol("sentence")
-tokenizer = nlp.Tokenizer() \
-    .setInputCols(["sentence"]) \
+
+tokenizer = nlp.Tokenizer()\
+    .setInputCols(["sentence"])\
    .setOutputCol("token")
-word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \
-    .setInputCols(["sentence", "token"]) \
+
+word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
+    .setInputCols(["sentence", "token"])\
    .setOutputCol("embeddings")
-clinical_ner = medical.NerModel.pretrained("jsl_ner_wip_clinical", "en", "clinical/models") \
-    .setInputCols(["sentence", "token", "embeddings"]) \
+
+clinical_ner = medical.NerModel.pretrained("jsl_ner_wip_clinical", "en", "clinical/models")\
+    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner")
-ner_converter = nlp.NerConverter() \
-    .setInputCols(["sentence", "token", "ner"]) \
-    .setOutputCol("ner_chunk") \
+
+ner_converter = nlp.NerConverter()\
+    .setInputCols(["sentence", "token", "ner"])\
+    .setOutputCol("ner_chunk")\
    .setWhiteList(["Test","Procedure"])
-c2doc = nlp.Chunk2Doc() \
-    .setInputCols(["ner_chunk"]) \
+
+c2doc = nlp.Chunk2Doc()\
+    .setInputCols(["ner_chunk"])\
    .setOutputCol("ner_chunk_doc") 
-sbert_embedder = nlp.BertSentenceEmbeddings \ - .pretrained("sbiobert_base_cased_mli","en","clinical/models") \ - .setInputCols(["ner_chunk_doc"]) \ + +sbert_embedder = nlp.BertSentenceEmbeddings\ + .pretrained("sbiobert_base_cased_mli","en","clinical/models")\ + .setInputCols(["ner_chunk_doc"])\ .setOutputCol("sbert_embeddings") # Then the resolver is defined on the extracted entities and sentence embeddings -cpt_resolver = medical.SentenceEntityResolverModel.pretrained("sbiobertresolve_cpt_procedures_augmented","en", "clinical/models") \ - .setInputCols(["sbert_embeddings"]) \ - .setOutputCol("cpt_code") \ +cpt_resolver = medical.SentenceEntityResolverModel.pretrained("sbiobertresolve_cpt_procedures_augmented","en", "clinical/models")\ + .setInputCols(["sbert_embeddings"])\ + .setOutputCol("cpt_code")\ .setDistanceFunction("EUCLIDEAN") -sbert_pipeline_cpt = Pipeline().setStages([ + +pipeline = nlp.Pipeline().setStages([ documentAssembler, sentenceDetector, tokenizer, @@ -74,371 +93,260 @@ sbert_pipeline_cpt = Pipeline().setStages([ sbert_embedder, cpt_resolver]) -sbert_outputs = sbert_pipeline_cpt.fit(data_ner).transform(data) -# Show results -# -# sbert_outputs -# .select("explode(arrays_zip(ner_chunk.result ,ner_chunk.metadata, cpt_code.result, cpt_code.metadata, ner_chunk.begin, ner_chunk.end)) as cpt_code") -# .selectExpr( -# "cpt_code['0'] as chunk", -# "cpt_code['1'].entity as entity", -# "cpt_code['2'] as code", -# "cpt_code['3'].confidence as confidence", -# "cpt_code['3'].all_k_resolutions as all_k_resolutions", -# "cpt_code['3'].all_k_results as all_k_results" -# ).show(5) -# +--------------------+---------+-----+----------+--------------------+--------------------+ -# | chunk| entity| code|confidence| all_k_resolutions| all_k_codes| -# +--------------------+---------+-----+----------+--------------------+--------------------+ -# | heart cath|Procedure|93566| 0.1180|CCA - Cardiac cat...|93566:::62319:::9...| -# |selective coronar...| Test|93460| 0.1000|Coronary angiogra...|93460:::93458:::9...| -# |common femoral an...| Test|35884| 0.1808|Femoral artery by...|35884:::35883:::3...| -# | StarClose closure|Procedure|33305| 0.1197|Heart closure:::H...|33305:::33300:::3...| -# | stress test| Test|93351| 0.2795|Cardiovascular st...|93351:::94621:::9...| -# +--------------------+---------+-----+----------+--------------------+--------------------+ -# +text = """She was admitted to the hospital with chest pain and found to have bilateral pleural effusion, the right greater than the left. CT scan of the chest also revealed a large mediastinal lymph node. +We reviewed the pathology obtained from the pericardectomy in March 2006, which was diagnostic of mesothelioma. 
+At this time, chest tube placement for drainage of the fluid occurred and thoracoscopy, which were performed, which revealed epithelioid malignant mesothelioma.""" + +df = spark.createDataFrame([[text]]).toDF("text") +result = pipeline.fit(df).transform(df) + +# Show Results ++--------------------+---------+-----+----------+--------------------+--------------------+ +| chunk| entity| code|confidence| all_k_results| all_k_resolutions| ++--------------------+---------+-----+----------+--------------------+--------------------+ +|CT scan of the chest| Test|62284| 0.2028|62284:::76497:::7...|Computed tomograp...| +| pericardectomy|Procedure|33031| 0.3329|33031:::33025:::3...|Pericardectomy [P...| +|chest tube placement|Procedure|39503| 0.9343|39503:::32036:::3...|Insertion of ches...| +|drainage of the f...|Procedure|49405| 0.2476|49405:::49407:::4...|Drainage procedur...| +| thoracoscopy|Procedure|32660| 0.1422|32660:::32667:::1...|Thoracoscopy [Tho...| ++--------------------+---------+-----+----------+--------------------+--------------------+ + {%- endcapture -%} {%- capture model_scala_medical -%} -from johnsnowlabs import * -// Resolving CPT -// First define pipeline stages to extract entities -val documentAssembler = new nlp.DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") -val sentenceDetector = nlp.SentenceDetectorDLModel.pretrained() - .setInputCols("document") - .setOutputCol("sentence") -val tokenizer = new nlp.Tokenizer() - .setInputCols("sentence") - .setOutputCol("token") -val word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") - .setInputCols(Array("sentence", "token")) - .setOutputCol("embeddings") -val clinical_ner = medical.NerModel.pretrained("jsl_ner_wip_clinical", "en", "clinical/models") - .setInputCols(Array("sentence", "token", "embeddings")) - .setOutputCol("ner") -val ner_converter = new nlp.NerConverter() - .setInputCols(array("sentence", "token", "ner")) - .setOutputCol("ner_chunk") - .setWhiteList("Test","Procedure") -val c2doc = new nlp.Chunk2Doc() - .setInputCols("ner_chunk") - .setOutputCol("ner_chunk_doc") -val sbert_embedder = nlp.BertSentenceEmbeddings - .pretrained("sbiobert_base_cased_mli","en","clinical/models") - .setInputCols("ner_chunk_doc") - .setOutputCol("sbert_embeddings") - -// Then the resolver is defined on the extracted entities and sentence embeddings -val cpt_resolver = medical.SentenceEntityResolverModel.pretrained("sbiobertresolve_cpt_procedures_augmented","en", "clinical/models") - .setInputCols(Array("sbert_embeddings")) - .setOutputCol("cpt_code") - .setDistanceFunction("EUCLIDEAN") -val sbert_pipeline_cpt = new Pipeline().setStages(Array( - documentAssembler, - sentenceDetector, - tokenizer, - word_embeddings, - clinical_ner, - ner_converter, - c2doc, - sbert_embedder, - cpt_resolver)) - -// Show results -// -// sbert_outputs -// .select("explode(arrays_zip(ner_chunk.result ,ner_chunk.metadata, cpt_code.result, cpt_code.metadata, ner_chunk.begin, ner_chunk.end)) as cpt_code") -// .selectExpr( -// "cpt_code['0'] as chunk", -// "cpt_code['1'].entity as entity", -// "cpt_code['2'] as code", -// "cpt_code['3'].confidence as confidence", -// "cpt_code['3'].all_k_resolutions as all_k_resolutions", -// "cpt_code['3'].all_k_results as all_k_results" -// ).show(5) -// +--------------------+---------+-----+----------+--------------------+--------------------+ -// | chunk| entity| code|confidence| all_k_resolutions| all_k_codes| -// 
+--------------------+---------+-----+----------+--------------------+--------------------+ -// | heart cath|Procedure|93566| 0.1180|CCA - Cardiac cat...|93566:::62319:::9...| -// |selective coronar...| Test|93460| 0.1000|Coronary angiogra...|93460:::93458:::9...| -// |common femoral an...| Test|35884| 0.1808|Femoral artery by...|35884:::35883:::3...| -// | StarClose closure|Procedure|33305| 0.1197|Heart closure:::H...|33305:::33300:::3...| -// | stress test| Test|93351| 0.2795|Cardiovascular st...|93351:::94621:::9...| -// +--------------------+---------+-----+----------+--------------------+--------------------+ -// -{%- endcapture -%} +import spark.implicits._ +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") +val sentenceDetector = SentenceDetectorDLModel.pretrained() + .setInputCols(Array("document")) + .setOutputCol("sentence") -{%- capture model_python_legal -%} -from johnsnowlabs import * +val tokenizer = new Tokenizer() + .setInputCols(Array("sentence")) + .setOutputCol("token") -documentAssembler = nlp.DocumentAssembler()\ - .setInputCol("text")\ - .setOutputCol("document") - -sentenceDetector = nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx")\ - .setInputCols(["document"])\ - .setOutputCol("sentence") +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models") + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") -tokenizer = nlp.Tokenizer()\ - .setInputCols(["sentence"])\ - .setOutputCol("token") +val clinical_ner = MedicalNerModel.pretrained("jsl_ner_wip_clinical","en","clinical/models") + .setInputCols(Array("sentence","token","embeddings")) + .setOutputCol("ner") -embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \ - .setInputCols(["sentence", "token"]) \ - .setOutputCol("embeddings") +val ner_converter = new NerConverter() + .setInputCols(Array("sentence","token","ner")) + .setOutputCol("ner_chunk") + .setWhiteList(Array("Test","Procedure")) -ner_model = legal.NerModel.pretrained("legner_orgs_prods_alias", "en", "legal/models")\ - .setInputCols(["sentence", "token", "embeddings"])\ - .setOutputCol("ner") - -ner_converter = nlp.NerConverter()\ - .setInputCols(["sentence","token","ner"])\ - .setOutputCol("ner_chunk") +val c2doc = new Chunk2Doc() + .setInputCols(Array("ner_chunk")) + .setOutputCol("ner_chunk_doc") -chunk2doc = nlp.Chunk2Doc()\ - .setInputCols("ner_chunk")\ - .setOutputCol("ner_chunk_doc") +val sbert_embedder = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models") + .setInputCols(Array("ner_chunk_doc")) + .setOutputCol("sbert_embeddings") -sentence_embeddings = nlp.UniversalSentenceEncoder.pretrained("tfhub_use", "en") \ - .setInputCols("ner_chunk_doc") \ - .setOutputCol("sentence_embeddings") - -resolver = legal.SentenceEntityResolverModel.pretrained("legel_edgar_company_name", "en", "legal/models")\ - .setInputCols(["text", "sentence_embeddings"]) \ - .setOutputCol("resolution")\ - .setDistanceFunction("EUCLIDEAN") +// Then the resolver is defined on the extracted entities and sentence embeddings + +val cpt_resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_cpt_procedures_augmented","en","clinical/models") + .setInputCols(Array("sbert_embeddings")) + .setOutputCol("cpt_code") + .setDistanceFunction("EUCLIDEAN") + +val pipeline = new Pipeline().setStages(Array( + documentAssembler, + sentenceDetector, + tokenizer, + word_embeddings, + clinical_ner, + 
ner_converter, + c2doc, + sbert_embedder, + cpt_resolver)) -nlpPipeline = Pipeline(stages=[ - documentAssembler, - sentenceDetector, - tokenizer, - embeddings, - ner_model, - ner_converter, - chunk2doc, - sentence_embeddings, - resolver -]) -result = pipeline.fit(data).transform(data) +val text = "She was admitted to the hospital with chest pain and found to have bilateral pleural effusion,the right greater than the left. CT scan of the chest also revealed a large mediastinal lymph node. We reviewed the pathology obtained from the pericardectomy in March 2006,which was diagnostic of mesothelioma. At this time,chest tube placement for drainage of the fluid occurred and thoracoscopy,which were performed,which revealed epithelioid malignant mesothelioma." +val df = Seq(text) .toDF("text") +val result = pipeline.fit(df) .transform(df) {%- endcapture -%} -{%- capture model_scala_legal -%} -from johnsnowlabs import * -val documentAssembler = new nlp.DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") -val sentenceDetector = nlp.SentenceDetectorDLModel - .pretrained("sentence_detector_dl","xx") - .setInputCols("document") - .setOutputCol("sentence") +{%- capture model_python_legal -%} +from johnsnowlabs import nlp, legal -val tokenizer = new nlp.Tokenizer() - .setInputCols("sentence") - .setOutputCol("token") +documentAssembler = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("ner_chunk") - -val embeddings = nlp.BertEmbeddings - .pretrained("bert_embeddings_sec_bert_base", "en") - .setInputCols(Array("sentence", "token")) - .setOutputCol("embeddings") +embeddings = nlp.UniversalSentenceEncoder.pretrained("tfhub_use", "en")\ + .setInputCols("ner_chunk")\ + .setOutputCol("sentence_embeddings") + +resolver = legal.SentenceEntityResolverModel.pretrained("legel_edgar_company_name", "en", "legal/models")\ + .setInputCols(["ner_chunk", "sentence_embeddings"])\ + .setOutputCol("irs_code")\ + .setDistanceFunction("EUCLIDEAN") +pipeline = nlp.Pipeline( + stages = [ + documentAssembler, + embeddings, + resolver]) -val ner_model = legal.NerModel - .pretrained("legner_orgs_prods_alias", "en", "legal/models") - .setInputCols(Array("sentence", "token","embeddings")) - .setOutputCol("ner") +text = """CONTACT GOLD""" +df = spark.createDataFrame([[text]]).toDF("text") +result = pipeline.fit(df).transform(df) -val ner_converter = new nlp.NerConverter() - .setInputCols(Array("sentence", "token", "ner")) - .setOutputCol("ner_chunk") +# Show Results ++------------+------------------+---------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|chunk |result |code |all_k_results |all_k_resolutions | 
++------------+------------------+---------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|CONTACT GOLD|Contact Gold Corp.|981369960:::0:::208273426:::204092640:::0:::0:::270531073:::261918920:::0:::271989147:::0:::0|Contact Gold Corp.:::ISHARES GOLD TRUST:::Minatura Gold:::Mexus Gold US:::BESRA GOLD INC.:::ALAMOS GOLD INC:::JOSHUA GOLD RESOURCES INC:::MIDEX GOLD CORP.:::Gold Mark Stephen:::Guskin Gold Corp.:::CMX GOLD & SILVER CORP.:::Permal Gold Ltd.|Contact Gold Corp.:::ISHARES GOLD TRUST:::Minatura Gold:::Mexus Gold US:::BESRA GOLD INC.:::ALAMOS GOLD INC:::JOSHUA GOLD RESOURCES INC:::MIDEX GOLD CORP.:::Gold Mark Stephen:::Guskin Gold Corp.:::CMX GOLD & SILVER CORP.:::Permal Gold Ltd.| ++------------+------------------+---------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -val chunk2doc = new nlp.Chunk2Doc() - .setInputCols("ner_chunk") - .setOutputCol("ner_chunk_doc") +{%- endcapture -%} -val sentence_embeddings = nlp.UniversalSentenceEncoder - .pretrained("tfhub_use", "en") - .setInputCols("ner_chunk_doc") - .setOutputCol("sentence_embeddings") +{%- capture model_scala_legal -%} +import spark.implicits._ -val resolver = legal.SentenceEntityResolverModel - .pretrained("legel_edgar_company_name", "en", "legal/models") - .setInputCols(Array("text", "sentence_embeddings")) - .setOutputCol("resolution") - .setDistanceFunction("EUCLIDEAN") +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("ner_chunk") +val embeddings = UniversalSentenceEncoder.pretrained("tfhub_use","en") + .setInputCols("ner_chunk") + .setOutputCol("sentence_embeddings") + +val resolver = SentenceEntityResolverModel.pretrained("legel_edgar_company_name","en","legal/models") + .setInputCols(Array("ner_chunk","sentence_embeddings")) + .setOutputCol("irs_code") .setDistanceFunction("EUCLIDEAN") val pipeline = new Pipeline().setStages(Array( - documentAssembler, - sentenceDetector, - tokenizer, - embeddings, - ner_model, - ner_converter, - chunk2doc, - sentence_embeddings, - resolver -)) + documentAssembler, + embeddings, + resolver)) + +val text = "CONTACT GOLD" -val result = pipeline.fit(data).transform(data) +val df = Seq(text) .toDF("text") +val result = pipeline.fit(df) .transform(df) {%- endcapture -%} {%- capture model_python_finance -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, finance documentAssembler = nlp.DocumentAssembler()\ - .setInputCol("text")\ - .setOutputCol("document") - -sentenceDetector = 
nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx")\ - .setInputCols(["document"])\ - .setOutputCol("sentence") + .setInputCol("text")\ + .setOutputCol("ner_chunk") -tokenizer = nlp.Tokenizer()\ - .setInputCols(["sentence"])\ - .setOutputCol("token") +embeddings = nlp.UniversalSentenceEncoder.pretrained("tfhub_use", "en") \ + .setInputCols("ner_chunk") \ + .setOutputCol("sentence_embeddings") -embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \ - .setInputCols(["sentence", "token"]) \ - .setOutputCol("embeddings") +resolver = finance.SentenceEntityResolverModel.pretrained("finel_edgar_company_name", "en", "finance/models")\ + .setInputCols(["ner_chunk", "sentence_embeddings"]) \ + .setOutputCol("normalized")\ + .setDistanceFunction("EUCLIDEAN") -ner_model = finance.NerModel.pretrained("finner_orgs_prods_alias", "en", "finance/models")\ - .setInputCols(["sentence", "token", "embeddings"])\ - .setOutputCol("ner") - -ner_converter = nlp.NerConverter()\ - .setInputCols(["sentence","token","ner"])\ - .setOutputCol("ner_chunk") +pipeline = nlp.Pipeline( + stages = [ + documentAssembler, + embeddings, + resolver]) -chunk2doc = nlp.Chunk2Doc()\ - .setInputCols("ner_chunk")\ - .setOutputCol("ner_chunk_doc") +text = """CONTACT GOLD""" -sentence_embeddings = nlp.UniversalSentenceEncoder.pretrained("tfhub_use", "en") \ - .setInputCols("ner_chunk_doc") \ - .setOutputCol("sentence_embeddings") - -resolver = finance.SentenceEntityResolverModel.pretrained("finel_edgar_company_name", "en", "finance/models")\ - .setInputCols(["text", "sentence_embeddings"]) \ - .setOutputCol("resolution")\ - .setDistanceFunction("EUCLIDEAN") +df = spark.createDataFrame([[text]]).toDF("text") +result = pipeline.fit(df).transform(df) -nlpPipeline = Pipeline(stages=[ - documentAssembler, - sentenceDetector, - tokenizer, - embeddings, - ner_model, - ner_converter, - chunk2doc, - sentence_embeddings, - resolver -]) +# Show Results ++------------+------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|chunk |result |all_k_results |all_k_resolutions | ++------------+------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|CONTACT GOLD|Contact Gold Corp.|Contact Gold Corp.:::ISHARES GOLD TRUST:::Minatura Gold:::Mexus Gold US:::BESRA GOLD INC.:::ALAMOS GOLD INC:::JOSHUA GOLD RESOURCES INC:::MIDEX GOLD CORP.:::Gold Mark Stephen:::Guskin Gold Corp.:::CMX GOLD & SILVER CORP.:::Permal Gold Ltd.|Contact Gold Corp.:::ISHARES GOLD TRUST:::Minatura Gold:::Mexus Gold US:::BESRA GOLD INC.:::ALAMOS GOLD INC:::JOSHUA GOLD RESOURCES INC:::MIDEX GOLD CORP.:::Gold Mark Stephen:::Guskin Gold Corp.:::CMX GOLD & SILVER 
CORP.:::Permal Gold Ltd.| ++------------+------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -result = pipeline.fit(data).transform(data) {%- endcapture -%} - {%- capture model_scala_finance -%} -from johnsnowlabs import * -val documentAssembler = new nlp.DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") +import spark.implicits._ -val sentenceDetector = nlp.SentenceDetectorDLModel - .pretrained("sentence_detector_dl","xx") - .setInputCols("document") - .setOutputCol("sentence") +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("ner_chunk") +val embeddings = UniversalSentenceEncoder.pretrained("tfhub_use","en") + .setInputCols("ner_chunk") + .setOutputCol("sentence_embeddings") -val tokenizer = new nlp.Tokenizer() - .setInputCols("sentence") - .setOutputCol("token") +val resolver = SentenceEntityResolverModel.pretrained("finel_edgar_company_name","en","finance/models") + .setInputCols(Array("ner_chunk","sentence_embeddings")) + .setOutputCol("normalized") + .setDistanceFunction("EUCLIDEAN") - -val embeddings = nlp.BertEmbeddings - .pretrained("bert_embeddings_sec_bert_base", "en") - .setInputCols(Array("sentence", "token")) - .setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array( + documentAssembler, + embeddings, + resolver)) +val text = "CONTACT GOLD" +val df = Seq(text) .toDF("text") +val result = pipeline.fit(df) .transform(df) +{%- endcapture -%} -val ner_model = finance.NerModel - .pretrained("finner_orgs_prods_alias", "en", "finance/models") - .setInputCols(Array("sentence", "token","embeddings")) - .setOutputCol("ner") +{%- capture model_api_link -%} +[SentenceEntityResolverModel](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/resolution/SentenceEntityResolverModel.html) +{%- endcapture -%} -val ner_converter = new nlp.NerConverter() - .setInputCols(Array("sentence", "token", "ner")) - .setOutputCol("ner_chunk") +{%- capture model_python_api_link -%} +[SentenceEntityResolverModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/resolution/sentence_entity_resolver/index.html#sparknlp_jsl.annotator.resolution.sentence_entity_resolver.SentenceEntityResolverModel) +{%- endcapture -%} -val chunk2doc = new nlp.Chunk2Doc() - .setInputCols("ner_chunk") - .setOutputCol("ner_chunk_doc") +{%- capture model_notebook_link -%} +[SentenceEntityResolverModelNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/SentenceEntityResolverApproach_SentenceEntityResolverModel.ipynb) +{%- endcapture -%} -val sentence_embeddings = nlp.UniversalSentenceEncoder - .pretrained("tfhub_use", "en") - .setInputCols("ner_chunk_doc") - .setOutputCol("sentence_embeddings") +{%- capture approach_description -%} +Trains a SentenceEntityResolverModel that maps sentence embeddings to entities in a knowledge base. 
-val resolver = finance.SentenceEntityResolverModel - .pretrained("finel_edgar_company_name", "en", "finance/models") - .setInputCols(Array("text", "sentence_embeddings")) - .setOutputCol("resolution") - .setDistanceFunction("EUCLIDEAN") +General parameters: +- `labelCol` : Column name for the value we are trying to resolve. Usually this contains the entity ID in the knowledge base (e.g., the ICD-10 code). -val pipeline = new Pipeline().setStages(Array( - documentAssembler, - sentenceDetector, - tokenizer, - embeddings, - ner_model, - ner_converter, - chunk2doc, - sentence_embeddings, - resolver -)) +- `normalizedCol`: Column name for the original, normalized description -val result = pipeline.fit(data).transform(data) +- `aux_label_col`: Auxiliary label which maps resolved entities to additional labels +- `useAuxLabel`: Whether to use the auxiliary column or not. Default value is False. -{%- endcapture -%} +- `distanceFunction`: Determines how the distance between different entities will be calculated. +- `confidenceFunction`: What function to use to calculate confidence: Either ` `INVERSE` or `SOFTMAX`. -{%- capture model_api_link -%} -[SentenceEntityResolverModel](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/resolution/SentenceEntityResolverModel.html) -{%- endcapture -%} +- `caseSensitive`: whether to ignore case in tokens for embeddings matching (Default: `False`) -{%- capture model_python_api_link -%} -[SentenceEntityResolverModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/resolution/sentence_entity_resolver/index.html#sparknlp_jsl.annotator.resolution.sentence_entity_resolver.SentenceEntityResolverModel) -{%- endcapture -%} +- `threshold`: Threshold value for the last distance calculated (default: 5.0) -{%- capture approach_description -%} -Trains a SentenceEntityResolverModel that maps sentence embeddings to entities in a knowledge base. +- `missAsEmpty`: whether or not to return an empty annotation on unmatched chunks (default: `True`) + + +When finetuning an existing model, there are additional parameters: -To train a custom model, you need to provide a dataset with the following columns: -- ``label``: Entity name -- ``chunk``: Occurrence of the entity in the text, without standartization -- ``sentence_embeddings``: Sentence embeddings from, e.g., the BertSentenceEmbeddings -annotator. +- `pretrainedModelPath`: Path to an already trained SentenceEntityResolverModel.This pretrained model will be used as a starting point for training the new one. The path can be a local file path, a distributed file path (HDFS, DBFS), or a cloud storage (S3). -Optionally, you can also provide the following columns: -- ``aux_label``: Auxiliary label which maps resolved entities to additional labels. -If you have ground truth of the knowledge base entities, setting this column will help -the model to learn better. +- `overrideExistingCodes`: Whether to override the existing codes with new data while continue the training from a pretrained model. Default value is `False` (keep all the codes). + +- `dropCodesList`: A list of codes in a pretrained model that will be omitted when the training process begins with a pretrained model. You can find pretrained Sentence Embeddings (using BERT or other architecgture) in the `NLP Models Hub `_. 
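+
+A minimal fine-tuning sketch for the parameters above (the paths, column names, and dropped code are placeholders, and the setter names simply follow the camel-case convention used throughout, e.g. `setPretrainedModelPath`):
+
+```python
+# Illustrative only: continue training from an already trained resolver
+# instead of starting from scratch.
+finetune_resolver = medical.SentenceEntityResolverApproach()\
+    .setInputCols(["bert_embeddings"])\
+    .setOutputCol("snomed_code")\
+    .setLabelCol("label")\
+    .setNormalizedCol("normalized_text")\
+    .setPretrainedModelPath("path/to/existing_resolver_model")\
+    .setOverrideExistingCodes(False)\
+    .setDropCodesList(["12345678"])\
+    .setDistanceFunction("EUCLIDEAN")
+
+finetuned_model = finetune_resolver.fit(new_training_data)
+```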
@@ -469,7 +377,7 @@ bertEmbeddings = nlp.BertSentenceEmbeddings.pretrained("sent_biobert_pubmed_base .setInputCols(["sentence"]) \ .setOutputCol("bert_embeddings") -snomedTrainingPipeline = Pipeline(stages=[ +snomedTrainingPipeline = nlp.Pipeline(stages=[ documentAssembler, sentenceDetector, bertEmbeddings @@ -508,7 +416,7 @@ bertEmbeddings = nlp.BertSentenceEmbeddings.pretrained("sent_bert_base_uncased_l .setInputCols(["sentence"]) \ .setOutputCol("bert_embeddings") -preprocessing_pipeline = Pipeline(stages=[ +preprocessing_pipeline = nlp.Pipeline(stages=[ documentAssembler, sentenceDetector, bertEmbeddings @@ -547,7 +455,7 @@ bertEmbeddings = nlp.BertSentenceEmbeddings.pretrained("sent_bert_large_cased") .setInputCols(["sentence"]) \ .setOutputCol("bert_embeddings") -preprocessing_pipeline = Pipeline(stages=[ +preprocessing_pipeline = nlp.Pipeline(stages=[ documentAssembler, sentenceDetector, bertEmbeddings @@ -571,30 +479,33 @@ model = bertExtractor.fit(processed_data) {%- endcapture -%} {%- capture approach_scala_medical -%} -from johnsnowlabs import * +import spark.implicits._ + // Training a SNOMED resolution model using BERT sentence embeddings // Define pre-processing pipeline for training data. It needs consists of columns for the normalized training data and their labels. -val documentAssembler = new nlp.DocumentAssembler() +val documentAssembler = new DocumentAssembler() .setInputCol("normalized_text") .setOutputCol("document") -val sentenceDetector = nlp.SentenceDetector() +val sentenceDetector = SentenceDetector() .setInputCols("document") .setOutputCol("sentence") - val bertEmbeddings = nlp.BertSentenceEmbeddings.pretrained("sent_biobert_pubmed_base_cased") + val bertEmbeddings = BertSentenceEmbeddings.pretrained("sent_biobert_pubmed_base_cased") .setInputCols("sentence") .setOutputCol("bert_embeddings") + val snomedTrainingPipeline = new Pipeline().setStages(Array( documentAssembler, sentenceDetector, bertEmbeddings )) + val snomedTrainingModel = snomedTrainingPipeline.fit(data) val snomedData = snomedTrainingModel.transform(data).cache() // Then the Resolver can be trained with -val bertExtractor = new medical.SentenceEntityResolverApproach() +val bertExtractor = new SentenceEntityResolverApproach() .setNeighbours(25) .setThreshold(1000) .setInputCols("bert_embeddings") @@ -609,20 +520,22 @@ val snomedModel = bertExtractor.fit(snomedData) {%- endcapture -%} {%- capture approach_scala_legal -%} -from johnsnowlabs import * +import spark.implicits._ + // Training a SNOMED resolution model using BERT sentence embeddings // Define pre-processing pipeline for training data. It needs consists of columns for the normalized training data and their labels. 
-val documentAssembler = new nlp.DocumentAssembler() +val documentAssembler = new DocumentAssembler() .setInputCol("normalized_text") .setOutputCol("document") -val sentenceDetector = nlp.SentenceDetector() +val sentenceDetector = SentenceDetector() .setInputCols("document") .setOutputCol("sentence") - val bertEmbeddings = nlp.BertSentenceEmbeddings.pretrained("sent_biobert_pubmed_base_cased") + val bertEmbeddings = BertSentenceEmbeddings.pretrained("sent_biobert_pubmed_base_cased") .setInputCols("sentence") .setOutputCol("bert_embeddings") + val snomedTrainingPipeline = new Pipeline().setStages(Array( documentAssembler, sentenceDetector, @@ -632,7 +545,7 @@ val sentenceDetector = nlp.SentenceDetector() val snomedData = snomedTrainingModel.transform(data).cache() // Then the Resolver can be trained with -val bertExtractor = new legal.SentenceEntityResolverApproach() +val bertExtractor = new SentenceEntityResolverApproach() .setNeighbours(25) .setThreshold(1000) .setInputCols("bert_embeddings") @@ -647,20 +560,22 @@ val snomedModel = bertExtractor.fit(snomedData) {%- endcapture -%} {%- capture approach_scala_finance -%} -from johnsnowlabs import * +import spark.implicits._ + // Training a SNOMED resolution model using BERT sentence embeddings // Define pre-processing pipeline for training data. It needs consists of columns for the normalized training data and their labels. -val documentAssembler = new nlp.DocumentAssembler() +val documentAssembler = new DocumentAssembler() .setInputCol("normalized_text") .setOutputCol("document") -val sentenceDetector = nlp.SentenceDetector() +val sentenceDetector = SentenceDetector() .setInputCols("document") .setOutputCol("sentence") - val bertEmbeddings = nlp.BertSentenceEmbeddings.pretrained("sent_biobert_pubmed_base_cased") + val bertEmbeddings = BertSentenceEmbeddings.pretrained("sent_biobert_pubmed_base_cased") .setInputCols("sentence") .setOutputCol("bert_embeddings") + val snomedTrainingPipeline = new Pipeline().setStages(Array( documentAssembler, sentenceDetector, @@ -670,7 +585,7 @@ val sentenceDetector = nlp.SentenceDetector() val snomedData = snomedTrainingModel.transform(data).cache() // Then the Resolver can be trained with -val bertExtractor = new finance.SentenceEntityResolverApproach() +val bertExtractor = new SentenceEntityResolverApproach() .setNeighbours(25) .setThreshold(1000) .setInputCols("bert_embeddings") @@ -692,6 +607,10 @@ val snomedModel = bertExtractor.fit(snomedData) [SentenceEntityResolverApproach](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/resolution/sentence_entity_resolver/index.html#sparknlp_jsl.annotator.resolution.sentence_entity_resolver.SentenceEntityResolverApproach) {%- endcapture -%} +{%- capture approach_notebook_link -%} +[SentenceEntityResolverApproachNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/SentenceEntityResolverApproach_SentenceEntityResolverModel.ipynb) +{%- endcapture -%} + {% include templates/licensed_approach_model_medical_fin_leg_template.md title=title model=model @@ -707,6 +626,7 @@ model_python_finance=model_python_finance model_scala_finance=model_scala_finance model_api_link=model_api_link model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link approach_description=approach_description approach_input_anno=approach_input_anno approach_output_anno=approach_output_anno @@ -718,4 +638,5 @@ approach_scala_legal=approach_scala_legal 
approach_scala_finance=approach_scala_finance approach_api_link=approach_api_link approach_python_api_link=approach_python_api_link +approach_notebook_link=approach_notebook_link %} diff --git a/docs/en/licensed_annotator_entries/WindowedSentenceModel.md b/docs/en/licensed_annotator_entries/WindowedSentenceModel.md index a228bef499..47950fe63d 100644 --- a/docs/en/licensed_annotator_entries/WindowedSentenceModel.md +++ b/docs/en/licensed_annotator_entries/WindowedSentenceModel.md @@ -11,6 +11,11 @@ This annotator that helps you to merge the previous and following sentences of a Inferring the class from sentence X may be a much harder task sometime, due to the lack of context, than to infer the class of sentence X-1 + sentence X + sentence X+1. In this example, the window is 1, that’s why we augment sentence with 1 neighbour from behind and another from ahead. Window size can be configured so that each piece of text/sentence get a number of previous and posterior sentences as context, equal to the windows size. +Parameters: + +- `setWindowSize`: Sets size of the sliding window. + +- `setGlueString`: Sets string to use to join the neighboring elements together. {%- endcapture -%} {%- capture model_input_anno -%} diff --git a/docs/en/licensed_annotators.md b/docs/en/licensed_annotators.md index 5dfae59b3d..1e79f9ca1b 100644 --- a/docs/en/licensed_annotators.md +++ b/docs/en/licensed_annotators.md @@ -40,31 +40,60 @@ Check out the [Spark NLP Annotators page](https://nlp.johnsnowlabs.com/docs/en/a {% include templates/licensed_table_entry.md name="AssertionDL" summary="AssertionDL is a deep Learning based approach used to extract Assertion Status from extracted entities and text."%} {% include templates/licensed_table_entry.md name="AssertionFilterer" summary="Filters entities coming from ASSERTION type annotations and returns the CHUNKS."%} {% include templates/licensed_table_entry.md name="AssertionLogReg" summary="Logistic Regression is used to extract Assertion Status from extracted entities and text."%} +{% include templates/licensed_table_entry.md name="AverageEmbeddings" summary="Computes the mean of vector embeddings for two sentences of equal size, producing a unified representation"%} +{% include templates/licensed_table_entry.md name="BertForSequenceClassification" summary="Can load Bert Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks."%} +{% include templates/licensed_table_entry.md name="BertForTokenClassifier" summary="Can load Bert Models with a token classification head on top (a linear layer on top of the hidden-states output) for Named-Entity-Recognition (NER) tasks."%} +{% include templates/licensed_table_entry.md name="BertSentenceChunkEmbeddings" summary="This annotator combines sentence and NER chunk embeddings to enhance resolution codes, leveraging contextual information in the embeddings for more precise results. 
It takes sentence context and NER chunks as input and produces embeddings for each chunk, facilitating input for the resolution model."%}
 {% include templates/licensed_table_entry.md name="Chunk2Token" summary="A feature transformer that converts the input array of strings (annotatorType CHUNK) into an array of chunk-based tokens (annotatorType TOKEN)."%}
+{% include templates/licensed_table_entry.md name="ChunkConverter" summary="This annotator merges NER-detected entities with RegexMatcher-based rules for unified processing in the pipeline."%}
 {% include templates/licensed_table_entry.md name="ChunkEntityResolver" summary="Returns a normalized entity for a particular trained ontology / curated dataset (e.g. clinical ICD-10, RxNorm, SNOMED; financial SEC's EDGAR database, etc)."%}
 {% include templates/licensed_table_entry.md name="ChunkFilterer" summary="Filters entities coming from CHUNK annotations."%}
 {% include templates/licensed_table_entry.md name="ChunkKeyPhraseExtraction" summary="Uses Bert Sentence Embeddings to determine the most relevant key phrases describing a text."%}
+{% include templates/licensed_table_entry.md name="ChunkMapper" summary="We can use ChunkMapper to map entities to their associated code/reference based on pre-defined dictionaries."%}
+{% include templates/licensed_table_entry.md name="ChunkMapperFilterer" summary="Annotator to be used after `ChunkMapper` that allows filtering chunks based on the results of the mapping, whether it was successful or failed."%}
 {% include templates/licensed_table_entry.md name="ChunkMerge" summary="Merges entities coming from different CHUNK annotations."%}
+{% include templates/licensed_table_entry.md name="ChunkSentenceSplitter" summary="This annotator can split documents into chunks according to separators given as `CHUNK` columns. It is useful when you need to apply different models or analyses to different sections of your document."%}
 {% include templates/licensed_table_entry.md name="ContextualParser" summary="Extracts entities from a document based on user-defined rules."%}
+{% include templates/licensed_table_entry.md name="DateNormalizer" summary="This annotator transforms date mentions to a common standard format: YYYY/MM/DD. It is useful when using data from different sources, sometimes from different countries, that have different formats for representing dates."%}
 {% include templates/licensed_table_entry.md name="DeIdentification" summary="Deidentifies Input Annotations of types DOCUMENT, TOKEN and CHUNK, by either masking or obfuscating the given CHUNKS."%}
+{% include templates/licensed_table_entry.md name="DistilBertForSequenceClassification" summary="Can load DistilBERT Models with a sequence classification/regression head on top (a linear layer on top of the pooled output), e.g. for multi-class document classification tasks."%}
+{% include templates/licensed_table_entry.md name="Doc2ChunkInternal" summary="Converts `DOCUMENT`, `TOKEN` typed annotations into `CHUNK` type with the contents of a `chunkCol`."%}
+{% include templates/licensed_table_entry.md name="DocMapper" summary="Uses the text representation of document annotations to map clinical codes to other codes or relevant information."%}
+{% include templates/licensed_table_entry.md name="DocumentHashCoder" summary="This annotator swaps dates in a document column with hash codes from another column, creating a new column with shifted day information. The subsequent `DeIdentification` annotator anonymizes the document, incorporating the altered dates.
"%} {% include templates/licensed_table_entry.md name="DocumentLogRegClassifier" summary="Classifies documents with a Logarithmic Regression algorithm."%} -{% include templates/licensed_table_entry.md name="DrugNormalizer" summary="Annotator which normalizes raw text from documents, e.g. scraped web pages or xml documents"%} +{% include templates/licensed_table_entry.md name="DocumentMLClassifier" summary="classifies documents with a Logarithmic Regression algorithm."%} +{% include templates/licensed_table_entry.md name="DrugNormalizer" summary="Annotator which normalizes raw text from documents, e.g. scraped web pages or xml documents."%} +{% include templates/licensed_table_entry.md name="EntityChunkEmbeddings" summary="Entity Chunk Embeddings uses BERT Sentence embeddings to compute a weighted average vector represention of related entity chunks."%} {% include templates/licensed_table_entry.md name="FeaturesAssembler" summary="Collects features from different columns."%} +{% include templates/licensed_table_entry.md name="FewShotClassifier" summary="This Annotator specifically target few-shot classification tasks, which involve training a model to make accurate predictions with limited labeled data. +"%} {% include templates/licensed_table_entry.md name="GenericClassifier" summary="Creates a generic single-label classifier which uses pre-generated Tensorflow graphs."%} +{% include templates/licensed_table_entry.md name="GenericLogRegClassifier" summary="Is a derivative of GenericClassifier which implements a multinomial logistic regression."%} +{% include templates/licensed_table_entry.md name="GenericSVMClassifier" summary="Creates a generic single-label classifier which uses pre-generated Tensorflow graphs."%} +{% include templates/licensed_table_entry.md name="InternalDocumentSplitter" summary="This annotator splits large documents into small documents."%} {% include templates/licensed_table_entry.md name="IOBTagger" summary="Merges token tags and NER labels from chunks in the specified format."%} +{% include templates/licensed_table_entry.md name="NameChunkObfuscator" summary="This annotator allows to transform a dataset with an Input Annotation of type CHUNK, into its obfuscated version of by obfuscating the given CHUNKS."%} {% include templates/licensed_table_entry.md name="NerChunker" summary="Extracts phrases that fits into a known pattern using the NER tags."%} {% include templates/licensed_table_entry.md name="NerConverterInternal" summary="Converts a IOB or IOB2 representation of NER to a user-friendly one, by associating the tokens of recognized entities and their label."%} {% include templates/licensed_table_entry.md name="NerDisambiguator" summary="Links words of interest, such as names of persons, locations and companies, from an input text document to a corresponding unique entity in a target Knowledge Base (KB)."%} -{% include templates/licensed_table_entry.md name="MedicalNer" summary="This Named Entity recognition annotator is a generic NER model based on Neural Networks.."%} +{% include templates/licensed_table_entry.md name="NerModel" summary="This Named Entity recognition annotator is a generic NER model based on Neural Networks."%} +{% include templates/licensed_table_entry.md name="NerQuestionGenerator" summary="This annotator takes an NER chunk (obtained by, e.g., `NerConverterInternal`) and generates a questions based on two entity types, a pronoun and a strategy."%} {% include templates/licensed_table_entry.md name="QuestionAnswering" summary="GPT-based model 
for answering questions given a context."%}
-{% include templates/licensed_table_entry.md name="RENerChunksFilter" summary="Filters and outputs combinations of relations between extracted entities, for further processing."%}
 {% include templates/licensed_table_entry.md name="ReIdentification" summary="Reidentifies obfuscated entities by DeIdentification."%}
 {% include templates/licensed_table_entry.md name="RelationExtraction" summary="Extracts and classifies instances of relations between named entities."%}
 {% include templates/licensed_table_entry.md name="RelationExtractionDL" summary="Extracts and classifies instances of relations between named entities."%}
+{% include templates/licensed_table_entry.md name="RENerChunksFilter" summary="Filters and outputs combinations of relations between extracted entities, for further processing."%}
+{% include templates/licensed_table_entry.md name="Replacer" summary="This annotator allows replacing entities in the original text with the ones extracted by the `NameChunkObfuscatorApproach` or `DateNormalizer` annotators."%}
+{% include templates/licensed_table_entry.md name="Resolution2Chunk" summary="This annotator is responsible for converting the annotations generated by entity resolver models (typically labeled as ENTITY) into a format compatible with subsequent stages of the pipeline, such as the ChunkMapperModel."%}
+{% include templates/licensed_table_entry.md name="ResolverMerger" summary="This annotator provides the ability to merge sentence entity resolver and chunk mapper model output columns."%}
+{% include templates/licensed_table_entry.md name="Router" summary="This annotator provides the ability to split the output of an annotator by a selected metadata field and the value for that field."%}
 {% include templates/licensed_table_entry.md name="SentenceEntityResolver" summary="Returns the normalized entity for a particular trained ontology / curated dataset (e.g. clinical ICD-10, RxNorm, SNOMED; financial SEC's EDGAR database, etc) based on sentence embeddings."%}
 {% include templates/licensed_table_entry.md name="Summarizer" summary="Helps to quickly summarize complex medical information."%}
 {% include templates/licensed_table_entry.md name="TextGenerator" summary="Uses the basic BioGPT model to perform various tasks related to medical text abstraction."%}
 {% include templates/licensed_table_entry.md name="TFGraphBuilder" summary="Creates Tensorflow graphs."%}
+{% include templates/licensed_table_entry.md name="WindowedSentenceModel" summary="This annotator helps you merge the previous and following sentences of a given piece of text, so that you add the context surrounding them."%}
+{% include templates/licensed_table_entry.md name="ZeroShotNerModel" summary="This is a zero-shot named entity recognition annotator using `RoBertaForQuestionAnswering`. It identifies entities across diverse data without domain-specific fine-tuning."%}
+{% include templates/licensed_table_entry.md name="ZeroShotRelationExtractionModel" summary="This annotator implements zero-shot binary relations extraction by utilizing `BERT` transformer models trained on the NLI (Natural Language Inference) task."%}
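
For quick reference, below is a minimal usage sketch of the `WindowedSentenceModel` parameters documented above (`setWindowSize`, `setGlueString`). It is an illustrative sketch only, not part of the patched files: it assumes a running Spark session named `spark` and that the annotator is exposed as `medical.WindowedSentenceModel` in the `johnsnowlabs` package, following the conventions used by the other Python examples in these docs.

```python
from johnsnowlabs import nlp, medical

# Assemble raw text into documents
document_assembler = nlp.DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Split documents into sentences
sentence_detector = nlp.SentenceDetector()\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

# Merge each sentence with one neighbour on each side, joined by a space
windowed_sentences = medical.WindowedSentenceModel()\
    .setInputCols(["sentence"])\
    .setOutputCol("window")\
    .setWindowSize(1)\
    .setGlueString(" ")

pipeline = nlp.Pipeline(stages=[
    document_assembler,
    sentence_detector,
    windowed_sentences
])

data = spark.createDataFrame(
    [["The patient was admitted. He was started on aspirin. He was discharged two days later."]]
).toDF("text")

result = pipeline.fit(data).transform(data)
result.selectExpr("explode(window.result) as windowed").show(truncate=False)
```

With a window size of 1, each output row contains a sentence joined with one preceding and one following sentence using the glue string, which is the behaviour described in the WindowedSentenceModel entry above.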