Added new annotator (#811)
Meryem1425 authored Dec 7, 2023
1 parent 693ee12 commit 26fed1d
Showing 8 changed files with 1,724 additions and 26 deletions.
88 changes: 62 additions & 26 deletions docs/_posts/2020-09-23-assertion_dl_healthcare_en.md
@@ -42,40 +42,76 @@ Use as part of an nlp pipeline with the following stages: DocumentAssembler, SentenceDetector, ...
{% include programmingLanguageSelectScalaPythonNLU.html %}

```python
# Imports for the pipeline below (not shown in the original snippet)
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import SentenceDetector, Tokenizer, WordEmbeddingsModel, NerConverter
from sparknlp_jsl.annotator import MedicalNerModel, AssertionDLModel
from pyspark.ml import Pipeline

documentAssembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")

sentenceDetector = SentenceDetector()\
.setInputCols(["document"])\
.setOutputCol("sentence")

tokenizer = Tokenizer()\
.setInputCols(["sentence"])\
.setOutputCol("token")

word_embeddings = WordEmbeddingsModel.pretrained("embeddings_healthcare_100d", "en", "clinical/models")\
.setInputCols(["sentence", "token"])\
.setOutputCol("embeddings")

clinical_ner = MedicalNerModel.pretrained("ner_healthcare", "en", "clinical/models") \
.setInputCols(["sentence", "token", "embeddings"]) \
.setOutputCol("ner")

ner_converter = NerConverter() \
.setInputCols(["sentence", "token", "ner"]) \
.setOutputCol("ner_chunk")

clinical_assertion = AssertionDLModel.pretrained("assertion_dl_healthcare", "en", "clinical/models") \
.setInputCols(["sentence", "ner_chunk", "embeddings"]) \
.setOutputCol("assertion")

nlpPipeline = Pipeline(stages=[
documentAssembler,
sentenceDetector,
tokenizer,
word_embeddings,
clinical_ner,
ner_converter,
clinical_assertion
])

data = spark.createDataFrame([["Patient has a headache for the last 2 weeks and appears anxious when she walks fast. No alopecia noted. She denies pain"]]).toDF("text")

model = nlpPipeline.fit(data)
results = model.transform(data)
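
# Hypothetical inspection step (not in the original snippet): "ner_chunk.result" and
# "assertion.result" are aligned arrays, so selecting both pairs each detected chunk
# with its assertion status.
results.selectExpr("ner_chunk.result as chunks", "assertion.result as assertions").show(truncate=False)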
```

```scala
val documentAssembler = new DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")

val sentenceDetector = new SentenceDetector()
.setInputCols("document")
.setOutputCol("sentence")

val tokenizer = new Tokenizer()
.setInputCols("sentence")
.setOutputCol("token")

val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_healthcare_100d", "en", "clinical/models")
.setInputCols(Array("sentence", "token"))
.setOutputCol("embeddings")

val clinical_ner = MedicalNerModel.pretrained("ner_healthcare", "en", "clinical/models")
.setInputCols(Array("sentence", "token", "embeddings"))
.setOutputCol("ner")

val ner_converter = new NerConverter()
.setInputCols(Array("sentence", "token", "ner"))
.setOutputCol("ner_chunk")

val clinical_assertion = AssertionDLModel.pretrained("assertion_dl_healthcare", "en", "clinical/models")
.setInputCols(Array("sentence", "ner_chunk", "embeddings"))
.setOutputCol("assertion")

val pipeline = new Pipeline().setStages(Array(documentAssembler, sentenceDetector, tokenizer, word_embeddings, clinical_ner, ner_converter, clinical_assertion))
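
// Hypothetical usage (not shown in this diff), mirroring the Python example above.
import spark.implicits._

val data = Seq("Patient has a headache for the last 2 weeks and appears anxious when she walks fast. No alopecia noted. She denies pain").toDF("text")

val result = pipeline.fit(data).transform(data)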

```
229 changes: 229 additions & 0 deletions docs/en/licensed_annotator_entries/DocumentMLClassifier.md
@@ -0,0 +1,229 @@
{%- capture title -%}
DocumentMLClassifier
{%- endcapture -%}

{%- capture approach -%}
approach
{%- endcapture -%}

{%- capture model -%}
model
{%- endcapture -%}

{%- capture model_description -%}
`DocumentMLClassifier` classifies documents with a Logistic Regression algorithm.
{%- endcapture -%}

{%- capture model_input_anno -%}
TOKEN
{%- endcapture -%}

{%- capture model_output_anno -%}
CATEGORY
{%- endcapture -%}

{%- capture model_python_medical -%}

from johnsnowlabs import nlp, medical

document_assembler = nlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")

tokenizer = nlp.Tokenizer()\
.setInputCols("document")\
.setOutputCol("token")

classifier_ml = medical.DocumentMLClassifierModel.pretrained("classifierml_ade", "en", "clinical/models")\
.setInputCols("token")\
.setOutputCol("prediction")

clf_Pipeline = nlp.Pipeline(stages=[
document_assembler,
tokenizer,
classifier_ml])

data = spark.createDataFrame([["""I feel great after taking tylenol."""], ["""Detection of activated eosinophils in nasal polyps of an aspirin-induced asthma patient."""]]).toDF("text")

result = clf_Pipeline.fit(data).transform(data)


# Show results
result.select('text','prediction.result').show(truncate=False)

+----------------------------------------------------------------------------------------+-------+
|text |result |
+----------------------------------------------------------------------------------------+-------+
|Detection of activated eosinophils in nasal polyps of an aspirin-induced asthma patient.|[False]|
|I feel great after taking tylenol. |[False]|
+----------------------------------------------------------------------------------------+-------+

{%- endcapture -%}


{%- capture model_scala_medical -%}

import spark.implicits._

val document_assembler = new DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")

val tokenizer = new Tokenizer()
.setInputCols("document")
.setOutputCol("token")

val classifier_ml = DocumentMLClassifierModel.pretrained("classifierml_ade", "en", "clinical/models")
.setInputCols("token")
.setOutputCol("prediction")

val clf_Pipeline = new Pipeline().setStages(Array(
document_assembler,
tokenizer,
classifier_ml))

val data = Seq(
"I feel great after taking tylenol.",
"Detection of activated eosinophils in nasal polyps of an aspirin-induced asthma patient.").toDF("text")

val result = clf_Pipeline.fit(data).transform(data)

// Show results

+----------------------------------------------------------------------------------------+-------+
|text |result |
+----------------------------------------------------------------------------------------+-------+
|Detection of activated eosinophils in nasal polyps of an aspirin-induced asthma patient.|[False]|
|I feel great after taking tylenol. |[False]|
+----------------------------------------------------------------------------------------+-------+

{%- endcapture -%}


{%- capture model_api_link -%}
[DocumentMLClassifierModel](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/classification/DocumentMLClassifierModel.html)
{%- endcapture -%}

{%- capture model_python_api_link -%}
[DocumentMLClassifierModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/classification/document_ml_classifier/index.html#sparknlp_jsl.annotator.classification.document_ml_classifier.DocumentMLClassifierModel)
{%- endcapture -%}

{%- capture model_notebook_link -%}
[DocumentMLClassifierModelNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/DocumentMLClassifierApproach_DocumentMLClassifierModel.ipynb)
{%- endcapture -%}


{%- capture approach_description -%}

Trains a model to classify documents with a Logistic Regression algorithm. The training data requires a column with the text and a column with its label. The result is a trained `DocumentMLClassifierModel`.

Parameters:

- `labelCol`: (str) Sets the name of the column containing the label to predict.
- `maxIter`: (int) Sets the maximum number of iterations.
- `tol`: (float) Sets the convergence tolerance after each iteration.
- `fitIntercept`: (bool) Sets whether to fit an intercept term (default: true).
- `vectorizationModelPath`: (str) Sets a path to the vectorization model if it has already been trained.
- `classificationModelPath`: (str) Sets a path to the classification model if it has already been trained.
- `classificationModelClass`: (str) Sets the classification model class from Spark ML to use; possible values are `logreg` and `svm`.
- `minTokenNgram`: (int) Sets the minimum number of tokens for n-grams.
- `maxTokenNgram`: (int) Sets the maximum number of tokens for n-grams.
- `mergeChunks`: (bool) Sets whether to merge all chunks in a document (default: false).
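
A minimal configuration sketch (not part of the original entry) showing how these parameters map to setter calls, assuming the standard `set<Param>` naming; the values are illustrative only:

```python
from johnsnowlabs import medical

# Hypothetical configuration; adjust the values to your own training setup.
classifier_logreg = medical.DocumentMLClassifierApproach() \
    .setInputCols("token") \
    .setOutputCol("prediction") \
    .setLabelCol("category") \
    .setClassificationModelClass("logreg") \
    .setFitIntercept(True) \
    .setMaxIter(100) \
    .setTol(1e-6) \
    .setMinTokenNgram(1) \
    .setMaxTokenNgram(2) \
    .setMergeChunks(False)
```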

{%- endcapture -%}

{%- capture approach_input_anno -%}
TOKEN
{%- endcapture -%}

{%- capture approach_output_anno -%}
CATEGORY
{%- endcapture -%}

{%- capture approach_python_medical -%}

from johnsnowlabs import nlp, medical

document = nlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")

token = nlp.Tokenizer()\
.setInputCols("document")\
.setOutputCol("token")

classifier_logreg = medical.DocumentMLClassifierApproach() \
.setInputCols("token") \
.setLabelCol("category") \
.setOutputCol("prediction") \
.setClassificationModelClass("logreg")\
.setFitIntercept(True)

pipeline = nlp.Pipeline(stages=[document, token, classifier_logreg])
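
# train_data / test_data are assumed to be pre-built Spark DataFrames with a "text"
# column and a "category" label column (they are not defined in this snippet).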

result_logreg = pipeline.fit(train_data).transform(test_data).cache()

{%- endcapture -%}


{%- capture approach_scala_medical -%}

import spark.implicits._

val document = new DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")

val token = new Tokenizer()
.setInputCols("document")
.setOutputCol("token")

val classifier_logreg = new DocumentMLClassifierApproach()
.setInputCols("token")
.setLabelCol("category")
.setOutputCol("prediction")
.setClassificationModelClass("logreg")
.setFitIntercept(true)

val pipeline = new Pipeline().setStages(Array(
document,
token,
classifier_logreg))
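
// train_data / test_data are assumed to be pre-built DataFrames with "text" and "category" columns (not defined in this snippet).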

val result_logreg = pipeline.fit(train_data).transform(test_data).cache()
{%- endcapture -%}


{%- capture approach_api_link -%}
[DocumentMLClassifierApproach](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/classification/DocumentMLClassifierApproach.html)
{%- endcapture -%}

{%- capture approach_python_api_link -%}
[DocumentMLClassifierApproach](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/classification/document_ml_classifier/index.html#sparknlp_jsl.annotator.classification.document_ml_classifier.DocumentMLClassifierApproach)
{%- endcapture -%}

{%- capture approach_notebook_link -%}
[DocumentMLClassifierApproachNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/DocumentMLClassifierApproach_DocumentMLClassifierModel.ipynb)
{%- endcapture -%}

{% include templates/licensed_approach_model_medical_fin_leg_template.md
title=title
model=model
approach=approach
model_description=model_description
model_input_anno=model_input_anno
model_output_anno=model_output_anno
model_python_medical=model_python_medical
model_scala_medical=model_scala_medical
model_api_link=model_api_link
model_python_api_link=model_python_api_link
model_notebook_link=model_notebook_link
approach_description=approach_description
approach_input_anno=approach_input_anno
approach_output_anno=approach_output_anno
approach_python_medical=approach_python_medical
approach_scala_medical=approach_scala_medical
approach_api_link=approach_api_link
approach_python_api_link=approach_python_api_link
approach_notebook_link=approach_notebook_link
%}