diff --git a/docs/_posts/2020-09-23-assertion_dl_healthcare_en.md b/docs/_posts/2020-09-23-assertion_dl_healthcare_en.md index 887cfc68d5..7817e2ec03 100644 --- a/docs/_posts/2020-09-23-assertion_dl_healthcare_en.md +++ b/docs/_posts/2020-09-23-assertion_dl_healthcare_en.md @@ -42,40 +42,76 @@ Use as part of an nlp pipeline with the following stages: DocumentAssembler, Sen {% include programmingLanguageSelectScalaPythonNLU.html %} ```python -... -word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ -.setInputCols(["sentence", "token"])\ -.setOutputCol("embeddings") -clinical_ner = NerDLModel.pretrained("ner_clinical", "en", "clinical/models") \ -.setInputCols(["sentence", "token", "embeddings"]) \ -.setOutputCol("ner") -ner_converter = NerConverter() \ -.setInputCols(["sentence", "token", "ner"]) \ -.setOutputCol("ner_chunk") -clinical_assertion = AssertionDLModel.pretrained("assertion_dl_healthcare","en","clinical/models")\ -.setInputCols(["document","ner_chunk","embeddings"])\ -.setOutputCol("assertion") +documentAssembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentenceDetector = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") -nlpPipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, word_embeddings, clinical_ner, ner_converter, clinical_assertion]) +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_healthcare_100d", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +clinical_ner = MedicalNerModel.pretrained("ner_healthcare", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +clinical_assertion = 
AssertionDLModel.pretrained("assertion_dl_healthcare", "en", "clinical/models") \ + .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ + .setOutputCol("assertion") + +nlpPipeline = Pipeline(stages=[ + documentAssembler, + sentenceDetector, + tokenizer, + word_embeddings, + clinical_ner, + ner_converter, + clinical_assertion + ]) model = nlpPipeline.fit(spark.createDataFrame([['Patient has a headache for the last 2 weeks and appears anxious when she walks fast. No alopecia noted. She denies pain']]).toDF("text")) results = model.transform(data) ``` ```scala -... -val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") -.setInputCols(Array("sentence", "token")) -.setOutputCol("embeddings") -val clinical_ner = NerDLModel.pretrained("ner_clinical", "en", "clinical/models") -.setInputCols(Array("sentence", "token", "embeddings")) -.setOutputCol("ner") -val ner_converter = NerConverter() -.setInputCols(Array("sentence", "token", "ner")) -.setOutputCol("ner_chunk") +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetector = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_healthcare_100d", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val clinical_ner = MedicalNerModel.pretrained("ner_healthcare", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + val clinical_assertion = AssertionDLModel.pretrained("assertion_dl_healthcare","en","clinical/models") -.setInputCols("document","ner_chunk","embeddings") -.setOutputCol("assertion") + 
.setInputCols("sentence","ner_chunk","embeddings") + .setOutputCol("assertion") val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, word_embeddings, clinical_ner, ner_converter, clinical_assertion)) diff --git a/docs/en/licensed_annotator_entries/DocumentMLClassifier.md b/docs/en/licensed_annotator_entries/DocumentMLClassifier.md new file mode 100644 index 0000000000..f9c286f141 --- /dev/null +++ b/docs/en/licensed_annotator_entries/DocumentMLClassifier.md @@ -0,0 +1,229 @@ +{%- capture title -%} +DocumentMLClassifier +{%- endcapture -%} + +{%- capture approach -%} +approach +{%- endcapture -%} + +{%- capture model -%} +model +{%- endcapture -%} + +{%- capture model_description -%} +`DocumentMLClassifier` classifies documents with a Logistic Regression algorithm. +{%- endcapture -%} + +{%- capture model_input_anno -%} +TOKEN +{%- endcapture -%} + +{%- capture model_output_anno -%} +CATEGORY +{%- endcapture -%} + +{%- capture model_python_medical -%} + +from johnsnowlabs import nlp, medical + +document_assembler = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer()\ + .setInputCols("document")\ + .setOutputCol("token") + +classifier_ml = medical.DocumentMLClassifierModel.pretrained("classifierml_ade", "en", "clinical/models")\ + .setInputCols("token")\ + .setOutputCol("prediction") + +clf_Pipeline = nlp.Pipeline(stages=[ + document_assembler, + tokenizer, + classifier_ml]) + +data = spark.createDataFrame([["""I feel great after taking tylenol."""], ["""Detection of activated eosinophils in nasal polyps of an aspirin-induced asthma patient."""]]).toDF("text") + +result = clf_Pipeline.fit(data).transform(data) + + +# Show results +result.select('text','prediction.result').show(truncate=False) + ++----------------------------------------------------------------------------------------+-------+ +|text |result | 
++----------------------------------------------------------------------------------------+-------+ +|Detection of activated eosinophils in nasal polyps of an aspirin-induced asthma patient.|[False]| +|I feel great after taking tylenol. |[False]| ++----------------------------------------------------------------------------------------+-------+ + +{%- endcapture -%} + + +{%- capture model_scala_medical -%} + +import spark.implicits._ + +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val classifier_ml = DocumentMLClassifierModel.pretrained("classifierml_ade", "en", "clinical/models") + .setInputCols("token") + .setOutputCol("prediction") + +val clf_Pipeline = new Pipeline().setStages(Array( + document_assembler, + tokenizer, + classifier_ml)) + +val data = Seq( + "I feel great after taking tylenol.", + "Detection of activated eosinophils in nasal polyps of an aspirin-induced asthma patient.").toDF("text") + +val result = clf_Pipeline.fit(data).transform(data) + +// Show results + ++----------------------------------------------------------------------------------------+-------+ +|text |result | ++----------------------------------------------------------------------------------------+-------+ +|Detection of activated eosinophils in nasal polyps of an aspirin-induced asthma patient.|[False]| +|I feel great after taking tylenol. 
|[False]| ++----------------------------------------------------------------------------------------+-------+ + +{%- endcapture -%} + + +{%- capture model_api_link -%} +[DocumentMLClassifierModel](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/classification/DocumentMLClassifierModel.html) +{%- endcapture -%} + +{%- capture model_python_api_link -%} +[DocumentMLClassifierModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/classification/document_ml_classifier/index.html#sparknlp_jsl.annotator.classification.document_ml_classifier.DocumentMLClassifierModel) +{%- endcapture -%} + +{%- capture model_notebook_link -%} +[DocumentMLClassifierModelNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/DocumentMLClassifierApproach_DocumentMLClassifierModel.ipynb) +{%- endcapture -%} + + +{%- capture approach_description -%} + +Trains a model to classify documents with a Logistic Regression algorithm. Training data requires columns for text and their label. The result is a trained DocumentMLClassifierModel. + +Parameters: + +- `labelCol`: (str) Sets the column with the value we are trying to predict. +- `maxIter`: (int) Sets maximum number of iterations. +- `tol`: (float) Sets convergence tolerance after each iteration. +- `fitIntercept`: (boolean) Sets whether to fit an intercept term, default is true. +- `vectorizationModelPath`: (str) Sets a path to the vectorization model if it has been already trained. +- `classificationModelPath`: (str) Sets a path to the classification model if it has been already trained. +- `classificationModelClass`: (str) Sets the classification model class from SparkML to use; possible values are: logreg, svm. +- `minTokenNgram`: (int) Sets minimum number of tokens for Ngrams. +- `maxTokenNgram`: (int) Sets maximum number of tokens for Ngrams. 
+- `mergeChunks`: (boolean) whether to merge all chunks in a document or not (Default: false) + +{%- endcapture -%} + +{%- capture approach_input_anno -%} +TOKEN +{%- endcapture -%} + +{%- capture approach_output_anno -%} +CATEGORY +{%- endcapture -%} + +{%- capture approach_python_medical -%} + +from johnsnowlabs import nlp, medical + +document = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +token = nlp.Tokenizer()\ + .setInputCols("document")\ + .setOutputCol("token") + +classifier_logreg = medical.DocumentMLClassifierApproach() \ + .setInputCols("token") \ + .setLabelCol("category") \ + .setOutputCol("prediction") \ + .setClassificationModelClass("logreg")\ + .setFitIntercept(True) + +pipeline = nlp.Pipeline(stages=[document, token, classifier_logreg]) + +result_logreg = pipeline.fit(train_data).transform(test_data).cache() + +{%- endcapture -%} + + +{%- capture approach_scala_medical -%} + +import spark.implicits._ + +val document = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val token = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val classifier_logreg = new DocumentMLClassifierApproach() + .setInputCols("token") + .setLabelCol("category") + .setOutputCol("prediction") + .setClassificationModelClass("logreg") + .setFitIntercept(true) + +val pipeline = new Pipeline().setStages(Array( + document, + token, + classifier_logreg)) + +val result_logreg = pipeline.fit(train_data).transform(test_data).cache() +{%- endcapture -%} + + +{%- capture approach_api_link -%} +[DocumentMLClassifierApproach](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/classification/DocumentMLClassifierApproach.html) +{%- endcapture -%} + +{%- capture approach_python_api_link -%} 
+[DocumentMLClassifierApproach](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/classification/document_ml_classifier/index.html#sparknlp_jsl.annotator.classification.document_ml_classifier.DocumentMLClassifierApproach) +{%- endcapture -%} + +{%- capture approach_notebook_link -%} +[DocumentMLClassifierApproachNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/DocumentMLClassifierApproach_DocumentMLClassifierModel.ipynb) +{%- endcapture -%} + +{% include templates/licensed_approach_model_medical_fin_leg_template.md +title=title +model=model +approach=approach +model_description=model_description +model_input_anno=model_input_anno +model_output_anno=model_output_anno +model_python_medical=model_python_medical +model_scala_medical=model_scala_medical +model_api_link=model_api_link +model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link +approach_description=approach_description +approach_input_anno=approach_input_anno +approach_output_anno=approach_output_anno +approach_python_medical=approach_python_medical +approach_scala_medical=approach_scala_medical +approach_api_link=approach_api_link +approach_python_api_link=approach_python_api_link +approach_notebook_link=approach_notebook_link +%} diff --git a/docs/en/licensed_annotator_entries/FewShotClassifier.md b/docs/en/licensed_annotator_entries/FewShotClassifier.md new file mode 100644 index 0000000000..0b43d39a49 --- /dev/null +++ b/docs/en/licensed_annotator_entries/FewShotClassifier.md @@ -0,0 +1,240 @@ +{%- capture title -%} +FewShotClassifier +{%- endcapture -%} + +{%- capture approach -%} +approach +{%- endcapture -%} + +{%- capture model -%} +model +{%- endcapture -%} + +{%- capture model_description -%} +`FewShotClassifier` annotators specifically target few-shot classification tasks, which involve training a model to make accurate predictions with limited labeled 
data. + +These annotators provide a valuable capability for handling scenarios where labeled data is scarce or expensive to obtain. By effectively utilizing limited labeled examples, the few-shot classification approach enables the creation of models that can generalize and classify new instances accurately, even with minimal training data. + +The FewShotClassifier is designed to process sentence embeddings as input. It generates category annotations, providing labels along with confidence scores that range from 0 to 1. +{%- endcapture -%} + +{%- capture model_input_anno -%} +SENTENCE EMBEDDINGS +{%- endcapture -%} + +{%- capture model_output_anno -%} +CATEGORY +{%- endcapture -%} + +{%- capture model_python_medical -%} + +from johnsnowlabs import nlp, medical + +document_assembler = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +bert_sent = nlp.BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence_embeddings") + +few_shot_classifier = medical.FewShotClassifierModel.pretrained("few_shot_classifier_age_group_sbiobert_cased_mli", "en", "clinical/models")\ + .setInputCols(["sentence_embeddings"])\ + .setOutputCol("prediction") + +clf_Pipeline = nlp.Pipeline(stages=[ + document_assembler, + bert_sent, + few_shot_classifier +]) + +data = spark.createDataFrame([ + ["""A patient presented with complaints of chest pain and shortness of breath. The medical history revealed the patient had a smoking habit for over 30 years, and was diagnosed with hypertension two years ago. After a detailed physical examination, the doctor found a noticeable wheeze on lung auscultation and prescribed a spirometry test, which showed irreversible airway obstruction. The patient was diagnosed with Chronic obstructive pulmonary disease (COPD) caused by smoking."""], + ["""Hi, wondering if anyone has had a similar situation. 
My 1 year old daughter has the following; loose stools/ pale stools, elevated liver enzymes, low iron. 5 months and still no answers from drs. """], + ["""Hi have chronic gastritis from 4 month(confirmed by endoscopy).I do not have acid reflux.Only dull ache above abdomen and left side of chest.I am on reberprozole and librax.My question is whether chronic gastritis is curable or is it a lifetime condition?I am loosing hope because this dull ache is not going away.Please please reply"""] + ]).toDF("text") + +result = clf_Pipeline.fit(data).transform(data) + + +# Show results +result.select('prediction.result','text').show(truncate=150) + ++---------+------------------------------------------------------------------------------------------------------------------------------------------------------+ +| result| text| ++---------+------------------------------------------------------------------------------------------------------------------------------------------------------+ +| [Adult]|A patient presented with complaints of chest pain and shortness of breath. The medical history revealed the patient had a smoking habit for over 30...| +| [Child]|Hi, wondering if anyone has had a similar situation. 
My 1 year old daughter has the following; loose stools/ pale stools, elevated liver enzymes, l...| +|[Unknown]|Hi have chronic gastritis from 4 month(confirmed by endoscopy).I do not have acid reflux.Only dull ache above abdomen and left side of chest.I am o...| ++---------+------------------------------------------------------------------------------------------------------------------------------------------------------+ + +{%- endcapture -%} + + +{%- capture model_scala_medical -%} + +import spark.implicits._ + +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val bert_sent = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence_embeddings") + +val few_shot_classifier = FewShotClassifierModel.pretrained("few_shot_classifier_age_group_sbiobert_cased_mli", "en", "clinical/models") + .setInputCols("sentence_embeddings") + .setOutputCol("prediction") + +val clf_Pipeline = new Pipeline().setStages(Array( + document_assembler, + bert_sent, + few_shot_classifier)) + +val data = Seq( + ("""A patient presented with complaints of chest pain and shortness of breath. The medical history revealed the patient had a smoking habit for over 30 years, and was diagnosed with hypertension two years ago. After a detailed physical examination, the doctor found a noticeable wheeze on lung auscultation and prescribed a spirometry test, which showed irreversible airway obstruction. The patient was diagnosed with Chronic obstructive pulmonary disease (COPD) caused by smoking."""), + ("""Hi, wondering if anyone has had a similar situation. My 1 year old daughter has the following; loose stools/ pale stools, elevated liver enzymes, low iron. 5 months and still no answers from drs. 
"""), + ("""Hi have chronic gastritis from 4 month(confirmed by endoscopy).I do not have acid reflux.Only dull ache above abdomen and left side of chest.I am on reberprozole and librax.My question is whether chronic gastritis is curable or is it a lifetime condition?I am loosing hope because this dull ache is not going away.Please please reply""")).toDF("text") + +val result = clf_Pipeline.fit(data).transform(data) + +// Show results + + ++---------+------------------------------------------------------------------------------------------------------------------------------------------------------+ +| result| text| ++---------+------------------------------------------------------------------------------------------------------------------------------------------------------+ +| [Adult]|A patient presented with complaints of chest pain and shortness of breath. The medical history revealed the patient had a smoking habit for over 30...| +| [Child]|Hi, wondering if anyone has had a similar situation. 
My 1 year old daughter has the following; loose stools/ pale stools, elevated liver enzymes, l...| +|[Unknown]|Hi have chronic gastritis from 4 month(confirmed by endoscopy).I do not have acid reflux.Only dull ache above abdomen and left side of chest.I am o...| ++---------+------------------------------------------------------------------------------------------------------------------------------------------------------+ + +{%- endcapture -%} + + +{%- capture model_api_link -%} +[FewShotClassifierModel](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/classification/FewShotClassifierModel.html) +{%- endcapture -%} + +{%- capture model_python_api_link -%} +[FewShotClassifierModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/classification/few_shot_classifier/index.html#sparknlp_jsl.annotator.classification.few_shot_classifier.FewShotClassifierModel) +{%- endcapture -%} + + + +{%- capture approach_description -%} +`FewShotClassifier` annotators specifically target few-shot classification tasks, which involve training a model to make accurate predictions with limited labeled data. + +These annotators provide a valuable capability for handling scenarios where labeled data is scarce or expensive to obtain. By effectively utilizing limited labeled examples, the few-shot classification approach enables the creation of models that can generalize and classify new instances accurately, even with minimal training data. + +The FewShotClassifier is designed to process sentence embeddings as input. It generates category annotations, providing labels along with confidence scores that range from 0 to 1. 
+{%- endcapture -%} + +{%- capture approach_input_anno -%} +SENTENCE EMBEDDINGS +{%- endcapture -%} + +{%- capture approach_output_anno -%} +CATEGORY +{%- endcapture -%} + +{%- capture approach_python_medical -%} + +from johnsnowlabs import nlp, medical + +document_asm = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("sentence") + +sentence_embeddings = nlp.BertSentenceEmbeddings\ +.pretrained("sbiobert_base_cased_mli","en","clinical/models")\ + .setInputCols(["sentence"])\ + .setOutputCol("sentence_embeddings") + +graph_builder = medical.TFGraphBuilder()\ + .setModelName("fewshot_classifier")\ + .setInputCols(["sentence_embeddings"]) \ + .setLabelColumn("label")\ + .setGraphFolder("/tmp")\ + .setGraphFile("log_reg_graph.pb") + +few_shot_approach = medical.FewShotClassifierApproach()\ + .setLabelColumn("label")\ + .setInputCols(["sentence_embeddings"])\ + .setOutputCol("prediction")\ + .setModelFile("/tmp/log_reg_graph.pb")\ + .setEpochsNumber(10)\ + .setBatchSize(1)\ + .setLearningRate(0.001) + +pipeline = nlp.Pipeline( + stages=[ + document_asm, + sentence_embeddings, + graph_builder, + few_shot_approach + ]) + +model = pipeline.fit(train_data) +{%- endcapture -%} + + +{%- capture approach_scala_medical -%} + +import spark.implicits._ + +val document_asm = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("sentence") + +val sentence_embeddings = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models") + .setInputCols("sentence") + .setOutputCol("sentence_embeddings") + +val few_shot_approach = new FewShotClassifierApproach() + .setLabelColumn("label") + .setInputCols("sentence_embeddings") + .setOutputCol("prediction") + .setModelFile("/tmp/log_reg_graph.pb") + .setEpochsNumber(10) + .setBatchSize(1) + .setLearningRate(0.001) + +val pipeline = new Pipeline().setStages(Array( + document_asm, + sentence_embeddings, + few_shot_approach )) + +val result = pipeline.fit(train_data).transform(test_data).cache() 
+{%- endcapture -%} + + +{%- capture approach_api_link -%} +[FewShotClassifierApproach](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/classification/FewShotClassifierApproach.html) +{%- endcapture -%} + +{%- capture approach_python_api_link -%} +[FewShotClassifierApproach](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/classification/few_shot_classifier/index.html#sparknlp_jsl.annotator.classification.few_shot_classifier.FewShotClassifierApproach) +{%- endcapture -%} + + +{% include templates/licensed_approach_model_medical_fin_leg_template.md +title=title +model=model +approach=approach +model_description=model_description +model_input_anno=model_input_anno +model_output_anno=model_output_anno +model_python_medical=model_python_medical +model_scala_medical=model_scala_medical +model_api_link=model_api_link +model_python_api_link=model_python_api_link +approach_description=approach_description +approach_input_anno=approach_input_anno +approach_output_anno=approach_output_anno +approach_python_medical=approach_python_medical +approach_scala_medical=approach_scala_medical +approach_api_link=approach_api_link +approach_python_api_link=approach_python_api_link +%} diff --git a/docs/en/licensed_annotator_entries/InternalDocumentSplitter.md b/docs/en/licensed_annotator_entries/InternalDocumentSplitter.md new file mode 100644 index 0000000000..be86e2cf1d --- /dev/null +++ b/docs/en/licensed_annotator_entries/InternalDocumentSplitter.md @@ -0,0 +1,179 @@ +{%- capture title -%} +InternalDocumentSplitter +{%- endcapture -%} + +{%- capture model -%} +model +{%- endcapture -%} + +{%- capture model_description -%} +`InternalDocumentSplitter` splits large documents into small documents. `InternalDocumentSplitter` has setSplitMode method to decide how to split documents. 
+ +If splitMode is `recursive`, it takes the separators in order and splits subtexts if they are over the chunk length, considering optional overlap of the chunks. + +Additionally, you can set +- custom patterns with setSplitPatterns +- whether patterns should be interpreted as regex with setPatternsAreRegex +- whether to keep the separators with setKeepSeparators +- whether to trim whitespaces with setTrimWhitespace +- whether to explode the splits to individual rows with setExplodeSplits + +Parameters: + +- `chunkSize`: Size of each chunk of text. This param is applicable only for "recursive" splitMode. +- `chunkOverlap`: Length of the overlap between text chunks, by default `0`. This param is applicable only for `recursive` splitMode. +- `splitPatterns`: Patterns to split the document. +- `patternsAreRegex`: Whether to interpret the split patterns as regular expressions, by default `True`. +- `keepSeparators`: Whether to keep the separators in the final result, by default `True`. This param is applicable only for "recursive" splitMode. +- `explodeSplits`: Whether to explode split chunks to separate rows, by default `False`. +- `trimWhitespace`: Whether to trim whitespaces of extracted chunks, by default `True`. +- `splitMode`: The split mode to determine how text should be segmented. Default: 'regex'. It should be one of the following values: + - "char": Split text based on individual characters. + - "token": Split text based on tokens. You should supply tokens from inputCols. + - "sentence": Split text based on sentences. You should supply sentences from inputCols. + - "recursive": Split text recursively using a specific algorithm. + - "regex": Split text based on a regular expression pattern. +- `sentenceAwareness`: Whether to split the document by sentence awareness if possible. + - If true, it can stop the split process before maxLength. + - If true, you should supply sentences from inputCols. Default: False. 
+ - This param is not applicable for `regex` and `recursive` splitMode. +- `maxLength`: The maximum length allowed for splitting. The unit in which the maximum length is measured depends on splitMode: + - "char": Maximum length is measured in characters. Default: `512` + - "token": Maximum length is measured in tokens. Default: `128` + - "sentence": Maximum length is measured in sentences. Default: `8` +- `customBoundsStrategy`: The custom bounds strategy for text splitting using regular expressions. This param is applicable only for `regex` splitMode. +- `caseSensitive`: Whether regex matching is case sensitive, by default `False`. This param is applicable only for `regex` splitMode. +- `metaDataFields`: Names of the columns whose values should be added to the metadata of the split documents. + +{%- endcapture -%} + +{%- capture model_input_anno -%} +DOCUMENT +{%- endcapture -%} + +{%- capture model_output_anno -%} +DOCUMENT +{%- endcapture -%} + +{%- capture model_python_medical -%} + +from johnsnowlabs import nlp, medical + +document_assembler = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +document_splitter = medical.InternalDocumentSplitter()\ + .setInputCols("document")\ + .setOutputCol("splits")\ + .setSplitMode("recursive")\ + .setChunkSize(100)\ + .setChunkOverlap(3)\ + .setExplodeSplits(True)\ + .setPatternsAreRegex(False)\ + .setSplitPatterns(["\n\n", "\n", " "])\ + .setKeepSeparators(False)\ + .setTrimWhitespace(True) + +pipeline = nlp.Pipeline().setStages([ + document_assembler, + document_splitter +]) + +df = spark.createDataFrame([[( + "The patient is a 28-year-old, who is status post gastric bypass surgery" + " nearly one year ago. \nHe has lost about 200 pounds and was otherwise doing well" + " until yesterday evening around 7:00-8:00 when he developed nausea and right upper quadrant pain," + " which apparently wrapped around toward his right side and back. 
He feels like he was on it" + " but has not done so. He has overall malaise and a low-grade temperature of 100.3." + " \n\nHe denies any prior similar or lesser symptoms. His last normal bowel movement was yesterday." + " He denies any outright chills or blood per rectum." +)]]).toDF("text") + + +pipeline_df = pipeline.fit(df).transform(df).select("splits").show(truncate=False) + +## Result + ++---------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|splits | ++---------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|[{document, 0, 92, The patient is a 28-year-old, who is status post gastric bypass surgery nearly one year ago., {sentence -> 0, document -> 0}, []}] | +|[{document, 94, 192, He has lost about 200 pounds and was otherwise doing well until yesterday evening around 7:00-8:00, {sentence -> 0, document -> 1}, []}] | +|[{document, 193, 291, when he developed nausea and right upper quadrant pain, which apparently wrapped around toward his, {sentence -> 0, document -> 2}, []}] | +|[{document, 288, 387, his right side and back. He feels like he was on it but has not done so. He has overall malaise and, {sentence -> 0, document -> 3}, []}]| +|[{document, 384, 421, and a low-grade temperature of 100.3., {sentence -> 0, document -> 4}, []}] | +|[{document, 424, 520, He denies any prior similar or lesser symptoms. His last normal bowel movement was yesterday. 
He, {sentence -> 0, document -> 5}, []}] | +|[{document, 518, 568, He denies any outright chills or blood per rectum., {sentence -> 0, document -> 6}, []}] | ++---------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +{%- endcapture -%} + + +{%- capture model_scala_medical -%} + +import spark.implicits._ + +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val document_splitter = new InternalDocumentSplitter() + .setInputCols("document") + .setOutputCol("splits") + .setSplitMode("recursive") + .setChunkSize(100) + .setChunkOverlap(3) + .setExplodeSplits(true) + .setPatternsAreRegex(false) + .setSplitPatterns(Array("\n\n", "\n", " ")) + .setKeepSeparators(false) + .setTrimWhitespace(true) + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + document_splitter )) + + +val test_data = Seq( "The patient is a 28-year-old, who is status post gastric bypass surgery" + " nearly one year ago. \nHe has lost about 200 pounds and was otherwise doing well" + " until yesterday evening around 7:00-8:00 when he developed nausea and right upper quadrant pain," + " which apparently wrapped around toward his right side and back. He feels like he was on it" + " but has not done so. He has overall malaise and a low-grade temperature of 100.3." + " \n\nHe denies any prior similar or lesser symptoms. His last normal bowel movement was yesterday." 
+ " He denies any outright chills or blood per rectum.").toDF("text") + +val res = pipeline.fit(test_data).transform(test_data) + +// Show results + ++---------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|splits | ++---------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|[{document, 0, 92, The patient is a 28-year-old, who is status post gastric bypass surgery nearly one year ago., {sentence -> 0, document -> 0}, []}] | +|[{document, 94, 192, He has lost about 200 pounds and was otherwise doing well until yesterday evening around 7:00-8:00, {sentence -> 0, document -> 1}, []}] | +|[{document, 193, 291, when he developed nausea and right upper quadrant pain, which apparently wrapped around toward his, {sentence -> 0, document -> 2}, []}] | +|[{document, 288, 387, his right side and back. He feels like he was on it but has not done so. He has overall malaise and, {sentence -> 0, document -> 3}, []}]| +|[{document, 384, 421, and a low-grade temperature of 100.3., {sentence -> 0, document -> 4}, []}] | +|[{document, 424, 520, He denies any prior similar or lesser symptoms. His last normal bowel movement was yesterday. 
He, {sentence -> 0, document -> 5}, []}] | +|[{document, 518, 568, He denies any outright chills or blood per rectum., {sentence -> 0, document -> 6}, []}] | ++---------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +{%- endcapture -%} + + +{%- capture model_notebook_link -%} +[InternalDocumentSplitterNotebook](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/38.InternalDocumentSplitter.ipynb) +{%- endcapture -%} + + +{% include templates/licensed_approach_model_medical_fin_leg_template.md +title=title +model=model +model_description=model_description +model_input_anno=model_input_anno +model_output_anno=model_output_anno +model_python_medical=model_python_medical +model_scala_medical=model_scala_medical +model_notebook_link=model_notebook_link +%} diff --git a/docs/en/licensed_annotator_entries/NerQuestionGenerator.md b/docs/en/licensed_annotator_entries/NerQuestionGenerator.md new file mode 100644 index 0000000000..95b8f94c9f --- /dev/null +++ b/docs/en/licensed_annotator_entries/NerQuestionGenerator.md @@ -0,0 +1,186 @@ +{%- capture title -%} +NerQuestionGenerator +{%- endcapture -%} + +{%- capture model -%} +model +{%- endcapture -%} + +{%- capture model_description -%} + +`NerQuestionGenerator` takes an NER chunk (obtained by, e.g., `NerConverterInternal`) and generates a question based on two entity types, a pronoun and a strategy. + +The question is generated in the form of `[QUESTIONPRONOUN] [ENTITY1] [ENTITY2] [QUESTIONMARK]`. The generated question can be used by `QuestionAnswerer` or `ZeroShotNer` annotators to answer the question or find NER entities. + +Parameters: + +- `questionPronoun`: Pronoun to be used in the question. E.g., 'When', 'Where', 'Why', 'How', 'Who', 'What'. +- `strategyType`: Strategy for the process, either `Paired` (default) or `Combined`.
+- `questionMark`: Whether to add a question mark at the end of the question. +- `entities1`: List with the entity types of entities that appear first in the question. +- `entities2`: List with the entity types of entities that appear second in the question. + + +All the parameters can be set using the corresponding set method in camel case. For example, `.setQuestionMark(True)`. +{%- endcapture -%} + +{%- capture model_input_anno -%} +CHUNK +{%- endcapture -%} + +{%- capture model_output_anno -%} +DOCUMENT +{%- endcapture -%} + +{%- capture model_python_medical -%} + +from johnsnowlabs import nlp, medical +import json + +entities = [ + { + "label": "Person", + "patterns": ["Jon", "John", "John's"] + }, + { + "label": "Organization", + "patterns": ["St. Mary's Hospital", "St. Mary's"] + }, + { + "label": "Condition", + "patterns": ["vital signs", "heartbeat", "oxygen saturation levels"] + } +] + +with open('./entities.json', 'w') as jsonfile: + json.dump(entities, jsonfile) + + +document_assembler = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +entity_ruler = nlp.EntityRulerApproach() \ + .setInputCols(["document"]) \ + .setOutputCol("entity") \ + .setPatternsResource("./entities.json")\ + .setCaseSensitive(False) + +qagenerator = medical.NerQuestionGenerator()\ + .setInputCols(["entity"])\ + .setOutputCol("question")\ + .setQuestionPronoun("How is")\ + .setEntities1(["Person"])\ + .setEntities2(["Condition"])\ + .setStrategyType("Paired")\ + .setQuestionMark(True) + +prep_pipeline = nlp.Pipeline(stages=[ + document_assembler, + entity_ruler, + qagenerator +]) + +example_text = """At St. Mary's Hospital, the healthcare team closely monitored John's vital signs with unwavering attention. They recorded his heartbeat and oxygen saturation levels, promptly addressing any deviations from normal. Their dedication and expertise at St.
Mary's played a vital role in ensuring John's stability and fostering a swift recovery.""" + +df = spark.createDataFrame([[example_text]]).toDF("text") + +result = prep_pipeline.fit(df).transform(df) + +result.select("question").show(truncate=False) + +## Result + ++--------------------------------------------------------------------------------------------------------------------------------------------+ +|question | ++--------------------------------------------------------------------------------------------------------------------------------------------+ +|[{document, 62, 79, How is John's vital signs ?, {sentence -> 0}, []}, {document, 291, 134, How is John's heartbeat ?, {sentence -> 0}, []}]| ++--------------------------------------------------------------------------------------------------------------------------------------------+ + +{%- endcapture -%} + + +{%- capture model_scala_medical -%} + +import spark.implicits._ + +/* entities.json file +entities = [ + { + "label": "Person", + "patterns": ["Jon", "John", "John's"] + }, + { + "label": "Organization", + "patterns": ["St. Mary's Hospital", "St. Mary's"] + }, + { + "label": "Condition", + "patterns": ["vital signs", "heartbeat", "oxygen saturation levels"] + } +] +*/ + +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val entity_ruler = new EntityRulerApproach() + .setInputCols("document") + .setOutputCol("entity") + .setPatternsResource("./entities.json") + .setCaseSensitive(false) + +val qagenerator = new NerQuestionGenerator() + .setInputCols("entity") + .setOutputCol("question") + .setQuestionPronoun("How is") + .setEntities1("Person") + .setEntities2("Condition") + .setStrategyType("Paired") + .setQuestionMark(true) + +val prep_pipeline = new Pipeline().setStages(Array( + document_assembler, + entity_ruler, + qagenerator )) + +val test_data = Seq("""At St. 
Mary's Hospital, the healthcare team closely monitored John's vital signs with unwavering attention. They recorded his heartbeat and oxygen saturation levels, promptly addressing any deviations from normal. Their dedication and expertise at St. Mary's played a vital role in ensuring John's stability and fostering a swift recovery.""").toDF("text") + +val res = prep_pipeline.fit(test_data).transform(test_data) + +// Show results + ++--------------------------------------------------------------------------------------------------------------------------------------------+ +|question | ++--------------------------------------------------------------------------------------------------------------------------------------------+ +|[{document, 62, 79, How is John's vital signs ?, {sentence -> 0}, []}, {document, 291, 134, How is John's heartbeat ?, {sentence -> 0}, []}]| ++--------------------------------------------------------------------------------------------------------------------------------------------+ + +{%- endcapture -%} + +{%- capture model_api_link -%} +[NerQuestionGenerator](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/qa/NerQuestionGenerator.html) +{%- endcapture -%} + +{%- capture model_python_api_link -%} +[NerQuestionGenerator](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/qa/qa_ner_generator/index.html#sparknlp_jsl.annotator.qa.qa_ner_generator.NerQuestionGenerator) +{%- endcapture -%} + +{%- capture model_notebook_link -%} +[NerQuestionGeneratorNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/NerQuestionGenerator.ipynb) +{%- endcapture -%} + + +{% include templates/licensed_approach_model_medical_fin_leg_template.md +title=title +model=model +model_description=model_description +model_input_anno=model_input_anno +model_output_anno=model_output_anno +model_python_medical=model_python_medical
+model_scala_medical=model_scala_medical +model_api_link=model_api_link +model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link +%} diff --git a/docs/en/licensed_annotator_entries/Replacer.md b/docs/en/licensed_annotator_entries/Replacer.md new file mode 100644 index 0000000000..8995d82608 --- /dev/null +++ b/docs/en/licensed_annotator_entries/Replacer.md @@ -0,0 +1,271 @@ +{%- capture title -%} +Replacer +{%- endcapture -%} + +{%- capture model -%} +model +{%- endcapture -%} + +{%- capture model_description -%} +`Replacer` replaces entities in the original text with the ones extracted by the annotators `NameChunkObfuscatorApproach` or `DateNormalizer`. + +`Replacer` is most often used in conjunction with the `DateNormalizer` annotator or in deidentification pipelines. + +With the dates, the `Replacer` annotator is used to replace specific tokens in a text with another token or string. The `DateNormalizer` annotator, on the other hand, is used to normalize dates and times to a standardized format. + +Obfuscation in healthcare is the act of making healthcare data difficult to understand or use without authorization. This can be done by replacing or removing identifying information, such as names, dates of birth, and Social Security numbers. Obfuscation can also be used to hide the contents of healthcare records, such as diagnoses, medications, and treatment plans. + +In the **deidentification** process, the `Replacer` annotator is used to replace certain tokens or patterns in the text with specified values. For example, it can be used to replace all instances of a person's name with a placeholder like "PERSON". + +The `NameChunkObfuscatorApproach` annotator is used to identify and obfuscate sensitive named entities in the text, such as people's names, addresses, dates of birth, SSNs, etc. + +Parameters: + +- `setUseReplacement`: (Boolean) Select what output format should be used. By default it will use the current day.
+ +{%- endcapture -%} + +{%- capture model_input_anno -%} +DOCUMENT, CHUNK +{%- endcapture -%} + +{%- capture model_output_anno -%} +DOCUMENT +{%- endcapture -%} + +{%- capture model_python_medical -%} + +from johnsnowlabs import nlp, medical + +names = """Mitchell#NAME +Clifford#NAME +Jeremiah#NAME +Lawrence#NAME +Brittany#NAME +Patricia#NAME +Samantha#NAME +Jennifer#NAME +Jackson#NAME +Leonard#NAME +Randall#NAME +Camacho#NAME +Ferrell#NAME +Mueller#NAME +Bowman#NAME +Hansen#NAME +Acosta#NAME +Gillespie#NAME +Zimmerman#NAME +Gillespie#NAME +Chandler#NAME +Bradshaw#NAME +Ferguson#NAME +Jacobson#NAME +Figueroa#NAME +Chandler#NAME +Schaefer#NAME +Matthews#NAME +Ferguson#NAME +Bradshaw#NAME +Figueroa#NAME +Delacruz#NAME +Gallegos#NAME +Villarreal#NAME +Williamson#NAME +Montgomery#NAME +Mclaughlin#NAME +Blankenship#NAME +Fitzpatrick#NAME +""" + +with open('names_test.txt', 'w') as file: + file.write(names) + + +# Annotator that transforms a text column from dataframe into an Annotation ready for NLP +documentAssembler = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("sentence")\ + +# Tokenizer splits words in a relevant format for NLP +tokenizer = nlp.Tokenizer()\ + .setInputCols("sentence")\ + .setOutputCol("token")\ + +# Clinical word embeddings trained on PubMED dataset +word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +# NER model trained on n2c2 (de-identification and Heart Disease Risk Factors Challenge) datasets) +clinical_ner = medical.NerModel.pretrained("ner_deid_generic_augmented", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter_name = medical.NerConverterInternal()\ + .setInputCols(["sentence","token","ner"])\ + .setOutputCol("ner_chunk") + +nameChunkObfuscator = medical.NameChunkObfuscatorApproach()\ + .setInputCols("ner_chunk")\ + 
.setOutputCol("replacement")\ + .setRefFileFormat("csv")\ + .setObfuscateRefFile("names_test.txt")\ + .setRefSep("#")\ + +replacer_name = medical.Replacer()\ + .setInputCols("replacement","sentence")\ + .setOutputCol("obfuscated_document_name")\ + .setUseReplacement(True) + +nlpPipeline = nlp.Pipeline(stages=[ + documentAssembler, + tokenizer, + word_embeddings, + clinical_ner, + ner_converter_name, + nameChunkObfuscator, + replacer_name + ]) + +sample_text = "John Davies is a 62 y.o. patient admitted. Mr. Davies was seen by attending physician Dr. Lorand and was scheduled for emergency assessment." + +data = spark.createDataFrame([[sample_text]]).toDF("text") +result = nlpPipeline.fit(data).transform(data) + +## Result + +Original text. : John Davies is a 62 y.o. patient admitted. Mr. Davies was seen by attending physician Dr. Lorand and was scheduled for emergency assessment. + +Obfuscated text : Joseeduardo is a 62 y.o. patient admitted. Mr. Teigan was seen by attending physician Dr. Mayson and was scheduled for emergency assessment. 
+ +{%- endcapture -%} + + +{%- capture model_scala_medical -%} + +import spark.implicits._ + +/* names_test.txt file + +names = """Mitchell#NAME +Clifford#NAME +Jeremiah#NAME +Lawrence#NAME +Brittany#NAME +Patricia#NAME +Samantha#NAME +Jennifer#NAME +Jackson#NAME +Leonard#NAME +Randall#NAME +Camacho#NAME +Ferrell#NAME +Mueller#NAME +Bowman#NAME +Hansen#NAME +Acosta#NAME +Gillespie#NAME +Zimmerman#NAME +Gillespie#NAME +Chandler#NAME +Bradshaw#NAME +Ferguson#NAME +Jacobson#NAME +Figueroa#NAME +Chandler#NAME +Schaefer#NAME +Matthews#NAME +Ferguson#NAME +Bradshaw#NAME +Figueroa#NAME +Delacruz#NAME +Gallegos#NAME +Villarreal#NAME +Williamson#NAME +Montgomery#NAME +Mclaughlin#NAME +Blankenship#NAME +Fitzpatrick#NAME +""" +*/ + +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models") + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") + +val clinical_ner = MedicalNerModel.pretrained("ner_deid_generic_augmented","en","clinical/models") + .setInputCols(Array("sentence","token","embeddings")) + .setOutputCol("ner") + +val ner_converter_name = new NerConverterInternal() + .setInputCols(Array("sentence","token","ner")) + .setOutputCol("ner_chunk") + +val nameChunkObfuscator = new NameChunkObfuscatorApproach() + .setInputCols("ner_chunk") + .setOutputCol("replacement") + .setRefFileFormat("csv") + .setObfuscateRefFile("names_test.txt") + .setRefSep("#") + +val replacer_name = new Replacer() + .setInputCols("replacement","sentence") + .setOutputCol("obfuscated_document_name") + .setUseReplacement(true) + +val nlpPipeline = new Pipeline().setStages(Array( + documentAssembler, + tokenizer, + word_embeddings, + clinical_ner, + ner_converter_name, + nameChunkObfuscator, + replacer_name)) + + +val test_data = Seq("""John Davies is a 62
y.o. patient admitted. Mr. Davies was seen by attending physician Dr. Lorand and was scheduled for emergency assessment.""").toDF("text") + +val res = nlpPipeline.fit(test_data).transform(test_data) + +// Show results + +Original text. : John Davies is a 62 y.o. patient admitted. Mr. Davies was seen by attending physician Dr. Lorand and was scheduled for emergency assessment. + +Obfuscated text : Joseeduardo is a 62 y.o. patient admitted. Mr. Teigan was seen by attending physician Dr. Mayson and was scheduled for emergency assessment. + +{%- endcapture -%} + +{%- capture model_api_link -%} +[Replacer](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/deid/Replacer.html) +{%- endcapture -%} + +{%- capture model_python_api_link -%} +[Replacer](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/deid/replacer/index.html#sparknlp_jsl.annotator.deid.replacer.Replacer) +{%- endcapture -%} + +{%- capture model_notebook_link -%} +[ReplacerNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/Replacer.ipynb) +{%- endcapture -%} + + +{% include templates/licensed_approach_model_medical_fin_leg_template.md +title=title +model=model +model_description=model_description +model_input_anno=model_input_anno +model_output_anno=model_output_anno +model_python_medical=model_python_medical +model_scala_medical=model_scala_medical +model_api_link=model_api_link +model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link +%} diff --git a/docs/en/licensed_annotator_entries/ResolverMerger.md b/docs/en/licensed_annotator_entries/ResolverMerger.md new file mode 100644 index 0000000000..3f42ee3dc4 --- /dev/null +++ b/docs/en/licensed_annotator_entries/ResolverMerger.md @@ -0,0 +1,249 @@ +{%- capture title -%} +ResolverMerger +{%- endcapture -%} + +{%- capture model -%} +model +{%- endcapture -%} + +{%- capture model_description -%}
+`ResolverMerger` provides the ability to merge sentence entity resolver and chunk mapper model output columns. + +To convert a sentence or document into a vector for tasks like semantic search or recommendation systems, a common approach is to utilize transformer models like BERT. These models provide embeddings for each token in the text. One option is to extract the embedding vector of the CLS token, which represents the overall meaning of the text. Another option is to average the embeddings of all tokens. + +Alternatively, we can use fine-tuned Siamese network variants like SBERT, which are specifically designed to generate embeddings that bring similar sentences or documents closer together in the embedding space while separating dissimilar ones. These embeddings can be applied in "Sentence Entity Resolver Models" to perform entity mapping. + +However, for a more straightforward approach, we can use a chunk mapper method to extract entities from the text. In addition, by combining resolver models and mapper models using the `ResolverMerger` annotator, we can further enhance the performance and accuracy of the resolver system. + +Parameters: + +- `inputCols`: The name of the columns containing the input annotations. It can read an Array of strings. +- `outputCol`: The name of the column in Document type that is generated. We can specify only one column here. + +All the parameters can be set using the corresponding set method in camel case. For example, `.setInputCols()`.
+ +{%- endcapture -%} + +{%- capture model_input_anno -%} +ENTITY, LABEL_DEPENDENCY +{%- endcapture -%} + +{%- capture model_output_anno -%} +ENTITY +{%- endcapture -%} + +{%- capture model_python_medical -%} + +from johnsnowlabs import nlp, medical + +document_assembler = nlp.DocumentAssembler()\ + .setInputCol('text')\ + .setOutputCol('document') + +sentence_detector = nlp.SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = nlp.Tokenizer()\ + .setInputCols("sentence")\ + .setOutputCol("token") + +word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +ner_model = medical.NerModel.pretrained("ner_posology_greedy", "en", "clinical/models")\ + .setInputCols(["sentence", "token", "embeddings"])\ + .setOutputCol("ner") + +ner_converter = medical.NerConverterInternal()\ + .setInputCols("sentence", "token", "ner")\ + .setOutputCol("chunk") + +chunkerMapper = medical.ChunkMapperModel.pretrained("rxnorm_mapper", "en", "clinical/models")\ + .setInputCols(["chunk"])\ + .setOutputCol("RxNorm_Mapper")\ + .setRel("rxnorm_code") + +cfModel = medical.ChunkMapperFilterer() \ + .setInputCols(["chunk", "RxNorm_Mapper"]) \ + .setOutputCol("chunks_fail") \ + .setReturnCriteria("fail") + +chunk2doc = nlp.Chunk2Doc() \ + .setInputCols("chunks_fail") \ + .setOutputCol("doc_chunk") + +sbert_embedder = nlp.BertSentenceEmbeddings.pretrained('sbiobert_base_cased_mli', 'en','clinical/models')\ + .setInputCols(["doc_chunk"])\ + .setOutputCol("sentence_embeddings")\ + .setCaseSensitive(False) + +resolver = medical.SentenceEntityResolverModel.pretrained("sbiobertresolve_rxnorm_augmented", "en", "clinical/models") \ + .setInputCols(["sentence_embeddings"]) \ + .setOutputCol("resolver_code") \ + .setDistanceFunction("EUCLIDEAN") + +resolverMerger = medical.ResolverMerger()\ + .setInputCols(["resolver_code","RxNorm_Mapper"])\ + 
.setOutputCol("RxNorm") + +mapper_pipeline = nlp.Pipeline( + stages = [ + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner_model, + ner_converter, + chunkerMapper, + cfModel, + chunk2doc, + sbert_embedder, + resolver, + resolverMerger + ]) + +sample_text = [ + ["The patient was given Adapin 10 MG, coumadn 5 mg"], + ["The patient was given Avandia 4 mg, Tegretol, zitiga"], +] + +data = spark.createDataFrame(sample_text).toDF("text") + +result = mapper_pipeline.fit(data).transform(data) + +result.selectExpr( + "chunk.result as chunk", + "RxNorm_Mapper.result as RxNorm_Mapper", + "chunks_fail.result as chunks_fail", + "resolver_code.result as resolver_code", + "RxNorm.result as RxNorm", +).show(truncate=False) + + +## Result + ++--------------------------------+----------------------+--------------+-------------+------------------------+ +|chunk |RxNorm_Mapper |chunks_fail |resolver_code|RxNorm | ++--------------------------------+----------------------+--------------+-------------+------------------------+ +|[Adapin 10 MG, coumadn 5 mg] |[1000049, NONE] |[coumadn 5 mg]|[200883] |[1000049, 200883] | +|[Avandia 4 mg, Tegretol, zitiga]|[261242, 203029, NONE]|[zitiga] |[220989] |[261242, 203029, 220989]| ++--------------------------------+----------------------+--------------+-------------+------------------------+ + +{%- endcapture -%} + + +{%- capture model_scala_medical -%} + +import spark.implicits._ + +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models") + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") + +val ner_model =
MedicalNerModel.pretrained("ner_posology_greedy","en","clinical/models") + .setInputCols(Array("sentence","token","embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence","token","ner")) + .setOutputCol("chunk") + +val chunkerMapper = ChunkMapperModel.pretrained("rxnorm_mapper","en","clinical/models") + .setInputCols("chunk") + .setOutputCol("RxNorm_Mapper") + .setRel("rxnorm_code") + +val cfModel = new ChunkMapperFilterer() + .setInputCols(Array("chunk","RxNorm_Mapper")) + .setOutputCol("chunks_fail") + .setReturnCriteria("fail") + +val chunk2doc = new Chunk2Doc() + .setInputCols("chunks_fail") + .setOutputCol("doc_chunk") + +val sbert_embedder = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models") + .setInputCols("doc_chunk") + .setOutputCol("sentence_embeddings") + .setCaseSensitive(false) + +val resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_rxnorm_augmented","en","clinical/models") + .setInputCols("sentence_embeddings") + .setOutputCol("resolver_code") + .setDistanceFunction("EUCLIDEAN") + +val resolverMerger = new ResolverMerger() + .setInputCols(Array("resolver_code","RxNorm_Mapper")) + .setOutputCol("RxNorm") + +val mapper_pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner_model, + ner_converter, + chunkerMapper, + cfModel, + chunk2doc, + sbert_embedder, + resolver, + resolverMerger)) + + +val data = Seq(("""The patient was given Adapin 10 MG, coumadn 5 mg"""),("""The patient was given Avandia 4 mg, Tegretol, zitiga""")).toDF("text") + +val res = mapper_pipeline.fit(data).transform(data) + +// Show results + ++--------------------------------+----------------------+--------------+-------------+------------------------+ +|chunk |RxNorm_Mapper |chunks_fail |resolver_code|RxNorm |
++--------------------------------+----------------------+--------------+-------------+------------------------+ +|[Adapin 10 MG, coumadn 5 mg] |[1000049, NONE] |[coumadn 5 mg]|[200883] |[1000049, 200883] | +|[Avandia 4 mg, Tegretol, zitiga]|[261242, 203029, NONE]|[zitiga] |[220989] |[261242, 203029, 220989]| ++--------------------------------+----------------------+--------------+-------------+------------------------+ + +{%- endcapture -%} + +{%- capture model_api_link -%} +[ResolverMerger](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/resolution/ResolverMerger.html) +{%- endcapture -%} + +{%- capture model_python_api_link -%} +[ResolverMerger](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/resolution/resolver_merger/index.html#module-sparknlp_jsl.annotator.resolution.resolver_merger) +{%- endcapture -%} + +{%- capture model_notebook_link -%} +[ResolverMergerNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/ResolverMerger.ipynb) +{%- endcapture -%} + + +{% include templates/licensed_approach_model_medical_fin_leg_template.md +title=title +model=model +model_description=model_description +model_input_anno=model_input_anno +model_output_anno=model_output_anno +model_python_medical=model_python_medical +model_scala_medical=model_scala_medical +model_api_link=model_api_link +model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link +%} diff --git a/docs/en/licensed_annotator_entries/Router.md b/docs/en/licensed_annotator_entries/Router.md new file mode 100644 index 0000000000..4056e94f20 --- /dev/null +++ b/docs/en/licensed_annotator_entries/Router.md @@ -0,0 +1,308 @@ +{%- capture title -%} +Router +{%- endcapture -%} + +{%- capture model -%} +model +{%- endcapture -%} + +{%- capture model_description -%} +`Router` provides the ability to split an output of an annotator for a selected metadata field 
and the value for that field. + +When we needed to use multiple sentence entity resolver models in the same pipeline, we typically had to run the `BertSentenceEmbeddings` annotator multiple times based on the number of resolver models. This meant that the heavy process of generating sentence embeddings using BERT was repeated multiple times. + +To address this issue, Spark NLP Healthcare Library has introduced a solution using the `Router` annotator. With this new approach, we can provide all the named entity recognition (NER) chunks to the `BertSentenceEmbeddings` annotator at once. The annotator generates the sentence embeddings for all the chunks together. Then, the output of the sentence embeddings is routed to the specific resolver models that are required for further processing. + +This solution eliminates the need to run `BertSentenceEmbeddings` multiple times, reducing the computational overhead and improving the efficiency of the pipeline. + +Parameters: + +- `inputCols`: The name of the columns containing the input annotations. It can read an Array of strings. +- `outputCol`: The name of the column in the Document type that is generated. We can specify only one column here. +- `inputType`: The type of entity that you want to filter (by default `sentence_embeddings`). Possible values: `document|token|wordpiece|word_embeddings|sentence_embeddings|category|date|sentiment|pos|chunk|named_entity|regex|dependency|labeled_dependency|language|keyword` +- `metadataField`: The key in the metadata dictionary that you want to filter (by default `entity`) +- `filterFieldsElements`: The `filterFieldsElements` are the allowed values for the metadata field that is being used. + +All the parameters can be set using the corresponding set method in camel case. For example, `.setInputCols()`.
+ +{%- endcapture -%} + +{%- capture model_input_anno -%} +ENTITY, LABEL_DEPENDENCY +{%- endcapture -%} + +{%- capture model_output_anno -%} +ENTITY +{%- endcapture -%} + +{%- capture model_python_medical -%} + +from johnsnowlabs import nlp, medical + +documentAssembler = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentenceDetector = nlp.SentenceDetector()\ + .setInputCols("document")\ + .setOutputCol("sentence") + +tokenizer = nlp.Tokenizer()\ + .setInputCols("sentence")\ + .setOutputCol("token") + +word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols("sentence", "token")\ + .setOutputCol("word_embeddings") + +# to get PROBLEM entities +clinical_ner = medical.NerModel().pretrained("ner_clinical", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "word_embeddings"]) \ + .setOutputCol("clinical_ner") + +clinical_ner_chunk = medical.NerConverterInternal()\ + .setInputCols("sentence","token","clinical_ner")\ + .setOutputCol("clinical_ner_chunk")\ + .setWhiteList(["PROBLEM"]) + +# to get DRUG entities +posology_ner = medical.NerModel().pretrained("ner_posology", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "word_embeddings"]) \ + .setOutputCol("posology_ner") + +posology_ner_chunk = medical.NerConverterInternal()\ + .setInputCols("sentence","token","posology_ner")\ + .setOutputCol("posology_ner_chunk")\ + .setWhiteList(["DRUG"]) + +# merge the chunks into a single ner_chunk +chunk_merger = medical.ChunkMergeApproach()\ + .setInputCols("clinical_ner_chunk","posology_ner_chunk")\ + .setOutputCol("final_ner_chunk")\ + .setMergeOverlapping(False) + +# convert chunks to doc to get sentence embeddings of them +chunk2doc = nlp.Chunk2Doc().setInputCols("final_ner_chunk").setOutputCol("doc_final_chunk") + +sbiobert_embeddings = nlp.BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models")\ +
.setInputCols(["doc_final_chunk"])\ + .setOutputCol("sbert_embeddings")\ + .setCaseSensitive(False) + +# filter PROBLEM entity embeddings +router_sentence_icd10 = medical.Router() \ + .setInputCols("sbert_embeddings") \ + .setFilterFieldsElements(["PROBLEM"]) \ + .setOutputCol("problem_embeddings") + +# filter DRUG entity embeddings +router_sentence_rxnorm = medical.Router() \ + .setInputCols("sbert_embeddings") \ + .setFilterFieldsElements(["DRUG"]) \ + .setOutputCol("drug_embeddings") + +# use problem_embeddings only +icd_resolver = medical.SentenceEntityResolverModel.pretrained("sbiobertresolve_icd10cm_slim_billable_hcc","en", "clinical/models") \ + .setInputCols(["problem_embeddings"]) \ + .setOutputCol("icd10cm_code")\ + .setDistanceFunction("EUCLIDEAN") + +# use drug_embeddings only +rxnorm_resolver = medical.SentenceEntityResolverModel.pretrained("sbiobertresolve_rxnorm_augmented","en", "clinical/models") \ + .setInputCols(["drug_embeddings"]) \ + .setOutputCol("rxnorm_code")\ + .setDistanceFunction("EUCLIDEAN") + + +pipeline = nlp.Pipeline(stages=[ + documentAssembler, + sentenceDetector, + tokenizer, + word_embeddings, + clinical_ner, + clinical_ner_chunk, + posology_ner, + posology_ner_chunk, + chunk_merger, + chunk2doc, + sbiobert_embeddings, + router_sentence_icd10, + router_sentence_rxnorm, + icd_resolver, + rxnorm_resolver +]) + +clinical_note = """The patient is a 41-year-old Vietnamese female with a cough that started last week. +She has had right-sided chest pain radiating to her back with fever starting yesterday. +She has a history of pericarditis in May 2006 and developed cough with right-sided chest pain. +MEDICATIONS +1. Coumadin 1 mg daily. Last INR was on Tuesday, August 14, 2007, and her INR was 2.3. +2. Amiodarone 100 mg p.o. daily. 
+""" + +data = spark.createDataFrame([[clinical_note]]).toDF("text") + +result = pipeline.fit(data).transform(data) + +## Result + +result.selectExpr( + "final_ner_chunk.result as chunk", + "posology_ner_chunk.result as posology_chunk", + "rxnorm_code.result as rxnorm_code", + "clinical_ner_chunk.result as clinical_chunk", + "icd10cm_code.result as icd10cm_code", +).show(truncate=False) + ++-----------------------------------------------------------------------------------------------------------+----------------------+-------------+-------------------------------------------------------------------------------------+--------------------------------------+ +|chunk |posology_chunk |rxnorm_code |clinical_chunk |icd10cm_code | ++-----------------------------------------------------------------------------------------------------------+----------------------+-------------+-------------------------------------------------------------------------------------+--------------------------------------+ +|[a cough, right-sided chest pain, fever, pericarditis, cough, right-sided chest pain, Coumadin, Amiodarone]|[Coumadin, Amiodarone]|[202421, 703]|[a cough, right-sided chest pain, fever, pericarditis, cough, right-sided chest pain]|[R05, R10.11, A68, I30.1, R05, R10.11]| ++-----------------------------------------------------------------------------------------------------------+----------------------+-------------+-------------------------------------------------------------------------------------+--------------------------------------+ + + +{%- endcapture -%} + + +{%- capture model_scala_medical -%} + +import spark.implicits._ + +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetector = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = 
WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence","token")) + .setOutputCol("word_embeddings") + +// to get PROBLEM entities +val clinical_ner = MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models") + .setInputCols(Array("sentence","token","word_embeddings")) + .setOutputCol("clinical_ner") + +val clinical_ner_chunk = new NerConverterInternal() + .setInputCols("sentence","token","clinical_ner") + .setOutputCol("clinical_ner_chunk") + .setWhiteList("PROBLEM") + +// to get DRUG entities +val posology_ner = MedicalNerModel.pretrained("ner_posology", "en", "clinical/models") + .setInputCols(Array("sentence","token","word_embeddings")) + .setOutputCol("posology_ner") + +val posology_ner_chunk = new NerConverterInternal() + .setInputCols("sentence","token","posology_ner") + .setOutputCol("posology_ner_chunk") + .setWhiteList("DRUG") + +// merge the chunks into a single ner_chunk +val chunk_merger = new ChunkMergeApproach() + .setInputCols(Array("clinical_ner_chunk","posology_ner_chunk")) + .setOutputCol("final_ner_chunk") + .setMergeOverlapping(false) + +// convert chunks to doc to get sentence embeddings of them +val chunk2doc = new Chunk2Doc() + .setInputCols("final_ner_chunk") + .setOutputCol("doc_final_chunk") + +val sbiobert_embeddings = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models") + .setInputCols("doc_final_chunk") + .setOutputCol("sbert_embeddings") + .setCaseSensitive(false) + +// filter PROBLEM entity embeddings +val router_sentence_icd10 = new Router() + .setInputCols("sbert_embeddings") + .setFilterFieldsElements("PROBLEM") + .setOutputCol("problem_embeddings") + +// filter DRUG entity embeddings +val router_sentence_rxnorm = new Router() + .setInputCols("sbert_embeddings") + .setFilterFieldsElements("DRUG") + .setOutputCol("drug_embeddings") + +// use problem_embeddings only +val icd_resolver = 
SentenceEntityResolverModel.pretrained("sbiobertresolve_icd10cm_slim_billable_hcc", "en", "clinical/models") + .setInputCols("problem_embeddings") + .setOutputCol("icd10cm_code") + .setDistanceFunction("EUCLIDEAN") + +// use drug_embeddings only +val rxnorm_resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_rxnorm_augmented", "en", "clinical/models") + .setInputCols("drug_embeddings") + .setOutputCol("rxnorm_code") + .setDistanceFunction("EUCLIDEAN") + +val pipeline = new Pipeline().setStages(Array( + documentAssembler, + sentenceDetector, + tokenizer, + word_embeddings, + clinical_ner, + clinical_ner_chunk, + posology_ner, + posology_ner_chunk, + chunk_merger, + chunk2doc, + sbiobert_embeddings, + router_sentence_icd10, + router_sentence_rxnorm, + icd_resolver, + rxnorm_resolver)) + + +val data = Seq("""The patient is a 41-year-old Vietnamese female with a cough that started last week. +She has had right-sided chest pain radiating to her back with fever starting yesterday. +She has a history of pericarditis in May 2006 and developed cough with right-sided chest pain. +MEDICATIONS +1. Coumadin 1 mg daily. Last INR was on Tuesday, August 14, 2007, and her INR was 2.3. +2. Amiodarone 100 mg p.o. 
daily.""").toDF("text") + +val res = pipeline.fit(data).transform(data) + +// Show results + ++-----------------------------------------------------------------------------------------------------------+----------------------+-------------+-------------------------------------------------------------------------------------+--------------------------------------+ +|chunk |posology_chunk |rxnorm_code |clinical_chunk |icd10cm_code | ++-----------------------------------------------------------------------------------------------------------+----------------------+-------------+-------------------------------------------------------------------------------------+--------------------------------------+ +|[a cough, right-sided chest pain, fever, pericarditis, cough, right-sided chest pain, Coumadin, Amiodarone]|[Coumadin, Amiodarone]|[202421, 703]|[a cough, right-sided chest pain, fever, pericarditis, cough, right-sided chest pain]|[R05, R10.11, A68, I30.1, R05, R10.11]| ++-----------------------------------------------------------------------------------------------------------+----------------------+-------------+-------------------------------------------------------------------------------------+--------------------------------------+ + +{%- endcapture -%} + +{%- capture model_api_link -%} +[Router](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/annotator/Router.html) +{%- endcapture -%} + +{%- capture model_python_api_link -%} +[Router](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/router/index.html#module-sparknlp_jsl.annotator.router) +{%- endcapture -%} + +{%- capture model_notebook_link -%} +[RouterNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/Router.ipynb) +{%- endcapture -%} + + +{% include templates/licensed_approach_model_medical_fin_leg_template.md +title=title +model=model +model_description=model_description 
+model_input_anno=model_input_anno +model_output_anno=model_output_anno +model_python_medical=model_python_medical +model_scala_medical=model_scala_medical +model_api_link=model_api_link +model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link +%}