From 05b53fbbc51898479ba14d3c2b79b5b35074c36a Mon Sep 17 00:00:00 2001 From: Yigit <110747690+yigitgull@users.noreply.github.com> Date: Mon, 7 Oct 2024 01:31:37 +0300 Subject: [PATCH] 550 documentation updated. (#1531) --- .../ChunkConverter.md | 3 +- .../licensed_annotator_entries/ChunkMerge.md | 1 + .../ContextualEntityFilterer.md | 207 ++++++++++++++++ .../licensed_annotator_entries/Flattener.md | 5 +- .../licensed_annotator_entries/MedicalLLM.md | 227 ++++++++++++++++++ .../NerConverterInternal.md | 4 +- .../REChunkMerger.md | 206 ++++++++++++++++ .../en/licensed_annotator_entries/Replacer.md | 7 +- docs/en/licensed_annotators.md | 3 + 9 files changed, 657 insertions(+), 6 deletions(-) create mode 100644 docs/en/licensed_annotator_entries/ContextualEntityFilterer.md create mode 100644 docs/en/licensed_annotator_entries/MedicalLLM.md create mode 100644 docs/en/licensed_annotator_entries/REChunkMerger.md diff --git a/docs/en/licensed_annotator_entries/ChunkConverter.md b/docs/en/licensed_annotator_entries/ChunkConverter.md index 436eebe4c8..2c1e14a5e1 100644 --- a/docs/en/licensed_annotator_entries/ChunkConverter.md +++ b/docs/en/licensed_annotator_entries/ChunkConverter.md @@ -14,8 +14,9 @@ This annotator is important when the user wants to merge entities identified by Parameters: - `inputCols`: The name of the columns containing the input annotations. It can read either a String column or an Array. - - `outputCol`: The name of the column in Document type that is generated. We can specify only one column here. +- `resetSentenceIndices`: Whether to reset sentence indices to treat the entire output as if it originates from a single document. Default: False. + All the parameters can be set using the corresponding set method in camel case. For example, `.setInputcols()`. 
diff --git a/docs/en/licensed_annotator_entries/ChunkMerge.md b/docs/en/licensed_annotator_entries/ChunkMerge.md index da0def9316..247f8fd57b 100644 --- a/docs/en/licensed_annotator_entries/ChunkMerge.md +++ b/docs/en/licensed_annotator_entries/ChunkMerge.md @@ -29,6 +29,7 @@ Parameters: - `defaultConfidence`: (Float) Sets when ChunkConfidence ordering feature is included and a given annotation does not have any confidence. The value of this param will be used as a confidence score for annotations without a confidence score. - `chunkPrecedence`: (String List) Sets what is the precedence order when a chunk labeled by two models. - `chunkPrecedenceValuePrioritization`: (String List) Sets when ChunkPrecedence ordering feature is used. This param contains an Array of comma-separated values representing the desired order of prioritization for the values in the metadata fields included from chunkPrecedence. +- `resetSentenceIndices`: Whether to reset sentence indices to treat the entire output as if it originates from a single document. Default: False. All the parameters can be set using the corresponding set method in camel case. For example, `.setInputcols()`. {%- endcapture -%} diff --git a/docs/en/licensed_annotator_entries/ContextualEntityFilterer.md b/docs/en/licensed_annotator_entries/ContextualEntityFilterer.md new file mode 100644 index 0000000000..8ab91854a3 --- /dev/null +++ b/docs/en/licensed_annotator_entries/ContextualEntityFilterer.md @@ -0,0 +1,207 @@ +{%- capture title -%} +ContextualEntityFilterer +{%- endcapture -%} + +{%- capture model -%} +model +{%- endcapture -%} + +{%- capture model_description -%} +ContextualEntityFilterer can filter chunks coming from CHUNK annotations based on entity(identifier,field) info in metadata and contextual cues. +Filters can be done via white list entities, black list entities, black list words and white list words. +The filter can be applied to the scope of the sentence or the document. 
+ +Parameters: + +- `ruleScope`: The rule scope to apply the filter. Options: sentence, document. +- `caseSensitive`: Whether to use case-sensitive matching for words. Default is `False`. +- `rules`: The filtering rules. Each rule is a dictionary with the following keys: + - `entity`: The target entity field for filtering. + - `scopeWindow`: A list of two integers [before, after], specifying how many tokens/chunks before and after the target to consider. + - `whiteListEntities`: The white list of entities. If one of the entities from this list appears within the scope window, the chunk will be kept. Only one element is enough to keep the chunk. + - `blackListEntities`: The black list of entities. If an entity from this list appears within the scope window, the chunk will be filtered out. All elements must be absent to keep the chunk. + - `scopeWindowLevel`: Determines whether the `scopeWindow` is applied at the token or chunk level. Options: `token`, `chunk`. + - `blackListWords`: The black list of words. If a word from this list appears within the scope window, the chunk will be filtered out. + - `whiteListWords`: The white list of words. If a word from this list appears within the scope window, the chunk will be kept. + - `confidenceThreshold`: The confidence threshold to filter the chunks. Filtering is only applied if the confidence of the chunk is below the threshold. 
+ +{%- endcapture -%} + +{%- capture model_input_anno -%} + + +{%- endcapture -%} + +{%- capture model_input_anno -%} +DOCUMENT, TOKEN, CHUNK +{%- endcapture -%} + +{%- capture model_output_anno -%} +CHUNK +{%- endcapture -%} + +{%- capture model_python_medical -%} +from johnsnowlabs import nlp, medical + +documentAssembler = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer()\ + .setInputCols(["document"])\ + .setOutputCol("token") + +word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \ + .setInputCols(["document", "token"])\ + .setOutputCol("embeddings") + +ner_deid = medical.NerModel.pretrained("ner_deid_subentity_docwise", "en", "clinical/models") \ + .setInputCols(["document", "token", "embeddings"]) \ + .setOutputCol("ner_deid_subentity_docwise") + +ner_deid_converter = medical.NerConverterInternal()\ + .setInputCols(["document", "token", "ner_deid_subentity_docwise"])\ + .setOutputCol("ner_chunk_subentity_docwise") + +rules =[{ "entity": "STATE", + "scopeWindow": [2, 2], + "whiteListEntities": ["CITY"], + "blackListEntities": ["NAME"], + "scopeWindowLevel": "token" + }] + +contextual_entity_filterer = medical.ContextualEntityFilterer() \ + .setInputCols("document", "token", "ner_chunk_subentity_docwise") \ + .setOutputCol("filtered_ner_chunks") \ + .setRules(rules)\ + .setRuleScope("sentence") + +nlpPipeline = nlp.Pipeline( + stages=[ + documentAssembler, + tokenizer, + word_embeddings, + ner_deid, + ner_deid_converter, + contextual_entity_filterer +]) + +text = "NY, a 34-year-old woman, Dr. Michael Johnson cares wit her, at CarePlus Clinic, located at 456 Elm Street, NewYork, NY has recommended starting insulin therapy." 
+df = spark.createDataFrame([[text]]).toDF("text") +result = nlpPipeline.fit(df).transform(df) + + +# result + ++---------------+-----+---+---------+----------+ +|chunk |begin|end|ner_label|confidence| ++---------------+-----+---+---------+----------+ +|NY |0 |1 |STATE |0.9299 | +|34-year-old |6 |16 |AGE |0.7687 | +|Michael Johnson|29 |43 |DOCTOR |0.89965 | +|CarePlus Clinic|63 |77 |HOSPITAL |0.9661 | +|456 Elm Street |91 |104|STREET |0.7733667 | +|NewYork |107 |113|CITY |0.9302 | +|NY |116 |117|STATE |0.9991 | ++---------------+-----+---+---------+----------+ +{%- endcapture -%} + + +{%- capture model_scala_medical -%} + +import spark.implicits._ + +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols("document", "token") + .setOutputCol("embeddings") + +val ner_deid = MedicalNerModel.pretrained("ner_deid_subentity_docwise", "en", "clinical/models") + .setInputCols("document", "token", "embeddings") + .setOutputCol("ner_deid_subentity_docwise") + +val ner_deid_converter = new NerConverterInternal() + .setInputCols("document", "token", "ner_deid_subentity_docwise") + .setOutputCol("ner_chunk_subentity_docwise") + +val rules = + """ + |[{ + |"entity": "STATE", + | "scopeWindow": [2, 2], + | "whiteListEntities": ["CITY"], + | "blackListEntities": ["NAME"], + | "scopeWindowLevel": "token" + | + | } + | ] + | + |""".stripMargin + +val contextual_entity_filterer = new ContextualEntityFilterer() + .setInputCols("document", "token", "ner_chunk_subentity_docwise") + .setOutputCol("filtered_ner_chunks") + .setRulesAsStr(rules) + .setRuleScope("sentence") + +val nlpPipeline = new Pipeline().setStages( + Array( + documentAssembler, + tokenizer, + word_embeddings, + ner_deid, + ner_deid_converter, + contextual_entity_filterer + 
)) + +val text = "NY, a 34-year-old woman, Dr. Michael Johnson cares wit her, at CarePlus Clinic, located at 456 Elm Street, NewYork, NY has recommended starting insulin therapy." +val df = Seq(text).toDF("text") +val result = nlpPipeline.fit(df).transform(df) + + +# result ++---------------+-----+---+---------+----------+ +|chunk |begin|end|ner_label|confidence| ++---------------+-----+---+---------+----------+ +|NY |0 |1 |STATE |0.9299 | +|34-year-old |6 |16 |AGE |0.7687 | +|Michael Johnson|29 |43 |DOCTOR |0.89965 | +|CarePlus Clinic|63 |77 |HOSPITAL |0.9661 | +|456 Elm Street |91 |104|STREET |0.7733667 | +|NewYork |107 |113|CITY |0.9302 | +|NY |116 |117|STATE |0.9991 | ++---------------+-----+---+---------+----------+ + +{%- endcapture -%} + +{%- capture model_api_link -%} +[ContextualEntityFilterer](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/context/ContextualEntityFilterer.html) +{%- endcapture -%} + +{%- capture model_python_api_link -%} +[ContextualEntityFilterer](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/context/ContextualEntityFilterer/index.html) +{%- endcapture -%} + +{%- capture model_notebook_link -%} +[ContextualEntityFilterer](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/Spark_NLP_Udemy_MOOC/Healthcare_NLP/ContextualEntityFilterer.ipynb) +{%- endcapture -%} + +{% include templates/licensed_approach_model_medical_fin_leg_template.md +title=title +model=model +model_description=model_description +model_input_anno=model_input_anno +model_output_anno=model_output_anno +model_python_medical=model_python_medical +model_scala_medical=model_scala_medical +model_api_link=model_api_link +model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link +%} diff --git a/docs/en/licensed_annotator_entries/Flattener.md b/docs/en/licensed_annotator_entries/Flattener.md index 87190ae582..1417442a59 100644 --- 
a/docs/en/licensed_annotator_entries/Flattener.md +++ b/docs/en/licensed_annotator_entries/Flattener.md @@ -10,14 +10,15 @@ model The `Flattener` converts annotation results into a format that easier to use. This annotator produces a DataFrame with flattened and exploded columns containing annotation results, making it easier to interpret and analyze the information. It is particularly useful for extracting and organizing the results obtained from Spark NLP Pipelines. -Parametres: +Parameters: - `inputCols`: Input annotations. - `cleanAnnotations`: Whether to remove annotation columns, by default `True`. - `explodeSelectedFields`: Dict of input columns to their corresponding selected fields. - `flattenExplodedColumns`: Whether to flatten exploded columns(default : `True`). - `orderByColumn`: Specify the column by which the DataFrame should be ordered.. -- `orderDescending`: specifying whether to order the DataFrame in descending order.(default : `True`). +- `orderDescending`: Specifying whether to order the DataFrame in descending order.(default : `True`). +- `keepOriginalColumns` : Array of column names that should be kept in the DataFrame after the flattening process. See [Spark NLP Workshop](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/41.Flattener.ipynb) for more examples of usage. diff --git a/docs/en/licensed_annotator_entries/MedicalLLM.md b/docs/en/licensed_annotator_entries/MedicalLLM.md new file mode 100644 index 0000000000..9b03720005 --- /dev/null +++ b/docs/en/licensed_annotator_entries/MedicalLLM.md @@ -0,0 +1,227 @@ +{%- capture title -%} +MedicalLLM +{%- endcapture -%} + + +{%- capture model_description -%} + +MedicalLLM was designed to load and run large language models (LLMs) in GGUF format with scalable performance. 
+Ideal for clinical and healthcare applications, MedicalLLM supports tasks like medical entity extraction, summarization, +Q&A, Retrieval Augmented Generation (RAG), and conversational AI. With simple integration into Spark NLP pipelines, +it allows for customizable batch sizes, prediction settings, and chat templates. GPU optimization is also available, +enhancing its capabilities for high-performance environments. MedicalLLM empowers users to link medical entities and +perform complex NLP tasks with efficiency and precision. + +To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set the number of GPU layers +with the setNGpuLayers method. When using larger models, we recommend adjusting GPU usage with setNCtx and setNGpuLayers +according to your hardware to avoid out-of-memory errors. + +Parameters: + +- `inputPrefix` : Prefix for infilling (default: empty) +- `inputSuffix` : Suffix for infilling (default: empty) +- `cachePrompt` : Whether to remember the prompt to avoid reprocessing it +- `nPredict` : Number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled) +- `topK` : Top-k sampling (default: 40, 0 = disabled) +- `topP` : Top-p sampling (default: 0.9, 1.0 = disabled) +- `minP` : Min-p sampling (default: 0.1, 0.0 = disabled) +- `tfsZ` : Tail free sampling, parameter z (default: 1.0, 1.0 = disabled) +- `typicalP` : Locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) +- `temperature` : The temperature (default: 0.8) +- `dynatempRange` : Dynamic temperature range (default: 0.0, 0.0 = disabled) +- `dynatempExponent` : Dynamic temperature exponent (default: 1.0) +- `repeatLastN` : Last n tokens to consider for penalties (default: 64, 0 = disabled, -1 = ctx_size) +- `repeatPenalty` : Penalty of repeated sequences of tokens (default: 1.0, 1.0 = disabled) +- `frequencyPenalty` : Repetition alpha frequency penalty (default: 0.0, 0.0 = disabled) +- `presencePenalty` : Repetition alpha 
presence penalty (default: 0.0, 0.0 = disabled) +- `mirostatTau` : MiroStat target entropy, parameter tau (default: 5.0) +- `mirostatEta` : MiroStat learning rate, parameter eta (default: 0.1) +- `penalizeNl` : Whether to penalize newline tokens +- `nKeep` : Number of tokens to keep from the initial prompt (default: 0, -1 = all) +- `seed` : RNG seed (default: -1, use random seed for < 0) +- `nProbs` : Amount top tokens probabilities to output if greater than 0. +- `minKeep` : Amount of tokens the samplers should return at least (0 = disabled) +- `grammar` : BNF-like grammar to constrain generations (see samples in grammars/ dir) +- `penaltyPrompt` : Override which part of the prompt is penalized for repetition. E.g. if original prompt is "Alice: Hello!" and penaltyPrompt is "Hello!", only the latter will be penalized if repeated. See pull request 3727 for more details. +- `penaltyPromptTokens` : PenaltyPromptTokens +- `ignoreEos` : Whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf) +- `stopStrings` : Strings upon seeing which token generation is stopped +- `useChatTemplate` : Whether or not generate should apply a chat template (default: false) + +{%- endcapture -%} + + +{%- capture model_input_anno -%} +DOCUMENT +{%- endcapture -%} + +{%- capture model_output_anno -%} +DOCUMENT +{%- endcapture -%} + +{%- capture model_python_medical -%} + + +from johnsnowlabs import nlp, medical + +document_assembler = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +medical_llm = medical.AutoGGUFModel.pretrained("jsl_meds_ner_q4_v2", "en", "clinical/models")\ + .setInputCols("document")\ + .setOutputCol("completions")\ + .setBatchSize(1)\ + .setNPredict(100)\ + .setUseChatTemplate(True)\ + .setTemperature(0)\ + #.setNGpuLayers(100) # if you have GPU + + +pipeline = nlp.Pipeline( + stages = [ + document_assembler, + medical_llm + ]) + +med_ner_prompt = """ + ### Template: + { + "drugs": [ + { + "name": "", + 
"reactions": [] + } + ] + } + ### Text: + I feel a bit drowsy & have a little blurred vision , and some gastric problems . + I 've been on Arthrotec 50 for over 10 years on and off , only taking it when I needed it . + Due to my arthritis getting progressively worse , to the point where I am in tears with the agony. + Gp 's started me on 75 twice a day and I have to take it every day for the next month to see how I get on , here goes . + So far its been very good , pains almost gone , but I feel a bit weird , did n't have that when on 50. + """ + +data = spark.createDataFrame([[med_ner_prompt]]).toDF("text") +data.show(truncate=100) + + + +## Result + + { + "drugs": [ + { + "name": "Arthrotec", + "reactions": [ + "drowsy", + "blurred vision", + "gastric problems" + ] + } + ] + } + #### Template: + {"drugs": [{"name": "", "reaction": []}]} + #### Text: + The patient is a 65-year + +{%- endcapture -%} + +{%- capture model_scala_medical -%} +import com.johnsnowlabs.nlp.base.DocumentAssembler +import com.johnsnowlabs.nlp.annotators.seq2seq.MedicalLLM +import org.apache.spark.ml.Pipeline + +import spark.implicits._ + +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val medicalLLM = MedicalLLM.pretrained("jsl_meds_ner_q4_v2", "en", "clinical/models") + .setInputCols("document") + .setOutputCol("completions") + .setBatchSize(1) + .setNPredict(100) + .setUseChatTemplate(true) + .setTemperature(0) +// .setNGpuLayers(100) if you have GPU + +val pipeline = new Pipeline().setStages( + Array( + documentAssembler, + medicalLLM +)) + +val medPrompt = + """ + |### Template: + |{ + |"drugs": [ + |{ + |"name": "", + |"reactions": [] + |} + |] + |} + |### Text: + |I feel a bit drowsy & have a little blurred vision , and some gastric problems . + |I 've been on Arthrotec 50 for over 10 years on and off , only taking it when I needed it . 
+ |Due to my arthritis getting progressively worse , to the point where I am in tears with the agony. + |Gp 's started me on 75 twice a day and I have to take it every day for the next month to see how I get on , here goes . + |So far its been very good , pains almost gone , but I feel a bit weird , did n't have that when on 50. + |""".stripMargin + +val data = Seq(medPrompt).toDF("text") +pipeline.fit(data).transform(data).select("completions.result").show(false) + + +## Result + + { + "drugs": [ + { + "name": "Arthrotec", + "reactions": [ + "drowsy", + "blurred vision", + "gastric problems" + ] + } + ] + } + #### Template: + {"drugs": [{"name": "", "reaction": []}]} + #### Text: + The patient is a 65-year + + +{%- endcapture -%} + + +{%- capture model_api_link -%} +[MedicalLLM](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/seq2seq/MedicalLLM.html) +{%- endcapture -%} + +{%- capture model_python_api_link -%} +[MedicalLLM](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/medical_llm/MedicalLLM.html) +{%- endcapture -%} + +{%- capture model_notebook_link -%} +[MedicalLLMNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/46.Loading_Medical_and_Open-Souce_LLMs.ipynb) +{%- endcapture -%} + + + +{% include templates/licensed_approach_model_medical_fin_leg_template.md +title=title +model=model +model_description=model_description +model_input_anno=model_input_anno +model_output_anno=model_output_anno +model_python_medical=model_python_medical +model_scala_medical=model_scala_medical +model_api_link=model_api_link +model_python_api_link=model_python_api_link +%} diff --git a/docs/en/licensed_annotator_entries/NerConverterInternal.md b/docs/en/licensed_annotator_entries/NerConverterInternal.md index 86ce2f4f0a..f457d565d3 100644 --- a/docs/en/licensed_annotator_entries/NerConverterInternal.md +++ b/docs/en/licensed_annotator_entries/NerConverterInternal.md @@ -11,7 
+11,7 @@ Converts a IOB or IOB2 representation of NER to a user-friendly one, by associating the tokens of recognized entities and their label. Chunks with no associated entity (tagged "O") are filtered out. -Parametres; +Parameters: - `setThreshold`: Confidence threshold. @@ -29,6 +29,8 @@ Parametres; - `setGreedyMode`: (Boolean) Whether to ignore B tags for contiguous tokens of same entity same . +- `resetSentenceIndices`: Whether to reset sentence indices to treat the entire output as if it originates from a single document. Default: False. + This licensed annotator adds extra functionality to the open-source version by adding the following parameters: `blackList`, `greedyMode`, `threshold`, and `ignoreStopWords` that are not available in the [NerConverter](https://nlp.johnsnowlabs.com/docs/en/annotators#nerconverter) annotator. See also [Inside–outside–beginning (tagging)](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) for more information. diff --git a/docs/en/licensed_annotator_entries/REChunkMerger.md b/docs/en/licensed_annotator_entries/REChunkMerger.md new file mode 100644 index 0000000000..a969c00244 --- /dev/null +++ b/docs/en/licensed_annotator_entries/REChunkMerger.md @@ -0,0 +1,206 @@ +{%- capture title -%} + REChunkMerger +{%- endcapture -%} + +{%- capture model -%} +model +{%- endcapture -%} + +{%- capture model_description -%} +`REChunkMerger` annotator merges relation chunks to create a new chunk. + +Parameters: + +- `separator`: Separator to add between the chunks. Default: " ". 
+ + + +{%- endcapture -%} + +{%- capture model_input_anno -%} +CATEGORY +{%- endcapture -%} + +{%- capture model_output_anno -%} +CHUNK +{%- endcapture -%} + +{%- capture model_python_medical -%} +from johnsnowlabs import nlp, medical + +documenter = nlp.DocumentAssembler() \ + .setInputCol("sentence") \ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer() \ + .setInputCols(["document"]) \ + .setOutputCol("tokens") \ + +words_embedder = nlp.WordEmbeddingsModel() \ + .pretrained("embeddings_clinical", "en", "clinical/models") \ + .setInputCols(["document", "tokens"]) \ + .setOutputCol("embeddings") + +pos_tagger = nlp.PerceptronModel() \ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["document", "tokens"]) \ + .setOutputCol("pos_tags") + +ner_tagger = medical.NerModel() \ + .pretrained("ner_clinical", "en", "clinical/models") \ + .setInputCols(["document", "tokens", "embeddings"]) \ + .setOutputCol("ner_tags") + +ner_converter = medical.NerConverter() \ + .setInputCols(["document", "tokens", "ner_tags"]) \ + .setOutputCol("ner_chunks") + +depency_parser = nlp.DependencyParserModel() \ + .pretrained("dependency_conllu", "en") \ + .setInputCols(["document", "pos_tags", "tokens"]) \ + .setOutputCol("dependencies") + +re_model = medical.RelationExtractionModel \ + .pretrained("re_clinical", "en", "clinical/models") \ + .setCustomLabels({"TeRP": "CustomLabel_TeRP", "TrWP": "CustomLabel_TeWP"}) \ + .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"]) \ + .setOutputCol("re_chunk") + +re_chunk_merger = medical.REChunkMerger() \ + .setInputCols(["re_chunk"]) \ + .setOutputCol("relation_chunks") \ + .setSeparator(" && ") + +nlpPipeline = nlp.Pipeline( + stages=[ + documenter, + tokenizer, + words_embedder, + pos_tagger, + ner_tagger, + ner_converter, + depency_parser, + re_model, + re_chunk_merger + ]) + +empty_data = spark.createDataFrame([[""]]).toDF("sentence") + +model = nlpPipeline.fit(empty_data) + +text =''' 
28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to +presentation and subsequent type two diabetes mellitus ( T2DM ). ''' + +result = model.transform(spark.createDataFrame([[text]]).toDF("sentence")) + +# result ++----------------------------------------------------------------------+ +|result | ++----------------------------------------------------------------------+ +|gestational diabetes mellitus && subsequent type two diabetes mellitus| +|gestational diabetes mellitus && T2DM | +|subsequent type two diabetes mellitus && T2DM | ++----------------------------------------------------------------------+ +{%- endcapture -%} + + +{%- capture model_scala_medical -%} + +import spark.implicits._ + +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("tokens") + +val words_embedder = WordEmbeddingsModel + .pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols("document", "tokens") + .setOutputCol("embeddings") + +val pos_tagger = PerceptronModel + .pretrained("pos_clinical", "en", "clinical/models") + .setInputCols("document", "tokens") + .setOutputCol("pos_tags") + +val ner_tagger = MedicalNerModel + .pretrained("ner_clinical", "en", "clinical/models") + .setInputCols("document", "tokens", "embeddings") + .setOutputCol("ner_tags") + +val ner_converter = new NerConverter() + .setInputCols("document", "tokens", "ner_tags") + .setOutputCol("ner_chunks") + +val depency_parser = DependencyParserModel + .pretrained("dependency_conllu", "en") + .setInputCols("document", "pos_tags", "tokens") + .setOutputCol("dependencies") + +val re_model = RelationExtractionModel + .pretrained("re_clinical", "en", "clinical/models") + .setInputCols("embeddings", "pos_tags", "ner_chunks", "dependencies") + .setOutputCol("re_chunk") + +val re_chunk_merger = new REChunkMerger() + .setInputCols("re_chunk") + 
.setOutputCol("relation_chunks") + .setSeparator(" && ") + +val pipeline = new Pipeline() + .setStages(Array( + documenter, + tokenizer, + words_embedder, + pos_tagger, + ner_tagger, + ner_converter, + depency_parser, + re_model, + re_chunk_merger + )) +val text = "28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to " + + "presentation and subsequent type two diabetes mellitus ( T2DM ). " + +val empty_data = Seq("").toDF("text") + +val model = pipeline.fit(empty_data).transform(Seq(text).toDF("text")) + +# result ++----------------------------------------------------------------------+ +|result | ++----------------------------------------------------------------------+ +|gestational diabetes mellitus && subsequent type two diabetes mellitus| +|gestational diabetes mellitus && T2DM | +|subsequent type two diabetes mellitus && T2DM | ++----------------------------------------------------------------------+ + +{%- endcapture -%} + +{%- capture model_api_link -%} +[REChunkMerger](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/merge/REChunkMerger.html) +{%- endcapture -%} + +{%- capture model_python_api_link -%} +[REChunkMerger](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/merge/REChunkMerger.html) +{%- endcapture -%} + +{%- capture model_notebook_link -%} +[REChunkMergerNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/Spark_NLP_Udemy_MOOC/Healthcare_NLP/REChunkMerger.ipynb) +{%- endcapture -%} + +{% include templates/licensed_approach_model_medical_fin_leg_template.md +title=title +model=model +model_description=model_description +model_input_anno=model_input_anno +model_output_anno=model_output_anno +model_python_medical=model_python_medical +model_scala_medical=model_scala_medical +model_api_link=model_api_link +model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link +%} diff --git 
a/docs/en/licensed_annotator_entries/Replacer.md b/docs/en/licensed_annotator_entries/Replacer.md index e1fc4121f8..1dbe01ea2f 100644 --- a/docs/en/licensed_annotator_entries/Replacer.md +++ b/docs/en/licensed_annotator_entries/Replacer.md @@ -27,7 +27,10 @@ Parameter: * "entity": Replaces 'NONE' values with the entity field extracted from the annotation, if available. If the entity field is not available, it uses the string "NONE" wrapped by the specified delimiters. * "place_holder": Replaces 'NONE' values with a placeholder string wrapped by the specified delimiters. * "skip": Retains the original target_text from the annotation's metadata if available. If not available, it retains the original annotation result. -- `setNoneValuesTo`: (String) Sets an array of two strings used as delimiters to wrap the placeholder or entity field when noneValuesTo is set to "place_holder" or "entity". The first element of the array is the prefix delimiter, and the second element is the suffix delimiter. +- `mappingsColumn`: (String) Column name for mapping. This column maps the annotations to their corresponding chunks before the entities are replaced. +- `returnEntityMappings`: (Boolean) With this property you select if you want to return mapping column. +- `staticEntityMappingsFallback`: (String) Fallback option for static entity mappings. Allowed values: 'entity', 'place_holder', 'skip', 'error'. +- `staticEntityMappings`: (dict) Static entity mappings. A dictionary with entity types as keys and replacement values as values. {%- endcapture -%} @@ -246,7 +249,7 @@ val test_data = Seq("""John Davies is a 62 y.o. patient admitted. Mr. Davies was val res = mapperPipeline.fit(test_data).transform(test_data) -// Show results +# Result Original text. : John Davies is a 62 y.o. patient admitted. Mr. Davies was seen by attending physician Dr. Lorand and was scheduled for emergency assessment. 
diff --git a/docs/en/licensed_annotators.md b/docs/en/licensed_annotators.md index 0006510dfd..93dff0ac51 100644 --- a/docs/en/licensed_annotators.md +++ b/docs/en/licensed_annotators.md @@ -55,6 +55,7 @@ Check out the [Spark NLP Annotators page](https://nlp.johnsnowlabs.com/docs/en/a {% include templates/licensed_table_entry.md name="ChunkMerge" summary="Merges entities coming from different CHUNK annotations."%} {% include templates/licensed_table_entry.md name="ChunkSentenceSplitter" summary="Annotator can split the documents into chunks according to separators given as `CHUNK` columns. It is useful when you need to perform different models or analysis in different sections of your document"%} {% include templates/licensed_table_entry.md name="ContextualAssertion" summary="This model identifies contextual cues within text data, such as negation, uncertainty etc.It annotates text chunks with assertions based on configurable rules, prefix and suffix patterns, and exception patterns."%} +{% include templates/licensed_table_entry.md name="ContextualEntityFilterer" summary="ContextualEntityFilterer can filter chunks coming from CHUNK annotations based on entity(identifier,field) info in metadata and contextual cues."%} {% include templates/licensed_table_entry.md name="ContextualParser" summary="Extracts entity from a document based on user defined rules."%} {% include templates/licensed_table_entry.md name="ContextSplitAssembler" summary="Converts and assembles `VECTOR_SIMILARITY_RANKINGS` type annotations into `DOCUMENT` type."%} {% include templates/licensed_table_entry.md name="DateNormalizer" summary="This annotator transforms date mentions to a common standard format: YYYY/MM/DD. 
It is useful when using data from different sources, some times from different countries that has different formats to represent dates."%} @@ -84,6 +85,7 @@ Check out the [Spark NLP Annotators page](https://nlp.johnsnowlabs.com/docs/en/a {% include templates/licensed_table_entry.md name="LightDeIdentification" summary="Light version of DeIdentification."%} {% include templates/licensed_table_entry.md name="LLMLoader" summary="LLMLoader is designed to interact with a LLMs that are converted into gguf format. This module allows using John Snow Labs' licensed LLMs at various sizes that are finetuned on medical context for certain tasks."%} {% include templates/licensed_table_entry.md name="Mapper2Chunk" summary="This annotator converts 'LABELED_DEPENDENCY' type annotations coming from ChunkMapper into 'CHUNK' type to create new chunk-type column"%} +{% include templates/licensed_table_entry.md name="MedicalLLM" summary="MedicalLLM was designed to load and run large language models (LLMs) in GGUF format with scalable performance."%} {% include templates/licensed_table_entry.md name="MultiChunk2Doc" summary="Merges a given chunks to create a document."%} {% include templates/licensed_table_entry.md name="NameChunkObfuscator" summary="This annotator allows to transform a dataset with an Input Annotation of type CHUNK, into its obfuscated version of by obfuscating the given CHUNKS."%} {% include templates/licensed_table_entry.md name="NerChunker" summary="Extracts phrases that fits into a known pattern using the NER tags."%} @@ -96,6 +98,7 @@ Check out the [Spark NLP Annotators page](https://nlp.johnsnowlabs.com/docs/en/a {% include templates/licensed_table_entry.md name="ReIdentification" summary="Reidentifies obfuscated entities by DeIdentification."%} {% include templates/licensed_table_entry.md name="RelationExtraction" summary="Extracts and classifies instances of relations between named entities."%} {% include templates/licensed_table_entry.md 
name="RelationExtractionDL" summary="Extracts and classifies instances of relations between named entities."%} +{% include templates/licensed_table_entry.md name="REChunkMerger" summary="Merges relation chunks to create a new chunk."%} {% include templates/licensed_table_entry.md name="RENerChunksFilter" summary="Filters and outputs combinations of relations between extracted entities, for further processing."%} {% include templates/licensed_table_entry.md name="Replacer" summary="This annotator allows to replace entities in the original text with the ones extracted by the annotators `NameChunkObfuscatorApproach` or `DateNormalizer`."%} {% include templates/licensed_table_entry.md name="Resolution2Chunk" summary="This annotator is responsible for converting the annotations generated by entity resolver models (typically labeled as ENTITY) into a format compatible with subsequent stages of the pipeline, such as the ChunkMapperModel."%}