diff --git a/docs/en/licensed_annotator_entries/ChunkSentenceSplitter.md b/docs/en/licensed_annotator_entries/ChunkSentenceSplitter.md index c1d7a3a84a..b745d68097 100644 --- a/docs/en/licensed_annotator_entries/ChunkSentenceSplitter.md +++ b/docs/en/licensed_annotator_entries/ChunkSentenceSplitter.md @@ -16,7 +16,7 @@ Parametres; - `InsertChunk`: (boolean) Whether to insert the chunk in the paragraph or not. -- `DefaultEntity`: (str) Sets the key in the metadata dictionary that you want to filter (by default "entity") +- `DefaultEntity`: (str) Sets the key in the metadata dictionary that you want to filter (by default 'entity') For detailed usage of this annotator, visit [this notebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/18.Chunk_Sentence_Splitter.ipynb) from our `Spark NLP Workshop`. @@ -31,32 +31,33 @@ DOCUMENT {%- endcapture -%} {%- capture model_python_medical -%} +from johnsnowlabs import nlp, medical # Defining the pipeline documentAssembler = nlp.DocumentAssembler()\ - .setInputCol("text")\ - .setOutputCol("document") + .setInputCol("text")\ + .setOutputCol("document") sentenceDetector = nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ - .setInputCols(["document"])\ - .setOutputCol("sentence") + .setInputCols(["document"])\ + .setOutputCol("sentence") tokenizer = nlp.Tokenizer()\ - .setInputCols(["sentence"])\ - .setOutputCol("token")\ + .setInputCols(["sentence"])\ + .setOutputCol("token")\ word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ - .setInputCols(["sentence", "token"])\ - .setOutputCol("embeddings") + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") clinical_ner = medical.NerModel.pretrained("ner_jsl_slim", "en", "clinical/models") \ - .setInputCols(["sentence", "token", "embeddings"]) \ - .setOutputCol("ner") + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") ner_converter = medical.NerConverterInternal() \ - .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk")\ - .setWhiteList(["Header"]) + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk")\ + .setWhiteList(["Header"]) #applying ChunkSentenceSplitter chunkSentenceSplitter = medical.ChunkSentenceSplitter()\ @@ -65,15 +66,15 @@ chunkSentenceSplitter = medical.ChunkSentenceSplitter()\ .setGroupBySentences(False) pipeline_model = nlp.Pipeline( - stages = [ - documentAssembler, - sentenceDetector, - tokenizer, - word_embeddings, - clinical_ner, - ner_converter, - chunkSentenceSplitter - ]) + stages = [ + documentAssembler, + sentenceDetector, + tokenizer, + word_embeddings, + clinical_ner, + ner_converter, + chunkSentenceSplitter + ]) sentences = [["""Sample Name: Mesothelioma - Pleural Biopsy @@ -108,35 +109,36 @@ paragraphs.selectExpr("explode(paragraphs) as result")\ {%- endcapture -%} {%- capture model_scala_medical -%} +import spark.implicits._ val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols(Array("document")) + .setInputCols("document") .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols(Array("sentence")) + .setInputCols("sentence") .setOutputCol("token") val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") 
.setInputCols(Array("sentence", "token")) .setOutputCol("embeddings") -val clinical_ner = MedicalNerModel.pretrained("ner_jsl_slim", "en", "clinical/models") \ - .setInputCols(Array("sentence", "token", "embeddings")) +val clinical_ner = MedicalNerModel.pretrained("ner_jsl_slim", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) .setOutputCol("ner") -val ner_converter = new NerConverterInternal() - .setInputCols(Array("sentence", "token", "ner")) +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") - .setWhiteList(Array("Header")) + .setWhiteList("Header") -//applying ChunkSentenceSplitter +#applying ChunkSentenceSplitter val chunkSentenceSplitter = new ChunkSentenceSplitter() - .setInputCols("document","ner_chunk") + .setInputCols(Array("document","ner_chunk")) .setOutputCol("paragraphs") .setGroupBySentences(false) @@ -164,8 +166,6 @@ Dr. X was present for the entire procedure which was right VATS pleurodesis and val data = Seq(sentences).toDF("text") val paragraphs = pipeline_model.fit(df).transform(df) -paragraphs.selectExpr("explode(paragraphs) as result") - .selectExpr("result.result","result.metadata.entity", "result.metadata.splitter_chunk").show(truncate=80) +--------------------------------------------------------------------------------+------------+------------------------+ | result| entity| splitter_chunk| @@ -183,6 +183,7 @@ paragraphs.selectExpr("explode(paragraphs) as result") {%- endcapture -%} {%- capture model_python_legal -%} +from johnsnowlabs import nlp, medical, legal documentAssembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -197,8 +198,8 @@ tokenizer = nlp.Tokenizer()\ .setOutputCol("token") embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \ - .setInputCols(["sentence", "token"]) \ - .setOutputCol("embeddings") + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") ner_model = legal.NerModel.pretrained("legner_headers", "en", "legal/models")\ .setInputCols(["sentence", "token", "embeddings"])\ @@ -212,7 +213,6 @@ chunkSentenceSplitter = legal.ChunkSentenceSplitter()\ .setInputCols("document","ner_chunk")\ .setOutputCol("paragraphs")\ .setGroupBySentences(False) - nlp_pipeline = nlp.Pipeline(stages=[ documentAssembler, @@ -230,7 +230,7 @@ NOW, THEREFORE, for good and valuable consideration, and in consideration of the 2. Definitions. For purposes of this Agreement, the following terms have the meanings ascribed thereto in this Section 1. 2. Appointment as Reseller. -2.1 Appointment. The Company hereby [***]. Allscripts may also disclose Company"s pricing information relating to its Merchant Processing Services and facilitate procurement of Merchant Processing Services on behalf of Sublicensed Customers, including, without limitation by references to such pricing information and Merchant Processing Services in Customer Agreements. 6 +2.1 Appointment. The Company hereby [***]. Allscripts may also disclose Company's pricing information relating to its Merchant Processing Services and facilitate procurement of Merchant Processing Services on behalf of Sublicensed Customers, including, without limitation by references to such pricing information and Merchant Processing Services in Customer Agreements. 6 2.2 Customer Agreements. 
@@ -242,35 +242,36 @@ paragraphs = nlp_pipeline.fit(sdf).transform(sdf) paragraphs.selectExpr("explode(paragraphs) as result")\ .selectExpr("result.result","result.metadata.entity").show(truncate=50) -+----------------------------------------------------------------------------------------------------+---------+ -| result| entity| -+----------------------------------------------------------------------------------------------------+---------+ -|AGREEMENT NOW, THEREFORE, for good and valuable consideration, and in consideration of the mutua...|SUBHEADER| -| Appointment as Reseller. |SUBHEADER| -| 2.1 Appointment. |SUBHEADER| -|The Company hereby [***]. Allscripts may also disclose Company"s pricing information relating to ...|SUBHEADER| -| 6 2.2 Customer Agreements. | HEADER| -|a) Subscriptions. Allscripts and its Affiliates may sell Subscriptions for terms no less than one...|SUBHEADER| -+----------------------------------------------------------------------------------------------------+---------+ ++--------------------------------------------------+---------+ +| result| entity| ++--------------------------------------------------+---------+ +|AGREEMENT NOW, THEREFORE, for good and valuabl... |SUBHEADER| +| Appointment as Reseller. |SUBHEADER| +| 2.1 Appointment. |SUBHEADER| +|The Company hereby [***]. Allscripts may also d...|SUBHEADER| +| 6 2.2 Customer Agreements. | HEADER| +|a) Subscriptions. Allscripts and its Affiliates...|SUBHEADER| ++--------------------------------------------------+---------+ {%- endcapture -%} {%- capture model_scala_legal -%} +import spark.implicits._ val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx") - .setInputCols(Array("document")) + .setInputCols("document") .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols(Array("sentence")) + .setInputCols("sentence") .setOutputCol("token") -val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") - .setInputCols(Array("sentence", "token")) +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") + .setInputCols(Array("sentence", "token")) .setOutputCol("embeddings") val ner_model = LegalNerModel.pretrained("legner_headers", "en", "legal/models") @@ -282,7 +283,7 @@ val ner_converter = new NerConverterInternal() .setOutputCol("ner_chunk") val chunkSentenceSplitter = new ChunkSentenceSplitter() - .setInputCols("document","ner_chunk") + .setInputCols(Array("document","ner_chunk")) .setOutputCol("paragraphs") .setGroupBySentences(false) @@ -302,7 +303,7 @@ NOW, THEREFORE, for good and valuable consideration, and in consideration of the 2. Definitions. For purposes of this Agreement, the following terms have the meanings ascribed thereto in this Section 1. 2. Appointment as Reseller. -2.1 Appointment. The Company hereby [***]. Allscripts may also disclose Company"s pricing information relating to its Merchant Processing Services and facilitate procurement of Merchant Processing Services on behalf of Sublicensed Customers, including, without limitation by references to such pricing information and Merchant Processing Services in Customer Agreements. 6 +2.1 Appointment. The Company hereby [***]. 
Allscripts may also disclose Company's pricing information relating to its Merchant Processing Services and facilitate procurement of Merchant Processing Services on behalf of Sublicensed Customers, including, without limitation by references to such pricing information and Merchant Processing Services in Customer Agreements. 6 2.2 Customer Agreements. @@ -311,24 +312,22 @@ a) Subscriptions. Allscripts and its Affiliates may sell Subscriptions for terms val data = Seq(text).toDF("text") val paragraphs = nlp_pipeline.fit(data).transform(data) -paragraphs.selectExpr("explode(paragraphs) as result") - .selectExpr("result.result","result.metadata.entity").show(truncate=50) - -+----------------------------------------------------------------------------------------------------+---------+ -| result| entity| -+----------------------------------------------------------------------------------------------------+---------+ -|AGREEMENT NOW, THEREFORE, for good and valuable consideration, and in consideration of the mutua...|SUBHEADER| -| Appointment as Reseller. |SUBHEADER| -| 2.1 Appointment. |SUBHEADER| -|The Company hereby [***]. Allscripts may also disclose Company"s pricing information relating to ...|SUBHEADER| -| 6 2.2 Customer Agreements. | HEADER| -|a) Subscriptions. Allscripts and its Affiliates may sell Subscriptions for terms no less than one...|SUBHEADER| -+----------------------------------------------------------------------------------------------------+---------+ ++--------------------------------------------------+---------+ +| result| entity| ++--------------------------------------------------+---------+ +|AGREEMENT NOW, THEREFORE, for good and valuabl... |SUBHEADER| +| Appointment as Reseller. |SUBHEADER| +| 2.1 Appointment. |SUBHEADER| +|The Company hereby [***]. Allscripts may also d...|SUBHEADER| +| 6 2.2 Customer Agreements. | HEADER| +|a) Subscriptions. Allscripts and its Affiliates...|SUBHEADER| ++--------------------------------------------------+---------+ {%- endcapture -%} {%- capture model_python_finance -%} +from johnsnowlabs import nlp, medical, finance documentAssembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -358,7 +357,7 @@ chunkSentenceSplitter = legal.ChunkSentenceSplitter()\ .setInputCols("document","ner_chunk")\ .setOutputCol("paragraphs")\ .setGroupBySentences(False) - + nlp_pipeline = nlp.Pipeline(stages=[ documentAssembler, sentenceDetector, @@ -375,7 +374,7 @@ For purposes of this Agreement, the following terms have the meanings ascribed t 2.1 Appointment. -The Company hereby [***]. Allscripts may also disclose Company"s pricing information relating to its Merchant Processing Services and facilitate procurement of Merchant Processing Services on behalf of Sublicensed Customers, including, without limitation by references to such pricing information and Merchant Processing Services in Customer Agreements. 6 +The Company hereby [***]. Allscripts may also disclose Company's pricing information relating to its Merchant Processing Services and facilitate procurement of Merchant Processing Services on behalf of Sublicensed Customers, including, without limitation by references to such pricing information and Merchant Processing Services in Customer Agreements. 
6 2.2 Customer Agreements.""" @@ -385,34 +384,35 @@ paragraphs = nlp_pipeline.fit(sdf).transform(sdf) paragraphs.selectExpr("explode(paragraphs) as result")\ .selectExpr("result.result","result.metadata.entity").show(truncate=50) -+----------------------------------------------------------------------------------------------------+---------+ -| result| entity| -+----------------------------------------------------------------------------------------------------+---------+ -| 2. | HEADER| -|DEFINITION. For purposes of this Agreement, the following terms have the meanings ascribed ther...|SUBHEADER| -| 2.1 Appointment. |SUBHEADER| -|The Company hereby [***]. Allscripts may also disclose Company"s pricing information relating to ...|SUBHEADER| -| 6 2.2 Customer Agreements| HEADER| -+----------------------------------------------------------------------------------------------------+---------+ ++--------------------------------------------------+---------+ +| result| entity| ++--------------------------------------------------+---------+ +| 2. | HEADER| +|DEFINITION. For purposes of this Agreement, t...|SUBHEADER| +| 2.1 Appointment. |SUBHEADER| +|The Company hereby [***]. Allscripts may also d...|SUBHEADER| +| 6 2.2 Customer Agreements| HEADER| ++--------------------------------------------------+---------+ {%- endcapture -%} -{%- capture model_scala_legal -%} +{%- capture model_scala_finance -%} +import spark.implicits._ val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx") - .setInputCols(Array("document")) + .setInputCols("document") .setOutputCol("sentence") -val tokenizer = new Tokenizer() - .setInputCols(Array("sentence")) +val tokenizer = new Tokenizer + .setInputCols("sentence") .setOutputCol("token") -val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") - .setInputCols(Array("sentence", "token")) +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") + .setInputCols(Array("sentence", "token")) .setOutputCol("embeddings") val ner_model = FinanceNerModel.pretrained("finner_headers", "en", "finance/models") @@ -424,7 +424,7 @@ val ner_converter = new NerConverterInternal() .setOutputCol("ner_chunk") val chunkSentenceSplitter = new ChunkSentenceSplitter() - .setInputCols("document","ner_chunk") + .setInputCols(Array("document","ner_chunk")) .setOutputCol("paragraphs") .setGroupBySentences(false) @@ -444,25 +444,23 @@ For purposes of this Agreement, the following terms have the meanings ascribed t 2.1 Appointment. -The Company hereby [***]. Allscripts may also disclose Company"s pricing information relating to its Merchant Processing Services and facilitate procurement of Merchant Processing Services on behalf of Sublicensed Customers, including, without limitation by references to such pricing information and Merchant Processing Services in Customer Agreements. 6 +The Company hereby [***]. Allscripts may also disclose Company's pricing information relating to its Merchant Processing Services and facilitate procurement of Merchant Processing Services on behalf of Sublicensed Customers, including, without limitation by references to such pricing information and Merchant Processing Services in Customer Agreements. 
6 2.2 Customer Agreements.""" val data = Seq(text).toDF("text") val paragraphs = nlp_pipeline.fit(data).transform(data) -paragraphs.selectExpr("explode(paragraphs) as result") - .selectExpr("result.result","result.metadata.entity").show(truncate=50) -+----------------------------------------------------------------------------------------------------+---------+ -| result| entity| -+----------------------------------------------------------------------------------------------------+---------+ -| 2. | HEADER| -|DEFINITION. For purposes of this Agreement, the following terms have the meanings ascribed ther...|SUBHEADER| -| 2.1 Appointment. |SUBHEADER| -|The Company hereby [***]. Allscripts may also disclose Company"s pricing information relating to ...|SUBHEADER| -| 6 2.2 Customer Agreements| HEADER| -+----------------------------------------------------------------------------------------------------+---------+ ++--------------------------------------------------+---------+ +| result| entity| ++--------------------------------------------------+---------+ +| 2. | HEADER| +|DEFINITION. For purposes of this Agreement, t...|SUBHEADER| +| 2.1 Appointment. |SUBHEADER| +|The Company hereby [***]. Allscripts may also d...|SUBHEADER| +| 6 2.2 Customer Agreements| HEADER| ++--------------------------------------------------+---------+ {%- endcapture -%} @@ -476,7 +474,7 @@ paragraphs.selectExpr("explode(paragraphs) as result") {%- endcapture -%} {%- capture model_notebook_link -%} -[ChunkSentenceSplitter](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/ChunkSentenceSplitter.ipynb) +[Notebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/ChunkSentenceSplitter.ipynb) {%- endcapture -%} {% include templates/licensed_approach_model_medical_fin_leg_template.md diff --git a/docs/en/licensed_annotator_entries/ContextualParser.md b/docs/en/licensed_annotator_entries/ContextualParser.md index ea9ed8c575..80edad4733 100644 --- a/docs/en/licensed_annotator_entries/ContextualParser.md +++ b/docs/en/licensed_annotator_entries/ContextualParser.md @@ -6,32 +6,6 @@ ContextualParser approach {%- endcapture -%} -{%- capture model -%} -model -{%- endcapture -%} - -{%- capture model_description -%} -Extracts entity from a document based on user defined rules. Rule matching is based on a RegexMatcher defined in a -JSON file. In this file, regex is defined that you want to match along with the information that will output on -metadata field. To instantiate a model, see ContextualParserApproach and its accompanied example. -{%- endcapture -%} - -{%- capture model_input_anno -%} -DOCUMENT, TOKEN -{%- endcapture -%} - -{%- capture model_output_anno -%} -CHUNK -{%- endcapture -%} - -{%- capture model_api_link -%} -[ContextualParserModel](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/context/ContextualParserModel.html) -{%- endcapture -%} - -{%- capture model_python_api_link -%} -[ContextualParserModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/context/contextual_parser/index.html#sparknlp_jsl.annotator.context.contextual_parser.ContextualParserModel) -{%- endcapture -%} - {%- capture approach_description -%} Creates a model, that extracts entity from a document based on user defined rules. Rule matching is based on a RegexMatcher defined in a JSON file. 
It is set through the parameter setJsonPath() @@ -39,6 +13,21 @@ In this JSON file, regex is defined that you want to match along with the inform field. Additionally, a dictionary can be provided with `setDictionary` to map extracted entities to a unified representation. The first column of the dictionary file should be the representation with following columns the possible matches. + +Parametres; + +- `inputCols`: The name of the columns containing the input annotations. It can read either a String column or an Array. +- `outputCol`: The name of the column in Document type that is generated. We can specify only one column here. +- `jsonPath`: Path to json file containing regex patterns and rules to match the entities. +- `dictionary`: Path to dictionary file in tsv or csv format. +- `caseSensitive`: Whether to use case sensitive when matching values. +- `prefixAndSuffixMatch`: Whether to match both prefix and suffix to annotate the match. +- `optionalContextRules`: When set to true, it will output regex match regardless of context matches. +- `shortestContextMatch`: When set to true, it will stop finding for matches when prefix/suffix data is found in the text. +- `completeContextMatch`: Whether to do an exact match of prefix and suffix. + +All the parameters can be set using the corresponding set method in camel case. For example, `.setInputcols()`. + {%- endcapture -%} {%- capture approach_input_anno -%} @@ -50,7 +39,8 @@ CHUNK {%- endcapture -%} {%- capture approach_python_medical -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, medical + # An example JSON file `regex_token.json` can look like this: # # { @@ -63,35 +53,36 @@ from johnsnowlabs import * # Which means to extract the stage code on a sentence level. # An example pipeline could then be defined like this # Pipeline could then be defined like this + documentAssembler = nlp.DocumentAssembler() \ - .setInputCol("text") \ - .setOutputCol("document") + .setInputCol("text") \ + .setOutputCol("document") sentenceDetector = nlp.SentenceDetector() \ - .setInputCols(["document"]) \ - .setOutputCol("sentence") + .setInputCols(["document"]) \ + .setOutputCol("sentence") tokenizer = nlp.Tokenizer() \ - .setInputCols(["sentence"]) \ - .setOutputCol("token") - -# Define the parser (json file needs to be provided) -data = spark.createDataFrame([["A patient has liver metastases pT1bN0M0 and the T5 primary site may be colon or... "]]).toDF("text") + .setInputCols(["sentence"]) \ + .setOutputCol("token") contextualParser = medical.ContextualParserApproach() \ - .setInputCols(["sentence", "token"]) \ - .setOutputCol("entity") \ - .setJsonPath("/path/to/regex_token.json") \ - .setCaseSensitive(True) \ - .setContextMatch(False) + .setInputCols(["sentence", "token"]) \ + .setOutputCol("entity") \ + .setJsonPath("/path/to/regex_token.json") \ + .setCaseSensitive(True) \ + .setContextMatch(False) -pipeline = Pipeline(stages=[ +pipeline = nlp.Pipeline(stages=[ documentAssembler, sentenceDetector, tokenizer, contextualParser ]) +# Define the parser (json file needs to be provided) +data = spark.createDataFrame([["A patient has liver metastases pT1bN0M0 and the T5 primary site may be colon or... 
"]]).toDF("text") + result = pipeline.fit(data).transform(data) # Show Results @@ -108,7 +99,8 @@ result.selectExpr("explode(entity)").show(5, truncate=False) {%- endcapture -%} {%- capture approach_python_legal -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, legal + # An example JSON file `regex_token.json` can look like this: # # { @@ -121,37 +113,51 @@ from johnsnowlabs import * # Which means to extract the stage code on a sentence level. # An example pipeline could then be defined like this # Pipeline could then be defined like this + documentAssembler = nlp.DocumentAssembler() \ - .setInputCol("text") \ - .setOutputCol("document") + .setInputCol("text") \ + .setOutputCol("document") sentenceDetector = nlp.SentenceDetector() \ - .setInputCols(["document"]) \ - .setOutputCol("sentence") + .setInputCols(["document"]) \ + .setOutputCol("sentence") tokenizer = nlp.Tokenizer() \ - .setInputCols(["sentence"]) \ - .setOutputCol("token") - -# Define the parser (json file needs to be provided) + .setInputCols(["sentence"]) \ + .setOutputCol("token") contextualParser = legal.ContextualParserApproach() \ - .setInputCols(["sentence", "token"]) \ - .setOutputCol("entity") \ - .setJsonPath("/path/to/regex_token.json") \ - .setCaseSensitive(True) \ - .setContextMatch(False) + .setInputCols(["sentence", "token"]) \ + .setOutputCol("entity") \ + .setJsonPath("/path/to/regex_token.json") \ + .setCaseSensitive(True) \ + .setContextMatch(False) -pipeline = Pipeline(stages=[ +pipeline = nlp.Pipeline(stages=[ documentAssembler, sentenceDetector, tokenizer, contextualParser ]) + +# Define the parser (json file needs to be provided) +data = spark.createDataFrame([["Peter Parker is a nice guy and lives in New York . Bruce Wayne is also a nice guy and lives in San Antonio and Gotham City ."]]).toDF("text") + +result = pipeline.fit(data).transform(data) + +# Show Results +result.selectExpr("explode(entity)").show(5, truncate=False) + ++---------------------------------------------------------------+ +|result | ++---------------------------------------------------------------+ +|[Peter Parker, New York, Bruce Wayne, San Antonio, Gotham City]| ++---------------------------------------------------------------+ {%- endcapture -%} {%- capture approach_python_finance -%} -from johnsnowlabs import * +from johnsnowlabs import nlp, finance + # An example JSON file `regex_token.json` can look like this: # # { @@ -164,37 +170,53 @@ from johnsnowlabs import * # Which means to extract the stage code on a sentence level. 
# An example pipeline could then be defined like this # Pipeline could then be defined like this + documentAssembler = nlp.DocumentAssembler() \ - .setInputCol("text") \ - .setOutputCol("document") + .setInputCol("text") \ + .setOutputCol("document") sentenceDetector = nlp.SentenceDetector() \ - .setInputCols(["document"]) \ - .setOutputCol("sentence") + .setInputCols(["document"]) \ + .setOutputCol("sentence") tokenizer = nlp.Tokenizer() \ - .setInputCols(["sentence"]) \ - .setOutputCol("token") + .setInputCols(["sentence"]) \ + .setOutputCol("token") # Define the parser (json file needs to be provided) contextualParser = finance.ContextualParserApproach() \ - .setInputCols(["sentence", "token"]) \ - .setOutputCol("entity") \ - .setJsonPath("/path/to/regex_token.json") \ - .setCaseSensitive(True) \ - .setContextMatch(False) + .setInputCols(["sentence", "token"]) \ + .setOutputCol("entity") \ + .setJsonPath("/path/to/regex_token.json") \ + .setCaseSensitive(True) \ + .setContextMatch(False) -pipeline = Pipeline(stages=[ +pipeline = nlp.Pipeline(stages=[ documentAssembler, sentenceDetector, tokenizer, contextualParser ]) + +# Define the parser (json file needs to be provided) +data = spark.createDataFrame([["Peter Parker is a nice guy and lives in New York . Bruce Wayne is also a nice guy and lives in San Antonio and Gotham City ."]]).toDF("text") + +result = pipeline.fit(data).transform(data) + +# Show Results +result.selectExpr("explode(entity)").show(5, truncate=False) + ++---------------------------------------------------------------+ +|result | ++---------------------------------------------------------------+ +|[Peter Parker, New York, Bruce Wayne, San Antonio, Gotham City]| ++---------------------------------------------------------------+ {%- endcapture -%} {%- capture approach_scala_medical -%} -from johnsnowlabs import * +import spark.implicits._ + // An example JSON file `regex_token.json` can look like this: // // { @@ -206,26 +228,26 @@ from johnsnowlabs import * // // Which means to extract the stage code on a sentence level. // An example pipeline could then be defined like this -val documentAssembler = new nlp.DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") -val sentenceDetector = new nlp.SentenceDetector() - .setInputCols("document") - .setOutputCol("sentence") +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") -val tokenizer = new nlp.Tokenizer() - .setInputCols("sentence") - .setOutputCol("token") +val sentenceDetector = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val contextualParser = new ContextualParserApproach() + .setInputCols(Array("sentence", "token")) + .setOutputCol("entity") + .setJsonPath("/path/to/regex_token.json") + .setCaseSensitive(true) + .setContextMatch(false) -// Define the parser (json file needs to be provided) -val data = Seq("A patient has liver metastases pT1bN0M0 and the T5 primary site may be colon or... 
").toDF("text") -val contextualParser = new medical.ContextualParserApproach() - .setInputCols(Array("sentence", "token")) - .setOutputCol("entity") - .setJsonPath("/path/to/regex_token.json") - .setCaseSensitive(true) - .setContextMatch(false) val pipeline = new Pipeline().setStages(Array( documentAssembler, sentenceDetector, @@ -233,6 +255,9 @@ val pipeline = new Pipeline().setStages(Array( contextualParser )) +// Define the parser (json file needs to be provided) +val data = Seq("A patient has liver metastases pT1bN0M0 and the T5 primary site may be colon or... ").toDF("text") + val result = pipeline.fit(data).transform(data) // Show Results @@ -251,7 +276,8 @@ val result = pipeline.fit(data).transform(data) {%- endcapture -%} {%- capture approach_scala_legal -%} -from johnsnowlabs import * +import spark.implicits._ + // An example JSON file `regex_token.json` can look like this: // // { @@ -263,36 +289,51 @@ from johnsnowlabs import * // // Which means to extract the stage code on a sentence level. // An example pipeline could then be defined like this -val documentAssembler = new nlp.DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") -val sentenceDetector = new nlp.SentenceDetector() - .setInputCols("document") - .setOutputCol("sentence") +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") -val tokenizer = new nlp.Tokenizer() - .setInputCols("sentence") - .setOutputCol("token") +val sentenceDetector = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val contextualParser = new ContextualParserApproach() + .setInputCols(Array("sentence", "token")) + .setOutputCol("entity") + .setJsonPath("/path/to/regex_token.json") + .setCaseSensitive(true) + .setContextMatch(false) -// Define the parser (json file needs to be provided) -val data = Seq("A patient has liver metastases pT1bN0M0 and the T5 primary site may be colon or... ").toDF("text") -val contextualParser = new legal.ContextualParserApproach() - .setInputCols(Array("sentence", "token")) - .setOutputCol("entity") - .setJsonPath("/path/to/regex_token.json") - .setCaseSensitive(true) - .setContextMatch(false) val pipeline = new Pipeline().setStages(Array( documentAssembler, sentenceDetector, tokenizer, contextualParser )) + +// Define the parser (json file needs to be provided) +val data = Seq("Peter Parker is a nice guy and lives in New York . Bruce Wayne is also a nice guy and lives in San Antonio and Gotham City .").toDF("text") + +val result = pipeline.fit(data).transform(data) + +// Show Results +result.selectExpr("explode(entity)").show(5, truncate=False) + ++---------------------------------------------------------------+ +|result | ++---------------------------------------------------------------+ +|[Peter Parker, New York, Bruce Wayne, San Antonio, Gotham City]| ++---------------------------------------------------------------+ {%- endcapture -%} {%- capture approach_scala_finance -%} -from johnsnowlabs import * +import spark.implicits._ + // An example JSON file `regex_token.json` can look like this: // // { @@ -304,32 +345,45 @@ from johnsnowlabs import * // // Which means to extract the stage code on a sentence level. 
// An example pipeline could then be defined like this -val documentAssembler = new nlp.DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") -val sentenceDetector = new nlp.SentenceDetector() - .setInputCols("document") - .setOutputCol("sentence") +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") -val tokenizer = new nlp.Tokenizer() - .setInputCols("sentence") - .setOutputCol("token") +val sentenceDetector = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val contextualParser = new ContextualParserApproach() + .setInputCols(Array("sentence", "token")) + .setOutputCol("entity") + .setJsonPath("/path/to/regex_token.json") + .setCaseSensitive(true) + .setContextMatch(false) -// Define the parser (json file needs to be provided) -val data = Seq("A patient has liver metastases pT1bN0M0 and the T5 primary site may be colon or... ").toDF("text") -val contextualParser = new finance.ContextualParserApproach() - .setInputCols(Array("sentence", "token")) - .setOutputCol("entity") - .setJsonPath("/path/to/regex_token.json") - .setCaseSensitive(true) - .setContextMatch(false) val pipeline = new Pipeline().setStages(Array( documentAssembler, sentenceDetector, tokenizer, contextualParser )) + +// Define the parser (json file needs to be provided) +val data = Seq("Peter Parker is a nice guy and lives in New York . Bruce Wayne is also a nice guy and lives in San Antonio and Gotham City .").toDF("text") + +val result = pipeline.fit(data).transform(data) + +// Show Results + ++---------------------------------------------------------------+ +|result | ++---------------------------------------------------------------+ +|[Peter Parker, New York, Bruce Wayne, San Antonio, Gotham City]| ++---------------------------------------------------------------+ {%- endcapture -%} {%- capture approach_api_link -%} @@ -340,15 +394,14 @@ val pipeline = new Pipeline().setStages(Array( [ContextualParserApproach](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/context/contextual_parser/index.html#sparknlp_jsl.annotator.context.contextual_parser.ContextualParserApproach) {%- endcapture -%} +{%- capture approach_notebook_link -%} +[Notebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/ContextualParserApproach.ipynb) +{%- endcapture -%} + {% include templates/licensed_approach_model_medical_fin_leg_template.md title=title model=model approach=approach -model_description=model_description -model_input_anno=model_input_anno -model_output_anno=model_output_anno -model_api_link=model_api_link -model_python_api_link=model_python_api_link approach_description=approach_description approach_input_anno=approach_input_anno approach_output_anno=approach_output_anno @@ -360,4 +413,5 @@ approach_scala_legal=approach_scala_legal approach_scala_finance=approach_scala_finance approach_api_link=approach_api_link approach_python_api_link=approach_python_api_link +approach_notebook_link=approach_notebook_link %} diff --git a/docs/en/licensed_annotator_entries/DateNormalizer.md b/docs/en/licensed_annotator_entries/DateNormalizer.md index b60a6fdfc8..89822abae5 100644 --- a/docs/en/licensed_annotator_entries/DateNormalizer.md +++ b/docs/en/licensed_annotator_entries/DateNormalizer.md @@ -14,6 +14,24 @@ For the relative dates (next year, past month, etc.), 
you can define an achor da The resultant chunk date will contain a metada indicating whether the normalization was successful or not (True / False). +Parametres; + +- `anchorDateYear`: (Int) Sets an anchor year for the relative dates such as a day after tomorrow. If not set it will use the current year. + +- `anchorDateMonth`: (Int) Sets an anchor month for the relative dates such as a day after tomorrow. If not set it will use the current month. + +- `anchorDateDay`: (Int) Sets an anchor day of the day for the relative dates such as a day after tomorrow. If not set it will use the current day. + +- `outputDateformat`: (string) Select what output format to use. If not set, the dates will be formatted as `YYYY/MM/DD`. Options are: + - `eu`: Format the dates as `DD/MM/YYYY` + - `us`: Format the dates as `MM/DD/YYYY` + +- `defaultReplacementDay`: (Int) Defines which value to use for creating the Day Value when original Date-Entity has no Day Information. Defaults to 15. + +- `defaultReplacementMonth`: (Int) Defines which value to use for creating the Month Value when original Date-Entity has no Month Information. Defaults to 06. + +- `defaultReplacementYear`: (Int) Defines which value to use for creating the Year Value when original Date-Entity has no Year Information. Defaults to 2020. + {%- endcapture -%} {%- capture model_input_anno -%} @@ -25,8 +43,22 @@ CHUNK {%- endcapture -%} {%- capture model_python_medical -%} +from johnsnowlabs import nlp, medical + +document_assembler = nlp.DocumentAssembler()\ + .setInputCol("original_date")\ + .setOutputCol("document") + +doc2chunk = nlp.Doc2Chunk()\ + .setInputCols("document")\ + .setOutputCol("date_chunk") -from pyspark.sql.types import StringType +date_normalizer = medical.DateNormalizer()\ + .setInputCols("date_chunk")\ + .setOutputCol("date")\ + .setAnchorDateYear(2000) + +pipeline = nlp.Pipeline(stages=[document_assembler, doc2chunk, date_normalizer]) dates = [ "08/02/2018", @@ -42,22 +74,107 @@ dates = [ ] df = spark.createDataFrame(dates, StringType()).toDF("original_date") -document_assembler = ( - DocumentAssembler().setInputCol("original_date").setOutputCol("document") -) +result = pipeline.fit(df).transform(df) +result.selectExpr( + "date.result as normalized_date", + "original_date", + "date.metadata[0].normalized as metadata", +).show() + ++---------------+-------------+--------+ +|normalized_date|original_date|metadata| ++---------------+-------------+--------+ +| [2018/08/02]| 08/02/2018| true| +| [2018/11/15]| 11/2018| true| +| [2018/11/01]| 11/01/2018| true| +| [2021/03/12]| 12Mar2021| true| +| [2018/01/30]| Jan 30, 2018| true| +| [1999/04/13]| 13.04.1999| true| +| [2020/04/03]| 3April 2020| true| +| [2000/12/11]| next monday| true| +| [2000/12/06]| today| true| +| [2000/12/13]| next week| true| ++---------------+-------------+--------+ + +{%- endcapture -%} + + +{%- capture model_scala_medical -%} +import spark.implicits._ + +val document_assembler = new DocumentAssembler() + .setInputCol("original_date") + .setOutputCol("document") -doc2chunk = Doc2Chunk().setInputCols("document").setOutputCol("date_chunk") +val doc2chunk = new Doc2Chunk() + .setInputCols("document") + .setOutputCol("date_chunk") -date_normalizer = ( - DateNormalizer() +val date_normalizer = new DateNormalizer() .setInputCols("date_chunk") .setOutputCol("date") .setAnchorDateYear(2000) - .setAnchorDateMonth(3) - .setAnchorDateDay(15) -) -pipeline = Pipeline(stages=[document_assembler, doc2chunk, date_normalizer]) +val pipeline = new Pipeline().setStages(Array( 
+ document_assembler, + doc2chunk, + date_normalizer +)) + +import spark.implicits._ + +val df = Seq(("08/02/2018"),("11/2018"),("11/01/2018"),("next monday"),("today"),("next week")).toDF("original_date") + +val result = pipeline.fit(df).transform(df) + ++---------------+-------------+--------+ +|normalized_date|original_date|metadata| ++---------------+-------------+--------+ +| [2018/08/02]| 08/02/2018| true| +| [2018/11/15]| 11/2018| true| +| [2018/11/01]| 11/01/2018| true| +| [2021/03/12]| 12Mar2021| true| +| [2018/01/30]| Jan 30, 2018| true| +| [1999/04/13]| 13.04.1999| true| +| [2020/04/03]| 3April 2020| true| +| [2000/12/11]| next monday| true| +| [2000/12/06]| today| true| +| [2000/12/13]| next week| true| ++---------------+-------------+--------+ + +{%- endcapture -%} + +{%- capture model_python_legal -%} +from johnsnowlabs import nlp, legal + +document_assembler = nlp.DocumentAssembler()\ + .setInputCol("original_date")\ + .setOutputCol("document") + +doc2chunk = nlp.Doc2Chunk()\ + .setInputCols("document")\ + .setOutputCol("date_chunk") + +date_normalizer = legal.DateNormalizer()\ + .setInputCols("date_chunk")\ + .setOutputCol("date")\ + .setAnchorDateYear(2000) + +pipeline = nlp.Pipeline(stages=[document_assembler, doc2chunk, date_normalizer]) + +dates = [ + "08/02/2018", + "11/2018", + "11/01/2018", + "12Mar2021", + "Jan 30, 2018", + "13.04.1999", + "3April 2020", + "next monday", + "today", + "next week", +] +df = spark.createDataFrame(dates, StringType()).toDF("original_date") result = pipeline.fit(df).transform(df) result.selectExpr( @@ -70,52 +187,167 @@ result.selectExpr( |normalized_date|original_date|metadata| +---------------+-------------+--------+ | [2018/08/02]| 08/02/2018| true| -| [2018/11/DD]| 11/2018| true| +| [2018/11/15]| 11/2018| true| | [2018/11/01]| 11/01/2018| true| | [2021/03/12]| 12Mar2021| true| | [2018/01/30]| Jan 30, 2018| true| | [1999/04/13]| 13.04.1999| true| | [2020/04/03]| 3April 2020| true| -| [2000/03/20]| next monday| true| -| [2000/03/15]| today| true| -| [2000/03/22]| next week| true| +| [2000/12/11]| next monday| true| +| [2000/12/06]| today| true| +| [2000/12/13]| next week| true| ++---------------+-------------+--------+ + +{%- endcapture -%} + + +{%- capture model_scala_legal -%} + +val document_assembler = new DocumentAssembler() + .setInputCol("original_date") + .setOutputCol("document") + +val doc2chunk = new Doc2Chunk() + .setInputCols("document") + .setOutputCol("date_chunk") + +val date_normalizer = new DateNormalizer() + .setInputCols("date_chunk") + .setOutputCol("date") + .setAnchorDateYear(2000) + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + doc2chunk, + date_normalizer +)) + +import spark.implicits._ + +val df = Seq(("08/02/2018"),("11/2018"),("11/01/2018"),("next monday"),("today"),("next week")).toDF("original_date") + +val result = pipeline.fit(df).transform(df) + + ++---------------+-------------+--------+ +|normalized_date|original_date|metadata| ++---------------+-------------+--------+ +| [2018/08/02]| 08/02/2018| true| +| [2018/11/15]| 11/2018| true| +| [2018/11/01]| 11/01/2018| true| +| [2021/03/12]| 12Mar2021| true| +| [2018/01/30]| Jan 30, 2018| true| +| [1999/04/13]| 13.04.1999| true| +| [2020/04/03]| 3April 2020| true| +| [2000/12/11]| next monday| true| +| [2000/12/06]| today| true| +| [2000/12/13]| next week| true| ++---------------+-------------+--------+ + +{%- endcapture -%} + +{%- capture model_python_finance -%} + +from johnsnowlabs import nlp, finance + +document_assembler = 
nlp.DocumentAssembler()\ + .setInputCol("original_date")\ + .setOutputCol("document") + +doc2chunk = nlp.Doc2Chunk()\ + .setInputCols("document")\ + .setOutputCol("date_chunk") + +date_normalizer = finance.DateNormalizer()\ + .setInputCols("date_chunk")\ + .setOutputCol("date")\ + .setAnchorDateYear(2000) + +pipeline = nlp.Pipeline(stages=[document_assembler, doc2chunk, date_normalizer]) + +dates = [ + "08/02/2018", + "11/2018", + "11/01/2018", + "12Mar2021", + "Jan 30, 2018", + "13.04.1999", + "3April 2020", + "next monday", + "today", + "next week", +] +df = spark.createDataFrame(dates, StringType()).toDF("original_date") + +result = pipeline.fit(df).transform(df) +result.selectExpr( + "date.result as normalized_date", + "original_date", + "date.metadata[0].normalized as metadata", +).show() + ++---------------+-------------+--------+ +|normalized_date|original_date|metadata| ++---------------+-------------+--------+ +| [2018/08/02]| 08/02/2018| true| +| [2018/11/15]| 11/2018| true| +| [2018/11/01]| 11/01/2018| true| +| [2021/03/12]| 12Mar2021| true| +| [2018/01/30]| Jan 30, 2018| true| +| [1999/04/13]| 13.04.1999| true| +| [2020/04/03]| 3April 2020| true| +| [2000/12/11]| next monday| true| +| [2000/12/06]| today| true| +| [2000/12/13]| next week| true| +---------------+-------------+--------+ {%- endcapture -%} {%- capture model_scala_medical -%} +import spark.implicits._ + +val document_assembler = new DocumentAssembler() + .setInputCol("original_date") + .setOutputCol("document") + +val doc2chunk = new Doc2Chunk() + .setInputCols("document") + .setOutputCol("date_chunk") +val date_normalizer = new DateNormalizer() + .setInputCols("date_chunk") + .setOutputCol("date") + .setAnchorDateYear(2000) + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + doc2chunk, + date_normalizer +)) + +import spark.implicits._ + val df = Seq(("08/02/2018"),("11/2018"),("11/01/2018"),("next monday"),("today"),("next week")).toDF("original_date") -val documentAssembler = new DocumentAssembler().setInputCol("original_date").setOutputCol("document") - -val chunksDF = documentAssembler - .transform(df) - .mapAnnotationsCol[Seq[Annotation]]("document", - "chunk_date", - CHUNK, - (aa:Seq[Annotation]) => - aa.map( ann => ann.copy(annotatorType = CHUNK))) -val dateNormalizerModel = new DateNormalizer() - .setInputCols("chunk_date") - .setOutputCol("date") - .setAnchorDateDay(15) - .setAnchorDateMonth(3) - .setAnchorDateYear(2000) -val dateDf = dateNormalizerModel.transform(chunksDF) - -dateDf.select("chunk_date.result","text").show() -+-------------+-------------+ -| result|original_date| -+-------------+-------------+ -| [08/02/2018]| 08/02/2018| -| [11/2018]| 11/2018| -| [11/01/2018]| 11/01/2018| -|[next monday]| next monday| -| [today]| today| -| [next week]| next week| -+-------------+-------------+ +val result = pipeline.fit(df).transform(df) + + ++---------------+-------------+--------+ +|normalized_date|original_date|metadata| ++---------------+-------------+--------+ +| [2018/08/02]| 08/02/2018| true| +| [2018/11/15]| 11/2018| true| +| [2018/11/01]| 11/01/2018| true| +| [2021/03/12]| 12Mar2021| true| +| [2018/01/30]| Jan 30, 2018| true| +| [1999/04/13]| 13.04.1999| true| +| [2020/04/03]| 3April 2020| true| +| [2000/12/11]| next monday| true| +| [2000/12/06]| today| true| +| [2000/12/13]| next week| true| ++---------------+-------------+--------+ + {%- endcapture -%} {%- capture model_api_link -%} @@ -126,6 +358,9 @@ dateDf.select("chunk_date.result","text").show() 
[DateNormalizer](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/normalizer/date_normalizer/index.html#sparknlp_jsl.annotator.normalizer.date_normalizer.DateNormalizer)
{%- endcapture -%}

+{%- capture model_notebook_link -%}
+[Notebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/DateNormalizer.ipynb)
+{%- endcapture -%}

{% include templates/licensed_approach_model_medical_fin_leg_template.md
title=title
@@ -135,6 +370,11 @@ model_input_anno=model_input_anno
model_output_anno=model_output_anno
model_python_medical=model_python_medical
model_scala_medical=model_scala_medical
+model_python_legal=model_python_legal
+model_scala_legal=model_scala_legal
+model_python_finance=model_python_finance
+model_scala_finance=model_scala_finance
model_api_link=model_api_link
model_python_api_link=model_python_api_link
+model_notebook_link=model_notebook_link
%}
diff --git a/docs/en/licensed_annotator_entries/Doc2ChunkInternal.md b/docs/en/licensed_annotator_entries/Doc2ChunkInternal.md
index 4ed19345f3..f25b10f057 100644
--- a/docs/en/licensed_annotator_entries/Doc2ChunkInternal.md
+++ b/docs/en/licensed_annotator_entries/Doc2ChunkInternal.md
@@ -10,6 +10,14 @@ model
Converts `DOCUMENT`, `TOKEN` typed annotations into `CHUNK` type with the contents of a `chunkCol`.
Chunk text must be contained within input `DOCUMENT`. May be either `StringType` or `ArrayType[StringType]`
(using `setIsArray`). Useful for annotators that require a CHUNK type input.

+Parameters:
+
+- `inputCols`: The names of the columns containing the input annotations. It can read either a String column or an Array.
+- `outputCol`: The name of the output column of CHUNK type that is generated. We can specify only one column here.
+
+
+All the parameters can be set using the corresponding set method in camel case. For example, `.setInputCols()`.
+
 For more extended examples on document pre-processing see the [Spark NLP Workshop](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb).
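A quick sketch of the single-string case (assuming a plain `target` column rather than the array column used in the full examples below, and an existing `spark` session): the same stages can be wired with `setIsArray(False)` so that each row contributes exactly one chunk.

from johnsnowlabs import nlp, medical

documentAssembler = nlp.DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

tokenizer = nlp.Tokenizer()\
    .setInputCols("document")\
    .setOutputCol("token")

# "target" holds one string per row, so setIsArray stays False
chunkAssembler = medical.Doc2ChunkInternal()\
    .setInputCols("document", "token")\
    .setChunkCol("target")\
    .setOutputCol("chunk")\
    .setIsArray(False)

pipeline = nlp.Pipeline().setStages([documentAssembler, tokenizer, chunkAssembler])

data = spark.createDataFrame(
    [["Spark NLP is an open-source text processing library.", "Spark NLP"]]
).toDF("text", "target")

result = pipeline.fit(data).transform(data)
result.selectExpr("chunk.result").show(truncate=False)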
@@ -25,22 +33,100 @@ CHUNK {%- capture model_python_medical -%} -import sparknlp -from sparknlp.base import * -from sparknlp.common import * -from sparknlp.annotator import * -from sparknlp.training import * -from pyspark.ml import Pipeline - -documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document") -tokenizer = Tokenizer().setInputCol("document").setOutputCol("token") -chunkAssembler = ( - Doc2ChunkInternal() - .setInputCols("document", "token") +from johnsnowlabs import nlp, medical + +documentAssembler = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer()\ + .setInputCols("document")\ + .setOutputCol("token") + +chunkAssembler = medical.Doc2ChunkInternal()\ + .setInputCols("document", "token")\ + .setChunkCol("target")\ + .setOutputCol("chunk")\ + .setIsArray(True) + +pipeline = nlp.Pipeline().setStages([documentAssembler, tokenizer, chunkAssembler]) + +data = spark.createDataFrame( + [ + [ + "Spark NLP is an open-source text processing library for advanced natural language processing.", + ["Spark NLP", "text processing library", "natural language processing"], + ] + ] +).toDF("text", "target") + + +result = pipeline.fit(data).transform(data) +result.selectExpr("chunk.result", "chunk.annotatorType").show(truncate=False) + ++-----------------------------------------------------------------+---------------------+ +|result |annotatorType | ++-----------------------------------------------------------------+---------------------+ +|[Spark NLP, text processing library, natural language processing]|[chunk, chunk, chunk]| ++-----------------------------------------------------------------+---------------------+ + +{%- endcapture -%} + +{%- capture model_scala_medical -%} +import spark.implicits._ + +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val chunkAssembler = new Doc2ChunkInternal() + .setInputCols(Array("document", "token")) .setChunkCol("target") .setOutputCol("chunk") + .setIsArray(true) + +val pipeline = new Pipeline().setStages(Array( + documentAssembler, + tokenizer, + chunkAssembler +)) + +val data = Seq(("Spark NLP is an open-source text processing library for advanced natural language processing.", + "Spark NLP", "text processing library", "natural language processing")).toDF("text", "target") + +val result = pipeline.fit(data).transform(data) + ++-----------------------------------------------------------------+---------------------+ +|result |annotatorType | ++-----------------------------------------------------------------+---------------------+ +|[Spark NLP, text processing library, natural language processing]|[chunk, chunk, chunk]| ++-----------------------------------------------------------------+---------------------+ + +{%- endcapture -%} + +{%- capture model_python_legal -%} + +from johnsnowlabs import nlp, legal + +documentAssembler = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer()\ + .setInputCols("document")\ + .setOutputCol("token") + +chunkAssembler = legal.Doc2ChunkInternal()\ + .setInputCols("document", "token")\ + .setChunkCol("target")\ + .setOutputCol("chunk")\ .setIsArray(True) -) + +pipeline = nlp.Pipeline().setStages([documentAssembler, tokenizer, chunkAssembler]) data = spark.createDataFrame( [ @@ -51,12 +137,10 @@ data = spark.createDataFrame( ] ).toDF("text", "target") 
-pipeline = ( - Pipeline().setStages([documentAssembler, tokenizer, chunkAssembler]).fit(data) -) -result = pipeline.transform(data) +result = pipeline.fit(data).transform(data) result.selectExpr("chunk.result", "chunk.annotatorType").show(truncate=False) + +-----------------------------------------------------------------+---------------------+ |result |annotatorType | +-----------------------------------------------------------------+---------------------+ @@ -65,6 +149,117 @@ result.selectExpr("chunk.result", "chunk.annotatorType").show(truncate=False) {%- endcapture -%} +{%- capture model_scala_legal -%} +import spark.implicits._ + +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val chunkAssembler = new Doc2ChunkInternal() + .setInputCols(Array("document", "token")) + .setChunkCol("target") + .setOutputCol("chunk") + .setIsArray(true) + +val pipeline = new Pipeline().setStages(Array( + documentAssembler, + tokenizer, + chunkAssembler +)) + +val data = Seq(("Spark NLP is an open-source text processing library for advanced natural language processing.", + "Spark NLP", "text processing library", "natural language processing")).toDF("text", "target") + +val result = pipeline.fit(data).transform(data) + ++-----------------------------------------------------------------+---------------------+ +|result |annotatorType | ++-----------------------------------------------------------------+---------------------+ +|[Spark NLP, text processing library, natural language processing]|[chunk, chunk, chunk]| ++-----------------------------------------------------------------+---------------------+ + +{%- endcapture -%} + +{%- capture model_python_finance -%} +from johnsnowlabs import nlp, finance + +documentAssembler = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer()\ + .setInputCols("document")\ + .setOutputCol("token") + +chunkAssembler = finance.Doc2ChunkInternal()\ + .setInputCols("document", "token")\ + .setChunkCol("target")\ + .setOutputCol("chunk")\ + .setIsArray(True) + +pipeline = nlp.Pipeline().setStages([documentAssembler, tokenizer, chunkAssembler]) + +data = spark.createDataFrame( + [ + [ + "Spark NLP is an open-source text processing library for advanced natural language processing.", + ["Spark NLP", "text processing library", "natural language processing"], + ] + ] +).toDF("text", "target") + + +result = pipeline.fit(data).transform(data) +result.selectExpr("chunk.result", "chunk.annotatorType").show(truncate=False) + ++-----------------------------------------------------------------+---------------------+ +|result |annotatorType | ++-----------------------------------------------------------------+---------------------+ +|[Spark NLP, text processing library, natural language processing]|[chunk, chunk, chunk]| ++-----------------------------------------------------------------+---------------------+ + +{%- endcapture -%} + +{%- capture model_scala_finance -%} +import spark.implicits._ + +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val chunkAssembler = new Doc2ChunkInternal() + .setInputCols(Array("document", "token")) + .setChunkCol("target") + .setOutputCol("chunk") + .setIsArray(true) + +val pipeline = new Pipeline().setStages(Array( + 
documentAssembler,
+  tokenizer,
+  chunkAssembler
+))
+
+val data = Seq(("Spark NLP is an open-source text processing library for advanced natural language processing.",
+  "Spark NLP", "text processing library", "natural language processing")).toDF("text", "target")
+
+val result = pipeline.fit(data).transform(data)
+
++-----------------------------------------------------------------+---------------------+
+|result                                                           |annotatorType        |
++-----------------------------------------------------------------+---------------------+
+|[Spark NLP, text processing library, natural language processing]|[chunk, chunk, chunk]|
++-----------------------------------------------------------------+---------------------+
+
+{%- endcapture -%}

{%- capture model_api_link -%}
[Doc2ChunkInternal](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/annotator/Doc2ChunkInternal.html)
@@ -74,6 +269,9 @@ result.selectExpr("chunk.result", "chunk.annotatorType").show(truncate=False)
[Doc2ChunkInternal](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/doc2_chunk_internal/index.html#sparknlp_jsl.annotator.doc2_chunk_internal.Doc2ChunkInternal)
{%- endcapture -%}

+{%- capture model_notebook_link -%}
+[Notebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/Doc2ChunkInternal.ipynb)
+{%- endcapture -%}

{% include templates/licensed_approach_model_medical_fin_leg_template.md
title=title
@@ -82,6 +280,12 @@ model_description=model_description
model_input_anno=model_input_anno
model_output_anno=model_output_anno
model_python_medical=model_python_medical
+model_scala_medical=model_scala_medical
+model_python_legal=model_python_legal
+model_scala_legal=model_scala_legal
+model_python_finance=model_python_finance
+model_scala_finance=model_scala_finance
model_api_link=model_api_link
model_python_api_link=model_python_api_link
+model_notebook_link=model_notebook_link
%}
diff --git a/docs/en/licensed_annotator_entries/DocumentHashCoder.md b/docs/en/licensed_annotator_entries/DocumentHashCoder.md
index e01ada4a6d..e939682db2 100644
--- a/docs/en/licensed_annotator_entries/DocumentHashCoder.md
+++ b/docs/en/licensed_annotator_entries/DocumentHashCoder.md
@@ -12,6 +12,18 @@ This annotator can replace dates in a column of `DOCUMENT` type according with t
 If the specified column contains strings that can be parsed to integers, use those numbers
 to make the shift in the data accordingly.

+Parameters:
+
+- `setPatientIdColumn` *(String)*: Sets the name of the column containing the patient ID.
+
+- `setDateShiftColumn` *(String)*: Sets the column to be used for the hash or a predefined shift (see the short sketch after this list).
+
+- `setNewDateShift` *(String)*: Sets the column that has a reference of where the chunk begins.
+
+- `setRangeDays` *(int)*: Sets the range of days to be sampled from.
+
+- `setSeed` *(int)*: Sets the seed for the random number generator.
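For instance, a minimal sketch (assuming the same `dateshift` column present in the example data below) of driving the shift from a predefined column instead of hashing the patient ID:

documentHasher = medical.DocumentHashCoder()\
    .setInputCols("document")\
    .setOutputCol("document2")\
    .setDateShiftColumn("dateshift")  # each row shifts by the number of days in "dateshift"

The downstream `DeIdentification` stage then applies that many days of shift when `setUseShifDays(True)` is set, as in the pipelines below.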
+
 {%- endcapture -%}
 
 {%- capture model_input_anno -%}
@@ -23,9 +35,178 @@ DOCUMENT
 {%- endcapture -%}
 
 {%- capture model_python_medical -%}
+from johnsnowlabs import nlp, medical
+import pandas as pd
+
+data = pd.DataFrame(
+    {'patientID' : ['A001', 'A001',
+                    'A003', 'A003'],
+     'text' : ['Chris Brown was discharged on 10/02/2022',
+               'Mark White was discharged on 10/04/2022',
+               'John was discharged on 15/03/2022',
+               'John Moore was discharged on 15/12/2022'
+              ],
+     'dateshift' : ['10', '10',
+                    '30', '30']
+    }
+)
+
+my_input_df = spark.createDataFrame(data)
+
+documentAssembler = nlp.DocumentAssembler()\
+    .setInputCol("text")\
+    .setOutputCol("document")
+
+documentHasher = medical.DocumentHashCoder()\
+    .setInputCols("document")\
+    .setOutputCol("document2")\
+    .setPatientIdColumn("patientID")\
+    .setNewDateShift("shift_days")
+
+tokenizer = nlp.Tokenizer()\
+    .setInputCols(["document2"])\
+    .setOutputCol("token")
+
+embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
+    .setInputCols(["document2", "token"])\
+    .setOutputCol("word_embeddings")
+
+clinical_ner = medical.NerModel\
+    .pretrained("ner_deid_subentity_augmented", "en", "clinical/models")\
+    .setInputCols(["document2","token", "word_embeddings"])\
+    .setOutputCol("ner")
+
+ner_converter = medical.NerConverterInternal()\
+    .setInputCols(["document2", "token", "ner"])\
+    .setOutputCol("ner_chunk")
+
+de_identification = medical.DeIdentification() \
+    .setInputCols(["ner_chunk", "token", "document2"]) \
+    .setOutputCol("deid_text") \
+    .setMode("obfuscate") \
+    .setObfuscateDate(True) \
+    .setDateTag("DATE") \
+    .setLanguage("en") \
+    .setObfuscateRefSource('faker') \
+    .setUseShifDays(True)\
+    .setRegion('us')
+
+pipeline = nlp.Pipeline().setStages([
+    documentAssembler,
+    documentHasher,
+    tokenizer,
+    embeddings,
+    clinical_ner,
+    ner_converter,
+    de_identification
+])
+
+empty_data = spark.createDataFrame([["", ""]]).toDF("text", "patientID")
+pipeline_model = pipeline.fit(empty_data)
+
+output = pipeline_model.transform(my_input_df)
+output.select('patientID','text', 'deid_text.result').show(truncate = False)
+
++---------+----------------------------------------+---------------------------------------------+
+|patientID|text                                    |result                                       |
++---------+----------------------------------------+---------------------------------------------+
+|A001     |Chris Brown was discharged on 10/02/2022|[Aldona Bar was discharged on 05/18/2022]    |
+|A001     |Mark White was discharged on 02/28/2020 |[Leta Speller was discharged on 10/14/2019]  |
+|A002     |John was discharged on 03/15/2022       |[Lonia Blood was discharged on 01/19/2022]   |
+|A002     |John Moore was discharged on 12/31/2022 |[Murriel Hopper was discharged on 11/06/2022]|
++---------+----------------------------------------+---------------------------------------------+
+
+{%- endcapture -%}
+
+{%- capture model_python_finance -%}
+from johnsnowlabs import nlp, finance
 import pandas as pd
 
+data = pd.DataFrame(
+    {'patientID' : ['A001', 'A001',
+                    'A003', 'A003'],
+     'text' : ['Chris Brown was discharged on 10/02/2022',
+               'Mark White was discharged on 10/04/2022',
+               'John was discharged on 15/03/2022',
+               'John Moore was discharged on 15/12/2022'
+              ],
+     'dateshift' : ['10', '10',
+                    '30', '30']
+    }
+)
+
+my_input_df = spark.createDataFrame(data)
+
+documentAssembler = nlp.DocumentAssembler()\
+    .setInputCol("text")\
+    .setOutputCol("document")
+
+documentHasher = finance.DocumentHashCoder()\
+    .setInputCols("document")\
+    .setOutputCol("document2")\
+    .setPatientIdColumn("patientID")\
+    .setNewDateShift("shift_days")
+
+tokenizer = nlp.Tokenizer()\
+    .setInputCols(["document2"])\
+    .setOutputCol("token")
+
+embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
+    .setInputCols(["document2", "token"])\
+    .setOutputCol("word_embeddings")
+
+clinical_ner = finance.NerModel\
+    .pretrained("ner_deid_subentity_augmented", "en", "clinical/models")\
+    .setInputCols(["document2","token", "word_embeddings"])\
+    .setOutputCol("ner")
+
+ner_converter = finance.NerConverterInternal()\
+    .setInputCols(["document2", "token", "ner"])\
+    .setOutputCol("ner_chunk")
+
+de_identification = finance.DeIdentification() \
+    .setInputCols(["ner_chunk", "token", "document2"]) \
+    .setOutputCol("deid_text") \
+    .setMode("obfuscate") \
+    .setObfuscateDate(True) \
+    .setDateTag("DATE") \
+    .setLanguage("en") \
+    .setObfuscateRefSource('faker') \
+    .setUseShifDays(True)\
+    .setRegion('us')
+
+pipeline = nlp.Pipeline().setStages([
+    documentAssembler,
+    documentHasher,
+    tokenizer,
+    embeddings,
+    clinical_ner,
+    ner_converter,
+    de_identification
+
+])
+
+empty_data = spark.createDataFrame([["", ""]]).toDF("text", "patientID")
+pipeline_model = pipeline.fit(empty_data)
+
+output = pipeline_model.transform(my_input_df)
+output.select('patientID','text', 'deid_text.result').show(truncate = False)
+
++---------+----------------------------------------+----------------------------------------------+
+|patientID|text                                    |result                                        |
++---------+----------------------------------------+----------------------------------------------+
+|A001     |Chris Brown was discharged on 10/02/2022|[Andreas Newport was discharged on 04/09/2022]|
+|A001     |Mark White was discharged on 02/28/2020 |[Kara Dies was discharged on 09/05/2019]      |
+|A002     |John was discharged on 03/15/2022       |[Lane Hacker was discharged on 02/17/2022]    |
+|A002     |John Moore was discharged on 12/31/2022 |[Orlena Sheldon was discharged on 12/05/2022] |
++---------+----------------------------------------+----------------------------------------------+
+
+{%- endcapture -%}
+
+{%- capture model_python_legal -%}
+from johnsnowlabs import nlp, legal
+import pandas as pd
 
 data = pd.DataFrame(
     {'patientID' : ['A001', 'A001',
@@ -42,33 +223,34 @@ data = pd.DataFrame(
 
 my_input_df = spark.createDataFrame(data)
 
-documentAssembler = DocumentAssembler()\
+documentAssembler = nlp.DocumentAssembler()\
     .setInputCol("text")\
     .setOutputCol("document")
 
-documentHasher = DocumentHashCoder()\
+documentHasher = legal.DocumentHashCoder()\
     .setInputCols("document")\
     .setOutputCol("document2")\
-    .setDateShiftColumn("dateshift")
+    .setPatientIdColumn("patientID")\
+    .setNewDateShift("shift_days")
 
-tokenizer = Tokenizer()\
+tokenizer = nlp.Tokenizer()\
     .setInputCols(["document2"])\
     .setOutputCol("token")
 
-embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
+embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
     .setInputCols(["document2", "token"])\
     .setOutputCol("word_embeddings")
 
-clinical_ner = MedicalNerModel\
+clinical_ner = legal.NerModel\
     .pretrained("ner_deid_subentity_augmented", "en", "clinical/models")\
     .setInputCols(["document2","token", "word_embeddings"])\
     .setOutputCol("ner")
 
-ner_converter = NerConverter()\
+ner_converter = legal.NerConverterInternal()\
     .setInputCols(["document2", "token", "ner"])\
     .setOutputCol("ner_chunk")
 
-de_identification = DeIdentification() \
+de_identification = legal.DeIdentification() \
     .setInputCols(["ner_chunk", "token", "document2"]) \
     .setOutputCol("deid_text") \
     .setMode("obfuscate") \
@@ -76,9 +258,10 @@ de_identification = DeIdentification() \
     .setDateTag("DATE") \
     .setLanguage("en") \
     .setObfuscateRefSource('faker') \
-    .setUseShifDays(True)
+    .setUseShifDays(True)\
+    .setRegion('us')
 
-pipeline_col = Pipeline().setStages([
+pipeline = nlp.Pipeline().setStages([
     documentAssembler,
     documentHasher,
     tokenizer,
@@ -86,22 +269,261 @@ pipeline_col = Pipeline().setStages([
     clinical_ner,
     ner_converter,
     de_identification
+
 ])
 
-empty_data = spark.createDataFrame([["", "", ""]]).toDF("patientID","text", "dateshift")
-pipeline_col_model = pipeline_col.fit(empty_data)
+empty_data = spark.createDataFrame([["", ""]]).toDF("text", "patientID")
+pipeline_model = pipeline.fit(empty_data)
+
+output = pipeline_model.transform(my_input_df)
+output.select('patientID','text', 'deid_text.result').show(truncate = False)
+
++---------+----------------------------------------+----------------------------------------------+
+|patientID|text                                    |result                                        |
++---------+----------------------------------------+----------------------------------------------+
+|A001     |Chris Brown was discharged on 10/02/2022|[Andreas Newport was discharged on 04/09/2022]|
+|A001     |Mark White was discharged on 02/28/2020 |[Kara Dies was discharged on 09/05/2019]      |
+|A002     |John was discharged on 03/15/2022       |[Lane Hacker was discharged on 02/17/2022]    |
+|A002     |John Moore was discharged on 12/31/2022 |[Orlena Sheldon was discharged on 12/05/2022] |
++---------+----------------------------------------+----------------------------------------------+
+
+{%- endcapture -%}
+
+{%- capture model_scala_medical -%}
+
+import spark.implicits._
+
+val data = Seq(
+  ("A001", "Chris Brown was discharged on 10/02/2022"),
+  ("A001", "Mark White was discharged on 02/28/2020"),
+  ("A002", "John was discharged on 03/15/2022"),
+  ("A002", "John Moore was discharged on 12/31/2022")
+)
+
+val columns = Seq("patientID", "text")
+val myInputDF = spark.createDataFrame(data).toDF(columns: _*)
+
+val documentAssembler = new DocumentAssembler()
+  .setInputCol("text")
+  .setOutputCol("document")
+
+val documentHasher = new DocumentHashCoder()
+  .setInputCols("document")
+  .setOutputCol("document2")
+  .setPatientIdColumn("patientID")
+  .setNewDateShift("shift_days")
-output = pipeline_col_model.transform(my_input_df)
-output.select('text', 'dateshift', 'deid_text.result').show(truncate = False)
+
+val tokenizer = new Tokenizer()
+  .setInputCols("document2")
+  .setOutputCol("token")
-+----------------------------------------+---------+----------------------------------------------+
-text                                    |dateshift|result                                        |
-+----------------------------------------+---------+----------------------------------------------+
-Chris Brown was discharged on 10/02/2022|10       |[Ellender Manual was discharged on 20/02/2022]|
-Mark White was discharged on 10/04/2022 |10       |[Errol Bang was discharged on 20/04/2022]     |
-John was discharged on 15/03/2022       |30       |[Ariel Null was discharged on 14/04/2022]     |
-John Moore was discharged on 15/12/2022 |30       |[Jean Cotton was discharged on 14/01/2023]    |
-+----------------------------------------+---------+----------------------------------------------+
+
+val embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models")
+  .setInputCols(Array("document2","token"))
+  .setOutputCol("word_embeddings")
+
+val clinical_ner = MedicalNerModel.pretrained("ner_deid_subentity_augmented","en","clinical/models")
+  .setInputCols(Array("document2","token","word_embeddings"))
+  .setOutputCol("ner")
+
+val ner_converter = new NerConverterInternal()
+  .setInputCols(Array("document2","token","ner"))
+  .setOutputCol("ner_chunk")
+
+val de_identification = new DeIdentification()
+  .setInputCols(Array("ner_chunk","token","document2"))
+  .setOutputCol("deid_text")
+  .setMode("obfuscate")
+  .setObfuscateDate(true)
+  .setDateTag("DATE")
+  .setLanguage("en")
+  .setObfuscateRefSource("faker")
+  .setUseShifDays(true)
+  .setRegion("us")
+
+val pipeline = new Pipeline().setStages(Array(
+  documentAssembler,
+  documentHasher,
+  tokenizer,
+  embeddings,
+  clinical_ner,
+  ner_converter,
+  de_identification
+))
+
+val emptyData = Seq(("", "")).toDF("text", "patientID")
+
+val pipelineModel = pipeline.fit(emptyData)
+val result = pipelineModel.transform(myInputDF)
+
++---------+----------------------------------------+----------------------------------------------+
+|patientID|text                                    |result                                        |
++---------+----------------------------------------+----------------------------------------------+
+|A001     |Chris Brown was discharged on 10/02/2022|[Andreas Newport was discharged on 04/09/2022]|
+|A001     |Mark White was discharged on 02/28/2020 |[Kara Dies was discharged on 09/05/2019]      |
+|A002     |John was discharged on 03/15/2022       |[Lane Hacker was discharged on 02/17/2022]    |
+|A002     |John Moore was discharged on 12/31/2022 |[Orlena Sheldon was discharged on 12/05/2022] |
++---------+----------------------------------------+----------------------------------------------+
+
+{%- endcapture -%}
+
+{%- capture model_scala_finance -%}
+
+import spark.implicits._
+
+val data = Seq(
+  ("A001", "Chris Brown was discharged on 10/02/2022"),
+  ("A001", "Mark White was discharged on 02/28/2020"),
+  ("A002", "John was discharged on 03/15/2022"),
+  ("A002", "John Moore was discharged on 12/31/2022")
+)
+
+val columns = Seq("patientID", "text")
+val myInputDF = spark.createDataFrame(data).toDF(columns: _*)
+
+val documentAssembler = new DocumentAssembler()
+  .setInputCol("text")
+  .setOutputCol("document")
+
+val documentHasher = new DocumentHashCoder()
+  .setInputCols("document")
+  .setOutputCol("document2")
+  .setPatientIdColumn("patientID")
+  .setNewDateShift("shift_days")
+
+val tokenizer = new Tokenizer()
+  .setInputCols("document2")
+  .setOutputCol("token")
+
+val embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models")
+  .setInputCols(Array("document2","token"))
+  .setOutputCol("word_embeddings")
+
+val clinical_ner = FinanceNerModel.pretrained("ner_deid_subentity_augmented","en","clinical/models")
+  .setInputCols(Array("document2","token","word_embeddings"))
+  .setOutputCol("ner")
+
+val ner_converter = new NerConverterInternal()
+  .setInputCols(Array("document2","token","ner"))
+  .setOutputCol("ner_chunk")
+
+val de_identification = new DeIdentification()
+  .setInputCols(Array("ner_chunk","token","document2"))
+  .setOutputCol("deid_text")
+  .setMode("obfuscate")
+  .setObfuscateDate(true)
+  .setDateTag("DATE")
+  .setLanguage("en")
+  .setObfuscateRefSource("faker")
+  .setUseShifDays(true)
+  .setRegion("us")
+
+val pipeline = new Pipeline().setStages(Array(
+  documentAssembler,
+  documentHasher,
+  tokenizer,
+  embeddings,
+  clinical_ner,
+  ner_converter,
+  de_identification
+))
+
+val emptyData = Seq(("", "")).toDF("text", "patientID")
+val pipelineModel = pipeline.fit(emptyData)
+val result = pipelineModel.transform(myInputDF)
+
++---------+----------------------------------------+----------------------------------------------+
+|patientID|text                                    |result                                        |
++---------+----------------------------------------+----------------------------------------------+
+|A001     |Chris Brown was discharged on 10/02/2022|[Andreas Newport was discharged on 04/09/2022]|
+|A001     |Mark White was discharged on 02/28/2020 |[Kara Dies was discharged on 09/05/2019]      |
+|A002     |John was discharged on 03/15/2022       |[Lane Hacker was discharged on 02/17/2022]    |
+|A002     |John Moore was discharged on 12/31/2022 |[Orlena Sheldon was discharged on 12/05/2022] |
++---------+----------------------------------------+----------------------------------------------+
+
+{%- endcapture -%}
+
+{%- capture model_scala_legal -%}
+
+import spark.implicits._
+
+val data = Seq(
+  ("A001", "Chris Brown was discharged on 10/02/2022"),
+  ("A001", "Mark White was discharged on 02/28/2020"),
+  ("A002", "John was discharged on 03/15/2022"),
+  ("A002", "John Moore was discharged on 12/31/2022")
+)
+
+val columns = Seq("patientID", "text")
+val myInputDF = spark.createDataFrame(data).toDF(columns: _*)
+
+val documentAssembler = new DocumentAssembler()
+  .setInputCol("text")
+  .setOutputCol("document")
+
+val documentHasher = new DocumentHashCoder()
+  .setInputCols("document")
+  .setOutputCol("document2")
+  .setPatientIdColumn("patientID")
+  .setNewDateShift("shift_days")
+
+val tokenizer = new Tokenizer()
+  .setInputCols("document2")
+  .setOutputCol("token")
+
+val embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models")
+  .setInputCols(Array("document2","token"))
+  .setOutputCol("word_embeddings")
+
+val clinical_ner = LegalNerModel.pretrained("ner_deid_subentity_augmented","en","clinical/models")
+  .setInputCols(Array("document2","token","word_embeddings"))
+  .setOutputCol("ner")
+
+val ner_converter = new NerConverterInternal()
+  .setInputCols(Array("document2","token","ner"))
+  .setOutputCol("ner_chunk")
+
+val de_identification = new DeIdentification()
+  .setInputCols(Array("ner_chunk","token","document2"))
+  .setOutputCol("deid_text")
+  .setMode("obfuscate")
+  .setObfuscateDate(true)
+  .setDateTag("DATE")
+  .setLanguage("en")
+  .setObfuscateRefSource("faker")
+  .setUseShifDays(true)
+  .setRegion("us")
+
+val pipeline = new Pipeline().setStages(Array(
+  documentAssembler,
+  documentHasher,
+  tokenizer,
+  embeddings,
+  clinical_ner,
+  ner_converter,
+  de_identification
+))
+
+val emptyData = Seq(("", "")).toDF("text", "patientID")
+
+val pipelineModel = pipeline.fit(emptyData)
+val result = pipelineModel.transform(myInputDF)
+
++---------+----------------------------------------+----------------------------------------------+
+|patientID|text                                    |result                                        |
++---------+----------------------------------------+----------------------------------------------+
+|A001     |Chris Brown was discharged on 10/02/2022|[Andreas Newport was discharged on 04/09/2022]|
+|A001     |Mark White was discharged on 02/28/2020 |[Kara Dies was discharged on 09/05/2019]      |
+|A002     |John was discharged on 03/15/2022       |[Lane Hacker was discharged on 02/17/2022]    |
+|A002     |John Moore was discharged on 12/31/2022 |[Orlena Sheldon was discharged on 12/05/2022] |
++---------+----------------------------------------+----------------------------------------------+
 
 {%- endcapture -%}
@@ -113,6 +535,10 @@ John Moore was discharged on 15/12/2022 |30      |[Jean Cotton was discharged o
 [DocumentHashCoder](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/deid/doccument_hashcoder/index.html#sparknlp_jsl.annotator.deid.doccument_hashcoder.DocumentHashCoder)
 {%- endcapture -%}
 
+{%- capture model_notebook_link -%}
+[Notebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/DocumentHashCoder.ipynb)
+{%- endcapture -%}
+
 {% include templates/licensed_approach_model_medical_fin_leg_template.md
 title=title
 model=model
@@ -120,6 +546,12 @@ model_description=model_description
 model_input_anno=model_input_anno
 model_output_anno=model_output_anno
 model_python_medical=model_python_medical
+model_python_finance=model_python_finance
+model_python_legal=model_python_legal
+model_scala_medical=model_scala_medical
+model_scala_finance=model_scala_finance
+model_scala_legal=model_scala_legal
 model_api_link=model_api_link
 model_python_api_link=model_python_api_link
+model_notebook_link=model_notebook_link
 %}