Source code for sparknlp_jsl.utils.ocr_nlp_processor

-from sparknlp_jsl.utils.ocr_utils import __ocr_pipeline, __colored_box, __highlighted_box, __black_box
+from sparknlp_jsl.utils.ocr_utils import __ocr_pipeline, __colored_box, __highlighted_box, __bounding_box
 from pyspark.sql import SparkSession
 from typing import Optional, List, IO
 from pyspark.ml import PipelineModel
@@ -336,8 +336,8 @@ 

Source code for sparknlp_jsl.utils.ocr_nlp_processor

[docs]def ocr_entity_processor(spark: SparkSession, file_path: str, ner_pipeline: PipelineModel, style: str = "bounding_box", save_dir: str = "save_folder", label: bool = False, - label_color: str = "red", color_chart_path: str = "color_chart.png", - chunk_col: str = "ner_chunk", + label_color: str = "red", box_color: tuple = (0,0,0), + color_chart_path: str = "color_chart.png", chunk_col: str = "ner_chunk", black_list: Optional[List[str]] = [], display_result: bool = False, resolution:int=200, confidenceThreshold:int=70, @@ -360,8 +360,8 @@

Source code for sparknlp_jsl.utils.ocr_nlp_processor

:param black_list: List of NER labels that will be painted over in 'highlight' and 'bounding_box' styles :type black_list: list - :param style: PDF file process style that has 3 options; 'black_band': Black bands over the chunks detected - by NER pipeline. 'bounding_box': Colorful bounding boxes around the chunks detected by NER pipeline. Each + :param style: PDF file process style that has 3 options; 'colored_box': Draws bands with a single color over the chunks detected + by NER pipeline (default is black). 'bounding_box': Colorful bounding boxes around the chunks detected by NER pipeline. Each color represents a different NER label. 'highlight': Colorful highlights over the chunks detected by NER pipeline. Each color represents a different NER label. :type style: str @@ -374,6 +374,9 @@

Source code for sparknlp_jsl.utils.ocr_nlp_processor

:param label_color: Color of NER labels if 'label=True' , defaults to "red" :type label_color: str + :param box_color: RGB tuple for the box color when style is 'colored_box', defaults to (0, 0, 0) (black) + :type box_color: tuple + :param color_chart_path: File name of color chart in PNG format that shows the colors of NER labels in the processed file, defaults to "color_chart.png" :type color_chart_path: str @@ -396,7 +399,7 @@

Source code for sparknlp_jsl.utils.ocr_nlp_processor

if label and label_color not in colors: raise Exception(f"{label_color} is not a valid color. Please pick one:{colors}") else: - __colored_box(result, file_name, style, chunk_col, black_list, save_dir, label, label_color, + __bounding_box(result, file_name, style, chunk_col, black_list, save_dir, label, label_color, color_chart_path, display_result) elif style == "highlight": @@ -407,11 +410,11 @@

Source code for sparknlp_jsl.utils.ocr_nlp_processor

label_color, color_chart_path, display_result) - elif style == "black_band": + elif style == "colored_box": if label and label_color not in colors: raise Exception(f"{label_color} is not a valid color. Please pick one:{colors}") else: - __black_box(result, file_name, style, chunk_col, save_dir, label, label_color, display_result) + __colored_box(result, file_name, style, chunk_col, save_dir, box_color, label, label_color, display_result) else: raise Exception(style, diff --git a/docs/licensed/api/python/modules/sparknlp_jsl/utils/ocr_utils.html b/docs/licensed/api/python/modules/sparknlp_jsl/utils/ocr_utils.html index f464b04166..df53a7bf49 100644 --- a/docs/licensed/api/python/modules/sparknlp_jsl/utils/ocr_utils.html +++ b/docs/licensed/api/python/modules/sparknlp_jsl/utils/ocr_utils.html @@ -559,11 +559,11 @@

Source code for sparknlp_jsl.utils.ocr_utils

print(f"File saved to {save_dir}/{file_name}_{style}.pdf successfully.")
 
 
-def __draw_black_box(img_pil_deid, coord_df, label, label_color):
-    overlay = Image.new('RGBA', img_pil_deid.size, (0, 0, 0, 0))  # black
+def __draw_colored_box(img_pil_deid, coord_df, box_color, label, label_color):
+    overlay = Image.new('RGBA', img_pil_deid.size, (0, 0, 0, 0))  # transparent
     draw = ImageDraw.Draw(overlay)  # Create a context for drawing things on it.
     for i, row in coord_df.iterrows():
-        draw.rectangle((row['coord'][:2], row['coord'][2:]), fill=(0, 0, 0, 255), outline=(0, 0, 0, 255), width=5)
+        draw.rectangle((row['coord'][:2], row['coord'][2:]), fill=box_color, outline=box_color, width=5)
 
         if label:
             label_position = tuple([row['coord'][0], row['coord'][1] - 10])
@@ -577,10 +577,10 @@ 

Source code for sparknlp_jsl.utils.ocr_utils

return img
 
-
-def __black_box(result, file_name, style, chunk_col, save_dir="Black_Box", label=False, label_color="black",
-                display_result=False):
-    print("Drawing black box...")
+def __colored_box(result, file_name, style, chunk_col, save_dir="Colored_Box", box_color=(0,0,0), label=False, label_color="black",
+                  display_result=False):
+    print("Drawing colored box...")
+    box_color += (255,)
     image_list = []  # append highlighted images
     res_pd = result.selectExpr("pagenum", f"{chunk_col}.begin", f"{chunk_col}.end", f"{chunk_col}.result",
                                f"{chunk_col}.metadata").toPandas()
@@ -604,7 +604,7 @@ 

Source code for sparknlp_jsl.utils.ocr_utils

img_deid = result.select('image_raw').collect()[row][0]
         img_pil_deid = to_pil_image(img_deid, img_deid.mode)
         img_pil_deid = img_pil_deid.convert("RGBA")
-        image_list.append(__draw_black_box(img_pil_deid, coord_df, label, label_color))
+        image_list.append(__draw_colored_box(img_pil_deid, coord_df, box_color, label, label_color))
 
     __save_file(image_list, file_name, style, save_dir)
 
@@ -714,11 +714,11 @@ 

Source code for sparknlp_jsl.utils.ocr_utils

try:
             if pred_label in black_list:
-                draw.rectangle((row['coord'][:2], row['coord'][2:]), outline=label2color[pred_label], width=5,
+                draw.rectangle((row['coord'][:2], row['coord'][2:]), outline=label2color[pred_label], width=2,
                                fill=label2color[pred_label])
 
             else:
-                draw.rectangle((row['coord'][:2], row['coord'][2:]), outline=label2color[pred_label], width=5)
+                draw.rectangle((row['coord'][:2], row['coord'][2:]), outline=label2color[pred_label], width=2)
         except:
             print("Error in this row:", row)
             continue
@@ -732,11 +732,11 @@ 

Source code for sparknlp_jsl.utils.ocr_utils

return img_pil_deid
 
 
-def __colored_box(result, file_name, style, chunk_col, black_list, save_dir="Colored_Box", label=False,
+def __bounding_box(result, file_name, style, chunk_col, black_list, save_dir="Bounding_Box", label=False,
                   label_color="red",
                   color_chart_path="color_chart.png", display_result=False):
     global label2color, colors
-    print("Drawing colored box...")
+    print("Drawing bounding box...")
     image_list = []  # append images
     black_list = list(set([i.lower() for i in black_list]))  # lowercase black_list
     res_pd = result.selectExpr("pagenum", f"{chunk_col}.begin", f"{chunk_col}.end", f"{chunk_col}.result",
diff --git a/docs/licensed/api/python/objects.inv b/docs/licensed/api/python/objects.inv
index b5b03c6afb..eddbf45c38 100644
Binary files a/docs/licensed/api/python/objects.inv and b/docs/licensed/api/python/objects.inv differ
diff --git a/docs/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/context/contextual_parser/index.html b/docs/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/context/contextual_parser/index.html
index 7cf00009c9..b2d281da77 100644
--- a/docs/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/context/contextual_parser/index.html
+++ b/docs/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/context/contextual_parser/index.html
@@ -777,6 +777,11 @@ 

ClassesoutputAnnotatorType[source]#
+
+
+name = ContextualParserModel[source]#
+
+
caseSensitive[source]#
@@ -853,6 +858,22 @@

Classes +
+static pretrained(name='date_of_birth_parser', lang='en', remote_loc='clinical/models')[source]#
+

Download a pre-trained ContextualParserModel.

+
+
Args:

name (str): Name of the pre-trained model, by default “date_of_birth_parser” +lang (str): Language of the pre-trained model, by default “en” +remote_loc (str): Remote location of the pre-trained model. If None, use the +open-source location. Other values are “clinical/models”, +“finance/models”, or “legal/models”.

+
+
Returns:

ContextualParserModel: A pre-trained ContextualParserModel.

+
+
+

+ @@ -912,6 +933,7 @@

ClassesContextualParserModel