Source code for sparknlp_jsl.utils.ocr_nlp_processor

-from sparknlp_jsl.utils.ocr_utils import __ocr_pipeline, __colored_box, __highlighted_box, __black_box
+from sparknlp_jsl.utils.ocr_utils import __ocr_pipeline, __colored_box, __highlighted_box, __bounding_box
 from pyspark.sql import SparkSession
 from typing import Optional, List, IO
 from pyspark.ml import PipelineModel
@@ -336,8 +336,8 @@ 

Source code for sparknlp_jsl.utils.ocr_nlp_processor

[docs]def ocr_entity_processor(spark: SparkSession, file_path: str, ner_pipeline: PipelineModel, style: str = "bounding_box", save_dir: str = "save_folder", label: bool = False, - label_color: str = "red", color_chart_path: str = "color_chart.png", - chunk_col: str = "ner_chunk", + label_color: str = "red", box_color: tuple = (0,0,0), + color_chart_path: str = "color_chart.png", chunk_col: str = "ner_chunk", black_list: Optional[List[str]] = [], display_result: bool = False, resolution:int=200, confidenceThreshold:int=70, @@ -360,8 +360,8 @@

Source code for sparknlp_jsl.utils.ocr_nlp_processor

:param black_list: List of NER labels that will be painted over in 'highlight' and 'bounding_box' styles :type black_list: list - :param style: PDF file process style that has 3 options; 'black_band': Black bands over the chunks detected - by NER pipeline. 'bounding_box': Colorful bounding boxes around the chunks detected by NER pipeline. Each + :param style: PDF file process style that has 3 options; 'colored_box': Draws bands with a single color over the chunks detected + by NER pipeline (default is black). 'bounding_box': Colorful bounding boxes around the chunks detected by NER pipeline. Each color represents a different NER label. 'highlight': Colorful highlights over the chunks detected by NER pipeline. Each color represents a different NER label. :type style: str @@ -374,6 +374,9 @@

Source code for sparknlp_jsl.utils.ocr_nlp_processor

:param label_color: Color of NER labels if 'label=True' , defaults to "red" :type label_color: str + :param box_color: RGB code for colored box if 'style=colored_box', defaults to (0, 0, 0) (black) + :type box_color: tuple + :param color_chart_path: File name of color chart in PNG format that shows the colors of NER labels in the processed file, defaults to "color_chart.png" :type color_chart_path: str @@ -396,7 +399,7 @@

Source code for sparknlp_jsl.utils.ocr_nlp_processor

if label and label_color not in colors: raise Exception(f"{label_color} is not a valid color. Please pick one:{colors}") else: - __colored_box(result, file_name, style, chunk_col, black_list, save_dir, label, label_color, + __bounding_box(result, file_name, style, chunk_col, black_list, save_dir, label, label_color, color_chart_path, display_result) elif style == "highlight": @@ -407,11 +410,11 @@

Source code for sparknlp_jsl.utils.ocr_nlp_processor

label_color, color_chart_path, display_result) - elif style == "black_band": + elif style == "colored_box": if label and label_color not in colors: raise Exception(f"{label_color} is not a valid color. Please pick one:{colors}") else: - __black_box(result, file_name, style, chunk_col, save_dir, label, label_color, display_result) + __colored_box(result, file_name, style, chunk_col, save_dir, box_color, label, label_color, display_result) else: raise Exception(style, diff --git a/docs/licensed/api/python/modules/sparknlp_jsl/utils/ocr_utils.html b/docs/licensed/api/python/modules/sparknlp_jsl/utils/ocr_utils.html index f464b04166..df53a7bf49 100644 --- a/docs/licensed/api/python/modules/sparknlp_jsl/utils/ocr_utils.html +++ b/docs/licensed/api/python/modules/sparknlp_jsl/utils/ocr_utils.html @@ -559,11 +559,11 @@

Source code for sparknlp_jsl.utils.ocr_utils

print(f"File saved to {save_dir}/{file_name}_{style}.pdf successfully.")
 
 
-def __draw_black_box(img_pil_deid, coord_df, label, label_color):
-    overlay = Image.new('RGBA', img_pil_deid.size, (0, 0, 0, 0))  # black
+def __draw_colored_box(img_pil_deid, coord_df, box_color, label, label_color):
+    overlay = Image.new('RGBA', img_pil_deid.size, (0, 0, 0, 0))  # transparent
     draw = ImageDraw.Draw(overlay)  # Create a context for drawing things on it.
     for i, row in coord_df.iterrows():
-        draw.rectangle((row['coord'][:2], row['coord'][2:]), fill=(0, 0, 0, 255), outline=(0, 0, 0, 255), width=5)
+        draw.rectangle((row['coord'][:2], row['coord'][2:]), fill=box_color, outline=box_color, width=5)
 
         if label:
             label_position = tuple([row['coord'][0], row['coord'][1] - 10])
@@ -577,10 +577,10 @@ 

Source code for sparknlp_jsl.utils.ocr_utils

return img
 
-
-def __black_box(result, file_name, style, chunk_col, save_dir="Black_Box", label=False, label_color="black",
-                display_result=False):
-    print("Drawing black box...")
+def __colored_box(result, file_name, style, chunk_col, save_dir="Colored_Box", box_color=(0,0,0), label=False, label_color="black",
+                  display_result=False):
+    print("Drawing colored box...")
+    box_color += (255,)
     image_list = []  # append highlighted images
     res_pd = result.selectExpr("pagenum", f"{chunk_col}.begin", f"{chunk_col}.end", f"{chunk_col}.result",
                                f"{chunk_col}.metadata").toPandas()
@@ -604,7 +604,7 @@ 

Source code for sparknlp_jsl.utils.ocr_utils

img_deid = result.select('image_raw').collect()[row][0]
         img_pil_deid = to_pil_image(img_deid, img_deid.mode)
         img_pil_deid = img_pil_deid.convert("RGBA")
-        image_list.append(__draw_black_box(img_pil_deid, coord_df, label, label_color))
+        image_list.append(__draw_colored_box(img_pil_deid, coord_df, box_color, label, label_color))
 
     __save_file(image_list, file_name, style, save_dir)
 
@@ -714,11 +714,11 @@ 

Source code for sparknlp_jsl.utils.ocr_utils

try:
             if pred_label in black_list:
-                draw.rectangle((row['coord'][:2], row['coord'][2:]), outline=label2color[pred_label], width=5,
+                draw.rectangle((row['coord'][:2], row['coord'][2:]), outline=label2color[pred_label], width=2,
                                fill=label2color[pred_label])
 
             else:
-                draw.rectangle((row['coord'][:2], row['coord'][2:]), outline=label2color[pred_label], width=5)
+                draw.rectangle((row['coord'][:2], row['coord'][2:]), outline=label2color[pred_label], width=2)
         except:
             print("Error in this row:", row)
             continue
@@ -732,11 +732,11 @@ 

Source code for sparknlp_jsl.utils.ocr_utils

return img_pil_deid
 
 
-def __colored_box(result, file_name, style, chunk_col, black_list, save_dir="Colored_Box", label=False,
+def __bounding_box(result, file_name, style, chunk_col, black_list, save_dir="Bounding_Box", label=False,
                   label_color="red",
                   color_chart_path="color_chart.png", display_result=False):
     global label2color, colors
-    print("Drawing colored box...")
+    print("Drawing bounding box...")
     image_list = []  # append images
     black_list = list(set([i.lower() for i in black_list]))  # lowercase black_list
     res_pd = result.selectExpr("pagenum", f"{chunk_col}.begin", f"{chunk_col}.end", f"{chunk_col}.result",
diff --git a/docs/licensed/api/python/objects.inv b/docs/licensed/api/python/objects.inv
index b5b03c6afb02668fab58a33baa1440433bdbb060..eddbf45c38e5f203d31c0def8eac68792b7e989b 100644
GIT binary patch
literal 26556
zcmYg%V{|56&~0o?tcfPJZQITh+qP}nwryvUOeS`oaAMoOdB6MJweFAJU42gd>h9IM
zs`hRYA{7Sdn;387w+#YLnmt@d0EBp
zMfLxiz)kcWt6G>6IhooTTH2YIIuY5M5xH1cIun^$+L#i3pN;Kp9h^*^olQ-MT%9fL
z%!#~gERE>lJk~CHfo^-8Sy$fq3WKd(cFA)%o94M&u1D8D4s_dwZMm4;9}qd@giv`X
zl|d5-ZVew+b4MrK
z)gy`Hg(&?jP{_+Kg&lwNCiM6DReK~5dV|~B-R(m7A%k+c($CrdIq&!TVLrBf`0AH2z1$+xc_9RkI7^hK^6
z#K37qzrHu!0pAF{f#Ks%v}D(eKCHC+_?o=fI5FbWPCm>!E$Mafz+EuWu-~;|YXa$(
z3F#M5!SUIyIZX~zjF3PLeE}hOS#RHeHTN$PtDxlkP{@?9@_BfpC0HA!I443+j~fh!
zVWv8|b+>VKlIunz@zV$eRvz{N`k|HDlozAO$Oa962w~G__J@wA>x{ESSM>tZIu+6kgo%qycy_~mEBfuIXU
zL)Cdr;+PlqrT3;k*}t0?Bi+p*rtDQVK;fdIlN7Fe6Wt}$fW&>
zUd_*^w)*yl$EVNh+uJ5LwD1Id^+FiGf{9Vc*5v*1HnQn{#%wqI_3Yy7>iuFv(-&GB
zrdDlhBVkn^u~{8`6pcZr-&4I#!IOKbr@O7=)9?Kl<#ZSLx#Hc?(XXdPeRR35k-aVc
zi7AT#(pQ0P66<(n+>3eka!B)bAaesQy2C6D9}deI%?wS4*E=Q6~2X>NceW{
zY1EF6cf(;AV&orRBSZlVjQvW^p&fgQ+3y#!uhvasS1~b5XOv*^Qs2YXW|UU~XbMV8
zHED%4LLR0-0N|#(P2NJ<+R#PIEo~>~n4>0DZVbGJJR3@;y*R^JKOES{SHrT72iLle
zV8uM)`uxn3Uk>BpWpCCJwAK5!*Oh_!f~}3J=xN4z5X}oRGv$LwkODTh=6P`1J!v~@
zhkCocp(oRT;@Et#N{suoRF5OYKS0HDz}own(e^B+T9CkGovO5?2v7WQjVl;=8VJ@z
zL^Q9t6;7a0(m%PB9^IOimSbN*i`ZL;z%6hWgONt!690dsTm0Ma)6-XyMkFxCA%Sr@
z)U;C5a#(?zAS7)|y+9VI;YD_MXb;iu@m;|SUHNOcfHVn0j}&69VVhIx9j~;dcOc85
z9!7pWp4G^!IWBI@GF@L7CE-ztnVNFLU1K^x;y3#z=b*O$+*ix=jY+yKd!pwm(|+7t
z60Q1%`j&(-c&%dhhypQnV=Bc%(A)C#G6+DJjubp2BDN6ChWW9F~6E&v%@B{G8DQ7#|i9K
zy7kn9EKsu3wJ5jBc#KdSv)%<8#Wu#tH{Y%fgUddb<9y82O4Yyti3XONSzh)0NFV7i
zIMZDc{NoKft~NOrmf2)1o9Qi59Z128Hj=2Jfjjkh=g-C#qVdo51;$}~6N14Yp^a<#
z)o{2Ty#F$A;Fd=sN?^-sQ@r4UbA^+wq}p*!8?IiU@$?Eay0XZhnvoWh2wP|Q?
z%NuGhBP&op$yxGtp&;nfOCbb~1$J;54JSvCb1~5J7_I8T1nJf6K$G6)Vi{Pca1Ks
zgX(%;BEGFU(d+pAgbpX^(|m(oduj!L=h`Ty5cPJsUzZLk+y~@%jt7EJ9L!gF;nE{^
z>gV_QM<)G)_SwZ3gfs{NnAQ-gi9nWz-#^W;_q=Wk;C>ockc}6ZwNlVooCu$iwvz9t_t+3{MT=zk
zrgRm!s%e8HwBR7=7%D}dx?-^H&f5vBOKx_4dViq2y)j-(+dSA(?)?+|EuM6W4Hj_r
zzT!I<%__!phdaTg{Z~0a?^sG3|A310F5ON0BWjq9#$ab1yqPHb@@yg-x|t6EW@V{&
z@{jeym*PzXyRh{JQUh6ayMTw*<^6R%^<@NO@7)IWs9$Ty607v(K-I0FQy~9$uhcbW
zslMfM``}o4huqVv#vT6q60W}XoI`76?Q6S3xA&bujD432*M&vr#;8|W(*q;!Hl9Tx
zZeAtj$=FKy)oA|<+W=9PYzRd#?^}+E7&1CS0LHfe|Vi&V-jgjd%=g8kU
zCb+hTJ*eXR!=qyU%9Fg0&!uNK-&l(-hA+V#xAe6+cs?_>#qJZR7CS8P8$WMo&1lKm
zKXiO0pt}8C;n;^E9;5T@u;9zDAg&fSC>9KGeE+6eX9S_~qW=UOF(3=r)xVn3G=n;I
zf_5c|;9d?32u)wl@q_s_tie_Td%iB7Mp|SIp8kCfvp{!
zj@8p#4%7`2o?q4_RG$dso=^ymv32BW!&<~509#5N3p=nLH3nS5cTX6AYgHnHwEvoe
zyz(pb7uW__K7HpNfDAySp{)h)ZT1EVM(2|q*bFVw)PHe;v%M6b!;w~i&!=O#@JrE=
zdj4^iG*{ZKpE>OPNBgKOD(LnxT}IaLqw0>oEeZx0<8a&jq^puH7B}KHfLOd5fV