Merge remote-tracking branch 'origin/visual-ner' into release/531

# Conflicts: # nlu/pipe/col_substitution/col_name_substitution_utils.py # nlu/pipe/col_substitution/substitution_map_OCR.py # nlu/universe/component_universes.py
JohnSnowLabs · Apr 8, 2024 · 28ca5a0 · 28ca5a0
2 parents a758f64 + 32609ec
commit 28ca5a0
Show file tree

Hide file tree

Showing 23 changed files with 1,179 additions and 40 deletions.
diff --git a/examples/colab/ocr/ocr_visual_document_ner.ipynb b/examples/colab/ocr/ocr_visual_document_ner.ipynb
diff --git a/nlu/components/embeddings/sentence_mpnet/MPNetSentenceEmbedding.py b/nlu/components/embeddings/sentence_mpnet/MPNetSentenceEmbedding.py
@@ -1,18 +1,18 @@
-from sparknlp.annotator import MPNetEmbeddings
-
-
-class MPNetSentence:
-    @staticmethod
-    def get_default_model():
-        return MPNetEmbeddings.pretrained() \
-            .setInputCols(["documents"]) \
-            .setOutputCol("mpnet_embeddings")
-
-    @staticmethod
-    def get_pretrained_model(name, language, bucket=None):
-        return MPNetEmbeddings.pretrained(name,language,bucket) \
-            .setInputCols(["documents"]) \
-            .setOutputCol("mpnet_embeddings")
-
-
-
+# from sparknlp.annotator import MPNetEmbeddings
+#
+#
+# class MPNetSentence:
+#     @staticmethod
+#     def get_default_model():
+#         return MPNetEmbeddings.pretrained() \
+#             .setInputCols(["documents"]) \
+#             .setOutputCol("mpnet_embeddings")
+#
+#     @staticmethod
+#     def get_pretrained_model(name, language, bucket=None):
+#         return MPNetEmbeddings.pretrained(name,language,bucket) \
+#             .setInputCols(["documents"]) \
+#             .setOutputCol("mpnet_embeddings")
+#
+#
+#
diff --git a/nlu/ocr_components/utils/hocr_tokenizer/__init__.py b/nlu/ocr_components/utils/hocr_tokenizer/__init__.py
diff --git a/nlu/ocr_components/utils/hocr_tokenizer/hocr_tokenizer.py b/nlu/ocr_components/utils/hocr_tokenizer/hocr_tokenizer.py
@@ -0,0 +1,7 @@
+class HocrTokenizer:
+    """NLU factory wrapper around the ``sparkocr`` ``HocrTokenizer`` transformer."""
+
+    @staticmethod
+    def get_default_model():
+        """Return a tokenizer reading column ``hocr`` and writing ``text_tokenized``."""
+        # Imported inside the function under an alias, because the sparkocr
+        # class carries the same name as this wrapper class.
+        from sparkocr.transformers import HocrTokenizer as _OcrHocrTokenizer
+
+        tokenizer = _OcrHocrTokenizer()
+        tokenizer = tokenizer.setInputCol("hocr")
+        return tokenizer.setOutputCol("text_tokenized")
diff --git a/nlu/ocr_components/visual_ner/__init__.py b/nlu/ocr_components/visual_ner/__init__.py
diff --git a/nlu/ocr_components/visual_ner/visual_document_ner/__init__.py b/nlu/ocr_components/visual_ner/visual_document_ner/__init__.py
diff --git a/nlu/ocr_components/visual_ner/visual_document_ner/visual_document_ner.py b/nlu/ocr_components/visual_ner/visual_document_ner/visual_document_ner.py
@@ -0,0 +1,8 @@
+class VisualDocumentNer:
+    """NLU factory wrapper around the ``sparkocr`` ``VisualDocumentNer`` transformer."""
+
+    @staticmethod
+    def get_default_model():
+        """Return the pretrained ``lilt_roberta_funsd_v1`` model.
+
+        The model reads columns ``text_tokenized`` and ``image`` and writes
+        its output to ``text_entity``.
+        """
+        # Imported inside the function under an alias, because the sparkocr
+        # class carries the same name as this wrapper class.
+        from sparkocr.transformers import VisualDocumentNer as _OcrVisualDocumentNer
+
+        model = _OcrVisualDocumentNer().pretrained("lilt_roberta_funsd_v1", "en", "clinical/ocr")
+        model = model.setInputCols(["text_tokenized", "image"])
+        return model.setOutputCol("text_entity")
diff --git a/nlu/pipe/col_substitution/col_name_substitution_utils.py b/nlu/pipe/col_substitution/col_name_substitution_utils.py
@@ -14,9 +14,6 @@
 
 import nlu
 from nlu.pipe.col_substitution import substitution_map_OS
-from nlu.universe.feature_universes import NLP_FEATURES
-from nlu.pipe.col_substitution import substitution_map_OS
-from nlu.pipe.col_substitution import col_substitution_OS
 import logging
 
 from nlu.pipe.extractors.extractor_base_data_classes import SparkOCRExtractorConfig
@@ -139,7 +136,26 @@ def get_final_output_cols_of_component(c, df, anno_2_ex) -> List[str]:
         result_cols = []
         if isinstance(configs, SparkOCRExtractorConfig):
             # TODO better OCR-EX handling --> Col Name generator function which we use everywhere for unified col naming !!!!!
-            return ['text']
+            # return ['text']
+            # Collect every DataFrame column that carries this OCR component's
+            # metadata prefix ('meta_' + output_col_prefix).
+            base_meta_prefix = 'meta_' + configs.output_col_prefix
+            for col in df.columns:
+                if base_meta_prefix not in col:
+                    continue
+                col_suffix = col.split(base_meta_prefix)[-1]
+                meta_col_name = base_meta_prefix + col_suffix
+                if meta_col_name in df.columns:
+                    # special case for overlapping names with _
+                    suffix_parts = col_suffix.split('_')
+                    # Guard the index: a suffix without '_' has no second part.
+                    if len(suffix_parts) > 1 and suffix_parts[1].isnumeric():
+                        # spark_output_column_names is a list; take the trailing token of
+                        # its first entry (the previous code called .split() on the list
+                        # itself when computing the id, which raises AttributeError).
+                        out_col_id = c.spark_output_column_names[0].split('_')[-1]
+                        if not out_col_id.isnumeric():
+                            # Column is numbered but this component's output is not.
+                            continue
+                        if int(suffix_parts[1]) != int(out_col_id):
+                            # Numbered column belongs to a different component instance.
+                            continue
+                    result_cols.append(meta_col_name)
+                elif c.type == AnnoTypes.CHUNK_CLASSIFIER:
+                    result_cols.append(col)
+                else:
+                    logger.info(f"Could not find meta col for os_components={c}, col={col}. Ommiting col..")
+            return result_cols
         if isinstance(c.model, MultiDocumentAssembler):
             return [f'{NLP_FEATURES.DOCUMENT_QUESTION}_results', f'{NLP_FEATURES.DOCUMENT_QUESTION_CONTEXT}_results']
 

diff --git a/nlu/pipe/col_substitution/col_substitution_OCR.py b/nlu/pipe/col_substitution/col_substitution_OCR.py
@@ -53,3 +53,46 @@ def substitute_document_classifier_text_cols(c, cols, is_unique=True, nlu_identi
     #     else : logger.info(f'Dropping unmatched metadata_col={col} for c={c}')
     #     # new_cols[col]= f"{new_base_name}_confidence"
     # return new_cols
+def substitute_document_classifier_text_cols(c, cols, is_unique=True, nlu_identifier=''):
+    """
+    Build the column rename map for visual document classifier output.
+
+    Every column in ``cols`` maps to itself, except:
+    - ``visual_classifier_label.1`` -> ``file_path``
+    - ``visual_classifier_label``   -> ``visual_classifier_prediction``
+
+    ``c``, ``is_unique`` and ``nlu_identifier`` are not used by this function.
+
+    :param cols: column names to build the map for
+    :return: dict mapping each original column name to its new name
+    """
+    special_names = {
+        'visual_classifier_label.1': 'file_path',
+        'visual_classifier_label': 'visual_classifier_prediction',
+    }
+    # Start from the identity mapping and apply the special renames afterwards.
+    # Writing the identity entry last (as before) silently undid the rename for
+    # whichever special column happened to be the final element of `cols`.
+    new_cols = {col: col for col in cols}
+    for old_name, new_name in special_names.items():
+        if old_name in cols:
+            new_cols[old_name] = new_name
+    return new_cols  # TODO
+
+def substitute_document_ner_cols(c, cols, nlu_identifier):
+    """
+    Build the column rename map for visual document NER output.
+
+    The new base name is ``entities`` when ``nlu_identifier`` is ``'UNIQUE'``,
+    otherwise ``entities_<nlu_identifier>``. A fixed ``meta_text_entity_*`` key
+    is added as soon as any name in ``cols`` contains the matching marker:
+    - ``_ocr_confidence`` : ``meta_text_entity_confidence`` -> ``<base>_confidence``
+    - ``_token``          : ``meta_text_entity_token``      -> ``<base>_ner_entity``
+    - ``_entity_x``       : ``meta_text_entity_x``          -> ``<base>_x_location``
+    - ``_entity_y``       : ``meta_text_entity_y``          -> ``<base>_y_location``
+
+    Columns matching no marker get no entry. ``c`` is not used.
+    """
+    base = 'entities' if nlu_identifier == 'UNIQUE' else f'entities_{nlu_identifier}'
+    # (marker searched in the column name, key to rename, suffix of the new name)
+    rules = (
+        ('_ocr_confidence', 'meta_text_entity_confidence', 'confidence'),
+        ('_token', 'meta_text_entity_token', 'ner_entity'),
+        ('_entity_x', 'meta_text_entity_x', 'x_location'),
+        ('_entity_y', 'meta_text_entity_y', 'y_location'),
+    )
+    renames = {}
+    for col_name in cols:
+        for marker, meta_key, suffix in rules:
+            if marker in col_name:
+                renames[meta_key] = f'{base}_{suffix}'
+    return renames
+
diff --git a/nlu/pipe/col_substitution/substitution_map_OCR.py b/nlu/pipe/col_substitution/substitution_map_OCR.py
@@ -1,13 +1,11 @@
 """
 Resolve Annotator Classes in the Pipeline to Extractor Configs and Methods
-
 Every Annotator should have 2 configs. Some might offor multuple configs/method pairs, based on model_anno_obj/NLP reference.
 - default/minimalistic -> Just the results of the annotations, no confidences or extra metadata
 - with meta            -> A config that leverages white/black list and gets the most relevant metadata
 - with positions       -> With Begins/Ends
 - with sentence references -> Reeturn the sentence/chunk no. reference from the metadata.
                                 If a document has multi-sentences, this will map a label back to a corrosponding sentence
-
 """
 # from nlu.pipe.col_substitution.col_substitution_HC import *
 from nlu.pipe.col_substitution.col_substitution_OS import *
@@ -16,15 +14,11 @@
 from sparkocr.transformers import *
 
 OCR_anno2substitution_fn = {
-     VisualDocumentClassifier : {
+    VisualDocumentClassifier : {
         'default': substitute_document_classifier_text_cols ,
     },
+    VisualDocumentNerLilt : {
+        'default': substitute_document_ner_cols,
+    },
 
 }
-
-
-
-
-
-
-
diff --git a/nlu/pipe/col_substitution/substitution_map_OS.py b/nlu/pipe/col_substitution/substitution_map_OS.py
@@ -79,9 +79,9 @@
     SentenceEmbeddings: {
         'default': substitute_sent_embed_cols,
     },
-    MPNetEmbeddings: {
-        'default': substitute_sent_embed_cols,
-    },
+    # MPNetEmbeddings: {
+    #     'default': substitute_sent_embed_cols,
+    # },
     Tokenizer: {
         'default': substitute_tokenizer_cols,
     },

diff --git a/nlu/pipe/extractors/extractor_base_data_classes.py b/nlu/pipe/extractors/extractor_base_data_classes.py
@@ -142,7 +142,7 @@ class SparkOCRExtractorConfig(SparkNLPExtractorConfig):
     get_image_resolution: bool = field(default=False)
     get_image_data: bool = field(default=False)
     # General OCR fields
-    # get_path          :bool              = field(default = False)# origin is path
+    get_path: bool = field(default=False)# origin is path
     get_modification_time: bool = field(default=False)
     get_length: bool = field(default=False)
     get_page_num: bool = field(default=False)

diff --git a/nlu/pipe/extractors/extractor_configs_OCR.py b/nlu/pipe/extractors/extractor_configs_OCR.py
@@ -28,6 +28,22 @@ def default_visual_classifier_config(output_col_prefix='visual_classifier'):
         description='Gets label and confidence of visual classifier',
     )
 
+def default_visual_ner_config(output_col_prefix='visual_ocr'):
+    """Return the default ``SparkOCRExtractorConfig`` for visual NER output.
+
+    Enables text, begin/end, result, (full) metadata, image data and path
+    extraction, disables the annotator type, and whitelists the metadata
+    fields ``entity``, ``confidence``, ``sentence`` and ``chunk``.
+
+    :param output_col_prefix: prefix stored on the config for generated columns
+    """
+    settings = dict(
+        get_text=True,
+        get_begin=True,
+        get_end=True,
+        get_result=True,
+        get_meta=True,
+        get_full_meta=True,
+        get_image_data=True,
+        get_path=True,
+        get_annotator_type=False,
+        output_col_prefix=output_col_prefix,
+        meta_white_list=['entity', 'confidence', 'sentence', 'chunk'],
+        name='visual_ner label, confidence and entities ',
+        description='Gets label, entities and confidence of visual ner',
+    )
+    return SparkOCRExtractorConfig(**settings)
 
 def default_binary_to_image_config(output_col_prefix='binary_image'):
     return SparkOCRExtractorConfig(

diff --git a/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py b/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py
@@ -80,11 +80,15 @@ def extract_base_sparkocr_features(row: pd.Series, configs: SparkOCRExtractorCon
         else:
             return {'visual_classifier_confidence': row}
 
+    # if 'FULL binary to image extractor ' in configs.name:
+    #     if not isinstance(row, str):
+    #         return {'path': row}
+
+
     else:
         # # OCR unpackers (TODO WIP)
         # unpack_text = lambda x: unpack_dict_list(x, 'text')
-        # # unpack_image = lambda x : unpack_dict_list(x, 'TODO') # is data?
-        # unpack_image_origin = lambda x: unpack_dict_list(x, 'origin')
+        # # unpack_image = lambda x : unpack_dict_list(x, 'TODO') # is data?       # unpack_image_origin = lambda x: unpack_dict_list(x, 'origin')
         # unpack_image_height = lambda x: unpack_dict_list(x, 'height')
         # unpack_image_width = lambda x: unpack_dict_list(x, 'width')
         # unpack_image_n_channels = lambda x: unpack_dict_list(x, 'nChannels')
@@ -317,6 +321,8 @@ def apply_extractors_and_merge(df, anno_2_ex_config, keep_stranger_features, str
     extractor = lambda c: df[c].apply(extract_master, configs=anno_2_ex_config[c])
     keep_strangers = lambda c: df[c]
 
+    # When a 'text_entity' column is being extracted and the DataFrame has a raw
+    # 'path' column, pass 'path' through as a stranger feature. Written as a plain
+    # `if` (not a conditional expression used for its side effect) and guarded so
+    # 'path' is never listed twice.
+    if 'path' in df.columns and 'text_entity' in anno_2_ex_config and 'path' not in stranger_features:
+        stranger_features.append('path')
+
     # merged_extraction_df
     # apply the extract_master together with it's configs to every column and geenrate a list of output DF's, one per Spark NLP COL
     # TODO handle MULTI-COL-OUTPUT. If Anno has multi cols, then we either needs multiple keys in anno_2_ex or use something besides

diff --git a/nlu/spellbook.py b/nlu/spellbook.py
@@ -11320,7 +11320,8 @@ class Spellbook:
         'ppt2table': OCR_NODE_IDS.PPT2TEXT_TABLE,
         'classify.image': OCR_NODE_IDS.VISUAL_DOCUMENT_CLASSIFIER,
         'en.classify_image.tabacco': 'visual_document_classifier_tobacco3482',
-        'en.image_table_detector':'general_model_table_detection_v2'
+        'en.image_table_detector':'general_model_table_detection_v2',
+        # 'funds' misspells the FUNSD part of the model name; the key is kept so
+        # existing callers keep working, and the correctly spelled alias is added.
+        'en.lilt_roberta_funds.v1': 'lilt_roberta_funsd_v1',
+        'en.lilt_roberta_funsd.v1': 'lilt_roberta_funsd_v1',
     }
 
     # ocr_model_references = {
@@ -16296,6 +16297,7 @@ class Spellbook:
                              'general_model_table_detection_v2': 'ImageTableDetector',
                              'image_table_cell_detector': 'ImageTableCellDetector',
                              'image_table_cell2text_table': 'ImageCellsToTextTable',
+                            'lilt_roberta_funsd_v1': 'VisualDocumentNer',
                              'instructor_large':'InstructorEmbeddings',
                              'instructor_base':'InstructorEmbeddings',
                              'initial_model': 'MPNetEmbeddings',

diff --git a/nlu/universe/annotator_class_universe.py b/nlu/universe/annotator_class_universe.py
@@ -314,6 +314,8 @@ class AnnoClassRef:
         OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR: 'ImageTableCellDetector',
         OCR_NODE_IDS.IMAGE_TABLE_CELL2TEXT_TABLE: 'ImageCellsToTextTable',
         OCR_NODE_IDS.IMAGE_SPLIT_REGIONS: 'ImageSplitRegions',
+        OCR_NODE_IDS.VISUAL_DOCUMENT_NER: 'VisualDocumentNer',
+        OCR_NODE_IDS.HOCR_TOKENIZER: 'HocrTokenizer',
     }
 
     @staticmethod

diff --git a/nlu/universe/component_universes.py b/nlu/universe/component_universes.py
@@ -144,19 +144,23 @@
 from nlu.ocr_components.text_recognizers.img2text.img2text import Img2Text
 from nlu.ocr_components.text_recognizers.pdf2text.pdf2text import Pdf2Text
 from nlu.ocr_components.utils.binary2image.binary2image import Binary2Image
+from nlu.ocr_components.utils.hocr_tokenizer.hocr_tokenizer import HocrTokenizer
 from nlu.ocr_components.utils.image2hocr.image2hocr import Image2Hocr
 from nlu.ocr_components.table_extractors.image2table.image2table import IMAGE_TABLE_DETECTOR
+from nlu.ocr_components.visual_ner.visual_document_ner.visual_document_ner import VisualDocumentNer
 from nlu.ocr_components.table_extractors.image2table_cell.image2table_cell import ImageTableCellDetector
 from nlu.ocr_components.table_extractors.image_table_cell2text.image_table_cell2text import ImageTable2Cell2TextTable
 from nlu.ocr_components.utils.image_split_regions.image_split_regions import ImageSplitRegions
 # from nlu.ocr_components.visual_classifiers.visual_doc_classifier.visual_doc_classifier import VisualDocClassifier
 from nlu.pipe.col_substitution.col_substitution_HC import *
 from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols, \
     substitute_document_classifier_text_cols
+from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols,substitute_document_ner_cols
 from nlu.pipe.col_substitution.col_substitution_OS import *
 from nlu.pipe.extractors.extractor_configs_HC import *
 from nlu.pipe.extractors.extractor_configs_OCR import default_text_recognizer_config, default_binary_to_image_config, \
     default_visual_classifier_config
+from nlu.pipe.extractors.extractor_configs_OCR import default_text_recognizer_config, default_binary_to_image_config, default_visual_ner_config
 from nlu.pipe.extractors.extractor_configs_OS import *
 from nlu.pipe.nlu_component import NluComponent
 from nlu.universe.annotator_class_universe import AnnoClassRef
@@ -4456,4 +4460,46 @@ class ComponentUniverse:
                                 applicable_file_types=['DOCX', 'DOC'],
                                 ),
 
+        # HOCR tokenizer: reads the 'hocr' column and writes 'text_tokenized'
+        # (see HocrTokenizer.get_default_model).
+        O_A.HOCR_TOKENIZER: partial(NluComponent,
+                                    name=O_A.HOCR_TOKENIZER,
+                                    type=T.OCR_UTIL,
+                                    get_default_model=HocrTokenizer.get_default_model,
+                                    # TODO dedicated extractor config; currently reuses the binary-to-image one
+                                    pdf_extractor_methods={'default': default_binary_to_image_config},
+                                    # TODO dedicated substitutor; currently reuses the recognized-text one
+                                    pdf_col_name_substitutor=substitute_recognized_text_cols,
+                                    output_level=L.DOCUMENT,
+                                    node=OCR_FEATURE_NODES.nodes[O_A.HOCR_TOKENIZER],
+                                    description='Tokenize HOCR output into a tokenized text column',
+                                    provider=ComponentBackends.ocr,
+                                    license=Licenses.ocr,
+                                    computation_context=ComputeContexts.spark,
+                                    output_context=ComputeContexts.spark,
+                                    jsl_anno_class_id=O_A.HOCR_TOKENIZER,
+                                    jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[
+                                        O_A.HOCR_TOKENIZER],
+                                    # NOTE(review): file types look copied from a DOC/DOCX component - confirm
+                                    applicable_file_types=['DOCX', 'DOC'],
+                                    ),
+
+        # Visual document NER: reads 'text_tokenized' and 'image', writes 'text_entity'
+        # (see VisualDocumentNer.get_default_model).
+        O_A.VISUAL_DOCUMENT_NER: partial(NluComponent,
+                                         name=O_A.VISUAL_DOCUMENT_NER,
+                                         # NOTE(review): PDF_BUILDER looks copied from another component - confirm
+                                         type=T.PDF_BUILDER,
+                                         get_default_model=VisualDocumentNer.get_default_model,
+                                         pdf_extractor_methods={'default': default_visual_ner_config},
+                                         pdf_col_name_substitutor=substitute_document_ner_cols,
+                                         output_level=L.CHUNK,
+                                         node=OCR_FEATURE_NODES.nodes[O_A.VISUAL_DOCUMENT_NER],
+                                         description='Extract named entities from document images with a visual NER model',
+                                         provider=ComponentBackends.ocr,
+                                         license=Licenses.ocr,
+                                         computation_context=ComputeContexts.spark,
+                                         output_context=ComputeContexts.spark,
+                                         jsl_anno_class_id=O_A.VISUAL_DOCUMENT_NER,
+                                         jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[
+                                             O_A.VISUAL_DOCUMENT_NER],
+                                         applicable_file_types=['JPG', 'JPEG']
+                                         ),
+
     }
diff --git a/nlu/universe/feature_node_ids.py b/nlu/universe/feature_node_ids.py
@@ -319,7 +319,7 @@ class OCR_NODE_IDS:
     """
     # Visual Document Understanding
     VISUAL_DOCUMENT_CLASSIFIER = JslAnnoId('visual_document_classifier')
-    VISUAL_DOCUMENT_NER = JslAnnoId('visual_document_NER')
+    VISUAL_DOCUMENT_NER = JslAnnoId('visual_document_ner')
 
     # Object Detection
     IMAGE_HANDWRITTEN_DETECTOR = JslAnnoId('image_handwritten_detector')

diff --git a/nlu/universe/feature_node_universes.py b/nlu/universe/feature_node_universes.py
@@ -307,6 +307,8 @@ class OCR_FEATURE_NODES:
         A.IMAGE2HOCR: OcrFeatureNode(A.IMAGE2HOCR, [F.OCR_IMAGE], [F.HOCR]),
 
         # VISUAL_DOCUMENT_NER : OcrFeatureNode(A.VISUAL_DOCUMENT_NER, [OcrFeature.HOCR, OcrFeature.FILE_PATH], [NlpFeature.NER_Annotation]), # TODO NlpFeature Space!
+        A.VISUAL_DOCUMENT_NER: OcrFeatureNode(A.VISUAL_DOCUMENT_NER, [F.TEXT_DOCUMENT_TOKENIZED, F.OCR_IMAGE],
+                                              [F.TEXT_ENTITY]),
 
         # Object Detection
         A.IMAGE_SPLIT_REGIONS: OcrFeatureNode(A.IMAGE_SPLIT_REGIONS, [F.OCR_IMAGE,F.OCR_REGION], [F.IMG_SPLIT_REGIONS]),

diff --git a/nlu/universe/feature_resolutions.py b/nlu/universe/feature_resolutions.py
@@ -114,5 +114,7 @@ class FeatureResolutions:
         OCR_FEATURES.OCR_TABLE_CELLS: ResolvedFeature(OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR,
                                                       OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR, 'xx', False,
                                                       ComponentUniverse.components[OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR]),
+        OCR_FEATURES.TEXT_DOCUMENT_TOKENIZED: ResolvedFeature(OCR_NODE_IDS.HOCR_TOKENIZER, OCR_NODE_IDS.HOCR_TOKENIZER, 'xx', False,
+                                           ComponentUniverse.components[OCR_NODE_IDS.HOCR_TOKENIZER]),
 
     }
diff --git a/tests/datasets/ocr/images/ocr_ner.png b/tests/datasets/ocr/images/ocr_ner.png
diff --git a/tests/datasets/ocr/images/ocr_test.png b/tests/datasets/ocr/images/ocr_test.png
diff --git a/tests/nlu_ocr_tests/ocr_visual_document_ner.py b/tests/nlu_ocr_tests/ocr_visual_document_ner.py
@@ -0,0 +1,27 @@
+import os
+import sys
+
+sys.path.append(os.getcwd())
+import unittest
+import nlu
+
+# Default PYTHONPATH to the current working directory (the same directory that is
+# appended to sys.path above) instead of a path that exists on one developer
+# machine only; an already exported PYTHONPATH is left untouched.
+os.environ.setdefault("PYTHONPATH", os.getcwd())
+# Make the Spark workers and the driver use the interpreter running this test.
+os.environ['PYSPARK_PYTHON'] = sys.executable
+os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
+from johnsnowlabs import nlp, visual
+
+# nlp.install(json_license_path='license.json',visual=True)
+nlp.start(visual=True)
+
+class OcrTest(unittest.TestCase):
+    """Smoke test for the visual document NER pipeline; it prints rows and asserts nothing."""
+
+    def test_classify_document(self):
+        """Load 'en.lilt_roberta_funds.v1', predict on 'ocr_ner.png' and print every row."""
+        # nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET)
+        # text that we generate PDF to has to come from an image struct!
+        # We need convert text to img struct!
+        pipe = nlu.load('en.lilt_roberta_funds.v1')
+        predictions = pipe.predict('ocr_ner.png', output_level='chunk')
+        for row_index, row in predictions.iterrows():
+            print(row_index, '---->', row)
+
+if __name__ == '__main__':
+    unittest.main()