Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added image table detector annotator #221

Merged
merged 3 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
class IMAGE_TABLE_DETECTOR:
    """NLU wrapper exposing the Spark OCR pretrained table-detection annotator."""

    @staticmethod
    def get_default_model():
        """Build the pretrained ``ImageTableDetector`` with NLU's default columns.

        Reads page images from the ``ocr_image`` column and writes detected
        table bounding regions to the ``region`` column. ``sparkocr`` is
        imported lazily so the licensed library is only required when this
        component is actually constructed.
        """
        from sparkocr.transformers import ImageTableDetector
        detector = ImageTableDetector.pretrained(
            "general_model_table_detection_v2", "en", "clinical/ocr"
        )
        detector = detector.setInputCol("ocr_image")
        detector = detector.setOutputCol("region")
        return detector
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
class ImageTableCellDetector:
    """NLU wrapper exposing the Spark OCR table-cell detection annotator."""

    @staticmethod
    def get_default_model():
        """Build an ``ImageTableCellDetector`` with NLU's default configuration.

        Consumes cropped table images from ``image_region`` and emits detected
        cell coordinates to ``ocr_table_cells``, using the morphological
        operations algorithm (``"morphops"``). ``sparkocr`` is imported lazily
        (licensed dependency).
        """
        from sparkocr.transformers import ImageTableCellDetector
        cell_detector = ImageTableCellDetector()
        cell_detector = cell_detector.setInputCol("image_region")
        cell_detector = cell_detector.setAlgoType("morphops")
        cell_detector = cell_detector.setOutputCol("ocr_table_cells")
        return cell_detector
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
class ImageTable2Cell2TextTable:
    """NLU wrapper exposing the Spark OCR cells-to-text-table annotator."""

    @staticmethod
    def get_default_model():
        """Build an ``ImageCellsToTextTable`` with NLU's default columns.

        Runs OCR over the cell regions found in ``ocr_table_cells`` within the
        table image in ``image_region`` and assembles the recognized text into
        a structured table in ``ocr_table``. ``sparkocr`` is imported lazily
        (licensed dependency).
        """
        from sparkocr.transformers import ImageCellsToTextTable
        to_text_table = ImageCellsToTextTable()
        to_text_table = to_text_table.setInputCol("image_region")
        to_text_table = to_text_table.setCellsCol("ocr_table_cells")
        to_text_table = to_text_table.setOutputCol("ocr_table")
        return to_text_table

Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
class ImageSplitRegions:
    """NLU wrapper exposing the Spark OCR region-splitting utility."""

    @staticmethod
    def get_default_model():
        """Build an ``ImageSplitRegions`` transformer with NLU's default columns.

        Crops each detected table region (``region`` column) out of the full
        page image (``ocr_image`` column) and outputs one sub-image per region
        in ``image_region``, ready for downstream cell detection.
        ``sparkocr`` is imported lazily (licensed dependency).
        """
        from sparkocr.transformers import ImageSplitRegions
        return ImageSplitRegions() \
            .setInputCol("ocr_image") \
            .setInputRegionsCol("region") \
            .setOutputCol("image_region")
2 changes: 1 addition & 1 deletion nlu/pipe/extractors/extractor_methods/ocr_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def extract_table(df):
return exploded_results.toPandas()


def extract_tables(df, rename_cols=True):
def extract_tables(df, rename_cols=False):
df = df.withColumn("table_index", df.ocr_table.area.index)
C-K-Loan marked this conversation as resolved.
Show resolved Hide resolved
# pagennum
pandas_tables = []
Expand Down
2 changes: 2 additions & 0 deletions nlu/pipe/nlu_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def __init__(self,
requires_binary_format: bool = False, # Set to true for OCR annotators that require binary image format
requires_image_format: bool = False, # Set to true for OCR annotators that require image format
is_visual_annotator: bool = False, # Set to true for OCR annotators that require image format
is_light_pipe_incompatible: bool = False, # Set to true for OCR annotators that require image format
):
self.name = name
self.type = type
Expand Down Expand Up @@ -116,6 +117,7 @@ def __init__(self,
self.requires_binary_format = requires_binary_format
self.requires_image_format = requires_image_format
self.is_visual_annotator = is_visual_annotator
self.is_light_pipe_incompatible = is_light_pipe_incompatible

def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel],
nlu_ref: str,
Expand Down
27 changes: 3 additions & 24 deletions nlu/pipe/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def __init__(self):
self.has_table_qa_models = False
self.requires_image_format = False
self.requires_binary_format = False

self.is_light_pipe_incompatible = False
def add(self, component: NluComponent, nlu_reference=None, pretrained_pipe_component=False,
name_to_add='', idx=None):
'''
Expand Down Expand Up @@ -203,7 +203,8 @@ def fit(self, dataset=None, dataset_path=None, label_seperator=','):
logger.info(
'Fitting on empty Dataframe, could not infer correct training method. This is intended for non-trainable pipelines.')
self.vanilla_transformer_pipe = self.spark_estimator_pipe.fit(self.get_sample_spark_dataframe())
self.light_transformer_pipe = LightPipeline(self.vanilla_transformer_pipe)
if not self.is_light_pipe_incompatible:
self.light_transformer_pipe = LightPipeline(self.vanilla_transformer_pipe)

self.has_trainable_components = False
self.is_fitted = True
Expand Down Expand Up @@ -452,28 +453,6 @@ def save(self, path, component='entire_pipeline', overwrite=True):
else:
self[component].save(path)

def predict_embeds(self,
data,
multithread=True,
return_spark_df=False,
):
'''
Annotates a Pandas Dataframe/Pandas Series/Numpy Array/Spark DataFrame/Python List strings /Python String abd returns List of Floats or Spark-Df, only with embeddings.
:param data: Data to predict on
and drop_irrelevant_cols = True then chunk, sentence and Doc will be dropped
:param return_spark_df: Prediction results will be returned right after transforming with the Spark NLP pipeline
This will run fully distributed in on the Spark Master, but not prettify the output dataframe
:param return_spark_df: return Spark-DF and not collect all data into driver instead of returning list of float
:param multithread: Use multithreaded Light-pipeline instead of spark-pipeline

:return:
'''
from nlu.pipe.utils.predict_helper import __predict__
return __predict__(self, data, output_level=None, positions=False, keep_stranger_features=False, metadata=False,
multithread=multithread,
drop_irrelevant_cols=True, return_spark_df=return_spark_df, get_embeddings=True,
embed_only=True)

def predict(self,
data,
output_level='',
Expand Down
2 changes: 2 additions & 0 deletions nlu/pipe/utils/pipe_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,8 @@ def add_metadata_to_pipe(pipe: NLUPipeline):
pipe.has_nlp_components = False
if c.jsl_anno_py_class == 'ImageAssembler':
pipe.contains_ocr_components = True
if c.is_light_pipe_incompatible:
pipe.is_light_pipe_incompatible = True

return pipe

Expand Down
3 changes: 2 additions & 1 deletion nlu/pipe/utils/predict_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,8 @@ def __predict__(pipe, data, output_level, positions, keep_stranger_features, met
else:
pipe.fit()

pipe.__configure_light_pipe_usage__(DataConversionUtils.size_of(data), multithread)
if not pipe.is_light_pipe_incompatible:
pipe.__configure_light_pipe_usage__(DataConversionUtils.size_of(data), multithread)

if pipe.contains_ocr_components and pipe.contains_audio_components:
""" Idea:
Expand Down
36 changes: 21 additions & 15 deletions nlu/spellbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -11155,27 +11155,30 @@ class Spellbook:
'jpg2text': OCR_NODE_IDS.IMAGE2TEXT, # Alias for img2text
'pdf2text': OCR_NODE_IDS.PDF2TEXT,
'doc2text': OCR_NODE_IDS.DOC2TEXT,

# 'image_table_detector': OCR_NODE_IDS.IMAGE_TABLE_DETECTOR,
'image_table_cell_detector': OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR,
'image_table_cell2text_table': OCR_NODE_IDS.IMAGE_TABLE_CELL2TEXT_TABLE,
'pdf2table': OCR_NODE_IDS.PDF2TEXT_TABLE,
'doc2table': OCR_NODE_IDS.DOC2TEXT_TABLE,
'ppt2table': OCR_NODE_IDS.PPT2TEXT_TABLE,
'classify.image': OCR_NODE_IDS.VISUAL_DOCUMENT_CLASSIFIER,
'en.classify_image.tabacco': 'visual_document_classifier_tobacco3482'
'en.classify_image.tabacco': 'visual_document_classifier_tobacco3482',
'en.image_table_detector':'general_model_table_detection_v2'
}

ocr_model_references = {
'img2text': OCR_NODE_IDS.IMAGE2TEXT,
'png2text': OCR_NODE_IDS.IMAGE2TEXT, # Alias for img2text
'jpg2text': OCR_NODE_IDS.IMAGE2TEXT, # Alias for img2text
'pdf2text': OCR_NODE_IDS.PDF2TEXT,
'doc2text': OCR_NODE_IDS.DOC2TEXT,

'pdf2table': OCR_NODE_IDS.PDF2TEXT_TABLE,
'doc2table': OCR_NODE_IDS.DOC2TEXT_TABLE,
'ppt2table': OCR_NODE_IDS.PPT2TEXT_TABLE,
'classify.image': OCR_NODE_IDS.VISUAL_DOCUMENT_CLASSIFIER,
'en.classify_image.tabacco': 'visual_document_classifier_tobacco3482'
}
# ocr_model_references = {
# 'img2text': OCR_NODE_IDS.IMAGE2TEXT,
# 'png2text': OCR_NODE_IDS.IMAGE2TEXT, # Alias for img2text
# 'jpg2text': OCR_NODE_IDS.IMAGE2TEXT, # Alias for img2text
# 'pdf2text': OCR_NODE_IDS.PDF2TEXT,
# 'doc2text': OCR_NODE_IDS.DOC2TEXT,
#
# 'pdf2table': OCR_NODE_IDS.PDF2TEXT_TABLE,
# 'doc2table': OCR_NODE_IDS.DOC2TEXT_TABLE,
# 'ppt2table': OCR_NODE_IDS.PPT2TEXT_TABLE,
# 'classify.image': OCR_NODE_IDS.VISUAL_DOCUMENT_CLASSIFIER,
# 'en.classify_image.tabacco': 'visual_document_classifier_tobacco3482'
# }
# map storage ref to nlu refner_jsl
storage_ref_2_nlu_ref = {'ar': {'glove_300d': 'ar.embed'},
'bn': {'bengali_cc_300d': 'bn.embed.glove',
Expand Down Expand Up @@ -16023,6 +16026,9 @@ class Spellbook:
'image_classifier_vit_where_am_I_hospital_balcony_hallway_airport_coffee_house_apartment_office': 'ViTForImageClassification',
'image_classifier_vit_world_landmarks': 'ViTForImageClassification',
'image_classifier_convnext_tiny_224_local':'ConvNextImageClassifier',
'general_model_table_detection_v2': 'ImageTableDetector',
'image_table_cell_detector': 'ImageTableCellDetector',
'image_table_cell2text_table': 'ImageCellsToTextTable',
'instructor_large':'InstructorEmbeddings',
'instructor_base':'InstructorEmbeddings',
'initial_model': 'MPNetEmbeddings',
Expand Down
5 changes: 4 additions & 1 deletion nlu/universe/annotator_class_universe.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,10 @@ class AnnoClassRef:
OCR_NODE_IDS.TEXT2PDF: 'TextToPdf',
OCR_NODE_IDS.VISUAL_DOCUMENT_CLASSIFIER: 'VisualDocumentClassifier',
OCR_NODE_IDS.IMAGE2HOCR: 'ImageToHocr',

OCR_NODE_IDS.IMAGE_TABLE_DETECTOR: 'ImageTableDetector',
OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR: 'ImageTableCellDetector',
OCR_NODE_IDS.IMAGE_TABLE_CELL2TEXT_TABLE: 'ImageCellsToTextTable',
OCR_NODE_IDS.IMAGE_SPLIT_REGIONS: 'ImageSplitRegions',
}

@staticmethod
Expand Down
84 changes: 84 additions & 0 deletions nlu/universe/component_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,10 @@
from nlu.ocr_components.text_recognizers.pdf2text.pdf2text import Pdf2Text
from nlu.ocr_components.utils.binary2image.binary2image import Binary2Image
from nlu.ocr_components.utils.image2hocr.image2hocr import Image2Hocr
from nlu.ocr_components.table_extractors.image2table.image2table import IMAGE_TABLE_DETECTOR
from nlu.ocr_components.table_extractors.image2table_cell.image2table_cell import ImageTableCellDetector
from nlu.ocr_components.table_extractors.image_table_cell2text.image_table_cell2text import ImageTable2Cell2TextTable
from nlu.ocr_components.utils.image_split_regions.image_split_regions import ImageSplitRegions
# from nlu.ocr_components.visual_classifiers.visual_doc_classifier.visual_doc_classifier import VisualDocClassifier
from nlu.pipe.col_substitution.col_substitution_HC import *
from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols
Expand Down Expand Up @@ -4171,6 +4175,86 @@ class ComponentUniverse:
# applicable_file_types=['JPG', 'JPEG']
# ),
#


O_A.IMAGE_TABLE_CELL_DETECTOR: partial(NluComponent,
name=O_A.IMAGE_TABLE_CELL_DETECTOR,
type=T.TEXT_RECOGNIZER,
get_default_model= ImageTableCellDetector.get_default_model,
pdf_extractor_methods={'default': default_text_recognizer_config},
pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor
output_level=L.DOCUMENT, # TODO new output level IMG? Or treat as DOC?
node=OCR_FEATURE_NODES.nodes[O_A.IMAGE_TABLE_CELL_DETECTOR],
description='Recognize text from image files',
provider=ComponentBackends.ocr,
license=Licenses.ocr,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=O_A.IMAGE_TABLE_CELL_DETECTOR,
jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.IMAGE_TABLE_CELL_DETECTOR],
applicable_file_types=['JPEG', 'PNG', 'BMP', 'WBMP', 'GIF', 'JPG', '.TIFF'],
is_light_pipe_incompatible=True
),

O_A.IMAGE_TABLE_CELL2TEXT_TABLE: partial(NluComponent,
name=O_A.IMAGE_TABLE_CELL2TEXT_TABLE,
type=T.TEXT_RECOGNIZER,
get_default_model=ImageTable2Cell2TextTable.get_default_model,
pdf_extractor_methods={'default': default_text_recognizer_config},
pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor
output_level=L.DOCUMENT, # TODO new output level IMG? Or treat as DOC?
node=OCR_FEATURE_NODES.nodes[O_A.IMAGE_TABLE_CELL2TEXT_TABLE],
description='Recognize text from image files',
provider=ComponentBackends.ocr,
license=Licenses.ocr,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=O_A.IMAGE_TABLE_CELL2TEXT_TABLE,
jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.IMAGE_TABLE_CELL2TEXT_TABLE],
applicable_file_types=['JPEG', 'PNG', 'BMP', 'WBMP', 'GIF', 'JPG', '.TIFF'],
is_light_pipe_incompatible=True
),

O_A.IMAGE_TABLE_DETECTOR: partial(NluComponent,
name=O_A.IMAGE_TABLE_DETECTOR,
type=T.TABLE_RECOGNIZER,
get_default_model=IMAGE_TABLE_DETECTOR.get_default_model,
pdf_extractor_methods={'default': default_binary_to_image_config},
pdf_col_name_substitutor=substitute_recognized_text_cols,
output_level=L.DOCUMENT,
node=OCR_FEATURE_NODES.nodes[O_A.IMAGE_TABLE_DETECTOR],
description='Detect Tables from Images',
provider=ComponentBackends.ocr,
license=Licenses.ocr,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=O_A.IMAGE_TABLE_DETECTOR,
jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.IMAGE_TABLE_DETECTOR],
applicable_file_types=['PDF','JPEG', 'PNG'],
is_light_pipe_incompatible=True
),

O_A.IMAGE_SPLIT_REGIONS: partial(NluComponent,
name=O_A.IMAGE_SPLIT_REGIONS,
type=T.OCR_UTIL,
get_default_model=ImageSplitRegions.get_default_model,
pdf_extractor_methods={'default': default_binary_to_image_config},
pdf_col_name_substitutor=substitute_recognized_text_cols,
output_level=L.DOCUMENT,
node=OCR_FEATURE_NODES.nodes[O_A.IMAGE_SPLIT_REGIONS],
description='Convert Image to split regions',
provider=ComponentBackends.ocr,
license=Licenses.ocr,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=O_A.IMAGE_SPLIT_REGIONS,
jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[
O_A.IMAGE_SPLIT_REGIONS],
applicable_file_types=['DOCX', 'DOC', 'JPEG', 'PNG'],
is_light_pipe_incompatible=True
),


O_A.IMAGE2HOCR: partial(NluComponent,
name=O_A.IMAGE2HOCR,
type=T.OCR_UTIL,
Expand Down
Loading