Merge pull request #223 from JohnSnowLabs/release/511

Release/511
JohnSnowLabs · Jan 8, 2024 · da5c47c · da5c47c
2 parents f853ce7 + deca2a4
commit da5c47c
Show file tree

Hide file tree

Showing 34 changed files with 1,806 additions and 213 deletions.
diff --git a/...ent_examples/automatic_speech_recognition/automatic_speech_recognition_overview_ASR.ipynb b/...ent_examples/automatic_speech_recognition/automatic_speech_recognition_overview_ASR.ipynb
diff --git a/examples/colab/ocr/ocr_table_recognition_dl.ipynb b/examples/colab/ocr/ocr_table_recognition_dl.ipynb
diff --git a/nlu/__init__.py b/nlu/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '5.0.3'
+__version__ = '5.1.1'
 
 
 import nlu.utils.environment.env_utils as env_utils

diff --git a/nlu/components/classifiers/asr_whisper/__init__.py b/nlu/components/classifiers/asr_whisper/__init__.py
diff --git a/nlu/components/classifiers/asr_whisper/whisper.py b/nlu/components/classifiers/asr_whisper/whisper.py
@@ -0,0 +1,15 @@
+from sparknlp.annotator import *
+
+
+class Whisper:
+    @staticmethod
+    def get_default_model():
+        return WhisperForCTC.pretrained() \
+            .setInputCols("audio_assembler") \
+            .setOutputCol("text")
+
+    @staticmethod
+    def get_pretrained_model(name, language, bucket=None):
+        return WhisperForCTC.pretrained(name, language, bucket) \
+            .setInputCols("audio_assembler") \
+            .setOutputCol("text")
diff --git a/nlu/components/embeddings/sentence_roberta/RobertaSentenceEmbedding.py b/nlu/components/embeddings/sentence_roberta/RobertaSentenceEmbedding.py
@@ -0,0 +1,18 @@
+from sparknlp.annotator import RoBertaSentenceEmbeddings
+
+
+class RobertaSentence:
+    @staticmethod
+    def get_default_model():
+        return RoBertaSentenceEmbeddings.pretrained() \
+            .setInputCols("sentence") \
+            .setOutputCol("sentence_embeddings")
+
+    @staticmethod
+    def get_pretrained_model(name, language, bucket=None):
+        return RoBertaSentenceEmbeddings.pretrained(name,language,bucket) \
+            .setInputCols('sentence') \
+            .setOutputCol("sentence_embeddings")
+
+
+
diff --git a/nlu/components/embeddings/sentence_roberta/__init__.py b/nlu/components/embeddings/sentence_roberta/__init__.py
diff --git a/nlu/ocr_components/table_extractors/image2table/__init__.py b/nlu/ocr_components/table_extractors/image2table/__init__.py
diff --git a/nlu/ocr_components/table_extractors/image2table/image2table.py b/nlu/ocr_components/table_extractors/image2table/image2table.py
@@ -0,0 +1,7 @@
+class IMAGE_TABLE_DETECTOR:
+    @staticmethod
+    def get_default_model():
+        from sparkocr.transformers import ImageTableDetector
+        return ImageTableDetector.pretrained("general_model_table_detection_v2", "en", "clinical/ocr") \
+            .setInputCol("ocr_image") \
+            .setOutputCol("region")
diff --git a/nlu/ocr_components/table_extractors/image2table_cell/__init__.py b/nlu/ocr_components/table_extractors/image2table_cell/__init__.py
diff --git a/nlu/ocr_components/table_extractors/image2table_cell/image2table_cell.py b/nlu/ocr_components/table_extractors/image2table_cell/image2table_cell.py
@@ -0,0 +1,8 @@
+class ImageTableCellDetector:
+    @staticmethod
+    def get_default_model():
+        from sparkocr.transformers import ImageTableCellDetector
+        return ImageTableCellDetector() \
+            .setInputCol("image_region") \
+            .setAlgoType("morphops") \
+            .setOutputCol("ocr_table_cells")
diff --git a/nlu/ocr_components/table_extractors/image_table_cell2text/__init__.py b/nlu/ocr_components/table_extractors/image_table_cell2text/__init__.py
diff --git a/nlu/ocr_components/table_extractors/image_table_cell2text/image_table_cell2text.py b/nlu/ocr_components/table_extractors/image_table_cell2text/image_table_cell2text.py
@@ -0,0 +1,9 @@
+class ImageTable2Cell2TextTable:
+    @staticmethod
+    def get_default_model():
+        from sparkocr.transformers import ImageCellsToTextTable
+        return ImageCellsToTextTable() \
+            .setInputCol("image_region") \
+            .setCellsCol('ocr_table_cells')\
+            .setOutputCol("ocr_table")
+
diff --git a/nlu/ocr_components/utils/image_split_regions/__init__.py b/nlu/ocr_components/utils/image_split_regions/__init__.py
diff --git a/nlu/ocr_components/utils/image_split_regions/image_split_regions.py b/nlu/ocr_components/utils/image_split_regions/image_split_regions.py
@@ -0,0 +1,14 @@
+class ImageSplitRegions:
+    @staticmethod
+    def get_default_model():
+        from sparkocr.transformers import ImageSplitRegions
+        return ImageSplitRegions() \
+            .setInputCol("ocr_image") \
+            .setInputRegionsCol("region") \
+            .setOutputCol("image_region")
+
+# .setInputRegionsCol("ocr_table_16969+
+#
+#
+#
+# ") \
diff --git a/nlu/pipe/col_substitution/name_deduction/name_deductable_annotators_OS.py b/nlu/pipe/col_substitution/name_deduction/name_deductable_annotators_OS.py
@@ -21,6 +21,7 @@
     ElmoEmbeddings ,
     E5Embeddings,
     BertSentenceEmbeddings,
+    RoBertaSentenceEmbeddings,
     UniversalSentenceEncoder,
     InstructorEmbeddings,
     SentenceEmbeddings,
@@ -88,6 +89,7 @@
     ElmoEmbeddings ,
     E5Embeddings,
     BertSentenceEmbeddings,
+    RoBertaSentenceEmbeddings,
     InstructorEmbeddings,
     UniversalSentenceEncoder,
     SentenceEmbeddings,

diff --git a/nlu/pipe/col_substitution/substitution_map_OS.py b/nlu/pipe/col_substitution/substitution_map_OS.py
@@ -53,6 +53,9 @@
     BertSentenceEmbeddings: {
         'default': substitute_sent_embed_cols,
     },
+    RoBertaSentenceEmbeddings: {
+        'default': substitute_sent_embed_cols,
+    },
     InstructorEmbeddings: {
         'default': substitute_sent_embed_cols,
     },

diff --git a/nlu/pipe/extractors/extractor_methods/ocr_extractors.py b/nlu/pipe/extractors/extractor_methods/ocr_extractors.py
@@ -14,7 +14,7 @@ def extract_table(df):
     return exploded_results.toPandas()
 
 
-def extract_tables(df, rename_cols=True):
+def extract_tables(df, rename_cols=False):
     df = df.withColumn("table_index", df.ocr_table.area.index)
     # pagennum
     pandas_tables = []

diff --git a/nlu/pipe/nlu_component.py b/nlu/pipe/nlu_component.py
@@ -75,6 +75,7 @@ def __init__(self,
                  requires_binary_format: bool = False,  # Set to true for OCR annotators that require binary image format
                  requires_image_format: bool = False,  # Set to true for OCR annotators that require image format
                  is_visual_annotator: bool = False,  # Set to true for OCR annotators that require image format
+                 is_light_pipe_incompatible: bool = False,  # Set to true for annotators that cannot run in a LightPipeline; the pipeline then skips building and configuring one
                  ):
         self.name = name
         self.type = type
@@ -116,6 +117,7 @@ def __init__(self,
         self.requires_binary_format = requires_binary_format
         self.requires_image_format = requires_image_format
         self.is_visual_annotator = is_visual_annotator
+        self.is_light_pipe_incompatible = is_light_pipe_incompatible
 
     def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel],
                      nlu_ref: str,

diff --git a/nlu/pipe/pipeline.py b/nlu/pipe/pipeline.py
@@ -60,7 +60,7 @@ def __init__(self):
         self.has_table_qa_models = False
         self.requires_image_format = False
         self.requires_binary_format = False
-
+        self.is_light_pipe_incompatible = False
     def add(self, component: NluComponent, nlu_reference=None, pretrained_pipe_component=False,
             name_to_add='', idx=None):
         '''
@@ -203,7 +203,8 @@ def fit(self, dataset=None, dataset_path=None, label_seperator=','):
                 logger.info(
                     'Fitting on empty Dataframe, could not infer correct training method. This is intended for non-trainable pipelines.')
                 self.vanilla_transformer_pipe = self.spark_estimator_pipe.fit(self.get_sample_spark_dataframe())
-                self.light_transformer_pipe = LightPipeline(self.vanilla_transformer_pipe)
+                if not self.is_light_pipe_incompatible:
+                    self.light_transformer_pipe = LightPipeline(self.vanilla_transformer_pipe)
 
         self.has_trainable_components = False
         self.is_fitted = True
@@ -452,28 +453,6 @@ def save(self, path, component='entire_pipeline', overwrite=True):
             else:
                 self[component].save(path)
 
-    def predict_embeds(self,
-                       data,
-                       multithread=True,
-                       return_spark_df=False,
-                       ):
-        '''
-        Annotates a Pandas Dataframe/Pandas Series/Numpy Array/Spark DataFrame/Python List strings /Python String abd returns List of Floats or Spark-Df, only with embeddings.
-        :param data: Data to predict on
-                and drop_irrelevant_cols = True then chunk, sentence and Doc will be dropped
-        :param return_spark_df: Prediction results will be returned right after transforming with the Spark NLP pipeline
-                                 This will run fully distributed in on the Spark Master, but not prettify the output dataframe
-        :param return_spark_df: return Spark-DF and not collect all data into driver instead of returning list of float
-        :param multithread: Use multithreaded Light-pipeline instead of spark-pipeline
-
-        :return:
-        '''
-        from nlu.pipe.utils.predict_helper import __predict__
-        return __predict__(self, data, output_level=None, positions=False, keep_stranger_features=False, metadata=False,
-                           multithread=multithread,
-                           drop_irrelevant_cols=True, return_spark_df=return_spark_df, get_embeddings=True,
-                           embed_only=True)
-
     def predict(self,
                 data,
                 output_level='',

diff --git a/nlu/pipe/utils/pipe_utils.py b/nlu/pipe/utils/pipe_utils.py
@@ -681,6 +681,7 @@ def add_metadata_to_pipe(pipe: NLUPipeline):
             if c.license == Licenses.open_source \
                     and c.name != NLP_NODE_IDS.WAV2VEC_FOR_CTC \
                     and c.name != NLP_NODE_IDS.HUBERT_FOR_CTC \
+                    and c.name != NLP_NODE_IDS.WHISPER_FOR_CTC \
                     and c.name != NLP_NODE_IDS.AUDIO_ASSEMBLER:
                 # TODO Table Assembler/VIT/ Other non txt open source
                 pipe.has_nlp_components = True
@@ -701,6 +702,8 @@ def add_metadata_to_pipe(pipe: NLUPipeline):
                 pipe.has_nlp_components = False
             if c.jsl_anno_py_class == 'ImageAssembler':
                 pipe.contains_ocr_components = True
+            if c.is_light_pipe_incompatible:
+                pipe.is_light_pipe_incompatible = True
 
         return pipe
 

diff --git a/nlu/pipe/utils/predict_helper.py b/nlu/pipe/utils/predict_helper.py
@@ -311,7 +311,8 @@ def __predict__(pipe, data, output_level, positions, keep_stranger_features, met
         else:
             pipe.fit()
 
-        pipe.__configure_light_pipe_usage__(DataConversionUtils.size_of(data), multithread)
+        if not pipe.is_light_pipe_incompatible:
+            pipe.__configure_light_pipe_usage__(DataConversionUtils.size_of(data), multithread)
 
     if pipe.contains_ocr_components and pipe.contains_audio_components:
         """ Idea: