Skip to content

Commit

Permalink
Merge pull request #223 from JohnSnowLabs/release/511
Browse files Browse the repository at this point in the history
Release/511
  • Loading branch information
C-K-Loan authored Jan 8, 2024
2 parents f853ce7 + deca2a4 commit da5c47c
Show file tree
Hide file tree
Showing 34 changed files with 1,806 additions and 213 deletions.

Large diffs are not rendered by default.

765 changes: 765 additions & 0 deletions examples/colab/ocr/ocr_table_recognition_dl.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion nlu/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '5.0.3'
__version__ = '5.1.1'


import nlu.utils.environment.env_utils as env_utils
Expand Down
Empty file.
15 changes: 15 additions & 0 deletions nlu/components/classifiers/asr_whisper/whisper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from sparknlp.annotator import *


class Whisper:
    """Factory for ``WhisperForCTC`` annotators pre-wired with NLU column names.

    Every model returned here reads from the ``audio_assembler`` column and
    writes its output to the ``text`` column.
    """

    @staticmethod
    def get_default_model():
        """Return ``WhisperForCTC.pretrained()`` with the NLU input/output columns set."""
        annotator = WhisperForCTC.pretrained()
        annotator = annotator.setInputCols("audio_assembler")
        return annotator.setOutputCol("text")

    @staticmethod
    def get_pretrained_model(name, language, bucket=None):
        """Return the pretrained model identified by ``name``/``language``/``bucket``.

        :param name: model name forwarded to ``WhisperForCTC.pretrained``
        :param language: language code forwarded to ``WhisperForCTC.pretrained``
        :param bucket: optional remote location forwarded to ``WhisperForCTC.pretrained``
        :return: the loaded annotator with NLU input/output columns set
        """
        annotator = WhisperForCTC.pretrained(name, language, bucket)
        annotator = annotator.setInputCols("audio_assembler")
        return annotator.setOutputCol("text")
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from sparknlp.annotator import RoBertaSentenceEmbeddings


class RobertaSentence:
    """Factory for ``RoBertaSentenceEmbeddings`` annotators pre-wired with NLU column names.

    Every model returned here reads from the ``sentence`` column and writes
    to the ``sentence_embeddings`` column.
    """

    @staticmethod
    def get_default_model():
        """Return ``RoBertaSentenceEmbeddings.pretrained()`` with the NLU columns set."""
        embedder = RoBertaSentenceEmbeddings.pretrained()
        embedder = embedder.setInputCols("sentence")
        return embedder.setOutputCol("sentence_embeddings")

    @staticmethod
    def get_pretrained_model(name, language, bucket=None):
        """Return the pretrained model identified by ``name``/``language``/``bucket``.

        :param name: model name forwarded to ``RoBertaSentenceEmbeddings.pretrained``
        :param language: language code forwarded to ``RoBertaSentenceEmbeddings.pretrained``
        :param bucket: optional remote location forwarded to ``RoBertaSentenceEmbeddings.pretrained``
        :return: the loaded annotator with NLU input/output columns set
        """
        embedder = RoBertaSentenceEmbeddings.pretrained(name, language, bucket)
        embedder = embedder.setInputCols('sentence')
        return embedder.setOutputCol("sentence_embeddings")



Empty file.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
class IMAGE_TABLE_DETECTOR:
    """Factory for the Spark OCR ``ImageTableDetector`` with NLU column names."""

    @staticmethod
    def get_default_model():
        """Load ``general_model_table_detection_v2`` (``en``, ``clinical/ocr``).

        The returned detector reads the ``ocr_image`` column and writes to ``region``.
        """
        # Imported inside the function so that sparkocr is only needed when this
        # component is actually requested.
        from sparkocr.transformers import ImageTableDetector

        detector = ImageTableDetector.pretrained(
            "general_model_table_detection_v2", "en", "clinical/ocr"
        )
        detector = detector.setInputCol("ocr_image")
        return detector.setOutputCol("region")
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
class ImageTableCellDetector:
    """Factory for the Spark OCR ``ImageTableCellDetector`` with NLU column names."""

    @staticmethod
    def get_default_model():
        """Build a cell detector using the ``morphops`` algorithm.

        The returned transformer reads ``image_region`` and writes ``ocr_table_cells``.
        """
        # Aliased so the OCR transformer is not confused with this wrapper class,
        # which carries the same name.
        from sparkocr.transformers import ImageTableCellDetector as _OcrTableCellDetector

        cell_detector = _OcrTableCellDetector()
        cell_detector = cell_detector.setInputCol("image_region")
        cell_detector = cell_detector.setAlgoType("morphops")
        return cell_detector.setOutputCol("ocr_table_cells")
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
class ImageTable2Cell2TextTable:
    """Factory for the Spark OCR ``ImageCellsToTextTable`` with NLU column names."""

    @staticmethod
    def get_default_model():
        """Build the cells-to-text-table transformer.

        It reads images from ``image_region`` and cells from ``ocr_table_cells``,
        and writes its result to ``ocr_table``.
        """
        from sparkocr.transformers import ImageCellsToTextTable

        table_builder = ImageCellsToTextTable()
        table_builder = table_builder.setInputCol("image_region")
        table_builder = table_builder.setCellsCol('ocr_table_cells')
        return table_builder.setOutputCol("ocr_table")

Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
class ImageSplitRegions:
    """Factory for the Spark OCR ``ImageSplitRegions`` with NLU column names."""

    @staticmethod
    def get_default_model():
        """Build the region splitter.

        It reads images from ``ocr_image`` and regions from ``region``, and
        writes the split images to ``image_region``.
        """
        # Aliased so the OCR transformer is not confused with this wrapper class,
        # which carries the same name.
        from sparkocr.transformers import ImageSplitRegions as _OcrImageSplitRegions

        splitter = _OcrImageSplitRegions()
        splitter = splitter.setInputCol("ocr_image")
        splitter = splitter.setInputRegionsCol("region")
        return splitter.setOutputCol("image_region")

# .setInputRegionsCol("ocr_table_16969+
#
#
#
# ") \
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
ElmoEmbeddings ,
E5Embeddings,
BertSentenceEmbeddings,
RoBertaSentenceEmbeddings,
UniversalSentenceEncoder,
InstructorEmbeddings,
SentenceEmbeddings,
Expand Down Expand Up @@ -88,6 +89,7 @@
ElmoEmbeddings ,
E5Embeddings,
BertSentenceEmbeddings,
RoBertaSentenceEmbeddings,
InstructorEmbeddings,
UniversalSentenceEncoder,
SentenceEmbeddings,
Expand Down
3 changes: 3 additions & 0 deletions nlu/pipe/col_substitution/substitution_map_OS.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
BertSentenceEmbeddings: {
'default': substitute_sent_embed_cols,
},
RoBertaSentenceEmbeddings: {
'default': substitute_sent_embed_cols,
},
InstructorEmbeddings: {
'default': substitute_sent_embed_cols,
},
Expand Down
2 changes: 1 addition & 1 deletion nlu/pipe/extractors/extractor_methods/ocr_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def extract_table(df):
return exploded_results.toPandas()


def extract_tables(df, rename_cols=True):
def extract_tables(df, rename_cols=False):
df = df.withColumn("table_index", df.ocr_table.area.index)
# pagennum
pandas_tables = []
Expand Down
2 changes: 2 additions & 0 deletions nlu/pipe/nlu_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def __init__(self,
requires_binary_format: bool = False, # Set to true for OCR annotators that require binary image format
requires_image_format: bool = False, # Set to true for OCR annotators that require image format
is_visual_annotator: bool = False, # Set to true for OCR annotators that require image format
is_light_pipe_incompatible: bool = False, # Set to true for annotators that cannot be run inside a Spark NLP LightPipeline
):
self.name = name
self.type = type
Expand Down Expand Up @@ -116,6 +117,7 @@ def __init__(self,
self.requires_binary_format = requires_binary_format
self.requires_image_format = requires_image_format
self.is_visual_annotator = is_visual_annotator
self.is_light_pipe_incompatible = is_light_pipe_incompatible

def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel],
nlu_ref: str,
Expand Down
27 changes: 3 additions & 24 deletions nlu/pipe/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def __init__(self):
self.has_table_qa_models = False
self.requires_image_format = False
self.requires_binary_format = False

self.is_light_pipe_incompatible = False
def add(self, component: NluComponent, nlu_reference=None, pretrained_pipe_component=False,
name_to_add='', idx=None):
'''
Expand Down Expand Up @@ -203,7 +203,8 @@ def fit(self, dataset=None, dataset_path=None, label_seperator=','):
logger.info(
'Fitting on empty Dataframe, could not infer correct training method. This is intended for non-trainable pipelines.')
self.vanilla_transformer_pipe = self.spark_estimator_pipe.fit(self.get_sample_spark_dataframe())
self.light_transformer_pipe = LightPipeline(self.vanilla_transformer_pipe)
if not self.is_light_pipe_incompatible:
self.light_transformer_pipe = LightPipeline(self.vanilla_transformer_pipe)

self.has_trainable_components = False
self.is_fitted = True
Expand Down Expand Up @@ -452,28 +453,6 @@ def save(self, path, component='entire_pipeline', overwrite=True):
else:
self[component].save(path)

def predict_embeds(self,
                   data,
                   multithread=True,
                   return_spark_df=False,
                   ):
    '''
    Annotate the given data and return only the embeddings.

    Accepts a Pandas DataFrame / Pandas Series / Numpy Array / Spark DataFrame /
    Python list of strings / Python string. Irrelevant columns are always dropped.
    :param data: Data to predict on
    :param multithread: Use multithreaded Light-pipeline instead of spark-pipeline
    :param return_spark_df: Return the Spark DataFrame right after transforming with the
        Spark NLP pipeline instead of collecting all data into the driver as a list of floats
    :return: List of floats, or a Spark DataFrame when return_spark_df is set
    '''
    from nlu.pipe.utils.predict_helper import __predict__

    # Fixed settings that restrict the generic prediction routine to embeddings only.
    embed_only_options = dict(
        output_level=None,
        positions=False,
        keep_stranger_features=False,
        metadata=False,
        multithread=multithread,
        drop_irrelevant_cols=True,
        return_spark_df=return_spark_df,
        get_embeddings=True,
        embed_only=True,
    )
    return __predict__(self, data, **embed_only_options)

def predict(self,
data,
output_level='',
Expand Down
3 changes: 3 additions & 0 deletions nlu/pipe/utils/pipe_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,7 @@ def add_metadata_to_pipe(pipe: NLUPipeline):
if c.license == Licenses.open_source \
and c.name != NLP_NODE_IDS.WAV2VEC_FOR_CTC \
and c.name != NLP_NODE_IDS.HUBERT_FOR_CTC \
and c.name != NLP_NODE_IDS.WHISPER_FOR_CTC \
and c.name != NLP_NODE_IDS.AUDIO_ASSEMBLER:
# TODO Table Assembler/VIT/ Other non txt open source
pipe.has_nlp_components = True
Expand All @@ -701,6 +702,8 @@ def add_metadata_to_pipe(pipe: NLUPipeline):
pipe.has_nlp_components = False
if c.jsl_anno_py_class == 'ImageAssembler':
pipe.contains_ocr_components = True
if c.is_light_pipe_incompatible:
pipe.is_light_pipe_incompatible = True

return pipe

Expand Down
3 changes: 2 additions & 1 deletion nlu/pipe/utils/predict_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,8 @@ def __predict__(pipe, data, output_level, positions, keep_stranger_features, met
else:
pipe.fit()

pipe.__configure_light_pipe_usage__(DataConversionUtils.size_of(data), multithread)
if not pipe.is_light_pipe_incompatible:
pipe.__configure_light_pipe_usage__(DataConversionUtils.size_of(data), multithread)

if pipe.contains_ocr_components and pipe.contains_audio_components:
""" Idea:
Expand Down
Loading

0 comments on commit da5c47c

Please sign in to comment.