Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added image table detector annotator #221

Merged
merged 3 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
class IMAGE_TABLE_DETECTOR:
    """NLU wrapper exposing the Spark OCR pretrained table-detection annotator."""

    @staticmethod
    def get_default_model():
        """Build the pretrained ``ImageTableDetector`` with NLU's default columns.

        Reads page images from the ``ocr_image`` column and writes detected
        table bounding regions to the ``region`` column. ``sparkocr`` is
        imported lazily so the licensed library is only required when this
        component is actually constructed.
        """
        from sparkocr.transformers import ImageTableDetector
        detector = ImageTableDetector.pretrained(
            "general_model_table_detection_v2", "en", "clinical/ocr"
        )
        detector = detector.setInputCol("ocr_image")
        detector = detector.setOutputCol("region")
        return detector
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
class ImageTableCellDetector:
    """NLU wrapper exposing the Spark OCR table-cell detection annotator."""

    @staticmethod
    def get_default_model():
        """Build an ``ImageTableCellDetector`` with NLU's default configuration.

        Consumes cropped table images from ``image_region`` and emits detected
        cell coordinates to ``ocr_table_cells``, using the morphological
        operations algorithm (``"morphops"``). ``sparkocr`` is imported lazily
        (licensed dependency).
        """
        from sparkocr.transformers import ImageTableCellDetector
        cell_detector = ImageTableCellDetector()
        cell_detector = cell_detector.setInputCol("image_region")
        cell_detector = cell_detector.setAlgoType("morphops")
        cell_detector = cell_detector.setOutputCol("ocr_table_cells")
        return cell_detector
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
class ImageTable2Cell2TextTable:
    """NLU wrapper exposing the Spark OCR cells-to-text-table annotator."""

    @staticmethod
    def get_default_model():
        """Build an ``ImageCellsToTextTable`` with NLU's default columns.

        Runs OCR over the cell regions found in ``ocr_table_cells`` within the
        table image in ``image_region`` and assembles the recognized text into
        a structured table in ``ocr_table``. ``sparkocr`` is imported lazily
        (licensed dependency).
        """
        from sparkocr.transformers import ImageCellsToTextTable
        to_text_table = ImageCellsToTextTable()
        to_text_table = to_text_table.setInputCol("image_region")
        to_text_table = to_text_table.setCellsCol("ocr_table_cells")
        to_text_table = to_text_table.setOutputCol("ocr_table")
        return to_text_table

Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
class ImageSplitRegions:
    """NLU wrapper exposing the Spark OCR region-splitting utility."""

    @staticmethod
    def get_default_model():
        """Build an ``ImageSplitRegions`` transformer with NLU's default columns.

        Crops each detected table region (``region`` column) out of the full
        page image (``ocr_image`` column) and outputs one sub-image per region
        in ``image_region``, ready for downstream cell detection.
        ``sparkocr`` is imported lazily (licensed dependency).
        """
        from sparkocr.transformers import ImageSplitRegions
        return ImageSplitRegions() \
            .setInputCol("ocr_image") \
            .setInputRegionsCol("region") \
            .setOutputCol("image_region")
2 changes: 1 addition & 1 deletion nlu/pipe/extractors/extractor_methods/ocr_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def extract_table(df):
return exploded_results.toPandas()


def extract_tables(df, rename_cols=True):
def extract_tables(df, rename_cols=False):
df = df.withColumn("table_index", df.ocr_table.area.index)
C-K-Loan marked this conversation as resolved.
Show resolved Hide resolved
# pagennum
pandas_tables = []
Expand Down
2 changes: 2 additions & 0 deletions nlu/pipe/nlu_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def __init__(self,
requires_binary_format: bool = False, # Set to true for OCR annotators that require binary image format
requires_image_format: bool = False, # Set to true for OCR annotators that require image format
is_visual_annotator: bool = False, # Set to true for OCR annotators that require image format
is_light_pipe_incompatible: bool = False, # Set to true for OCR annotators that require image format
):
self.name = name
self.type = type
Expand Down Expand Up @@ -116,6 +117,7 @@ def __init__(self,
self.requires_binary_format = requires_binary_format
self.requires_image_format = requires_image_format
self.is_visual_annotator = is_visual_annotator
self.is_light_pipe_incompatible = is_light_pipe_incompatible

def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel],
nlu_ref: str,
Expand Down
27 changes: 3 additions & 24 deletions nlu/pipe/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def __init__(self):
self.has_table_qa_models = False
self.requires_image_format = False
self.requires_binary_format = False

self.is_light_pipe_incompatible = False
def add(self, component: NluComponent, nlu_reference=None, pretrained_pipe_component=False,
name_to_add='', idx=None):
'''
Expand Down Expand Up @@ -203,7 +203,8 @@ def fit(self, dataset=None, dataset_path=None, label_seperator=','):
logger.info(
'Fitting on empty Dataframe, could not infer correct training method. This is intended for non-trainable pipelines.')
self.vanilla_transformer_pipe = self.spark_estimator_pipe.fit(self.get_sample_spark_dataframe())
self.light_transformer_pipe = LightPipeline(self.vanilla_transformer_pipe)
if not self.is_light_pipe_incompatible:
self.light_transformer_pipe = LightPipeline(self.vanilla_transformer_pipe)

self.has_trainable_components = False
self.is_fitted = True
Expand Down Expand Up @@ -452,28 +453,6 @@ def save(self, path, component='entire_pipeline', overwrite=True):
else:
self[component].save(path)

def predict_embeds(self,
data,
multithread=True,
return_spark_df=False,
):
'''
Annotates a Pandas Dataframe/Pandas Series/Numpy Array/Spark DataFrame/Python List strings /Python String abd returns List of Floats or Spark-Df, only with embeddings.
:param data: Data to predict on
and drop_irrelevant_cols = True then chunk, sentence and Doc will be dropped
:param return_spark_df: Prediction results will be returned right after transforming with the Spark NLP pipeline
This will run fully distributed in on the Spark Master, but not prettify the output dataframe
:param return_spark_df: return Spark-DF and not collect all data into driver instead of returning list of float
:param multithread: Use multithreaded Light-pipeline instead of spark-pipeline

:return:
'''
from nlu.pipe.utils.predict_helper import __predict__
return __predict__(self, data, output_level=None, positions=False, keep_stranger_features=False, metadata=False,
multithread=multithread,
drop_irrelevant_cols=True, return_spark_df=return_spark_df, get_embeddings=True,
embed_only=True)

def predict(self,
data,
output_level='',
Expand Down
2 changes: 2 additions & 0 deletions nlu/pipe/utils/pipe_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,8 @@ def add_metadata_to_pipe(pipe: NLUPipeline):
pipe.has_nlp_components = False
if c.jsl_anno_py_class == 'ImageAssembler':
pipe.contains_ocr_components = True
if c.is_light_pipe_incompatible:
pipe.is_light_pipe_incompatible = True

return pipe

Expand Down
3 changes: 2 additions & 1 deletion nlu/pipe/utils/predict_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,8 @@ def __predict__(pipe, data, output_level, positions, keep_stranger_features, met
else:
pipe.fit()

pipe.__configure_light_pipe_usage__(DataConversionUtils.size_of(data), multithread)
if not pipe.is_light_pipe_incompatible:
pipe.__configure_light_pipe_usage__(DataConversionUtils.size_of(data), multithread)

if pipe.contains_ocr_components and pipe.contains_audio_components:
""" Idea:
Expand Down
36 changes: 21 additions & 15 deletions nlu/spellbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -11155,27 +11155,30 @@ class Spellbook:
'jpg2text': OCR_NODE_IDS.IMAGE2TEXT, # Alias for img2text
'pdf2text': OCR_NODE_IDS.PDF2TEXT,
'doc2text': OCR_NODE_IDS.DOC2TEXT,

# 'image_table_detector': OCR_NODE_IDS.IMAGE_TABLE_DETECTOR,
'image_table_cell_detector': OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR,
'image_table_cell2text_table': OCR_NODE_IDS.IMAGE_TABLE_CELL2TEXT_TABLE,
'pdf2table': OCR_NODE_IDS.PDF2TEXT_TABLE,
'doc2table': OCR_NODE_IDS.DOC2TEXT_TABLE,
'ppt2table': OCR_NODE_IDS.PPT2TEXT_TABLE,
'classify.image': OCR_NODE_IDS.VISUAL_DOCUMENT_CLASSIFIER,
'en.classify_image.tabacco': 'visual_document_classifier_tobacco3482'
'en.classify_image.tabacco': 'visual_document_classifier_tobacco3482',
'en.image_table_detector':'general_model_table_detection_v2'
}

ocr_model_references = {
'img2text': OCR_NODE_IDS.IMAGE2TEXT,
'png2text': OCR_NODE_IDS.IMAGE2TEXT, # Alias for img2text
'jpg2text': OCR_NODE_IDS.IMAGE2TEXT, # Alias for img2text
'pdf2text': OCR_NODE_IDS.PDF2TEXT,
'doc2text': OCR_NODE_IDS.DOC2TEXT,

'pdf2table': OCR_NODE_IDS.PDF2TEXT_TABLE,
'doc2table': OCR_NODE_IDS.DOC2TEXT_TABLE,
'ppt2table': OCR_NODE_IDS.PPT2TEXT_TABLE,
'classify.image': OCR_NODE_IDS.VISUAL_DOCUMENT_CLASSIFIER,
'en.classify_image.tabacco': 'visual_document_classifier_tobacco3482'
}
# ocr_model_references = {
# 'img2text': OCR_NODE_IDS.IMAGE2TEXT,
# 'png2text': OCR_NODE_IDS.IMAGE2TEXT, # Alias for img2text
# 'jpg2text': OCR_NODE_IDS.IMAGE2TEXT, # Alias for img2text
# 'pdf2text': OCR_NODE_IDS.PDF2TEXT,
# 'doc2text': OCR_NODE_IDS.DOC2TEXT,
#
# 'pdf2table': OCR_NODE_IDS.PDF2TEXT_TABLE,
# 'doc2table': OCR_NODE_IDS.DOC2TEXT_TABLE,
# 'ppt2table': OCR_NODE_IDS.PPT2TEXT_TABLE,
# 'classify.image': OCR_NODE_IDS.VISUAL_DOCUMENT_CLASSIFIER,
# 'en.classify_image.tabacco': 'visual_document_classifier_tobacco3482'
# }
# map storage ref to nlu refner_jsl
storage_ref_2_nlu_ref = {'ar': {'glove_300d': 'ar.embed'},
'bn': {'bengali_cc_300d': 'bn.embed.glove',
Expand Down Expand Up @@ -16023,6 +16026,9 @@ class Spellbook:
'image_classifier_vit_where_am_I_hospital_balcony_hallway_airport_coffee_house_apartment_office': 'ViTForImageClassification',
'image_classifier_vit_world_landmarks': 'ViTForImageClassification',
'image_classifier_convnext_tiny_224_local':'ConvNextImageClassifier',
'general_model_table_detection_v2': 'ImageTableDetector',
'image_table_cell_detector': 'ImageTableCellDetector',
'image_table_cell2text_table': 'ImageCellsToTextTable',
'instructor_large':'InstructorEmbeddings',
'instructor_base':'InstructorEmbeddings',
'initial_model': 'MPNetEmbeddings',
Expand Down
5 changes: 4 additions & 1 deletion nlu/universe/annotator_class_universe.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,10 @@ class AnnoClassRef:
OCR_NODE_IDS.TEXT2PDF: 'TextToPdf',
OCR_NODE_IDS.VISUAL_DOCUMENT_CLASSIFIER: 'VisualDocumentClassifier',
OCR_NODE_IDS.IMAGE2HOCR: 'ImageToHocr',

OCR_NODE_IDS.IMAGE_TABLE_DETECTOR: 'ImageTableDetector',
OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR: 'ImageTableCellDetector',
OCR_NODE_IDS.IMAGE_TABLE_CELL2TEXT_TABLE: 'ImageCellsToTextTable',
OCR_NODE_IDS.IMAGE_SPLIT_REGIONS: 'ImageSplitRegions',
}

@staticmethod
Expand Down
84 changes: 84 additions & 0 deletions nlu/universe/component_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,10 @@
from nlu.ocr_components.text_recognizers.pdf2text.pdf2text import Pdf2Text
from nlu.ocr_components.utils.binary2image.binary2image import Binary2Image
from nlu.ocr_components.utils.image2hocr.image2hocr import Image2Hocr
from nlu.ocr_components.table_extractors.image2table.image2table import IMAGE_TABLE_DETECTOR
from nlu.ocr_components.table_extractors.image2table_cell.image2table_cell import ImageTableCellDetector
from nlu.ocr_components.table_extractors.image_table_cell2text.image_table_cell2text import ImageTable2Cell2TextTable
from nlu.ocr_components.utils.image_split_regions.image_split_regions import ImageSplitRegions
# from nlu.ocr_components.visual_classifiers.visual_doc_classifier.visual_doc_classifier import VisualDocClassifier
from nlu.pipe.col_substitution.col_substitution_HC import *
from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols
Expand Down Expand Up @@ -4171,6 +4175,86 @@ class ComponentUniverse:
# applicable_file_types=['JPG', 'JPEG']
# ),
#


O_A.IMAGE_TABLE_CELL_DETECTOR: partial(NluComponent,
name=O_A.IMAGE_TABLE_CELL_DETECTOR,
type=T.TEXT_RECOGNIZER,
get_default_model= ImageTableCellDetector.get_default_model,
pdf_extractor_methods={'default': default_text_recognizer_config},
pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor
output_level=L.DOCUMENT, # TODO new output level IMG? Or treat as DOC?
node=OCR_FEATURE_NODES.nodes[O_A.IMAGE_TABLE_CELL_DETECTOR],
description='Recognize text from image files',
provider=ComponentBackends.ocr,
license=Licenses.ocr,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=O_A.IMAGE_TABLE_CELL_DETECTOR,
jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.IMAGE_TABLE_CELL_DETECTOR],
applicable_file_types=['JPEG', 'PNG', 'BMP', 'WBMP', 'GIF', 'JPG', '.TIFF'],
is_light_pipe_incompatible=True
),

O_A.IMAGE_TABLE_CELL2TEXT_TABLE: partial(NluComponent,
name=O_A.IMAGE_TABLE_CELL2TEXT_TABLE,
type=T.TEXT_RECOGNIZER,
get_default_model=ImageTable2Cell2TextTable.get_default_model,
pdf_extractor_methods={'default': default_text_recognizer_config},
pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor
output_level=L.DOCUMENT, # TODO new output level IMG? Or treat as DOC?
node=OCR_FEATURE_NODES.nodes[O_A.IMAGE_TABLE_CELL2TEXT_TABLE],
description='Recognize text from image files',
provider=ComponentBackends.ocr,
license=Licenses.ocr,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=O_A.IMAGE_TABLE_CELL2TEXT_TABLE,
jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.IMAGE_TABLE_CELL2TEXT_TABLE],
applicable_file_types=['JPEG', 'PNG', 'BMP', 'WBMP', 'GIF', 'JPG', '.TIFF'],
is_light_pipe_incompatible=True
),

O_A.IMAGE_TABLE_DETECTOR: partial(NluComponent,
name=O_A.IMAGE_TABLE_DETECTOR,
type=T.TABLE_RECOGNIZER,
get_default_model=IMAGE_TABLE_DETECTOR.get_default_model,
pdf_extractor_methods={'default': default_binary_to_image_config},
pdf_col_name_substitutor=substitute_recognized_text_cols,
output_level=L.DOCUMENT,
node=OCR_FEATURE_NODES.nodes[O_A.IMAGE_TABLE_DETECTOR],
description='Detect Tables from Images',
provider=ComponentBackends.ocr,
license=Licenses.ocr,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=O_A.IMAGE_TABLE_DETECTOR,
jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.IMAGE_TABLE_DETECTOR],
applicable_file_types=['PDF','JPEG', 'PNG'],
is_light_pipe_incompatible=True
),

O_A.IMAGE_SPLIT_REGIONS: partial(NluComponent,
name=O_A.IMAGE_SPLIT_REGIONS,
type=T.OCR_UTIL,
get_default_model=ImageSplitRegions.get_default_model,
pdf_extractor_methods={'default': default_binary_to_image_config},
pdf_col_name_substitutor=substitute_recognized_text_cols,
output_level=L.DOCUMENT,
node=OCR_FEATURE_NODES.nodes[O_A.IMAGE_SPLIT_REGIONS],
description='Convert Image to split regions',
provider=ComponentBackends.ocr,
license=Licenses.ocr,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=O_A.IMAGE_SPLIT_REGIONS,
jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[
O_A.IMAGE_SPLIT_REGIONS],
applicable_file_types=['DOCX', 'DOC', 'JPEG', 'PNG'],
is_light_pipe_incompatible=True
),


O_A.IMAGE2HOCR: partial(NluComponent,
name=O_A.IMAGE2HOCR,
type=T.OCR_UTIL,
Expand Down
Loading