
Commit

Merge remote-tracking branch 'origin/visual-ner' into release/531
# Conflicts:
#	nlu/pipe/col_substitution/col_name_substitution_utils.py
#	nlu/pipe/col_substitution/substitution_map_OCR.py
#	nlu/universe/component_universes.py
C-K-Loan committed Apr 8, 2024
2 parents a758f64 + 32609ec commit 28ca5a0
Showing 23 changed files with 1,179 additions and 40 deletions.
968 changes: 968 additions & 0 deletions examples/colab/ocr/ocr_visual_document_ner.ipynb

Large diffs are not rendered by default.
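
Since the notebook diff is not rendered, here is a minimal usage sketch of the new visual NER pipeline it introduces, mirroring the test added in tests/nlu_ocr_tests/ocr_visual_document_ner.py (this assumes a valid Spark OCR / visual license and the bundled test image tests/datasets/ocr/images/ocr_ner.png):

from johnsnowlabs import nlp
import nlu

nlp.start(visual=True)  # start a Spark session with the visual/OCR jars
pipe = nlu.load('en.lilt_roberta_funds.v1')  # resolves to the lilt_roberta_funsd_v1 VisualDocumentNer model
df = pipe.predict('ocr_ner.png', output_level='chunk')
print(df)

Based on substitute_document_ner_cols below, the returned DataFrame should expose columns such as entities_ner_entity, entities_confidence, entities_x_location and entities_y_location, plus the image path as a stranger feature.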

36 changes: 18 additions & 18 deletions nlu/components/embeddings/sentence_mpnet/MPNetSentenceEmbedding.py
@@ -1,18 +1,18 @@
from sparknlp.annotator import MPNetEmbeddings


class MPNetSentence:
@staticmethod
def get_default_model():
return MPNetEmbeddings.pretrained() \
.setInputCols(["documents"]) \
.setOutputCol("mpnet_embeddings")

@staticmethod
def get_pretrained_model(name, language, bucket=None):
return MPNetEmbeddings.pretrained(name,language,bucket) \
.setInputCols(["documents"]) \
.setOutputCol("mpnet_embeddings")



# from sparknlp.annotator import MPNetEmbeddings
#
#
# class MPNetSentence:
# @staticmethod
# def get_default_model():
# return MPNetEmbeddings.pretrained() \
# .setInputCols(["documents"]) \
# .setOutputCol("mpnet_embeddings")
#
# @staticmethod
# def get_pretrained_model(name, language, bucket=None):
# return MPNetEmbeddings.pretrained(name,language,bucket) \
# .setInputCols(["documents"]) \
# .setOutputCol("mpnet_embeddings")
#
#
#
Empty file.
7 changes: 7 additions & 0 deletions nlu/ocr_components/utils/hocr_tokenizer/hocr_tokenizer.py
@@ -0,0 +1,7 @@
class HocrTokenizer:
@staticmethod
def get_default_model():
from sparkocr.transformers import HocrTokenizer
return HocrTokenizer() \
.setInputCol("hocr") \
.setOutputCol("text_tokenized")
Empty file.
Empty file.
@@ -0,0 +1,8 @@
class VisualDocumentNer:
@staticmethod
def get_default_model():
from sparkocr.transformers import VisualDocumentNer
return VisualDocumentNer()\
.pretrained("lilt_roberta_funsd_v1", "en", "clinical/ocr")\
.setInputCols(["text_tokenized", "image"])\
.setOutputCol("text_entity")
24 changes: 20 additions & 4 deletions nlu/pipe/col_substitution/col_name_substitution_utils.py
@@ -14,9 +14,6 @@

import nlu
from nlu.pipe.col_substitution import substitution_map_OS
from nlu.universe.feature_universes import NLP_FEATURES
from nlu.pipe.col_substitution import substitution_map_OS
from nlu.pipe.col_substitution import col_substitution_OS
import logging

from nlu.pipe.extractors.extractor_base_data_classes import SparkOCRExtractorConfig
@@ -139,7 +136,26 @@ def get_final_output_cols_of_component(c, df, anno_2_ex) -> List[str]:
result_cols = []
if isinstance(configs, SparkOCRExtractorConfig):
# TODO better OCR-EX handling --> Col Name generator function which we use everywhere for unified col naming !!!!!
return ['text']
# return ['text']
for col in df.columns:
if 'meta_' + configs.output_col_prefix in col:
base_meta_prefix = 'meta_' + configs.output_col_prefix
meta_col_name = base_meta_prefix + col.split(base_meta_prefix)[-1]
if meta_col_name in df.columns:
# special case for overlapping names with _
if col.split(base_meta_prefix)[-1].split('_')[1].isnumeric() and not \
c.spark_output_column_names[0].split('_')[-1].isnumeric(): continue
if col.split(base_meta_prefix)[-1].split('_')[1].isnumeric() and \
c.spark_output_column_names[0].split('_')[-1].isnumeric():
id1 = int(col.split(base_meta_prefix)[-1].split('_')[1])
id2 = int(c.spark_output_column_names[0].split('_')[-1])
if id1 != id2: continue
result_cols.append(meta_col_name)
elif c.type == AnnoTypes.CHUNK_CLASSIFIER:
result_cols.append(col)
else:
logger.info(f"Could not find meta col for os_components={c}, col={col}. Ommiting col..")
return result_cols
if isinstance(c.model, MultiDocumentAssembler):
return [f'{NLP_FEATURES.DOCUMENT_QUESTION}_results', f'{NLP_FEATURES.DOCUMENT_QUESTION_CONTEXT}_results']

43 changes: 43 additions & 0 deletions nlu/pipe/col_substitution/col_substitution_OCR.py
@@ -53,3 +53,46 @@ def substitute_document_classifier_text_cols(c, cols, is_unique=True, nlu_identi
# else : logger.info(f'Dropping unmatched metadata_col={col} for c={c}')
# # new_cols[col]= f"{new_base_name}_confidence"
# return new_cols
def substitute_document_classifier_text_cols(c, cols, is_unique=True, nlu_identifier=''):
"""
Substitute column names produced by the visual document classifier.
Mapped fields are:
- visual_classifier_label.1 -> file_path
- visual_classifier_label -> visual_classifier_prediction
All other columns keep their original names.
"""
new_cols = {}
for col in cols:
if 'visual_classifier_label.1' in cols:
new_cols['visual_classifier_label.1'] = 'file_path'
if 'visual_classifier_label' in cols:
new_cols['visual_classifier_label'] = 'visual_classifier_prediction'

new_cols[col] = col
return new_cols  # TODO

def substitute_document_ner_cols(c, cols, nlu_identifier):
"""
Substitute column names produced by the visual document NER annotator.
Mapped fields are:
- meta_text_entity_confidence -> <base>_confidence
- meta_text_entity_token -> <base>_ner_entity
- meta_text_entity_x -> <base>_x_location
- meta_text_entity_y -> <base>_y_location
where <base> is 'entities' if the component is unique, otherwise 'entities_<nlu_identifier>'.
"""
new_cols = {}
new_base_name = 'entities' if nlu_identifier == 'UNIQUE' else f'entities_{nlu_identifier}'
for col in cols:
if '_ocr_confidence' in col:
new_cols['meta_text_entity_confidence'] = f'{new_base_name}_confidence'
if '_token' in col:
new_cols['meta_text_entity_token'] = f'{new_base_name}_ner_entity'
if '_entity_x' in col:
new_cols['meta_text_entity_x'] = f'{new_base_name}_x_location'
if '_entity_y' in col:
new_cols['meta_text_entity_y'] = f'{new_base_name}_y_location'

# new_cols[col] = col
return new_cols
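
For illustration, a minimal sketch of how the new NER substitution behaves (the raw column names below are hypothetical, and since the component argument is unused by the function, None is passed):

from nlu.pipe.col_substitution.col_substitution_OCR import substitute_document_ner_cols

raw_cols = ['meta_text_entity_confidence_ocr_confidence', 'meta_text_entity_token']
print(substitute_document_ner_cols(None, raw_cols, nlu_identifier='UNIQUE'))
# {'meta_text_entity_confidence': 'entities_confidence', 'meta_text_entity_token': 'entities_ner_entity'}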

14 changes: 4 additions & 10 deletions nlu/pipe/col_substitution/substitution_map_OCR.py
@@ -1,13 +1,11 @@
"""
Resolve Annotator Classes in the Pipeline to Extractor Configs and Methods
Every Annotator should have 2 configs. Some might offer multiple configs/method pairs, based on model_anno_obj/NLP reference.
- default/minimalistic -> Just the results of the annotations, no confidences or extra metadata
- with meta -> A config that leverages white/black list and gets the most relevant metadata
- with positions -> With Begins/Ends
- with sentence references -> Return the sentence/chunk no. reference from the metadata.
If a document has multiple sentences, this will map a label back to its corresponding sentence
"""
# from nlu.pipe.col_substitution.col_substitution_HC import *
from nlu.pipe.col_substitution.col_substitution_OS import *
@@ -16,15 +14,11 @@
from sparkocr.transformers import *

OCR_anno2substitution_fn = {
VisualDocumentClassifier : {
VisualDocumentClassifier : {
'default': substitute_document_classifier_text_cols ,
},
VisualDocumentNerLilt : {
'default': substitute_document_ner_cols,
},

}







6 changes: 3 additions & 3 deletions nlu/pipe/col_substitution/substitution_map_OS.py
@@ -79,9 +79,9 @@
SentenceEmbeddings: {
'default': substitute_sent_embed_cols,
},
MPNetEmbeddings: {
'default': substitute_sent_embed_cols,
},
# MPNetEmbeddings: {
# 'default': substitute_sent_embed_cols,
# },
Tokenizer: {
'default': substitute_tokenizer_cols,
},
2 changes: 1 addition & 1 deletion nlu/pipe/extractors/extractor_base_data_classes.py
@@ -142,7 +142,7 @@ class SparkOCRExtractorConfig(SparkNLPExtractorConfig):
get_image_resolution: bool = field(default=False)
get_image_data: bool = field(default=False)
# General OCR fields
# get_path :bool = field(default = False)# origin is path
get_path: bool = field(default=False)# origin is path
get_modification_time: bool = field(default=False)
get_length: bool = field(default=False)
get_page_num: bool = field(default=False)
16 changes: 16 additions & 0 deletions nlu/pipe/extractors/extractor_configs_OCR.py
@@ -28,6 +28,22 @@ def default_visual_classifier_config(output_col_prefix='visual_classifier'):
description='Gets label and confidence of visual classifier',
)

def default_visual_ner_config(output_col_prefix='visual_ocr'):
return SparkOCRExtractorConfig(
get_text=True,
get_begin=True,
get_end=True,
get_result=True,
get_meta=True,
get_full_meta=True,
get_image_data=True,
get_path=True,
get_annotator_type=False,
output_col_prefix=output_col_prefix,
meta_white_list=['entity', 'confidence', 'sentence', 'chunk'],
name='visual_ner label, confidence and entities ',
description='Gets label, entities and confidence of visual ner',
)

def default_binary_to_image_config(output_col_prefix='binary_image'):
return SparkOCRExtractorConfig(
10 changes: 8 additions & 2 deletions nlu/pipe/extractors/extractor_methods/base_extractor_methods.py
@@ -80,11 +80,15 @@ def extract_base_sparkocr_features(row: pd.Series, configs: SparkOCRExtractorCon
else:
return {'visual_classifier_confidence': row}

# if 'FULL binary to image extractor ' in configs.name:
# if not isinstance(row, str):
# return {'path': row}


else:
# # OCR unpackers (TODO WIP)
# unpack_text = lambda x: unpack_dict_list(x, 'text')
# # unpack_image = lambda x : unpack_dict_list(x, 'TODO') # is data?
# unpack_image_origin = lambda x: unpack_dict_list(x, 'origin')
# # unpack_image = lambda x : unpack_dict_list(x, 'TODO') # is data? # unpack_image_origin = lambda x: unpack_dict_list(x, 'origin')
# unpack_image_height = lambda x: unpack_dict_list(x, 'height')
# unpack_image_width = lambda x: unpack_dict_list(x, 'width')
# unpack_image_n_channels = lambda x: unpack_dict_list(x, 'nChannels')
@@ -317,6 +321,8 @@ def apply_extractors_and_merge(df, anno_2_ex_config, keep_stranger_features, str
extractor = lambda c: df[c].apply(extract_master, configs=anno_2_ex_config[c])
keep_strangers = lambda c: df[c]

if 'path' in df.columns and 'text_entity' in anno_2_ex_config.keys():
    stranger_features.append('path')

# merged_extraction_df
# apply the extract_master together with its configs to every column and generate a list of output DFs, one per Spark NLP COL
# TODO handle MULTI-COL-OUTPUT. If Anno has multi cols, then we either needs multiple keys in anno_2_ex or use something besides
4 changes: 3 additions & 1 deletion nlu/spellbook.py
@@ -11320,7 +11320,8 @@ class Spellbook:
'ppt2table': OCR_NODE_IDS.PPT2TEXT_TABLE,
'classify.image': OCR_NODE_IDS.VISUAL_DOCUMENT_CLASSIFIER,
'en.classify_image.tabacco': 'visual_document_classifier_tobacco3482',
'en.image_table_detector':'general_model_table_detection_v2'
'en.image_table_detector':'general_model_table_detection_v2',
'en.lilt_roberta_funds.v1': 'lilt_roberta_funsd_v1',
}

# ocr_model_references = {
@@ -16296,6 +16297,7 @@ class Spellbook:
'general_model_table_detection_v2': 'ImageTableDetector',
'image_table_cell_detector': 'ImageTableCellDetector',
'image_table_cell2text_table': 'ImageCellsToTextTable',
'lilt_roberta_funsd_v1': 'VisualDocumentNer',
'instructor_large':'InstructorEmbeddings',
'instructor_base':'InstructorEmbeddings',
'initial_model': 'MPNetEmbeddings',
2 changes: 2 additions & 0 deletions nlu/universe/annotator_class_universe.py
@@ -314,6 +314,8 @@ class AnnoClassRef:
OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR: 'ImageTableCellDetector',
OCR_NODE_IDS.IMAGE_TABLE_CELL2TEXT_TABLE: 'ImageCellsToTextTable',
OCR_NODE_IDS.IMAGE_SPLIT_REGIONS: 'ImageSplitRegions',
OCR_NODE_IDS.VISUAL_DOCUMENT_NER: 'VisualDocumentNer',
OCR_NODE_IDS.HOCR_TOKENIZER: 'HocrTokenizer',
}

@staticmethod
46 changes: 46 additions & 0 deletions nlu/universe/component_universes.py
@@ -144,19 +144,23 @@
from nlu.ocr_components.text_recognizers.img2text.img2text import Img2Text
from nlu.ocr_components.text_recognizers.pdf2text.pdf2text import Pdf2Text
from nlu.ocr_components.utils.binary2image.binary2image import Binary2Image
from nlu.ocr_components.utils.hocr_tokenizer.hocr_tokenizer import HocrTokenizer
from nlu.ocr_components.utils.image2hocr.image2hocr import Image2Hocr
from nlu.ocr_components.table_extractors.image2table.image2table import IMAGE_TABLE_DETECTOR
from nlu.ocr_components.visual_ner.visual_document_ner.visual_document_ner import VisualDocumentNer
from nlu.ocr_components.table_extractors.image2table_cell.image2table_cell import ImageTableCellDetector
from nlu.ocr_components.table_extractors.image_table_cell2text.image_table_cell2text import ImageTable2Cell2TextTable
from nlu.ocr_components.utils.image_split_regions.image_split_regions import ImageSplitRegions
# from nlu.ocr_components.visual_classifiers.visual_doc_classifier.visual_doc_classifier import VisualDocClassifier
from nlu.pipe.col_substitution.col_substitution_HC import *
from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols, \
substitute_document_classifier_text_cols
from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols,substitute_document_ner_cols
from nlu.pipe.col_substitution.col_substitution_OS import *
from nlu.pipe.extractors.extractor_configs_HC import *
from nlu.pipe.extractors.extractor_configs_OCR import default_text_recognizer_config, default_binary_to_image_config, \
default_visual_classifier_config
from nlu.pipe.extractors.extractor_configs_OCR import default_text_recognizer_config, default_binary_to_image_config, default_visual_ner_config
from nlu.pipe.extractors.extractor_configs_OS import *
from nlu.pipe.nlu_component import NluComponent
from nlu.universe.annotator_class_universe import AnnoClassRef
@@ -4456,4 +4460,46 @@ class ComponentUniverse:
applicable_file_types=['DOCX', 'DOC'],
),

O_A.HOCR_TOKENIZER: partial(NluComponent,
name=O_A.HOCR_TOKENIZER,
type=T.OCR_UTIL,
get_default_model=HocrTokenizer.get_default_model,
# TODO Extractor
pdf_extractor_methods={'default': default_binary_to_image_config},
# TODO substitutor
pdf_col_name_substitutor=substitute_recognized_text_cols,
output_level=L.DOCUMENT,
node=OCR_FEATURE_NODES.nodes[O_A.HOCR_TOKENIZER],
description='Tokenize HOCR output into text tokens',
provider=ComponentBackends.ocr,
license=Licenses.ocr,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=O_A.HOCR_TOKENIZER,
jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[
O_A.HOCR_TOKENIZER],
applicable_file_types=['DOCX', 'DOC'],
),

O_A.VISUAL_DOCUMENT_NER: partial(NluComponent,
name=O_A.VISUAL_DOCUMENT_NER,
type=T.PDF_BUILDER,
get_default_model=VisualDocumentNer.get_default_model,
pdf_extractor_methods={'default': default_visual_ner_config},
# TODO Extractor
pdf_col_name_substitutor=substitute_document_ner_cols,
# TODO substitutor
output_level=L.CHUNK,
node=OCR_FEATURE_NODES.nodes[O_A.VISUAL_DOCUMENT_NER],
description='Recognize named entities in visual documents',
provider=ComponentBackends.ocr,
license=Licenses.ocr,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=O_A.VISUAL_DOCUMENT_NER,
jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[
O_A.VISUAL_DOCUMENT_NER],
applicable_file_types=['JPG', 'JPEG']
),

}
2 changes: 1 addition & 1 deletion nlu/universe/feature_node_ids.py
@@ -319,7 +319,7 @@ class OCR_NODE_IDS:
"""
# Visual Document Understanding
VISUAL_DOCUMENT_CLASSIFIER = JslAnnoId('visual_document_classifier')
VISUAL_DOCUMENT_NER = JslAnnoId('visual_document_NER')
VISUAL_DOCUMENT_NER = JslAnnoId('visual_document_ner')

# Object Detection
IMAGE_HANDWRITTEN_DETECTOR = JslAnnoId('image_handwritten_detector')
2 changes: 2 additions & 0 deletions nlu/universe/feature_node_universes.py
@@ -307,6 +307,8 @@ class OCR_FEATURE_NODES:
A.IMAGE2HOCR: OcrFeatureNode(A.IMAGE2HOCR, [F.OCR_IMAGE], [F.HOCR]),

# VISUAL_DOCUMENT_NER : OcrFeatureNode(A.VISUAL_DOCUMENT_NER, [OcrFeature.HOCR, OcrFeature.FILE_PATH], [NlpFeature.NER_Annotation]), # TODO NlpFeature Space!
A.VISUAL_DOCUMENT_NER: OcrFeatureNode(A.VISUAL_DOCUMENT_NER, [F.TEXT_DOCUMENT_TOKENIZED, F.OCR_IMAGE],
[F.TEXT_ENTITY]),

# Object Detection
A.IMAGE_SPLIT_REGIONS: OcrFeatureNode(A.IMAGE_SPLIT_REGIONS, [F.OCR_IMAGE,F.OCR_REGION], [F.IMG_SPLIT_REGIONS]),
2 changes: 2 additions & 0 deletions nlu/universe/feature_resolutions.py
@@ -114,5 +114,7 @@ class FeatureResolutions:
OCR_FEATURES.OCR_TABLE_CELLS: ResolvedFeature(OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR,
OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR, 'xx', False,
ComponentUniverse.components[OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR]),
OCR_FEATURES.TEXT_DOCUMENT_TOKENIZED: ResolvedFeature(OCR_NODE_IDS.HOCR_TOKENIZER, OCR_NODE_IDS.HOCR_TOKENIZER, 'xx', False,
ComponentUniverse.components[OCR_NODE_IDS.HOCR_TOKENIZER]),

}
Binary file added tests/datasets/ocr/images/ocr_ner.png
Binary file added tests/datasets/ocr/images/ocr_test.png
27 changes: 27 additions & 0 deletions tests/nlu_ocr_tests/ocr_visual_document_ner.py
@@ -0,0 +1,27 @@
import os
import sys

sys.path.append(os.getcwd())
import unittest
import nlu

os.environ["PYTHONPATH"] = "F:/Work/repos/nlu"
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
from johnsnowlabs import nlp, visual

# nlp.install(json_license_path='license.json',visual=True)
nlp.start(visual=True)

class OcrTest(unittest.TestCase):

def test_visual_document_ner(self):
# nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET)
# The text we generate a PDF from has to come from an image struct!
# We need to convert the text to an image struct!
p = nlu.load('en.lilt_roberta_funds.v1').predict('ocr_ner.png',output_level='chunk')
for i,j in p.iterrows():
print(i,'---->',j)

if __name__ == '__main__':
unittest.main()
