Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added new Visual Document Classifier Annotator #219

Merged
merged 5 commits into from
Mar 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
343 changes: 343 additions & 0 deletions examples/colab/ocr/ocr_visual_document_classifier.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
class VisualDocClassifier:
    """Factory for the Spark OCR visual document classifier annotator."""

    @staticmethod
    def get_default_model():
        """Return the pretrained tobacco3482 visual document classifier.

        Reads hOCR from the 'hocr' column and writes the predicted label to
        'prediction' with its confidence in 'conf'.
        """
        # Imported lazily so this module can load without Spark OCR installed.
        from sparkocr.transformers import VisualDocumentClassifier

        model = VisualDocumentClassifier.pretrained(
            "visual_document_classifier_tobacco3482", "en", "clinical/ocr"
        )
        model = model.setMaxSentenceLength(128)
        model = model.setInputCol("hocr")
        model = model.setLabelCol("prediction")
        return model.setConfidenceCol("conf")
12 changes: 9 additions & 3 deletions nlu/pipe/col_substitution/col_name_substitution_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

import nlu
from nlu.universe.feature_universes import NLP_FEATURES
from nlu.pipe.col_substitution import substitution_map_OS
from nlu.pipe.col_substitution import substitution_map_OS, substitution_map_OCR
from nlu.pipe.col_substitution import col_substitution_OS
import logging

Expand Down Expand Up @@ -76,7 +76,13 @@ def substitute_col_names(df, anno_2_ex, pipe, stranger_cols=[], get_embeddings=F
anno2final_cols[c.model] = list(old2new_anno_cols.values())
new_cols.update(old2new_anno_cols)
new_cols = {**new_cols, **(old2new_anno_cols)}
continue
if type(c.model) in substitution_map_OCR.OCR_anno2substitution_fn.keys():
cols = df.columns.tolist()
substitution_fn = substitution_map_OCR.OCR_anno2substitution_fn[type(c.model)]['default']
old2new_anno_cols = substitution_fn(c, cols, deducted_component_names[c])
anno2final_cols[c.model] = list(old2new_anno_cols.values())
new_cols = {**new_cols, **(old2new_anno_cols)}
continue
if 'embedding' in c.type and get_embeddings == False: continue
cols_to_substitute = ColSubstitutionUtils.get_final_output_cols_of_component(c, df, anno_2_ex)

Expand All @@ -94,6 +100,7 @@ def substitute_col_names(df, anno_2_ex, pipe, stranger_cols=[], get_embeddings=F
anno2final_cols[c.model] = list(old2new_anno_cols.values())
new_cols.update(old2new_anno_cols)
continue

# dic, key=old_col, value=new_col. Some cols may be omitted and missing from the dic which are deemed irrelevant. Behaivour can be disabled by setting drop_debug_cols=False
old2new_anno_cols = substitution_fn(c, cols_to_substitute, deducted_component_names[c])
anno2final_cols[c.model] = list(old2new_anno_cols.values())
Expand All @@ -113,7 +120,6 @@ def get_final_output_cols_of_component(c, df, anno_2_ex) -> List[str]:
"""Get's a list of all columns that have been derived in the pythonify procedure from the component_to_resolve
os_components in dataframe df for anno_2_ex configs """
og_output_col = c.spark_output_column_names[0]

configs = anno_2_ex[og_output_col]
result_cols = []
if isinstance(configs, SparkOCRExtractorConfig):
Expand Down
22 changes: 22 additions & 0 deletions nlu/pipe/col_substitution/col_substitution_OCR.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,28 @@ def substitute_recognized_text_cols(c, cols, is_unique=True, nlu_identifier=''):
for c in cols:
new_cols[c] = c
return new_cols # TODO

def substitute_document_classifier_text_cols(c, cols, is_unique=True, nlu_identifier=''):
    """Map visual document classifier output columns to user-friendly names.

    Renames:
      - 'visual_classifier_label.1' (duplicate-suffixed copy of the label col,
        produced by the de-duplication step) -> 'file_path'
      - 'visual_classifier_label'            -> 'visual_classifier_prediction'
    Every other column keeps its original name.

    :param c: the NLU component (unused here, kept for substitutor signature parity)
    :param cols: list of dataframe column names to substitute
    :param is_unique: unused, kept for substitutor signature parity
    :param nlu_identifier: unused, kept for substitutor signature parity
    :return: dict mapping old column name -> new column name
    """
    new_cols = {}
    # BUG FIX: the loop variable previously shadowed the parameter `c`, the
    # membership tests ran once per column, and the unconditional identity
    # mapping could clobber the rename when the label col was last in `cols`.
    for col in cols:
        if col == 'visual_classifier_label.1':
            new_cols[col] = 'file_path'
        elif col == 'visual_classifier_label':
            new_cols[col] = 'visual_classifier_prediction'
        else:
            new_cols[col] = col
    return new_cols



# new_base_name = 'generic_classifier' if is_unique else f'generic_classification_{nlu_identifier}'
# for col in cols :
# if '_results' in col : new_cols[col] = new_base_name
Expand Down
30 changes: 30 additions & 0 deletions nlu/pipe/col_substitution/substitution_map_OCR.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""
Resolve Annotator Classes in the Pipeline to Extractor Configs and Methods

Every Annotator should have 2 configs. Some might offor multuple configs/method pairs, based on model_anno_obj/NLP reference.
- default/minimalistic -> Just the results of the annotations, no confidences or extra metadata
- with meta -> A config that leverages white/black list and gets the most relevant metadata
- with positions -> With Begins/Ends
- with sentence references -> Reeturn the sentence/chunk no. reference from the metadata.
If a document has multi-sentences, this will map a label back to a corrosponding sentence

"""
# from nlu.pipe.col_substitution.col_substitution_HC import *
from nlu.pipe.col_substitution.col_substitution_OS import *
from nlu.pipe.col_substitution.col_substitution_OCR import *

from sparkocr.transformers import *

# Maps each Spark OCR annotator class to its column-name substitution
# functions, keyed by config flavor (only 'default' is defined for now).
OCR_anno2substitution_fn = {
    VisualDocumentClassifier : {
        'default': substitute_document_classifier_text_cols ,
    },

}







14 changes: 13 additions & 1 deletion nlu/pipe/extractors/extractor_methods/base_extractor_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,14 +341,26 @@ def zip_and_explode(df: pd.DataFrame, cols_to_explode: List[str]) -> pd.DataFram
Elements of columns which are not in cols_to_explode, will be in lists
"""
# Check cols we want to explode actually exist, if no data extracted cols can be missing
# print(df)
missing = []
for col in cols_to_explode:
if col not in df.columns:
missing.append(col)
for miss in missing:
cols_to_explode.remove(miss)
# Drop duplicate cols
df = df.loc[:, ~df.columns.duplicated()]
# df = df.loc[:, ~df.columns.duplicated()]
if df.columns.duplicated().any():
# If there are duplicate column names, append a suffix to make them unique
cols = pd.Series(df.columns)
for dup in cols[cols.duplicated()].unique():
cols[cols[cols == dup].index.values.tolist()] = [dup + '.' + str(i) if i != 0 else dup for i in
range(sum(cols == dup))]
df.columns = cols
else:
# If there are no duplicate column names, remove duplicate columns
df = df.loc[:, ~df.columns.duplicated()]

if len(cols_to_explode) > 0:
# We must pad all cols we want to explode to the same length because pandas limitation.
# Spark API does not require this since it handles cols with not same length by creating nan. We do it ourselves here manually
Expand Down
1 change: 1 addition & 0 deletions nlu/pipe/extractors/extractor_methods/ocr_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,5 @@ def use_first_row_as_column_names_for_list_of_dfs(pd_tables):
new_tables = []
for t in pd_tables:
new_tables.append(use_first_row_as_column_names(t))
# print(new_tables)
return new_tables
51 changes: 26 additions & 25 deletions nlu/universe/component_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,17 +123,20 @@
from nlu.ocr_components.table_extractors.doc_table_extractor.doc2table import Doc2TextTable
from nlu.ocr_components.table_extractors.pdf_table_extractor.pdf2table import PDF2TextTable
from nlu.ocr_components.table_extractors.ppt_table_extractor.ppt2table import PPT2TextTable
from nlu.ocr_components.visual_classifiers.visual_document_classifier.visual_document_classifier import VisualDocClassifier
from nlu.ocr_components.text_recognizers.doc2text.doc2text import Doc2Text
from nlu.ocr_components.text_recognizers.img2text.img2text import Img2Text
from nlu.ocr_components.text_recognizers.pdf2text.pdf2text import Pdf2Text
from nlu.ocr_components.utils.binary2image.binary2image import Binary2Image
from nlu.ocr_components.utils.image2hocr.image2hocr import Image2Hocr
# from nlu.ocr_components.visual_classifiers.visual_doc_classifier.visual_doc_classifier import VisualDocClassifier
from nlu.pipe.col_substitution.col_substitution_HC import *
from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols
from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols, \
substitute_document_classifier_text_cols
from nlu.pipe.col_substitution.col_substitution_OS import *
from nlu.pipe.extractors.extractor_configs_HC import *
from nlu.pipe.extractors.extractor_configs_OCR import default_text_recognizer_config, default_binary_to_image_config
from nlu.pipe.extractors.extractor_configs_OCR import default_text_recognizer_config, default_binary_to_image_config, \
default_visual_classifier_config
from nlu.pipe.extractors.extractor_configs_OS import *
from nlu.pipe.nlu_component import NluComponent
from nlu.universe.annotator_class_universe import AnnoClassRef
Expand Down Expand Up @@ -4084,29 +4087,27 @@ class ComponentUniverse:
applicable_file_types=['DOCX', 'DOC']
),

# O_A.VISUAL_DOCUMENT_CLASSIFIER: partial(NluComponent,
# name=O_A.VISUAL_DOCUMENT_CLASSIFIER,
# type=T.PDF_BUILDER,
# get_default_model=VisualDocClassifier.get_default_model,
# get_pretrained_model=VisualDocClassifier.get_pretrained_model,
#
# pdf_extractor_methods={'default': default_visual_classifier_config},
# # TODO EXtractor
# pdf_col_name_substitutor=substitute_recognized_text_cols,
# # TODO substitor
# output_level=L.DOCUMENT,
# node=OCR_FEATURE_NODES.nodes[O_A.VISUAL_DOCUMENT_CLASSIFIER],
# description='Convert text to PDF file',
# provider=ComponentBackends.ocr,
# license=Licenses.ocr,
# computation_context=ComputeContexts.spark,
# output_context=ComputeContexts.spark,
# jsl_anno_class_id=O_A.VISUAL_DOCUMENT_CLASSIFIER,
# jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[
# O_A.VISUAL_DOCUMENT_CLASSIFIER],
# applicable_file_types=['JPG', 'JPEG']
# ),
#
O_A.VISUAL_DOCUMENT_CLASSIFIER: partial(NluComponent,
name=O_A.VISUAL_DOCUMENT_CLASSIFIER,
type=T.PDF_BUILDER,
get_default_model=VisualDocClassifier.get_default_model,
pdf_extractor_methods={'default': default_visual_classifier_config},
# TODO EXtractor
pdf_col_name_substitutor=substitute_document_classifier_text_cols,
# TODO substitor
output_level=L.DOCUMENT,
node=OCR_FEATURE_NODES.nodes[O_A.VISUAL_DOCUMENT_CLASSIFIER],
description='Convert text to PDF file',
provider=ComponentBackends.ocr,
license=Licenses.ocr,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=O_A.VISUAL_DOCUMENT_CLASSIFIER,
jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[
O_A.VISUAL_DOCUMENT_CLASSIFIER],
applicable_file_types=['JPG', 'JPEG']
),

O_A.IMAGE2HOCR: partial(NluComponent,
name=O_A.IMAGE2HOCR,
type=T.OCR_UTIL,
Expand Down
2 changes: 1 addition & 1 deletion nlu/universe/feature_node_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ class OCR_FEATURE_NODES:
F = OCR_FEATURES
nodes = {
A.VISUAL_DOCUMENT_CLASSIFIER: OcrFeatureNode(A.VISUAL_DOCUMENT_CLASSIFIER, [F.HOCR],
[F.VISUAL_CLASSIFIER_PREDICTION, F.VISUAL_CLASSIFIER_CONFIDENCE]),
[F.VISUAL_CLASSIFIER_PREDICTION, F.VISUAL_CLASSIFIER_CONFIDENCE, F.FILE_PATH]),

A.IMAGE2HOCR: OcrFeatureNode(A.IMAGE2HOCR, [F.OCR_IMAGE], [F.HOCR]),

Expand Down
Binary file added tests/nlu_ocr_tests/cv_test.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/nlu_ocr_tests/letter.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
74 changes: 37 additions & 37 deletions tests/nlu_ocr_tests/ocr_pdf_builder_tests.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,37 @@
# import tests.secrets as sct
# import unittest
# import nlu
#
# SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE
# AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID
# AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY
# JSL_SECRET = sct.JSL_SECRET
# OCR_SECRET = sct.OCR_SECRET
# OCR_LICENSE = sct.OCR_LICENSE
# # nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET)
#
# class OcrTest(unittest.TestCase):
#
# def test_text_to_pdf(self):
# nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET)
# # text that we generate PDF to has to come from an image struct!
# # We need convert text to img struct!
#
# p = nlu.load('ppt2table',verbose=True)
# dfs = p.predict([f1,f2])
# for df in dfs :
# print(df)
#
# def test_DOC_table_extraction(self):
# nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET)
# f1 = '/home/ckl/Documents/freelance/jsl/nlu/nlu4realgit2/tests/datasets/ocr/table_DOCX/doc2.docx'
# p = nlu.load('doc2table',verbose=True)
# dfs = p.predict([f1])
# for df in dfs :
# print(df)
#
#
#
# if __name__ == '__main__':
# unittest.main()
#
import tests.secrets as sct
import unittest
import nlu

SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE
AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY
JSL_SECRET = sct.JSL_SECRET
OCR_SECRET = sct.OCR_SECRET
OCR_LICENSE = sct.OCR_LICENSE
# nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET)

class OcrTest(unittest.TestCase):
    """Smoke tests for OCR table extraction via NLU pipelines."""

    def test_text_to_pdf(self):
        nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET)
        # Text that we generate a PDF from has to come from an image struct,
        # so text must be converted to an image struct first.
        # BUG FIX: f1/f2 were referenced but never defined (NameError at runtime);
        # use the repo-relative PPT fixtures like the other OCR tests do.
        f1 = 'tests/datasets/ocr/table_PPT/54111.ppt'
        f2 = 'tests/datasets/ocr/table_PPT/mytable.ppt'
        p = nlu.load('ppt2table',verbose=True)
        dfs = p.predict([f1,f2])
        for df in dfs :
            print(df)

    def test_DOC_table_extraction(self):
        nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET)
        # BUG FIX: replaced a hard-coded absolute developer path
        # ('/home/ckl/...') with the repo-relative dataset path.
        f1 = 'tests/datasets/ocr/docx_with_table/doc2.docx'
        p = nlu.load('doc2table',verbose=True)
        dfs = p.predict([f1])
        for df in dfs :
            print(df)



if __name__ == '__main__':
unittest.main()

9 changes: 6 additions & 3 deletions tests/nlu_ocr_tests/ocr_table_extraction_tests.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import tests.secrets as sct
import os
import sys

sys.path.append(os.getcwd())
import unittest
import nlu
nlu.auth(sct.SPARK_NLP_LICENSE,sct.AWS_ACCESS_KEY_ID,sct.AWS_SECRET_ACCESS_KEY,sct.JSL_SECRET, sct.OCR_LICENSE, sct.OCR_SECRET)
Expand Down Expand Up @@ -34,10 +37,10 @@ def test_PPT_table_extraction(self):
f1 = 'tests/datasets/ocr/table_PPT/54111.ppt'
f2 ='tests/datasets/ocr/table_PPT/mytable.ppt'
p = nlu.load('ppt2table',verbose=True)
dfs = p.predict([f1,f2])
dfs = p.predict([f1 ])
for df in dfs :
print(df)

def test_DOC_table_extraction(self):
f1 = 'tests/datasets/ocr/docx_with_table/doc2.docx'
p = nlu.load('doc2table',verbose=True)
Expand Down
60 changes: 34 additions & 26 deletions tests/nlu_ocr_tests/ocr_visual_doc_classifier_tests.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,34 @@
# import tests.secrets as sct
# import unittest
# import nlu
#
# SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE
# AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID
# AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY
# JSL_SECRET = sct.JSL_SECRET
# OCR_SECRET = sct.OCR_SECRET
# OCR_LICENSE = sct.OCR_LICENSE
# # nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET)
#
# class OcrTest(unittest.TestCase):
#
# def test_classify_document(self):
# nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET)
# # text that we generate PDF to has to come from an image struct!
# # We need convert text to img struct!
# p = nlu.load('en.classify_image.tabacco',verbose=True)
# res = p.predict('/home/ckl/Documents/freelance/jsl/nlu/nlu4realgit2/tests/datasets/ocr/classification_images/letter.jpg')
# for r in res.columns:
# print(r[res])
#
# if __name__ == '__main__':
# unittest.main()
#

import os
import sys

# Make the repo root importable when the test is run from the repo directory.
sys.path.append(os.getcwd())
import unittest
import nlu

# BUG FIX: removed committed machine-specific path
# os.environ["PYTHONPATH"] = "F:/Work/repos/nlu" — the sys.path.append above
# already makes the local package importable, and a developer's Windows path
# must not be hard-coded into shared tests.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
from johnsnowlabs import nlp, visual

# nlp.install(json_license_path='license.json',visual=True)
nlp.start(visual=True)

# print('hi')
class OcrTest(unittest.TestCase):
    """Smoke test for the visual document classifier NLU pipeline."""

    def test_classify_document(self):
        # nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET)
        # Text that we generate a PDF from has to come from an image struct,
        # so text must be converted to an image struct first.
        p = nlu.load('en.classify_image.tabacco',verbose=True)
        # BUG FIX: 'cv_test.png' is resolved against the current working dir;
        # point at the fixture committed under tests/nlu_ocr_tests/ so the
        # test passes when run from the repo root.
        res = p.predict('tests/nlu_ocr_tests/cv_test.png')
        for idx, row in res.iterrows():
            print(idx, row)
        print(res)

if __name__ == '__main__':
unittest.main()

Loading