Skip to content

Commit

Permalink
Merge pull request #214 from JohnSnowLabs/sod_embedding_integration
Browse files Browse the repository at this point in the history
InstructorEmbeddings integration
C-K-Loan authored Nov 8, 2023
2 parents 0a58a03 + 6d5f22f commit 30f0961
Showing 11 changed files with 1,604 additions and 0 deletions.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from sparknlp.annotator import InstructorEmbeddings


class Instructor:
    """Factory helpers that build configured ``InstructorEmbeddings`` annotators."""

    @staticmethod
    def _with_defaults(annotator):
        """Apply the shared instruction and column wiring to ``annotator``.

        Sets the instruction string, reads from the ``documents`` column and
        writes to the ``instructor`` column. The instruction looks like a
        placeholder that callers replace later via ``setInstruction``.

        :param annotator: a loaded ``InstructorEmbeddings`` annotator
        :return: the result of the final setter call in the chain
        """
        annotator = annotator.setInstruction("Instruction here: ")
        annotator = annotator.setInputCols(["documents"])
        return annotator.setOutputCol("instructor")

    @staticmethod
    def get_default_model():
        """Load the default pretrained annotator and apply the shared settings.

        :return: the configured annotator
        """
        return Instructor._with_defaults(InstructorEmbeddings.pretrained())

    @staticmethod
    def get_pretrained_model(name, language, bucket=None):
        """Load a specific pretrained annotator and apply the shared settings.

        :param name: model name forwarded to ``InstructorEmbeddings.pretrained``
        :param language: language forwarded to ``InstructorEmbeddings.pretrained``
        :param bucket: optional bucket forwarded to ``InstructorEmbeddings.pretrained``
        :return: the configured annotator
        """
        loaded = InstructorEmbeddings.pretrained(name, language, bucket)
        return Instructor._with_defaults(loaded)



Empty file.
Original file line number Diff line number Diff line change
@@ -21,6 +21,7 @@
ElmoEmbeddings ,
BertSentenceEmbeddings,
UniversalSentenceEncoder,
InstructorEmbeddings,
SentenceEmbeddings,
ContextSpellCheckerModel ,
SymmetricDeleteModel ,
@@ -84,6 +85,7 @@
WordEmbeddingsModel ,
ElmoEmbeddings ,
BertSentenceEmbeddings,
InstructorEmbeddings,
UniversalSentenceEncoder,
SentenceEmbeddings,
MultiClassifierDLModel,
3 changes: 3 additions & 0 deletions nlu/pipe/col_substitution/substitution_map_OS.py
Original file line number Diff line number Diff line change
@@ -50,6 +50,9 @@
BertSentenceEmbeddings: {
'default': substitute_sent_embed_cols,
},
InstructorEmbeddings: {
'default': substitute_sent_embed_cols,
},

Doc2VecModel: {
'default': substitute_sent_embed_cols,
4 changes: 4 additions & 0 deletions nlu/spellbook.py
Original file line number Diff line number Diff line change
@@ -4801,6 +4801,8 @@ class Spellbook:
'en.embed_sentence.small_bert_L8_256': 'sent_small_bert_L8_256',
'en.embed_sentence.small_bert_L8_512': 'sent_small_bert_L8_512',
'en.embed_sentence.small_bert_L8_768': 'sent_small_bert_L8_768',
'en.embed_sentence.instructor_base':'instructor_base',
'en.embed_sentence.instructor_large':'instructor_large',
'en.embed_sentence.tfhub_use': 'tfhub_use',
'en.embed_sentence.tfhub_use.lg': 'tfhub_use_lg',
'en.embed_sentence.use': 'tfhub_use',
@@ -15584,6 +15586,8 @@ class Spellbook:
'image_classifier_vit_where_am_I_hospital_balcony_hallway_airport_coffee_house_apartment_office': 'ViTForImageClassification',
'image_classifier_vit_world_landmarks': 'ViTForImageClassification',
'image_classifier_convnext_tiny_224_local':'ConvNextImageClassifier',
'instructor_large':'InstructorEmbeddings',
'instructor_base':'InstructorEmbeddings',
'japanese_cc_300d': 'WordEmbeddingsModel',
'jsl_ner_wip_clinical': 'MedicalNerModel',
'jsl_ner_wip_greedy_clinical': 'MedicalNerModel',
2 changes: 2 additions & 0 deletions nlu/universe/annotator_class_universe.py
Original file line number Diff line number Diff line change
@@ -13,6 +13,8 @@ class AnnoClassRef:
HC_A_N = NLP_HC_NODE_IDS
# Map AnnoID to PyCLass
JSL_anno2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = {
A_N.INSTRUCTOR_SENTENCE_EMBEDDINGS:'InstructorEmbeddings',

A_N.HUBERT_FOR_CTC: 'HubertForCTC',
A_N.CAMEMBERT_FOR_QUESTION_ANSWERING: 'CamemBertForQuestionAnswering',
A_N.SWIN_IMAGE_CLASSIFICATION: 'SwinForImageClassification',
22 changes: 22 additions & 0 deletions nlu/universe/component_universes.py
Original file line number Diff line number Diff line change
@@ -74,6 +74,7 @@
from nlu.components.embeddings.longformer.longformer import Longformer
from nlu.components.embeddings.roberta.roberta import Roberta
from nlu.components.embeddings.sentence_bert.BertSentenceEmbedding import BertSentence
from nlu.components.embeddings.instructor_sentence.InstructorEmbeddings import Instructor
from nlu.components.embeddings.sentence_xlm.sentence_xlm import Sentence_XLM
from nlu.components.embeddings.use.spark_nlp_use import SparkNLPUse
from nlu.components.embeddings.word2vec.word2vec import Word2Vec
@@ -2329,6 +2330,27 @@ class ComponentUniverse:
has_storage_ref=True,
is_storage_ref_producer=True,
),
A.INSTRUCTOR_SENTENCE_EMBEDDINGS: partial(NluComponent,
name=A.INSTRUCTOR_SENTENCE_EMBEDDINGS,
type=T.DOCUMENT_EMBEDDING,
get_default_model=Instructor.get_default_model,
get_pretrained_model=Instructor.get_pretrained_model,
pdf_extractor_methods={'default': default_sentence_embedding_config,
'default_full': default_full_config, },
pdf_col_name_substitutor=substitute_sent_embed_cols,
output_level=L.INPUT_DEPENDENT_DOCUMENT_EMBEDDING,
node=NLP_FEATURE_NODES.nodes[A.INSTRUCTOR_SENTENCE_EMBEDDINGS],
description='Sentence-level embeddings using Instructor. Instructor, an instruction-finetuned text embedding model that can generate text embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, etc.) and domains (e.g., science, finance, etc.) by simply providing the task instruction, without any finetuning.',
provider=ComponentBackends.open_source,
license=Licenses.open_source,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=A.INSTRUCTOR_SENTENCE_EMBEDDINGS,
jsl_anno_py_class=ACR.JSL_anno2_py_class[
A.INSTRUCTOR_SENTENCE_EMBEDDINGS],
has_storage_ref=True,
is_storage_ref_producer=True,
),
A.BERT_SENTENCE_EMBEDDINGS: partial(NluComponent,
name=A.BERT_SENTENCE_EMBEDDINGS,
type=T.DOCUMENT_EMBEDDING,
2 changes: 2 additions & 0 deletions nlu/universe/feature_node_ids.py
Original file line number Diff line number Diff line change
@@ -9,6 +9,7 @@ class NLP_NODE_IDS:
Used to cast the pipeline dependency resolution algorithm into an abstract graph
"""
# Visual Document Understanding

BIG_TEXT_MATCHER = JslAnnoId('big_text_matcher')
CHUNK2DOC = JslAnnoId('chunk2doc')
CHUNK_EMBEDDINGS_CONVERTER = JslAnnoId('chunk_embeddings_converter')
@@ -105,6 +106,7 @@ class NLP_NODE_IDS:
CONVNEXT_IMAGE_CLASSIFICATION = JslAnnoId("convnext_image_classification")
SWIN_IMAGE_CLASSIFICATION = JslAnnoId("swin_image_classification")
BART_TRANSFORMER = JslAnnoId("bart_transformer")
INSTRUCTOR_SENTENCE_EMBEDDINGS = JslAnnoId('instructor_sentence_embeddings')



1 change: 1 addition & 0 deletions nlu/universe/feature_node_universes.py
Original file line number Diff line number Diff line change
@@ -73,6 +73,7 @@ class NLP_FEATURE_NODES: # or Mode Node?
A = NLP_NODE_IDS
F = NLP_FEATURES
nodes = {
A.INSTRUCTOR_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.INSTRUCTOR_SENTENCE_EMBEDDINGS, [F.DOCUMENT], [F.SENTENCE_EMBEDDINGS]),

A.PARTIALLY_IMPLEMENTED: NlpFeatureNode(A.PARTIALLY_IMPLEMENTED, [F.UNKOWN], [F.UNKOWN]),

Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import unittest

from nlu import *


class TestInstructorSentenceEmbeddings(unittest.TestCase):
    """Smoke tests for the Instructor embedding components loaded through NLU.

    Each test loads the base and the large model in turn, overrides the
    instruction on the embedding component, runs one prediction and prints
    every column of the result.
    """

    # Instruction applied to the embedding component before predicting.
    INSTRUCTION = "Represent the Amazon title for retrieving relevant reviews: "

    # Single input text shared by all test cases.
    TEXT = "Loved it! It is Exciting, interesting, and even including information about the space program."

    # (NLU reference, key of the embedding component inside the loaded pipe).
    # NOTE(review): the keys embed what looks like a model-specific hash, so
    # they presumably need updating whenever the pretrained models change.
    MODELS = (
        ("en.embed_sentence.instructor_base",
         "instructor_sentence_embeddings@INSTRUCTOR_EMBEDDINGS_1c5e51202650"),
        ("en.embed_sentence.instructor_large",
         "instructor_sentence_embeddings@INSTRUCTOR_EMBEDDINGS_46e0451abc97"),
    )

    def _embed_and_print(self, output_level):
        """Run every model in ``MODELS`` on ``TEXT`` and print the result.

        :param output_level: value forwarded to ``pipe.predict(output_level=...)``
        """
        for nlu_ref, component_key in self.MODELS:
            pipe = nlu.load(nlu_ref, verbose=True)
            pipe[component_key].setInstruction(self.INSTRUCTION)
            res = pipe.predict(self.TEXT, output_level=output_level)

            # Fail on an empty result instead of silently printing nothing.
            # Assumes ``predict`` returns a sized, column-iterable object
            # (e.g. a pandas DataFrame) — TODO confirm.
            self.assertGreater(
                len(res), 0,
                f"{nlu_ref} returned no rows for output_level={output_level!r}")

            for c in res:
                print(res[c])

    def test_instructor_embeds_sentence_level(self):
        """Both models produce output at ``output_level='sentence'``."""
        self._embed_and_print('sentence')

    def test_instructor_embeds_document_level(self):
        """Both models produce output at ``output_level='document'``."""
        self._embed_and_print('document')

# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 30f0961

Please sign in to comment.