Merge pull request #214 from JohnSnowLabs/sod_embedding_integration

InstructorEmbeddings integration
JohnSnowLabs · Nov 8, 2023 · 30f0961 · 30f0961
2 parents 0a58a03 + 6d5f22f
commit 30f0961
Showing 11 changed files with 1,604 additions and 0 deletions.
diff --git a/...les/colab/component_examples/sentence_embeddings/NLU_INSTRUCTOR_sentence_embeddings.ipynb b/...les/colab/component_examples/sentence_embeddings/NLU_INSTRUCTOR_sentence_embeddings.ipynb
diff --git a/nlu/components/embeddings/instructor_sentence/InstructorEmbeddings.py b/nlu/components/embeddings/instructor_sentence/InstructorEmbeddings.py
@@ -0,0 +1,20 @@
+from sparknlp.annotator import InstructorEmbeddings
+
+
+class Instructor:
+    """Builds Spark NLP InstructorEmbeddings annotators reading "documents" and writing "instructor"."""
+
+    @staticmethod
+    def get_default_model():
+        """Return the default pretrained model with the "Instruction here: " instruction and columns set."""
+        model = InstructorEmbeddings.pretrained().setInstruction("Instruction here: ")
+        return model.setInputCols(["documents"]).setOutputCol("instructor")
+
+    @staticmethod
+    def get_pretrained_model(name, language, bucket=None):
+        """Return pretrained model `name` for `language` (optional `bucket`) with the same instruction and columns."""
+        model = InstructorEmbeddings.pretrained(name, language, bucket).setInstruction("Instruction here: ")
+        return model.setInputCols(["documents"]).setOutputCol("instructor")
+
+
+
diff --git a/nlu/components/embeddings/instructor_sentence/__init__.py b/nlu/components/embeddings/instructor_sentence/__init__.py
diff --git a/nlu/pipe/col_substitution/name_deduction/name_deductable_annotators_OS.py b/nlu/pipe/col_substitution/name_deduction/name_deductable_annotators_OS.py
@@ -21,6 +21,7 @@
     ElmoEmbeddings ,
     BertSentenceEmbeddings,
     UniversalSentenceEncoder,
+    InstructorEmbeddings,
     SentenceEmbeddings,
     ContextSpellCheckerModel ,
     SymmetricDeleteModel ,
@@ -84,6 +85,7 @@
     WordEmbeddingsModel ,
     ElmoEmbeddings ,
     BertSentenceEmbeddings,
+    InstructorEmbeddings,
     UniversalSentenceEncoder,
     SentenceEmbeddings,
     MultiClassifierDLModel,

diff --git a/nlu/pipe/col_substitution/substitution_map_OS.py b/nlu/pipe/col_substitution/substitution_map_OS.py
@@ -50,6 +50,9 @@
     BertSentenceEmbeddings: {
         'default': substitute_sent_embed_cols,
     },
+    InstructorEmbeddings: {
+        'default': substitute_sent_embed_cols,
+    },
 
     Doc2VecModel: {
         'default': substitute_sent_embed_cols,

diff --git a/nlu/spellbook.py b/nlu/spellbook.py
@@ -4801,6 +4801,8 @@ class Spellbook:
             'en.embed_sentence.small_bert_L8_256': 'sent_small_bert_L8_256',
             'en.embed_sentence.small_bert_L8_512': 'sent_small_bert_L8_512',
             'en.embed_sentence.small_bert_L8_768': 'sent_small_bert_L8_768',
+            'en.embed_sentence.instructor_base':'instructor_base',
+            'en.embed_sentence.instructor_large':'instructor_large',
             'en.embed_sentence.tfhub_use': 'tfhub_use',
             'en.embed_sentence.tfhub_use.lg': 'tfhub_use_lg',
             'en.embed_sentence.use': 'tfhub_use',
@@ -15584,6 +15586,8 @@ class Spellbook:
                              'image_classifier_vit_where_am_I_hospital_balcony_hallway_airport_coffee_house_apartment_office': 'ViTForImageClassification',
                              'image_classifier_vit_world_landmarks': 'ViTForImageClassification',
                              'image_classifier_convnext_tiny_224_local':'ConvNextImageClassifier',
+                             'instructor_large':'InstructorEmbeddings',
+                             'instructor_base':'InstructorEmbeddings',
                              'japanese_cc_300d': 'WordEmbeddingsModel',
                              'jsl_ner_wip_clinical': 'MedicalNerModel',
                              'jsl_ner_wip_greedy_clinical': 'MedicalNerModel',

diff --git a/nlu/universe/annotator_class_universe.py b/nlu/universe/annotator_class_universe.py
@@ -13,6 +13,8 @@ class AnnoClassRef:
     HC_A_N = NLP_HC_NODE_IDS
     # Map AnnoID to PyCLass
     JSL_anno2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = {
+        A_N.INSTRUCTOR_SENTENCE_EMBEDDINGS:'InstructorEmbeddings',
+
         A_N.HUBERT_FOR_CTC: 'HubertForCTC',
         A_N.CAMEMBERT_FOR_QUESTION_ANSWERING: 'CamemBertForQuestionAnswering',
         A_N.SWIN_IMAGE_CLASSIFICATION: 'SwinForImageClassification',

diff --git a/nlu/universe/component_universes.py b/nlu/universe/component_universes.py
@@ -74,6 +74,7 @@
 from nlu.components.embeddings.longformer.longformer import Longformer
 from nlu.components.embeddings.roberta.roberta import Roberta
 from nlu.components.embeddings.sentence_bert.BertSentenceEmbedding import BertSentence
+from nlu.components.embeddings.instructor_sentence.InstructorEmbeddings import Instructor
 from nlu.components.embeddings.sentence_xlm.sentence_xlm import Sentence_XLM
 from nlu.components.embeddings.use.spark_nlp_use import SparkNLPUse
 from nlu.components.embeddings.word2vec.word2vec import Word2Vec
@@ -2329,6 +2330,27 @@ class ComponentUniverse:
                                    has_storage_ref=True,
                                    is_storage_ref_producer=True,
                                    ),
+        A.INSTRUCTOR_SENTENCE_EMBEDDINGS: partial(NluComponent,
+                                                  name=A.INSTRUCTOR_SENTENCE_EMBEDDINGS,
+                                                  type=T.DOCUMENT_EMBEDDING,
+                                                  get_default_model=Instructor.get_default_model,
+                                                  get_pretrained_model=Instructor.get_pretrained_model,
+                                                  pdf_extractor_methods={'default': default_sentence_embedding_config,
+                                                                         'default_full': default_full_config, },
+                                                  pdf_col_name_substitutor=substitute_sent_embed_cols,
+                                                  output_level=L.INPUT_DEPENDENT_DOCUMENT_EMBEDDING,
+                                                  node=NLP_FEATURE_NODES.nodes[A.INSTRUCTOR_SENTENCE_EMBEDDINGS],
+                                                  description='Sentence-level embeddings using Instructor. Instructor, an instruction-finetuned text embedding model that can generate text embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, etc.) and domains (e.g., science, finance, etc.) by simply providing the task instruction, without any finetuning.',
+                                                  provider=ComponentBackends.open_source,
+                                                  license=Licenses.open_source,
+                                                  computation_context=ComputeContexts.spark,
+                                                  output_context=ComputeContexts.spark,
+                                                  jsl_anno_class_id=A.INSTRUCTOR_SENTENCE_EMBEDDINGS,
+                                                  jsl_anno_py_class=ACR.JSL_anno2_py_class[
+                                                      A.INSTRUCTOR_SENTENCE_EMBEDDINGS],
+                                                  has_storage_ref=True,
+                                                  is_storage_ref_producer=True,
+                                                  ),
         A.BERT_SENTENCE_EMBEDDINGS: partial(NluComponent,
                                             name=A.BERT_SENTENCE_EMBEDDINGS,
                                             type=T.DOCUMENT_EMBEDDING,

diff --git a/nlu/universe/feature_node_ids.py b/nlu/universe/feature_node_ids.py
@@ -9,6 +9,7 @@ class NLP_NODE_IDS:
     Used to cast the pipeline dependency resolution algorithm into an abstract graph
     """
     # Visual Document Understanding
+
     BIG_TEXT_MATCHER = JslAnnoId('big_text_matcher')
     CHUNK2DOC = JslAnnoId('chunk2doc')
     CHUNK_EMBEDDINGS_CONVERTER = JslAnnoId('chunk_embeddings_converter')
@@ -105,6 +106,7 @@ class NLP_NODE_IDS:
     CONVNEXT_IMAGE_CLASSIFICATION = JslAnnoId("convnext_image_classification")
     SWIN_IMAGE_CLASSIFICATION = JslAnnoId("swin_image_classification")
     BART_TRANSFORMER = JslAnnoId("bart_transformer")
+    INSTRUCTOR_SENTENCE_EMBEDDINGS = JslAnnoId('instructor_sentence_embeddings')
 
 
 

diff --git a/nlu/universe/feature_node_universes.py b/nlu/universe/feature_node_universes.py
@@ -73,6 +73,7 @@ class NLP_FEATURE_NODES:  # or Mode Node?
     A = NLP_NODE_IDS
     F = NLP_FEATURES
     nodes = {
+        A.INSTRUCTOR_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.INSTRUCTOR_SENTENCE_EMBEDDINGS, [F.DOCUMENT], [F.SENTENCE_EMBEDDINGS]),
 
         A.PARTIALLY_IMPLEMENTED: NlpFeatureNode(A.PARTIALLY_IMPLEMENTED, [F.UNKOWN], [F.UNKOWN]),
 

diff --git a/...u_core_tests/component_tests/embed_tests/sentence_embeddings/sentence_instructor_tests.py b/...u_core_tests/component_tests/embed_tests/sentence_embeddings/sentence_instructor_tests.py
@@ -0,0 +1,46 @@
+import unittest
+
+from nlu import *
+
+
+class TestInstructorSentenceEmbeddings(unittest.TestCase):
+    """Smoke tests for the Instructor sentence-embedding models.
+
+    Every test loads the base and the large model, sets a task instruction on
+    the Instructor annotator, predicts on one sample text and prints the
+    result. Nothing is asserted about the embedding values themselves.
+    """
+
+    # (NLU reference, key of the Instructor annotator in the loaded pipeline).
+    # NOTE(review): the key suffixes look model-specific - confirm on model updates.
+    MODELS = (
+        ("en.embed_sentence.instructor_base",
+         'instructor_sentence_embeddings@INSTRUCTOR_EMBEDDINGS_1c5e51202650'),
+        ("en.embed_sentence.instructor_large",
+         'instructor_sentence_embeddings@INSTRUCTOR_EMBEDDINGS_46e0451abc97'),
+    )
+    INSTRUCTION = "Represent the Amazon title for retrieving relevant reviews: "
+    TEXT = "Loved it!  It is Exciting, interesting, and even including information about the space program."
+
+    def _embed_and_print(self, output_level):
+        """Run each entry of MODELS on TEXT and print every item of the result.
+
+        output_level is passed through unchanged to pipe.predict().
+        """
+        for nlu_ref, component_key in self.MODELS:
+            pipe = nlu.load(nlu_ref, verbose=True)
+            pipe[component_key].setInstruction(self.INSTRUCTION)
+            res = pipe.predict(self.TEXT, output_level=output_level)
+            for column in res:
+                print(res[column])
+
+    def test_instructor_embeds_sentence_level(self):
+        """Embed the sample text with output_level='sentence'."""
+        self._embed_and_print('sentence')
+
+    def test_instructor_embeds_document_level(self):
+        """Embed the sample text with output_level='document'."""
+        self._embed_and_print('document')
+
+if __name__ == "__main__":
+    unittest.main()