Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/sod_bgee_integration' into relea…
Browse files Browse the repository at this point in the history
…se/515

# Conflicts:
#	nlu/universe/annotator_class_universe.py
  • Loading branch information
C-K-Loan committed Mar 5, 2024
2 parents c35d636 + 9abff28 commit dd82d0d
Show file tree
Hide file tree
Showing 11 changed files with 1,227 additions and 0 deletions.

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions nlu/components/embeddings/sentence_bge/BGESentenceEmbedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import sparknlp
from sparknlp.annotator import BGEEmbeddings


class BGE:
    """NLU wrapper exposing Spark NLP's ``BGEEmbeddings`` sentence-embedding annotator.

    Mirrors the structure of the other sentence-embedding wrappers (e.g. E5):
    static factories that return a configured annotator reading ``document``
    and writing ``bge_embeddings``.
    """

    @staticmethod
    def get_default_model():
        """Return the default pretrained BGE embeddings annotator."""
        # NOTE(review): the original had a stray `sparknlp.start()` after this
        # return — it was either unreachable or an import-time side effect that
        # started a Spark session. Removed; session startup is the caller's job.
        return BGEEmbeddings.pretrained() \
            .setInputCols(["document"]) \
            .setOutputCol("bge_embeddings")

    @staticmethod
    def get_pretrained_model(name, language, bucket=None):
        """Return a specific pretrained BGE model.

        :param name: pretrained model name in the model hub
        :param language: language code of the model
        :param bucket: optional remote bucket/location to load from
        """
        return BGEEmbeddings.pretrained(name, language, bucket) \
            .setInputCols(["document"]) \
            .setOutputCol("bge_embeddings")
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
WordEmbeddingsModel ,
ElmoEmbeddings ,
E5Embeddings,
BGEEmbeddings,
BertSentenceEmbeddings,
RoBertaSentenceEmbeddings,
UniversalSentenceEncoder,
Expand Down Expand Up @@ -88,6 +89,7 @@
WordEmbeddingsModel ,
ElmoEmbeddings ,
E5Embeddings,
BGEEmbeddings,
BertSentenceEmbeddings,
RoBertaSentenceEmbeddings,
InstructorEmbeddings,
Expand Down
3 changes: 3 additions & 0 deletions nlu/pipe/col_substitution/substitution_map_OS.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@
E5Embeddings: {
'default': substitute_word_embed_cols,
},
BGEEmbeddings: {
'default': substitute_sent_embed_cols,
},
BertSentenceEmbeddings: {
'default': substitute_sent_embed_cols,
},
Expand Down
6 changes: 6 additions & 0 deletions nlu/spellbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -4776,6 +4776,9 @@ class Spellbook:
'en.embed_sentence.biobert.pubmed_base_cased': 'sent_biobert_pubmed_base_cased',
'en.embed_sentence.biobert.pubmed_large_cased': 'sent_biobert_pubmed_large_cased',
'en.embed_sentence.biobert.pubmed_pmc_base_cased': 'sent_biobert_pubmed_pmc_base_cased',
'en.embed_sentence.bge_base': 'bge_base',
'en.embed_sentence.bge_small': 'bge_small',
'en.embed_sentence.bge_large': 'bge_large',
'en.embed_sentence.covidbert.large_uncased': 'sent_covidbert_large_uncased',
'en.embed_sentence.distil_roberta.distilled_base': 'sent_distilroberta_base',
'en.embed_sentence.doc2vec': 'doc2vec_gigaword_300',
Expand Down Expand Up @@ -15352,6 +15355,9 @@ class Spellbook:
'bert_wiki_books_squad2': 'BertEmbeddings',
'bert_wiki_books_sst2': 'BertEmbeddings',
'beto_sentiment': 'BertForSequenceClassification',
'bge_small': 'BGEEmbeddings',
'bge_base': 'BGEEmbeddings',
'bge_large': 'BGEEmbeddings',
'binary2image': 'BinaryToImage',
'biobert_clinical_base_cased': 'BertEmbeddings',
'biobert_discharge_base_cased': 'BertEmbeddings',
Expand Down
1 change: 1 addition & 0 deletions nlu/universe/annotator_class_universe.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class AnnoClassRef:
JSL_anno2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = {

A_N.E5_SENTENCE_EMBEDDINGS: 'E5Embeddings',
A_N.BGE_SENTENCE_EMBEDDINGS: 'BGEEmbeddings',
A_N.INSTRUCTOR_SENTENCE_EMBEDDINGS: 'InstructorEmbeddings',

A_N.WHISPER_FOR_CTC: 'WhisperForCTC',
Expand Down
22 changes: 22 additions & 0 deletions nlu/universe/component_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
from nlu.components.embeddings.longformer.longformer import Longformer
from nlu.components.embeddings.roberta.roberta import Roberta
from nlu.components.embeddings.sentence_e5.E5SentenceEmbedding import E5
from nlu.components.embeddings.sentence_bge.BGESentenceEmbedding import BGE
from nlu.components.embeddings.sentence_bert.BertSentenceEmbedding import BertSentence
from nlu.components.embeddings.sentence_roberta.RobertaSentenceEmbedding import RobertaSentence
from nlu.components.embeddings.sentence_mpnet.MPNetSentenceEmbedding import MPNetSentence
Expand Down Expand Up @@ -2421,6 +2422,27 @@ class ComponentUniverse:
has_storage_ref=True,
is_storage_ref_producer=True,
),
A.BGE_SENTENCE_EMBEDDINGS: partial(NluComponent,
name=A.BGE_SENTENCE_EMBEDDINGS,
type=T.DOCUMENT_EMBEDDING,
get_default_model=BGE.get_default_model,
get_pretrained_model=BGE.get_pretrained_model,
pdf_extractor_methods={'default': default_sentence_embedding_config,
'default_full': default_full_config, },
pdf_col_name_substitutor=substitute_sent_embed_cols,
output_level=L.INPUT_DEPENDENT_DOCUMENT_EMBEDDING,
node=NLP_FEATURE_NODES.nodes[A.BGE_SENTENCE_EMBEDDINGS],
                                               description='Sentence-level embeddings using BGE (BAAI General Embedding), a text embedding model that generates embeddings suitable for a wide range of tasks (e.g., classification, retrieval, clustering, text evaluation, etc.).',
provider=ComponentBackends.open_source,
license=Licenses.open_source,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=A.BGE_SENTENCE_EMBEDDINGS,
jsl_anno_py_class=ACR.JSL_anno2_py_class[A.BGE_SENTENCE_EMBEDDINGS],
has_storage_ref=True,
is_storage_ref_producer=True,
),

A.BERT_FOR_TOKEN_CLASSIFICATION: partial(NluComponent,
name=A.BERT_FOR_TOKEN_CLASSIFICATION,
type=T.TRANSFORMER_TOKEN_CLASSIFIER,
Expand Down
1 change: 1 addition & 0 deletions nlu/universe/feature_node_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ class NLP_NODE_IDS:
CAMEMBERT_FOR_TOKEN_CLASSIFICATION = JslAnnoId('camenbert_for_token_classification')
CAMEMBERT_FOR_SEQUENCE_CLASSIFICATION = JslAnnoId('camenbert_for_sequence_classification')
E5_SENTENCE_EMBEDDINGS = JslAnnoId('e5_sentence_embeddings')
BGE_SENTENCE_EMBEDDINGS = JslAnnoId('bge_sentence_embeddings')
BERT_SENTENCE_EMBEDDINGS = JslAnnoId('bert_sentence_embeddings')
DISTIL_BERT_EMBEDDINGS = JslAnnoId('distil_bert_embeddings')
DISTIL_BERT_FOR_TOKEN_CLASSIFICATION = JslAnnoId('distil_bert_for_token_classification')
Expand Down
1 change: 1 addition & 0 deletions nlu/universe/feature_node_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ class NLP_FEATURE_NODES: # or Mode Node?
A.INSTRUCTOR_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.INSTRUCTOR_SENTENCE_EMBEDDINGS, [F.DOCUMENT], [F.SENTENCE_EMBEDDINGS]),

A.E5_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.E5_SENTENCE_EMBEDDINGS, [F.DOCUMENT],[F.SENTENCE_EMBEDDINGS]),
A.BGE_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.BGE_SENTENCE_EMBEDDINGS, [F.DOCUMENT], [F.SENTENCE_EMBEDDINGS]),
A.MPNET_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.MPNET_SENTENCE_EMBEDDINGS, [F.DOCUMENT], [F.SENTENCE_EMBEDDINGS]),

A.PARTIALLY_IMPLEMENTED: NlpFeatureNode(A.PARTIALLY_IMPLEMENTED, [F.UNKOWN], [F.UNKOWN]),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import unittest

from nlu import *


class TestBGESentenceEmbeddings(unittest.TestCase):
    """Smoke test: load the small BGE sentence-embedding pipeline and predict."""

    def test_bge_embeds(self):
        # Load the NLU pipeline for the small BGE sentence embedder.
        pipeline = nlu.load("en.embed_sentence.bge_small", verbose=True)
        df = pipeline.predict(
            "query: how much protein should a female eat",
            output_level="document",
        )
        # Dump every output column for manual inspection.
        for column in df:
            print(df[column])


# Allow running this test module directly: `python <file>.py`.
if __name__ == "__main__":
    unittest.main()

0 comments on commit dd82d0d

Please sign in to comment.