Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/sod_bgee_integration' into relea…
Browse files Browse the repository at this point in the history
…se/515

# Conflicts:
#	nlu/universe/annotator_class_universe.py
  • Loading branch information
C-K-Loan committed Mar 5, 2024
2 parents c35d636 + 9abff28 commit dd82d0d
Show file tree
Hide file tree
Showing 11 changed files with 1,227 additions and 0 deletions.

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions nlu/components/embeddings/sentence_bge/BGESentenceEmbedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import sparknlp
from sparknlp.annotator import BGEEmbeddings


class BGE:
    """NLU wrapper exposing Spark NLP's ``BGEEmbeddings`` sentence-embedding annotator.

    Mirrors the structure of the other sentence-embedding wrappers (e.g. E5):
    static factories that return a configured annotator reading ``document``
    and writing ``bge_embeddings``.
    """

    @staticmethod
    def get_default_model():
        """Return the default pretrained BGE embeddings annotator."""
        # NOTE(review): the original had a stray `sparknlp.start()` after this
        # return — it was either unreachable or an import-time side effect that
        # started a Spark session. Removed; session startup is the caller's job.
        return BGEEmbeddings.pretrained() \
            .setInputCols(["document"]) \
            .setOutputCol("bge_embeddings")

    @staticmethod
    def get_pretrained_model(name, language, bucket=None):
        """Return a specific pretrained BGE model.

        :param name: pretrained model name in the model hub
        :param language: language code of the model
        :param bucket: optional remote bucket/location to load from
        """
        return BGEEmbeddings.pretrained(name, language, bucket) \
            .setInputCols(["document"]) \
            .setOutputCol("bge_embeddings")
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
WordEmbeddingsModel ,
ElmoEmbeddings ,
E5Embeddings,
BGEEmbeddings,
BertSentenceEmbeddings,
RoBertaSentenceEmbeddings,
UniversalSentenceEncoder,
Expand Down Expand Up @@ -88,6 +89,7 @@
WordEmbeddingsModel ,
ElmoEmbeddings ,
E5Embeddings,
BGEEmbeddings,
BertSentenceEmbeddings,
RoBertaSentenceEmbeddings,
InstructorEmbeddings,
Expand Down
3 changes: 3 additions & 0 deletions nlu/pipe/col_substitution/substitution_map_OS.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@
E5Embeddings: {
'default': substitute_word_embed_cols,
},
BGEEmbeddings: {
'default': substitute_sent_embed_cols,
},
BertSentenceEmbeddings: {
'default': substitute_sent_embed_cols,
},
Expand Down
6 changes: 6 additions & 0 deletions nlu/spellbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -4776,6 +4776,9 @@ class Spellbook:
'en.embed_sentence.biobert.pubmed_base_cased': 'sent_biobert_pubmed_base_cased',
'en.embed_sentence.biobert.pubmed_large_cased': 'sent_biobert_pubmed_large_cased',
'en.embed_sentence.biobert.pubmed_pmc_base_cased': 'sent_biobert_pubmed_pmc_base_cased',
'en.embed_sentence.bge_base': 'bge_base',
'en.embed_sentence.bge_small': 'bge_small',
'en.embed_sentence.bge_large': 'bge_large',
'en.embed_sentence.covidbert.large_uncased': 'sent_covidbert_large_uncased',
'en.embed_sentence.distil_roberta.distilled_base': 'sent_distilroberta_base',
'en.embed_sentence.doc2vec': 'doc2vec_gigaword_300',
Expand Down Expand Up @@ -15352,6 +15355,9 @@ class Spellbook:
'bert_wiki_books_squad2': 'BertEmbeddings',
'bert_wiki_books_sst2': 'BertEmbeddings',
'beto_sentiment': 'BertForSequenceClassification',
'bge_small': 'BGEEmbeddings',
'bge_base': 'BGEEmbeddings',
'bge_large': 'BGEEmbeddings',
'binary2image': 'BinaryToImage',
'biobert_clinical_base_cased': 'BertEmbeddings',
'biobert_discharge_base_cased': 'BertEmbeddings',
Expand Down
1 change: 1 addition & 0 deletions nlu/universe/annotator_class_universe.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class AnnoClassRef:
JSL_anno2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = {

A_N.E5_SENTENCE_EMBEDDINGS: 'E5Embeddings',
A_N.BGE_SENTENCE_EMBEDDINGS: 'BGEEmbeddings',
A_N.INSTRUCTOR_SENTENCE_EMBEDDINGS: 'InstructorEmbeddings',

A_N.WHISPER_FOR_CTC: 'WhisperForCTC',
Expand Down
22 changes: 22 additions & 0 deletions nlu/universe/component_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
from nlu.components.embeddings.longformer.longformer import Longformer
from nlu.components.embeddings.roberta.roberta import Roberta
from nlu.components.embeddings.sentence_e5.E5SentenceEmbedding import E5
from nlu.components.embeddings.sentence_bge.BGESentenceEmbedding import BGE
from nlu.components.embeddings.sentence_bert.BertSentenceEmbedding import BertSentence
from nlu.components.embeddings.sentence_roberta.RobertaSentenceEmbedding import RobertaSentence
from nlu.components.embeddings.sentence_mpnet.MPNetSentenceEmbedding import MPNetSentence
Expand Down Expand Up @@ -2421,6 +2422,27 @@ class ComponentUniverse:
has_storage_ref=True,
is_storage_ref_producer=True,
),
A.BGE_SENTENCE_EMBEDDINGS: partial(NluComponent,
name=A.BGE_SENTENCE_EMBEDDINGS,
type=T.DOCUMENT_EMBEDDING,
get_default_model=BGE.get_default_model,
get_pretrained_model=BGE.get_pretrained_model,
pdf_extractor_methods={'default': default_sentence_embedding_config,
'default_full': default_full_config, },
pdf_col_name_substitutor=substitute_sent_embed_cols,
output_level=L.INPUT_DEPENDENT_DOCUMENT_EMBEDDING,
node=NLP_FEATURE_NODES.nodes[A.BGE_SENTENCE_EMBEDDINGS],
                                               description='Sentence-level embeddings using BGE (BAAI General Embedding), a text embedding model that generates embeddings suitable for a wide range of tasks (e.g., classification, retrieval, clustering, text evaluation, etc.).',
provider=ComponentBackends.open_source,
license=Licenses.open_source,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=A.BGE_SENTENCE_EMBEDDINGS,
jsl_anno_py_class=ACR.JSL_anno2_py_class[A.BGE_SENTENCE_EMBEDDINGS],
has_storage_ref=True,
is_storage_ref_producer=True,
),

A.BERT_FOR_TOKEN_CLASSIFICATION: partial(NluComponent,
name=A.BERT_FOR_TOKEN_CLASSIFICATION,
type=T.TRANSFORMER_TOKEN_CLASSIFIER,
Expand Down
1 change: 1 addition & 0 deletions nlu/universe/feature_node_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ class NLP_NODE_IDS:
CAMEMBERT_FOR_TOKEN_CLASSIFICATION = JslAnnoId('camenbert_for_token_classification')
CAMEMBERT_FOR_SEQUENCE_CLASSIFICATION = JslAnnoId('camenbert_for_sequence_classification')
E5_SENTENCE_EMBEDDINGS = JslAnnoId('e5_sentence_embeddings')
BGE_SENTENCE_EMBEDDINGS = JslAnnoId('bge_sentence_embeddings')
BERT_SENTENCE_EMBEDDINGS = JslAnnoId('bert_sentence_embeddings')
DISTIL_BERT_EMBEDDINGS = JslAnnoId('distil_bert_embeddings')
DISTIL_BERT_FOR_TOKEN_CLASSIFICATION = JslAnnoId('distil_bert_for_token_classification')
Expand Down
1 change: 1 addition & 0 deletions nlu/universe/feature_node_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ class NLP_FEATURE_NODES: # or Mode Node?
A.INSTRUCTOR_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.INSTRUCTOR_SENTENCE_EMBEDDINGS, [F.DOCUMENT], [F.SENTENCE_EMBEDDINGS]),

A.E5_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.E5_SENTENCE_EMBEDDINGS, [F.DOCUMENT],[F.SENTENCE_EMBEDDINGS]),
A.BGE_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.BGE_SENTENCE_EMBEDDINGS, [F.DOCUMENT], [F.SENTENCE_EMBEDDINGS]),
A.MPNET_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.MPNET_SENTENCE_EMBEDDINGS, [F.DOCUMENT], [F.SENTENCE_EMBEDDINGS]),

A.PARTIALLY_IMPLEMENTED: NlpFeatureNode(A.PARTIALLY_IMPLEMENTED, [F.UNKOWN], [F.UNKOWN]),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import unittest

from nlu import *


class TestBGESentenceEmbeddings(unittest.TestCase):
    """Smoke test: load the small BGE sentence-embedding pipeline and predict."""

    def test_bge_embeds(self):
        # Load the NLU pipeline for the small BGE sentence embedder.
        pipeline = nlu.load("en.embed_sentence.bge_small", verbose=True)
        df = pipeline.predict(
            "query: how much protein should a female eat",
            output_level="document",
        )
        # Dump every output column for manual inspection.
        for column in df:
            print(df[column])


# Allow running this test module directly: `python <file>.py`.
if __name__ == "__main__":
    unittest.main()

0 comments on commit dd82d0d

Please sign in to comment.