Skip to content

Commit

Permalink
Add nomic.
Browse files Browse the repository at this point in the history
  • Loading branch information
nsthorat committed Feb 14, 2024
1 parent 20b80b1 commit c19bafa
Show file tree
Hide file tree
Showing 7 changed files with 147 additions and 58 deletions.
18 changes: 9 additions & 9 deletions lilac/embeddings/bge.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""Gegeral Text Embeddings (GTE) model. Open-source model, designed to run on device."""
import gc
from typing import TYPE_CHECKING, ClassVar, Iterator, Optional
from typing import TYPE_CHECKING, ClassVar, Optional

from typing_extensions import override

from ..utils import log

if TYPE_CHECKING:
from FlagEmbedding import BGEM3FlagModel

Expand All @@ -15,7 +17,7 @@
from ..splitters.spacy_splitter import clustering_spacy_chunker
from ..tasks import TaskExecutionType
from .embedding import chunked_compute_embedding
from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device
from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE

# See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
BGE_M3 = 'BAAI/bge-m3'
Expand All @@ -33,8 +35,11 @@ def _get_and_cache_bge_m3(model_name: str) -> 'BGEM3FlagModel':
model = BGEM3FlagModel(
'BAAI/bge-m3', use_fp16=True
) # Setting use_fp16 to True speeds up computation with a slight performance degradation

log(f'[{model_name}] Using device:', model.device)

# NOTE: we don't call setup model and device here as this happens internally.
return model
return setup_model_device(model, model_name)


class BGEM3(TextEmbeddingSignal):
Expand Down Expand Up @@ -62,7 +67,7 @@ def setup(self) -> None:
def compute(self, docs: list[str]) -> list[Optional[Item]]:
"""Call the embedding function."""

def _encode(doc):
def _encode(doc: list[str]):
# Extract the dense vectors from the model.
return self._model.encode(doc)['dense_vecs']

Expand All @@ -73,16 +78,11 @@ def _encode(doc):
_encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
)

@override
def compute_garden(self, docs: Iterator[str]) -> Iterator[Item]:
raise NotImplementedError('Garden computation is not supported for BGE-M3.')

@override
def teardown(self) -> None:
if not hasattr(self, '_model'):
return

self._model.cpu()
del self._model
gc.collect()

Expand Down
108 changes: 108 additions & 0 deletions lilac/embeddings/nomic_embed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""Gegeral Text Embeddings (GTE) model. Open-source model, designed to run on device."""
import gc
from typing import TYPE_CHECKING, ClassVar, Optional

from typing_extensions import override

if TYPE_CHECKING:
from sentence_transformers import SentenceTransformer

import functools

from ..schema import Item
from ..signal import TextEmbeddingSignal
from ..splitters.spacy_splitter import clustering_spacy_chunker
from ..tasks import TaskExecutionType
from .embedding import chunked_compute_embedding
from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device

# See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
NOMIC_EMBED = 'nomic-ai/nomic-embed-text-v1.5'


@functools.cache
def _get_and_cache_model(model_name: str) -> 'SentenceTransformer':
  """Load a SentenceTransformer model once per process, cached by model name.

  Args:
    model_name: The Hugging Face model id to load (e.g. `NOMIC_EMBED`).

  Returns:
    The model, moved to the best available device via `setup_model_device`.

  Raises:
    ImportError: If the optional "sentence_transformers" package is not installed.
  """
  try:
    from sentence_transformers import SentenceTransformer
  except ImportError:
    raise ImportError(
      'Could not import the "sentence_transformers" python package. '
      'Please install it with `pip install "sentence_transformers"`.'
    )
  # trust_remote_code is required: nomic-embed ships custom modeling code on the Hub.
  return setup_model_device(SentenceTransformer(model_name, trust_remote_code=True), model_name)


class NomicEmbed15(TextEmbeddingSignal):
  """Computes Nomic Embeddings 1.5 full (768 dimensions).
  <br>This embedding runs on-device. See the [model card](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)
  for details.
  """

  name: ClassVar[str] = 'nomic-embed-1.5-768'
  # Fixed typo: the full model emits 768-dimensional embeddings, not 784.
  display_name: ClassVar[str] = 'Nomic Embeddings 1.5 768'
  local_batch_size: ClassVar[int] = SENTENCE_TRANSFORMER_BATCH_SIZE
  local_parallelism: ClassVar[int] = 1
  local_strategy: ClassVar[TaskExecutionType] = 'threads'
  supports_garden: ClassVar[bool] = False

  # Hugging Face model id; subclasses reuse the same checkpoint.
  _model_name = NOMIC_EMBED
  _model: 'SentenceTransformer'
  # Matryoshka embeddings can be truncated to a prefix of dimensions; 768 is the full width.
  _matryoshka_dim = 768

  @override
  def setup(self) -> None:
    """Load (or reuse the process-wide cached) sentence-transformer model."""
    self._model = _get_and_cache_model(self._model_name)

  @override
  def compute(self, docs: list[str]) -> list[Optional[Item]]:
    """Call the embedding function."""
    try:
      import torch.nn.functional as F
    except ImportError:
      # torch is installed as a dependency of sentence-transformers.
      raise ImportError(
        'Could not import the "torch" python package. '
        'Please install it with `pip install "sentence_transformers"`.'
      )

    def _encode(doc: list[str]):
      embeddings = self._model.encode(doc, convert_to_tensor=True)
      # Matryoshka resizing: layer-norm, then keep the first _matryoshka_dim dims.
      # NOTE(review): the model card also L2-normalizes after truncation — confirm
      # that downstream consumers normalize, or similarity scores may be off-scale.
      embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
      embeddings = embeddings[:, : self._matryoshka_dim]
      return embeddings.cpu().numpy()

    # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
    # The sentence transformer API actually does batching internally, so we pass
    # local_batch_size * 16 to allow the library to see all the chunks at once.
    return chunked_compute_embedding(
      _encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
    )

  @override
  def teardown(self) -> None:
    """Release the model and, when CUDA is available, free its GPU memory."""
    if not hasattr(self, '_model'):
      # setup() never ran (or teardown already happened); nothing to release.
      return

    self._model.cpu()
    del self._model
    gc.collect()

    try:
      import torch

      torch.cuda.empty_cache()
    except ImportError:
      pass


class NomicEmbed15_256(NomicEmbed15):
  """Computes Nomic Embeddings 1.5 truncated to 256 dimensions.
  <br>This embedding runs on-device. See the [model card](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)
  for details.
  """

  # Only the identity and the matryoshka truncation width differ; model loading,
  # encoding and teardown are all inherited from NomicEmbed15.
  name: ClassVar[str] = 'nomic-embed-1.5-256'
  display_name: ClassVar[str] = 'Nomic Embeddings 1.5 256'
  _matryoshka_dim = 256
3 changes: 3 additions & 0 deletions lilac/signals/default_signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from ..embeddings.cohere import Cohere
from ..embeddings.gte import GTEBase, GTESmall, GTETiny
from ..embeddings.jina import JinaV2Base, JinaV2Small
from ..embeddings.nomic_embed import NomicEmbed15, NomicEmbed15_256
from ..embeddings.openai import OpenAIEmbedding
from ..embeddings.sbert import SBERT
from ..signal import register_signal
Expand Down Expand Up @@ -46,3 +47,5 @@ def register_default_signals() -> None:
register_signal(JinaV2Base, exists_ok=True)

register_signal(BGEM3, exists_ok=True)
register_signal(NomicEmbed15, exists_ok=True)
register_signal(NomicEmbed15_256, exists_ok=True)
67 changes: 21 additions & 46 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,10 @@ jinja2 = "^3.1.3" # Used for directory li
# LLM providers.
cohere = { version = "^4.32", optional = true }
openai = { version = "^1.7.1", optional = true }
sentence-transformers = { version = "^2.2.2", optional = true } # SBERT on-device embeddings.
sentence-transformers = { version = "^2.3.1", optional = true } # SBERT on-device embeddings.
FlagEmbedding = { version = "^1.2.3", optional = true } # bge on-device embeddings.
transformers = { version = "^4.37.2", optional = true } # bge on-device embeddings.
einops = { version = "^0.7.0", optional = true } # Nomic on-device embeddings.

# Gmail source.
email-reply-parser = { version = "^0.5.12", optional = true }
Expand Down Expand Up @@ -100,6 +101,7 @@ all = [
"langsmith",
"llama-hub",
"llama-index",
"nomic",
"openai",
"presidio_analyzer",
"sentence-transformers",
Expand Down Expand Up @@ -135,6 +137,7 @@ text_stats = ["textacy"] # Text statistics.
# Individual embeddings.
gte = ["sentence-transformers"]
bge = ["FlagEmbedding", "transformers"]
nomic = ["sentence-transformers", "einops"]
sbert = ["sentence-transformers"]
cohere = ["cohere"]
openai = ["openai"]
Expand Down
2 changes: 1 addition & 1 deletion web/lib/fastapi_client/models/ConceptSignal.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ export type ConceptSignal = {
/**
* The name of the pre-computed embedding.
*/
embedding: 'cohere' | 'sbert' | 'openai' | 'gte-tiny' | 'gte-small' | 'gte-base' | 'jina-v2-small' | 'jina-v2-base' | 'bge-m3';
embedding: 'cohere' | 'sbert' | 'openai' | 'gte-tiny' | 'gte-small' | 'gte-base' | 'jina-v2-small' | 'jina-v2-base' | 'bge-m3' | 'nomic-embed-1.5-768' | 'nomic-embed-1.5-256';
namespace: string;
concept_name: string;
version?: (number | null);
Expand Down
2 changes: 1 addition & 1 deletion web/lib/fastapi_client/models/SemanticSimilaritySignal.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ export type SemanticSimilaritySignal = {
/**
* The name of the pre-computed embedding.
*/
embedding: 'cohere' | 'sbert' | 'openai' | 'gte-tiny' | 'gte-small' | 'gte-base' | 'jina-v2-small' | 'jina-v2-base' | 'bge-m3';
embedding: 'cohere' | 'sbert' | 'openai' | 'gte-tiny' | 'gte-small' | 'gte-base' | 'jina-v2-small' | 'jina-v2-base' | 'bge-m3' | 'nomic-embed-1.5-768' | 'nomic-embed-1.5-256';
query: string;
/**
* The input type of the query, used for the query embedding.
Expand Down

0 comments on commit c19bafa

Please sign in to comment.