diff --git a/lilac/embeddings/bge.py b/lilac/embeddings/bge.py index 0c578b1e..5ab1560f 100644 --- a/lilac/embeddings/bge.py +++ b/lilac/embeddings/bge.py @@ -1,9 +1,11 @@ """Gegeral Text Embeddings (GTE) model. Open-source model, designed to run on device.""" import gc -from typing import TYPE_CHECKING, ClassVar, Iterator, Optional +from typing import TYPE_CHECKING, ClassVar, Optional from typing_extensions import override +from ..utils import log + if TYPE_CHECKING: from FlagEmbedding import BGEM3FlagModel @@ -15,7 +17,7 @@ from ..splitters.spacy_splitter import clustering_spacy_chunker from ..tasks import TaskExecutionType from .embedding import chunked_compute_embedding -from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device +from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE # See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models. BGE_M3 = 'BAAI/bge-m3' @@ -33,8 +35,11 @@ def _get_and_cache_bge_m3(model_name: str) -> 'BGEM3FlagModel': model = BGEM3FlagModel( 'BAAI/bge-m3', use_fp16=True ) # Setting use_fp16 to True speeds up computation with a slight performance degradation + + log(f'[{model_name}] Using device:', model.device) + + # NOTE: we don't call setup model and device here as this happens internally. return model - return setup_model_device(model, model_name) class BGEM3(TextEmbeddingSignal): @@ -62,7 +67,7 @@ def setup(self) -> None: def compute(self, docs: list[str]) -> list[Optional[Item]]: """Call the embedding function.""" - def _encode(doc): + def _encode(doc: list[str]): # Extract the dense vectors from the model. 
return self._model.encode(doc)['dense_vecs'] @@ -73,16 +78,11 @@ def _encode(doc): _encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker ) - @override - def compute_garden(self, docs: Iterator[str]) -> Iterator[Item]: - raise NotImplementedError('Garden computation is not supported for BGE-M3.') - @override def teardown(self) -> None: if not hasattr(self, '_model'): return - self._model.cpu() del self._model gc.collect() diff --git a/lilac/embeddings/nomic_embed.py b/lilac/embeddings/nomic_embed.py new file mode 100644 index 00000000..77c525a7 --- /dev/null +++ b/lilac/embeddings/nomic_embed.py @@ -0,0 +1,108 @@ +"""Nomic text embeddings (nomic-embed-text-v1.5). Open-source model, designed to run on device.""" +import gc +from typing import TYPE_CHECKING, ClassVar, Optional + +from typing_extensions import override + +if TYPE_CHECKING: + from sentence_transformers import SentenceTransformer + +import functools + +from ..schema import Item +from ..signal import TextEmbeddingSignal +from ..splitters.spacy_splitter import clustering_spacy_chunker +from ..tasks import TaskExecutionType +from .embedding import chunked_compute_embedding +from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device + +# See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models. +NOMIC_EMBED = 'nomic-ai/nomic-embed-text-v1.5' + + +@functools.cache +def _get_and_cache_model(model_name: str) -> 'SentenceTransformer': + try: + from sentence_transformers import SentenceTransformer + except ImportError: + raise ImportError( + 'Could not import the "sentence_transformers" python package. ' + 'Please install it with `pip install "sentence_transformers".' + ) + return setup_model_device(SentenceTransformer(model_name, trust_remote_code=True), model_name) + + +class NomicEmbed15(TextEmbeddingSignal): + """Computes Nomic Embeddings 1.5 full (768 dimensions). + +
This embedding runs on-device. See the [model card](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) + for details. + """ + + name: ClassVar[str] = 'nomic-embed-1.5-768' + display_name: ClassVar[str] = 'Nomic Embeddings 1.5 768' + local_batch_size: ClassVar[int] = SENTENCE_TRANSFORMER_BATCH_SIZE + local_parallelism: ClassVar[int] = 1 + local_strategy: ClassVar[TaskExecutionType] = 'threads' + supports_garden: ClassVar[bool] = False + + _model_name = NOMIC_EMBED + _model: 'SentenceTransformer' + _matryoshka_dim = 768 + + @override + def setup(self) -> None: + self._model = _get_and_cache_model(self._model_name) + + @override + def compute(self, docs: list[str]) -> list[Optional[Item]]: + """Call the embedding function.""" + try: + import torch.nn.functional as F + except ImportError: + raise ImportError( + 'Could not import the "sentence_transformers" python package. ' + 'Please install it with `pip install "sentence_transformers".' + ) + + def _encode(doc: list[str]): + embeddings = self._model.encode(doc, convert_to_tensor=True) + # Extract the dense vectors from the model. + embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],)) + embeddings = embeddings[:, : self._matryoshka_dim] + return embeddings.cpu().numpy() + + # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10. + # The sentence transformer API actually does batching internally, so we pass + # local_batch_size * 16 to allow the library to see all the chunks at once. + return chunked_compute_embedding( + _encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker + ) + + @override + def teardown(self) -> None: + if not hasattr(self, '_model'): + return + + self._model.cpu() + del self._model + gc.collect() + + try: + import torch + + torch.cuda.empty_cache() + except ImportError: + pass + + +class NomicEmbed15_256(NomicEmbed15): + """Computes Nomic Embeddings 1.5 (256 dimensions). + +
This embedding runs on-device. See the [model card](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) + for details. + """ + + name: ClassVar[str] = 'nomic-embed-1.5-256' + display_name: ClassVar[str] = 'Nomic Embeddings 1.5 256' + _matryoshka_dim = 256 diff --git a/lilac/signals/default_signals.py b/lilac/signals/default_signals.py index 451788ec..b85c7193 100644 --- a/lilac/signals/default_signals.py +++ b/lilac/signals/default_signals.py @@ -4,6 +4,7 @@ from ..embeddings.cohere import Cohere from ..embeddings.gte import GTEBase, GTESmall, GTETiny from ..embeddings.jina import JinaV2Base, JinaV2Small +from ..embeddings.nomic_embed import NomicEmbed15, NomicEmbed15_256 from ..embeddings.openai import OpenAIEmbedding from ..embeddings.sbert import SBERT from ..signal import register_signal @@ -46,3 +47,5 @@ def register_default_signals() -> None: register_signal(JinaV2Base, exists_ok=True) register_signal(BGEM3, exists_ok=True) + register_signal(NomicEmbed15, exists_ok=True) + register_signal(NomicEmbed15_256, exists_ok=True) diff --git a/poetry.lock b/poetry.lock index f5717225..df5a2091 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1421,6 +1421,17 @@ files = [ {file = "duckdb-0.9.2.tar.gz", hash = "sha256:3843afeab7c3fc4a4c0b53686a4cc1d9cdbdadcbb468d60fef910355ecafd447"}, ] +[[package]] +name = "einops" +version = "0.7.0" +description = "A new flavour of deep learning operations" +optional = true +python-versions = ">=3.8" +files = [ + {file = "einops-0.7.0-py3-none-any.whl", hash = "sha256:0f3096f26b914f465f6ff3c66f5478f9a5e380bb367ffc6493a68143fbbf1fd1"}, + {file = "einops-0.7.0.tar.gz", hash = "sha256:b2b04ad6081a3b227080c9bf5e3ace7160357ff03043cd66cc5b2319eb7031d1"}, +] + [[package]] name = "email-reply-parser" version = "0.5.12" @@ -6056,25 +6067,26 @@ win32 = ["pywin32"] [[package]] name = "sentence-transformers" -version = "2.2.2" +version = "2.3.1" description = "Multilingual text embeddings" optional = true -python-versions = ">=3.6.0" 
+python-versions = ">=3.8.0" files = [ - {file = "sentence-transformers-2.2.2.tar.gz", hash = "sha256:dbc60163b27de21076c9a30d24b5b7b6fa05141d68cf2553fa9a77bf79a29136"}, + {file = "sentence-transformers-2.3.1.tar.gz", hash = "sha256:d589d85a464f45338cdbdf99ea715f8068e1fb01c582e0bcdbf60bcf3eade6d0"}, + {file = "sentence_transformers-2.3.1-py3-none-any.whl", hash = "sha256:285d6637726c3b002186aa4b8bcace1101364b32671fb605297c4c2636b8190e"}, ] [package.dependencies] -huggingface-hub = ">=0.4.0" +huggingface-hub = ">=0.15.1" nltk = "*" numpy = "*" +Pillow = "*" scikit-learn = "*" scipy = "*" sentencepiece = "*" -torch = ">=1.6.0" -torchvision = "*" +torch = ">=1.11.0" tqdm = "*" -transformers = ">=4.6.0,<5.0.0" +transformers = ">=4.32.0,<5.0.0" [[package]] name = "sentencepiece" @@ -7007,44 +7019,6 @@ typing-extensions = "*" dynamo = ["jinja2"] opt-einsum = ["opt-einsum (>=3.3)"] -[[package]] -name = "torchvision" -version = "0.16.2" -description = "image and video datasets and models for torch deep learning" -optional = true -python-versions = ">=3.8" -files = [ - {file = "torchvision-0.16.2-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:bc86f2800cb2c0c1a09c581409cdd6bff66e62f103dc83fc63f73346264c3756"}, - {file = "torchvision-0.16.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b024bd412df6d3a007dcebf311a894eb3c5c21e1af80d12be382bbcb097a7c3a"}, - {file = "torchvision-0.16.2-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:e89f10f3c8351972b6e3fda95bc3e479ea8dbfc9dfcfd2c32902dbad4ba5cfc5"}, - {file = "torchvision-0.16.2-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:96c7583700112a410bdc4e1e4f118c429dab49c29c9a31a2cc3579bc9b08b19d"}, - {file = "torchvision-0.16.2-cp310-cp310-win_amd64.whl", hash = "sha256:9f4032ebb3277fb07ff6a9b818d50a547fb8fcd89d958cfd9e773322454bb688"}, - {file = "torchvision-0.16.2-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:67b1aaf8b8cb02ce75dd445f291a27c8036a502f8c0aa76e28c37a0faac2e153"}, - {file = 
"torchvision-0.16.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bef30d03e1d1c629761f4dca51d3b7d8a0dc0acce6f4068ab2a1634e8e7b64e0"}, - {file = "torchvision-0.16.2-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:e59cc7b2bd1ab5c0ce4ae382e4e37be8f1c174e8b5de2f6a23c170de9ae28495"}, - {file = "torchvision-0.16.2-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:e130b08cc9b3cc73a6c59d6edf032394a322f9579bfd21d14bc2e1d0999aa758"}, - {file = "torchvision-0.16.2-cp311-cp311-win_amd64.whl", hash = "sha256:8692ab1e48807e9604046a6f4beeb67b523294cee1b00828654bb0df2cfce2b2"}, - {file = "torchvision-0.16.2-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:b82732dcf876a37c852772342aa6ee3480c03bb3e2a802ae109fc5f7e28d26e9"}, - {file = "torchvision-0.16.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4b065143d1a720fe8a9077fd4be35d491f98819ec80b3dbbc3ec64d0b707a906"}, - {file = "torchvision-0.16.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:bc5f274e4ecd1b86062063cdf4fd385a1d39d147a3a2685fbbde9ff08bb720b8"}, - {file = "torchvision-0.16.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:335959c43b371c0474af34c1ef2a52efdc7603c45700d29e4475eeb02984170c"}, - {file = "torchvision-0.16.2-cp38-cp38-win_amd64.whl", hash = "sha256:7fd22d86e08eba321af70cad291020c2cdeac069b00ce88b923ca52e06174769"}, - {file = "torchvision-0.16.2-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:56115268b37f0b75364e3654e47ad9abc66ac34c1f9e5e3dfa89a22d6a40017a"}, - {file = "torchvision-0.16.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:82805f8445b094f9d1e770390ee6cc86855e89955e08ce34af2e2274fc0e5c45"}, - {file = "torchvision-0.16.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:3f4bd5fcbc361476e2e78016636ac7d5509e59d9962521f06eb98e6803898182"}, - {file = "torchvision-0.16.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:8199acdf8ab066a28b84a5b6f4d97b58976d9e164b1acc3a9d14fccfaf74bb3a"}, - {file = "torchvision-0.16.2-cp39-cp39-win_amd64.whl", hash = 
"sha256:41dd4fa9f176d563fe9f1b9adef3b7e582cdfb60ce8c9bc51b094a025be687c9"}, -] - -[package.dependencies] -numpy = "*" -pillow = ">=5.3.0,<8.3.dev0 || >=8.4.dev0" -requests = "*" -torch = "2.1.2" - -[package.extras] -scipy = ["scipy"] - [[package]] name = "tornado" version = "6.4" @@ -8161,6 +8135,7 @@ gte = ["sentence-transformers"] lang-detection = ["langdetect"] langsmith = ["langsmith"] llms = ["openai"] +nomic = ["einops", "sentence-transformers"] openai = ["openai"] pii = ["detect-secrets", "presidio_analyzer"] sbert = ["sentence-transformers"] @@ -8171,4 +8146,4 @@ text-stats = ["textacy"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "94b8c942172e5a02cb89c0fe8f7ea134169bdf5356d5486549bd8facb73ba8aa" +content-hash = "5aec3cf990d020b4c1c66a1ebce20b00778021db680a4c4dd8fdf65aa9fb4295" diff --git a/pyproject.toml b/pyproject.toml index 60171251..ec1927fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,9 +52,10 @@ jinja2 = "^3.1.3" # Used for directory li # LLM providers. cohere = { version = "^4.32", optional = true } openai = { version = "^1.7.1", optional = true } -sentence-transformers = { version = "^2.2.2", optional = true } # SBERT on-device embeddings. +sentence-transformers = { version = "^2.3.1", optional = true } # SBERT on-device embeddings. FlagEmbedding = { version = "^1.2.3", optional = true } # bge on-device embeddings. transformers = { version = "^4.37.2", optional = true } # bge on-device embeddings. +einops = { version = "^0.7.0", optional = true } # Nomic on-device embeddings. # Gmail source. email-reply-parser = { version = "^0.5.12", optional = true } @@ -100,6 +101,7 @@ all = [ "langsmith", "llama-hub", "llama-index", + "nomic", "openai", "presidio_analyzer", "sentence-transformers", @@ -135,6 +137,7 @@ text_stats = ["textacy"] # Text statistics. # Individual embeddings. 
gte = ["sentence-transformers"] bge = ["FlagEmbedding", "transformers"] +nomic = ["sentence-transformers", "einops"] sbert = ["sentence-transformers"] cohere = ["cohere"] openai = ["openai"] diff --git a/web/lib/fastapi_client/models/ConceptSignal.ts b/web/lib/fastapi_client/models/ConceptSignal.ts index e67095f9..19ea8906 100644 --- a/web/lib/fastapi_client/models/ConceptSignal.ts +++ b/web/lib/fastapi_client/models/ConceptSignal.ts @@ -11,7 +11,7 @@ export type ConceptSignal = { /** * The name of the pre-computed embedding. */ - embedding: 'cohere' | 'sbert' | 'openai' | 'gte-tiny' | 'gte-small' | 'gte-base' | 'jina-v2-small' | 'jina-v2-base' | 'bge-m3'; + embedding: 'cohere' | 'sbert' | 'openai' | 'gte-tiny' | 'gte-small' | 'gte-base' | 'jina-v2-small' | 'jina-v2-base' | 'bge-m3' | 'nomic-embed-1.5-768' | 'nomic-embed-1.5-256'; namespace: string; concept_name: string; version?: (number | null); diff --git a/web/lib/fastapi_client/models/SemanticSimilaritySignal.ts b/web/lib/fastapi_client/models/SemanticSimilaritySignal.ts index 7adb8843..ec964d36 100644 --- a/web/lib/fastapi_client/models/SemanticSimilaritySignal.ts +++ b/web/lib/fastapi_client/models/SemanticSimilaritySignal.ts @@ -14,7 +14,7 @@ export type SemanticSimilaritySignal = { /** * The name of the pre-computed embedding. */ - embedding: 'cohere' | 'sbert' | 'openai' | 'gte-tiny' | 'gte-small' | 'gte-base' | 'jina-v2-small' | 'jina-v2-base' | 'bge-m3'; + embedding: 'cohere' | 'sbert' | 'openai' | 'gte-tiny' | 'gte-small' | 'gte-base' | 'jina-v2-small' | 'jina-v2-base' | 'bge-m3' | 'nomic-embed-1.5-768' | 'nomic-embed-1.5-256'; query: string; /** * The input type of the query, used for the query embedding.