From 9d9b2e1d18e60669b02139e0e5917089cf6234a4 Mon Sep 17 00:00:00 2001
From: Daniel Smilkov
Date: Tue, 27 Feb 2024 10:11:44 -0500
Subject: [PATCH] save

---
 lilac/concepts/concept.py       |  4 +++-
 lilac/embeddings/bge.py         |  8 ++++++--
 lilac/embeddings/cohere.py      |  8 ++++++--
 lilac/embeddings/gte.py         | 12 +++++++++---
 lilac/embeddings/nomic_embed.py |  9 +++++++--
 lilac/embeddings/openai.py      |  8 ++++++--
 lilac/embeddings/sbert.py       |  8 ++++++--
 7 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/lilac/concepts/concept.py b/lilac/concepts/concept.py
index 7580a369..de353761 100644
--- a/lilac/concepts/concept.py
+++ b/lilac/concepts/concept.py
@@ -66,8 +66,10 @@ class ExampleIn(BaseModel):
 
   @field_validator('text')
   @classmethod
-  def parse_text(cls, text: str) -> str:
+  def parse_text(cls, text: Optional[str]) -> Optional[str]:
     """Fixes surrogate errors in text: https://github.com/ijl/orjson/blob/master/README.md#str ."""
+    if not text:
+      return None
     return text.encode('utf-8', 'replace').decode('utf-8')
 
 
diff --git a/lilac/embeddings/bge.py b/lilac/embeddings/bge.py
index 696565ea..a65d51ff 100644
--- a/lilac/embeddings/bge.py
+++ b/lilac/embeddings/bge.py
@@ -1,9 +1,10 @@
 """Gegeral Text Embeddings (GTE) model. Open-source model, designed to run on device."""
 import gc
-from typing import TYPE_CHECKING, ClassVar, Optional
+from typing import TYPE_CHECKING, Callable, ClassVar, Optional, cast
 
 from typing_extensions import override
 
+from ..splitters.chunk_splitter import TextChunk
 from ..utils import log
 
 if TYPE_CHECKING:
@@ -69,7 +70,10 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
-    chunker = clustering_spacy_chunker if self._split else identity_chunker
+    chunker = cast(
+      Callable[[str], list[TextChunk]],
+      clustering_spacy_chunker if self._split else identity_chunker,
+    )
     return chunked_compute_embedding(
       lambda docs: self._model.encode(docs)['dense_vecs'],
       docs,
diff --git a/lilac/embeddings/cohere.py b/lilac/embeddings/cohere.py
index 7559bc31..4fcff463 100644
--- a/lilac/embeddings/cohere.py
+++ b/lilac/embeddings/cohere.py
@@ -1,5 +1,5 @@
 """Cohere embeddings."""
-from typing import TYPE_CHECKING, ClassVar, Optional
+from typing import TYPE_CHECKING, Callable, ClassVar, Optional, cast
 
 import numpy as np
 from typing_extensions import override
@@ -7,6 +7,7 @@
 from ..env import env
 from ..schema import Item
 from ..signal import TextEmbeddingSignal
+from ..splitters.chunk_splitter import TextChunk
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
 from .embedding import chunked_compute_embedding, identity_chunker
@@ -65,5 +66,8 @@ def _embed_fn(docs: list[str]) -> list[np.ndarray]:
         ).embeddings
       ]
 
-    chunker = clustering_spacy_chunker if self._split else identity_chunker
+    chunker = cast(
+      Callable[[str], list[TextChunk]],
+      clustering_spacy_chunker if self._split else identity_chunker,
+    )
     return chunked_compute_embedding(_embed_fn, docs, self.local_batch_size, chunker=chunker)
diff --git a/lilac/embeddings/gte.py b/lilac/embeddings/gte.py
index 5e4d7192..074d57e3 100644
--- a/lilac/embeddings/gte.py
+++ b/lilac/embeddings/gte.py
@@ -1,7 +1,7 @@
 """Gegeral Text Embeddings (GTE) model. Open-source model, designed to run on device."""
 import gc
 import itertools
-from typing import TYPE_CHECKING, ClassVar, Iterator, Optional
+from typing import TYPE_CHECKING, Callable, ClassVar, Iterator, Optional, cast
 
 import modal
 from typing_extensions import override
@@ -69,7 +69,10 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
-    chunker = clustering_spacy_chunker if self._split else identity_chunker
+    chunker = cast(
+      Callable[[str], list[TextChunk]],
+      clustering_spacy_chunker if self._split else identity_chunker,
+    )
     return chunked_compute_embedding(
       self._model.encode, docs, self.local_batch_size * 16, chunker=chunker
     )
@@ -79,7 +82,10 @@ def compute_garden(self, docs: Iterator[str]) -> Iterator[Item]:
     # Trim the docs to the max context size.
     trimmed_docs = (doc[:GTE_CONTEXT_SIZE] for doc in docs)
 
-    chunker = clustering_spacy_chunker if self._split else identity_chunker
+    chunker = cast(
+      Callable[[str], list[TextChunk]],
+      clustering_spacy_chunker if self._split else identity_chunker,
+    )
     text_chunks: Iterator[tuple[int, TextChunk]] = (
       (i, chunk) for i, doc in enumerate(trimmed_docs) for chunk in chunker(doc)
     )
diff --git a/lilac/embeddings/nomic_embed.py b/lilac/embeddings/nomic_embed.py
index 32c52e01..118f7974 100644
--- a/lilac/embeddings/nomic_embed.py
+++ b/lilac/embeddings/nomic_embed.py
@@ -1,10 +1,12 @@
 """Gegeral Text Embeddings (GTE) model. Open-source model, designed to run on device."""
 import gc
-from typing import TYPE_CHECKING, ClassVar, Optional
+from typing import TYPE_CHECKING, Callable, ClassVar, Optional, cast
 
 import numpy as np
 from typing_extensions import override
 
+from ..splitters.chunk_splitter import TextChunk
+
 if TYPE_CHECKING:
   from sentence_transformers import SentenceTransformer
 
@@ -76,7 +78,10 @@ def _encode(doc: list[str]) -> list[np.ndarray]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
-    chunker = clustering_spacy_chunker if self._split else identity_chunker
+    chunker = cast(
+      Callable[[str], list[TextChunk]],
+      clustering_spacy_chunker if self._split else identity_chunker,
+    )
     return chunked_compute_embedding(_encode, docs, self.local_batch_size * 16, chunker=chunker)
 
   @override
diff --git a/lilac/embeddings/openai.py b/lilac/embeddings/openai.py
index 318cfe4f..0da8653c 100644
--- a/lilac/embeddings/openai.py
+++ b/lilac/embeddings/openai.py
@@ -1,5 +1,5 @@
 """OpenAI embeddings."""
-from typing import ClassVar, Optional
+from typing import Callable, ClassVar, Optional, cast
 
 import numpy as np
 from tenacity import retry, stop_after_attempt, wait_random_exponential
@@ -8,6 +8,7 @@
 from ..env import env
 from ..schema import Item
 from ..signal import TextEmbeddingSignal
+from ..splitters.chunk_splitter import TextChunk
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
 from .embedding import chunked_compute_embedding, identity_chunker
@@ -92,5 +93,8 @@ def embed_fn(texts: list[str]) -> list[np.ndarray]:
       )
       return [np.array(embedding.embedding, dtype=np.float32) for embedding in response.data]
 
-    chunker = clustering_spacy_chunker if self._split else identity_chunker
+    chunker = cast(
+      Callable[[str], list[TextChunk]],
+      clustering_spacy_chunker if self._split else identity_chunker,
+    )
     return chunked_compute_embedding(embed_fn, docs, self.local_batch_size, chunker=chunker)
diff --git a/lilac/embeddings/sbert.py b/lilac/embeddings/sbert.py
index 64562302..91c586e4 100644
--- a/lilac/embeddings/sbert.py
+++ b/lilac/embeddings/sbert.py
@@ -1,8 +1,9 @@
 """Sentence-BERT embeddings. Open-source models, designed to run on device."""
-from typing import TYPE_CHECKING, ClassVar, Optional
+from typing import TYPE_CHECKING, Callable, ClassVar, Optional, cast
 
 from typing_extensions import override
 
+from ..splitters.chunk_splitter import TextChunk
 from ..tasks import TaskExecutionType
 
 if TYPE_CHECKING:
@@ -47,7 +48,10 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
-    chunker = clustering_spacy_chunker if self._split else identity_chunker
+    chunker = cast(
+      Callable[[str], list[TextChunk]],
+      clustering_spacy_chunker if self._split else identity_chunker,
+    )
     return chunked_compute_embedding(
       self._model.encode, docs, self.local_batch_size * 16, chunker=chunker
    )
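
Note on the recurring change: every embedding file replaces the bare ternary with a
cast to Callable[[str], list[TextChunk]]. A minimal sketch of the pattern follows; the
chunker bodies, the filter_short parameter, and the compute_stub consumer are
hypothetical stand-ins, not lilac's real implementations. The point is only that when
the two branches of the ternary have different signatures, the type checker infers a
union (or, in older mypy versions, a join) of the two function types rather than the
precise callable type the embedding helper expects, and the cast pins it down.

from typing import Callable, Optional, cast

# Assumed shape of lilac's TextChunk alias: (chunk_text, (start_offset, end_offset)).
TextChunk = tuple[str, tuple[int, int]]


def identity_chunker(text: Optional[str]) -> list[TextChunk]:
  """Hypothetical stand-in: treat the whole document as a single chunk."""
  if not text:
    return []
  return [(text, (0, len(text)))]


def clustering_spacy_chunker(text: str, filter_short: int = 4) -> list[TextChunk]:
  """Hypothetical stand-in for the spaCy chunker; the extra default widens its type."""
  return [(text, (0, len(text)))]


def compute_stub(split: bool) -> list[TextChunk]:
  """Mirrors the patched compute() methods: pin the ternary to one callable type."""
  chunker = cast(
    Callable[[str], list[TextChunk]],
    clustering_spacy_chunker if split else identity_chunker,
  )
  return chunker('Hello world. Second sentence.')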
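
The concept.py change is easiest to see in isolation: parse_text now accepts
Optional[str] and short-circuits, so None and the empty string return None instead of
reaching .encode() (which would raise AttributeError on None). A standalone sketch of
the patched validator body, copied out of the Pydantic class for illustration:

from typing import Optional


def parse_text(text: Optional[str]) -> Optional[str]:
  """Fixes surrogate errors in text; empty or missing text passes through as None."""
  if not text:
    return None
  return text.encode('utf-8', 'replace').decode('utf-8')


assert parse_text(None) is None
assert parse_text('') is None
assert parse_text('ok\ud800') == 'ok?'  # the lone surrogate is replaced, not raised on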