Removed the dependency on numpy. #20

Merged
merged 5 commits into from
Dec 9, 2024
20 changes: 12 additions & 8 deletions llmclient/embeddings.py
@@ -1,10 +1,11 @@
 import asyncio
 from abc import ABC, abstractmethod
+from collections import Counter
 from enum import StrEnum
+from itertools import chain
 from typing import Any

 import litellm
-import numpy as np
 import tiktoken
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
@@ -171,13 +172,9 @@ async def embed_documents(self, texts: list[str]) -> list[list[float]]:
         enc_batch = self.enc.encode_ordinary_batch(texts)
         # now get frequency of each token rel to length
         return [
-            (
-                np.bincount([xi % self.ndim for xi in x], minlength=self.ndim).astype(
-                    float
-                )
-                / len(x)
-            ).tolist()
+            [token_counts.get(xi, 0) / len(x) for xi in range(self.ndim)]
             for x in enc_batch
+            if (token_counts := Counter(xi % self.ndim for xi in x))
         ]


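As a sanity check on this hunk, here is a self-contained sketch (token IDs and `ndim` are made up) showing the `Counter`-based comprehension producing the same normalized bucket frequencies the removed `np.bincount` expression computed. Note that the walrus `if` guard also skips empty token lists, which the old code would have turned into a division by zero:

```python
from collections import Counter

# Hypothetical inputs: one text's token IDs, hashed into ndim buckets.
ndim = 8
token_ids = [3, 11, 4, 19, 5]  # 3, 11, and 19 all land in bucket 3 (mod 8)

counts = Counter(token_id % ndim for token_id in token_ids)
embedding = [counts.get(bucket, 0) / len(token_ids) for bucket in range(ndim)]

print(embedding)  # [0.0, 0.0, 0.0, 0.6, 0.2, 0.2, 0.0, 0.0]
```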
@@ -199,7 +196,11 @@ async def embed_documents(self, texts: list[str]) -> list[list[float]]:
         all_embeds = await asyncio.gather(
             *[m.embed_documents(texts) for m in self.models]
         )
-        return np.concatenate(all_embeds, axis=1).tolist()
+
+        return [
+            list(chain.from_iterable(embed_group))
+            for embed_group in zip(*all_embeds, strict=True)
+        ]

     def set_mode(self, mode: EmbeddingModes) -> None:
         # Set mode for all component models
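A minimal equivalence sketch (two hypothetical models, two texts) for the replacement of `np.concatenate(all_embeds, axis=1).tolist()`: `zip(*all_embeds)` regroups the per-model results by text, and `chain.from_iterable` flattens each text's vectors into one concatenated embedding. `strict=True` raises if the models return differing numbers of vectors, a mismatch the numpy version would also have rejected via shape checking:

```python
from itertools import chain

# Hypothetical outputs: two models, each returning one vector per text.
model_a = [[0.1, 0.2], [0.3, 0.4]]
model_b = [[0.5, 0.6], [0.7, 0.8]]
all_embeds = [model_a, model_b]

combined = [
    list(chain.from_iterable(embed_group))  # concatenate one text's vectors
    for embed_group in zip(*all_embeds, strict=True)  # regroup by text
]

print(combined)  # [[0.1, 0.2, 0.5, 0.6], [0.3, 0.4, 0.7, 0.8]]
```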
@@ -217,6 +218,7 @@ class SentenceTransformerEmbeddingModel(EmbeddingModel):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         try:
+            import numpy as np  # noqa: F401
             from sentence_transformers import SentenceTransformer
         except ImportError as exc:
             raise ImportError(
@@ -240,6 +242,8 @@ async def embed_documents(self, texts: list[str]) -> list[list[float]]:
         Returns:
             A list of embedding vectors.
         """
+        import numpy as np
+
         # Extract additional configurations if needed
         batch_size = self.config.get("batch_size", 32)
         device = self.config.get("device", "cpu")
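The remaining numpy usage moves behind the `local` extra via the standard optional-dependency pattern seen in these two hunks: probe the optional packages once in `__init__` so a missing install fails fast with install guidance, then import at function scope where actually needed. A condensed sketch of the pattern (the class name, method, and error message here are illustrative, not the PR's exact code):

```python
class LocalEmbeddingModel:
    """Illustrative sketch of the optional-dependency import pattern."""

    def __init__(self) -> None:
        try:
            # Probe optional deps at construction so a missing extra
            # fails loudly here, not deep inside a later method call.
            import numpy  # noqa: F401
            import sentence_transformers  # noqa: F401
        except ImportError as exc:
            raise ImportError(
                "Install the local extra, e.g. `pip install fh-llm-client[local]`."
            ) from exc

    def vector_norm(self, vector: list[float]) -> float:
        import numpy as np  # imported where used, keeping module import cheap

        return float(np.linalg.norm(vector))
```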
7 changes: 2 additions & 5 deletions pyproject.toml
@@ -26,7 +26,6 @@ dependencies = [
"fhaviary>=0.8.2", # For core namespace
"limits",
"litellm>=1.44", # For LITELLM_LOG addition
"numpy",
"pydantic~=2.0,>=2.10.1,<2.10.2",
"tiktoken>=0.4.0",
"typing-extensions; python_version <= '3.11'", # for typing.override
@@ -40,7 +39,7 @@ requires-python = ">=3.11"

 [project.optional-dependencies]
 dev = [
-    "fh-llm-client[image,local]",
+    "fh-llm-client[local]",
     "fhaviary[xml]",
     "ipython>=8",  # Pin to keep recent
     "mypy>=1.8",  # Pin for mutable-override
@@ -58,10 +57,8 @@ dev = [
"python-dotenv",
"refurb>=2", # Pin to keep recent
]
image = [
"Pillow",
]
local = [
"numpy",
"sentence-transformers",
]

18 changes: 7 additions & 11 deletions uv.lock

Some generated files are not rendered by default.