Skip to content

Commit

Permalink
Added batching to embed documents (#262)
Browse files Browse the repository at this point in the history
  • Loading branch information
whitead authored Mar 30, 2024
1 parent 57631a2 commit 7ce17f0
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 6 deletions.
16 changes: 11 additions & 5 deletions paperqa/llms.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,17 +87,23 @@ def process_llm_config(


async def embed_documents(
    client: AsyncOpenAI, texts: list[str], embedding_model: str, batch_size: int = 16
) -> list[list[float]]:
    """Embed a list of documents with batching.

    The texts are split into consecutive slices of at most ``batch_size``
    items, and one ``client.embeddings.create`` request is awaited per slice,
    one after another.

    Args:
        client: Client exposing an awaitable ``embeddings.create`` method.
        texts: Documents to embed.
        embedding_model: Model name forwarded as ``model`` on every request.
        batch_size: Maximum number of texts sent per request. Must be >= 1.

    Returns:
        One embedding per returned data item, concatenated in batch order
        (within a batch, in the order the response lists them). An empty
        ``texts`` yields an empty list without issuing any request.

    Raises:
        ValueError: If ``client`` is None or ``batch_size`` is less than 1.
    """
    if client is None:
        raise ValueError(
            "Your client is None - did you forget to set it after pickling?"
        )
    # A negative step would make range() empty and silently return no
    # embeddings at all; a zero step fails with an opaque range() error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}.")
    embeddings: list[list[float]] = []
    for start in range(0, len(texts), batch_size):
        response = await client.embeddings.create(
            model=embedding_model,
            input=texts[start : start + batch_size],
            encoding_format="float",
        )
        embeddings.extend(e.embedding for e in response.data)
    return embeddings


class EmbeddingModel(ABC, BaseModel):
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ name = "paper-qa"
readme = "README.md"
requires-python = ">=3.8"
urls = {repository = "https://github.com/whitead/paper-qa"}
version = "4.4.0"
version = "4.4.1"

[tool.codespell]
check-filenames = true
Expand Down

0 comments on commit 7ce17f0

Please sign in to comment.