Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Delaying of state mutation in Docs.aadd_texts #264

Merged
merged 1 commit into from
Mar 30, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 30 additions & 19 deletions paperqa/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,36 +403,47 @@ def add_texts(
loop = get_loop()
return loop.run_until_complete(self.aadd_texts(texts, doc))

async def aadd_texts(
self,
texts: list[Text],
doc: Doc,
) -> bool:
"""Add chunked texts to the collection. This is useful if you have already chunked the texts yourself.
async def aadd_texts(self, texts: list[Text], doc: Doc) -> bool:
"""
Add chunked texts to the collection.

Returns True if the document was added, False if it was already in the collection.
NOTE: this is useful if you have already chunked the texts yourself.

Returns:
True if the doc was added, otherwise False if already in the collection.
"""
if doc.dockey in self.docs:
return False
if len(texts) == 0:
if not texts:
raise ValueError("No texts to add.")
if doc.docname in self.docnames:
new_docname = self._get_unique_name(doc.docname)
for t in texts:
t.name = t.name.replace(doc.docname, new_docname)
doc.docname = new_docname
if texts[0].embedding is None:
text_embeddings = await self.texts_index.embedding_model.embed_documents(
self._embedding_client, [t.text for t in texts]
# 1. Calculate text embeddings if not already present, but don't set them into
# the texts until we've set up the Doc's embedding, so callers can retry upon
# OpenAI rate limit errors
text_embeddings: list[list[float]] | None = (
await self.texts_index.embedding_model.embed_documents(
self._embedding_client, texts=[t.text for t in texts]
)
for i, t in enumerate(texts):
t.embedding = text_embeddings[i]
if texts[0].embedding is None
else None
)
# 2. Set the Doc's embedding to be the Doc's citation embedded
if doc.embedding is None:
doc.embedding = (
await self.docs_index.embedding_model.embed_documents(
self._embedding_client, [doc.citation]
self._embedding_client, texts=[doc.citation]
)
)[0]
# 3. Now we can set the text embeddings
if text_embeddings is not None:
for t, t_embedding in zip(texts, text_embeddings, strict=True):
t.embedding = t_embedding
# 4. Update texts and the Doc's name
if doc.docname in self.docnames:
new_docname = self._get_unique_name(doc.docname)
for t in texts:
t.name = t.name.replace(doc.docname, new_docname)
doc.docname = new_docname
# 5. Index remaining updates
if not self.jit_texts_index:
self.texts_index.add_texts_and_embeddings(texts)
self.docs_index.add_texts_and_embeddings([doc])
Expand Down
Loading