From bdc10421216246b5e49dd862d9ab021c4067864f Mon Sep 17 00:00:00 2001 From: Taqi Jaffri Date: Thu, 7 Dec 2023 13:00:32 -0800 Subject: [PATCH] fix chroma indexing overwrite issue --- README.md | 1 + docugami_kg_rag/helpers/indexing.py | 34 +++++++++++++++++++++-------- index.py | 9 +++++--- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index ba735b9..90a55e6 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ Before you can run your app, you need to build your index in Chroma. See [index. Indexing in this template uses the Docugami Loader for LangChain to create semantic chunks out of your documents. Refer to this [documentation](https://python.langchain.com/docs/integrations/document_loaders/docugami) for details. +Note that if you previously ran indexing for the same docset, the index will not be recreated. To force the index to be recreated (e.g. if you have new docs in the docset or changed your chunking config parameters), specify the overwrite flag: `poetry run index.py --overwrite`. ### Creating app To use this package, you should first have the LangChain CLI installed: diff --git a/docugami_kg_rag/helpers/indexing.py b/docugami_kg_rag/helpers/indexing.py index dfd0316..82c23d0 100644 --- a/docugami_kg_rag/helpers/indexing.py +++ b/docugami_kg_rag/helpers/indexing.py @@ -8,6 +8,7 @@ from langchain.schema import Document from langchain.storage.in_memory import InMemoryStore from langchain.vectorstores import Chroma +import chromadb from docugami_kg_rag.config import ( CHROMA_DIRECTORY, @@ -73,21 +74,36 @@ def update_local_index( pickle.dump(state, file) -def populate_chroma_index(docset_id: str, chunks: List[Document]): +def populate_chroma_index(docset_id: str, chunks: List[Document], overwrite=False): """ - Create index if it does not exist + Create index if it does not exist, delete and overwrite if overwrite is specified. 
""" - print(f"Creating index for {docset_id}...") + persistent_client = chromadb.PersistentClient(path=CHROMA_DIRECTORY) + collections = persistent_client.list_collections() + matching_collection = None + for c in collections: + if c.name == docset_id: + matching_collection = c + + if matching_collection: + print(f"Chroma collection already exists for {docset_id}.") + if overwrite == True: + print(f"Overwrite is {overwrite}, deleting existing index") + persistent_client.delete_collection(docset_id) + else: + print(f"Overwrite is {overwrite}, will just reuse existing index (any new docs will not be added)") + return + + print(f"Embedding documents into chroma for {docset_id}...") - # Reset the collection - chroma = Chroma.from_documents(chunks, EMBEDDINGS, persist_directory=CHROMA_DIRECTORY) - chroma.persist() + langchain_chroma = Chroma.from_documents(documents=chunks, embedding=EMBEDDINGS, persist_directory=CHROMA_DIRECTORY) + langchain_chroma.persist() - print(f"Done embedding documents to chroma collection {docset_id}!") + print(f"Done embedding documents into chroma for {docset_id}!") -def index_docset(docset_id: str, name: str): +def index_docset(docset_id: str, name: str, overwrite=False): """ Indexes the given docset """ @@ -161,4 +177,4 @@ def index_docset(docset_id: str, name: str): report_details=report_details, ) - populate_chroma_index(docset_id, list(chunk_summaries_by_id.values())) + populate_chroma_index(docset_id, chunks=list(chunk_summaries_by_id.values()), overwrite=overwrite) diff --git a/index.py b/index.py index 19b063b..7baba9f 100644 --- a/index.py +++ b/index.py @@ -6,8 +6,11 @@ docugami_client = Docugami() +app = typer.Typer() -def main(): + +@app.command() +def main(overwrite: bool = False): docsets_response = docugami_client.docsets.list() if not docsets_response or not docsets_response.docsets: @@ -32,7 +35,7 @@ def main(): if not docset.id or not docset.name: raise Exception(f"Docset must have ID as well as Name: {docset}") - 
index_docset(docset.id, docset.name) + index_docset(docset.id, docset.name, overwrite) if __name__ == "__main__": @@ -40,4 +43,4 @@ def main(): # This code will only run if a debugger is attached index_docset(docset_id="clajbjkbnuye", name="Semi-Structured") else: - main() + app()