Skip to content

Commit

Permalink
fix chroma indexing overwrite issue
Browse files Browse the repository at this point in the history
  • Loading branch information
Taqi Jaffri committed Dec 7, 2023
1 parent 48699ae commit bdc1042
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 12 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Before you can run your app, you need to build your index in Chroma. See [index.

Indexing in this template uses the Docugami Loader for LangChain to create semantic chunks out of your documents. Refer to this [documentation](https://python.langchain.com/docs/integrations/document_loaders/docugami) for details.

Note that if you previously ran indexing for the same docset, the index will not be recreated. If you want to force the index to be recreated (e.g. if you have added new docs to the docset or changed your chunking config parameters), run `poetry run index.py --overwrite`.

### Creating app
To use this package, you should first have the LangChain CLI installed:
Expand Down
34 changes: 25 additions & 9 deletions docugami_kg_rag/helpers/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from langchain.schema import Document
from langchain.storage.in_memory import InMemoryStore
from langchain.vectorstores import Chroma
import chromadb

from docugami_kg_rag.config import (
CHROMA_DIRECTORY,
Expand Down Expand Up @@ -73,21 +74,36 @@ def update_local_index(
pickle.dump(state, file)


def populate_chroma_index(docset_id: str, chunks: List[Document], overwrite: bool = False) -> None:
    """
    Create the Chroma index for a docset if it does not exist yet.

    The chunks are embedded into a Chroma collection named after the docset,
    persisted under CHROMA_DIRECTORY.

    Args:
        docset_id: ID of the docset; also used as the Chroma collection name.
        chunks: Documents to embed into the collection.
        overwrite: When True, an existing collection for this docset is
            deleted and rebuilt from ``chunks``. When False (the default), an
            existing collection is left untouched and the function returns
            without embedding anything.
    """

    persistent_client = chromadb.PersistentClient(path=CHROMA_DIRECTORY)
    collection_exists = any(c.name == docset_id for c in persistent_client.list_collections())

    if collection_exists:
        print(f"Chroma collection already exists for {docset_id}.")
        if overwrite:
            print(f"Overwrite is {overwrite}, deleting existing index")
            persistent_client.delete_collection(docset_id)
        else:
            print(f"Overwrite is {overwrite}, will just reuse existing index (any new docs will not be added)")
            return

    print(f"Embedding documents into chroma for {docset_id}...")

    # The collection must be created under the docset's name. Without an
    # explicit collection_name the documents go into LangChain's default
    # collection, so the existence check above would never match and
    # overwrite would never delete anything.
    # NOTE(review): any code that reads this index must open the collection
    # with collection_name=docset_id as well -- confirm the retrieval side.
    langchain_chroma = Chroma.from_documents(
        documents=chunks,
        embedding=EMBEDDINGS,
        collection_name=docset_id,
        persist_directory=CHROMA_DIRECTORY,
    )
    langchain_chroma.persist()

    print(f"Done embedding documents into chroma for {docset_id}!")


def index_docset(docset_id: str, name: str):
def index_docset(docset_id: str, name: str, overwrite=False):
"""
Indexes the given docset
"""
Expand Down Expand Up @@ -161,4 +177,4 @@ def index_docset(docset_id: str, name: str):
report_details=report_details,
)

populate_chroma_index(docset_id, list(chunk_summaries_by_id.values()))
populate_chroma_index(docset_id, chunks=list(chunk_summaries_by_id.values()), overwrite=overwrite)
9 changes: 6 additions & 3 deletions index.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@

docugami_client = Docugami()

app = typer.Typer()

def main():

@app.command()
def main(overwrite: bool = False):
docsets_response = docugami_client.docsets.list()

if not docsets_response or not docsets_response.docsets:
Expand All @@ -32,12 +35,12 @@ def main():
if not docset.id or not docset.name:
raise Exception(f"Docset must have ID as well as Name: {docset}")

index_docset(docset.id, docset.name)
index_docset(docset.id, docset.name, overwrite)


if __name__ == "__main__":
    if sys.gettrace():
        # This code will only run if a debugger is attached
        index_docset(docset_id="clajbjkbnuye", name="Semi-Structured")
    else:
        # Go through the Typer app (not main() directly) so that command-line
        # options such as --overwrite are parsed and passed to main().
        app()

0 comments on commit bdc1042

Please sign in to comment.