Skip to content

Commit

Permalink
Merge pull request #10 from docugami/tjaffri/ci
Browse files Browse the repository at this point in the history
Added eval notebook and dataset
  • Loading branch information
tjaffri authored Dec 6, 2023
2 parents 2edf148 + 885692a commit f8c9e11
Show file tree
Hide file tree
Showing 17 changed files with 705 additions and 42 deletions.
8 changes: 4 additions & 4 deletions docugami_kg_rag/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,11 @@ class LocalIndexState:

# Lengths for the loader are in terms of characters, 1 token ~= 4 chars in English
# Reference: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
MAX_CHUNK_TEXT_LENGTH = 1024 * 24 # ~6k tokens
MIN_CHUNK_TEXT_LENGTH = 1024 * 8 # ~2k tokens
MAX_CHUNK_TEXT_LENGTH = 1024 * 28 # ~7k tokens
MIN_CHUNK_TEXT_LENGTH = 1024 * 1 # ~256 tokens (1024 chars at ~4 chars per token)
SUB_CHUNK_TABLES = False
INCLUDE_XML_TAGS = True
PARENT_HIERARCHY_LEVELS = 1
RETRIEVER_K = 10
PARENT_HIERARCHY_LEVELS = 4
RETRIEVER_K = 6

BATCH_SIZE = 16
13 changes: 10 additions & 3 deletions docugami_kg_rag/helpers/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,23 @@
All your answers must contain citations to help the user understand how you created the citation, specifically:
- If the given context contains the names of document(s), make sure you include that in your answer as
a citation, e.g. include "\\n\\nSOURCE(S): foo.pdf, bar.pdf" at the end of your answer.
- If the given context contains the names of document(s), make sure you include the document you got the
answer from as a citation, e.g. include "\\n\\nSOURCE(S): foo.pdf, bar.pdf" at the end of your answer.
- If the answer was generated via a SQL Query, make sure you include the SQL query in your answer as
a citation, e.g. include "\\n\\nSOURCE(S): SELECT AVG('square footage') from Leases". The SQL query should be
in the agent scratchpad provided.
in the agent scratchpad provided, if you are using an agent.
- Make sure there an actual answer if you show a SOURCE citation, i.e. make sure you don't show only
a bare citation with no actual answer.
"""

# Human-turn prompt template: the {context} placeholder comes first, followed by
# the instruction and the {question} placeholder to be answered from that context.
HUMAN_MESSAGE_TEMPLATE = """{context}
Using the context above, which can include text and tables, answer the following question.
Question: {question}
"""

CREATE_DIRECT_RETRIEVAL_TOOL_DESCRIPTION_PROMPT = """Here is a snippet from a sample document of type {docset_name}:
{document}
Expand Down
31 changes: 19 additions & 12 deletions docugami_kg_rag/helpers/retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document, StrOutputParser
from langchain.schema import BaseRetriever, Document, StrOutputParser
from langchain.tools.base import BaseTool
from langchain.vectorstores import Chroma

Expand All @@ -25,6 +25,22 @@
)


def get_retriever_for_docset(docset_state: LocalIndexState) -> BaseRetriever:
    """
    Builds the retriever for a single docset.

    Chunks are looked up in the Chroma vector store persisted under CHROMA_DIRECTORY
    (opened with EMBEDDINGS), while the parent chunks and full-document summaries
    come from the stores held on the given local index state.

    Args:
        docset_state: Local index state supplying `chunks_by_id` and
            `full_doc_summaries_by_id`.

    Returns:
        A FusedSummaryRetriever using `SearchType.mmr` with k set to RETRIEVER_K.
    """
    retriever = FusedSummaryRetriever(
        # The vector store is opened first, before any state attribute is read.
        vectorstore=Chroma(
            persist_directory=CHROMA_DIRECTORY,
            embedding_function=EMBEDDINGS,
        ),
        parent_doc_store=docset_state.chunks_by_id,
        full_doc_summary_store=docset_state.full_doc_summaries_by_id,
        search_kwargs={"k": RETRIEVER_K},
        search_type=SearchType.mmr,
    )
    return retriever


def docset_name_to_direct_retriever_tool_function_name(name: str) -> str:
"""
Converts a docset name to a direct retriever tool function name.
Expand Down Expand Up @@ -75,19 +91,10 @@ def chunks_to_direct_retriever_tool_description(name: str, chunks: List[Document

def get_retrieval_tool_for_docset(docset_state: LocalIndexState) -> Optional[BaseTool]:
"""
Chunks are in the vector store, and full documents are in the store inside the local state
Gets a retrieval tool for an agent.
"""

chunk_vectorstore = Chroma(persist_directory=CHROMA_DIRECTORY, embedding_function=EMBEDDINGS)

retriever = FusedSummaryRetriever(
vectorstore=chunk_vectorstore,
parent_doc_store=docset_state.chunks_by_id,
full_doc_summary_store=docset_state.full_doc_summaries_by_id,
search_kwargs={"k": RETRIEVER_K},
search_type=SearchType.mmr,
)

retriever = get_retriever_for_docset(docset_state=docset_state)
return create_retriever_tool(
retriever=retriever,
name=docset_state.retrieval_tool_function_name,
Expand Down
Loading

0 comments on commit f8c9e11

Please sign in to comment.