Merge pull request #10 from docugami/tjaffri/ci

Added eval notebook and dataset
docugami · Dec 6, 2023 · f8c9e11 · f8c9e11
2 parents 2edf148 + 885692a
commit f8c9e11
Show file tree

Hide file tree

Showing 17 changed files with 705 additions and 42 deletions.
diff --git a/docugami_kg_rag/config.py b/docugami_kg_rag/config.py
@@ -72,11 +72,11 @@ class LocalIndexState:
 
 # Lengths for the loader are in terms of characters, 1 token ~= 4 chars in English
 # Reference: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
-MAX_CHUNK_TEXT_LENGTH = 1024 * 24  # ~6k tokens
-MIN_CHUNK_TEXT_LENGTH = 1024 * 8  # ~2k tokens
+MAX_CHUNK_TEXT_LENGTH = 1024 * 28  # ~7k tokens
+MIN_CHUNK_TEXT_LENGTH = 1024 * 1  # ~1k chars, i.e. ~256 tokens
 SUB_CHUNK_TABLES = False
 INCLUDE_XML_TAGS = True
-PARENT_HIERARCHY_LEVELS = 1
-RETRIEVER_K = 10
+PARENT_HIERARCHY_LEVELS = 4
+RETRIEVER_K = 6
 
 BATCH_SIZE = 16
diff --git a/docugami_kg_rag/helpers/prompts.py b/docugami_kg_rag/helpers/prompts.py
@@ -9,16 +9,23 @@
 
 All your answers must contain citations to help the user understand how you created the citation, specifically:
 
-- If the given context contains the names of document(s), make sure you include that in your answer as 
-  a citation, e.g. include "\\n\\nSOURCE(S): foo.pdf, bar.pdf" at the end of your answer.
+- If the given context contains the names of document(s), make sure you include the document you got the
+  answer from as a citation, e.g. include "\\n\\nSOURCE(S): foo.pdf, bar.pdf" at the end of your answer.
 - If the answer was generated via a SQL Query, make sure you include the SQL query in your answer as
   a citation, e.g. include "\\n\\nSOURCE(S): SELECT AVG('square footage') from Leases". The SQL query should be
-  in the agent scratchpad provided.
+  in the agent scratchpad provided, if you are using an agent.
 - Make sure there an actual answer if you show a SOURCE citation, i.e. make sure you don't show only
   a bare citation with no actual answer. 
 
 """
 
+HUMAN_MESSAGE_TEMPLATE = """{context}
+
+Using the context above, which can include text and tables, answer the following question.
+
+Question: {question}
+"""
+
 CREATE_DIRECT_RETRIEVAL_TOOL_DESCRIPTION_PROMPT = """Here is a snippet from a sample document of type {docset_name}:
 
 {document}

diff --git a/docugami_kg_rag/helpers/retrieval.py b/docugami_kg_rag/helpers/retrieval.py
@@ -3,7 +3,7 @@
 
 from langchain.agents.agent_toolkits import create_retriever_tool
 from langchain.prompts import ChatPromptTemplate
-from langchain.schema import Document, StrOutputParser
+from langchain.schema import BaseRetriever, Document, StrOutputParser
 from langchain.tools.base import BaseTool
 from langchain.vectorstores import Chroma
 
@@ -25,6 +25,22 @@
 )
 
 
+def get_retriever_for_docset(docset_state: LocalIndexState) -> BaseRetriever:
+    """
+    Gets a retriever for a docset. Chunks are searched in the Chroma vector store; parent
+    chunks and full-document summaries come from the stores held in the local index state.
+    """
+    chunk_vectorstore = Chroma(persist_directory=CHROMA_DIRECTORY, embedding_function=EMBEDDINGS)
+
+    return FusedSummaryRetriever(
+        vectorstore=chunk_vectorstore,
+        parent_doc_store=docset_state.chunks_by_id,
+        full_doc_summary_store=docset_state.full_doc_summaries_by_id,
+        search_kwargs={"k": RETRIEVER_K},
+        search_type=SearchType.mmr,
+    )
+
+
 def docset_name_to_direct_retriever_tool_function_name(name: str) -> str:
     """
     Converts a docset name to a direct retriever tool function name.
@@ -75,19 +91,10 @@ def chunks_to_direct_retriever_tool_description(name: str, chunks: List[Document
 
 def get_retrieval_tool_for_docset(docset_state: LocalIndexState) -> Optional[BaseTool]:
     """
-    Chunks are in the vector store, and full documents are in the store inside the local state
+    Gets a retrieval tool for an agent.
     """
 
-    chunk_vectorstore = Chroma(persist_directory=CHROMA_DIRECTORY, embedding_function=EMBEDDINGS)
-
-    retriever = FusedSummaryRetriever(
-        vectorstore=chunk_vectorstore,
-        parent_doc_store=docset_state.chunks_by_id,
-        full_doc_summary_store=docset_state.full_doc_summaries_by_id,
-        search_kwargs={"k": RETRIEVER_K},
-        search_type=SearchType.mmr,
-    )
-
+    retriever = get_retriever_for_docset(docset_state=docset_state)
     return create_retriever_tool(
         retriever=retriever,
         name=docset_state.retrieval_tool_function_name,