doc: add document loading notebook example
AlexisVLRT committed Mar 19, 2024
1 parent 0ca93d0 commit a233d1a
Showing 9 changed files with 151 additions and 88 deletions.
36 changes: 9 additions & 27 deletions README.md
@@ -17,27 +17,25 @@ This is a starter kit to deploy a modularizable RAG locally or on the cloud (or

## Quickstart

This quickstart will guide you through the steps to serve a RAG fully locally. You will run the API backend and frontend on your machine, which should allow you to run your first queries against the RAG.
This quickstart will guide you through the steps to serve the RAG and load a few documents.

You will run both the backend and the frontend on your machine.

For this example, we will be using GPT-4, the `BAAI/bge-base-en-v1.5` embedding model, and Chroma for the vector store.

For this example, we will be using the `tinyllama` LLM, the `BAAI/bge-base-en-v1.5` embedding model, and Chroma for the vector store. This makes the setup fully local and independent of any external API (and thus free). However, the relevance of the answers will not be impressive.

Duration: ~15 minutes.

### Pre-requisites

- Ollama, to serve the LLM locally ([Download and install](https://ollama.com/))
- A few GB of disk space to host the models
- An `OPENAI_API_KEY` for the Artefact GPT-4 deployment on Azure. Contact alexis.vialaret@artefact.com if you do not have one.
- A few GB of disk space
- Tested with Python 3.11 (may work with other versions)

### Run using docker compose

If you have Docker installed and running, you can run the whole RAG app with it. [Otherwise, skip to the "Run directly" section](#run-directly)

Start the LLM server:
```shell
ollama run tinyllama
```

Start the service:
```shell
docker compose up -d
@@ -53,11 +51,6 @@ Go to http://localhost:9000/ to query your RAG.

### Run directly

Start the LLM server:
```shell
ollama run tinyllama
```

In a fresh environment:
```shell
pip install -r requirements-dev.txt
@@ -79,20 +72,9 @@ Start the frontend demo
python -m streamlit run frontend/front.py
```

### Querying and loading the RAG

You should then be able to log in and chat with the bot:

![](docs/login_and_chat.gif)

Right now the RAG does not have any documents loaded; let's add a sample:
```shell
python data_sample/add_data_sample_to_rag.py
```

The RAG now has access to the information from your loaded documents:
### Loading documents in the RAG

![](docs/query_with_knowledge.gif)
Right now the RAG does not have any documents loaded. You can use the notebook in the `examples` folder to transform a file into documents and load them into the vector store.
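
In short, the notebook boils down to something like the sketch below (a condensed, hedged version; see `examples/load_documents.ipynb` for the runnable, annotated original — the CSV sample and paths are the ones the notebook uses):

```python
# Condensed sketch of examples/load_documents.ipynb: turn each CSV row into a
# standalone document, then index the documents in the vector store.
from pathlib import Path

from langchain.vectorstores.utils import filter_complex_metadata
from langchain_community.document_loaders.csv_loader import CSVLoader

from backend.config import RagConfig
from backend.rag_components.rag import RAG

repo_root = Path.cwd()  # assumes you run this from the repository root
rag_config = RagConfig.from_yaml(repo_root / "backend" / "config.yaml")
rag = RAG(config=rag_config)

loader = CSVLoader(
    file_path=str(repo_root / "examples" / "billionaires.csv"),
    csv_args={"delimiter": ",", "quotechar": '"', "escapechar": "\\"},
    encoding="utf-8-sig",
)
documents = filter_complex_metadata(loader.load())

rag.load_documents(documents)
```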

## Documentation

10 changes: 6 additions & 4 deletions backend/config.yaml
@@ -1,9 +1,11 @@
LLMConfig: &LLMConfig
source: ChatOllama
source: AzureChatOpenAI
source_config:
model: tinyllama
temperature: 0
# base_url: http://host.docker.internal:11434 # Uncomment this line if you are running the RAG through Docker Compose
openai_api_type: azure
openai_api_key: {{ OPENAI_API_KEY }}
openai_api_base: https://genai-ds.openai.azure.com/openai/deployments/gpt4
openai_api_version: 2023-07-01-preview
temperature: 0.1

VectorStoreConfig: &VectorStoreConfig
source: Chroma
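
The `source` / `source_config` pair in the block above is handed to the named LangChain chat model, so the new LLMConfig corresponds roughly to the following — a sketch only: the exact import path depends on your LangChain version, and the starter kit builds this object for you from `config.yaml`:

```python
# Rough LangChain equivalent of the AzureChatOpenAI LLMConfig above (illustrative only).
# Reading OPENAI_API_KEY from the environment mirrors the {{ OPENAI_API_KEY }} placeholder.
import os

from langchain_community.chat_models import AzureChatOpenAI

llm = AzureChatOpenAI(
    openai_api_type="azure",
    openai_api_key=os.environ["OPENAI_API_KEY"],
    openai_api_base="https://genai-ds.openai.azure.com/openai/deployments/gpt4",
    openai_api_version="2023-07-01-preview",
    temperature=0.1,
)
```
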
1 change: 0 additions & 1 deletion backend/main.py
@@ -11,7 +11,6 @@
rag = RAG(config=Path(__file__).parent / "config.yaml")
chain = rag.get_chain()


# Create a minimal RAG server based on langserve
# Learn how to extend this configuration to add authentication and session management
# https://artefactory.github.io/skaff-rag-accelerator/backend/plugins/plugins/
36 changes: 18 additions & 18 deletions backend/rag_components/rag.py
@@ -1,7 +1,6 @@
from pathlib import Path
from typing import List, Union

import sqlalchemy
from langchain.chat_models.base import BaseChatModel
from langchain.docstore.document import Document
from langchain.indexes import SQLRecordManager, index
@@ -67,25 +66,26 @@ def load_file(self, file_path: Path) -> List[Document]:
filtered_documents = filter_complex_metadata(documents)
return self.load_documents(filtered_documents)

def load_documents(self, documents: List[Document], insertion_mode: str = None):
def load_documents(self, documents: List[Document], insertion_mode: str = None, namespace: str = "default"):
insertion_mode = insertion_mode or self.config.vector_store.insertion_mode

record_manager = SQLRecordManager(
namespace="vector_store/my_docs", db_url=self.config.database.database_url
namespace=namespace, db_url=self.config.database.database_url
)

try:
record_manager.create_schema()
except sqlalchemy.exc.OperationalError:
with Database() as connection:
connection.initialize_schema()
record_manager.create_schema()

indexing_output = index(
documents,
record_manager,
self.vector_store,
cleanup=insertion_mode,
source_id_key="source",
)
self.logger.info({"event": "load_documents", **indexing_output})
record_manager.create_schema()

self.logger.info(f"Indexing {len(documents)} documents.")

batch_size = 100
for batch in range(0, len(documents), batch_size):
self.logger.info(f"Indexing batch {batch} to {min(len(documents), batch + batch_size)}.")

indexing_output = index(
documents[batch : min(len(documents), batch + batch_size)],
record_manager,
self.vector_store,
cleanup=insertion_mode,
source_id_key="source",
)
self.logger.info({"event": "load_documents", **indexing_output})
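
For reference, a minimal usage sketch of the updated `load_documents` signature — the `namespace` value and the `"incremental"` cleanup mode are illustrative; valid cleanup modes come from LangChain's indexing API:

```python
# Hypothetical call against the batched load_documents shown above. Documents need a
# "source" metadata key because index() is invoked with source_id_key="source".
from pathlib import Path

from langchain.docstore.document import Document

from backend.rag_components.rag import RAG

rag = RAG(config=Path("backend") / "config.yaml")
documents = [
    Document(page_content="Example content", metadata={"source": "example.txt"}),
]
rag.load_documents(documents, insertion_mode="incremental", namespace="vector_store/my_docs")
```
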
9 changes: 0 additions & 9 deletions data_sample/add_data_sample_to_rag.py

This file was deleted.

4 changes: 2 additions & 2 deletions docs/cookbook/configs/llms_configs.md
@@ -10,7 +10,7 @@ LLMConfig: &LLMConfig
openai_api_base: https://genai-ds.openai.azure.com/
openai_api_version: 2023-07-01-preview
deployment_name: gpt4
temperature: 0.1
temperature: 0.1
```
## Local llama2
@@ -58,5 +58,5 @@ LLMConfig: &LLMConfig
source: ChatVertexAI
source_config:
model_name: gemini-pro
temperature: 0.1
temperature: 0.1
```
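
The local llama2 entry above is collapsed; for reference, the Ollama setup that the old `backend/config.yaml` used (`source: ChatOllama`, `model: tinyllama`) maps to roughly the following in plain LangChain — a sketch, with the import path depending on your LangChain version:

```python
# Rough LangChain equivalent of the removed local Ollama config; in the starter kit
# you would normally express this through LLMConfig rather than instantiating it here.
from langchain_community.chat_models import ChatOllama

llm = ChatOllama(
    model="tinyllama",
    temperature=0.1,
    # base_url="http://host.docker.internal:11434",  # only when running through Docker Compose
)
```
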
35 changes: 8 additions & 27 deletions docs/index.md
@@ -17,27 +17,24 @@ This is a starter kit to deploy a modularizable RAG locally or on the cloud (or

## Quickstart

This quickstart will guide you through the steps to serve a RAG fully locally. You will run the API backend and frontend on your machine, which should allow you to run your first queries against the RAG.
This quickstart will guide you through the steps to serve the RAG and load a few documents.

For this example, we will be using the `tinyllama` LLM, the `BAAI/bge-base-en-v1.5` embedding model, and Chroma for the vector store. This makes the setup fully local and independent of any external API (and thus free). However, the relevance of the answers will not be impressive.
You will run both the backend and the frontend on your machine.

For this example, we will be using GPT-4, the `BAAI/bge-base-en-v1.5` embedding model, and Chroma for the vector store.

Duration: ~15 minutes.

### Pre-requisites

- Ollama, to serve the LLM locally ([Download and install](https://ollama.com/))
- A few GB of disk space to host the models
- An `OPENAI_API_KEY` for the Artefact GPT-4 deployment on Azure. Contact alexis.vialaret@artefact.com if you do not have one.
- A few GB of disk space
- Tested with Python 3.11 (may work with other versions)

### Run using docker compose

If you have Docker installed and running, you can run the whole RAG app with it. [Otherwise, skip to the "Run directly" section](index.md#run-directly)

Start the LLM server:
```shell
ollama run tinyllama
```

Start the service:
```shell
docker compose up -d
@@ -53,11 +50,6 @@ Go to http://localhost:9000/ to query your RAG.

### Run directly

Start the LLM server:
```shell
ollama run tinyllama
```

In a fresh environment:
```shell
pip install -r requirements-dev.txt
@@ -79,20 +71,9 @@ Start the frontend demo
python -m streamlit run frontend/front.py
```

### Querying and loading the RAG

You should then be able to log in and chat with the bot:

![](login_and_chat.gif)

Right now the RAG does not have any documents loaded; let's add a sample:
```shell
python data_sample/add_data_sample_to_rag.py
```

The RAG now has access to the information from your loaded documents:
### Loading documents in the RAG

![](query_with_knowledge.gif)
Right now the RAG does not have any documents loaded. You can use the notebook in the `examples` folder to transform a file into documents and load them into the vector store.
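
Once documents are loaded, you can also exercise the backend's chain directly (it is the same chain that `backend/main.py` serves); a hedged sketch, assuming the chain accepts a plain question string — inspect `chain.input_schema` if it expects a dict instead:

```python
# Hypothetical direct use of the chain built by RAG.get_chain(); the input format
# is an assumption and depends on how the chain and its prompt are assembled.
from pathlib import Path

from backend.rag_components.rag import RAG

rag = RAG(config=Path("backend") / "config.yaml")
chain = rag.get_chain()

answer = chain.invoke("Who is the richest person in the loaded dataset?")
print(answer)
```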

## Documentation

File renamed without changes.
108 changes: 108 additions & 0 deletions examples/load_documents.ipynb
@@ -0,0 +1,108 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is an interactive example that will walk you through the initialization of a RAG and the basic embedding of a few documents."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import sys\n",
"import os\n",
"repo_root = Path(os.getcwd()).parent\n",
"sys.path.append(str(repo_root))\n",
"\n",
"from backend.config import RagConfig\n",
"from backend.rag_components.rag import RAG\n",
"\n",
"rag_config = RagConfig.from_yaml(repo_root / \"backend\" / \"config.yaml\")\n",
"rag_config.database.database_url = f\"sqlite:////{repo_root}/database/rag.sqlite3\"\n",
"\n",
"rag = RAG(config=rag_config)\n",
"\n",
"print(\"LLM:\", rag.llm.__class__.__name__)\n",
"print(\"Embedding model:\", rag.embeddings.__class__.__name__)\n",
"print(\"Vector store:\", rag.vector_store.__class__.__name__)\n",
"print(\"Retriever:\", rag.retriever.__class__.__name__)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here we transform our CSV into standalone embeddable documents that we will be able to feed the vector store.\n",
"\n",
"We generate one document for each line, and each document will contain header:value pairs for all the columns.\n",
"\n",
"This is a very simplistic example, but vector store data models can get more advanced to support more [powerful retreival methods.](https://python.langchain.com/docs/modules/data_connection/retrievers/)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders.csv_loader import CSVLoader\n",
"from langchain.vectorstores.utils import filter_complex_metadata\n",
"\n",
"\n",
"data_sample_path = repo_root / \"examples\" / \"billionaires.csv\"\n",
"\n",
"loader = CSVLoader(\n",
" file_path=str(data_sample_path),\n",
" csv_args={\"delimiter\": \",\", \"quotechar\": '\"', \"escapechar\": \"\\\\\"},\n",
" encoding=\"utf-8-sig\",\n",
")\n",
"\n",
"raw_documents = loader.load()\n",
"documents = filter_complex_metadata(raw_documents)\n",
"documents[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To load the docs in the vector store, we recommend using the `load_document` as it [indexes previously embedded docs](https://python.langchain.com/docs/modules/data_connection/indexing), making the process idempotent."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rag.load_documents(documents)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
