From d7be2d3c768f1bb2685b1b96528121c384d86e69 Mon Sep 17 00:00:00 2001
From: Yoom Lam
Date: Sun, 26 May 2024 23:01:45 -0500
Subject: [PATCH] add Dockerfile and ingest-guru-cards.py

---
 05-assistive-chatbot/.gitignore               |   2 +
 05-assistive-chatbot/Dockerfile               |  26 ++
 05-assistive-chatbot/README.md                |  60 +++--
 05-assistive-chatbot/chatbot-chainlit.py      |   2 +-
 05-assistive-chatbot/chatbot/__init__.py      |   4 +-
 .../chatbot/engines/__init__.py               |   4 +-
 05-assistive-chatbot/chatbot/guru_cards.py    |  69 ++++++
 .../chatbot/ingest/__init__.py                |   0
 .../chatbot/ingest/text_splitter.py           |  49 ++++
 05-assistive-chatbot/chatbot/llms/__init__.py |  24 +-
 05-assistive-chatbot/chatbot/utils.py         |   2 +
 05-assistive-chatbot/chatbot_api.py           |   6 +-
 05-assistive-chatbot/guru_api_access.md       |  75 ++++++
 05-assistive-chatbot/ingest-guru-cards.py     |  89 +++++++
 05-assistive-chatbot/requirements.in          |   9 +-
 05-assistive-chatbot/requirements.txt         | 226 +++++++++++++++++-
 16 files changed, 597 insertions(+), 50 deletions(-)
 create mode 100644 05-assistive-chatbot/Dockerfile
 create mode 100644 05-assistive-chatbot/chatbot/guru_cards.py
 create mode 100644 05-assistive-chatbot/chatbot/ingest/__init__.py
 create mode 100644 05-assistive-chatbot/chatbot/ingest/text_splitter.py
 create mode 100644 05-assistive-chatbot/guru_api_access.md
 create mode 100755 05-assistive-chatbot/ingest-guru-cards.py

diff --git a/05-assistive-chatbot/.gitignore b/05-assistive-chatbot/.gitignore
index 2a52261..33c3b03 100644
--- a/05-assistive-chatbot/.gitignore
+++ b/05-assistive-chatbot/.gitignore
@@ -4,6 +4,8 @@ chroma_db/
 *.log
 log/
 
+# macOS files
 *.DS_STORE
 
+# .env contains secret API keys
 .env
diff --git a/05-assistive-chatbot/Dockerfile b/05-assistive-chatbot/Dockerfile
new file mode 100644
index 0000000..3b227c4
--- /dev/null
+++ b/05-assistive-chatbot/Dockerfile
@@ -0,0 +1,26 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+RUN apt-get update && apt-get install -y \
+    curl unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG GURU_CARDS_URL
+RUN echo "Downloading from ${GURU_CARDS_URL}" \
+    && curl -L "${GURU_CARDS_URL}" > download.zip \
+    && unzip -o download.zip \
+    && rm download.zip \
+    && mv guru_cards_for_nava--Multi-benefit.json guru_cards_for_nava.json
+
+# Update .dockerignore to prevent unwanted files from being copied into the image
+COPY . .
+
+RUN ./ingest-guru-cards.py
+
+EXPOSE 8000
+HEALTHCHECK CMD curl http://localhost:8000 || exit 1
+ENTRYPOINT ["chainlit", "run", "--port", "8000", "-h", "chatbot-chainlit.py"]
diff --git a/05-assistive-chatbot/README.md b/05-assistive-chatbot/README.md
index 8691d78..bd764b8 100644
--- a/05-assistive-chatbot/README.md
+++ b/05-assistive-chatbot/README.md
@@ -5,46 +5,54 @@
 * Use Python 3.11.x (higher versions cause library version problems).
 * Install Python libraries: `pip install -r requirements.txt`
 
-### (Optional) Enable Chatbot feedback
+### (Optional) Enable Chatbot Feedback
 
 To enable the [feedback mechanism](https://docs.chainlit.io/data-persistence/feedback):
 * Get an API key: https://docs.chainlit.io/data-persistence/overview
 * Create or update `.env` with `LITERAL_API_KEY` set to the API key
 
 After running the chatbot and providing feedback in the UI, review the feedback at https://cloud.getliteral.ai/projects/YOUR_PROJECT_NAME/feedback.
 
-* To use a custom feedback storage instead of `getliteral.ai`, see https://docs.chainlit.io/data-persistence/custom.
+* To use custom feedback storage instead of `getliteral.ai`, see https://docs.chainlit.io/data-persistence/custom.
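For illustration, a minimal `.env` that combines the variables mentioned in this README might look like the following sketch (all values are placeholders, not real keys):
```
# .env -- keep out of git; see the .gitignore entry above
LITERAL_API_KEY=lit_api_key_placeholder
CHAT_ENGINE=Direct
LLM_MODEL_NAME='langchain.ollama :: openhermes'
CHATBOT_LOG_LEVEL=INFO
```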
 
-## Run
+## Running an application
 
-All apps use configurations set in `.env`, which can be overridden by environment variables, like `CHAT_ENGINE` and `LLM_MODEL_NAME`. See `_init_settings()` in `chatbot/__init__.py` for other variables.
+There are several ways to run the chatbot application, each offering a different way to interact with the chatbot.
+All apps use configurations set in `.env`, which is *not* checked into git. These configurations (like `CHAT_ENGINE` and `LLM_MODEL_NAME`) can be overridden by environment variables set on the commandline. See `_init_settings()` in `chatbot/__init__.py` for other variables.
 
-### Run web chatbot app
+### Run commandline app
+
+This commandline entrypoint is useful for quickly or repeatedly running, testing, or debugging the chatbot without having to click through or type into a UI. Set the configuration in `.env` or as environment variables, then run `./cmdline.py`.
+
+To quickly set variables and run the app on a single line:
+`CHATBOT_LOG_LEVEL=INFO CHAT_ENGINE=Direct LLM_MODEL_NAME='langchain.ollama :: openhermes' ./cmdline.py`
+
+To see more logs, adjust the log level, like `CHATBOT_LOG_LEVEL=DEBUG`.
+
+### Run chatbot web app
+
+This is the chatbot web app that users typically interact with.
 
 1. Start the Chainlit-based chatbot service: `./chatbot-chainlit.py` or `chainlit run ./chatbot-chainlit.py`
 1. Open a browser to `http://localhost:8000/`
 
-For development, run something like `chainlit run -w -h --port 9000 ./chatbot-chainlit.py`.
+For development, run something like `chainlit run -w -h --port 9000 ./chatbot-chainlit.py` to watch for changed files and automatically update the running application without having to restart Chainlit.
+
+Chainlit UI configurations are in the `.chainlit/config.toml` file.
 
-Running the chatbot app will also run the API, which is defined in `chatbot_api.py`.
+Running the chatbot app will also run the API (described in the next section), which is defined in `chatbot_api.py`.
 
 ### Run only the API
 
+This runs only the chatbot API, so that other applications can make requests to the chatbot.
+
 1. Run `./chatbot_api.py`
 1. Open a browser to the `/query` endpoint followed by a question, such as `http://localhost:8001/query/tell me a joke`
 
-### Run commandline app
-1. Run `./cmdline.py`
-
-To quickly set variables and run the app on a single line:
-`CHATBOT_LOG_LEVEL=INFO CHAT_ENGINE=Direct LLM_MODEL_NAME='langchain.ollama :: openhermes' ./cmdline.py`
-
-To see more logs, adjust the log level like `CHATBOT_LOG_LEVEL=DEBUG`.
-
-
-## Development
+## Development Notes
 
+- Application entrypoints are in the root folder of the repo. Other Python files are under the `chatbot` folder.
 - The chatbot package `chatbot/__init__.py` is run for all apps because they `import chatbot`.
   - It initializes settings (`_init_settings()`) and creates a specified chat engine (`create_chat_engine(settings)`).
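As a sketch of how the `.env`-plus-environment-variable precedence described above typically works (the actual `_init_settings()` in `chatbot/__init__.py` may read more variables and use different defaults):
```
import os

import dotenv

# Values already present in the environment win, because load_dotenv()
# does not override existing environment variables by default.
dotenv.load_dotenv()

# Hypothetical illustration of settings resolution
settings = {
    "chat_engine": os.environ.get("CHAT_ENGINE", "Direct"),
    "model_name": os.environ.get("LLM_MODEL_NAME", "langchain.ollama :: openhermes"),
    "log_level": os.environ.get("CHATBOT_LOG_LEVEL", "WARN"),
}
```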
@@ -56,7 +64,8 @@ To create a chat engine, add a new Python file under `chatbot/engines` with:
 - an `init_engine(settings)` function to instantiate a chat engine class
 - a chat engine class that:
   - creates a client to an LLM (`create_llm_client(settings)`), then
-  - uses the LLM client to generate a response to specified query (`gen_response(self, query)`)
+  - uses the LLM client to generate a response to a specified query (`gen_response(self, query)`)
+The new Python file will be automatically discovered and registered for display in the Chainlit settings web UI.
 
 The `chat_engine.gen_response(query)` function is called by the apps when a user submits a query.
 
@@ -71,8 +80,23 @@ To create a new LLM client, add a new Python file under `chatbot/llms` with:
 - an LLM client class that:
   - sets `self.client` based on the provided `settings`, and
   - implements a `submit(self, message)` function that uses `self.client` to generate a response, which may need to be parsed so that a string is returned to `chat_engine.gen_response(self, query)`.
+The new Python file will be automatically discovered and registered for display in the Chainlit settings web UI.
 
 An LLM client can be used in any arbitrary program by:
 - setting `client = init_client(model_name, settings)`
 - then calling `client.submit(message)`
 See `client_example_usage()` in `chatbot/llms/mock_llm_client.py`.
+
+### Python formatting
+
+Install and run `ruff format .` and `isort .` to consistently format Python files.
+
+### Docker
+
+A Docker image is built for deployments (by the GitHub Action `push-image.yml`). To verify that the image builds and runs correctly, run:
+```
+GURU_CARDS_URL_ID='1fO-ABCD1234...'  # Google Drive document id
+docker build -t dst-chatbot . --build-arg GURU_CARDS_URL="https://docs.google.com/uc?export=download&id=$GURU_CARDS_URL_ID"
+docker run --rm -p 8000:8000 dst-chatbot
+```
+Then, open a browser to `http://localhost:8000/` for testing.
diff --git a/05-assistive-chatbot/chatbot-chainlit.py b/05-assistive-chatbot/chatbot-chainlit.py
index 51f96b1..8c752d2 100755
--- a/05-assistive-chatbot/chatbot-chainlit.py
+++ b/05-assistive-chatbot/chatbot-chainlit.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env chainlit run -h
 
 """
-ChainLit-based chatbot, providing a web user interface for the selected chat engine and settings.
+Chainlit-based chatbot, providing a web user interface for the selected chat engine and settings.
 
 See README.md for instructions to enable user feedback.
 """
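To make the LLM-client contract above concrete, a hypothetical `chatbot/llms/echo_llm_client.py` (modeled on `chatbot/llms/mock_llm_client.py`, not part of this patch) could look like:
```
import logging

logger = logging.getLogger(f"chatbot.{__name__}")

CLIENT_NAME = "echo"        # picked up by _discover_llms()
MODEL_NAMES = ["echo-v1"]   # registered as "echo :: echo-v1"


def init_client(model_name, settings):
    return EchoLlmClient(model_name, settings)


class EchoLlmClient:
    def __init__(self, model_name, settings):
        self.model_name = model_name
        self.settings = settings
        self.client = None  # a real module would create an SDK client here

    def submit(self, message):
        # Must return a plain string for chat_engine.gen_response()
        return f"echo({self.model_name}): {message}"
```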
""" diff --git a/05-assistive-chatbot/chatbot/__init__.py b/05-assistive-chatbot/chatbot/__init__.py index 21abde1..c51ccf5 100644 --- a/05-assistive-chatbot/chatbot/__init__.py +++ b/05-assistive-chatbot/chatbot/__init__.py @@ -13,7 +13,7 @@ ## Initialize logging -def _configure_logging(): +def configure_logging(): log_format = os.environ.get("LOG_FORMAT", "%(relativeCreated)6d - %(name)-24s - %(levelname)-5s - %(message)s") logging.basicConfig(format=log_format) @@ -23,7 +23,7 @@ def _configure_logging(): dotenv.load_dotenv() -_configure_logging() +configure_logging() logger = logging.getLogger(__name__) diff --git a/05-assistive-chatbot/chatbot/engines/__init__.py b/05-assistive-chatbot/chatbot/engines/__init__.py index 535dc1b..b183b45 100644 --- a/05-assistive-chatbot/chatbot/engines/__init__.py +++ b/05-assistive-chatbot/chatbot/engines/__init__.py @@ -1,4 +1,3 @@ -import importlib import logging from types import ModuleType from typing import Dict @@ -18,8 +17,7 @@ def available_engines(): def _discover_chat_engines(force=False): if not _engines or force: _engines.clear() - namespace = importlib.import_module(__package__) - found_llm_modules = utils.scan_modules(namespace) + found_llm_modules = utils.scan_modules(__package__) for _module_name, module in found_llm_modules.items(): if not hasattr(module, "ENGINE_NAME"): continue diff --git a/05-assistive-chatbot/chatbot/guru_cards.py b/05-assistive-chatbot/chatbot/guru_cards.py new file mode 100644 index 0000000..1fe608d --- /dev/null +++ b/05-assistive-chatbot/chatbot/guru_cards.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +import json +import os + +from bs4 import BeautifulSoup + + +class GuruCardsProcessor: + def __init__( + self, + file_path="./guru_cards_for_nava.json", + question_key="preferredPhrase", + content_key="content", + ): + self.file_path = file_path + self.question_key = question_key + self.content_key = content_key + + def extract_qa_text_from_guru(self): + json_data = self.cards_as_json() + question_answers = self._extract_question_answers(json_data) + return question_answers + + def cards_as_json(self): + with open(self.file_path, encoding="utf-8") as data_file: + return json.load(data_file) + + def _extract_question_answers(self, json_data): + question_answers = {} + for content in json_data: + if not content[self.question_key].strip().endswith("?"): + continue + soup = BeautifulSoup(content[self.content_key], "html.parser") + answer = soup.get_text(separator="\n", strip=True) + question_answers[content[self.question_key].strip()] = answer + return question_answers + + +def save_simplified_json(gc_processor): + "Saves a simplified version of the Guru cards JSON file for easier review" + json_data = gc_processor.cards_as_json() + name, ext = os.path.splitext(gc_processor.file_path) + with open(f"{name}_simplified{ext}", "w", encoding="utf-8") as f: + simplified_json = [] + for card in json_data: + tags = [tagsItem.get("value") for tagsItem in card.get("tags", [])] + boards = [boardsItem.get("title") for boardsItem in card.get("boards", [])] + soup = BeautifulSoup(card[gc_processor.content_key], "html.parser") + content = soup.get_text(separator="\n", strip=True) + simplified_json.append( + { + "preferredPhrase": card["preferredPhrase"], + "tags": ",".join(tags), + "boards": ",".join(boards), + gc_processor.content_key: content, + } + ) + json.dump(simplified_json, f, indent=4) + + +if __name__ == "__main__": + import sys + + if args := sys.argv[1:]: + _gc_processor = GuruCardsProcessor(file_path=args[0]) + else: + 
diff --git a/05-assistive-chatbot/chatbot/ingest/__init__.py b/05-assistive-chatbot/chatbot/ingest/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/05-assistive-chatbot/chatbot/ingest/text_splitter.py b/05-assistive-chatbot/chatbot/ingest/text_splitter.py
new file mode 100644
index 0000000..2770e4b
--- /dev/null
+++ b/05-assistive-chatbot/chatbot/ingest/text_splitter.py
@@ -0,0 +1,49 @@
+import logging
+
+from langchain.docstore.document import Document
+from langchain_text_splitters import (NLTKTextSplitter,
+                                      RecursiveCharacterTextSplitter,
+                                      SpacyTextSplitter)
+
+logger = logging.getLogger(__name__)
+
+
+class TextSplitter:
+    def __init__(self, llm_client, token_limit, text_splitter_name, **text_splitter_args):
+        """
+        - llm_client is used to get the number of tokens in a text
+        - token_limit is the maximum number of tokens allowed by the embedding model
+        """
+        self.llm_client = llm_client
+        self.token_limit = token_limit
+        self.text_splitter = self.create_text_splitter(text_splitter_name, **text_splitter_args)
+
+    def create_text_splitter(self, choice, **kwargs):
+        logger.info("Creating %s", choice)
+        if choice == "NLTKTextSplitter":
+            logger.warning("  Not using arguments: %s", kwargs)
+            splitter = NLTKTextSplitter()
+        elif choice == "SpacyTextSplitter":
+            logger.warning("  Not using arguments: %s", kwargs)
+            splitter = SpacyTextSplitter()
+        elif choice == "RecursiveCharacterTextSplitter":
+            logger.info("  Using arguments: %s", kwargs)
+            splitter = RecursiveCharacterTextSplitter(
+                chunk_size=kwargs["chunk_size"], chunk_overlap=kwargs["chunk_overlap"]
+            )
+        return splitter
+
+    def split_into_chunks(self, title, text):
+        """
+        - title is the title to be used as the source of the text
+        - text is the text to split
+        """
+        entire_text = title + "\n\n" + text
+        texts = self.text_splitter.split_text(entire_text)
+
+        logger.info("  Split into %s", len(texts))
+        for t in texts:
+            token_count = self.llm_client.get_num_tokens(t)
+            assert token_count <= self.token_limit, f"Exceeded token limit of {self.token_limit}: {token_count}"
+
+        return [Document(page_content=t, metadata={"source": title.strip(), "entire_card": entire_text}) for t in texts]
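A sketch of using `TextSplitter` on its own; any object with a `get_num_tokens(text)` method can stand in for `llm_client` (the stub below is illustrative, whereas `ingest-guru-cards.py` passes a LangChain Ollama client instead):
```
from chatbot.ingest.text_splitter import TextSplitter


class StubTokenCounter:
    def get_num_tokens(self, text):
        # Crude stand-in: roughly 4 characters per token
        return len(text) // 4


splitter = TextSplitter(
    llm_client=StubTokenCounter(),
    token_limit=256,
    text_splitter_name="RecursiveCharacterTextSplitter",
    chunk_size=250,
    chunk_overlap=100,
)
chunks = splitter.split_into_chunks("What is SNAP?", "SNAP provides food benefits to ...")
print(len(chunks), chunks[0].metadata["source"])
```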
diff --git a/05-assistive-chatbot/chatbot/llms/__init__.py b/05-assistive-chatbot/chatbot/llms/__init__.py
index 4e86dfb..f039cec 100644
--- a/05-assistive-chatbot/chatbot/llms/__init__.py
+++ b/05-assistive-chatbot/chatbot/llms/__init__.py
@@ -1,4 +1,3 @@
-import importlib
 import logging
 from types import ModuleType
 from typing import Dict, Tuple
@@ -18,21 +17,25 @@ def available_llms():
 def _discover_llms(force=False):
     if not _llms or force:
         _llms.clear()
-        namespace = importlib.import_module(__package__)
-        found_modules = utils.scan_modules(namespace)
+        found_modules = utils.scan_modules(__package__)
         for module_name, module in found_modules.items():
             if not module or ignore(module_name):
                 logger.debug("Skipping module: %s", module_name)
                 continue
             client_name = module.CLIENT_NAME or module_name
             for llm_name in module.MODEL_NAMES or []:
-                qualified_llm_name = f"{client_name} :: {llm_name}"
-                _llms[qualified_llm_name] = (module, llm_name)
+                qualified_name = qualified_llm_name(client_name, llm_name)
+                _llms[qualified_name] = (module, llm_name)
     return _llms
 
 
+def qualified_llm_name(client_name, model_name):
+    return f"{client_name} :: {model_name}"
+
+
 def ignore(module_name):
     if module_name.startswith("dspy ::"):
+        # DSPy client code is not yet ready for use
         return True
     return False
 
@@ -40,7 +43,12 @@ def ignore(module_name):
 
 ## Factory functions
 
-def init_client(model_name, settings=None):
+def init_client(qualified_name, settings=None):
+    """Initialize a specific LLM client based on the qualified_name.
+    :param qualified_name: str or Tuple[client_name, model_name]
+    """
     _discover_llms()
-    module, llm_name = _llms[model_name]
-    return module.init_client(llm_name, settings)
+    if isinstance(qualified_name, tuple):
+        qualified_name = qualified_llm_name(qualified_name[0], qualified_name[1])
+    module, llm_name = _llms[qualified_name]
+    return module.init_client(llm_name, settings or {})
diff --git a/05-assistive-chatbot/chatbot/utils.py b/05-assistive-chatbot/chatbot/utils.py
index 6461b5f..a36ab73 100644
--- a/05-assistive-chatbot/chatbot/utils.py
+++ b/05-assistive-chatbot/chatbot/utils.py
@@ -50,6 +50,8 @@ def wrapper_timer(*args, **kwargs):
 
 def scan_modules(ns_pkg):
     "Return a dictionary of Python modules found in the given namespace package"
+    if isinstance(ns_pkg, str):
+        ns_pkg = importlib.import_module(ns_pkg)
     # From https://packaging.python.org/en/latest/guides/creating-and-discovering-plugins/#using-namespace-packages
     itr = pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + ".")
     return {name: _import_module_if_possible(name) for _, name, _ in itr}
diff --git a/05-assistive-chatbot/chatbot_api.py b/05-assistive-chatbot/chatbot_api.py
index da9a1ce..691ef30 100755
--- a/05-assistive-chatbot/chatbot_api.py
+++ b/05-assistive-chatbot/chatbot_api.py
@@ -2,8 +2,8 @@
 
 """
 This is a sample API file that demonstrates how to create an API using FastAPI,
-which is compatible with ChainLit. This file is a starting point for creating
-an API that can be deployed with the ChainLit chatbot.
+which is compatible with Chainlit. This file is a starting point for creating
+an API that can be deployed with the Chainlit chatbot.
 """
 
 import logging
@@ -17,7 +17,7 @@
     # If running directly, define the FastAPI app
     app = FastAPI()
 else:
-    # Otherwise use ChainLit's app
+    # Otherwise use Chainlit's app
     from chainlit.server import app
 
 logger = logging.getLogger(f"chatbot.{__name__}")
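With this change, `init_client()` accepts either form of name; for example (model availability depends on your local setup):
```
from chatbot import llms

# 1) a fully qualified name, as registered by _discover_llms()
client = llms.init_client("langchain.ollama :: mistral")

# 2) a (client_name, model_name) tuple, as used by ingest-guru-cards.py below
client = llms.init_client(("langchain.ollama", "mistral"))

response = client.submit("tell me a joke")
```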
+ }, + "totalUsers" : 294, + "topLevelOrganizationId" : "...", + "name" : "Benefits Data Trust", + "id" : "...", + "status" : "ACTIVE", + "dateCreated" : "2017-07-06T13:09:52.980+0000", + "profilePicUrl" : "https://pp.getguru.com/....jpeg" +} ]% +``` + +## Get cards +Use web UI: https://developer.getguru.com/reference/getv1cardsgetextendedfact +or the following, which retrieves all pages: +``` +NEXT_URL='https://api.getguru.com/api/v1/search/cardmgr?queryType=cards&showArchived=false' +CURL_COUNTER=1 +while [ "$NEXT_URL" ] +do + RESP_HEADER=$(curl -sS -D - \ + -u "$COLLECTION_ID:$GURU_API_TOKEN" \ + --request GET -o "guru_cards_for_nava_$CURL_COUNTER.json" \ + --header 'accept: application/json' \ + --url "$NEXT_URL") + NEXT_URL=$(echo $RESP_HEADER | ggrep -oP '^link: <\K.*(?=>)') + let CURL_COUNTER=$CURL_COUNTER+1 +done + +# Merge files each containing a JSON list +jq -n '[inputs] | add' guru_cards_for_nava_?.json guru_cards_for_nava_??.json > guru_cards_for_nava.json + +# Create simplified JSON for readability +python ingest.py guru_cards_for_nava.json + +# Count cards +jq length guru_cards_for_nava.json + +# Zip +zip guru_cards_for_nava.zip guru_cards_for_nava.json guru_cards_for_nava_simplified.json +``` + +Share zip file via Google Drive. + +## Extract all HTML content + +`cat guru_cards_for_nava.json | jq '.[] | .content' > guru_cards.html` + +`jq -r '.[] | .preferredPhrase + "\n tags: " + ( [.tags[]?.value] | join(",") ) +"\n content: " + .content' guru_cards_for_nava--Multi-benefit.json` diff --git a/05-assistive-chatbot/ingest-guru-cards.py b/05-assistive-chatbot/ingest-guru-cards.py new file mode 100755 index 0000000..cf00870 --- /dev/null +++ b/05-assistive-chatbot/ingest-guru-cards.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python +import logging +from dataclasses import dataclass +from functools import cached_property +from typing import Callable + +import dotenv +from langchain_community.embeddings import (HuggingFaceEmbeddings, + SentenceTransformerEmbeddings) +from langchain_community.vectorstores import Chroma + +import chatbot +from chatbot import guru_cards, llms, utils +from chatbot.ingest.text_splitter import TextSplitter + +logger = logging.getLogger(f"chatbot.{__name__}") + + +@dataclass +class EmbeddingsModel: + name: str + token_limit: int + create: Callable + + +_EMBEDDINGS_MODEL_LIST = [ + EmbeddingsModel("all-MiniLM-L6-v2", 256, lambda: SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")), + EmbeddingsModel("HuggingFace::all-MiniLM-L6-v2", 256, lambda: HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")), + EmbeddingsModel( + "BAAI/bge-small-en-v1.5", 512, lambda: SentenceTransformerEmbeddings(model_name="BAAI/bge-small-en-v1.5") + ), + EmbeddingsModel( + "mixedbread-ai/mxbai-embed-large-v1", + 1024, + lambda: SentenceTransformerEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1"), + ), + # EmbeddingsModel("Google::embedding-001", 2048, lambda: GoogleGenerativeAIEmbeddings(model="models/embedding-001")), + # EmbeddingsModel("Google::text-embedding-004", 768, lambda: GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")), +] + +EMBEDDING_MODELS = {model.name: model for model in _EMBEDDINGS_MODEL_LIST} + + +class AppState: + def __init__(self, llm_model, embedding_name): + self.llm_model = llm_model + self.embedding_name = embedding_name + + @cached_property + @utils.timer + def llm(self): + logger.info("Creating LLM") + return llms.init_client(("langchain.ollama", self.llm_model)) + + @cached_property + @utils.timer + def vectordb(self): + 
diff --git a/05-assistive-chatbot/ingest-guru-cards.py b/05-assistive-chatbot/ingest-guru-cards.py
new file mode 100755
index 0000000..cf00870
--- /dev/null
+++ b/05-assistive-chatbot/ingest-guru-cards.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+import logging
+from dataclasses import dataclass
+from functools import cached_property
+from typing import Callable
+
+import dotenv
+from langchain_community.embeddings import (HuggingFaceEmbeddings,
+                                            SentenceTransformerEmbeddings)
+from langchain_community.vectorstores import Chroma
+
+import chatbot
+from chatbot import guru_cards, llms, utils
+from chatbot.ingest.text_splitter import TextSplitter
+
+logger = logging.getLogger(f"chatbot.{__name__}")
+
+
+@dataclass
+class EmbeddingsModel:
+    name: str
+    token_limit: int
+    create: Callable
+
+
+_EMBEDDINGS_MODEL_LIST = [
+    EmbeddingsModel("all-MiniLM-L6-v2", 256, lambda: SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")),
+    EmbeddingsModel("HuggingFace::all-MiniLM-L6-v2", 256, lambda: HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")),
+    EmbeddingsModel(
+        "BAAI/bge-small-en-v1.5", 512, lambda: SentenceTransformerEmbeddings(model_name="BAAI/bge-small-en-v1.5")
+    ),
+    EmbeddingsModel(
+        "mixedbread-ai/mxbai-embed-large-v1",
+        1024,
+        lambda: SentenceTransformerEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1"),
+    ),
+    # EmbeddingsModel("Google::embedding-001", 2048, lambda: GoogleGenerativeAIEmbeddings(model="models/embedding-001")),
+    # EmbeddingsModel("Google::text-embedding-004", 768, lambda: GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")),
+]
+
+EMBEDDING_MODELS = {model.name: model for model in _EMBEDDINGS_MODEL_LIST}
+
+
+class AppState:
+    def __init__(self, llm_model, embedding_name):
+        self.llm_model = llm_model
+        self.embedding_name = embedding_name
+
+    @cached_property
+    @utils.timer
+    def llm(self):
+        logger.info("Creating LLM")
+        return llms.init_client(("langchain.ollama", self.llm_model))
+
+    @cached_property
+    @utils.timer
+    def vectordb(self):
+        logger.info("Creating Vector DB")
+        embeddings_model = EMBEDDING_MODELS[self.embedding_name].create()
+        logger.info("Embeddings model created: %s", embeddings_model)
+        return Chroma(
+            embedding_function=embeddings_model,
+            # Must use collection_name="langchain" -- https://github.com/langchain-ai/langchain/issues/10864#issuecomment-1730303411
+            collection_name="langchain",
+            persist_directory="./chroma_db",
+        )
+
+
+if __name__ == "__main__":
+    dotenv.load_dotenv()
+    chatbot.configure_logging()
+
+    app_state = AppState("mistral", "all-MiniLM-L6-v2")
+
+    text_splitter = TextSplitter(
+        llm_client=app_state.llm.client,
+        token_limit=EMBEDDING_MODELS[app_state.embedding_name].token_limit,
+        text_splitter_name="RecursiveCharacterTextSplitter",
+        # Use smaller chunks for shorter-length quotes
+        chunk_size=250,
+        chunk_overlap=100,
+    )
+
+    guru_question_answers = guru_cards.GuruCardsProcessor().extract_qa_text_from_guru()
+    # Chunk the JSON data and load it into the vector DB
+    for question, answer in guru_question_answers.items():
+        logger.info("Processing document: %s", question)
+        chunks = text_splitter.split_into_chunks(question, answer)
+        app_state.vectordb.add_documents(documents=chunks)
diff --git a/05-assistive-chatbot/requirements.in b/05-assistive-chatbot/requirements.in
index 7d1d1d0..38f810c 100644
--- a/05-assistive-chatbot/requirements.in
+++ b/05-assistive-chatbot/requirements.in
@@ -9,12 +9,13 @@ chainlit==1.1
 # langchain
 langchain_community
 langchain-google-genai
-# langchain-text-splitters
-# sentence-transformers
 
 ## Vector DB
-# chromadb
-# beautifulsoup4
+langchain-text-splitters
+sentence-transformers
+chromadb
+beautifulsoup4
+types-beautifulsoup4
 
 ## OpenAI LLM
 openai
diff --git a/05-assistive-chatbot/requirements.txt b/05-assistive-chatbot/requirements.txt
index 5d670d4..dd2d6d9 100644
--- a/05-assistive-chatbot/requirements.txt
+++ b/05-assistive-chatbot/requirements.txt
@@ -22,6 +22,8 @@ anyio==3.7.1
     #   openai
     #   starlette
     #   watchfiles
+asgiref==3.8.1
+    # via opentelemetry-instrumentation-asgi
 async-timeout==4.0.3
     # via
     #   aiohttp
@@ -30,14 +32,23 @@ asyncer==0.0.2
     # via chainlit
 attrs==23.2.0
     # via aiohttp
+backoff==2.2.1
+    # via posthog
+bcrypt==4.1.3
+    # via chromadb
+beautifulsoup4==4.12.3
+    # via -r requirements.in
 bidict==0.23.1
     # via python-socketio
+build==1.2.1
+    # via chromadb
 cachetools==5.3.3
     # via google-auth
 certifi==2024.2.2
     # via
     #   httpcore
     #   httpx
+    #   kubernetes
     #   requests
 chainlit==1.1.0
     # via -r requirements.in
@@ -45,10 +56,17 @@ charset-normalizer==3.3.2
     # via requests
 chevron==0.14.0
     # via literalai
+chroma-hnswlib==0.7.3
+    # via chromadb
+chromadb==0.5.0
+    # via -r requirements.in
 click==8.1.7
     # via
     #   chainlit
+    #   typer
     #   uvicorn
+coloredlogs==15.0.1
+    # via onnxruntime
 dataclasses-json==0.5.14
     # via
     #   chainlit
@@ -67,15 +85,27 @@ exceptiongroup==1.2.1
 fastapi==0.110.3
     # via
     #   chainlit
+    #   chromadb
     #   fastapi-socketio
 fastapi-socketio==0.0.10
     # via chainlit
+filelock==3.14.0
+    # via
+    #   huggingface-hub
+    #   torch
+    #   transformers
 filetype==1.2.0
     # via chainlit
+flatbuffers==24.3.25
+    # via onnxruntime
 frozenlist==1.4.1
     # via
     #   aiohttp
     #   aiosignal
+fsspec==2024.5.0
+    # via
+    #   huggingface-hub
+    #   torch
 google-ai-generativelanguage==0.6.4
     # via google-generativeai
 google-api-core[grpc]==2.19.0
@@ -92,6 +122,7 @@ google-auth==2.29.0
     #   google-api-python-client
     #   google-auth-httplib2
     #   google-generativeai
+    #   kubernetes
 google-auth-httplib2==0.2.0
     # via google-api-python-client
 google-generativeai==0.5.4
@@ -106,6 +137,7 @@ groq==0.8.0
     # via -r requirements.in
 grpcio==1.62.1
     # via
+    #   chromadb
     #   google-api-core
     #   grpcio-status
     #   opentelemetry-exporter-otlp-proto-grpc
@@ -122,12 +154,21 @@ httplib2==0.22.0
     # via
     #   google-api-python-client
     #   google-auth-httplib2
+httptools==0.6.1
+    # via uvicorn
 httpx==0.27.0
     # via
     #   chainlit
     #   groq
     #   literalai
     #   openai
+huggingface-hub==0.23.1
+    # via
+    #   sentence-transformers
+    #   tokenizers
+    #   transformers
+humanfriendly==10.0
+    # via coloredlogs
 idna==3.6
     # via
     #   anyio
@@ -136,10 +177,18 @@ idna==3.6
     #   yarl
 importlib-metadata==6.11.0
     # via opentelemetry-api
+importlib-resources==6.4.0
+    # via chromadb
+jinja2==3.1.4
+    # via torch
+joblib==1.4.2
+    # via scikit-learn
 jsonpatch==1.33
     # via langchain-core
 jsonpointer==2.4
     # via jsonpatch
+kubernetes==29.0.0
+    # via chromadb
 langchain==0.2.1
     # via langchain-community
 langchain-community==0.2.1
@@ -153,7 +202,9 @@ langchain-core==0.2.1
 langchain-google-genai==1.0.5
     # via -r requirements.in
 langchain-text-splitters==0.2.0
-    # via langchain
+    # via
+    #   -r requirements.in
+    #   langchain
 langsmith==0.1.63
     # via
     #   langchain
@@ -163,8 +214,20 @@ lazify==0.4.0
     # via chainlit
 literalai==0.0.601
     # via chainlit
+markdown-it-py==3.0.0
+    # via rich
+markupsafe==2.1.5
+    # via jinja2
 marshmallow==3.21.1
     # via dataclasses-json
+mdurl==0.1.2
+    # via markdown-it-py
+mmh3==4.1.0
+    # via chromadb
+monotonic==1.6
+    # via posthog
+mpmath==1.3.0
+    # via sympy
 multidict==6.0.5
     # via
     #   aiohttp
@@ -173,17 +236,35 @@ mypy-extensions==1.0.0
     # via typing-inspect
 nest-asyncio==1.6.0
     # via chainlit
+networkx==3.3
+    # via torch
 numpy==1.26.4
     # via
+    #   chroma-hnswlib
+    #   chromadb
     #   langchain
     #   langchain-community
+    #   onnxruntime
+    #   scikit-learn
+    #   scipy
+    #   sentence-transformers
+    #   transformers
+oauthlib==3.2.2
+    # via
+    #   kubernetes
+    #   requests-oauthlib
+onnxruntime==1.18.0
+    # via chromadb
 openai==1.30.2
     # via -r requirements.in
 opentelemetry-api==1.24.0
     # via
+    #   chromadb
     #   opentelemetry-exporter-otlp-proto-grpc
     #   opentelemetry-exporter-otlp-proto-http
     #   opentelemetry-instrumentation
+    #   opentelemetry-instrumentation-asgi
+    #   opentelemetry-instrumentation-fastapi
     #   opentelemetry-sdk
     #   uptrace
 opentelemetry-exporter-otlp==1.24.0
@@ -193,11 +274,20 @@ opentelemetry-exporter-otlp-proto-common==1.24.0
     #   opentelemetry-exporter-otlp-proto-grpc
     #   opentelemetry-exporter-otlp-proto-http
 opentelemetry-exporter-otlp-proto-grpc==1.24.0
-    # via opentelemetry-exporter-otlp
+    # via
+    #   chromadb
+    #   opentelemetry-exporter-otlp
 opentelemetry-exporter-otlp-proto-http==1.24.0
     # via opentelemetry-exporter-otlp
 opentelemetry-instrumentation==0.45b0
-    # via uptrace
+    # via
+    #   opentelemetry-instrumentation-asgi
+    #   opentelemetry-instrumentation-fastapi
+    #   uptrace
+opentelemetry-instrumentation-asgi==0.45b0
+    # via opentelemetry-instrumentation-fastapi
+opentelemetry-instrumentation-fastapi==0.45b0
+    # via chromadb
 opentelemetry-proto==1.24.0
     # via
     #   opentelemetry-exporter-otlp-proto-common
@@ -205,19 +295,39 @@ opentelemetry-proto==1.24.0
     #   opentelemetry-exporter-otlp-proto-http
 opentelemetry-sdk==1.24.0
     # via
+    #   chromadb
     #   opentelemetry-exporter-otlp-proto-grpc
     #   opentelemetry-exporter-otlp-proto-http
     #   uptrace
 opentelemetry-semantic-conventions==0.45b0
-    # via opentelemetry-sdk
+    # via
+    #   opentelemetry-instrumentation-asgi
+    #   opentelemetry-instrumentation-fastapi
+    #   opentelemetry-sdk
+opentelemetry-util-http==0.45b0
+    # via
+    #   opentelemetry-instrumentation-asgi
+    #   opentelemetry-instrumentation-fastapi
 orjson==3.10.3
-    # via langsmith
+    # via
+    #   chromadb
+    #   langsmith
+overrides==7.7.0
+    # via chromadb
 packaging==23.2
     # via
+    #   build
     #   chainlit
+    #   huggingface-hub
     #   langchain-core
     #   literalai
     #   marshmallow
+    #   onnxruntime
+    #   transformers
+pillow==10.3.0
+    # via sentence-transformers
+posthog==3.5.0
+    # via chromadb
 proto-plus==1.23.0
     # via
     #   google-ai-generativelanguage
@@ -229,6 +339,7 @@ protobuf==4.25.3
     #   google-generativeai
     #   googleapis-common-protos
     #   grpcio-status
+    #   onnxruntime
     #   opentelemetry-proto
     #   proto-plus
 pyasn1==0.6.0
@@ -240,6 +351,7 @@ pyasn1-modules==0.4.0
 pydantic==2.5.0
     # via
     #   chainlit
+    #   chromadb
     #   fastapi
     #   google-generativeai
     #   groq
@@ -250,12 +362,24 @@ pydantic==2.5.0
     #   openai
 pydantic-core==2.14.1
     # via pydantic
+pygments==2.18.0
+    # via rich
 pyjwt==2.8.0
     # via chainlit
 pyparsing==3.1.2
     # via httplib2
+pypika==0.48.9
+    # via chromadb
+pyproject-hooks==1.1.0
+    # via build
+python-dateutil==2.9.0.post0
+    # via
+    #   kubernetes
+    #   posthog
 python-dotenv==1.0.1
-    # via chainlit
+    # via
+    #   chainlit
+    #   uvicorn
 python-engineio==4.9.0
     # via python-socketio
 python-multipart==0.0.9
@@ -264,26 +388,62 @@ python-socketio==5.11.2
     # via fastapi-socketio
 pyyaml==6.0.1
     # via
+    #   chromadb
+    #   huggingface-hub
+    #   kubernetes
     #   langchain
     #   langchain-community
     #   langchain-core
+    #   transformers
+    #   uvicorn
+regex==2024.5.15
+    # via transformers
 requests==2.31.0
     # via
+    #   chromadb
     #   google-api-core
+    #   huggingface-hub
+    #   kubernetes
     #   langchain
     #   langchain-community
     #   langsmith
     #   opentelemetry-exporter-otlp-proto-http
+    #   posthog
+    #   requests-oauthlib
+    #   transformers
+requests-oauthlib==2.0.0
+    # via kubernetes
+rich==13.7.1
+    # via typer
 rsa==4.9
     # via google-auth
+safetensors==0.4.3
+    # via transformers
+scikit-learn==1.5.0
+    # via sentence-transformers
+scipy==1.13.1
+    # via
+    #   scikit-learn
+    #   sentence-transformers
+sentence-transformers==2.7.0
+    # via -r requirements.in
+shellingham==1.5.4
+    # via typer
 simple-websocket==1.0.0
     # via python-engineio
+six==1.16.0
+    # via
+    #   kubernetes
+    #   posthog
+    #   python-dateutil
 sniffio==1.3.1
     # via
     #   anyio
     #   groq
     #   httpx
     #   openai
+soupsieve==2.5
+    # via beautifulsoup4
 sqlalchemy==2.0.30
     # via
     #   langchain
@@ -292,29 +452,61 @@ starlette==0.37.2
     # via
     #   chainlit
     #   fastapi
+sympy==1.12
+    # via
+    #   onnxruntime
+    #   torch
 syncer==2.0.3
     # via chainlit
 tenacity==8.3.0
     # via
+    #   chromadb
     #   langchain
     #   langchain-community
     #   langchain-core
+threadpoolctl==3.5.0
+    # via scikit-learn
+tokenizers==0.19.1
+    # via
+    #   chromadb
+    #   transformers
 tomli==2.0.1
-    # via chainlit
+    # via
+    #   build
+    #   chainlit
+torch==2.3.0
+    # via sentence-transformers
 tqdm==4.66.4
     # via
+    #   chromadb
     #   google-generativeai
+    #   huggingface-hub
     #   openai
+    #   sentence-transformers
+    #   transformers
+transformers==4.41.1
+    # via sentence-transformers
+typer==0.12.3
+    # via chromadb
+types-beautifulsoup4==4.12.0.20240511
+    # via -r requirements.in
+types-html5lib==1.1.11.20240228
+    # via types-beautifulsoup4
 typing-extensions==4.10.0
     # via
+    #   asgiref
+    #   chromadb
     #   fastapi
     #   google-generativeai
     #   groq
+    #   huggingface-hub
     #   openai
     #   opentelemetry-sdk
     #   pydantic
     #   pydantic-core
     #   sqlalchemy
+    #   torch
+    #   typer
     #   typing-inspect
     #   uvicorn
 typing-inspect==0.9.0
@@ -324,11 +516,23 @@ uptrace==1.22.0
 uritemplate==4.1.1
     # via google-api-python-client
 urllib3==2.2.1
-    # via requests
-uvicorn==0.25.0
-    # via chainlit
+    # via
+    #   kubernetes
+    #   requests
+uvicorn[standard]==0.25.0
+    # via
+    #   chainlit
+    #   chromadb
+uvloop==0.19.0
+    # via uvicorn
 watchfiles==0.20.0
-    # via chainlit
+    # via
+    #   chainlit
+    #   uvicorn
+websocket-client==1.8.0
+    # via kubernetes
+websockets==12.0
+    # via uvicorn
 wrapt==1.16.0
     # via
     #   deprecated