add Dockerfile and ingest-guru-cards.py

navapbc · May 28, 2024 · d7be2d3 · d7be2d3
1 parent 8b238c5
commit d7be2d3
Show file tree

Hide file tree

Showing 16 changed files with 597 additions and 50 deletions.
diff --git a/05-assistive-chatbot/.gitignore b/05-assistive-chatbot/.gitignore
@@ -4,6 +4,8 @@ chroma_db/
 *.log
 log/
 
+# MacOS files
 *.DS_STORE
 
+# .env contains secret API keys
 .env
diff --git a/05-assistive-chatbot/Dockerfile b/05-assistive-chatbot/Dockerfile
@@ -0,0 +1,26 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Install Python dependencies before copying the rest of the source
+COPY requirements.txt .
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# curl is used by the download step and the HEALTHCHECK below; unzip unpacks the download
+RUN apt-get update && apt-get install -y \
+    curl unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+# URL of the zipped Guru cards export, supplied with `--build-arg GURU_CARDS_URL=...`
+ARG GURU_CARDS_URL
+# -f makes curl fail on an HTTP error so the build stops here instead of saving an error page as the zip
+RUN echo "Downloading from ${GURU_CARDS_URL}" \
+    && curl -fL "${GURU_CARDS_URL}" > download.zip \
+    && unzip -o download.zip \
+    && rm download.zip \
+    && mv guru_cards_for_nava--Multi-benefit.json guru_cards_for_nava.json
+
+# Update .dockerignore to prevent files from being copied into the image
+COPY . .
+
+RUN ./ingest-guru-cards.py
+
+EXPOSE 8000
+# -f makes curl exit non-zero on HTTP 4xx/5xx, so error responses mark the container unhealthy
+HEALTHCHECK CMD curl -f http://localhost:8000 || exit 1
+ENTRYPOINT ["chainlit", "run", "--port", "8000", "-h", "chatbot-chainlit.py"]
diff --git a/05-assistive-chatbot/README.md b/05-assistive-chatbot/README.md
@@ -5,46 +5,54 @@
 * Use Python 3.11.x (higher versions cause library version problems).
 * Install Python libraries: `pip install -r requirements.txt`
 
-### (Optional) Enable Chatbot feedback
+### (Optional) Enable Chatbot Feedback
 To enable the [feedback mechanism](https://docs.chainlit.io/data-persistence/feedback):
 * Get an API key: https://docs.chainlit.io/data-persistence/overview
 * Create or update `.env` with `LITERAL_API_KEY` set to the API key
 
 After running the chatbot and providing feedback in the UI, review the feedback at https://cloud.getliteral.ai/projects/YOUR_PROJECT_NAME/feedback.
 
-* To use a custom feedback storage instead of `getliteral.ai`, see https://docs.chainlit.io/data-persistence/custom.
+* To use custom feedback storage instead of `getliteral.ai`, see https://docs.chainlit.io/data-persistence/custom.
 
 
-## Run
+## Running an application
 
-All apps use configurations set in `.env`, which can be overridden by environment variables, like `CHAT_ENGINE` and `LLM_MODEL_NAME`.  See `_init_settings()` in `chatbot/__init__.py` for other variables.
+There are several ways to run the chatbot application, offering different ways to interact with the chatbot.
+All apps use configurations set in `.env`, which is *not* checked into git. These configurations (like `CHAT_ENGINE` and `LLM_MODEL_NAME`) can be overridden by environment variables set on the commandline.  See `_init_settings()` in `chatbot/__init__.py` for other variables.
 
-### Run web chatbot app
+### Run commandline app
+
+This commandline application entrypoint is useful for quickly or repeatedly running, testing, or debugging without having to click through or type in a UI. Set the configuration in `.env` or as environment variables, then run `./cmdline.py`.
+
+To quickly set variables and run the app on a single line: 
+`CHATBOT_LOG_LEVEL=INFO CHAT_ENGINE=Direct LLM_MODEL_NAME='langchain.ollama :: openhermes' ./cmdline.py`
+
+To see more logs, adjust the log level, like `CHATBOT_LOG_LEVEL=DEBUG`.
+
+### Run chatbot web app
+
+This application provides a chatbot web app that users typically interact with.
 
 1. Start the Chainlit-based chatbot service: `./chatbot-chainlit.py` or `chainlit run ./chatbot-chainlit.py`
 1. Open a browser to `http://localhost:8000/`
 
-For development, run something like `chainlit run -w -h --port 9000 ./chatbot-chainlit.py`.
+For development, run something like `chainlit run -w -h --port 9000 ./chatbot-chainlit.py` to watch for changed files and automatically update the running application without having to restart chainlit.
+
+Chainlit UI configurations are in the `.chainlit/config.toml` file.
 
-Running the chatbot app will also run the API, which is defined in `chatbot_api.py`.
+Running the chatbot app will also run the API (described in the next section), which is defined in `chatbot_api.py`.
 
 ### Run only the API
 
+This application runs the chatbot API for other applications to make requests to the chatbot.
+
 1. Run `./chatbot_api.py`
 1. Open a browser to the `/query` endpoint followed by a question, such as `http://localhost:8001/query/tell me a joke`
 
-### Run commandline app
 
-1. Run `./cmdline.py`
-
-To quickly set variables and run the app on a single line: 
-`CHATBOT_LOG_LEVEL=INFO CHAT_ENGINE=Direct LLM_MODEL_NAME='langchain.ollama :: openhermes' ./cmdline.py`
-
-To see more logs, adjust the log level like `CHATBOT_LOG_LEVEL=DEBUG`.
-
-
-## Development
+## Development Notes
 
+- Application entrypoints are in the root folder of the repo. Other Python files are under the `chatbot` folder.
 - The chatbot package `chatbot/__init__.py` is run for all apps because they `import chatbot`.
 - It initializes settings (`_init_settings()`) and creates a specified chat engine (`create_chat_engine(settings)`).
 
@@ -56,7 +64,8 @@ To create a chat engine, add a new Python file under `chatbot/engines` with:
 - an `init_engine(settings)` function to instantiate a chat engine class
 - a chat engine class that:
     - creates a client to an LLM (`create_llm_client(settings)`), then
-    - uses the LLM client to generate a response to specified query (`gen_response(self, query)`)
+    - uses the LLM client to generate a response to a specified query (`gen_response(self, query)`)
+
+The new Python file will be automatically discovered and registered for display in the Chainlit settings web UI.
 
 The `chat_engine.gen_response(query)` function is called by the apps when a user submits a query.
 
@@ -71,8 +80,23 @@ To create a new LLM client, add a new Python file under `chatbot/llms` with:
 - an LLM client class that:
     - sets `self.client` based on the provided `settings`, and
     - implements a `submit(self, message)` function that uses `self.client` to generate a response, which may need to be parsed so that a string is returned to `chat_engine.gen_response(self, query)`.
+
+The new Python file will be automatically discovered and registered for display in the Chainlit settings web UI.
 
 An LLM client can be used in any arbitrary program by:
 - setting `client = init_client(model_name, settings)`
 - then calling `client.submit(message)`
 See `client_example_usage()` in `chatbot/llms/mock_llm_client.py`.
+
+### Python formatting
+
+Install and run `ruff format .` and `isort .` to consistently format Python files.
+
+### Docker
+
+A Docker image is built for deployments (by GitHub Action `push-image.yml`). To verify that the image builds and runs correctly, run:
+```
+GURU_CARDS_URL_ID='1fO-ABCD1234...' # Google Drive document id
+docker build -t dst-chatbot . --build-arg GURU_CARDS_URL="https://docs.google.com/uc?export=download&id=$GURU_CARDS_URL_ID"
+docker run --rm -p 8000:8000 dst-chatbot
+```
+Then, open a browser to `http://localhost:8000/` for testing.
diff --git a/05-assistive-chatbot/chatbot-chainlit.py b/05-assistive-chatbot/chatbot-chainlit.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env chainlit run -h
 
 """
-ChainLit-based chatbot, providing a web user interface for the selected chat engine and settings.
+Chainlit-based chatbot, providing a web user interface for the selected chat engine and settings.
 See README.md for instructions to enable user feedback.
 """
 

diff --git a/05-assistive-chatbot/chatbot/__init__.py b/05-assistive-chatbot/chatbot/__init__.py
@@ -13,7 +13,7 @@
 ## Initialize logging
 
 
-def _configure_logging():
+def configure_logging():
     log_format = os.environ.get("LOG_FORMAT", "%(relativeCreated)6d - %(name)-24s - %(levelname)-5s - %(message)s")
     logging.basicConfig(format=log_format)
 
@@ -23,7 +23,7 @@ def _configure_logging():
 
 
 dotenv.load_dotenv()
-_configure_logging()
+configure_logging()
 
 logger = logging.getLogger(__name__)
 

diff --git a/05-assistive-chatbot/chatbot/engines/__init__.py b/05-assistive-chatbot/chatbot/engines/__init__.py
@@ -1,4 +1,3 @@
-import importlib
 import logging
 from types import ModuleType
 from typing import Dict
@@ -18,8 +17,7 @@ def available_engines():
 def _discover_chat_engines(force=False):
     if not _engines or force:
         _engines.clear()
-        namespace = importlib.import_module(__package__)
-        found_llm_modules = utils.scan_modules(namespace)
+        found_llm_modules = utils.scan_modules(__package__)
         for _module_name, module in found_llm_modules.items():
             if not hasattr(module, "ENGINE_NAME"):
                 continue

diff --git a/05-assistive-chatbot/chatbot/guru_cards.py b/05-assistive-chatbot/chatbot/guru_cards.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+import json
+import os
+
+from bs4 import BeautifulSoup
+
+
+class GuruCardsProcessor:
+    """Reads a Guru cards JSON file and extracts question/answer text from its cards."""
+
+    def __init__(
+        self,
+        file_path="./guru_cards_for_nava.json",
+        question_key="preferredPhrase",
+        content_key="content",
+    ):
+        """
+        - file_path is the location of the Guru cards JSON file
+        - question_key is the card field holding the question text
+        - content_key is the card field holding the HTML content (the answer)
+        """
+        self.file_path = file_path
+        self.question_key = question_key
+        self.content_key = content_key
+
+    def extract_qa_text_from_guru(self):
+        """Load the cards file and return a dict mapping each question to its plain-text answer."""
+        return self._extract_question_answers(self.cards_as_json())
+
+    def cards_as_json(self):
+        """Return the parsed JSON content of the file at self.file_path."""
+        with open(self.file_path, encoding="utf-8") as cards_file:
+            parsed_cards = json.load(cards_file)
+        return parsed_cards
+
+    def _extract_question_answers(self, json_data):
+        """Map each question (stripped) to its answer with the HTML markup removed.
+
+        Cards whose question field does not end with "?" are skipped.
+        """
+        answers_by_question = {}
+        for card in json_data:
+            question = card[self.question_key].strip()
+            if question.endswith("?"):
+                html = BeautifulSoup(card[self.content_key], "html.parser")
+                answers_by_question[question] = html.get_text(separator="\n", strip=True)
+        return answers_by_question
+
+
+def save_simplified_json(gc_processor):
+    """Write a simplified copy of the Guru cards JSON file that is easier to review.
+
+    The copy is saved beside the input file as `<name>_simplified<ext>`. Each card keeps
+    only its preferredPhrase, its tag values and board titles (each comma-joined), and
+    its content converted from HTML to plain text.
+    """
+    cards = gc_processor.cards_as_json()
+    base_path, extension = os.path.splitext(gc_processor.file_path)
+    with open(f"{base_path}_simplified{extension}", "w", encoding="utf-8") as out_file:
+        simplified_cards = []
+        for card in cards:
+            tag_values = [tag.get("value") for tag in card.get("tags", [])]
+            board_titles = [board.get("title") for board in card.get("boards", [])]
+            html = BeautifulSoup(card[gc_processor.content_key], "html.parser")
+            plain_text = html.get_text(separator="\n", strip=True)
+            simplified_card = {
+                "preferredPhrase": card["preferredPhrase"],
+                "tags": ",".join(tag_values),
+                "boards": ",".join(board_titles),
+                gc_processor.content_key: plain_text,
+            }
+            simplified_cards.append(simplified_card)
+        json.dump(simplified_cards, out_file, indent=4)
+
+
+if __name__ == "__main__":
+    import sys
+
+    # An optional first command-line argument overrides the default Guru cards file path
+    args = sys.argv[1:]
+    _gc_processor = GuruCardsProcessor(file_path=args[0]) if args else GuruCardsProcessor()
+
+    save_simplified_json(_gc_processor)
diff --git a/05-assistive-chatbot/chatbot/ingest/__init_.py b/05-assistive-chatbot/chatbot/ingest/__init_.py
diff --git a/05-assistive-chatbot/chatbot/ingest/text_splitter.py b/05-assistive-chatbot/chatbot/ingest/text_splitter.py
@@ -0,0 +1,49 @@
+import logging
+
+from langchain.docstore.document import Document
+from langchain_text_splitters import (NLTKTextSplitter,
+                                      RecursiveCharacterTextSplitter,
+                                      SpacyTextSplitter)
+
+logger = logging.getLogger(__name__)
+
+
+class TextSplitter:
+    """Splits a titled text into chunks and checks that every chunk fits within a token limit."""
+
+    def __init__(self, llm_client, token_limit, text_splitter_name, **text_splitter_args):
+        """
+        - llm_client is used to get the number of tokens in a text
+        - token_limit is the maximum number of tokens allowed by the embedding model
+        - text_splitter_name is the name of the splitter class to use; see create_text_splitter()
+        - text_splitter_args are passed on to create_text_splitter()
+        """
+        self.llm_client = llm_client
+        self.token_limit = token_limit
+        self.text_splitter = self.create_text_splitter(text_splitter_name, **text_splitter_args)
+
+    def create_text_splitter(self, choice, **kwargs):
+        """
+        Return a new text splitter for the given class name.
+        - choice is "NLTKTextSplitter", "SpacyTextSplitter", or "RecursiveCharacterTextSplitter"
+        - kwargs must include chunk_size and chunk_overlap for RecursiveCharacterTextSplitter;
+          they are ignored (with a warning) for the other splitters
+        Raises ValueError if choice is not one of the supported names.
+        """
+        logger.info("Creating %s", choice)
+        if choice == "NLTKTextSplitter":
+            logger.warning("  Not using arguments: %s", kwargs)
+            splitter = NLTKTextSplitter()
+        elif choice == "SpacyTextSplitter":
+            logger.warning("  Not using arguments: %s", kwargs)
+            splitter = SpacyTextSplitter()
+        elif choice == "RecursiveCharacterTextSplitter":
+            logger.info("  Using arguments: %s", kwargs)
+            splitter = RecursiveCharacterTextSplitter(
+                chunk_size=kwargs["chunk_size"], chunk_overlap=kwargs["chunk_overlap"]
+            )
+        else:
+            # Fail with a clear message rather than an UnboundLocalError at the return below
+            raise ValueError(f"Unknown text splitter: {choice}")
+        return splitter
+
+    def split_into_chunks(self, title, text):
+        """
+        - title is the title to be used as the source of the text
+        - text is the text to split
+        Returns a list of Documents, one per chunk; each has the stripped title as its "source"
+        metadata and the full title-plus-text as its "entire_card" metadata.
+        """
+        entire_text = title + "\n\n" + text
+        texts = self.text_splitter.split_text(entire_text)
+
+        logger.info("  Split into %s", len(texts))
+        for t in texts:
+            token_count = self.llm_client.get_num_tokens(t)
+            # f-string so the limit and the offending count appear in the failure message
+            assert token_count <= self.token_limit, f"Exceeded token limit of {self.token_limit}: {token_count}"
+
+        return [Document(page_content=t, metadata={"source": title.strip(), "entire_card": entire_text}) for t in texts]
diff --git a/05-assistive-chatbot/chatbot/llms/__init__.py b/05-assistive-chatbot/chatbot/llms/__init__.py
@@ -1,4 +1,3 @@
-import importlib
 import logging
 from types import ModuleType
 from typing import Dict, Tuple
@@ -18,29 +17,38 @@ def available_llms():
 def _discover_llms(force=False):
     if not _llms or force:
         _llms.clear()
-        namespace = importlib.import_module(__package__)
-        found_modules = utils.scan_modules(namespace)
+        found_modules = utils.scan_modules(__package__)
         for module_name, module in found_modules.items():
             if not module or ignore(module_name):
                 logger.debug("Skipping module: %s", module_name)
                 continue
             client_name = module.CLIENT_NAME or module_name
             for llm_name in module.MODEL_NAMES or []:
-                qualified_llm_name = f"{client_name} :: {llm_name}"
-                _llms[qualified_llm_name] = (module, llm_name)
+                qualified_name = qualified_llm_name(client_name, llm_name)
+                _llms[qualified_name] = (module, llm_name)
     return _llms
 
 
+def qualified_llm_name(client_name, model_name):
+    """Return the combined name for a client's model, formatted as "<client_name> :: <model_name>"."""
+    return "{} :: {}".format(client_name, model_name)
+
+
 def ignore(module_name):
     if module_name.startswith("dspy ::"):
+        # DSPy client code is not yet ready for use
         return True
     return False
 
 
 ## Factory functions
 
 
-def init_client(model_name, settings=None):
+def init_client(qualified_name, settings=None):
+    """Initialize a specific LLM client based on the qualified_name.
+    :param qualified_name: str or Tuple[client_name, model_name]
+    :param settings: optional settings passed to the client module; an empty dict is used if omitted
+    """
     _discover_llms()
-    module, llm_name = _llms[model_name]
-    return module.init_client(llm_name, settings)
+    # Check against the builtin tuple; typing.Tuple is a deprecated alias intended for annotations
+    if isinstance(qualified_name, tuple):
+        qualified_name = qualified_llm_name(qualified_name[0], qualified_name[1])
+    module, llm_name = _llms[qualified_name]
+    return module.init_client(llm_name, settings or {})
diff --git a/05-assistive-chatbot/chatbot/utils.py b/05-assistive-chatbot/chatbot/utils.py
@@ -50,6 +50,8 @@ def wrapper_timer(*args, **kwargs):
 
 def scan_modules(ns_pkg):
     "Return a dictionary of Python modules found in the given namespace package"
+    if isinstance(ns_pkg, str):
+        ns_pkg = importlib.import_module(ns_pkg)
     # From https://packaging.python.org/en/latest/guides/creating-and-discovering-plugins/#using-namespace-packages
     itr = pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + ".")
     return {name: _import_module_if_possible(name) for _, name, _ in itr}

diff --git a/05-assistive-chatbot/chatbot_api.py b/05-assistive-chatbot/chatbot_api.py
@@ -2,8 +2,8 @@
 
 """
 This is a sample API file that demonstrates how to create an API using FastAPI,
-which is compatible with ChainLit. This file is a starting point for creating
-an API that can be deployed with the ChainLit chatbot.
+which is compatible with Chainlit. This file is a starting point for creating
+an API that can be deployed with the Chainlit chatbot.
 """
 
 import logging
@@ -17,7 +17,7 @@
     # If running directly, define the FastAPI app
     app = FastAPI()
 else:
-    # Otherwise use ChainLit's app
+    # Otherwise use Chainlit's app
     from chainlit.server import app
 
 logger = logging.getLogger(f"chatbot.{__name__}")