From d7be2d3c768f1bb2685b1b96528121c384d86e69 Mon Sep 17 00:00:00 2001
From: Yoom Lam
Date: Sun, 26 May 2024 23:01:45 -0500
Subject: [PATCH] add Dockerfile and ingest-guru-cards.py

---
 05-assistive-chatbot/.gitignore               |   2 +
 05-assistive-chatbot/Dockerfile               |  26 ++
 05-assistive-chatbot/README.md                |  60 +++--
 05-assistive-chatbot/chatbot-chainlit.py      |   2 +-
 05-assistive-chatbot/chatbot/__init__.py      |   4 +-
 .../chatbot/engines/__init__.py               |   4 +-
 05-assistive-chatbot/chatbot/guru_cards.py    |  69 ++++++
 .../chatbot/ingest/__init__.py                |   0
 .../chatbot/ingest/text_splitter.py           |  49 ++++
 05-assistive-chatbot/chatbot/llms/__init__.py |  24 +-
 05-assistive-chatbot/chatbot/utils.py         |   2 +
 05-assistive-chatbot/chatbot_api.py           |   6 +-
 05-assistive-chatbot/guru_api_access.md       |  75 ++++++
 05-assistive-chatbot/ingest-guru-cards.py     |  89 +++++++
 05-assistive-chatbot/requirements.in          |   9 +-
 05-assistive-chatbot/requirements.txt         | 226 +++++++++++++++++-
 16 files changed, 597 insertions(+), 50 deletions(-)
 create mode 100644 05-assistive-chatbot/Dockerfile
 create mode 100644 05-assistive-chatbot/chatbot/guru_cards.py
 create mode 100644 05-assistive-chatbot/chatbot/ingest/__init__.py
 create mode 100644 05-assistive-chatbot/chatbot/ingest/text_splitter.py
 create mode 100644 05-assistive-chatbot/guru_api_access.md
 create mode 100755 05-assistive-chatbot/ingest-guru-cards.py

diff --git a/05-assistive-chatbot/.gitignore b/05-assistive-chatbot/.gitignore
index 2a52261..33c3b03 100644
--- a/05-assistive-chatbot/.gitignore
+++ b/05-assistive-chatbot/.gitignore
@@ -4,6 +4,8 @@ chroma_db/
 *.log
 log/
 
+# macOS files
 *.DS_STORE
 
+# .env contains secret API keys
 .env
diff --git a/05-assistive-chatbot/Dockerfile b/05-assistive-chatbot/Dockerfile
new file mode 100644
index 0000000..3b227c4
--- /dev/null
+++ b/05-assistive-chatbot/Dockerfile
@@ -0,0 +1,26 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+RUN apt-get update && apt-get install -y \
+    curl unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG GURU_CARDS_URL
+RUN echo "Downloading from ${GURU_CARDS_URL}" \
+    && curl -L "${GURU_CARDS_URL}" > download.zip \
+    && unzip -o download.zip \
+    && rm download.zip \
+    && mv guru_cards_for_nava--Multi-benefit.json guru_cards_for_nava.json
+
+# Update .dockerignore to prevent unwanted files from being copied into the image
+COPY . .
+
+RUN ./ingest-guru-cards.py
+
+EXPOSE 8000
+HEALTHCHECK CMD curl http://localhost:8000 || exit 1
+ENTRYPOINT ["chainlit", "run", "--port", "8000", "-h", "chatbot-chainlit.py"]
diff --git a/05-assistive-chatbot/README.md b/05-assistive-chatbot/README.md
index 8691d78..bd764b8 100644
--- a/05-assistive-chatbot/README.md
+++ b/05-assistive-chatbot/README.md
@@ -5,46 +5,54 @@
 * Use Python 3.11.x (higher versions cause library version problems).
 * Install Python libraries: `pip install -r requirements.txt`
 
-### (Optional) Enable Chatbot feedback
+### (Optional) Enable Chatbot Feedback
 
 To enable the [feedback mechanism](https://docs.chainlit.io/data-persistence/feedback):
 * Get an API key: https://docs.chainlit.io/data-persistence/overview
 * Create or update `.env` with `LITERAL_API_KEY` set to the API key
 
 After running the chatbot and providing feedback in the UI, review the feedback at https://cloud.getliteral.ai/projects/YOUR_PROJECT_NAME/feedback.
 
-* To use a custom feedback storage instead of `getliteral.ai`, see https://docs.chainlit.io/data-persistence/custom.
+* To use custom feedback storage instead of `getliteral.ai`, see https://docs.chainlit.io/data-persistence/custom.
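For illustration, a minimal `.env` that combines the variables mentioned in this README might look like the following sketch (all values are placeholders, not real keys):
```
# .env -- keep out of git; see the .gitignore entry above
LITERAL_API_KEY=lit_api_key_placeholder
CHAT_ENGINE=Direct
LLM_MODEL_NAME='langchain.ollama :: openhermes'
CHATBOT_LOG_LEVEL=INFO
```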
 
-## Run
+## Running an application
 
-All apps use configurations set in `.env`, which can be overridden by environment variables, like `CHAT_ENGINE` and `LLM_MODEL_NAME`. See `_init_settings()` in `chatbot/__init__.py` for other variables.
+There are several ways to run the chatbot application, each offering a different way to interact with the chatbot.
+All apps use configurations set in `.env`, which is *not* checked into git. These configurations (like `CHAT_ENGINE` and `LLM_MODEL_NAME`) can be overridden by environment variables set on the commandline. See `_init_settings()` in `chatbot/__init__.py` for other variables.
 
-### Run web chatbot app
+### Run commandline app
+
+This commandline entrypoint is useful for quickly or repeatedly running, testing, or debugging the chatbot without having to click through or type into a UI. Set the configuration in `.env` or as environment variables, then run `./cmdline.py`.
+
+To quickly set variables and run the app on a single line:
+`CHATBOT_LOG_LEVEL=INFO CHAT_ENGINE=Direct LLM_MODEL_NAME='langchain.ollama :: openhermes' ./cmdline.py`
+
+To see more logs, adjust the log level, like `CHATBOT_LOG_LEVEL=DEBUG`.
+
+### Run chatbot web app
+
+This is the chatbot web app that users typically interact with.
 
 1. Start the Chainlit-based chatbot service: `./chatbot-chainlit.py` or `chainlit run ./chatbot-chainlit.py`
 1. Open a browser to `http://localhost:8000/`
 
-For development, run something like `chainlit run -w -h --port 9000 ./chatbot-chainlit.py`.
+For development, run something like `chainlit run -w -h --port 9000 ./chatbot-chainlit.py` to watch for changed files and automatically update the running application without having to restart Chainlit.
+
+Chainlit UI configurations are in the `.chainlit/config.toml` file.
 
-Running the chatbot app will also run the API, which is defined in `chatbot_api.py`.
+Running the chatbot app will also run the API (described in the next section), which is defined in `chatbot_api.py`.
 
 ### Run only the API
 
+This runs only the chatbot API, so that other applications can make requests to the chatbot.
+
 1. Run `./chatbot_api.py`
 1. Open a browser to the `/query` endpoint followed by a question, such as `http://localhost:8001/query/tell me a joke`
 
-### Run commandline app
-1. Run `./cmdline.py`
-
-To quickly set variables and run the app on a single line:
-`CHATBOT_LOG_LEVEL=INFO CHAT_ENGINE=Direct LLM_MODEL_NAME='langchain.ollama :: openhermes' ./cmdline.py`
-
-To see more logs, adjust the log level like `CHATBOT_LOG_LEVEL=DEBUG`.
-
-
-## Development
+## Development Notes
 
+- Application entrypoints are in the root folder of the repo. Other Python files are under the `chatbot` folder.
 - The chatbot package `chatbot/__init__.py` is run for all apps because they `import chatbot`.
   - It initializes settings (`_init_settings()`) and creates a specified chat engine (`create_chat_engine(settings)`).
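As a sketch of how the `.env`-plus-environment-variable precedence described above typically works (the actual `_init_settings()` in `chatbot/__init__.py` may read more variables and use different defaults):
```
import os

import dotenv

# Values already present in the environment win, because load_dotenv()
# does not override existing environment variables by default.
dotenv.load_dotenv()

# Hypothetical illustration of settings resolution
settings = {
    "chat_engine": os.environ.get("CHAT_ENGINE", "Direct"),
    "model_name": os.environ.get("LLM_MODEL_NAME", "langchain.ollama :: openhermes"),
    "log_level": os.environ.get("CHATBOT_LOG_LEVEL", "WARN"),
}
```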
@@ -56,7 +64,8 @@ To create a chat engine, add a new Python file under `chatbot/engines` with:
 - an `init_engine(settings)` function to instantiate a chat engine class
 - a chat engine class that:
   - creates a client to an LLM (`create_llm_client(settings)`), then
-  - uses the LLM client to generate a response to specified query (`gen_response(self, query)`)
+  - uses the LLM client to generate a response to a specified query (`gen_response(self, query)`)
+The new Python file will be automatically discovered and registered for display in the Chainlit settings web UI.
 
 The `chat_engine.gen_response(query)` function is called by the apps when a user submits a query.
 
@@ -71,8 +80,23 @@ To create a new LLM client, add a new Python file under `chatbot/llms` with:
 - an LLM client class that:
   - sets `self.client` based on the provided `settings`, and
   - implements a `submit(self, message)` function that uses `self.client` to generate a response, which may need to be parsed so that a string is returned to `chat_engine.gen_response(self, query)`.
+The new Python file will be automatically discovered and registered for display in the Chainlit settings web UI.
 
 An LLM client can be used in any arbitrary program by:
 - setting `client = init_client(model_name, settings)`
 - then calling `client.submit(message)`
 See `client_example_usage()` in `chatbot/llms/mock_llm_client.py`.
+
+### Python formatting
+
+Install and run `ruff format .` and `isort .` to consistently format Python files.
+
+### Docker
+
+A Docker image is built for deployments (by the GitHub Action `push-image.yml`). To verify that the image builds and runs correctly, run:
+```
+GURU_CARDS_URL_ID='1fO-ABCD1234...'  # Google Drive document id
+docker build -t dst-chatbot . --build-arg GURU_CARDS_URL="https://docs.google.com/uc?export=download&id=$GURU_CARDS_URL_ID"
+docker run --rm -p 8000:8000 dst-chatbot
+```
+Then, open a browser to `http://localhost:8000/` for testing.
diff --git a/05-assistive-chatbot/chatbot-chainlit.py b/05-assistive-chatbot/chatbot-chainlit.py
index 51f96b1..8c752d2 100755
--- a/05-assistive-chatbot/chatbot-chainlit.py
+++ b/05-assistive-chatbot/chatbot-chainlit.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env chainlit run -h
 
 """
-ChainLit-based chatbot, providing a web user interface for the selected chat engine and settings.
+Chainlit-based chatbot, providing a web user interface for the selected chat engine and settings.
 
 See README.md for instructions to enable user feedback.
 """
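To make the LLM-client contract above concrete, a hypothetical `chatbot/llms/echo_llm_client.py` (modeled on `chatbot/llms/mock_llm_client.py`, not part of this patch) could look like:
```
import logging

logger = logging.getLogger(f"chatbot.{__name__}")

CLIENT_NAME = "echo"        # picked up by _discover_llms()
MODEL_NAMES = ["echo-v1"]   # registered as "echo :: echo-v1"


def init_client(model_name, settings):
    return EchoLlmClient(model_name, settings)


class EchoLlmClient:
    def __init__(self, model_name, settings):
        self.model_name = model_name
        self.settings = settings
        self.client = None  # a real module would create an SDK client here

    def submit(self, message):
        # Must return a plain string for chat_engine.gen_response()
        return f"echo({self.model_name}): {message}"
```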
""" diff --git a/05-assistive-chatbot/chatbot/__init__.py b/05-assistive-chatbot/chatbot/__init__.py index 21abde1..c51ccf5 100644 --- a/05-assistive-chatbot/chatbot/__init__.py +++ b/05-assistive-chatbot/chatbot/__init__.py @@ -13,7 +13,7 @@ ## Initialize logging -def _configure_logging(): +def configure_logging(): log_format = os.environ.get("LOG_FORMAT", "%(relativeCreated)6d - %(name)-24s - %(levelname)-5s - %(message)s") logging.basicConfig(format=log_format) @@ -23,7 +23,7 @@ def _configure_logging(): dotenv.load_dotenv() -_configure_logging() +configure_logging() logger = logging.getLogger(__name__) diff --git a/05-assistive-chatbot/chatbot/engines/__init__.py b/05-assistive-chatbot/chatbot/engines/__init__.py index 535dc1b..b183b45 100644 --- a/05-assistive-chatbot/chatbot/engines/__init__.py +++ b/05-assistive-chatbot/chatbot/engines/__init__.py @@ -1,4 +1,3 @@ -import importlib import logging from types import ModuleType from typing import Dict @@ -18,8 +17,7 @@ def available_engines(): def _discover_chat_engines(force=False): if not _engines or force: _engines.clear() - namespace = importlib.import_module(__package__) - found_llm_modules = utils.scan_modules(namespace) + found_llm_modules = utils.scan_modules(__package__) for _module_name, module in found_llm_modules.items(): if not hasattr(module, "ENGINE_NAME"): continue diff --git a/05-assistive-chatbot/chatbot/guru_cards.py b/05-assistive-chatbot/chatbot/guru_cards.py new file mode 100644 index 0000000..1fe608d --- /dev/null +++ b/05-assistive-chatbot/chatbot/guru_cards.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +import json +import os + +from bs4 import BeautifulSoup + + +class GuruCardsProcessor: + def __init__( + self, + file_path="./guru_cards_for_nava.json", + question_key="preferredPhrase", + content_key="content", + ): + self.file_path = file_path + self.question_key = question_key + self.content_key = content_key + + def extract_qa_text_from_guru(self): + json_data = self.cards_as_json() + question_answers = self._extract_question_answers(json_data) + return question_answers + + def cards_as_json(self): + with open(self.file_path, encoding="utf-8") as data_file: + return json.load(data_file) + + def _extract_question_answers(self, json_data): + question_answers = {} + for content in json_data: + if not content[self.question_key].strip().endswith("?"): + continue + soup = BeautifulSoup(content[self.content_key], "html.parser") + answer = soup.get_text(separator="\n", strip=True) + question_answers[content[self.question_key].strip()] = answer + return question_answers + + +def save_simplified_json(gc_processor): + "Saves a simplified version of the Guru cards JSON file for easier review" + json_data = gc_processor.cards_as_json() + name, ext = os.path.splitext(gc_processor.file_path) + with open(f"{name}_simplified{ext}", "w", encoding="utf-8") as f: + simplified_json = [] + for card in json_data: + tags = [tagsItem.get("value") for tagsItem in card.get("tags", [])] + boards = [boardsItem.get("title") for boardsItem in card.get("boards", [])] + soup = BeautifulSoup(card[gc_processor.content_key], "html.parser") + content = soup.get_text(separator="\n", strip=True) + simplified_json.append( + { + "preferredPhrase": card["preferredPhrase"], + "tags": ",".join(tags), + "boards": ",".join(boards), + gc_processor.content_key: content, + } + ) + json.dump(simplified_json, f, indent=4) + + +if __name__ == "__main__": + import sys + + if args := sys.argv[1:]: + _gc_processor = GuruCardsProcessor(file_path=args[0]) + else: + 
diff --git a/05-assistive-chatbot/chatbot/ingest/__init__.py b/05-assistive-chatbot/chatbot/ingest/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/05-assistive-chatbot/chatbot/ingest/text_splitter.py b/05-assistive-chatbot/chatbot/ingest/text_splitter.py
new file mode 100644
index 0000000..2770e4b
--- /dev/null
+++ b/05-assistive-chatbot/chatbot/ingest/text_splitter.py
@@ -0,0 +1,49 @@
+import logging
+
+from langchain.docstore.document import Document
+from langchain_text_splitters import (NLTKTextSplitter,
+                                      RecursiveCharacterTextSplitter,
+                                      SpacyTextSplitter)
+
+logger = logging.getLogger(__name__)
+
+
+class TextSplitter:
+    def __init__(self, llm_client, token_limit, text_splitter_name, **text_splitter_args):
+        """
+        - llm_client is used to get the number of tokens in a text
+        - token_limit is the maximum number of tokens allowed by the embedding model
+        """
+        self.llm_client = llm_client
+        self.token_limit = token_limit
+        self.text_splitter = self.create_text_splitter(text_splitter_name, **text_splitter_args)
+
+    def create_text_splitter(self, choice, **kwargs):
+        logger.info("Creating %s", choice)
+        if choice == "NLTKTextSplitter":
+            logger.warning("  Not using arguments: %s", kwargs)
+            splitter = NLTKTextSplitter()
+        elif choice == "SpacyTextSplitter":
+            logger.warning("  Not using arguments: %s", kwargs)
+            splitter = SpacyTextSplitter()
+        elif choice == "RecursiveCharacterTextSplitter":
+            logger.info("  Using arguments: %s", kwargs)
+            splitter = RecursiveCharacterTextSplitter(
+                chunk_size=kwargs["chunk_size"], chunk_overlap=kwargs["chunk_overlap"]
+            )
+        return splitter
+
+    def split_into_chunks(self, title, text):
+        """
+        - title is the title to be used as the source of the text
+        - text is the text to split
+        """
+        entire_text = title + "\n\n" + text
+        texts = self.text_splitter.split_text(entire_text)
+
+        logger.info("  Split into %s", len(texts))
+        for t in texts:
+            token_count = self.llm_client.get_num_tokens(t)
+            assert token_count <= self.token_limit, f"Exceeded token limit of {self.token_limit}: {token_count}"
+
+        return [Document(page_content=t, metadata={"source": title.strip(), "entire_card": entire_text}) for t in texts]
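A sketch of using `TextSplitter` on its own; any object with a `get_num_tokens(text)` method can stand in for `llm_client` (the stub below is illustrative, whereas `ingest-guru-cards.py` passes a LangChain Ollama client instead):
```
from chatbot.ingest.text_splitter import TextSplitter


class StubTokenCounter:
    def get_num_tokens(self, text):
        # Crude stand-in: roughly 4 characters per token
        return len(text) // 4


splitter = TextSplitter(
    llm_client=StubTokenCounter(),
    token_limit=256,
    text_splitter_name="RecursiveCharacterTextSplitter",
    chunk_size=250,
    chunk_overlap=100,
)
chunks = splitter.split_into_chunks("What is SNAP?", "SNAP provides food benefits to ...")
print(len(chunks), chunks[0].metadata["source"])
```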
diff --git a/05-assistive-chatbot/chatbot/llms/__init__.py b/05-assistive-chatbot/chatbot/llms/__init__.py
index 4e86dfb..f039cec 100644
--- a/05-assistive-chatbot/chatbot/llms/__init__.py
+++ b/05-assistive-chatbot/chatbot/llms/__init__.py
@@ -1,4 +1,3 @@
-import importlib
 import logging
 from types import ModuleType
 from typing import Dict, Tuple
@@ -18,21 +17,25 @@ def available_llms():
 def _discover_llms(force=False):
     if not _llms or force:
         _llms.clear()
-        namespace = importlib.import_module(__package__)
-        found_modules = utils.scan_modules(namespace)
+        found_modules = utils.scan_modules(__package__)
         for module_name, module in found_modules.items():
             if not module or ignore(module_name):
                 logger.debug("Skipping module: %s", module_name)
                 continue
             client_name = module.CLIENT_NAME or module_name
             for llm_name in module.MODEL_NAMES or []:
-                qualified_llm_name = f"{client_name} :: {llm_name}"
-                _llms[qualified_llm_name] = (module, llm_name)
+                qualified_name = qualified_llm_name(client_name, llm_name)
+                _llms[qualified_name] = (module, llm_name)
     return _llms
 
 
+def qualified_llm_name(client_name, model_name):
+    return f"{client_name} :: {model_name}"
+
+
 def ignore(module_name):
     if module_name.startswith("dspy ::"):
+        # DSPy client code is not yet ready for use
         return True
     return False
 
@@ -40,7 +43,12 @@ def ignore(module_name):
 
 ## Factory functions
 
-def init_client(model_name, settings=None):
+def init_client(qualified_name, settings=None):
+    """Initialize a specific LLM client based on the qualified_name.
+    :param qualified_name: str or Tuple[client_name, model_name]
+    """
     _discover_llms()
-    module, llm_name = _llms[model_name]
-    return module.init_client(llm_name, settings)
+    if isinstance(qualified_name, tuple):
+        qualified_name = qualified_llm_name(qualified_name[0], qualified_name[1])
+    module, llm_name = _llms[qualified_name]
+    return module.init_client(llm_name, settings or {})
diff --git a/05-assistive-chatbot/chatbot/utils.py b/05-assistive-chatbot/chatbot/utils.py
index 6461b5f..a36ab73 100644
--- a/05-assistive-chatbot/chatbot/utils.py
+++ b/05-assistive-chatbot/chatbot/utils.py
@@ -50,6 +50,8 @@ def wrapper_timer(*args, **kwargs):
 
 def scan_modules(ns_pkg):
     "Return a dictionary of Python modules found in the given namespace package"
+    if isinstance(ns_pkg, str):
+        ns_pkg = importlib.import_module(ns_pkg)
     # From https://packaging.python.org/en/latest/guides/creating-and-discovering-plugins/#using-namespace-packages
     itr = pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + ".")
     return {name: _import_module_if_possible(name) for _, name, _ in itr}
diff --git a/05-assistive-chatbot/chatbot_api.py b/05-assistive-chatbot/chatbot_api.py
index da9a1ce..691ef30 100755
--- a/05-assistive-chatbot/chatbot_api.py
+++ b/05-assistive-chatbot/chatbot_api.py
@@ -2,8 +2,8 @@
 
 """
 This is a sample API file that demonstrates how to create an API using FastAPI,
-which is compatible with ChainLit. This file is a starting point for creating
-an API that can be deployed with the ChainLit chatbot.
+which is compatible with Chainlit. This file is a starting point for creating
+an API that can be deployed with the Chainlit chatbot.
 """
 
 import logging
@@ -17,7 +17,7 @@
     # If running directly, define the FastAPI app
     app = FastAPI()
 else:
-    # Otherwise use ChainLit's app
+    # Otherwise use Chainlit's app
     from chainlit.server import app
 
 logger = logging.getLogger(f"chatbot.{__name__}")
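With this change, `init_client()` accepts either form of name; for example (model availability depends on your local setup):
```
from chatbot import llms

# 1) a fully qualified name, as registered by _discover_llms()
client = llms.init_client("langchain.ollama :: mistral")

# 2) a (client_name, model_name) tuple, as used by ingest-guru-cards.py below
client = llms.init_client(("langchain.ollama", "mistral"))

response = client.submit("tell me a joke")
```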
+ }, + "totalUsers" : 294, + "topLevelOrganizationId" : "...", + "name" : "Benefits Data Trust", + "id" : "...", + "status" : "ACTIVE", + "dateCreated" : "2017-07-06T13:09:52.980+0000", + "profilePicUrl" : "https://pp.getguru.com/....jpeg" +} ]% +``` + +## Get cards +Use web UI: https://developer.getguru.com/reference/getv1cardsgetextendedfact +or the following, which retrieves all pages: +``` +NEXT_URL='https://api.getguru.com/api/v1/search/cardmgr?queryType=cards&showArchived=false' +CURL_COUNTER=1 +while [ "$NEXT_URL" ] +do + RESP_HEADER=$(curl -sS -D - \ + -u "$COLLECTION_ID:$GURU_API_TOKEN" \ + --request GET -o "guru_cards_for_nava_$CURL_COUNTER.json" \ + --header 'accept: application/json' \ + --url "$NEXT_URL") + NEXT_URL=$(echo $RESP_HEADER | ggrep -oP '^link: <\K.*(?=>)') + let CURL_COUNTER=$CURL_COUNTER+1 +done + +# Merge files each containing a JSON list +jq -n '[inputs] | add' guru_cards_for_nava_?.json guru_cards_for_nava_??.json > guru_cards_for_nava.json + +# Create simplified JSON for readability +python ingest.py guru_cards_for_nava.json + +# Count cards +jq length guru_cards_for_nava.json + +# Zip +zip guru_cards_for_nava.zip guru_cards_for_nava.json guru_cards_for_nava_simplified.json +``` + +Share zip file via Google Drive. + +## Extract all HTML content + +`cat guru_cards_for_nava.json | jq '.[] | .content' > guru_cards.html` + +`jq -r '.[] | .preferredPhrase + "\n tags: " + ( [.tags[]?.value] | join(",") ) +"\n content: " + .content' guru_cards_for_nava--Multi-benefit.json` diff --git a/05-assistive-chatbot/ingest-guru-cards.py b/05-assistive-chatbot/ingest-guru-cards.py new file mode 100755 index 0000000..cf00870 --- /dev/null +++ b/05-assistive-chatbot/ingest-guru-cards.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python +import logging +from dataclasses import dataclass +from functools import cached_property +from typing import Callable + +import dotenv +from langchain_community.embeddings import (HuggingFaceEmbeddings, + SentenceTransformerEmbeddings) +from langchain_community.vectorstores import Chroma + +import chatbot +from chatbot import guru_cards, llms, utils +from chatbot.ingest.text_splitter import TextSplitter + +logger = logging.getLogger(f"chatbot.{__name__}") + + +@dataclass +class EmbeddingsModel: + name: str + token_limit: int + create: Callable + + +_EMBEDDINGS_MODEL_LIST = [ + EmbeddingsModel("all-MiniLM-L6-v2", 256, lambda: SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")), + EmbeddingsModel("HuggingFace::all-MiniLM-L6-v2", 256, lambda: HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")), + EmbeddingsModel( + "BAAI/bge-small-en-v1.5", 512, lambda: SentenceTransformerEmbeddings(model_name="BAAI/bge-small-en-v1.5") + ), + EmbeddingsModel( + "mixedbread-ai/mxbai-embed-large-v1", + 1024, + lambda: SentenceTransformerEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1"), + ), + # EmbeddingsModel("Google::embedding-001", 2048, lambda: GoogleGenerativeAIEmbeddings(model="models/embedding-001")), + # EmbeddingsModel("Google::text-embedding-004", 768, lambda: GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")), +] + +EMBEDDING_MODELS = {model.name: model for model in _EMBEDDINGS_MODEL_LIST} + + +class AppState: + def __init__(self, llm_model, embedding_name): + self.llm_model = llm_model + self.embedding_name = embedding_name + + @cached_property + @utils.timer + def llm(self): + logger.info("Creating LLM") + return llms.init_client(("langchain.ollama", self.llm_model)) + + @cached_property + @utils.timer + def vectordb(self): + 
diff --git a/05-assistive-chatbot/ingest-guru-cards.py b/05-assistive-chatbot/ingest-guru-cards.py
new file mode 100755
index 0000000..cf00870
--- /dev/null
+++ b/05-assistive-chatbot/ingest-guru-cards.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+import logging
+from dataclasses import dataclass
+from functools import cached_property
+from typing import Callable
+
+import dotenv
+from langchain_community.embeddings import (HuggingFaceEmbeddings,
+                                            SentenceTransformerEmbeddings)
+from langchain_community.vectorstores import Chroma
+
+import chatbot
+from chatbot import guru_cards, llms, utils
+from chatbot.ingest.text_splitter import TextSplitter
+
+logger = logging.getLogger(f"chatbot.{__name__}")
+
+
+@dataclass
+class EmbeddingsModel:
+    name: str
+    token_limit: int
+    create: Callable
+
+
+_EMBEDDINGS_MODEL_LIST = [
+    EmbeddingsModel("all-MiniLM-L6-v2", 256, lambda: SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")),
+    EmbeddingsModel("HuggingFace::all-MiniLM-L6-v2", 256, lambda: HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")),
+    EmbeddingsModel(
+        "BAAI/bge-small-en-v1.5", 512, lambda: SentenceTransformerEmbeddings(model_name="BAAI/bge-small-en-v1.5")
+    ),
+    EmbeddingsModel(
+        "mixedbread-ai/mxbai-embed-large-v1",
+        1024,
+        lambda: SentenceTransformerEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1"),
+    ),
+    # EmbeddingsModel("Google::embedding-001", 2048, lambda: GoogleGenerativeAIEmbeddings(model="models/embedding-001")),
+    # EmbeddingsModel("Google::text-embedding-004", 768, lambda: GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")),
+]
+
+EMBEDDING_MODELS = {model.name: model for model in _EMBEDDINGS_MODEL_LIST}
+
+
+class AppState:
+    def __init__(self, llm_model, embedding_name):
+        self.llm_model = llm_model
+        self.embedding_name = embedding_name
+
+    @cached_property
+    @utils.timer
+    def llm(self):
+        logger.info("Creating LLM")
+        return llms.init_client(("langchain.ollama", self.llm_model))
+
+    @cached_property
+    @utils.timer
+    def vectordb(self):
+        logger.info("Creating Vector DB")
+        embeddings_model = EMBEDDING_MODELS[self.embedding_name].create()
+        logger.info("Embeddings model created: %s", embeddings_model)
+        return Chroma(
+            embedding_function=embeddings_model,
+            # Must use collection_name="langchain" -- https://github.com/langchain-ai/langchain/issues/10864#issuecomment-1730303411
+            collection_name="langchain",
+            persist_directory="./chroma_db",
+        )
+
+
+if __name__ == "__main__":
+    dotenv.load_dotenv()
+    chatbot.configure_logging()
+
+    app_state = AppState("mistral", "all-MiniLM-L6-v2")
+
+    text_splitter = TextSplitter(
+        llm_client=app_state.llm.client,
+        token_limit=EMBEDDING_MODELS[app_state.embedding_name].token_limit,
+        text_splitter_name="RecursiveCharacterTextSplitter",
+        # Use smaller chunks for shorter-length quotes
+        chunk_size=250,
+        chunk_overlap=100,
+    )
+
+    guru_question_answers = guru_cards.GuruCardsProcessor().extract_qa_text_from_guru()
+    # Chunk the JSON data and load it into the vector DB
+    for question, answer in guru_question_answers.items():
+        logger.info("Processing document: %s", question)
+        chunks = text_splitter.split_into_chunks(question, answer)
+        app_state.vectordb.add_documents(documents=chunks)
diff --git a/05-assistive-chatbot/requirements.in b/05-assistive-chatbot/requirements.in
index 7d1d1d0..38f810c 100644
--- a/05-assistive-chatbot/requirements.in
+++ b/05-assistive-chatbot/requirements.in
@@ -9,12 +9,13 @@ chainlit==1.1
 # langchain
 langchain_community
 langchain-google-genai
-# langchain-text-splitters
-# sentence-transformers
 
 ## Vector DB
-# chromadb
-# beautifulsoup4
+langchain-text-splitters
+sentence-transformers
+chromadb
+beautifulsoup4
+types-beautifulsoup4
 
 ## OpenAI LLM
 openai
diff --git a/05-assistive-chatbot/requirements.txt b/05-assistive-chatbot/requirements.txt
index 5d670d4..dd2d6d9 100644
--- a/05-assistive-chatbot/requirements.txt
+++ b/05-assistive-chatbot/requirements.txt
@@ -22,6 +22,8 @@ anyio==3.7.1
     #   openai
     #   starlette
     #   watchfiles
+asgiref==3.8.1
+    # via opentelemetry-instrumentation-asgi
 async-timeout==4.0.3
     # via
     #   aiohttp
@@ -30,14 +32,23 @@ asyncer==0.0.2
     # via chainlit
 attrs==23.2.0
     # via aiohttp
+backoff==2.2.1
+    # via posthog
+bcrypt==4.1.3
+    # via chromadb
+beautifulsoup4==4.12.3
+    # via -r requirements.in
 bidict==0.23.1
     # via python-socketio
+build==1.2.1
+    # via chromadb
 cachetools==5.3.3
     # via google-auth
 certifi==2024.2.2
     # via
     #   httpcore
     #   httpx
+    #   kubernetes
     #   requests
 chainlit==1.1.0
     # via -r requirements.in
@@ -45,10 +56,17 @@ charset-normalizer==3.3.2
     # via requests
 chevron==0.14.0
     # via literalai
+chroma-hnswlib==0.7.3
+    # via chromadb
+chromadb==0.5.0
+    # via -r requirements.in
 click==8.1.7
     # via
     #   chainlit
+    #   typer
     #   uvicorn
+coloredlogs==15.0.1
+    # via onnxruntime
 dataclasses-json==0.5.14
     # via
     #   chainlit
@@ -67,15 +85,27 @@ exceptiongroup==1.2.1
 fastapi==0.110.3
     # via
     #   chainlit
+    #   chromadb
     #   fastapi-socketio
 fastapi-socketio==0.0.10
     # via chainlit
+filelock==3.14.0
+    # via
+    #   huggingface-hub
+    #   torch
+    #   transformers
 filetype==1.2.0
     # via chainlit
+flatbuffers==24.3.25
+    # via onnxruntime
 frozenlist==1.4.1
     # via
     #   aiohttp
     #   aiosignal
+fsspec==2024.5.0
+    # via
+    #   huggingface-hub
+    #   torch
 google-ai-generativelanguage==0.6.4
     # via google-generativeai
 google-api-core[grpc]==2.19.0
@@ -92,6 +122,7 @@ google-auth==2.29.0
     #   google-api-python-client
     #   google-auth-httplib2
     #   google-generativeai
+    #   kubernetes
 google-auth-httplib2==0.2.0
     # via google-api-python-client
 google-generativeai==0.5.4
@@ -106,6 +137,7 @@ groq==0.8.0
     # via -r requirements.in
 grpcio==1.62.1
     # via
+    #   chromadb
     #   google-api-core
     #   grpcio-status
     #   opentelemetry-exporter-otlp-proto-grpc
@@ -122,12 +154,21 @@ httplib2==0.22.0
     # via
     #   google-api-python-client
     #   google-auth-httplib2
+httptools==0.6.1
+    # via uvicorn
 httpx==0.27.0
     # via
     #   chainlit
     #   groq
     #   literalai
     #   openai
+huggingface-hub==0.23.1
+    # via
+    #   sentence-transformers
+    #   tokenizers
+    #   transformers
+humanfriendly==10.0
+    # via coloredlogs
 idna==3.6
     # via
     #   anyio
@@ -136,10 +177,18 @@ idna==3.6
     #   yarl
 importlib-metadata==6.11.0
     # via opentelemetry-api
+importlib-resources==6.4.0
+    # via chromadb
+jinja2==3.1.4
+    # via torch
+joblib==1.4.2
+    # via scikit-learn
 jsonpatch==1.33
     # via langchain-core
 jsonpointer==2.4
     # via jsonpatch
+kubernetes==29.0.0
+    # via chromadb
 langchain==0.2.1
     # via langchain-community
 langchain-community==0.2.1
@@ -153,7 +202,9 @@ langchain-core==0.2.1
 langchain-google-genai==1.0.5
     # via -r requirements.in
 langchain-text-splitters==0.2.0
-    # via langchain
+    # via
+    #   -r requirements.in
+    #   langchain
 langsmith==0.1.63
     # via
     #   langchain
@@ -163,8 +214,20 @@ lazify==0.4.0
     # via chainlit
 literalai==0.0.601
     # via chainlit
+markdown-it-py==3.0.0
+    # via rich
+markupsafe==2.1.5
+    # via jinja2
 marshmallow==3.21.1
     # via dataclasses-json
+mdurl==0.1.2
+    # via markdown-it-py
+mmh3==4.1.0
+    # via chromadb
+monotonic==1.6
+    # via posthog
+mpmath==1.3.0
+    # via sympy
 multidict==6.0.5
     # via
     #   aiohttp
@@ -173,17 +236,35 @@ mypy-extensions==1.0.0
     # via typing-inspect
 nest-asyncio==1.6.0
     # via chainlit
+networkx==3.3
+    # via torch
 numpy==1.26.4
     # via
+    #   chroma-hnswlib
+    #   chromadb
     #   langchain
     #   langchain-community
+    #   onnxruntime
+    #   scikit-learn
+    #   scipy
+    #   sentence-transformers
+    #   transformers
+oauthlib==3.2.2
+    # via
+    #   kubernetes
+    #   requests-oauthlib
+onnxruntime==1.18.0
+    # via chromadb
 openai==1.30.2
     # via -r requirements.in
 opentelemetry-api==1.24.0
     # via
+    #   chromadb
     #   opentelemetry-exporter-otlp-proto-grpc
     #   opentelemetry-exporter-otlp-proto-http
     #   opentelemetry-instrumentation
+    #   opentelemetry-instrumentation-asgi
+    #   opentelemetry-instrumentation-fastapi
     #   opentelemetry-sdk
     #   uptrace
 opentelemetry-exporter-otlp==1.24.0
@@ -193,11 +274,20 @@ opentelemetry-exporter-otlp-proto-common==1.24.0
     #   opentelemetry-exporter-otlp-proto-grpc
     #   opentelemetry-exporter-otlp-proto-http
 opentelemetry-exporter-otlp-proto-grpc==1.24.0
-    # via opentelemetry-exporter-otlp
+    # via
+    #   chromadb
+    #   opentelemetry-exporter-otlp
 opentelemetry-exporter-otlp-proto-http==1.24.0
     # via opentelemetry-exporter-otlp
 opentelemetry-instrumentation==0.45b0
-    # via uptrace
+    # via
+    #   opentelemetry-instrumentation-asgi
+    #   opentelemetry-instrumentation-fastapi
+    #   uptrace
+opentelemetry-instrumentation-asgi==0.45b0
+    # via opentelemetry-instrumentation-fastapi
+opentelemetry-instrumentation-fastapi==0.45b0
+    # via chromadb
 opentelemetry-proto==1.24.0
     # via
     #   opentelemetry-exporter-otlp-proto-common
@@ -205,19 +295,39 @@ opentelemetry-proto==1.24.0
     #   opentelemetry-exporter-otlp-proto-http
 opentelemetry-sdk==1.24.0
     # via
+    #   chromadb
     #   opentelemetry-exporter-otlp-proto-grpc
     #   opentelemetry-exporter-otlp-proto-http
     #   uptrace
 opentelemetry-semantic-conventions==0.45b0
-    # via opentelemetry-sdk
+    # via
+    #   opentelemetry-instrumentation-asgi
+    #   opentelemetry-instrumentation-fastapi
+    #   opentelemetry-sdk
+opentelemetry-util-http==0.45b0
+    # via
+    #   opentelemetry-instrumentation-asgi
+    #   opentelemetry-instrumentation-fastapi
 orjson==3.10.3
-    # via langsmith
+    # via
+    #   chromadb
+    #   langsmith
+overrides==7.7.0
+    # via chromadb
 packaging==23.2
     # via
+    #   build
     #   chainlit
+    #   huggingface-hub
     #   langchain-core
     #   literalai
     #   marshmallow
+    #   onnxruntime
+    #   transformers
+pillow==10.3.0
+    # via sentence-transformers
+posthog==3.5.0
+    # via chromadb
 proto-plus==1.23.0
     # via
     #   google-ai-generativelanguage
@@ -229,6 +339,7 @@ protobuf==4.25.3
     #   google-generativeai
     #   googleapis-common-protos
     #   grpcio-status
+    #   onnxruntime
     #   opentelemetry-proto
     #   proto-plus
 pyasn1==0.6.0
@@ -240,6 +351,7 @@ pyasn1-modules==0.4.0
 pydantic==2.5.0
     # via
     #   chainlit
+    #   chromadb
     #   fastapi
     #   google-generativeai
     #   groq
@@ -250,12 +362,24 @@ pydantic==2.5.0
     #   openai
 pydantic-core==2.14.1
     # via pydantic
+pygments==2.18.0
+    # via rich
 pyjwt==2.8.0
     # via chainlit
 pyparsing==3.1.2
     # via httplib2
+pypika==0.48.9
+    # via chromadb
+pyproject-hooks==1.1.0
+    # via build
+python-dateutil==2.9.0.post0
+    # via
+    #   kubernetes
+    #   posthog
 python-dotenv==1.0.1
-    # via chainlit
+    # via
+    #   chainlit
+    #   uvicorn
 python-engineio==4.9.0
     # via python-socketio
 python-multipart==0.0.9
@@ -264,26 +388,62 @@ python-socketio==5.11.2
     # via fastapi-socketio
 pyyaml==6.0.1
     # via
+    #   chromadb
+    #   huggingface-hub
+    #   kubernetes
     #   langchain
     #   langchain-community
     #   langchain-core
+    #   transformers
+    #   uvicorn
+regex==2024.5.15
+    # via transformers
 requests==2.31.0
     # via
+    #   chromadb
     #   google-api-core
+    #   huggingface-hub
+    #   kubernetes
     #   langchain
     #   langchain-community
     #   langsmith
     #   opentelemetry-exporter-otlp-proto-http
+    #   posthog
+    #   requests-oauthlib
+    #   transformers
+requests-oauthlib==2.0.0
+    # via kubernetes
+rich==13.7.1
+    # via typer
 rsa==4.9
     # via google-auth
+safetensors==0.4.3
+    # via transformers
+scikit-learn==1.5.0
+    # via sentence-transformers
+scipy==1.13.1
+    # via
+    #   scikit-learn
+    #   sentence-transformers
+sentence-transformers==2.7.0
+    # via -r requirements.in
+shellingham==1.5.4
+    # via typer
 simple-websocket==1.0.0
     # via python-engineio
+six==1.16.0
+    # via
+    #   kubernetes
+    #   posthog
+    #   python-dateutil
 sniffio==1.3.1
     # via
     #   anyio
     #   groq
     #   httpx
     #   openai
+soupsieve==2.5
+    # via beautifulsoup4
 sqlalchemy==2.0.30
     # via
     #   langchain
@@ -292,29 +452,61 @@ starlette==0.37.2
     # via
     #   chainlit
     #   fastapi
+sympy==1.12
+    # via
+    #   onnxruntime
+    #   torch
 syncer==2.0.3
     # via chainlit
 tenacity==8.3.0
     # via
+    #   chromadb
     #   langchain
     #   langchain-community
     #   langchain-core
+threadpoolctl==3.5.0
+    # via scikit-learn
+tokenizers==0.19.1
+    # via
+    #   chromadb
+    #   transformers
 tomli==2.0.1
-    # via chainlit
+    # via
+    #   build
+    #   chainlit
+torch==2.3.0
+    # via sentence-transformers
 tqdm==4.66.4
     # via
+    #   chromadb
     #   google-generativeai
+    #   huggingface-hub
     #   openai
+    #   sentence-transformers
+    #   transformers
+transformers==4.41.1
+    # via sentence-transformers
+typer==0.12.3
+    # via chromadb
+types-beautifulsoup4==4.12.0.20240511
+    # via -r requirements.in
+types-html5lib==1.1.11.20240228
+    # via types-beautifulsoup4
 typing-extensions==4.10.0
     # via
+    #   asgiref
+    #   chromadb
     #   fastapi
     #   google-generativeai
     #   groq
+    #   huggingface-hub
     #   openai
     #   opentelemetry-sdk
     #   pydantic
     #   pydantic-core
     #   sqlalchemy
+    #   torch
+    #   typer
     #   typing-inspect
     #   uvicorn
 typing-inspect==0.9.0
@@ -324,11 +516,23 @@ uptrace==1.22.0
 uritemplate==4.1.1
     # via google-api-python-client
 urllib3==2.2.1
-    # via requests
-uvicorn==0.25.0
-    # via chainlit
+    # via
+    #   kubernetes
+    #   requests
+uvicorn[standard]==0.25.0
+    # via
+    #   chainlit
+    #   chromadb
+uvloop==0.19.0
+    # via uvicorn
 watchfiles==0.20.0
-    # via chainlit
+    # via
+    #   chainlit
+    #   uvicorn
+websocket-client==1.8.0
+    # via kubernetes
+websockets==12.0
+    # via uvicorn
 wrapt==1.16.0
     # via
     #   deprecated