diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..9e2aa624 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +# Python Exclusions +.venv +**__pycache__** + +# Helm Exclusions +**/charts/*.tgz + +# project temp files +deploy/*.log +deploy/*.txt + +# Docker Compose exclusions +volumes/ +uploaded_files/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index fbabf83e..af30477c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,25 +3,52 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.3.0] - 2024-01-22 + +### Added + +- [New dedicated example](./docs/rag/aiplayground.md) showcasing Nvidia AI Playground based models using Langchain connectors. +- [New example](./RetrievalAugmentedGeneration/README.md#5-qa-chatbot-with-task-decomposition-example----a100h100l40s) demonstrating query decomposition. +- Support for using [PG Vector as a vector database in the developer rag canonical example.](./RetrievalAugmentedGeneration/README.md#deploying-with-pgvector-vector-store) +- Support for using Speech-in Speech-out interface in the sample frontend leveraging RIVA Skills. +- New tool showcasing [RAG observability support.](./tools/observability/) +- Support for on-prem deployment of [TRTLLM based nemotron models.](./RetrievalAugmentedGeneration/README.md#6-qa-chatbot----nemotron-model) + +### Changed + +- Upgraded Langchain and llamaindex dependencies for all container. +- Restructured [README](./README.md) files for better intuitiveness. +- Added provision to plug in multiple examples using [a common base class](./RetrievalAugmentedGeneration/common/base.py). +- Changed `minio` service's port to `9010`from `9000` in docker based deployment. +- Moved `evaluation` directory from top level to under `tools` and created a [dedicated compose file](./deploy/compose/docker-compose-evaluation.yaml). +- Added an [experimental directory](./experimental/) for plugging in experimental features. +- Modified notebooks to use TRTLLM and Nvidia AI foundation based connectors from langchain. +- Changed `ai-playground` model engine name to `nv-ai-foundation` in configurations. + +### Fixed + +- [Fixed issue #19](https://github.com/NVIDIA/GenerativeAIExamples/issues/19) + ## [0.2.0] - 2023-12-15 ### Added -- Support for using [Nvidia AI Foundational LLM models](./docs/rag/aiplayground.md#using-nvdia-cloud-based-llms) -- Support for using [Nvidia AI Foundational embedding models](./docs/rag/aiplayground.md#using-nvidia-cloud-based-embedding-models) +- Support for using [Nvidia AI Playground based LLM models](./docs/rag/aiplayground.md) +- Support for using [Nvidia AI Playground based embedding models](./docs/rag/aiplayground.md) - Support for [deploying and using quantized LLM models](./docs/rag/llm_inference_server.md#quantized-llama2-model-deployment) -- Support for [evaluating RAG pipeline](./evaluation/README.md) +- Support for Kubernetes deployment support using helm charts +- Support for [evaluating RAG pipeline](./tools/evaluation/README.md) ### Changed - Repository restructing to allow better open source contributions - [Upgraded dependencies](./RetrievalAugmentedGeneration/Dockerfile) for chain server container -- [Upgraded NeMo Inference Framework container version](./RetrievalAugmentedGeneration/llm-inference-server/Dockerfile), no seperate sign up needed now for access. 
+- [Upgraded NeMo Inference Framework container version](./RetrievalAugmentedGeneration/llm-inference-server/Dockerfile), no seperate sign up needed for access. - Main [README](./README.md) now provides more details. - Documentation improvements. -- Better error handling and reporting mechanism for corner cases. -- Renamed `triton-inference-server` container and service to `llm-inference-server` +- Better error handling and reporting mechanism for corner cases +- Renamed `triton-inference-server` container to `llm-inference-server` ### Fixed diff --git a/README.md b/README.md index 06c977c0..f9056109 100644 --- a/README.md +++ b/README.md @@ -8,40 +8,67 @@ Generative AI Examples uses resources from the [NVIDIA NGC AI Development Catalo Sign up for a [free NGC developer account](https://ngc.nvidia.com/signin) to access: -- The GPU-optimized NVIDIA containers, models, scripts, and tools used in these examples -- The latest NVIDIA upstream contributions to the respective programming frameworks -- The latest NVIDIA Deep Learning and LLM software libraries -- Release notes for each of the NVIDIA optimized containers -- Links to developer documentation +- GPU-optimized containers used in these examples +- Release notes and developer documentation ## Retrieval Augmented Generation (RAG) -A RAG pipeline embeds multimodal data -- such as documents, images, and video -- into a database connected to a Large Language Model. RAG lets users use an LLM to chat with their own data. +A RAG pipeline embeds multimodal data -- such as documents, images, and video -- into a database connected to a LLM. RAG lets users chat with their data! -| Name | Description | LLM | Framework | Multi-GPU | Multi-node | Embedding | TRT-LLM | Triton | VectorDB | K8s | -|---------------|-----------------------|------------|-------------------------|-----------|------------|-------------|---------|--------|----------|-----| -| [Linux developer RAG](https://github.com/NVIDIA/GenerativeAIExamples/tree/main/RetrievalAugmentedGeneration) | Single VM, single GPU | llama2-13b | Langchain + Llama Index | No | No | e5-large-v2 | Yes | Yes | Milvus | No | -| [Windows developer RAG](https://github.com/NVIDIA/trt-llm-rag-windows) | RAG on Windows | llama2-13b | Llama Index | No | No | NA | Yes | No | FAISS | NA | -| [Developer LLM Operator for Kubernetes](./docs/developer-llm-operator/) | Single node, single GPU | llama2-13b | Langchain + Llama Index | No | No | e5-large-v2 | Yes | Yes | Milvus | Yes | +### Developer RAG Examples +The developer RAG examples run on a single VM. They demonstrate how to combine NVIDIA GPU acceleration with popular LLM programming frameworks using NVIDIA's [open source connectors](#open-source-integrations). The examples are easy to deploy via [Docker Compose](https://docs.docker.com/compose/). -## Large Language Models -NVIDIA LLMs are optimized for building enterprise generative AI applications. +Examples support local and remote inference endpoints. If you have a GPU, you can inference locally via [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). If you don't have a GPU, you can inference and embed remotely via [NVIDIA AI Foundations endpoints](https://www.nvidia.com/en-us/ai-data-science/foundation-models/). 
-| Name | Description | Type | Context Length | Example | License | -|---------------|-----------------------|------------|----------------|---------|---------| -| [nemotron-3-8b-qa-4k](https://huggingface.co/nvidia/nemotron-3-8b-qa-4k) | Q&A LLM customized on knowledge bases | Text Generation | 4096 | No | [NVIDIA AI Foundation Models Community License Agreement](https://developer.nvidia.com/downloads/nv-ai-foundation-models-license) | -| [nemotron-3-8b-chat-4k-steerlm](https://huggingface.co/nvidia/nemotron-3-8b-chat-4k-steerlm) | Best out-of-the-box chat model with flexible alignment at inference | Text Generation | 4096 | No | [NVIDIA AI Foundation Models Community License Agreement](https://developer.nvidia.com/downloads/nv-ai-foundation-models-license) | -| [nemotron-3-8b-chat-4k-rlhf](https://huggingface.co/nvidia/nemotron-3-8b-chat-4k-rlhf) | Best out-of-the-box chat model performance| Text Generation | 4096 | No | [NVIDIA AI Foundation Models Community License Agreement](https://developer.nvidia.com/downloads/nv-ai-foundation-models-license) | +| Model | Embedding | Framework | Description | Multi-GPU | TRT-LLM | NVIDIA AI Foundation | Triton | Vector Database | +|---------------|-----------------------|------------|-------------------------|-----------|------------|-------------|---------|--------| +| llama-2 | e5-large-v2 | Llamaindex | Canonical QA Chatbot | [YES](RetrievalAugmentedGeneration/README.md#3-qa-chatbot-multi-gpu----a100h100l40s) | [YES](RetrievalAugmentedGeneration/README.md#2-qa-chatbot----a100h100l40s-gpu) | No | YES | Milvus/[PGVector]((RetrievalAugmentedGeneration/README.md#2-qa-chatbot----a100h100l40s-gpu))| +| mixtral_8x7b | nvolveqa_40k | Langchain | [Nvidia AI foundation based QA Chatbot](RetrievalAugmentedGeneration/README.md#1-qa-chatbot----nvidia-ai-foundation-inference-endpoint) | No | No | YES | YES | FAISS| +| llama-2 | all-MiniLM-L6-v2 | Llama Index | [QA Chatbot, GeForce, Windows](https://github.com/NVIDIA/trt-llm-rag-windows/tree/release/1.0) | NO | YES | NO | NO | FAISS | +| llama-2 | nvolveqa_40k | Langchain | [QA Chatbot, Task Decomposition Agent](./RetrievalAugmentedGeneration/README.md#5-qa-chatbot-with-task-decomposition-example----a100h100l40s) | No | No | YES | YES | FAISS +| mixtral_8x7b | nvolveqa_40k | Langchain | [Minimilastic example showcasing RAG using Nvidia AI foundation models](./examples/README.md#rag-in-5-minutes-example) | No | No | YES | YES | FAISS| -## Integration Examples + +### Enterprise RAG Examples + +The enterprise RAG examples run as microservies distributed across multiple VMs and GPUs. They show how RAG pipelines can be orchestrated with [Kubernetes](https://kubernetes.io/) and deployed with [Helm](https://helm.sh/). + +Enterprise RAG examples include a [Kubernetes operator](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/) for LLM lifecycle management. It is compatible with the [NVIDIA GPU operator](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/gpu-operator) that automates GPU discovery and lifecycle management in a Kubernetes cluster. + +Enterprise RAG examples also support local and remote inference via [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) and [NVIDIA AI Foundations endpoints](https://www.nvidia.com/en-us/ai-data-science/foundation-models/). 
+ +| Model | Embedding | Framework | Description | Multi-GPU | Multi-node | TRT-LLM | NVIDIA AI Foundation | Triton | Vector Database | +|---------------|-----------------------|------------|--------|-------------------------|-----------|------------|-------------|---------|--------| +| llama-2 | NV-Embed-QA-003 | Llamaindex | QA Chatbot, Helm, k8s | NO | NO | [YES](./docs/developer-llm-operator/) | NO | YES | Milvus| + +## Tools + +Example tools and tutorials to enhance LLM development and productivity when using NVIDIA RAG pipelines. + +| Name | Description | Deployment | Tutorial | +|------|-------------|------|--------| +| Evaluation | Example open source RAG eval tool that uses synthetic data generation and LLM-as-a-judge | [Docker compose file](./deploy/compose/docker-compose-evaluation.yaml) | [README](./docs/rag/evaluation.md) |] +| Observability | Observability serves as an efficient mechanism for both monitoring and debugging RAG pipelines. | [Docker compose file](./deploy/compose/docker-compose-observability.yaml) | [README](./docs/rag/observability.md) |] + +## Open Source Integrations + +These are open source connectors for NVIDIA-hosted and self-hosted API endpoints. These open source connectors are maintained and tested by NVIDIA engineers. + +| Name | Framework | Chat | Text Embedding | Python | Description | +|------|-----------|------|-----------|--------|-------------| +|[NVIDIA AI Foundation Endpoints](https://python.langchain.com/docs/integrations/providers/nvidia) | [Langchain](https://www.langchain.com/) |[YES](https://python.langchain.com/docs/integrations/chat/nvidia_ai_endpoints)|[YES](https://python.langchain.com/docs/integrations/text_embedding/nvidia_ai_endpoints)|[YES](https://pypi.org/project/langchain-nvidia-ai-endpoints/)|Easy access to NVIDIA hosted models. Supports chat, embedding, code generation, steerLM, multimodal, and RAG.| +|[NVIDIA Triton + TensorRT-LLM](https://github.com/langchain-ai/langchain/tree/master/libs/partners/nvidia-trt) | [Langchain](https://www.langchain.com/) |[YES](https://github.com/langchain-ai/langchain/blob/master/libs/partners/nvidia-trt/docs/llms.ipynb)|[YES](https://github.com/langchain-ai/langchain/blob/master/libs/partners/nvidia-trt/docs/llms.ipynb)|[YES](https://pypi.org/project/langchain-nvidia-trt/)|This connector allows Langchain to remotely interact with a Triton inference server over GRPC or HTTP tfor optimized LLM inference.| +|[NVIDIA Triton Inference Server](https://docs.llamaindex.ai/en/stable/examples/llm/nvidia_triton.html) | [LlamaIndex](https://www.llamaindex.ai/) |YES|YES|NO|Triton inference server provides API access to hosted LLM models over gRPC. | +|[NVIDIA TensorRT-LLM](https://docs.llamaindex.ai/en/stable/examples/llm/nvidia_tensorrt.html) | [LlamaIndex](https://www.llamaindex.ai/) |YES|YES|NO|TensorRT-LLM provides a Python API to build TensorRT engines with state-of-the-art optimizations for LLM inference on NVIDIA GPUs. | + ## NVIDIA support -In each of the READMEs, we indicate the level of support provided. +In each example README we indicate the level of support provided. ## Feedback / Contributions -We're posting these examples on GitHub to better support the community, facilitate feedback, as well as collect and implement contributions using GitHub Issues and pull requests. We welcome all contributions! +We're posting these examples on GitHub to support the NVIDIA LLM community, facilitate feedback. We invite contributions via GitHub Issues or pull requests! 
## Known issues - In each of the READMEs, we indicate any known issues and encourage the community to provide feedback. diff --git a/RetrievalAugmentedGeneration/.gitattributes b/RetrievalAugmentedGeneration/.gitattributes deleted file mode 100644 index c8a8d73b..00000000 --- a/RetrievalAugmentedGeneration/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -notebooks/dataset.zip filter=lfs diff=lfs merge=lfs -text diff --git a/RetrievalAugmentedGeneration/.gitignore b/RetrievalAugmentedGeneration/.gitignore deleted file mode 100644 index baec5514..00000000 --- a/RetrievalAugmentedGeneration/.gitignore +++ /dev/null @@ -1,25 +0,0 @@ -# Python Exclusions -.venv -__pycache__ - -# Sphinx Exclusions -_build - -# Helm Exclusions -**/charts/*.tgz - -# project temp files -deploy/*.log -deploy/*.txt -**/my.* -**/my-* - -# Next JS Exclusions -**/.next -frontend/frontend_js/out -frontend-sdxl/frontend_js/out -**/node_modules - -# Docker Compose exclusions -volumes/ -uploaded_files/ diff --git a/RetrievalAugmentedGeneration/Dockerfile b/RetrievalAugmentedGeneration/Dockerfile index 25e879cd..20578559 100644 --- a/RetrievalAugmentedGeneration/Dockerfile +++ b/RetrievalAugmentedGeneration/Dockerfile @@ -1,14 +1,22 @@ ARG BASE_IMAGE_URL=nvcr.io/nvidia/pytorch ARG BASE_IMAGE_TAG=23.08-py3 - FROM ${BASE_IMAGE_URL}:${BASE_IMAGE_TAG} + +ARG EXAMPLE_NAME COPY RetrievalAugmentedGeneration/__init__.py /opt/RetrievalAugmentedGeneration/ COPY RetrievalAugmentedGeneration/common /opt/RetrievalAugmentedGeneration/common -COPY RetrievalAugmentedGeneration/examples /opt/RetrievalAugmentedGeneration/examples +COPY RetrievalAugmentedGeneration/examples/${EXAMPLE_NAME} /opt/RetrievalAugmentedGeneration/example COPY integrations /opt/integrations +COPY tools /opt/tools +RUN apt-get update && apt-get install -y libpq-dev RUN --mount=type=bind,source=RetrievalAugmentedGeneration/requirements.txt,target=/opt/requirements.txt \ python3 -m pip install --no-cache-dir -r /opt/requirements.txt +RUN if [ -f "/opt/RetrievalAugmentedGeneration/example/requirements.txt" ] ; then \ + python3 -m pip install --no-cache-dir -r /opt/RetrievalAugmentedGeneration/example/requirements.txt ; else \ + echo "Skipping example dependency installation, since requirements.txt was not found" ; \ + fi + WORKDIR /opt ENTRYPOINT ["uvicorn", "RetrievalAugmentedGeneration.common.server:app"] diff --git a/RetrievalAugmentedGeneration/README.md b/RetrievalAugmentedGeneration/README.md index b47d967d..614d19fc 100644 --- a/RetrievalAugmentedGeneration/README.md +++ b/RetrievalAugmentedGeneration/README.md @@ -1,205 +1,694 @@ # Retrieval Augmented Generation -## Project Details -**Project Goal**: A reference Retrieval Augmented Generation(RAG) workflow for a chatbot to question answer off public press releases & tech blogs. It performs document ingestion & Q&A interface using open source models deployed on any cloud or customer datacenter, leverages the power of GPU-accelerated Milvus for efficient vector storage and retrieval, along with TRT-LLM, to achieve lightning-fast inference speeds with custom LangChain LLM wrapper. +Retrieval Augmented Generation (RAG) generates up-to-date and domain-specific answers by connecting a Large Language Model (LLM) to your enterprise data. + +## Developer RAG Examples + +1. [QA Chatbot -- No-GPU using NVIDIA AI Foundation](#1-qa-chatbot----nvidia-ai-foundation-inference-endpoint) +2. [QA Chatbot -- A100/H100/L40S](#2-qa-chatbot----a100h100l40s-gpu) +3. [QA Chatbot -- Multi-GPU](#3-qa-chatbot-multi-gpu----a100h100l40s) +4. 
[QA Chatbot -- Quantized LLM model](#4-qa-chatbot-with-quantized-llm-model----a100h100l40s) +5. [QA Chatbot -- Task Decomposition](#5-qa-chatbot-with-task-decomposition-example----a100h100l40s) +6. [QA Chatbot -- NemoTron Model](#6-qa-chatbot----nemotron-model) + +
### 1: QA Chatbot -- NVIDIA AI Foundation inference endpoint

This example deploys a developer RAG pipeline for chat QA and serves inference via the NVIDIA AI Foundation endpoint.

Developers get free credits for 10K requests to any of the available models.
| Model | Embedding | Framework | Description | Multi-GPU | TRT-LLM | NVIDIA AI Foundation | Triton | Vector Database |
|-------|-----------|-----------|-------------|-----------|---------|----------------------|--------|-----------------|
| mixtral_8x7b | nvolveqa_40k | Langchain | QA chatbot | NO | NO | YES | NO | FAISS |
#### 1.1 Prepare the environment

This example uses the NVIDIA AI Foundation inference endpoint.

1. Follow steps 1 - 5 in the ["Prepare the environment" section of example 02](#21-prepare-the-environment).

#### 1.2 Deploy

Follow [these instructions](../docs/rag/aiplayground.md) to sign up for an NVIDIA AI Foundation developer account and deploy this example.
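For a quick smoke test of the remote endpoint this example relies on, the following is a minimal sketch using the `langchain-nvidia-ai-endpoints` connector listed under Open Source Integrations; the API-key environment variable and prompt text are illustrative assumptions, not part of this repository.

```python
# Minimal sketch, assuming `pip install langchain-nvidia-ai-endpoints` and that your
# NVIDIA AI Foundation API key is exported (commonly as NVIDIA_API_KEY).
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

llm = ChatNVIDIA(model="mixtral_8x7b")             # chat model from the table above
embedder = NVIDIAEmbeddings(model="nvolveqa_40k")  # embedding model from the table above

# Plain chat completion against the hosted endpoint (no knowledge base involved).
print(llm.invoke("What is Retrieval Augmented Generation?").content)

# Embed a text snippet the same way the pipeline would before indexing it into FAISS.
vector = embedder.embed_query("NVIDIA AI Foundation endpoints example")
print(f"embedding dimension: {len(vector)}")
```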
### 2: QA Chatbot -- A100/H100/L40S GPU

This example deploys a developer RAG pipeline for chat QA and serves inference via the NeMo Framework inference container.

> ⚠️ **NOTE**: This example requires an A100, H100, or L40S GPU. Refer to the [support matrix](../docs/rag/support_matrix.md) to understand the memory requirements for the model you are deploying.
| Model | Embedding | Framework | Description | Multi-GPU | TRT-LLM | NVIDIA AI Foundation | Triton | Vector Database |
|-------|-----------|-----------|-------------|-----------|---------|----------------------|--------|-----------------|
| llama-2 | e5-large-v2 | Llamaindex | QA chatbot | NO | YES | NO | YES | Milvus |
| llama-2 | e5-large-v2 | Llamaindex | QA chatbot | NO | YES | NO | YES | pgvector |
+ + +#### 2.1 Prepare the environment + +1. Install [Docker Engine and Docker Compose.](https://docs.docker.com/engine/install/ubuntu/) + +2. Verify NVIDIA GPU driver version 535 or later is installed. + + **Note**: This step is not required for Nvidia AI foundation workflow + +``` $ nvidia-smi --query-gpu=driver_version --format=csv,noheader +535.129.03 + +$ nvidia-smi -q -d compute + +==============NVSMI LOG============== + +Timestamp : Sun Nov 26 21:17:25 2023 +Driver Version : 535.129.03 +CUDA Version : 12.2 + +Attached GPUs : 1 +GPU 00000000:CA:00.0 + Compute Mode : Default +``` +Reference: [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) and [NVIDIA Linux driver installation instructions](https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html) + +3. Clone the Generative AI examples Git repository. + +> ⚠️ **NOTE**: This example requires Git Large File Support (LFS) + +``` +sudo apt -y install git-lfs +git clone git@github.com:NVIDIA/GenerativeAIExamples.git +cd GenerativeAIExamples/ +git lfs pull +``` -## Components -- **LLM**: [Llama2](https://ai.meta.com/llama/) - 7b-chat, 13b-chat, and 70b-chat all supported. 13b-chat and 70b-chat generate good responses. -- **LLM Backend**: Nemo framework inference container with Triton inference server & TRT-LLM backend for speed. -- **Vector DB**: Milvus because it's GPU accelerated. -- **Embedding Model**: [e5-large-v2](https://huggingface.co/intfloat/e5-large-v2) since it is one of the best embedding model available at the moment. -- **Framework(s)**: LangChain and LlamaIndex. +4. Verify the NVIDIA container toolkit is installed and configured as the default container runtime. -This reference workflow uses a variety of components and services to customize and deploy the RAG based chatbot. The following diagram illustrates how they work together. Refer to the [detailed architecture guide](../docs/rag/architecture.md) to understand more about these components and how they are tied together. + **Note**: This step is not required for Nvidia AI foundation workflow +``` +$ cat /etc/docker/daemon.json +{ + "default-runtime": "nvidia", + "runtimes": { + "nvidia": { + "path": "/usr/bin/nvidia-container-runtime", + "runtimeArgs": [] + } + } +} + +$ sudo docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi -L +GPU 0: NVIDIA A100 80GB PCIe (UUID: GPU-d8ce95c1-12f7-3174-6395-e573163a2ace) +``` + +5. Create an NGC Account and API Key. + +Please refer to [instructions](https://docs.nvidia.com/ngc/gpu-cloud/ngc-overview/index.html) to create account and generate NGC API key. + +Login to `nvcr.io` using the following command: + +``` +docker login nvcr.io +``` + +6. [Optional] Enable Riva ASR and TTS. -![Diagram](../docs/rag/images/image3.jpg) + a. To launch a Riva server locally, please refer to the instructions in the [Riva Quick Start Guide](https://docs.nvidia.com/deeplearning/riva/user-guide/docs/quick-start-guide.html). -*Note:* -We've used [Llama2](https://ai.meta.com/llama/) and [e5-large-v2](https://huggingface.co/intfloat/e5-large-v2) models as example defaults in this workflow, you should ensure that both the LLM and embedding model are appropriate for your use case, and validate that they are secure and have not been tampered with prior to use. 
+ - In the provided `config.sh` script, set `service_enabled_asr=true` and `service_enabled_tts=true`, and select the desired ASR and TTS languages by adding the appropriate language codes to `asr_language_code` and `tts_language_code`. -# Getting Started -This section covers step by step guide to setup and try out this example workflow. + - Once the server is running, assign its IP address (or hostname) and port (50051 by default) to `RIVA_API_URI` in `deploy/compose/compose.env`. -## Prerequisites -Before proceeding with this guide, make sure you meet the following prerequisites: + b. Alternatively, you can use a hosted Riva API endpoint. You might need to obtain an API key and/or Function ID for access. -- You should have at least one NVIDIA GPU. For this guide, we used an A100 data center GPU. + - In `deploy/compose/compose.env`, make the following assignments as necessary: + ``` + export RIVA_API_URI=":" + export RIVA_API_KEY="" + export RIVA_FUNCTION_ID="" + ``` - - NVIDIA driver version 535 or newer. To check the driver version run: ``nvidia-smi --query-gpu=driver_version --format=csv,noheader``. - - If you are running multiple GPUs they must all be set to the same mode (ie Compute vs. Display). You can check compute mode for each GPU using - ``nvidia-smi -q -d compute`` +Reference: +- [Docker installation instructions (Ubuntu)](https://docs.docker.com/engine/install/ubuntu/) +- [NVIDIA Container Toolkit Installation instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) -### Setup the following +#### 2.2 Deploy -- Docker and Docker-Compose are essential. Please follow the [installation instructions](https://docs.docker.com/engine/install/ubuntu/). +##### Downloading the model +You can download the model either from huggingface or meta. - Note: - Please do **not** use Docker that is packaged with Ubuntu as the newer version of Docker is required for proper Docker Compose support. +The steps mentioned here explains how to download from meta. If you are interested in downloading the model checkpoints from huggingface, follow the steps [here](../docs/rag/hf_model_download.md) instead. - Make sure your user account is able to execute Docker commands. +1. Clone the Llama Github. +``` +git clone https://github.com/facebookresearch/llama.git +cd llama/ +``` -- NVIDIA Container Toolkit is also required. Refer to the [installation instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). +2. Fill out Meta's [Llama request access form](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). +3. Download the model weights. -- NGC Account and API Key +- Select the Llama 2 and Llama Chat text boxes. +- After verifying your email, Meta will email you a download link. +- Download the llama-2-13b-chat model when prompted. - - Please refer to [instructions](https://docs.nvidia.com/ngc/gpu-cloud/ngc-overview/index.html) to create account and generate NGC API key. - - Docker login to `nvcr.io` using the following command: - ``` - docker login nvcr.io - ``` +``` +$ ./download.sh +Enter the URL from email: < https://download.llamameta.net/… etc> -- git-lfs - - Make sure you have [git-lfs](https://git-lfs.github.com) installed. 
+Enter the list of models to download without spaces (7B,13B,70B,7B-chat,13B-chat,70B-chat), or press Enter for all: 13B-chat +``` -- You can download Llama2 Chat Model Weights from [Meta](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or [HuggingFace](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf/). You can skip this step [if you are interested in using cloud based LLM's using Nvidia AI Playground](#using-nvdia-cloud-based-llm). +4. Copy the tokenizer to the model directory. - **Note for checkpoint downloaded using Meta**: +``` +$ mv tokenizer* llama-2-13b-chat/ - - When downloading model weights from Meta, you can follow the instructions up to the point of downloading the models using ``download.sh``. There is no need to deploy the model using the steps mentioned in the repository. We will use Triton to deploy the model. +$ ls ~/git/llama/llama-2-13b-chat/ +checklist.chk consolidated.00.pth consolidated.01.pth params.json tokenizer.model tokenizer_checklist.chk +``` - - Meta will download two additional files, namely `tokenizer.model` and `tokenizer_checklist.chk`, outside of the model checkpoint directory. Ensure that you copy these files into the same directory as the model checkpoint directory. +##### Deploying the model - **Using Cloud based Nvidia AI Foundational models**: +1. Set the absolute path to the model location in compose.env. - - Instead of deploying the models on-prem if you will like to use LLM models deployed from NVIDIA AI Playground then follow the instructions from [here.](../docs/rag/aiplayground.md) +``` +$ cd ~/git/GenerativeAIExamples - **Using Quantized models**: +$ grep MODEL deploy/compose/compose.env | grep -v \# +export MODEL_DIRECTORY="/home/nvidia/git/llama/llama-2-13b-chat/" +export MODEL_ARCHITECTURE="llama" +export MODEL_NAME="Llama-2-13b-chat" +``` - - In this workflow, we will be leveraging a Llama2 (7B parameters) chat model, which requires 38 GB of GPU memory.
- IMPORTANT: For this initial version of the workflow only 7B chat model is supported on A100 and H100 GPUs. +2. Deploy the developer RAG example via Docker compose using milvus vector store, steps to deploy RAG example with pgvector vector store is [here](#deploying-with-pgvector-vector-store). - - We also support quantization of LLama2 model using AWQ, which changes model precision to INT4, thereby reducing memory usage. Checkout the steps [here](../docs/rag/llm_inference_server.md) to enable quantization. +> ⚠️ **NOTE**: It may take up to 5 minutes for the Triton server to start. The `-d` flag starts the services in the background. +``` +$ source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose.yaml build -## Install Guide +$ docker compose -f deploy/compose/docker-compose.yaml up -d -NVIDIA TensorRT LLM providex state of the art performance for running LLM inference. Follow the below steps from the root of this project to setup the RAG example with TensorRT LLM and Triton deployed locally. +$ docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}" +CONTAINER ID NAMES STATUS +256da0ecdb7b llm-playground Up 48 minutes +2974aa4fb2ce chain-server Up 48 minutes +4a8c4aebe4ad notebook-server Up 48 minutes +5be2b57bb5c1 milvus-standalone Up 48 minutes (healthy) +ecf674c8139c llm-inference-server Up 48 minutes (healthy) +a6609c22c171 milvus-minio Up 48 minutes (healthy) +b23c0858c4d4 milvus-etcd Up 48 minutes (healthy) +``` -### Step 1: Set Environment Variables +Reference: +- [Meta Llama README](https://github.com/facebookresearch/llama/blob/main/README.md) +- [Meta Llama request access form](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) -Modify ``compose.env`` in the ``deploy/compose`` directory to set your environment variables. The following variables are required as shown below for using a llama based model. +#### 2.3 Test - # full path to the local copy of the model weights - export MODEL_DIRECTORY="$HOME/src/Llama-2-13b-chat-hf" +1. Connect to the sample web application at ``http://host-ip:8090``. - # the architecture of the model. eg: llama - export MODEL_ARCHITECTURE="llama" +2. Check **[X] Enable TTS output** to allow the web app to read the answers to your queries aloud. - # the name of the model being used - only for displaying on frontend - export MODEL_NAME="llama-2-13b-chat" +3. Select the desired ASR language (`English (en-US)` for this test), TTS language (`English (en-US)` for this test) and TTS voice from the dropdown menus below the checkboxes to utilize the web app's voice-to-voice interaction capabilities. - # [OPTIONAL] the config file for chain server - APP_CONFIG_FILE=/dev/null +4. In the Converse tab, type "How many cores does the Grace superchip contain?" in the chat box and press Submit. Alternatively, click on the microphone button to the right of the text box and ask your query verbally. +![Grace query failure](../notebooks/imgs/grace_noanswer_with_riva.png) -### Step 2: Build and Start Containers -- Pull lfs files. This will pull large files from repository. - ``` - git lfs pull - ``` -- Run the following command to build containers. - ``` - source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose.yaml build - ``` +5. If you encounter an error message reading "Media devices could not be accessed" when you first attempt to transcribe a voice query, -- Run the following command to start containers. 
- ``` - source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose.yaml up -d - ``` - > ⚠️ **NOTE**: It will take a few minutes for the containers to come up and may take up to 5 minutes for the Triton server to be ready. Adding the `-d` flag will have the services run in the background. ⚠️ +![Media device access error](../notebooks/imgs/media_device_access_error.png) + +carry out the following steps: + + - Open ``chrome://flags`` in another browser tab. + + - Search for "insecure origins treated as secure". + + - Copy ``http://host-ip:8090`` into the associated text box. + + - Select "Enabled" in the adjacent dropdown menu. + + - Click on the "Relaunch" button at the bottom right of the page. + + - Grant ``http://host-ip:8090`` access to your microphone. + +![Fix media device access error in Chrome Flags](../notebooks/imgs/chrome_flags_fix_media_device_access_error.png) + +6. Upload the sample data set to the Knowledge Base tab. + +> ⚠️ **NOTE**: ``dataset.zip`` is located in the ``notebooks`` directory. Unzip the archive and upload the PDFs. + +> There is a timeout of `10 mins` set for the ingestion process. Uploading large files may see ingestion failure depending on network bandwidth. + +7. Return to **Converse** tab and check **[X] Use knowledge base**. + +8. Retype (or re-transcribe) the question: "How many cores does the Grace superchip contain?" + +![Grace query success](../notebooks/imgs/grace_answer_with_riva.png) + +> ⚠️ **NOTE**: Default prompts are optimized for llama chat model if you're using completion model then prompts need to be finetuned accordingly. + +#### Learn More + +Execute the Jupyter notebooks to explore optional features. -- Run ``docker ps -a``. When the containers are ready the output should look similar to the image below. - ![Docker Output](../docs/rag/images/docker-output.png "Docker Output Image") +Note: Jupyter notebook is supported for [default flow](../deploy/compose/docker-compose.yaml) i.e. trt-llm with milvus. +1. In a web browser, open Jupyter at ``http://host-ip:8888``. - **Note**: - - Default prompts are optimized for llama chat model if you're using completion model then prompts need to be finetuned accordingly. +2. Execute the notebooks in order: -#### Multi GPU deployment +- [Enable streaming responses from the LLM](../notebooks/01-llm-streaming-client.ipynb) +- [Document QA with LangChain](../notebooks/02_langchain_simple.ipynb) +- [Document QA with LlamaIndex](../notebooks/03_llama_index_simple.ipynb) +- [Advanced Document QA with LlamaIndex](../notebooks/04_llamaindex_hier_node_parser.ipynb) +- [Document QA via REST FastAPI Server](../notebooks/05_dataloader.ipynb) -By default the LLM model will be deployed using all available GPU's of the system. To use some specific GPU's you can provide the GPU ID(s) in the [docker compose file](../deploy/compose/docker-compose.yaml) under `llm` service's `deploy` section: +#### 2.4 Uninstall +To uninstall, stop and remove the running containers. +``` +cd deploy/compose +source compose.env +docker compose down +docker compose ps -q +``` + +#### Deploying with [pgvector](https://github.com/pgvector/pgvector) vector store +2. Deploy the developer RAG example via Docker compose. + +> ⚠️ **NOTE**: It may take up to 5 minutes for the Triton server to start. The `-d` flag starts the services in the background. 
+ +``` +$ source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose-pgvector.yaml build + +$ docker compose -f deploy/compose/docker-compose-pgvector.yaml up -d + +$ docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}" +CONTAINER ID NAMES STATUS +0f6f091d892e llm-playground Up 22 hours +8d0ab09fcb98 chain-server Up 22 hours +85bd98ba3b24 notebook-server Up 22 hours +22f0d405b38b llm-inference-server Up 22 hours (healthy) +cbd3cf65ce7e pgvector Up 22 hours +``` + +After deployment is successful, you can follow steps from [Test](#23-test) to verify workflow. + +
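Beyond the web frontend, the chain server's REST API can be exercised directly once the containers are healthy. The sketch below is illustrative: the host and port are placeholders that depend on your compose file, and the request fields mirror the `Prompt` schema defined in `RetrievalAugmentedGeneration/common/server.py`.

```python
# Illustrative sketch: stream an answer from the chain server's /generate endpoint.
import requests

# Placeholder address -- substitute the chain-server host and port exposed by your compose file.
CHAIN_SERVER = "http://<chain-server-host>:<port>"

payload = {
    "question": "How many cores does the Grace superchip contain?",
    "context": "",
    "use_knowledge_base": True,   # set to False to bypass retrieval and query the LLM directly
    "num_tokens": 256,
}

# The endpoint streams the response as text/event-stream, so consume it incrementally.
with requests.post(f"{CHAIN_SERVER}/generate", json=payload, stream=True, timeout=120) as resp:
    resp.raise_for_status()
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)
```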
### 3: QA Chatbot Multi-GPU -- A100/H100/L40S

This example deploys a developer RAG pipeline for chat QA and serves inference via the NeMo Framework inference container across multiple GPUs.
| Model | Embedding | Framework | Description | Multi-GPU | TRT-LLM | NVIDIA AI Foundation | Triton | Vector Database |
|-------|-----------|-----------|-------------|-----------|---------|----------------------|--------|-----------------|
| llama-2 | e5-large-v2 | Llamaindex | QA chatbot | YES | YES | NO | YES | Milvus |
+ +#### 3.1 Prepare the environment + +1. Follow the steps in the ["Prepare the environment" section of example 02](#21-prepare-the-environment). + +#### 3.2 Deploy + +1. Follow steps 1 - 4 in the ["Deploy" section of example 02](#downloading-the-model) to stage the model weights. + +2. Find the GPU device ID. You can check this using `nvidia-smi` command. + +3. Assign LLM inference to specific GPUs by specifying the GPU ID(s) in the [docker compose file](../deploy/compose/docker-compose.yaml). + +``` deploy: resources: reservations: devices: - driver: nvidia # count: ${INFERENCE_GPU_COUNT:-all} # Comment this out - device_ids: ["0"] # Provide the device id of GPU. It can be found using `nvidia-smi` command + device_ids: ["0"] capabilities: [gpu] +``` + +4. Follow steps in the ["Deploy the model" section of example 02](#deploying-the-model) to deploy via Docker compose. + +#### 3.3 Test + +1. Follow steps 1 - 5 in the ["Test" section of example 02](#23-test). + +2. Verify the correct GPU is serving the model using `nvidia-smi`. + +#### 3.4 Uninstall + +1. To unintstall, follow the ["Uninstall" steps in example 02"](#24-uninstall). + +
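As a programmatic complement to the `nvidia-smi` check in step 3.3 above, the sketch below lists which GPUs currently have compute processes on them; it assumes the `pynvml` bindings (`pip install nvidia-ml-py`) are available on the host and is not part of this repository.

```python
# Illustrative sketch: confirm which GPU(s) the inference server is using.
# Assumes the pynvml bindings (nvidia-ml-py) are installed on the host.
import pynvml

pynvml.nvmlInit()
try:
    for index in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        name = pynvml.nvmlDeviceGetName(handle)
        procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        # A GPU serving the model shows at least one compute process (the Triton server).
        print(f"GPU {index} ({name}): {len(procs)} compute process(es)")
finally:
    pynvml.nvmlShutdown()
```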
### 4: QA Chatbot with Quantized LLM model -- A100/H100/L40S

This example deploys a developer RAG pipeline for chat QA and serves inference via the NeMo Framework inference container across multiple GPUs, using a quantized version of the Llama2-7b chat model.
| Model | Embedding | Framework | Description | Multi-GPU | TRT-LLM | NVIDIA AI Foundation | Triton | Vector Database |
|-------|-----------|-----------|-------------|-----------|---------|----------------------|--------|-----------------|
| llama-2-7b-chat | e5-large-v2 | Llamaindex | QA chatbot | YES | YES | NO | YES | Milvus |
+ +#### 4.1 Prepare the environment + +1. Follow the steps in the ["Prepare the environment" section of example 02](#21-prepare-the-environment). + + +#### 4.2 Deploy +1. [Download Llama2-7b chat Chat Model Weights](#downloading-the-model) from huggingface as meta checkpoint does not have the required files to quantize it. + +> ⚠️ **NOTE**: For this initial version only 7B chat model is supported on A100/H100/L40 GPUs. + + +1. For quantization of the Llama2 model using AWQ, first clone the [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.5.0) repository separately and checkout release/v0.5.0. + + - Also copy the Llama2 model directory downloaded earlier to the TensorRT-LLM repo + +``` + git clone https://github.com/NVIDIA/TensorRT-LLM.git + cp -r TensorRT-LLM/ + cd TensorRT-LLM/ + git checkout release/0.5.0 +``` + +3. Now setup the TensorRT-LLM repo seprately using steps [here](https://github.com/NVIDIA/TensorRT-LLM/blob/release/0.5.0/docs/source/installation.md) + +4. Once the model is downloaded and TensorRT-LLM repo is setup, we can quantize the model using the TensorRT-LLM container. + + - Follow the steps from [here](https://github.com/NVIDIA/TensorRT-LLM/tree/v0.5.0/examples/llama#awq) to quantize using AWQ, run these commands inside the container. + + - While running the quantization script, make sure to point `--model_dir` to your downloaded Llama2 model directory + + - Once the quantization is completed, copy the generated PyTorch (.pt) file inside the model directory + + ``` + cp .pt + ``` + +5. Now, we will come back our repository, follow the steps below to deploy this quantized model using the inference server. + + - Update [compose.env](../deploy/compose/compose.env) with `MODEL_DIRECTORY` pointing to Llama2 model directory containing the quantized checkpoint. + + - Make sure the qantized PyTorch model (.pt) file generated using above steps is present inside the MODEL_DIRECTORY. + + + - Uncomment the QUANTIZATION variable which specifies quantization as "int4_awq" inside the [compose.env](../deploy/compose/compose.env). + ``` + export QUANTIZATION="int4_awq" + ``` + +6. Deploy the developer RAG example via Docker compose. + +> ⚠️ **NOTE**: It may take up to 5 minutes for the Triton server to start. The `-d` flag starts the services in the background. + +``` +$ source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose.yaml build + +$ docker compose -f deploy/compose/docker-compose.yaml up -d + +$ docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}" +CONTAINER ID NAMES STATUS +256da0ecdb7b llm-playground Up 48 minutes +2974aa4fb2ce chain-server Up 48 minutes +4a8c4aebe4ad notebook-server Up 48 minutes +5be2b57bb5c1 milvus-standalone Up 48 minutes (healthy) +ecf674c8139c llm-inference-server Up 48 minutes (healthy) +a6609c22c171 milvus-minio Up 48 minutes (healthy) +b23c0858c4d4 milvus-etcd Up 48 minutes (healthy) +``` + +#### 4.3 Test + +1. Follow steps 1 - 5 in the ["Test" section of example 02](#23-test). + +#### 4.4 Uninstall + +1. To uninstall, follow the ["Uninstall" steps in example 02"](#24-uninstall). + +
### 5: QA Chatbot with Task Decomposition example -- A100/H100/L40S

This example deploys a recursive Task Decomposition example for chat QA. It uses the llama2-70b chat model (via the NVIDIA AI Foundation endpoint) for inference.

It showcases how to perform RAG when the agent needs to access information from several different files or chunks, or to perform some computation on the answers. It uses a custom LangChain agent that recursively breaks the user's question down into subquestions that it attempts to answer. The agent has access to two tools: search (which performs standard RAG on a subquestion) and math (which poses a math question to the LLM). It continues breaking the question down into sub-questions until it has the answers it needs to formulate a final answer; a simplified sketch of this loop appears after the table below.
| Model | Embedding | Framework | Description | Multi-GPU | TRT-LLM | NVIDIA AI Foundation | Triton | Vector Database |
|-------|-----------|-----------|-------------|-----------|---------|----------------------|--------|-----------------|
| llama2_70b | nvolveqa_40k | Langchain | QA chatbot | NO | NO | YES | NO | FAISS |
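The sketch below illustrates the decomposition loop described above in plain Python. It is not the agent shipped in the `query_decomposition_rag` example; the decomposition heuristic, tool bodies, and synthesis step are all stand-ins.

```python
# Illustrative sketch of a recursive task-decomposition loop. The real example uses a
# custom LangChain agent; the LLM calls here are replaced with stand-in stub functions.
from typing import Callable, Dict, List

def decompose(question: str) -> List[str]:
    """Stand-in for an LLM call that splits a question into smaller subquestions."""
    parts = [p.strip() for p in question.replace("?", "").split(" and ")]
    return [p + "?" for p in parts if p]

def search_tool(subquestion: str) -> str:
    """Stand-in for standard RAG (retrieve + generate) over a single subquestion."""
    return f"<retrieved answer for: {subquestion}>"

def math_tool(subquestion: str) -> str:
    """Stand-in for posing a math question to the LLM."""
    return f"<computed result for: {subquestion}>"

def answer(question: str, tools: Dict[str, Callable[[str], str]]) -> str:
    pending: List[str] = [question]
    answered: Dict[str, str] = {}
    while pending:
        sub = pending.pop(0)
        parts = decompose(sub)
        if len(parts) > 1:              # still compound: keep breaking it down
            pending.extend(parts)
            continue
        tool = tools["math"] if "sum" in sub.lower() else tools["search"]
        answered[sub] = tool(sub)
    # The real agent would make one more LLM call to synthesize the final answer.
    return "\n".join(f"{q} -> {a}" for q, a in answered.items())

if __name__ == "__main__":
    print(answer("What was Q3 2023 datacenter revenue and what was Q3 2023 gaming revenue?",
                 {"search": search_tool, "math": math_tool}))
```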
+ +#### 5.1 Prepare the environment + +1. Follow the steps in the ["Prepare the environment" section of example 02](#21-prepare-the-environment). + + +#### 5.2 Deploy + +1. Follow the ["Deploy" section of example 01](#downloading-the-model) to setup your API key + +2. Change the RAG example in `deploy/compose/compose.env`. + ```shell + export RAG_EXAMPLE="query_decomposition_rag" + ``` + +3. Change the LLM in `deploy/compose/docker-compose-nv-ai-foundation.yaml` to `llama2_70b`. + ```yaml + query: + container_name: chain-server + ... + environment: + APP_LLM_MODELNAME: llama2_70b + ... + ``` + +4. Deploy the Query Decomposition RAG example via Docker compose. + +``` +$ source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose-nv-ai-foundation.yaml build +$ docker compose -f deploy/compose/docker-compose-nv-ai-foundation.yaml up -d -### Step 3: Experiment with RAG in JupyterLab +$ docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}" +CONTAINER ID NAMES STATUS +256da0ecdb7b llm-playground Up 48 minutes +2974aa4fb2ce chain-server Up 48 minutes +``` -This AI Workflow includes Jupyter notebooks which allow you to experiment with RAG. +#### 5.3 Test -- Using a web browser, type in the following URL to open Jupyter +1. Connect to the sample web application at ``http://host-ip:8090``. - ``http://host-ip:8888`` +2. Upload 2 text documents in the Knowledge Base tab. The documents can contain different information - for example, one document can contain a company's revenue analysis for Q3 2023 and the other can contain a similar analysis for Q4 2023. -- Locate the [LLM Streaming Client notebook](../notebooks/01-llm-streaming-client.ipynb) which demonstrates how to stream responses from the LLM. +3. Return to the **Converse** tab and check **[X] Use knowledge base**. -- Proceed with the next 4 notebooks: +4. Enter the question: "Which is greater - NVIDIA's datacenter revenue for Q4 2023 or the sum of its datacenter and gaming revenues for Q3 2023?" and hit submit to get the answer. - - [Document Question-Answering with LangChain](../notebooks/02_langchain_simple.ipynb) +#### 5.4 Uninstall - - [Document Question-Answering with LlamaIndex](../notebooks/03_llama_index_simple.ipynb) +1. To uninstall, follow the ["Uninstall" steps in example 02"](#24-uninstall). - - [Advanced Document Question-Answering with LlamaIndex](../notebooks/04_llamaindex_hier_node_parser.ipynb) +
- - [Interact with REST FastAPI Server](../notebooks/05_dataloader.ipynb) +### 6: QA Chatbot -- NemoTron Model -### Step 4: Run the Sample Web Application -A sample chatbot web application is provided in the workflow. Requests to the chat system are wrapped in FastAPI calls. +This example deploys a developer RAG pipeline for chat QA and serves inference via the NeMo Framework inference container using NeMoTron model and showcases inference using sample notebook. -- Open the web application at ``http://host-ip:8090``. -- Type in the following question without using a knowledge base: "How many cores are on the Nvidia Grace superchip?" +#### 6.1 Prepare the environment - **Note:** the chatbot mentions the chip doesn't exist. +1. Follow the steps in the ["Prepare the environment" section of example 02](#21-prepare-the-environment). -- To use a knowledge base: +> ⚠️ **NOTE**: This example requires at least 100GB of GPU memory or two A100 GPUs for locally deploying the nemotron model. - - Click the **Knowledge Base** tab and upload the file [dataset.zip](../notebooks/dataset.zip). -- Return to **Converse** tab and check **[X] Use knowledge base**. +#### 6.2 Deploy -- Retype the question: "How many cores are on the Nvidia Grace superchip?" +1. Download [NeMoTron chat checkpoint](https://huggingface.co/nvidia/nemotron-3-8b-chat-4k-sft) from HuggingFace -# RAG Evaluation +``` +git-lfs clone https://huggingface.co/nvidia/nemotron-3-8b-chat-4k-sft +``` -## Prerequisites -Make sure the corps comm dataset is loaded into the vector database using the [Dataloader](../notebooks/05_dataloader.ipynb) notebook as part of step-3 of setup. +2. Make sure the absolute model path of nemotron-3-8b-chat-4k-sft model is updated in `/GenerativeAIExamples/deploy/compose/compose.env`. Set the below values in `compose.env` file. -This workflow include jupyter notebooks which allow you perform evaluation of your RAG application on the sample dataset and they can be extended to other datasets as well. -Setup the workflow by building and starting the containers by following the steps [outlined here using docker compose.](#step-2-build-and-start-containers) +``` +export MODEL_DIRECTORY="/home/nvidia/nemotron-3-8b-chat-4k-sft" # Example path +export MODEL_ARCHITECTURE="gptnext" +export MODEL_NAME="nemotron-3-8b-chat-4k-sft" +``` -After setting up the workflow follow these steps: +3. Build and deploy the nemotron workflow -- Using a web browser, type in the following URL to open Jupyter Labs +``` +source deploy/compose/compose.env +docker compose -f deploy/compose/docker-compose-nemotron.yaml build +docker compose -f deploy/compose/docker-compose-nemotron.yaml up -d +``` +4. Check the deployment status by printing logs of `llm-inference-server` container - ``http://host-ip:8889`` +Successful TRT-LLM conversion and Triton Inference Server deployment logs will display the following message +``` +I0107 03:03:38.638311 260 http_server.cc:3558] Started HTTPService at 0.0.0.0:8000 +I0107 03:03:38.679626 260 http_server.cc:187] Started Metrics Service at 0.0.0.0:8002 +``` -- Locate the [synthetic data generation](../evaluation/01_synthetic_data_generation.ipynb) which demonstrates how to generate synthetic data of question answer pairs for evaluation +#### 6.3 Test -- Proceed with the next 3 notebooks: +1. Run `02_langchain_simple.ipynb` for Document Question-Answering with LangChain based using NeMoTron model. 
- - [Filling generated answers](../evaluation/02_filling_RAG_outputs_for_Evaluation.ipynb) +[Optional] Run `00-llm-non-streaming-nemotron.ipynb` to send request to LLM. - - [Ragas evaluation with NVIDIA AI playground](../evaluation/03_eval_ragas.ipynb) +> ⚠️ **NOTE**: +- Nemotron models do not support streaming in this release. - - [LLM as a Judge evaluation with NVIDIA AI playground](../evaluation/04_Human_Like_RAG_Evaluation-AIP.ipynb) +
+### Learn More -# Learn More -1. [Architecture Guide](../docs/rag/architecture.md): Detailed explanation of different components and how they are tried up together. -2. Component Guides: Component specific features are enlisted in these sections. - 1. [Chain Server](../docs/rag/chat_server.md) - 2. [NeMo Framework Inference Server](../docs/rag/llm_inference_server.md) - 3. [Jupyter Server](../docs/rag/jupyter_server.md) - 4. [Sample frontend](../docs/rag/frontend.md) -3. [Configuration Guide](../docs/rag/configuration.md): This guide covers different configurations available for this workflow. -4. [Support Matrix](../docs/rag/support_matrix.md): This covers GPU, CPU, Memory and Storage requirements for deploying this workflow. +To deep dive into different components and workflow used by the examples, please refer to the [Developer Guide.](../docs/README.md) diff --git a/RetrievalAugmentedGeneration/common/base.py b/RetrievalAugmentedGeneration/common/base.py new file mode 100644 index 00000000..7b61a51a --- /dev/null +++ b/RetrievalAugmentedGeneration/common/base.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base interface that all RAG examples should implement.""" + +from abc import ABC, abstractmethod +from typing import Generator + +class BaseExample(ABC): + + @abstractmethod + def llm_chain(self, context: str, question: str, num_tokens: int) -> Generator[str, None, None]: + pass + + @abstractmethod + def rag_chain(self, prompt: str, num_tokens: int) -> Generator[str, None, None]: + pass + + @abstractmethod + def ingest_docs(self, data_dir: str, filename: str) -> None: + pass \ No newline at end of file diff --git a/RetrievalAugmentedGeneration/common/configuration.py b/RetrievalAugmentedGeneration/common/configuration.py index 8aa21d2f..23882963 100644 --- a/RetrievalAugmentedGeneration/common/configuration.py +++ b/RetrievalAugmentedGeneration/common/configuration.py @@ -18,24 +18,40 @@ @configclass -class MilvusConfig(ConfigWizard): - """Configuration class for the Weaviate connection. +class VectorStoreConfig(ConfigWizard): + """Configuration class for the Vector Store connection. 
- :cvar url: URL of Milvus DB + :cvar name: Name of vector store + :cvar url: URL of Vector Store """ + name: str = configfield( + "name", + default="milvus", # supports pgvector, milvus + help_txt="The name of vector store", + ) url: str = configfield( "url", - default="http://localhost:19530", - help_txt="The host of the machine running Milvus DB", + default="http://milvus:19530", # for pgvector `pgvector:5432` + help_txt="The host of the machine running Vector Store DB", + ) + nlist: int = configfield( + "nlist", + default=64, # IVF Flat milvus + help_txt="Number of cluster units", + ) + nprobe: int = configfield( + "nprobe", + default=16, # IVF Flat milvus + help_txt="Number of units to query", ) @configclass class LLMConfig(ConfigWizard): - """Configuration class for the Triton connection. + """Configuration class for the llm connection. - :cvar server_url: The location of the Triton server hosting the llm model. + :cvar server_url: The location of the llm server hosting the model. :cvar model_name: The name of the hosted model. """ @@ -60,7 +76,7 @@ class LLMConfig(ConfigWizard): class TextSplitterConfig(ConfigWizard): """Configuration class for the Text Splitter. - :cvar chunk_size: Chunk size for text splitter. + :cvar chunk_size: Chunk size for text splitter. Tokens per chunk in token-based splitters. :cvar chunk_overlap: Text overlap in text splitter. """ @@ -138,10 +154,10 @@ class PromptsConfig(ConfigWizard): class AppConfig(ConfigWizard): """Configuration class for the application. - :cvar milvus: The configuration of the Milvus vector db connection. - :type milvus: MilvusConfig - :cvar triton: The configuration of the backend Triton server. - :type triton: TritonConfig + :cvar vector_store: The configuration of the vector db connection. + :type vector_store: VectorStoreConfig + :cvar llm: The configuration of the backend llm server. 
+ :type llm: LLMConfig :cvar text_splitter: The configuration for text splitter :type text_splitter: TextSplitterConfig :cvar embeddings: The configuration for huggingface embeddings @@ -150,11 +166,11 @@ class AppConfig(ConfigWizard): :type prompts: PromptsConfig """ - milvus: MilvusConfig = configfield( - "milvus", + vector_store: VectorStoreConfig = configfield( + "vector_store", env=False, - help_txt="The configuration of the Milvus connection.", - default=MilvusConfig(), + help_txt="The configuration of the vector db connection.", + default=VectorStoreConfig(), ) llm: LLMConfig = configfield( "llm", diff --git a/RetrievalAugmentedGeneration/common/server.py b/RetrievalAugmentedGeneration/common/server.py index c207312f..01f7021b 100644 --- a/RetrievalAugmentedGeneration/common/server.py +++ b/RetrievalAugmentedGeneration/common/server.py @@ -20,25 +20,22 @@ import logging from pathlib import Path from typing import Any, Dict, List +import importlib +from inspect import getmembers, isclass -from fastapi import FastAPI, File, UploadFile +from fastapi import FastAPI, File, UploadFile, Request from fastapi.responses import JSONResponse, StreamingResponse from pydantic import BaseModel, Field from pymilvus.exceptions import MilvusException, MilvusUnavailableException - -from RetrievalAugmentedGeneration.common import utils -from RetrievalAugmentedGeneration.examples.developer_rag import chains +from RetrievalAugmentedGeneration.common import utils, tracing logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # create the FastAPI server app = FastAPI() -# prestage the embedding model -_ = utils.get_embedding_model() -# set the global service context for Llama Index -utils.set_service_context() +EXAMPLE_DIR = "RetrievalAugmentedGeneration/example" class Prompt(BaseModel): """Definition of the Prompt API data type.""" @@ -56,14 +53,47 @@ class DocumentSearch(BaseModel): num_docs: int = Field(description="The maximum number of documents to return in the response.", default=4) +@app.on_event("startup") +def import_example() -> None: + """ + Import the example class from the specified example file. + The example directory is expected to have a python file where the example class is defined. 
+ """ + + for root, dirs, files in os.walk(EXAMPLE_DIR): + for file in files: + if not file.endswith(".py"): + continue + + # Import the specified file dynamically + spec = importlib.util.spec_from_file_location(name="example", location=os.path.join(root, file)) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + # Scan each class in the file to find one with the 3 implemented methods: ingest_docs, rag_chain and llm_chain + for name, _ in getmembers(module, isclass): + try: + cls = getattr(module, name) + if set(["ingest_docs", "llm_chain", "rag_chain"]).issubset(set(dir(cls))): + if name == "BaseExample": + continue + example = cls() + app.example = cls + return + except: + raise ValueError(f"Class {name} is not implemented and could not be instantiated.") + + raise NotImplementedError(f"Could not find a valid example class in {EXAMPLE_DIR}") + + @app.post("/uploadDocument") -async def upload_document(file: UploadFile = File(...)) -> JSONResponse: +@tracing.instrumentation_wrapper +async def upload_document(request: Request, file: UploadFile = File(...)) -> JSONResponse: """Upload a document to the vector store.""" if not file.filename: return JSONResponse(content={"message": "No files provided"}, status_code=200) try: - upload_folder = "uploaded_files" upload_file = os.path.basename(file.filename) if not upload_file: @@ -75,7 +105,7 @@ async def upload_document(file: UploadFile = File(...)) -> JSONResponse: with open(file_path, "wb") as f: shutil.copyfileobj(file.file, f) - chains.ingest_docs(file_path, upload_file) + app.example().ingest_docs(file_path, upload_file) return JSONResponse( content={"message": "File uploaded successfully"}, status_code=200 @@ -84,21 +114,23 @@ async def upload_document(file: UploadFile = File(...)) -> JSONResponse: except Exception as e: logger.error("Error from /uploadDocument endpoint. Ingestion of file: " + file.filename + " failed with error: " + str(e)) return JSONResponse( - content={"message": f"Ingestion of file: " + file.filename + " failed with error: " + str(e)}, status_code=500 + content={"message": str(e)}, status_code=500 ) @app.post("/generate") -async def generate_answer(prompt: Prompt) -> StreamingResponse: +@tracing.instrumentation_wrapper +async def generate_answer(request: Request, prompt: Prompt) -> StreamingResponse: """Generate and stream the response to the provided prompt.""" try: + example = app.example() if prompt.use_knowledge_base: logger.info("Knowledge base is enabled. 
Using rag chain for response generation.") - generator = chains.rag_chain(prompt.question, prompt.num_tokens) + generator = example.rag_chain(prompt.question, prompt.num_tokens) return StreamingResponse(generator, media_type="text/event-stream") - generator = chains.llm_chain(prompt.context, prompt.question, prompt.num_tokens) + generator = example.llm_chain(prompt.context, prompt.question, prompt.num_tokens) return StreamingResponse(generator, media_type="text/event-stream") except (MilvusException, MilvusUnavailableException) as e: @@ -111,20 +143,16 @@ async def generate_answer(prompt: Prompt) -> StreamingResponse: @app.post("/documentSearch") -def document_search(data: DocumentSearch) -> List[Dict[str, Any]]: +@tracing.instrumentation_wrapper +async def document_search(request: Request,data: DocumentSearch) -> List[Dict[str, Any]]: """Search for the most relevant documents for the given search parameters.""" try: - retriever = utils.get_doc_retriever(num_nodes=data.num_docs) - nodes = retriever.retrieve(data.content) - output = [] - for node in nodes: - file_name = nodes[0].metadata["filename"] - decoded_filename = base64.b64decode(file_name.encode("utf-8")).decode("utf-8") - entry = {"score": node.score, "source": decoded_filename, "content": node.text} - output.append(entry) - - return output + example = app.example() + if hasattr(example, "document_search") and callable(example.document_search): + return example.document_search(data.content, data.num_docs) + + raise NotImplementedError("Example class has not implemented the document_search method.") except Exception as e: logger.error(f"Error from /documentSearch endpoint. Error details: {e}") diff --git a/RetrievalAugmentedGeneration/common/tracing.py b/RetrievalAugmentedGeneration/common/tracing.py new file mode 100644 index 00000000..b2b5cb6e --- /dev/null +++ b/RetrievalAugmentedGeneration/common/tracing.py @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
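Before moving on to the tracing module, the example-discovery pattern used by the startup hook above can be summarized in a few lines: the server walks the example directory, imports every Python file it finds, and registers the first class that implements `ingest_docs`, `llm_chain`, and `rag_chain`. A minimal sketch of that pattern, where the directory path and the `REQUIRED_METHODS` set are illustrative stand-ins rather than the exact values from the diff:

```python
# Sketch of dynamic example discovery at server startup.
# `examples_dir` and REQUIRED_METHODS are illustrative assumptions.
import importlib.util
import os
from inspect import getmembers, isclass

REQUIRED_METHODS = {"ingest_docs", "llm_chain", "rag_chain"}

def discover_example(examples_dir: str):
    """Return the first class under examples_dir that implements the required methods."""
    for root, _dirs, files in os.walk(examples_dir):
        for fname in files:
            if not fname.endswith(".py"):
                continue
            spec = importlib.util.spec_from_file_location("example", os.path.join(root, fname))
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            for name, cls in getmembers(module, isclass):
                if name != "BaseExample" and REQUIRED_METHODS.issubset(dir(cls)):
                    return cls
    raise NotImplementedError(f"No example class found in {examples_dir}")
```

Because the endpoints instantiate the registered class on every request, an example stays stateless unless it manages its own module-level globals, which is exactly what the FAISS-based examples later in this change do.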
+ +"""Module for configuring objects used to create OpenTelemetry traces.""" + +import os +from opentelemetry import trace, context +from opentelemetry.sdk.resources import SERVICE_NAME, Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator +from opentelemetry.propagate import set_global_textmap, get_global_textmap +from opentelemetry.propagators.composite import CompositePropagator +from tools.observability.llamaindex import opentelemetry_callback +import llama_index +from llama_index.callbacks.base import CallbackManager +from functools import wraps + +# Configure tracer used by the Chain Server to create spans +resource = Resource.create({SERVICE_NAME: "chain-server"}) +provider = TracerProvider(resource=resource) +if os.environ.get("ENABLE_TRACING") == "true": + processor = SimpleSpanProcessor(OTLPSpanExporter()) + provider.add_span_processor(processor) +trace.set_tracer_provider(provider) +tracer = trace.get_tracer("chain-server") + +# Configure Propagator used for processing trace context received by the Chain Server +if os.environ.get("ENABLE_TRACING") == "true": + propagator = TraceContextTextMapPropagator() + # Llamaindex global handler set to pass callbacks into the OpenTelemetry handler + llama_index.global_handler = opentelemetry_callback.OpenTelemetryCallbackHandler(tracer) +else: + propagator = CompositePropagator([]) # No-op propagator +set_global_textmap(propagator) + +# Wrapper Function to perform instrumentation +def instrumentation_wrapper(func): + @wraps(func) + async def wrapper(*args, **kwargs): + request = kwargs.get("request") + prompt = kwargs.get("prompt") + ctx = get_global_textmap().extract(request.headers) + if ctx is not None: + context.attach(ctx) + if prompt is not None and prompt.use_knowledge_base == False: + # Hack to get the LLM event for no knowledge base queries to show up. + # A trace is not generated by Llamaindex for these calls so we need to generate it instead. 
+ callback_manager = CallbackManager([]) + with callback_manager.as_trace("query"): + result = func(*args, **kwargs) + else: + result = func(*args, **kwargs) + return await result + + return wrapper diff --git a/RetrievalAugmentedGeneration/common/utils.py b/RetrievalAugmentedGeneration/common/utils.py index 50853f0c..99fda3f9 100644 --- a/RetrievalAugmentedGeneration/common/utils.py +++ b/RetrievalAugmentedGeneration/common/utils.py @@ -20,29 +20,55 @@ from functools import lru_cache from typing import TYPE_CHECKING, List, Optional -import torch -from llama_index.postprocessor.types import BaseNodePostprocessor -from llama_index.schema import MetadataMode -from llama_index.utils import globals_helper -from llama_index.vector_stores import MilvusVectorStore -from llama_index import VectorStoreIndex, ServiceContext, set_global_service_context -from llama_index.llms import LangChainLLM -from llama_index.embeddings import LangchainEmbedding -from langchain.text_splitter import SentenceTransformersTokenTextSplitter -from langchain.embeddings import HuggingFaceEmbeddings +logger = logging.getLogger(__name__) + +try: + import torch +except Exception as e: + logger.error(f"torch import failed with error: {e}") + +try: + import psycopg2 +except Exception as e: + logger.error(f"psycogp2 import failed with error: {e}") + +try: + from sqlalchemy import make_url +except Exception as e: + logger.error(f"SQLalchemy import failed with error: {e}") + +try: + from llama_index.postprocessor.types import BaseNodePostprocessor + from llama_index.schema import MetadataMode + from llama_index.utils import globals_helper, get_tokenizer + from llama_index.vector_stores import MilvusVectorStore, PGVectorStore + from llama_index import VectorStoreIndex, ServiceContext, set_global_service_context + from llama_index.llms import LangChainLLM + from llama_index.embeddings import LangchainEmbedding + if TYPE_CHECKING: + from llama_index.indices.base_retriever import BaseRetriever + from llama_index.indices.query.schema import QueryBundle + from llama_index.schema import NodeWithScore +except Exception as e: + logger.error(f"Llamaindex import failed with error: {e}") + +try: + from langchain.text_splitter import SentenceTransformersTokenTextSplitter + from langchain.embeddings import HuggingFaceEmbeddings +except Exception as e: + logger.error(f"Langchain import failed with error: {e}") + +try: + from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings +except Exception as e: + logger.error(f"NVIDIA AI connector import failed with error: {e}") + from integrations.langchain.llms.triton_trt_llm import TensorRTLLM -from integrations.langchain.llms.nv_aiplay import GeneralLLM -from integrations.langchain.embeddings.nv_aiplay import NVAIPlayEmbeddings from RetrievalAugmentedGeneration.common import configuration if TYPE_CHECKING: - from llama_index.indices.base_retriever import BaseRetriever - from llama_index.indices.query.schema import QueryBundle - from llama_index.schema import NodeWithScore from RetrievalAugmentedGeneration.common.configuration_wizard import ConfigWizard -logger = logging.getLogger(__name__) - DEFAULT_MAX_CONTEXT = 1500 DEFAULT_NUM_TOKENS = 150 TEXT_SPLITTER_EMBEDDING_MODEL = "intfloat/e5-large-v2" @@ -58,11 +84,12 @@ def _postprocess_nodes( included_nodes = [] current_length = 0 limit = DEFAULT_MAX_CONTEXT + tokenizer = get_tokenizer() for node in nodes: current_length += len( - globals_helper.tokenizer( - node.node.get_content(metadata_mode=MetadataMode.LLM) + tokenizer( + 
node.get_content(metadata_mode=MetadataMode.LLM) ) ) if current_length > limit: @@ -95,7 +122,42 @@ def get_config() -> "ConfigWizard": def get_vector_index() -> VectorStoreIndex: """Create the vector db index.""" config = get_config() - vector_store = MilvusVectorStore(uri=config.milvus.url, dim=config.embeddings.dimensions, overwrite=False) + vector_store = None + + logger.info(f"Using {config.vector_store.name} as vector store") + if config.vector_store.name == "pgvector": + connection_string = f"postgresql://{os.getenv('POSTGRES_USER', '')}:{os.getenv('POSTGRES_PASSWORD', '')}@{config.vector_store.url}" + db_name = "vector_db" + + conn = psycopg2.connect(connection_string) + conn.autocommit = True + + with conn.cursor() as c: + # Check for database existence first + c.execute(f"SELECT 1 FROM pg_database WHERE datname = '{db_name}'") + if not c.fetchone(): # Database doesn't exist + c.execute(f"CREATE DATABASE {db_name}") + + url = make_url(connection_string) + + vector_store = PGVectorStore.from_params( + database=db_name, + host=url.host, + password=url.password, + port=url.port, + user=url.username, + table_name="document_store", + embed_dim=config.embeddings.dimensions, + ) + elif config.vector_store.name == "milvus": + vector_store = MilvusVectorStore(uri=config.vector_store.url, + dim=config.embeddings.dimensions, + collection_name="document_store_ivfflat", + index_config={"index_type": "IVF_FLAT", "nlist": config.vector_store.nlist}, + search_config={"nprobe": config.vector_store.nprobe}, + overwrite=False) + else: + raise RuntimeError("Unable to find any supported Vector Store DB. Supported engines are milvus and pgvector.") return VectorStoreIndex.from_vector_store(vector_store) @@ -111,7 +173,7 @@ def get_llm() -> LangChainLLM: """Create the LLM connection.""" settings = get_config() - logger.info(f"Using {settings.llm.model_engine} as model engine for llm") + logger.info(f"Using {settings.llm.model_engine} as model engine for llm. Model name: {settings.llm.model_name}") if settings.llm.model_engine == "triton-trt-llm": trtllm = TensorRTLLM( # type: ignore server_url=settings.llm.server_url, @@ -119,17 +181,10 @@ def get_llm() -> LangChainLLM: tokens=DEFAULT_NUM_TOKENS, ) return LangChainLLM(llm=trtllm) - elif settings.llm.model_engine == "ai-playground": - if os.getenv('NVAPI_KEY') is None: - raise RuntimeError("AI PLayground key is not set") - aipl_llm = GeneralLLM( - model=settings.llm.model_name, - max_tokens=DEFAULT_NUM_TOKENS, - streaming=True - ) - return LangChainLLM(llm=aipl_llm) + elif settings.llm.model_engine == "nv-ai-foundation": + return ChatNVIDIA(model=settings.llm.model_name) else: - raise RuntimeError("Unable to find any supported Large Language Model server. Supported engines are triton-trt-llm and ai-playground.") + raise RuntimeError("Unable to find any supported Large Language Model server. 
Supported engines are triton-trt-llm and nv-ai-foundation.") @lru_cache @@ -151,11 +206,8 @@ def get_embedding_model() -> LangchainEmbedding: ) # Load in a specific embedding model return LangchainEmbedding(hf_embeddings) - elif settings.embeddings.model_engine == "ai-playground": - if os.getenv('NVAPI_KEY') is None: - raise RuntimeError("AI PLayground key is not set") - embedding = NVAIPlayEmbeddings(model=settings.embeddings.model_name) - return LangchainEmbedding(embedding) + elif settings.embeddings.model_engine == "nv-ai-foundation": + return NVIDIAEmbeddings(model=settings.embeddings.model_name, model_type="passage") else: raise RuntimeError("Unable to find any supported embedding model. Supported engine is huggingface.") @@ -179,6 +231,6 @@ def get_text_splitter() -> SentenceTransformersTokenTextSplitter: """Return the token text splitter instance from langchain.""" return SentenceTransformersTokenTextSplitter( model_name=TEXT_SPLITTER_EMBEDDING_MODEL, - chunk_size=get_config().text_splitter.chunk_size, + tokens_per_chunk=get_config().text_splitter.chunk_size, chunk_overlap=get_config().text_splitter.chunk_overlap, ) diff --git a/RetrievalAugmentedGeneration/examples/developer_rag/chains.py b/RetrievalAugmentedGeneration/examples/developer_rag/chains.py index b408cb69..4c9cb8cc 100644 --- a/RetrievalAugmentedGeneration/examples/developer_rag/chains.py +++ b/RetrievalAugmentedGeneration/examples/developer_rag/chains.py @@ -18,7 +18,7 @@ import os import logging from pathlib import Path -from typing import Generator +from typing import Generator, List, Dict, Any from llama_index import Prompt, download_loader from llama_index.query_engine import RetrieverQueryEngine @@ -34,84 +34,111 @@ get_vector_index, is_base64_encoded, set_service_context, + get_embedding_model, ) +from RetrievalAugmentedGeneration.common.base import BaseExample + +# prestage the embedding model +_ = get_embedding_model() +set_service_context() + logger = logging.getLogger(__name__) -def llm_chain( - context: str, question: str, num_tokens: int -) -> Generator[str, None, None]: - """Execute a simple LLM chain using the components defined above.""" - - logger.info("Using llm to generate response directly without knowledge base.") - set_service_context() - prompt = get_config().prompts.chat_template.format( - context_str=context, query_str=question - ) - - logger.info(f"Prompt used for response generation: {prompt}") - response = get_llm().stream_complete(prompt, tokens=num_tokens) - gen_response = (resp.delta for resp in response) - return gen_response - - -def rag_chain(prompt: str, num_tokens: int) -> Generator[str, None, None]: - """Execute a Retrieval Augmented Generation chain using the components defined above.""" - - logger.info("Using rag to generate response from document") - - set_service_context() - if get_config().llm.model_engine == "triton-trt-llm": - get_llm().llm.tokens = num_tokens # type: ignore - else: - get_llm().llm.max_tokens = num_tokens - retriever = get_doc_retriever(num_nodes=4) - qa_template = Prompt(get_config().prompts.rag_template) - - logger.info(f"Prompt used for response generation: {qa_template}") - query_engine = RetrieverQueryEngine.from_args( - retriever, - text_qa_template=qa_template, - node_postprocessors=[LimitRetrievedNodesLength()], - streaming=True, - ) - response = query_engine.query(prompt) - - # Properly handle an empty response - if isinstance(response, StreamingResponse): - return response.response_gen - - logger.warning("No response generated from LLM, make sure 
you've ingested document.") - return StreamingResponse(iter(["No response generated from LLM, make sure you have ingested document from the Knowledge Base Tab."])).response_gen # type: ignore - - -def ingest_docs(data_dir: str, filename: str) -> None: - """Ingest documents to the VectorDB.""" - - logger.info(f"Ingesting {filename} in vectorDB") - _, ext = os.path.splitext(filename) - - if ext.lower() == ".pdf": - PDFReader = download_loader("PDFReader") - loader = PDFReader() - documents = loader.load_data(file=Path(data_dir)) - - else: - unstruct_reader = download_loader("UnstructuredReader") - loader = unstruct_reader() - documents = loader.load_data(file=Path(data_dir), split_documents=False) - - encoded_filename = filename[:-4] - if not is_base64_encoded(encoded_filename): - encoded_filename = base64.b64encode(encoded_filename.encode("utf-8")).decode( - "utf-8" +class QAChatbot(BaseExample): + def ingest_docs(self, data_dir: str, filename: str): + """Ingest documents to the VectorDB.""" + + try: + logger.info(f"Ingesting {filename} in vectorDB") + _, ext = os.path.splitext(filename) + + if ext.lower() == ".pdf": + PDFReader = download_loader("PDFReader") + loader = PDFReader() + documents = loader.load_data(file=Path(data_dir)) + + else: + unstruct_reader = download_loader("UnstructuredReader") + loader = unstruct_reader() + documents = loader.load_data(file=Path(data_dir), split_documents=False) + + encoded_filename = filename[:-4] + if not is_base64_encoded(encoded_filename): + encoded_filename = base64.b64encode(encoded_filename.encode("utf-8")).decode( + "utf-8" + ) + + for document in documents: + document.metadata = {"filename": encoded_filename} + + index = get_vector_index() + node_parser = LangchainNodeParser(get_text_splitter()) + nodes = node_parser.get_nodes_from_documents(documents) + index.insert_nodes(nodes) + logger.info(f"Document {filename} ingested successfully") + except Exception as e: + logger.error(f"Failed to ingest document due to exception {e}") + raise ValueError("Failed to upload document. 
Please upload an unstructured text document.") + + def llm_chain(self, context: str, question: str, num_tokens: int) -> Generator[str, None, None]: + """Execute a simple LLM chain using the components defined above.""" + + logger.info("Using llm to generate response directly without knowledge base.") + set_service_context() + prompt = get_config().prompts.chat_template.format( + context_str=context, query_str=question + ) + + logger.info(f"Prompt used for response generation: {prompt}") + response = get_llm().stream_complete(prompt, tokens=num_tokens) + gen_response = (resp.delta for resp in response) + return gen_response + + def rag_chain(self, prompt: str, num_tokens: int) -> Generator[str, None, None]: + """Execute a Retrieval Augmented Generation chain using the components defined above.""" + + logger.info("Using rag to generate response from document") + + set_service_context() + if get_config().llm.model_engine == "triton-trt-llm": + get_llm().llm.tokens = num_tokens # type: ignore + else: + get_llm().llm.max_tokens = num_tokens + retriever = get_doc_retriever(num_nodes=4) + qa_template = Prompt(get_config().prompts.rag_template) + + logger.info(f"Prompt used for response generation: {qa_template}") + query_engine = RetrieverQueryEngine.from_args( + retriever, + text_qa_template=qa_template, + node_postprocessors=[LimitRetrievedNodesLength()], + streaming=True, ) + response = query_engine.query(prompt) + + # Properly handle an empty response + if isinstance(response, StreamingResponse): + return response.response_gen + + logger.warning("No response generated from LLM, make sure you've ingested document.") + return StreamingResponse(iter(["No response generated from LLM, make sure you have ingested document from the Knowledge Base Tab."])).response_gen # type: ignore + + def document_search(self, content: str, num_docs: int) -> List[Dict[str, Any]]: + """Search for the most relevant documents for the given search parameters.""" + + try: + retriever = get_doc_retriever(num_nodes=num_docs) + nodes = retriever.retrieve(content) + output = [] + for node in nodes: + file_name = nodes[0].metadata["filename"] + decoded_filename = base64.b64decode(file_name.encode("utf-8")).decode("utf-8") + entry = {"score": node.score, "source": decoded_filename, "content": node.text} + output.append(entry) - for document in documents: - document.metadata = {"filename": encoded_filename} + return output - index = get_vector_index() - node_parser = LangchainNodeParser(get_text_splitter()) - nodes = node_parser.get_nodes_from_documents(documents) - index.insert_nodes(nodes) - logger.info(f"Document {filename} ingested successfully") + except Exception as e: + logger.error(f"Error from /documentSearch endpoint. Error details: {e}") + return [] diff --git a/RetrievalAugmentedGeneration/examples/nvidia_ai_foundation/chains.py b/RetrievalAugmentedGeneration/examples/nvidia_ai_foundation/chains.py new file mode 100644 index 00000000..82886fbb --- /dev/null +++ b/RetrievalAugmentedGeneration/examples/nvidia_ai_foundation/chains.py @@ -0,0 +1,151 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
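The `get_vector_index()` change shown earlier routes index creation through the new `vector_store` configuration block. A condensed sketch of that selection logic, assuming the llama_index `MilvusVectorStore`/`PGVectorStore` classes and the environment variables used in the diff; the database-existence check performed with `psycopg2` is omitted here for brevity:

```python
# Condensed sketch of vector-store selection; connection details mirror the diff,
# and the database/table names are the ones it uses.
import os
from llama_index.vector_stores import MilvusVectorStore, PGVectorStore
from sqlalchemy import make_url

def build_vector_store(config):
    if config.vector_store.name == "pgvector":
        url = make_url(
            f"postgresql://{os.getenv('POSTGRES_USER', '')}:"
            f"{os.getenv('POSTGRES_PASSWORD', '')}@{config.vector_store.url}"
        )
        return PGVectorStore.from_params(
            database="vector_db",
            host=url.host,
            port=url.port,
            user=url.username,
            password=url.password,
            table_name="document_store",
            embed_dim=config.embeddings.dimensions,
        )
    if config.vector_store.name == "milvus":
        return MilvusVectorStore(
            uri=config.vector_store.url,
            dim=config.embeddings.dimensions,
            collection_name="document_store_ivfflat",
            index_config={"index_type": "IVF_FLAT", "nlist": config.vector_store.nlist},
            search_config={"nprobe": config.vector_store.nprobe},
            overwrite=False,
        )
    raise RuntimeError("Supported vector stores are milvus and pgvector.")
```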
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +from functools import lru_cache +from typing import Generator, List, Dict, Any + +from langchain.document_loaders import UnstructuredFileLoader +from langchain.text_splitter import CharacterTextSplitter +from langchain.vectorstores import FAISS +from langchain_core.output_parsers import StrOutputParser +from langchain_core.prompts import ChatPromptTemplate +from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings +from RetrievalAugmentedGeneration.common.base import BaseExample +from RetrievalAugmentedGeneration.common.utils import get_config, get_llm, get_embedding_model + +logger = logging.getLogger(__name__) +DOCS_DIR = os.path.abspath("./uploaded_files") +vector_store_path = "vectorstore.pkl" +document_embedder = get_embedding_model() +vectorstore = None +settings = get_config() + + +class NvidiaAIFoundation(BaseExample): + def ingest_docs(self, file_name: str, filename: str): + """Ingest documents to the VectorDB.""" + + try: + # TODO: Load embedding created in older conversation, memory persistance + # We initialize class in every call therefore it should be global + global vectorstore + # Load raw documents from the directory + # Data is copied to `DOCS_DIR` in common.server:upload_document + _path = os.path.join(DOCS_DIR, filename) + raw_documents = UnstructuredFileLoader(_path).load() + + if raw_documents: + text_splitter = CharacterTextSplitter(chunk_size=settings.text_splitter.chunk_size, chunk_overlap=settings.text_splitter.chunk_overlap) + documents = text_splitter.split_documents(raw_documents) + if vectorstore: + vectorstore.add_documents(documents) + else: + vectorstore = FAISS.from_documents(documents, document_embedder) + logger.info("Vector store created and saved.") + else: + logger.warning("No documents available to process!") + except Exception as e: + logger.error(f"Failed to ingest document due to exception {e}") + raise ValueError("Failed to upload document. 
Please upload an unstructured text document.") + + def llm_chain( + self, context: str, question: str, num_tokens: str + ) -> Generator[str, None, None]: + """Execute a simple LLM chain using the components defined above.""" + + logger.info("Using llm to generate response directly without knowledge base.") + prompt_template = ChatPromptTemplate.from_messages( + [ + ( + "system", + settings.prompts.chat_template, + ), + ("user", "{input}"), + ] + ) + + llm = get_llm() + + chain = prompt_template | llm | StrOutputParser() + augmented_user_input = ( + "Context: " + context + "\n\nQuestion: " + question + "\n" + ) + return chain.stream({"input": augmented_user_input}) + + def rag_chain(self, prompt: str, num_tokens: int) -> Generator[str, None, None]: + """Execute a Retrieval Augmented Generation chain using the components defined above.""" + + logger.info("Using rag to generate response from document") + + prompt_template = ChatPromptTemplate.from_messages( + [ + ( + "system", + settings.prompts.rag_template, + ), + ("user", "{input}"), + ] + ) + llm = get_llm() + + chain = prompt_template | llm | StrOutputParser() + + try: + if vectorstore != None: + retriever = vectorstore.as_retriever() + docs = retriever.get_relevant_documents(prompt) + + context = "" + for doc in docs: + context += doc.page_content + "\n\n" + + augmented_user_input = ( + "Context: " + context + "\n\nQuestion: " + prompt + "\n" + ) + + return chain.stream({"input": augmented_user_input}) + except Exception as e: + logger.warning(f"Failed to generate response due to exception {e}") + logger.warning( + "No response generated from LLM, make sure you've ingested document." + ) + return iter( + [ + "No response generated from LLM, make sure you have ingested document from the Knowledge Base Tab." + ] + ) + + def document_search(self, content: str, num_docs: int) -> List[Dict[str, Any]]: + """Search for the most relevant documents for the given search parameters.""" + + try: + if vectorstore != None: + retriever = vectorstore.as_retriever() + docs = retriever.get_relevant_documents(content) + result = [] + for doc in docs: + result.append( + { + "source": os.path.basename(doc.metadata.get('source', '')), + "content": doc.page_content + } + ) + return result + return [] + except Exception as e: + logger.error(f"Error from /documentSearch endpoint. Error details: {e}") + return [] diff --git a/RetrievalAugmentedGeneration/examples/nvidia_ai_foundation/requirements.txt b/RetrievalAugmentedGeneration/examples/nvidia_ai_foundation/requirements.txt new file mode 100644 index 00000000..39556ee6 --- /dev/null +++ b/RetrievalAugmentedGeneration/examples/nvidia_ai_foundation/requirements.txt @@ -0,0 +1 @@ +faiss-cpu==1.7.4 \ No newline at end of file diff --git a/RetrievalAugmentedGeneration/examples/query_decomposition_rag/__init__.py b/RetrievalAugmentedGeneration/examples/query_decomposition_rag/__init__.py new file mode 100644 index 00000000..a08b2c20 --- /dev/null +++ b/RetrievalAugmentedGeneration/examples/query_decomposition_rag/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
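The foundation-endpoint example above builds its chains with the LangChain expression syntax and streams tokens back to the chain server. A minimal sketch of that pattern, assuming `langchain-nvidia-ai-endpoints` is installed and an NVIDIA API key is configured; the model name and prompt text are placeholders:

```python
# Minimal LCEL streaming sketch; the model id and prompt text are placeholders.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_nvidia_ai_endpoints import ChatNVIDIA

prompt = ChatPromptTemplate.from_messages(
    [("system", "You are a helpful assistant."), ("user", "{input}")]
)
llm = ChatNVIDIA(model="mixtral_8x7b")  # assumed model id
chain = prompt | llm | StrOutputParser()

# Stream the response chunk by chunk, as the example's llm_chain/rag_chain do.
for chunk in chain.stream({"input": "Context: ...\n\nQuestion: What is RAG?\n"}):
    print(chunk, end="", flush=True)
```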
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/RetrievalAugmentedGeneration/examples/query_decomposition_rag/chains.py b/RetrievalAugmentedGeneration/examples/query_decomposition_rag/chains.py new file mode 100644 index 00000000..5d1a63fc --- /dev/null +++ b/RetrievalAugmentedGeneration/examples/query_decomposition_rag/chains.py @@ -0,0 +1,341 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example showcases recursive task decomposition to perform RAG which requires multiple steps. +The agent is a langchain custom LLM agent, which uses 2 tools - search and math. +It uses OpenAI's GPT-4 model for sub-answer formation, tool prediction and math operations. It uses the deployed LLM for final answer formation. +Search tool is a RAG pipeline, whereas the math tool uses an LLM call to perform mathematical calculations. 
+""" + +from langchain.vectorstores import FAISS +from langchain.document_loaders import UnstructuredFileLoader +from langchain.text_splitter import CharacterTextSplitter +from langchain_core.prompts import ChatPromptTemplate +from langchain_core.output_parsers import StrOutputParser +from langchain.chains import LLMChain +from langchain.prompts import BaseChatPromptTemplate +from langchain.schema import HumanMessage +from langchain.agents import LLMSingleActionAgent, AgentOutputParser, AgentExecutor, Tool +from langchain.schema.agent import AgentFinish, AgentAction +from typing import List, Union, Dict, Any +import json +import jinja2 +import os + +import os +import logging +from typing import Generator, List + +from RetrievalAugmentedGeneration.common.utils import ( + get_config, + get_llm, + set_service_context, + get_embedding_model, +) +from RetrievalAugmentedGeneration.common.base import BaseExample + +logger = logging.getLogger(__name__) + +llm = get_llm() +DOCS_DIR = os.path.abspath("./uploaded_files") +vector_store_path = "vectorstore.pkl" +document_embedder = get_embedding_model() +vectorstore = None +settings = get_config() + +##### Helper methods and tools ##### + +class Ledger: # Stores the state of the recursive decomposition + def __init__(self): + self.question_trace = [] + self.answer_trace = [] + self.trace = 0 + self.done = False + + +##### LLM and Prompt definitions ##### +def fetch_context(ledger: Ledger) -> str: + """ + Create the context for the prompt from the subquestions and answers + """ + context = "" + for i in range(len(ledger.question_trace)): + context += "Sub-Question: " + ledger.question_trace[i] + context += "\nSub-Answer: " + ledger.answer_trace[i] + "\n" + + return context + +template = """Your task is to answer questions. If you cannot answer the question, you can request use for a tool and break the question into specific sub questions. Fill with Nil where no action is required. You should only return a JSON containing the tool and the generated sub questions. Consider the contextual information and only ask for information that you do not already have. Do not return any other explanations or text. The output should be a simple JSON structure! You are given two tools: +- Search tool +- Math tool + +Do not pass sub questions to any tool if they already have an answer in the Contextual Information. +If you have all the information needed to answer the question, mark the Tool_Request as Nil. + +Contextual Information: +{{ context }} + +Question: +{{ question }} + +{"Tool_Request": "", "Generated Sub Questions": []} +""" + +class CustomPromptTemplate(BaseChatPromptTemplate): + template: str + tools: List[Tool] + ledger: Ledger + + def format_messages(self, **kwargs) -> str: + kwargs["context"] = fetch_context(self.ledger).strip("\n") + env = jinja2.Environment() + prompt_template = env.from_string(template) + prompt = prompt_template.render(**kwargs) + logger.info(prompt) + return [HumanMessage(content=prompt)] + + +##### LLM output parser ##### + + +class CustomOutputParser(AgentOutputParser): + class Config: + arbitrary_types_allowed = True + + ledger: Ledger + + def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]: + """ + Make a decision about the tool to be called based on LLM output. 
+ """ + + logger.info(f"LLM Response: {llm_output}") + local_state = json.loads(llm_output) + if ( + local_state["Generated Sub Questions"][0] == "Nil" + or local_state["Tool_Request"] == "Nil" + or self.ledger.trace > 3 + or local_state["Generated Sub Questions"][0] in self.ledger.question_trace + ): + return AgentFinish( + return_values={"output": "success"}, + log=llm_output, + ) + + if local_state["Tool_Request"] == "Search tool": + self.ledger.trace += 1 + + if local_state["Tool_Request"] in ["Search tool", "Math tool"]: + return AgentAction( + tool=local_state["Tool_Request"], + tool_input={"sub_questions": local_state["Generated Sub Questions"]}, + log=llm_output, + ) + raise ValueError(f"Invalid Tool name: {local_state['Tool_Request']}") + + +class QueryDecompositionChatbot(BaseExample): + def ingest_docs(self, file_name: str, filename: str): + """Ingest documents to the VectorDB.""" + + try: + # TODO: Load embedding created in older conversation, memory persistance + # We initialize class in every call therefore it should be global + global vectorstore + # Load raw documents from the directory + # Data is copied to `DOCS_DIR` in common.server:upload_document + _path = os.path.join(DOCS_DIR, filename) + raw_documents = UnstructuredFileLoader(_path).load() + + if raw_documents: + text_splitter = CharacterTextSplitter(chunk_size=settings.text_splitter.chunk_size, chunk_overlap=settings.text_splitter.chunk_overlap) + documents = text_splitter.split_documents(raw_documents) + if vectorstore: + vectorstore.add_documents(documents) + else: + vectorstore = FAISS.from_documents(documents, document_embedder) + logger.info("Vector store created and saved.") + else: + logger.warning("No documents available to process!") + except Exception as e: + logger.error(f"Failed to ingest document due to exception {e}") + raise ValueError("Failed to upload document. Please upload an unstructured text document.") + + + def llm_chain( + self, context: str, question: str, num_tokens: str + ) -> Generator[str, None, None]: + """Execute a simple LLM chain using the components defined above.""" + + logger.info("Using llm to generate response directly without knowledge base.") + prompt_template = ChatPromptTemplate.from_messages( + [ + ( + "system", + settings.prompts.chat_template, + ), + ("user", "{input}"), + ] + ) + + llm = get_llm() + + chain = prompt_template | llm | StrOutputParser() + augmented_user_input = ( + "Context: " + context + "\n\nQuestion: " + question + "\n" + ) + return chain.stream({"input": augmented_user_input}) + + def rag_chain(self, question: str, num_tokens: int) -> Generator[str, None, None]: + """Execute a Retrieval Augmented Generation chain using the components defined above.""" + + logger.info("Using rag to generate response from document") + + set_service_context() + final_context = self.run_agent(question) + logger.info(f"Final Answer from agent: {final_context}") + + final_prompt_template = ChatPromptTemplate.from_messages( + [ + ("human", final_context) + ] + ) + chain = final_prompt_template | llm | StrOutputParser() + + return chain.stream({}) + + + def create_agent(self) -> AgentExecutor: + """ + Creates the tools, chain, output parser and agent used to fetch the full context. 
+ """ + + self.ledger = Ledger() + + tools = [ + Tool(name="Search tool", func=self.search, description="Searches for the answer from a given context."), + Tool(name="Math tool", func=self.math, description="Performs mathematical calculations."), + ] + tool_names = [tool.name for tool in tools] + + prompt = CustomPromptTemplate(template=template, tools=tools, input_variables=["question"], ledger=self.ledger) + output_parser = CustomOutputParser(ledger=self.ledger) + llm_chain = LLMChain(llm=llm, prompt=prompt) + + recursive_decomposition_agent = LLMSingleActionAgent( + llm_chain=llm_chain, output_parser=output_parser, stop=["\n\n"], allowed_tools=tool_names + ) + + agent_executor = AgentExecutor.from_agent_and_tools(agent=recursive_decomposition_agent, tools=tools, verbose=True) + return agent_executor + + + def run_agent(self, question: str): + """ + Run question on the agent + """ + + agent_executor = self.create_agent() + agent_executor.invoke({"question": question}) + + ##### LLM call to get final answer ###### + + prompt = "Question: " + question + "\n\n" + prompt += "Sub Questions and Answers\n" + for i in range(len(self.ledger.question_trace)): + prompt += "Sub Question: " + str(self.ledger.question_trace[i]) + "\n" + prompt += "Sub Answer: " + str(self.ledger.answer_trace[i]) + "\n" + prompt += "\nFinal Answer: " + + return prompt + + def retriever(self, query: str) -> List[str]: + """ + Searches for the answer from a given context. + """ + + if vectorstore is None: + return [] + + retriever = vectorstore.as_retriever() + result = retriever.get_relevant_documents(query) + logger.info(result) + return [hit.page_content for hit in result] + + + def extract_answer(self, chunks: List[str], question: str) -> str: + """ + Find the answer to the query from the retrieved chunks + """ + + prompt = "Below is a Question and set of Passages that may or may not be relevant. Your task is to Extract the answer for question using only the information available in the passages. Be as concise as possible and only include the answer if present. Do not infer or process the passage in any other way\n\n" + prompt += "Question: " + question + "\n\n" + for idx, chunk in enumerate(chunks): + prompt += f"Passage {idx + 1}:\n" + prompt += chunk + "\n" + + answer = llm([HumanMessage(content=prompt)]) + return answer.content + + + def search(self, sub_questions: List[str]): + """ + Search for the answer for each subquestion and add them to the ledger. + """ + + logger.info(f"Entering search with subquestions: {sub_questions}") + for sub_question in sub_questions: + chunk = self.retriever(sub_question) + sub_answer = self.extract_answer(chunk, sub_question) + + self.ledger.question_trace.append(sub_question) + self.ledger.answer_trace.append(sub_answer) + + + def math(self, sub_questions: List[str]): + """ + Places an LLM call to answer mathematical subquestions which do not require search + """ + + prompt = "Solve this mathematical question:\nQuestion: " + sub_questions[0] + prompt += f"Context:\n{fetch_context(self.ledger)}\n" + prompt += "Be concise and only return the answer." 
+ + logger.info(f"Performing Math LLM call with prompt: {prompt}") + sub_answer = llm([HumanMessage(content=prompt)]) + self.ledger.question_trace.append(sub_questions[0]) + self.ledger.answer_trace.append(sub_answer.content) + + self.ledger.done = True + + def document_search(self, content: str, num_docs: int) -> List[Dict[str, Any]]: + """Search for the most relevant documents for the given search parameters.""" + + try: + retriever = get_doc_retriever(num_nodes=num_docs) + nodes = retriever.retrieve(content) + output = [] + for node in nodes: + file_name = nodes[0].metadata["filename"] + decoded_filename = base64.b64decode(file_name.encode("utf-8")).decode("utf-8") + entry = {"score": node.score, "source": decoded_filename, "content": node.text} + output.append(entry) + + return output + + except Exception as e: + logger.error(f"Error from /documentSearch endpoint. Error details: {e}") + return [] diff --git a/RetrievalAugmentedGeneration/examples/query_decomposition_rag/requirements.txt b/RetrievalAugmentedGeneration/examples/query_decomposition_rag/requirements.txt new file mode 100644 index 00000000..39556ee6 --- /dev/null +++ b/RetrievalAugmentedGeneration/examples/query_decomposition_rag/requirements.txt @@ -0,0 +1 @@ +faiss-cpu==1.7.4 \ No newline at end of file diff --git a/RetrievalAugmentedGeneration/frontend/Dockerfile b/RetrievalAugmentedGeneration/frontend/Dockerfile index be5975b6..5f8192db 100644 --- a/RetrievalAugmentedGeneration/frontend/Dockerfile +++ b/RetrievalAugmentedGeneration/frontend/Dockerfile @@ -1,12 +1,14 @@ FROM docker.io/library/python:3.11-slim -COPY frontend /app/frontend +RUN mkdir /app COPY requirements.txt /app RUN apt-get update; \ apt-get upgrade -y; \ python3 -m pip --no-cache-dir install -r /app/requirements.txt; \ + python3 -m pip --no-cache-dir install nvidia-riva-client==2.14.0; \ apt-get clean USER 1001 +COPY frontend /app/frontend WORKDIR /app ENTRYPOINT ["python3", "-m", "frontend"] diff --git a/RetrievalAugmentedGeneration/frontend/frontend/asr_utils.py b/RetrievalAugmentedGeneration/frontend/frontend/asr_utils.py new file mode 100644 index 00000000..a15aefed --- /dev/null +++ b/RetrievalAugmentedGeneration/frontend/frontend/asr_utils.py @@ -0,0 +1,231 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
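The decomposition agent above expects the LLM to reply with a small JSON object naming a tool and the generated sub-questions, and it stops once the tool request is `Nil` or the trace budget is exhausted. A stripped-down sketch of that control loop with stubbed tools; the JSON keys mirror the diff, while `ask_llm` and the entries in `tools` are placeholders for the real search and math calls:

```python
# Stripped-down decomposition loop; ask_llm and the tool callables are placeholder stubs.
import json
from typing import Callable, Dict, List

def decompose(question: str,
              ask_llm: Callable[[str], str],
              tools: Dict[str, Callable[[List[str]], None]],
              max_steps: int = 3) -> None:
    """Ask the LLM for tool requests until it answers 'Nil' or the step budget runs out."""
    for _ in range(max_steps):
        # Expected shape: {"Tool_Request": "...", "Generated Sub Questions": [...]}
        reply = json.loads(ask_llm(question))
        tool = reply["Tool_Request"]
        sub_questions = reply["Generated Sub Questions"]
        if tool == "Nil" or not sub_questions or sub_questions[0] == "Nil":
            return
        if tool not in tools:
            raise ValueError(f"Invalid tool name: {tool}")
        # Each tool answers its sub-questions and records them in the ledger,
        # so the next prompt carries the accumulated context.
        tools[tool](sub_questions)
```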
+ +import queue +from threading import Thread + +import os +import logging +import grpc +import pycountry +import gradio as gr +import numpy as np +import riva.client +import riva.client.proto.riva_asr_pb2 as riva_asr +import riva.client.proto.riva_asr_pb2_grpc as rasr_srv +from google.protobuf import text_format + +class ASRSession: + def __init__(self): + self.is_first_buffer = True + self.request_queue = None + self.response_stream = None + self.response_thread = None + self.transcript = "" + +_LOGGER = logging.getLogger(__name__) + +# Extract environmental variables +RIVA_API_URI = os.getenv("RIVA_API_URI", None) +RIVA_API_KEY = os.getenv("RIVA_API_KEY", None) +RIVA_FUNCTION_ID = os.getenv("RIVA_FUNCTION_ID", None) + +# Establish a connection to the Riva server +try: + use_ssl = False + metadata = [] + auth = None + if RIVA_API_KEY: + use_ssl = True + metadata.append(("authorization", "Bearer " + RIVA_API_KEY)) + if RIVA_FUNCTION_ID: + use_ssl = True + metadata.append(("function-id", RIVA_FUNCTION_ID)) + auth = riva.client.Auth( + None, use_ssl=use_ssl, + uri=RIVA_API_URI, + metadata_args=metadata + ) + _LOGGER.info('Created riva.client.Auth success') +except: + _LOGGER.info('Error creating riva.client.Auth') + +# Obtain the ASR languages available on the Riva server +ASR_LANGS = dict() + +try: + _LOGGER.info("Available ASR languages") + asr_client = riva.client.ASRService(auth) + config_response = asr_client.stub.GetRivaSpeechRecognitionConfig(riva_asr.RivaSpeechRecognitionConfigRequest()) + for model_config in config_response.model_config: + if model_config.parameters["decoder_type"] and model_config.model_name.endswith("streaming"): + language_code = model_config.parameters['language_code'] + language_name = f"{pycountry.languages.get(alpha_2=language_code[:2]).name} ({language_code})" + _LOGGER.info(f"{language_name} {model_config.model_name}") + ASR_LANGS[language_name] = {"language_code": language_code, "model": model_config.model_name} +except: + ASR_LANGS["No ASR languages available"] = "No ASR languages available" + gr.Info('The app could not find any available ASR languages. Thus, none will appear in the "ASR Language" dropdown menu. Check that you are connected to a Riva server with ASR enabled.') + _LOGGER.info('The app could not find any available ASR languages. Thus, none will appear in the "ASR Language" dropdown menu. Check that you are connected to a Riva server with ASR enabled.') + +ASR_LANGS = dict(sorted(ASR_LANGS.items())) + +def print_streaming_response(asr_session): + asr_session.transcript = "" + final_transcript = "" + try: + for response in asr_session.response_stream: + final = "" + partial = "" + if not response.results: + continue + if len(response.results) > 0 and len(response.results[0].alternatives) > 0: + for result in response.results: + if result.is_final: + final += result.alternatives[0].transcript + else: + partial += result.alternatives[0].transcript + + final_transcript += final + asr_session.transcript = final_transcript + partial + + except grpc.RpcError as rpc_error: + _LOGGER.error(rpc_error.code(), rpc_error.details()) + # TODO See if Gradio popup error mechanism can be used. + # For now whow error via transcript text box. 
+ asr_session.transcript = rpc_error.details() + return + +def start_recording(audio, language, asr_session): + _LOGGER.info('start_recording') + asr_session.is_first_buffer = True + asr_session.request_queue = queue.Queue() + return "", asr_session + +def stop_recording(asr_session): + _LOGGER.info('stop_recording') + try: + asr_session.request_queue.put(None) + asr_session.response_thread.join() + except: + pass + return asr_session + +def transcribe_streaming(audio, language, asr_session, auth=auth): + _LOGGER.info('transcribe_streaming') + + if auth == None: + _LOGGER.info('Riva client did not initialize properly. Skipping transcription.') + return None, None + + if language == 'No ASR languages available': + gr.Info('The app cannot access ASR services. Any attempt to transcribe audio will be unsuccessful. Check that you are connected to a Riva server with ASR enabled.') + _LOGGER.info('The app cannot access ASR services. Any attempt to transcribe audio will be unsuccessful. Check that you are connected to a Riva server with ASR enabled.') + return None, None + rate, data = audio + if len(data.shape) > 1: + data = np.mean(data, axis=1) + + if not len(data): + return asr_session.transcript, asr_session + + if asr_session.is_first_buffer: + + streaming_config = riva.client.StreamingRecognitionConfig( + config=riva.client.RecognitionConfig( + encoding=riva.client.AudioEncoding.LINEAR_PCM, + language_code=ASR_LANGS[language]['language_code'], + max_alternatives=1, + profanity_filter=False, + enable_automatic_punctuation=True, + verbatim_transcripts=False, + sample_rate_hertz=rate, + audio_channel_count=1, + enable_word_time_offsets=True, + model=ASR_LANGS[language]['model'], + ), + interim_results=True, + ) + + _LOGGER.info(f'auth.channel = {auth.channel}') + rasr_stub = rasr_srv.RivaSpeechRecognitionStub(auth.channel) + asr_session.response_stream = rasr_stub.StreamingRecognize(iter(asr_session.request_queue.get, None)) + + # First buffer should contain only the config + request = riva_asr.StreamingRecognizeRequest(streaming_config=streaming_config) + asr_session.request_queue.put(request) + + asr_session.response_thread = Thread(target=print_streaming_response, args=(asr_session,)) + + # run the thread + asr_session.response_thread.start() + + asr_session.is_first_buffer = False + + request = riva_asr.StreamingRecognizeRequest(audio_content=data.astype(np.int16).tobytes()) + asr_session.request_queue.put(request) + + return asr_session.transcript, asr_session + +def transcribe_offline(audio, language, diarization, auth=auth): + _LOGGER.info('transcribe_offline') + + if auth == None: + _LOGGER.info('Riva client did not initialize properly. Skipping transcription.') + return None, None + + if language == 'No ASR languages available': + gr.Info('The app cannot access ASR services. Any attempt to transcribe audio will be unsuccessful. Check that you are connected to a Riva server with ASR enabled.') + _LOGGER.info('The app cannot access ASR services. Any attempt to transcribe audio will be unsuccessful. 
Check that you are connected to a Riva server with ASR enabled.') + return None, None + rate, data = audio + if len(data.shape) > 1: + data = np.mean(data, axis=1) + + if not len(data): + _LOGGER.info("Empty audio provided") + return None, None + + asr_dict = next((d for d in asr_config if d['asr_language_name'] == language), None) + + config = riva.client.RecognitionConfig( + encoding=riva.client.AudioEncoding.LINEAR_PCM, + sample_rate_hertz=rate, + audio_channel_count=1, + language_code=ASR_LANGS[language]['language_code'], + max_alternatives=1, + profanity_filter=False, + enable_automatic_punctuation=True, + verbatim_transcripts=False, + enable_word_time_offsets=True, + ) + riva.client.add_speaker_diarization_to_config(config, diarization) + + asr_client = riva.client.ASRService(auth) + try: + response = asr_client.offline_recognize(data.astype(np.int16).tobytes(), config) + if len(response.results) > 0 and len(response.results[0].alternatives) > 0: + final_transcript = "" + for res in response.results: + final_transcript += res.alternatives[0].transcript + return final_transcript, text_format.MessageToString(response, as_utf8=True) + except grpc.RpcError as rpc_error: + _LOGGER.info(f"{rpc_error.code()}, {rpc_error.details()}") + # TODO See if Gradio popup error mechanism can be used. + # For now whow error via transcript text box. + latest_transcript = rpc_error.details() + return latest_transcript, None + + return latest_transcript, None \ No newline at end of file diff --git a/RetrievalAugmentedGeneration/frontend/frontend/assets/kaizen-theme.css b/RetrievalAugmentedGeneration/frontend/frontend/assets/kaizen-theme.css index 04e93049..237f3ddd 100644 --- a/RetrievalAugmentedGeneration/frontend/frontend/assets/kaizen-theme.css +++ b/RetrievalAugmentedGeneration/frontend/frontend/assets/kaizen-theme.css @@ -11,3 +11,18 @@ footer { visibility: hidden; } + +.record-button { + width: 35px !important; + overflow: hidden !important; +} +.record-button::before { + content: "🎤" !important; + background-color: var(--block-background-fill) !important; +} +.mic-wrap {float: left} +#microphone{min-width: min(0px, 100%) !important;} +#microphone div.small { + width: 25px; + height: 100%; +} diff --git a/RetrievalAugmentedGeneration/frontend/frontend/chat_client.py b/RetrievalAugmentedGeneration/frontend/frontend/chat_client.py index b2cac45a..0b95f957 100644 --- a/RetrievalAugmentedGeneration/frontend/frontend/chat_client.py +++ b/RetrievalAugmentedGeneration/frontend/frontend/chat_client.py @@ -20,6 +20,8 @@ import requests +from frontend import tracing + _LOGGER = logging.getLogger(__name__) @@ -37,12 +39,16 @@ def model_name(self) -> str: """Return the friendly model name.""" return self._model_name + @tracing.instrumentation_wrapper def search( - self, prompt: str + self, carrier, prompt: str ) -> typing.List[typing.Dict[str, typing.Union[str, float]]]: """Search for relevant documents and return json data.""" data = {"content": prompt, "num_docs": 4} - headers = {"accept": "application/json", "Content-Type": "application/json"} + headers = { + **carrier, + "accept": "application/json", "Content-Type": "application/json" + } url = f"{self.server_url}/documentSearch" _LOGGER.debug( "looking up documents - %s", str({"server_url": url, "post_data": data}) @@ -62,8 +68,9 @@ def search( ) + @tracing.predict_instrumentation_wrapper def predict( - self, query: str, use_knowledge_base: bool, num_tokens: int + self, carrier, query: str, use_knowledge_base: bool, num_tokens: int ) -> 
typing.Generator[str, None, None]: """Make a model prediction.""" data = { @@ -78,8 +85,7 @@ def predict( ) try: - with requests.post(url, stream=True, json=data, timeout=10) as req: - + with requests.post(url, stream=True, json=data, timeout=30, headers=carrier) as req: req.raise_for_status() for chunk in req.iter_content(16): yield chunk.decode("UTF-8") @@ -87,10 +93,16 @@ def predict( _LOGGER.error(f"Failed to get response from /generate endpoint of chain-server. Error details: {e}. Refer to chain-server logs for details.") yield str("Failed to get response from /generate endpoint of chain-server. Check if the fastapi server in chain-server is up. Refer to chain-server logs for details.") - def upload_documents(self, file_paths: typing.List[str]) -> None: + # Send None to indicate end of response + yield None + + + @tracing.instrumentation_wrapper + def upload_documents(self, carrier, file_paths: typing.List[str]) -> None: """Upload documents to the kb.""" url = f"{self.server_url}/uploadDocument" headers = { + **carrier, "accept": "application/json", } @@ -105,8 +117,11 @@ def upload_documents(self, file_paths: typing.List[str]) -> None: str({"server_url": url, "file": fpath}), ) - _ = requests.post( + resp = requests.post( url, headers=headers, files=files, timeout=600 # type: ignore [arg-type] ) + if resp.status_code == 500: + raise ValueError(f"{resp.json().get('message', 'Failed to upload document')}") except Exception as e: - _LOGGER.error(f"Failed to get response from /uploadDocument endpoint of chain-server. Error details: {e}. Refer to chain-server logs for details.") \ No newline at end of file + _LOGGER.error(f"Failed to get response from /uploadDocument endpoint of chain-server. Error details: {e}. Refer to chain-server logs for details.") + raise ValueError(f"{e}") diff --git a/RetrievalAugmentedGeneration/frontend/frontend/pages/converse.py b/RetrievalAugmentedGeneration/frontend/frontend/pages/converse.py index 2671f5b0..414e6a87 100644 --- a/RetrievalAugmentedGeneration/frontend/frontend/pages/converse.py +++ b/RetrievalAugmentedGeneration/frontend/frontend/pages/converse.py @@ -20,7 +20,7 @@ import gradio as gr -from frontend import assets, chat_client +from frontend import assets, chat_client, asr_utils, tts_utils _LOGGER = logging.getLogger(__name__) PATH = "/converse" @@ -38,16 +38,21 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: - """Buiild the gradio page to be mounted in the frame.""" + """Build the gradio page to be mounted in the frame.""" kui_theme, kui_styles = assets.load_theme("kaizen") with gr.Blocks(title=TITLE, theme=kui_theme, css=kui_styles + _LOCAL_CSS) as page: + + # session specific state across runs + state = gr.State(value=asr_utils.ASRSession()) + # create the page header gr.Markdown(f"# {TITLE}") # chat logs with gr.Row(equal_height=True): chatbot = gr.Chatbot(scale=2, label=client.model_name) + latest_response = gr.Textbox(visible=False) context = gr.JSON( scale=1, label="Knowledge Base Context", @@ -55,16 +60,82 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: elem_id="contextbox", ) + # TTS output box + # visible so that users can stop or replay playback + with gr.Row(): + output_audio = gr.Audio( + label="Synthesized Speech", + autoplay=True, + interactive=False, + streaming=True, + visible=True, + show_download_button=False + ) + + # check boxes with gr.Row(): - with gr.Column(scale=10, min_width=600): + with gr.Column(scale=10, min_width=150): kb_checkbox = gr.Checkbox( label="Use knowledge base", info="", 
value=False ) + with gr.Column(scale=10, min_width=150): + tts_checkbox = gr.Checkbox( + label="Enable TTS output", info="", value=False + ) + + # dropdowns + with gr.Accordion("ASR and TTS Settings"): + with gr.Row(): + asr_language_list = list(asr_utils.ASR_LANGS) + asr_language_dropdown = gr.components.Dropdown( + label="ASR Language", + choices=asr_language_list, + value=asr_language_list[0], + ) + tts_language_list = list(tts_utils.TTS_MODELS) + tts_language_dropdown = gr.components.Dropdown( + label="TTS Language", + choices=tts_language_list, + value=tts_language_list[0], + ) + all_voices = [] + try: + for model in tts_utils.TTS_MODELS: + all_voices.extend(tts_utils.TTS_MODELS[model]['voices']) + default_voice = tts_utils.TTS_MODELS[tts_language_list[0]]['voices'][0] + except: + all_voices.append("No TTS voices available") + default_voice = "No TTS voices available" + tts_voice_dropdown = gr.components.Dropdown( + label="TTS Voice", + choices=all_voices, + value=default_voice, + ) + + # audio and text input boxes + with gr.Row(): + with gr.Column(scale=10, min_width=500): msg = gr.Textbox( show_label=False, placeholder="Enter text and press ENTER", container=False, ) + # For (at least) Gradio 3.39.0 and lower, the first argument + # in the list below is named `source`. If not None, it must + # be a single string, namely either "upload" or "microphone". + # For more recent Gradio versions (such as 4.4.1), it's named + # `sources`, plural. If not None, it must be a list, containing + # either "upload", "microphone", or both. + audio_mic = gr.Audio( + sources=["microphone"], + type="numpy", + streaming=True, + visible=True, + label="Transcribe Audio Query", + show_label=False, + container=False, + elem_id="microphone", + ) # user feedback with gr.Row(): @@ -73,7 +144,7 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: # _ = gr.Button(value="⚠️ Flag") submit_btn = gr.Button(value="Submit") _ = gr.ClearButton(msg) - _ = gr.ClearButton([msg, chatbot], value="Clear history") + _ = gr.ClearButton([msg, chatbot], value="Clear History") ctx_show = gr.Button(value="Show Context") ctx_hide = gr.Button(value="Hide Context", visible=False) @@ -95,10 +166,49 @@ def _toggle_context(btn: str) -> Dict[gr.component, Dict[Any, Any]]: # form actions _my_build_stream = functools.partial(_stream_predict, client) msg.submit( - _my_build_stream, [kb_checkbox, msg, chatbot], [msg, chatbot, context] + _my_build_stream, [kb_checkbox, msg, chatbot], [msg, chatbot, context, latest_response] ) submit_btn.click( - _my_build_stream, [kb_checkbox, msg, chatbot], [msg, chatbot, context] + _my_build_stream, [kb_checkbox, msg, chatbot], [msg, chatbot, context, latest_response] + ) + + tts_language_dropdown.change( + tts_utils.update_voice_dropdown, + [tts_language_dropdown], + [tts_voice_dropdown], + api_name=False + ) + + audio_mic.start_recording( + asr_utils.start_recording, + [audio_mic, asr_language_dropdown, state], + [msg, state], + api_name=False, + ) + audio_mic.stop_recording( + asr_utils.stop_recording, + [state], + [state], + api_name=False + ) + audio_mic.stream( + asr_utils.transcribe_streaming, + [audio_mic, asr_language_dropdown, state], + [msg, state], + api_name=False + ) + audio_mic.clear( + lambda: "", + [], + [msg], + api_name=False + ) + + latest_response.change( + tts_utils.text_to_speech, + [latest_response, tts_language_dropdown, tts_voice_dropdown, tts_checkbox], + [output_audio], + api_name=False ) page.queue() @@ -121,8 +231,11 @@ def _stream_predict( documents: Union[None, 
List[Dict[str, Union[str, float]]]] = None if use_knowledge_base: - documents = client.search(question) - - for chunk in client.predict(question, use_knowledge_base, OUTPUT_TOKENS): - chunks += chunk - yield "", chat_history + [[question, chunks]], documents + documents = client.search(prompt = question) + + for chunk in client.predict(query=question, use_knowledge_base=use_knowledge_base, num_tokens=OUTPUT_TOKENS): + if chunk: + chunks += chunk + yield "", chat_history + [[question, chunks]], documents, "" + else: + yield "", chat_history + [[question, chunks]], documents, chunks diff --git a/RetrievalAugmentedGeneration/frontend/frontend/pages/kb.py b/RetrievalAugmentedGeneration/frontend/frontend/pages/kb.py index c38c846c..35327f59 100644 --- a/RetrievalAugmentedGeneration/frontend/frontend/pages/kb.py +++ b/RetrievalAugmentedGeneration/frontend/frontend/pages/kb.py @@ -62,16 +62,19 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks: def upload_file(files: List[Path], client: chat_client.ChatClient) -> List[str]: """Use the client to upload a file to the knowledge base.""" - file_paths = [file.name for file in files] - client.upload_documents(file_paths) - - # Save the uploaded file names to the state file - with open(STATE_FILE, 'a') as file: - for file_path in file_paths: - file_path = os.path.basename(file_path) - file.write(file_path + '\n') - - return file_paths + try: + file_paths = [file.name for file in files] + client.upload_documents(file_paths = file_paths) + + # Save the uploaded file names to the state file + with open(STATE_FILE, 'a') as file: + for file_path in file_paths: + file_path = os.path.basename(file_path) + file.write(file_path + '\n') + + return file_paths + except Exception as e: + raise gr.Error(f"{e}") def get_uploaded_files(): """Load previously uploaded files if the file exists""" diff --git a/RetrievalAugmentedGeneration/frontend/frontend/tracing.py b/RetrievalAugmentedGeneration/frontend/frontend/tracing.py new file mode 100644 index 00000000..945ae00f --- /dev/null +++ b/RetrievalAugmentedGeneration/frontend/frontend/tracing.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
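+ +# This module wires OpenTelemetry tracing into the frontend. It configures a tracer whose spans are +# exported over OTLP/gRPC when the ENABLE_TRACING environment variable is "true", and it provides +# decorators that wrap ChatClient methods so each call is recorded as a span and the span context is +# passed to the wrapped method as a "carrier" dict of HTTP headers, propagating the trace to the chain server.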
+ +import os +from opentelemetry import trace +from opentelemetry.sdk.resources import SERVICE_NAME, Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator +from opentelemetry.propagate import set_global_textmap, get_global_textmap +from opentelemetry.propagators.composite import CompositePropagator + +# Configure tracer used by the Frontend to create spans +resource = Resource.create({ + SERVICE_NAME: "frontend" +}) +provider = TracerProvider(resource=resource) +if os.environ.get("ENABLE_TRACING") == "true": + processor = SimpleSpanProcessor(OTLPSpanExporter()) + provider.add_span_processor(processor) +trace.set_tracer_provider(provider) +tracer = trace.get_tracer("frontend") + +# Configure Propagator used for processing trace context received by the Frontend +if os.environ.get("ENABLE_TRACING") == "true": + propagator = TraceContextTextMapPropagator() +else: + propagator = CompositePropagator([]) # No-op propagator + +set_global_textmap(propagator) + +# Include the contents of carrier in an HTTP header +# to propagate the span context into another microservice +def inject_context(ctx): + carrier = {} + get_global_textmap().inject(carrier, context=ctx) + return carrier + +# Wrapper Function to perform instrumentation +def instrumentation_wrapper(func): + def wrapper(self, *args, **kwargs): + span_name = func.__name__ + span = tracer.start_span(span_name) + span_ctx = trace.set_span_in_context(span) + carrier = inject_context(span_ctx) + [span.set_attribute(f"{kw}", kwargs[kw]) for kw in kwargs] + result = func(self, carrier, *args, **kwargs) + span.end() + return result + return wrapper + +# Wrapper function for the streaming predict call +def predict_instrumentation_wrapper(func): + def wrapper(self, *args, **kwargs): + span_name = func.__name__ + span = tracer.start_span(span_name) + span_ctx = trace.set_span_in_context(span) + [span.set_attribute(f"{kw}", kwargs[kw]) for kw in kwargs] + carrier = inject_context(span_ctx) + constructed_response = "" + for chunk in func(self, carrier, *args, **kwargs): + if chunk: + constructed_response += chunk + yield chunk + span.set_attribute("response", constructed_response) + span.end() + return wrapper \ No newline at end of file diff --git a/RetrievalAugmentedGeneration/frontend/frontend/tts_utils.py b/RetrievalAugmentedGeneration/frontend/frontend/tts_utils.py new file mode 100644 index 00000000..ac65a025 --- /dev/null +++ b/RetrievalAugmentedGeneration/frontend/frontend/tts_utils.py @@ -0,0 +1,150 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
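+ +# Riva text-to-speech helpers for the Gradio frontend: create an authenticated connection to the Riva +# server (optionally with an API key and function ID), discover the available TTS languages and voices +# used to populate the dropdown menus, and stream synthesized speech back to the UI as +# (sample_rate, int16 numpy buffer) tuples.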
+ +import os +import time +import json +import logging +import pycountry +from pathlib import Path +from threading import Thread +from typing import TYPE_CHECKING, Any, List +import gradio as gr +import numpy as np +import riva.client +import riva.client.proto.riva_tts_pb2 as riva_tts + +_LOGGER = logging.getLogger(__name__) + +# Extract environmental variables +RIVA_API_URI = os.getenv("RIVA_API_URI", None) +RIVA_API_KEY = os.getenv("RIVA_API_KEY", None) +RIVA_FUNCTION_ID = os.getenv("RIVA_FUNCTION_ID", None) + +try: + tts_sample_rate = int(os.getenv("TTS_SAMPLE_RATE", 48000)) +except Exception as e: + _LOGGER.info('TTS_SAMPLE_RATE is not set to an integer value. Defaulting to 48000.') + tts_sample_rate = 48000 + +# Establish a connection to the Riva server +try: + use_ssl = False + metadata = [] + auth = None + if RIVA_API_KEY: + use_ssl = True + metadata.append(("authorization", "Bearer " + RIVA_API_KEY)) + if RIVA_FUNCTION_ID: + use_ssl = True + metadata.append(("function-id", RIVA_FUNCTION_ID)) + auth = riva.client.Auth( + None, use_ssl=use_ssl, + uri=RIVA_API_URI, + metadata_args=metadata + ) + _LOGGER.info('Created riva.client.Auth success') +except: + _LOGGER.info('Error creating riva.client.Auth') + +# Obtain the TTS languages and voices available on the Riva server +TTS_MODELS = dict() +try: + tts_client = riva.client.SpeechSynthesisService(auth) + config_response = tts_client.stub.GetRivaSynthesisConfig(riva_tts.RivaSynthesisConfigRequest()) + for model_config in config_response.model_config: + language_code = model_config.parameters['language_code'] + language_name = f"{pycountry.languages.get(alpha_2=language_code[:2]).name} ({language_code})" + voice_name = model_config.parameters['voice_name'] + subvoices = [voice.split(':')[0] for voice in model_config.parameters['subvoices'].split(',')] + full_voice_names = [voice_name + "." + subvoice for subvoice in subvoices] + + if language_name in TTS_MODELS: + TTS_MODELS[language_name]['voices'].extend(full_voice_names) + else: + TTS_MODELS[language_name] = {"language_code": language_code, "voices": full_voice_names} + + TTS_MODELS = dict(sorted(TTS_MODELS.items())) + + _LOGGER.info(json.dumps(TTS_MODELS, indent=4)) +except: + TTS_MODELS["No TTS languages available"] = "No TTS languages available" + gr.Info('The app could not find any available TTS languages. Thus, none will appear in the "TTS Language" or "TTS Voice" dropdown menus. Check that you are connected to a Riva server with TTS enabled.') + _LOGGER.info('The app could not find any available TTS languages. Thus, none will appear in the "TTS Language" or "TTS Voice" dropdown menus. Check that you are connected to a Riva server with TTS enabled.') + +# Once the user selects a TTS language, narrow the options in the TTS voice +# dropdown menu accordingly +def update_voice_dropdown(language): + if language == "No TTS languages available": + voice_dropdown = gr.Dropdown( + label="Voice", choices="No TTS voices available", value="No TTS voices available" + ) + else: + voice_dropdown = gr.Dropdown( + label="Voice", choices=TTS_MODELS[language]['voices'], value=TTS_MODELS[language]['voices'][0] + ) + return voice_dropdown + +def text_to_speech(text, language, voice, enable_tts, auth=auth): + if not enable_tts: + return None + if auth == None: + _LOGGER.info('Riva client did not initialize properly. Skipping text to speech.') + return None, None + if language == "No TTS languages available": + gr.Info('The app cannot access TTS services. 
Any attempt to synthesize audio will be unsuccessful. Check that you are connected to a Riva server with TTS enabled.') + _LOGGER.info('The app cannot access TTS services. Any attempt to synthesize audio will be unsuccessful. Check that you are connected to a Riva server with TTS enabled.') + return None, gr.update(interactive=False) + if not text or not voice or not enable_tts: + gr.Info("Provide all inputs or select an example") + return None, gr.update(interactive=False) + if not text: + gr.Info('No text from which to synthesize a voice has been provided') + return None, gr.update(interactive=False) + if not voice: + gr.Info('No TTS voice or an invalid TTS voice has been selected') + return None, gr.update(interactive=False) + if not enable_tts: + gr.Info('TTS output is currently disabled. Click on the "Enable TTS output" checkbox to enable it.') + return None, gr.update(interactive=False) + + first_buffer = True + start_time = time.time() + + # TODO: Gradio Flagging doesn't work with streaming audio ouptut. + # See https://github.com/gradio-app/gradio/issues/5806 + # TODO: Audio download does not work with streaming audio output. + # See https://github.com/gradio-app/gradio/issues/6570 + + tts_client = riva.client.SpeechSynthesisService(auth) + + response = tts_client.synthesize_online( + text=text, + voice_name=voice, + language_code=TTS_MODELS[language]['language_code'], + sample_rate_hz=tts_sample_rate + ) + for result in response: + if len(result.audio): + if first_buffer: + _LOGGER.info( + f"TTS request [{result.id.value}] first buffer latency: {time.time() - start_time} sec" + ) + first_buffer = False + yield (tts_sample_rate, np.frombuffer(result.audio, dtype=np.int16)) + + _LOGGER.info(f"TTS request [{result.id.value}] last buffer latency: {time.time() - start_time} sec") + + yield (tts_sample_rate, np.frombuffer(b'', dtype=np.int16)) diff --git a/RetrievalAugmentedGeneration/frontend/requirements.txt b/RetrievalAugmentedGeneration/frontend/requirements.txt index a2a85130..78b1c5c8 100644 --- a/RetrievalAugmentedGeneration/frontend/requirements.txt +++ b/RetrievalAugmentedGeneration/frontend/requirements.txt @@ -1,8 +1,12 @@ dataclass_wizard==0.22.2 -gradio==3.39.0 +gradio==4.13.0 jinja2==3.1.2 numpy==1.25.2 protobuf==3.20.3 PyYAML==6.0 tritonclient[all]==2.36.0 uvicorn==0.22.0 +opentelemetry-sdk==1.21.0 +opentelemetry-api==1.21.0 +opentelemetry-exporter-otlp-proto-grpc==1.21.0 +pycountry==23.12.11 diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext deleted file mode 120000 index 056bf100..00000000 --- a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext +++ /dev/null @@ -1 +0,0 @@ -llama \ No newline at end of file diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/ensemble/1/.tmp b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/ensemble/1/.tmp new file mode 100644 index 00000000..e69de29b diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/ensemble/config.pbtxt b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/ensemble/config.pbtxt new file mode 100755 index 00000000..cbd087ce --- /dev/null +++ b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/ensemble/config.pbtxt @@ -0,0 +1,228 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "ensemble" +platform: "ensemble" +max_batch_size: 128 +input [ + { + name: "text_input" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "max_tokens" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "end_id" + data_type: TYPE_UINT32 + dims: [ 1 ] + optional: true + }, + { + name: "pad_id" + data_type: TYPE_UINT32 + dims: [ 1 ] + optional: true + }, + { + name: "top_k" + data_type: TYPE_UINT32 + dims: [ 1 ] + optional: true + }, + { + name: "top_p" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "temperature" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "length_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "repetition_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "min_length" + data_type: TYPE_UINT32 + dims: [ 1 ] + optional: true + }, + { + name: "presence_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "random_seed" + data_type: TYPE_UINT64 + dims: [ 1 ] + optional: true + }, + { + name: "beam_width" + data_type: TYPE_UINT32 + dims: [ 1 ] + optional: true + }, + { + name: "stream" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + } +] +output [ + { + name: "text_output" + data_type: TYPE_STRING + dims: [ -1, -1 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "preprocessing" + model_version: -1 + input_map { + key: "QUERY" + value: "text_input" + } + input_map { + key: "REQUEST_OUTPUT_LEN" + value: "max_tokens" + } + output_map { + key: "REQUEST_INPUT_LEN" + value: "_REQUEST_INPUT_LEN" + } + output_map { + key: "INPUT_ID" + value: "_INPUT_ID" + } + output_map { + key: "REQUEST_OUTPUT_LEN" + value: "_REQUEST_OUTPUT_LEN" + } + }, + { + model_name: "tensorrt_llm" + model_version: -1 + input_map { + key: "input_ids" + value: "_INPUT_ID" + } + input_map { + key: "input_lengths" + value: "_REQUEST_INPUT_LEN" + } + input_map { + key: "request_output_len" + value: "_REQUEST_OUTPUT_LEN" + } + input_map { + key: "end_id" + value: "end_id" + } + input_map { + key: 
"pad_id" + value: "pad_id" + } + input_map { + key: "runtime_top_k" + value: "top_k" + } + input_map { + key: "runtime_top_p" + value: "top_p" + } + input_map { + key: "temperature" + value: "temperature" + } + input_map { + key: "len_penalty" + value: "length_penalty" + } + input_map { + key: "repetition_penalty" + value: "repetition_penalty" + } + input_map { + key: "min_length" + value: "min_length" + } + input_map { + key: "presence_penalty" + value: "presence_penalty" + } + input_map { + key: "random_seed" + value: "random_seed" + } + input_map { + key: "beam_width" + value: "beam_width" + } + input_map { + key: "streaming" + value: "stream" + } + output_map { + key: "output_ids" + value: "_TOKENS_BATCH" + } + }, + { + model_name: "postprocessing" + model_version: -1 + input_map { + key: "TOKENS_BATCH" + value: "_TOKENS_BATCH" + } + output_map { + key: "OUTPUT" + value: "text_output" + } + } + ] +} diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/postprocessing/1/model.py b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/postprocessing/1/model.py new file mode 100755 index 00000000..bb8a7378 --- /dev/null +++ b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/postprocessing/1/model.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- +import json +import os + +import numpy as np +import triton_python_backend_utils as pb_utils +from transformers import LlamaTokenizer + +TOKENIZER_DIR = os.environ.get("TOKENIZER_DIR", "/model") + +SPACE_CHAR = 9601 +NEWLINE_CHAR = 60 +STOP_TOKEN = 2 + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # Parse model configs + self.model_config = model_config = json.loads(args["model_config"]) + + # Parse model output configs + output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT") + + # Convert Triton types to numpy types + self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) + + self.tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_DIR, legacy=False) + vocab = self.tokenizer.convert_ids_to_tokens( + list(range(self.tokenizer.vocab_size)) + ) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. 
+ Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for request in requests: + # Get input tensors + tokens_batch = pb_utils.get_input_tensor_by_name( + request, "TOKENS_BATCH" + ).as_numpy() + + # Reshape Input + # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]]) + # tokens_batch = tokens_batch.T + + # Postprocessing output data. + outputs = self._postprocessing(tokens_batch) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. + output_tensor = pb_utils.Tensor( + "OUTPUT", np.array(outputs).astype(self.output_dtype) + ) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[output_tensor] + ) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + `Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. + """ + pb_utils.Logger.log("Finalizing the Post-Processing Model.") + + def _id_to_token(self, token_id): + # handle special tokens (end of string, unknown, etc) + try: + special_token_index = self.tokenizer.all_special_ids.index(token_id) + return self.tokenizer.all_special_tokens[special_token_index] + except ValueError: + pass + + # handle typical tokens + tokens = self.tokenizer.convert_ids_to_tokens(token_id) + if ord(tokens[0]) == SPACE_CHAR: + return f" {tokens[1:]}" + if ord(tokens[0]) == NEWLINE_CHAR: + return "\n" + return tokens + + def _postprocessing(self, tokens_batch): + tokens_batch = tokens_batch.tolist() + return [ + self._id_to_token(token_id) + for beam_tokens in tokens_batch + for token_ids in beam_tokens + for token_id in token_ids + ] + + # for beam_tokens in tokens_batch: + # for token_ids in beam_tokens: + # for token_id in token_ids: + # # handle special tokens (end of string, unknown, etc) + # special_token = self.tokenizer.added_tokens_decoder.get(token_id) + # if special_token: + # tokens = special_token.content + + # # handle typical tokens + # else: + # tokens = self.tokenizer.convert_ids_to_tokens(token_id) + # if ord(tokens[0]) == SPACE_CHAR: + # tokens = f" {tokens[1:]}" + # elif ord(tokens[0]) == NEWLINE_CHAR: + # tokens = "\n" + + # outputs.append(tokens) + # return outputs diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/postprocessing/config.pbtxt b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/postprocessing/config.pbtxt new file mode 100755 index 00000000..3c3ea10d --- /dev/null +++ b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/postprocessing/config.pbtxt @@ -0,0 +1,50 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "postprocessing" +backend: "python" +max_batch_size: 128 +input [ + { + name: "TOKENS_BATCH" + data_type: TYPE_INT32 + dims: [ -1, -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_STRING + dims: [ -1, -1 ] + } +] + +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/preprocessing/1/model.py b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/preprocessing/1/model.py new file mode 100644 index 00000000..44e8b9c4 --- /dev/null +++ b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/preprocessing/1/model.py @@ -0,0 +1,244 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import csv +import json +import os + +import numpy as np +import torch +import triton_python_backend_utils as pb_utils +from torch.nn.utils.rnn import pad_sequence +from transformers import LlamaTokenizer + +TOKENIZER_DIR = os.environ.get("TOKENIZER_DIR", "/model") + +END_ID = 2 + +# SYSTEM_PROMPT = ( +# """You are a helpful, respectful and honest assistant.""" +# """Always answer as helpfully as possible, while being safe.""" +# """Please ensure that your responses are positive in nature.""" +# ) + +# LLAMA_PROMPT_TEMPLATE = ( +# "[INST] <>" +# "{system_prompt}" +# "<>" +# "[/INST] {context} [INST] {question} [/INST]" +# ) + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # Parse model configs + self.model_config = model_config = json.loads(args["model_config"]) + + # Parse model output configs and convert Triton types to numpy types + input_names = ["INPUT_ID", "REQUEST_INPUT_LEN"] + for input_name in input_names: + setattr( + self, + input_name.lower() + "_dtype", + pb_utils.triton_string_to_numpy( + pb_utils.get_output_config_by_name(model_config, input_name)[ + "data_type" + ] + ), + ) + + self.encoder = LlamaTokenizer.from_pretrained(TOKENIZER_DIR, legacy=False) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. 
+ for request in requests: + # Get input tensors + query = pb_utils.get_input_tensor_by_name(request, "QUERY").as_numpy() + request_output_len = pb_utils.get_input_tensor_by_name( + request, "REQUEST_OUTPUT_LEN" + ).as_numpy() + + input_id, request_input_len = self._create_request(query) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. + input_id_tensor = pb_utils.Tensor( + "INPUT_ID", np.array(input_id).astype(self.input_id_dtype) + ) + request_input_len_tensor = pb_utils.Tensor( + "REQUEST_INPUT_LEN", + np.array(request_input_len).astype(self.request_input_len_dtype), + ) + request_output_len_tensor = pb_utils.Tensor( + "REQUEST_OUTPUT_LEN", request_output_len + ) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + input_id_tensor, + request_input_len_tensor, + request_output_len_tensor, + ] + ) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. + """ + pb_utils.Logger.log("Finalizing the Pre-Processing Model.") + + def _create_request(self, prompts): + """ + prompts : batch string (2D numpy array) + """ + + start_ids = [ + torch.IntTensor(self.encoder.encode(prompt[0].decode())) + for prompt in prompts + ] + + start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids]) + + start_ids = pad_sequence(start_ids, batch_first=True, padding_value=END_ID) + + return start_ids, start_lengths + + def _create_word_list(self, word_dict): + flat_ids = [] + offsets = [] + for word_dict_item in word_dict: + item_flat_ids = [] + item_offsets = [] + + words = list(csv.reader([word_dict_item[0].decode()]))[0] + for word in words: + ids = self._encode(word) + + if len(ids) == 0: + continue + + item_flat_ids += ids + item_offsets.append(len(ids)) + + flat_ids.append(np.array(item_flat_ids)) + offsets.append(np.cumsum(np.array(item_offsets))) + + pad_to = max(1, max(len(ids) for ids in flat_ids)) + + for i, (ids, offs) in enumerate(zip(flat_ids, offsets)): + flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0) + offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1) + + return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2)) + + def to_word_list_format(self, word_dict): + flat_ids = [] + offsets = [] + for word_dict_item in word_dict: + item_flat_ids = [] + item_offsets = [] + + if isinstance(word_dict_item[0], bytes): + word_dict_item = [word_dict_item[0].decode()] + + words = list(csv.reader(word_dict_item))[0] + for word in words: + ids = self.encoder.encode(word) + + if len(ids) == 0: + continue + + item_flat_ids += ids + item_offsets.append(len(ids)) + + flat_ids.append(np.array(item_flat_ids)) + offsets.append(np.cumsum(np.array(item_offsets))) + + pad_to = max(1, max(len(ids) for ids in flat_ids)) + + for i, (ids, offs) in enumerate(zip(flat_ids, offsets)): + flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0) 
+ offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1) + + return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2)) + + def _encode(self, sentence): + sentence = sentence.decode() if isinstance(sentence, bytes) else sentence + return self.encoder.encode(sentence) diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/preprocessing/config.pbtxt b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/preprocessing/config.pbtxt new file mode 100644 index 00000000..d2e3029a --- /dev/null +++ b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/preprocessing/config.pbtxt @@ -0,0 +1,65 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "preprocessing" +backend: "python" +max_batch_size: 128 +input [ + { + name: "QUERY" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "REQUEST_OUTPUT_LEN" + data_type: TYPE_UINT32 + dims: [ -1 ] + } +] +output [ + { + name: "INPUT_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "REQUEST_INPUT_LEN" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "REQUEST_OUTPUT_LEN" + data_type: TYPE_UINT32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/tensorrt_llm/1/.gitkeep b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/tensorrt_llm/1/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/tensorrt_llm/config.pbtxt.j2 b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/tensorrt_llm/config.pbtxt.j2 new file mode 100644 index 00000000..4b719b04 --- /dev/null +++ b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/tensorrt_llm/config.pbtxt.j2 @@ -0,0 +1,208 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "tensorrt_llm" +backend: "tensorrtllm" +max_batch_size: 128 + +model_transaction_policy { + decoupled: {{ decoupled_mode }} +} + +input [ + { + name: "input_ids" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "input_lengths" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + }, + { + name: "request_output_len" + data_type: TYPE_UINT32 + dims: [ 1 ] + }, + { + name: "end_id" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "pad_id" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "beam_width" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "temperature" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_k" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "len_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "repetition_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "min_length" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "presence_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "random_seed" + data_type: TYPE_UINT64 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "stop" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "streaming" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + } +] +output [ + { + name: "output_ids" + data_type: TYPE_INT32 + dims: [ -1, -1 ] + } +] +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] +parameters: { + key: "max_beam_width" + value: { + string_value: "1" + } +} +parameters: { + key: "FORCE_CPU_ONLY_INPUT_TENSORS" + value: { + 
string_value: "no" + } +} +parameters: { + key: "gpt_model_type" + value: { + string_value: "{{ gpt_model_type }}" + } +} +parameters: { + key: "gpt_model_path" + value: { + string_value: "{{ engine_dir }}" + } +} +parameters: { + key: "max_tokens_in_paged_kv_cache" + value: { + string_value: "" + } +} +parameters: { + key: "batch_scheduler_policy" + value: { + string_value: "guaranteed_completion" + } +} +parameters: { + key: "kv_cache_free_gpu_mem_fraction" + value: { + string_value: ".75" + } +} +parameters: { + key: "max_num_sequences" + value: { + string_value: "" + } +} +parameters: { + key: "enable_trt_overlap" + value: { + string_value: "" + } +} diff --git a/RetrievalAugmentedGeneration/llm-inference-server/model_server/__init__.py b/RetrievalAugmentedGeneration/llm-inference-server/model_server/__init__.py index 2c4d3d5a..e3475718 100644 --- a/RetrievalAugmentedGeneration/llm-inference-server/model_server/__init__.py +++ b/RetrievalAugmentedGeneration/llm-inference-server/model_server/__init__.py @@ -120,6 +120,8 @@ def main(args: argparse.Namespace) -> int: # print discovered model parameters _LOGGER.info("Model file format: %s", model.format.name) _LOGGER.info("World Size: %d", model.world_size) + _LOGGER.info("Max input length: %s", args.max_input_length) + _LOGGER.info("Max output length: %s", args.max_output_length) _LOGGER.info("Compute Capability: %s", model.compute_cap) _LOGGER.info("Quantization: %s", conversion_opts.quantization) diff --git a/RetrievalAugmentedGeneration/llm-inference-server/model_server/conversion/nemo.py b/RetrievalAugmentedGeneration/llm-inference-server/model_server/conversion/nemo.py index d0eb10f3..437f3075 100644 --- a/RetrievalAugmentedGeneration/llm-inference-server/model_server/conversion/nemo.py +++ b/RetrievalAugmentedGeneration/llm-inference-server/model_server/conversion/nemo.py @@ -32,7 +32,7 @@ _LOGGER = logging.getLogger(__name__) -def convert(model: Model, _: ConversionOptions) -> None: +def convert(model: Model, opts: ConversionOptions) -> None: """Convert a .nemo formatted model.""" # find the .nemo model file model_files = glob(os.path.join(model.model_dir, "*.nemo")) @@ -52,14 +52,6 @@ def convert(model: Model, _: ConversionOptions) -> None: config = yaml.safe_load(config_file) config_file.close() - if config.get("tensor_model_parallel_size", 1) != model.world_size: - raise ModelServerException( - f"The provided model has a tensor parallelism of {config.get('tensor_model_parallel_size', 1)} " - + f"and the server has been requested to use {model.world_size} " - + "gpus. Please use the NeMo inference container to rezise the parallelism of the model or change " - + "the model-server's world size." - ) - # run the nemo to trt llm conversion trt_llm_exporter = TensorRTLLM(model_dir=model.engine_dir) _LOGGER.info(".nemo to TensorRT Conversion started. 
This will take a few minutes.") @@ -68,4 +60,6 @@ def convert(model: Model, _: ConversionOptions) -> None: nemo_checkpoint_path=model_files[0], model_type=model.family, n_gpus=model.world_size, + max_input_token=opts.max_input_length, + max_output_token=opts.max_output_length ) diff --git a/RetrievalAugmentedGeneration/llm-inference-server/model_server/server.py b/RetrievalAugmentedGeneration/llm-inference-server/model_server/server.py index a9077bf9..272234ec 100644 --- a/RetrievalAugmentedGeneration/llm-inference-server/model_server/server.py +++ b/RetrievalAugmentedGeneration/llm-inference-server/model_server/server.py @@ -47,6 +47,8 @@ def _decoupled_mode(self) -> str: @property def _allow_http(self) -> str: """Indicate if Triton should allow http connections.""" + if self._model.format == ModelFormats.NEMO: + return "true" return "true" if self._http else "false" @property diff --git a/RetrievalAugmentedGeneration/llm-inference-server/tools/resize_nemo_model.sh b/RetrievalAugmentedGeneration/llm-inference-server/tools/resize_nemo_model.sh new file mode 100755 index 00000000..fa35fe16 --- /dev/null +++ b/RetrievalAugmentedGeneration/llm-inference-server/tools/resize_nemo_model.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -x + +MODEL_STORE="$1" +MODEL_IN="$2" +MODEL_IN_DIR=$(cd $(dirname "$MODEL_IN"); pwd) +MODEL_OUT="$3" +MODEL_OUT_DIR=$(cd $(dirname "$MODEL_OUT"); pwd) +TARGET_SIZE="$4" + +TRAINING_CONTAINER="nvcr.io/nvaie/nemo-framework-training:23.08.03" + +# init +echo $MODEL_IN " -> " $MODEL_OUT +cd "$MODEL_STORE" +mkdir -p "$MODEL_OUT_DIR" + +# find tokenizer +tar xvf $MODEL_IN model_config.yaml +mv model_config.yaml "$MODEL_OUT_DIR" +tokenizer=$(grep "tokenizer_model" gpt_8b_strict_skua_bf16_nemo_yi_dong_us_v1.0-tp1/model_config.yaml | awk -F: '{ + print $3 }') +tar xvf $MODEL_IN $tokenizer +mv $tokenizer $MODEL_OUT_DIR + +# run conversion +docker run --rm -it --gpus all --ipc host \ + -v $MODEL_STORE:$MODEL_STORE \ + -w $MODEL_STORE \ + $TRAINING_CONTAINER \ + /usr/bin/python3 \ + /opt/NeMo/examples/nlp/language_modeling/megatron_change_num_partitions.py \ + --model_file $MODEL_IN \ + --target_file $MODEL_OUT \ + --tensor_model_parallel_size=-1 \ + --target_tensor_model_parallel_size=$TARGET_SIZE \ + --pipeline_model_parallel_size=-1 \ + --target_pipeline_model_parallel_size=1 \ + --precision=bf16 \ + --tokenizer_model_path $MODEL_OUT_DIR/$tokenizer diff --git a/RetrievalAugmentedGeneration/requirements.txt b/RetrievalAugmentedGeneration/requirements.txt index b2f9b81a..2a9f6214 100644 --- a/RetrievalAugmentedGeneration/requirements.txt +++ b/RetrievalAugmentedGeneration/requirements.txt @@ -1,12 +1,21 @@ fastapi==0.104.1 uvicorn[standard]==0.24.0 python-multipart==0.0.6 -langchain==0.0.330 -tritonclient[all]==2.39.0 +langchain==0.0.352 unstructured[all-docs]==0.11.2 sentence-transformers==2.2.2 -llama-index==0.9.13 +llama-index==0.9.22 pymilvus==2.3.1 dataclass-wizard==0.22.2 opencv-python==4.8.0.74 -minio==7.2.0 \ No newline at end of file +minio==7.2.0 +asyncpg==0.29.0 +psycopg2-binary==2.9.9 +pgvector==0.2.4 +langchain-core==0.1.3 +langchain-nvidia-ai-endpoints==0.0.1 +langchain-nvidia-trt==0.0.1rc0 +nemollm==0.3.4 +opentelemetry-sdk==1.21.0 +opentelemetry-api==1.21.0 +opentelemetry-exporter-otlp-proto-grpc==1.21.0 diff --git a/deploy/compose/compose.env b/deploy/compose/compose.env index 3ac2946a..52cb1465 100644 --- a/deploy/compose/compose.env +++ b/deploy/compose/compose.env @@ -1,20 +1,24 @@ # full path to the local copy of the model weights # NOTE: This should be an absolute 
path and not relative path export MODEL_DIRECTORY="/home/nvidia/llama2_13b_chat_hf_v1/" - +# export MODEL_DIRECTORY="/home/nvidia/nemotron-3-8b-chat-4k-sft" # Fill this out if you dont have a GPU. Leave this empty if you have a local GPU -export AI_PLAYGROUND_API_KEY="nvapi-*" +export NVIDIA_API_KEY="nvapi-*" # flag to enable activation aware quantization for the LLM # export QUANTIZATION="int4_awq" -# the architecture of the model. eg: llama +# the architecture of the model. eg: llama, gptnext (for nemotron use gptnext) export MODEL_ARCHITECTURE="llama" + # the name of the model being used - only for displaying on frontend export MODEL_NAME="Llama-2-13b-chat-hf" +# the name of the RAG example being used +export RAG_EXAMPLE="developer_rag" + # [OPTIONAL] the maximum number of input tokens # export MODEL_MAX_INPUT_LENGTH=3000 @@ -29,3 +33,29 @@ export MODEL_NAME="Llama-2-13b-chat-hf" # [OPTIONAL] the config file for chain server w.r.t. pwd export APP_CONFIG_FILE=/dev/null + +# parameters for PGVector, update these when using the PGVector vector store +# export POSTGRES_PASSWORD=password +# export POSTGRES_USER=postgres +# export POSTGRES_DB=api + +### Riva Parameters: + +# Riva Speech API URI: Riva Server IP address/hostname and port +export RIVA_API_URI="" + +# [OPTIONAL] Riva Speech API Key +# If necessary, enter a key to access the Riva API +export RIVA_API_KEY="" + +# [OPTIONAL] Riva Function ID +# If necessary, enter a function ID to access the Riva API +export RIVA_FUNCTION_ID="" + +# TTS sample rate (Hz) +export TTS_SAMPLE_RATE=48000 + +# the config file for the OpenTelemetry collector +export OPENTELEMETRY_CONFIG_FILE="./configs/otel-collector-config.yaml" +# the config file for Jaeger +export JAEGER_CONFIG_FILE="./configs/jaeger.yaml" diff --git a/deploy/compose/config.yaml b/deploy/compose/config.yaml index 19084ca9..0cac6ae7 100644 --- a/deploy/compose/config.yaml +++ b/deploy/compose/config.yaml @@ -1,28 +1,33 @@ -milvus: - # The configuration of the Milvus connection. +vector_store: + # The configuration of the Vector Store connection. + + name: milvus + # The name of vector store db. Can be pgvector or milvus. + # Type: str + # ENV Variable: APP_VECTORSTORE_NAME url: "http://milvus:19530" - # The location of the Milvus Server. + # The location of the VectorStore DB. # Type: str - # ENV Variable: APP_MILVUS_URL + # ENV Variable: APP_VECTORSTORE_URL llm: # The configuration for the server hosting the Large Language models. model_engine: "triton-trt-llm" - # The backend name hosting the model. Options currently supported are: triton-trt-llm, ai-playground + # The backend name hosting the model. Options currently supported are: triton-trt-llm, nv-ai-foundation # Type: str # ENV Variable: APP_LLM_MODELENGINE server_url: "llm:8001" - # The location of the server hosting the large language model. Use this option when model engine is - # set to triton-trt-llm, ignore this option if model_engine is set to "ai-playground" + # The location of the server hosting the large language model. Use this option when model engine is + # set to triton-trt-llm, ignore this option if model_engine is set to "nv-ai-foundation" # Type: str # ENV Variable: APP_LLM_SERVERURL model_name: "ensemble" # if model_engine is "triton-trt-llm" set this to "ensemble" - # if model_engine is "ai-plaground" options are "llama2_13b", "llama2_70b", "mistral_7b" + # if model_engine is "nv-ai-foundation" options are "llama2_13b", "llama2_70b", "mistral_7b" # The name of the hosted model. 
# Type: str # ENV Variable: APP_LLM_MODELNAME @@ -32,6 +37,7 @@ text_splitter: chunk_size: 510 # Chunk size for text splitting. + # When using a token-based text splitter, this is the number of 'tokens per chunk' # Type: int chunk_overlap: 200 @@ -42,7 +48,7 @@ embeddings: # The configuration embedding models. model_name: intfloat/e5-large-v2 - # The name embedding search model from huggingface or ai-playground. + # The name embedding search model from huggingface or nv-ai-foundation. # Type: str dimensions: 1024 @@ -50,7 +56,7 @@ embeddings: # Type: int model_engine: huggingface - # The backend name hosting the model, huggingface and ai-playground are supported. + # The backend name hosting the model, huggingface and nv-ai-foundation are supported. # Type: str prompts: diff --git a/deploy/compose/configs/jaeger.yaml b/deploy/compose/configs/jaeger.yaml new file mode 100644 index 00000000..64d3513c --- /dev/null +++ b/deploy/compose/configs/jaeger.yaml @@ -0,0 +1,3 @@ +query.base-path: /jaeger/ui +cassandra.keyspace: jaeger_v1_dc1 +cassandra.servers: cassandra \ No newline at end of file diff --git a/deploy/compose/configs/otel-collector-config.yaml b/deploy/compose/configs/otel-collector-config.yaml new file mode 100644 index 00000000..69d1cbe4 --- /dev/null +++ b/deploy/compose/configs/otel-collector-config.yaml @@ -0,0 +1,17 @@ +receivers: + otlp: + protocols: + grpc: + # endpoint: 0.0.0.0:4317 + http: + # endpoint: 0.0.0.0:4318 +exporters: + otlp: + endpoint: jaeger:4317 + tls: + insecure: true +service: + pipelines: + traces: + receivers: [otlp] + exporters: [otlp] \ No newline at end of file diff --git a/deploy/compose/docker-compose-evaluation.yaml b/deploy/compose/docker-compose-evaluation.yaml new file mode 100644 index 00000000..caab7e8f --- /dev/null +++ b/deploy/compose/docker-compose-evaluation.yaml @@ -0,0 +1,22 @@ +services: + evaluation: + container_name: evaluation + image: evalulation:latest + build: + context: ../../ + dockerfile: ./tools/evaluation/Dockerfile.eval + ports: + - "8889:8889" + expose: + - "8889" + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + +networks: + default: + name: nvidia-llm \ No newline at end of file diff --git a/deploy/compose/docker-compose-playground.yaml b/deploy/compose/docker-compose-nemotron.yaml similarity index 65% rename from deploy/compose/docker-compose-playground.yaml rename to deploy/compose/docker-compose-nemotron.yaml index 2def0c08..ce00a85a 100644 --- a/deploy/compose/docker-compose-playground.yaml +++ b/deploy/compose/docker-compose-nemotron.yaml @@ -1,33 +1,47 @@ services: - jupyter-server: - container_name: notebook-server - image: notebook-server:latest + llm: + container_name: llm-inference-server + image: llm-inference-server:latest build: - context: ../../ - dockerfile: ./notebooks/Dockerfile.notebooks + context: ../.././RetrievalAugmentedGeneration/llm-inference-server/ + dockerfile: Dockerfile + volumes: + - ${MODEL_DIRECTORY:?please update the env file and source it before running}:/model + command: ${MODEL_ARCHITECTURE:?please update the env file and source it before running} --http --max-input-length ${MODEL_MAX_INPUT_LENGTH:-3000} --max-output-length ${MODEL_MAX_OUTPUT_LENGTH:-512} --quantization ${QUANTIZATION:-None} ports: - - "8888:8888" + - "8000:8000" + - "8001:8001" + - "8002:8002" expose: - - "8888" + - "8000" + - "8001" + - "8002" + shm_size: 20gb deploy: resources: reservations: devices: - driver: nvidia - count: 1 + device_ids: ["0", "1"] 
capabilities: [gpu] - - evaluation: - container_name: evaluation - image: evaluation:latest + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/v2/health/ready"] + interval: 30s + timeout: 20s + retries: 3 + start_period: 10m + + jupyter-server: + container_name: notebook-server + image: notebook-server:latest build: context: ../../ - dockerfile: ./evaluation/Dockerfile.eval + dockerfile: ./notebooks/Dockerfile.notebooks ports: - - "8889:8889" + - "8888:8888" expose: - - "8889" + - "8888" deploy: resources: reservations: @@ -35,6 +49,8 @@ services: - driver: nvidia count: 1 capabilities: [gpu] + depends_on: + - "llm" etcd: container_name: milvus-etcd @@ -60,13 +76,13 @@ services: MINIO_ACCESS_KEY: minioadmin MINIO_SECRET_KEY: minioadmin ports: - - "9001:9001" - - "9000:9000" + - "9011:9011" + - "9010:9010" volumes: - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data - command: minio server /minio_data --console-address ":9001" + command: minio server /minio_data --console-address ":9011" --address ":9010" healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + test: ["CMD", "curl", "-f", "http://localhost:9010/minio/health/live"] interval: 30s timeout: 20s retries: 3 @@ -77,7 +93,7 @@ services: command: ["milvus", "run", "standalone"] environment: ETCD_ENDPOINTS: etcd:2379 - MINIO_ADDRESS: minio:9000 + MINIO_ADDRESS: minio:9010 KNOWHERE_GPU_MEM_POOL_SIZE: 2048:4096 volumes: - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus @@ -107,13 +123,20 @@ services: build: context: ../../ dockerfile: ./RetrievalAugmentedGeneration/Dockerfile + args: + EXAMPLE_NAME: ${RAG_EXAMPLE} command: --port 8081 --host 0.0.0.0 environment: - APP_MILVUS_URL: "http://milvus:19530" - APP_LLM_MODELNAME: "llama2_13b" - APP_LLM_MODELENGINE: "ai-playground" + APP_VECTORSTORE_URL: "http://milvus:19530" + APP_VECTORSTORE_NAME: "milvus" + APP_LLM_SERVERURL: "llm:8001" + APP_LLM_MODELNAME: ensemble + APP_LLM_MODELENGINE: triton-trt-llm APP_CONFIG_FILE: ${APP_CONFIG_FILE} NVAPI_KEY: ${AI_PLAYGROUND_API_KEY} + OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317 + OTEL_EXPORTER_OTLP_PROTOCOL: grpc + ENABLE_TRACING: false volumes: - ${APP_CONFIG_FILE}:${APP_CONFIG_FILE} ports: @@ -135,6 +158,7 @@ services: # retries: 3 depends_on: - "milvus" + - "llm" frontend: container_name: llm-playground @@ -147,6 +171,13 @@ services: APP_SERVERURL: http://query APP_SERVERPORT: 8081 APP_MODELNAME: ${MODEL_NAME:-${MODEL_ARCHITECTURE}} + OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317 + OTEL_EXPORTER_OTLP_PROTOCOL: grpc + ENABLE_TRACING: false + RIVA_API_URI: ${RIVA_API_URI} + RIVA_API_KEY: ${RIVA_API_KEY} + RIVA_FUNCTION_ID: ${RIVA_FUNCTION_ID} + TTS_SAMPLE_RATE: ${TTS_SAMPLE_RATE} ports: - "8090:8090" expose: diff --git a/deploy/compose/docker-compose-nv-ai-foundation.yaml b/deploy/compose/docker-compose-nv-ai-foundation.yaml new file mode 100644 index 00000000..1ec0e74f --- /dev/null +++ b/deploy/compose/docker-compose-nv-ai-foundation.yaml @@ -0,0 +1,55 @@ +services: + + query: + container_name: chain-server + image: chain-server:latest + build: + context: ../../ + dockerfile: ./RetrievalAugmentedGeneration/Dockerfile + args: + EXAMPLE_NAME: ${RAG_EXAMPLE} + command: --port 8081 --host 0.0.0.0 + environment: + APP_LLM_MODELNAME: mixtral_8x7b + APP_LLM_MODELENGINE: nv-ai-foundation + APP_EMBEDDINGS_MODELNAME: nvolveqa_40k + APP_EMBEDDINGS_MODELENGINE: nv-ai-foundation + APP_TEXTSPLITTER_CHUNKSIZE: 2000 + APP_TEXTSPLITTER_CHUNKOVERLAP: 200 + 
APP_PROMPTS_CHATTEMPLATE: "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature." + APP_PROMPTS_RAGTEMPLATE: "You are a helpful AI assistant named Envie. You will reply to questions only based on the context that you are provided. If something is out of context, you will refrain from replying and politely decline to respond to the user." + NVIDIA_API_KEY: ${NVIDIA_API_KEY} + APP_CONFIG_FILE: ${APP_CONFIG_FILE} + volumes: + - ${APP_CONFIG_FILE}:${APP_CONFIG_FILE} + ports: + - "8081:8081" + expose: + - "8081" + shm_size: 5gb + + frontend: + container_name: llm-playground + image: llm-playground:latest + build: + context: ../.././RetrievalAugmentedGeneration/frontend/ + dockerfile: Dockerfile + command: --port 8090 + environment: + APP_SERVERURL: http://query + APP_SERVERPORT: 8081 + APP_MODELNAME: ${MODEL_NAME:-${MODEL_ARCHITECTURE}} + RIVA_API_URI: ${RIVA_API_URI} + RIVA_API_KEY: ${RIVA_API_KEY} + RIVA_FUNCTION_ID: ${RIVA_FUNCTION_ID} + TTS_SAMPLE_RATE: ${TTS_SAMPLE_RATE} + ports: + - "8090:8090" + expose: + - "8090" + depends_on: + - query + +networks: + default: + name: nvidia-llm diff --git a/deploy/compose/docker-compose-observability.yaml b/deploy/compose/docker-compose-observability.yaml new file mode 100644 index 00000000..cc2e11a0 --- /dev/null +++ b/deploy/compose/docker-compose-observability.yaml @@ -0,0 +1,47 @@ +services: + otel-collector: + container_name: otel-collector + image: otel/opentelemetry-collector:0.88.0 + restart: always + command: ["--config=/etc/otel-collector-config.yaml"] + volumes: + - ${OPENTELEMETRY_CONFIG_FILE}:/etc/otel-collector-config.yaml + + jaeger: + image: jaegertracing/all-in-one:1.52 + container_name: jaeger + command: + - "--config-file=/etc/jaeger.yaml" + environment: + - SPAN_STORAGE_TYPE=cassandra + deploy: + resources: + limits: + memory: 300M + restart: always + ports: + - "16686:16686" + - "4317" + - "4318" + expose: + - "4318" + - "4317" + volumes: + - ${JAEGER_CONFIG_FILE}:/etc/jaeger.yaml + depends_on: + - cassandra-schema + + cassandra: + image: cassandra:4.0 + container_name: cassandra + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/cassandra:/var/lib/cassandra + + cassandra-schema: + image: jaegertracing/jaeger-cassandra-schema + depends_on: + - cassandra + +networks: + default: + name: nvidia-llm diff --git a/deploy/compose/docker-compose-pgvector.yaml b/deploy/compose/docker-compose-pgvector.yaml new file mode 100644 index 00000000..9ad8ff56 --- /dev/null +++ b/deploy/compose/docker-compose-pgvector.yaml @@ -0,0 +1,111 @@ +services: + + llm: + container_name: llm-inference-server + image: llm-inference-server:latest + build: + context: ../.././RetrievalAugmentedGeneration/llm-inference-server/ + dockerfile: Dockerfile + volumes: + - ${MODEL_DIRECTORY:?please update the env file and source it before running}:/model + command: ${MODEL_ARCHITECTURE:?please update the env file and source it before running} --max-input-length ${MODEL_MAX_INPUT_LENGTH:-3000} --max-output-length ${MODEL_MAX_OUTPUT_LENGTH:-512} --quantization ${QUANTIZATION:-None} + ports: + - "8000:8000" + - "8001:8001" + - "8002:8002" + expose: + - "8000" + - "8001" + - "8002" + shm_size: 20gb + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: ${INFERENCE_GPU_COUNT:-all} + capabilities: [gpu] + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/v2/health/ready"] + interval: 30s + timeout: 20s + retries: 3 + 
start_period: 10m + + pgvector: + container_name: pgvector + image: ankane/pgvector:v0.5.1 + ports: + - 5432:5432 + expose: + - "5432" + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/data:/var/lib/postgresql/data + environment: + - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-password} + - POSTGRES_USER=${POSTGRES_USER:-postgres} + - POSTGRES_DB=${POSTGRES_DB:-api} + + query: + container_name: chain-server + image: chain-server:latest + build: + context: ../../ + dockerfile: ./RetrievalAugmentedGeneration/Dockerfile + args: + EXAMPLE_NAME: ${RAG_EXAMPLE} + command: --port 8081 --host 0.0.0.0 + environment: + APP_VECTORSTORE_URL: "pgvector:5432" + APP_VECTORSTORE_NAME: "pgvector" + APP_LLM_SERVERURL: "llm:8001" + APP_LLM_MODELNAME: "ensemble" + APP_LLM_MODELENGINE: "triton-trt-llm" + APP_CONFIG_FILE: ${APP_CONFIG_FILE} + NVAPI_KEY: ${AI_PLAYGROUND_API_KEY} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-password} + POSTGRES_USER: ${POSTGRES_USER:-postgres} + POSTGRES_DB: ${POSTGRES_DB:-api} + volumes: + - ${APP_CONFIG_FILE}:${APP_CONFIG_FILE} + ports: + - "8081:8081" + expose: + - "8081" + shm_size: 5gb + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + depends_on: + - "pgvector" + - "llm" + + frontend: + container_name: llm-playground + image: llm-playground:latest + build: + context: ../.././RetrievalAugmentedGeneration/frontend/ + dockerfile: Dockerfile + command: --port 8090 + environment: + APP_SERVERURL: http://query + APP_SERVERPORT: 8081 + APP_MODELNAME: ${MODEL_NAME:-${MODEL_ARCHITECTURE}} + RIVA_API_URI: ${RIVA_API_URI} + RIVA_API_KEY: ${RIVA_API_KEY} + RIVA_FUNCTION_ID: ${RIVA_FUNCTION_ID} + TTS_SAMPLE_RATE: ${TTS_SAMPLE_RATE} + ports: + - "8090:8090" + expose: + - "8090" + depends_on: + - query + +networks: + default: + name: nvidia-llm diff --git a/deploy/compose/docker-compose.yaml b/deploy/compose/docker-compose.yaml index 52c675e0..ac60a34b 100644 --- a/deploy/compose/docker-compose.yaml +++ b/deploy/compose/docker-compose.yaml @@ -52,26 +52,6 @@ services: depends_on: - "llm" - evaluation: - container_name: evaluation - image: evalulation:latest - build: - context: ../../ - dockerfile: ./evaluation/Dockerfile.eval - ports: - - "8889:8889" - expose: - - "8889" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - depends_on: - - "llm" - etcd: container_name: milvus-etcd image: quay.io/coreos/etcd:v3.5.5 @@ -96,13 +76,13 @@ services: MINIO_ACCESS_KEY: minioadmin MINIO_SECRET_KEY: minioadmin ports: - - "9001:9001" - - "9000:9000" + - "9011:9011" + - "9010:9010" volumes: - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data - command: minio server /minio_data --console-address ":9001" + command: minio server /minio_data --console-address ":9011" --address ":9010" healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + test: ["CMD", "curl", "-f", "http://localhost:9010/minio/health/live"] interval: 30s timeout: 20s retries: 3 @@ -113,7 +93,7 @@ services: command: ["milvus", "run", "standalone"] environment: ETCD_ENDPOINTS: etcd:2379 - MINIO_ADDRESS: minio:9000 + MINIO_ADDRESS: minio:9010 KNOWHERE_GPU_MEM_POOL_SIZE: 2048:4096 volumes: - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus @@ -143,14 +123,20 @@ services: build: context: ../../ dockerfile: ./RetrievalAugmentedGeneration/Dockerfile + args: + EXAMPLE_NAME: ${RAG_EXAMPLE} command: --port 8081 --host 0.0.0.0 environment: - APP_MILVUS_URL: "http://milvus:19530" + APP_VECTORSTORE_URL: 
"http://milvus:19530" + APP_VECTORSTORE_NAME: "milvus" APP_LLM_SERVERURL: "llm:8001" APP_LLM_MODELNAME: ensemble APP_LLM_MODELENGINE: triton-trt-llm APP_CONFIG_FILE: ${APP_CONFIG_FILE} NVAPI_KEY: ${AI_PLAYGROUND_API_KEY} + OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317 + OTEL_EXPORTER_OTLP_PROTOCOL: grpc + ENABLE_TRACING: false volumes: - ${APP_CONFIG_FILE}:${APP_CONFIG_FILE} ports: @@ -185,6 +171,13 @@ services: APP_SERVERURL: http://query APP_SERVERPORT: 8081 APP_MODELNAME: ${MODEL_NAME:-${MODEL_ARCHITECTURE}} + OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317 + OTEL_EXPORTER_OTLP_PROTOCOL: grpc + ENABLE_TRACING: false + RIVA_API_URI: ${RIVA_API_URI} + RIVA_API_KEY: ${RIVA_API_KEY} + RIVA_FUNCTION_ID: ${RIVA_FUNCTION_ID} + TTS_SAMPLE_RATE: ${TTS_SAMPLE_RATE} ports: - "8090:8090" expose: diff --git a/deploy/compose/nemotron_config.yaml b/deploy/compose/nemotron_config.yaml index 53f1e041..658408f2 100644 --- a/deploy/compose/nemotron_config.yaml +++ b/deploy/compose/nemotron_config.yaml @@ -29,6 +29,7 @@ text_splitter: chunk_size: 510 # Chunk size for text splitting. + # When using a token-based text splitter, this is the number of 'tokens per chunk' # Type: int chunk_overlap: 200 diff --git a/deploy/k8s-operator/kube-trailblazer/helm-charts/staging/rag-llm-pipeline/templates/milvus-minio.yaml b/deploy/k8s-operator/kube-trailblazer/helm-charts/staging/rag-llm-pipeline/templates/milvus-minio.yaml index 12ad1204..fc302398 100644 --- a/deploy/k8s-operator/kube-trailblazer/helm-charts/staging/rag-llm-pipeline/templates/milvus-minio.yaml +++ b/deploy/k8s-operator/kube-trailblazer/helm-charts/staging/rag-llm-pipeline/templates/milvus-minio.yaml @@ -22,15 +22,15 @@ spec: - server - /minio_data - --console-address - - :9001 - env: + - :9011 + env: - name: MINIO_ACCESS_KEY value: minioadmin - name: MINIO_SECRET_KEY value: minioadmin ports: - - containerPort: 9001 - - containerPort: 9000 + - containerPort: 9011 + - containerPort: 9010 volumeMounts: - mountPath: /minio_data name: minio-data @@ -38,8 +38,8 @@ spec: exec: command: - curl - - -f - - http://localhost:9000/minio/health/live + - -f + - http://localhost:9010/minio/health/live initialDelaySeconds: 20 periodSeconds: 5 volumes: @@ -57,6 +57,6 @@ spec: app.kubernetes.io/name: milvus-minio ports: - protocol: TCP - port: 9000 - targetPort: 9000 + port: 9010 + targetPort: 9010 diff --git a/deploy/k8s-operator/kube-trailblazer/helm-charts/staging/rag-llm-pipeline/templates/milvus-standalone.yaml b/deploy/k8s-operator/kube-trailblazer/helm-charts/staging/rag-llm-pipeline/templates/milvus-standalone.yaml index 2873239e..1b9ab847 100644 --- a/deploy/k8s-operator/kube-trailblazer/helm-charts/staging/rag-llm-pipeline/templates/milvus-standalone.yaml +++ b/deploy/k8s-operator/kube-trailblazer/helm-charts/staging/rag-llm-pipeline/templates/milvus-standalone.yaml @@ -18,32 +18,32 @@ spec: - name: milvus-standalone image: milvusdb/milvus:v2.3.1-gpu command: - - /tini + - /tini - -- - milvus - run - standalone - env: + env: - name: ETCD_ENDPOINTS value: milvus-etcd:2379 - name: KNOWHERE_GPU_MEM_POOL_SIZE value: 2048:4096 - name: MINIO_ADDRESS - value: milvus-minio:9000 - ports: + value: milvus-minio:9010 + ports: - containerPort: 19530 - containerPort: 9091 readinessProbe: exec: command: - curl - - -f + - -f - http://localhost:9091/healthz initialDelaySeconds: 20 periodSeconds: 5 resources: limits: - {{ .Values.milvus.gpu.type }}: {{ .Values.milvus.gpu.count }} + {{ .Values.milvus.gpu.type }}: {{ .Values.milvus.gpu.count }} --- apiVersion: 
v1 kind: Service diff --git a/deploy/k8s-operator/kube-trailblazer/pkg/helmer/controller/test.yaml b/deploy/k8s-operator/kube-trailblazer/pkg/helmer/controller/test.yaml index 3e9d4d08..7c6f21f2 100644 --- a/deploy/k8s-operator/kube-trailblazer/pkg/helmer/controller/test.yaml +++ b/deploy/k8s-operator/kube-trailblazer/pkg/helmer/controller/test.yaml @@ -2,16 +2,12 @@ - repoEntry: name: "zvonkok" url: "https://zvonkok.github.io/helm-charts/" - #username: "zvonkok" - #password: "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" - #pass_credentials_all: true - #insecure_skip_tls_verify: true chartSpec: release: "flannel" chart: "zvonkok/flannel" namespace: "flannel" version: "v0.23.0" - + - repoEntry: name: "nfd" url: "https://kubernetes-sigs.github.io/node-feature-discovery/charts" @@ -22,7 +18,7 @@ version: "0.14.3" chartValues: kernelVersion: "{{ tpl .Values.runtime.kernelVersiosn }}" # {{ tpl .Values.chartValues.kernelVersion . }} - + - repoEntry: name: "nvidia" url: "https://helm.ngc.nvidia.com/nvidia" diff --git a/docs/README.md b/docs/README.md index 787a6a8f..47dc3a83 100644 --- a/docs/README.md +++ b/docs/README.md @@ -6,40 +6,43 @@ The RAG documentation is divided into the following sections: - [Getting Started](#getting-started) - [User Guides](#user-guides) - [Architecture Guide](#architecture-guide) - - [Evaluation Tools](#evaluation-tools) - - [Other](#other) + - [Evaluation Tool](#evaluation-tool) + - [Observability Tool](#observability-tool) + - [Others](#others) ## Getting Started -This section will help you get started quickly with the sample RAG example. - -* [Installation guide](../RetrievalAugmentedGeneration/README.md#prerequisites): This guide walks you through the process of setting up your environment and utilizing the -* [Getting Started guides](../RetrievalAugmentedGeneration/README.md#getting-started): A series of quick start steps that will help you to understand the core concepts and start the pipeline quickly. These guides include Jupyter notebooks that you can experiment with. +* [Getting Started guides](../RetrievalAugmentedGeneration/README.md): A series of quick start steps that will help you to understand the core concepts and start the pipeline quickly for the different examples and usecases provided in this repository. These guides also include Jupyter notebooks that you can experiment with. ## User Guides -The user guides cover the core details of the provided example and how to configure and use different features to make your own chains. +The user guides cover the core details of the provided sample canonical developer rag example and how to configure and use different features to make your own chains. * [LLM Inference Server](./rag/llm_inference_server.md): Learn about the service which accelerates LLM inference time using TRT-LLM. -* [Integration with Nvidia AI Playground](./rag/aiplayground.md): Understand how to access **NVIDIA AI Playground** on NGC which allows developers to experience state of the art LLMs accelerated on NVIDIA DGX Cloud with NVIDIA TensorRT nd Triton Inference Server. +* [Integration with Nvidia AI Playground](./rag/aiplayground.md): Understand how to access **NVIDIA AI Playground** on NGC which allows developers to experience state of the art LLMs and embedding models accelerated on NVIDIA DGX Cloud with NVIDIA TensorRT and Triton Inference Server. * [Configuration Guide](./rag/configuration.md): The complete guide to all the configuration options available in the `config.yaml` file. 
-* [Frontend](./rag/frontend.md): Learn more about the sample playground provided as part of the workflow.
-* [Chat Server Guide](./rag/chat_server.md): Learn about the chat server which exposes core API's for end user.
-* [Jupyter Server Guide](./rag/jupyter_server.md): Learn about the different notebooks available and the server which can be used to access them.
+* [Frontend](./rag/frontend.md): Learn more about the sample playground provided as part of the workflow used by all the examples.
+* [Chat Server Guide](./rag/chat_server.md): Learn about the chat server which exposes core APIs for the end user. All the different examples are deployed behind these standardized APIs, exposed by this server.
+* [Notebooks Guide](./rag/jupyter_server.md): Learn about the different notebooks available and the server which can be used to access them.
## Architecture Guide
-This guide sheds more light on the infrastructure details and the execution flow for a query when the runtime is used:
+This guide sheds more light on the infrastructure details and the execution flow for a query when the runtime is used for the default canonical RAG example:
* [Architecture](./rag/architecture.md): Understand the architecture of the sample RAG workflow.
-## Evaluation Tools
+## Evaluation Tool
-The sample RAG worlflow provides a set of evaluation pipelines via notebooks which developers can use for benchmarking.
+The sample RAG workflow provides a set of evaluation pipelines via notebooks which developers can use for benchmarking the default canonical RAG example. There are also detailed guides on how to reproduce results and create datasets for the evaluation.
-* [RAG Evaluation](../evaluation/README.md): Understand the different notebooks available.
+* [RAG Evaluation](./rag/evaluation.md): Understand the different notebooks available.
+
+## Observability Tool
+
+Observability is a crucial aspect that facilitates the monitoring and comprehension of the internal state and behavior of a system or application.
+* [Observability tool](./rag/observability.md): Understand the observability tool and the steps to deploy it.
-## Other
+## Others
* [Support Matrix](./rag/support_matrix.md)
* [Open API schema references](./rag/api_reference/openapi_schema.json)
diff --git a/docs/developer-llm-operator/install.md b/docs/developer-llm-operator/install.md
index bafc570b..ba3df516 100644
--- a/docs/developer-llm-operator/install.md
+++ b/docs/developer-llm-operator/install.md
@@ -55,15 +55,15 @@ NVIDIA container runtime on the Kubernetes node.
```console
$ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \
- && helm repo update
+ && helm repo update
```
1. Install the Operator:
```console
$ helm install --wait --generate-name \
- -n gpu-operator --create-namespace \
- nvidia/gpu-operator
+ -n gpu-operator --create-namespace \
+ nvidia/gpu-operator
```
1. Optional: Configure GPU time-slicing if you have fewer than four GPUs.
@@ -106,14 +106,12 @@ NVIDIA container runtime on the Kubernetes node.
- Verify that at least `4` GPUs are allocatable: ```console - $ kubectl get nodes -l nvidia.com/gpu.present -o json | jq '.items[0].status.allocatable | - with_entries(select(.key | startswith("nvidia.com/"))) | - with_entries(select(.value != "0"))' + $ kubectl get nodes -l nvidia.com/gpu.present -o json | jq '.items[0].status.allocatable | with_entries(select(.key | startswith("nvidia.com/"))) | with_entries(select(.value != "0"))' ``` *Example Output* - ```output + ```json { "nvidia.com/gpu": "4" } @@ -242,8 +240,7 @@ in the NVIDIA GPU Operator documentation. - View the logs from the Operator controller pod: ```console - $ kubectl logs -n kube-trailblazer-system \ - $(kubectl get pod -n kube-trailblazer-system -o=jsonpath='{.items[0].metadata.name}') + $ kubectl logs -n kube-trailblazer-system $(kubectl get pod -n kube-trailblazer-system -o=jsonpath='{.items[0].metadata.name}') ``` - View the pods in the pipeline namespace: @@ -280,7 +277,7 @@ in the NVIDIA GPU Operator documentation. llm ClusterIP 10.107.213.112 8001/TCP 22h milvus ClusterIP 10.102.86.183 19530/TCP 22h milvus-etcd ClusterIP 10.109.74.142 2379/TCP 22h - milvus-minio ClusterIP 10.103.238.28 9000/TCP 22h + milvus-minio ClusterIP 10.103.238.28 9010/TCP 22h query ClusterIP 10.110.199.69 8081/TCP 22h ``` diff --git a/docs/developer-llm-operator/uninstall.md b/docs/developer-llm-operator/uninstall.md new file mode 100644 index 00000000..5cd096a7 --- /dev/null +++ b/docs/developer-llm-operator/uninstall.md @@ -0,0 +1,50 @@ + + +# Uninstalling the Operator + +To uninstall the Operator, perform the following steps: + +1. Delete the RAG pipeline: + + ```console + $ kubectl delete helmpipeline -n kube-trailblazer-system rag-llm-pipeline + ``` + + *Example Output* + + ```output + helmpipeline.package.nvidia.com "rag-llm-pipeline" deleted + ``` + +1. Optional: Delete the namespace for the RAG pipeline: + + ```console + $ kubectl delete namespace rag-llm-pipeline + ``` + +1. Uninstall the Operator: + + ```console + $ helm delete -n kube-trailblazer-system $(helm list -n kube-trailblazer-system | grep developer-llm-operator | awk '{print $1}') + ``` + + *Example Output* + + ```output + release "developer-llm-operator-0-1705070979" uninstalled + ``` diff --git a/docs/rag/aiplayground.md b/docs/rag/aiplayground.md index 5ea66bd9..050bc159 100644 --- a/docs/rag/aiplayground.md +++ b/docs/rag/aiplayground.md @@ -1,104 +1,62 @@ -# NVIDIA AI Playground +# NVIDIA AI Foundation -**NVIDIA AI Playground** on NGC allows developers to experience state of the art LLMs accelerated on NVIDIA DGX Cloud with NVIDIA TensorRT nd Triton Inference Server. Developers get **free credits for 10K requests** to any of the available models. Sign up process is easy. +**NVIDIA AI Foundation** lets developers to experience state of the art LLMs accelerated by NVIDIA. Developers get **free credits for 10K requests** to any of the available models. -**Setup** +## Prepare the environment -Please follow the instruction below to get access to AI playground API key +1. Navigate to https://catalog.ngc.nvidia.com/ai-foundation-models. -* Navigate to https://catalog.ngc.nvidia.com/ai-foundation-models -* Select any of the available models and click on learn more +2. Find the Mixtral x7B model icon and click ``Learn More``. -![Diagram](./images/image5.png) +![Diagram](./images/image7.png) -* Select the ```API``` navigation bar and click on the ```Generate key``` option as shown below. +3. Select the ```API``` navigation bar and click on the ```Generate key``` option.. 
-![Diagram](./images/image6.png) +![Diagram](./images/image8.png) -* Copy the generated key over to a safe place. +4. Save the generated API key. +## Deploy -## Using Nvdia Cloud based LLM's +1. Clone the Generative AI examples Git repository. -#### Step 1: Sign up to AI playground +> ⚠️ **NOTE**: This example requires Git Large File Support (LFS) -- Follow the [above](#nvidia-ai-playground) instructions to get access to an API key. - -#### Step 2: Set Environment Variables - -- Modify ``compose.env`` in the ``deploy/compose`` directory to set your environment variables. The following variable is required. ``` - export AI_PLAYGROUND_API_KEY="nvapi-*" +$ sudo apt -y install git-lfs +$ git clone git@github.com:NVIDIA/GenerativeAIExamples.git +Cloning into 'GenerativeAIExamples'... +$ cd GenerativeAIExamples/ +$ git lfs pull ``` -#### Step 3: Build and Start Containers -- Pull lfs files. This will pull large files from repository. - ``` - git lfs pull - ``` -- Run the following command to build containers. - ``` - source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose-playground.yaml build - ``` - -- Run the following command to start containers. - ``` - source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose-playground.yaml up -d - ``` - -#### Step 4: Try out queries with the deployed pipeline -- Interact with the pipeline using UI as as mentioned [here.](../../RetrievalAugmentedGeneration/README.md#step-4-run-the-sample-web-application) - -- Example [notebook 6](../../notebooks/06_AI_playground.ipynb) showcases the usage of AI Playground based LLM. You can access the notebook server at `http://host-ip:8888` from your web browser. - - -## Using Nvidia Cloud based Embedding models +2. Add your NGC API key to compose.env to use the NVIDIA endpoint. -#### Step 1: Sign up to AI playground - -- Follow the [above](#nvidia-ai-playground) instructions to get access to an API key. +``` +$ cd GenerativeAIExamples -#### Step 2: Set Environment Variables +$ grep NVIDIA_API_KEY deploy/compose/compose.env + export NVIDIA_API_KEY="nvapi-*" +``` -- Modify ``compose.env`` in the ``deploy/compose`` directory to set your environment variables. The following variables are required. Provide your API key for NV playground and absolute path to [config.yaml](../../deploy/compose/config.yaml) file. +3. Set the nv-ai-foundation example in compose.env. ``` - export AI_PLAYGROUND_API_KEY="YOUR_NV_PLAYGROUND_API_KEY" - export APP_CONFIG_FILE="ABSOLUTE PATH TO config.yaml" + export RAG_EXAMPLE="nvidia_ai_foundation" ``` +4. Deploy the developer RAG example via Docker compose. -If you want to use the on-prem deployed LLM model provide the values of below variables as well: ``` - # full path to the local copy of the model weights - export MODEL_DIRECTORY="PATH TO MODEL CHECKPOINT DIrECTORY" +$ source deploy/compose/compose.env ; docker compose -f deploy/compose/docker-compose-nv-ai-foundation.yaml build - # the architecture of the model. 
eg: llama - export MODEL_ARCHITECTURE="llama" +$ docker compose -f deploy/compose/docker-compose-nv-ai-foundation.yaml up -d - # the name of the model being used - only for displaying on frontend - export MODEL_NAME="llama-2-13b-chat" +$ docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}" +CONTAINER ID NAMES STATUS +70ef27ae4c91 llm-playground Up 56 seconds +4aacfbe89464 chain-server Up 56 seconds ``` -#### Step 3: Update Config file -- Update the embedding model name and model engine in [config.yaml](../../deploy/compose/config.yaml) - - ``` - embeddings: - model_name: nvolve - model_engine: ai-playground - ``` +## Test -#### Step 4: Build and Start Containers -- Run the following command to build containers and start container if you want to use on-prem LLM model with playground based embedding model. - ``` - source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose.yaml build - docker compose -f deploy/compose/docker-compose.yaml up -d - ``` - -Alternatively, run the following command to build and start the containers if you want to use playground based LLM model with playground based embedding model. -``` - source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose-playground.yaml build - docker compose -f deploy/compose/docker-compose-playground.yaml up -d -``` +1. Follow steps 1 - 5 in the ["Test" section of example 02](../../RetrievalAugmentedGeneration/README.md#23-test). -#### Step 5: Try out queries with the deployed pipeline -- Interact with the pipeline using UI by following the steps mentioned [here.](../../RetrievalAugmentedGeneration/README.md#step-4-run-the-sample-web-application) \ No newline at end of file diff --git a/docs/rag/architecture.md b/docs/rag/architecture.md index 069c8459..a498e597 100644 --- a/docs/rag/architecture.md +++ b/docs/rag/architecture.md @@ -10,7 +10,10 @@ Generative AI starts with foundational models trained on vast quantities of unla To create true business value from LLMs, these foundational models need to be tailored to your enterprise use case. In this workflow, we use [RAG](https://blog.langchain.dev/tutorial-chatgpt-over-your-data/) with [Llama2](https://github.com/facebookresearch/llama/), an open source model from Meta, to achieve this. Augmenting an existing AI foundational model provides an advanced starting point and a low-cost solution that enterprises can leverage to generate accurate and clear responses to their specific use case. -This RAG-based reference chatbot workflow contains: +> ⚠️ **NOTE**: +This repository contains multiple examples. The architecture for the default canonical developer rag example is described below. + +This RAG-based reference default chatbot workflow contains: - [NVIDIA NeMo framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/index.html) - part of NVIDIA AI Enterprise solution - [NVIDIA TensorRT-LLM](https://developer.nvidia.com/tensorrt) - for low latency and high throughput inference for LLMs diff --git a/docs/rag/chat_server.md b/docs/rag/chat_server.md index c0c68a18..224b92d9 100644 --- a/docs/rag/chat_server.md +++ b/docs/rag/chat_server.md @@ -1,7 +1,7 @@ # Chat Server A sample fastapi based server is provided in the workflow so that you can test the chat system in an interactive manner. -This server wraps calls made to different components and orchestrates the entire flow. +This server wraps calls made to different components and orchestrates the entire flow for all the provided examples. 
This API endpoint allows for several actions: - [Chat Server](#chat-server) diff --git a/docs/rag/configuration.md b/docs/rag/configuration.md index 2d1eaf76..2d7aa6b9 100644 --- a/docs/rag/configuration.md +++ b/docs/rag/configuration.md @@ -2,14 +2,16 @@ ### Chain Server Configuration -In this section, we explore the configurations for the [Chain Server](./chat_server.md). Chain server interaction with other components can be controlled by config. Chain Server interacts with components such as the `milvus` vector store and `triton` server, which hosts the Large Language Model (LLM). Additionally, we'll delve into customization options to fine-tune the behavior of the query server. These options include settings for the embedding model, chunk size, and prompts for generating responses. +In this section, we explore the configurations for the [Chain Server](./chat_server.md) used for the default canonical developer rag example. + +Chain server interaction with other components can be controlled by config. Chain Server interacts with components such as the `milvus` vector store and `triton` server, which hosts the Large Language Model (LLM). Additionally, we'll delve into customization options to fine-tune the behavior of the query server. These options include settings for the embedding model, chunk size, and prompts for generating responses. You can refer to [sample config](../../deploy/compose/config.yaml) to see the structure. -#### Milvus Configuration -`Milvus` serves as a vector database for storing embeddings. +#### Vector Database Configuration +The configuration of the solution which serves as a vector database for storing embeddings. - url: Configure the HTTP URI where the Milvus server is hosted. + url: Configure the HTTP URI where the vector database server is hosted. #### LLM server Configuration LLM Inference server hosts the Large Language Model (LLM) with triton backend. @@ -21,7 +23,7 @@ LLM Inference server hosts the Large Language Model (LLM) with triton backend. model_engine: An enum specifying the backend name hosting the model. Options currently supported are: 1. `triton-trt-llm` for using locally deployed LLM models. Follow steps [here](../../RetrievalAugmentedGeneration/README.md#local-llm-setup) to understand how to deploy and use on-prem deployed models. - 2. `ai-playground` for using NV AI Playground based models. Follow steps [here](../../RetrievalAugmentedGeneration/README.md#using-nvdia-cloud-based-llm) to understand how to deploy and use TRT-LLM optimized playground models from cloud. + 2. `nv-ai-foundation` for using NV AI Playground based models. Follow steps [here](../../RetrievalAugmentedGeneration/README.md#1-qa-chatbot----nvidia-ai-foundation-inference-endpoint) to understand how to deploy and use TRT-LLM optimized playground models from cloud. #### Text Splitter Configuration This section covers the settings for the Text Splitter component. @@ -34,7 +36,7 @@ This section covers the settings for the Text Splitter component. The Embeddings section contains information required for generating embeddings. model_name: Indicate the name of the model used to generate embeddings. - model_engine: An enum specifying the backend name hosting the model, Currently huggingface and ai-playground are supported. + model_engine: An enum specifying the backend name hosting the model, Currently huggingface and nv-ai-foundation are supported. dimensions: Integer value specifying the dimensions of the embedding search model from huggingface. 
Note: Any change in `model_name`` may also necessitate changes in the model's `dimensions`, which can be adjusted using this field. @@ -46,8 +48,8 @@ Customize prompts used for generating responses. You set path to use this config file to be used by chain server using enviornment variable `APP_CONFIG_FILE`. You can do the same in [compose.env](../../deploy/compose/compose.env) and source the file. -### Configuring docker compose file -In this section, we will look into the environment variables and parameters that can be configured within the [Docker Compose](../../deploy/compose/docker-compose.yaml) YAML file. Our system comprises multiple microservices that interact harmoniously to generate responses. These microservices include LLM Inference Server, Jupyter Server, Milvus, Query/chain server, and Frontend. +### Configuring docker compose file for default RAG example +In this section, we will look into the environment variables and parameters that can be configured within the [Docker Compose](../../deploy/compose/docker-compose.yaml) YAML file for the default canonical example. Our system comprises multiple microservices that interact harmoniously to generate responses. These microservices include LLM Inference Server, Jupyter Server, Milvus, Query/chain server, and Frontend. #### LLM server Configurations The LLM Inference Server is used for hosting the Large Language Model (LLM) with triton backend. You can configure the model information using the [compose.env](../../deploy/compose/compose.env) file or by setting the corresponding environment variables. Here is a list of environment variables utilized by the llm inference server: @@ -72,7 +74,7 @@ The Query service is the core component responsible for interacting with the llm APP_LLM_MODELNAME: The model name used by the Triton server. APP_LLM_MODELENGINE: An enum specifying the backend name hosting the model. Options currently supported are: 1. `triton-trt-llm` if you are using locally deployed LLM models. - 2. `ai-playground` if you are using NV AI Playground based models. + 2. `nv-ai-foundation` if you are using NV AI Playground based models. APP_CONFIG_FILE: Provides the path to the configuration file used by the Chain Server or this container. Defaults to /dev/null #### Frontend diff --git a/evaluation/README.md b/docs/rag/evaluation.md similarity index 58% rename from evaluation/README.md rename to docs/rag/evaluation.md index 5c98ca23..55942463 100644 --- a/evaluation/README.md +++ b/docs/rag/evaluation.md @@ -1,6 +1,6 @@ # Evaluation Tool -## Tool Details +## Introduction Evaluation is crucial for retrieval augmented generation (RAG) pipelines as it ensures the accuracy and relevance of information retrieved as well as the generated content. There are 3 components needed for evaluating the performance of a RAG pipeline: @@ -8,7 +8,8 @@ There are 3 components needed for evaluating the performance of a RAG pipeline: 2. Automated metrics to measure performance of both the context retrieval and response generation. 3. Human-like evaluation of the generated response from the end-to-end pipeline. -This tool provides a set of notebooks that show examples of how to address these requirements in an automated fashion. +> ⚠️ **NOTE** +This tool provides a set of notebooks that show examples of how to address these requirements in an automated fashion for the default canonical developer rag example. ### Synthetic Data Generation Using an existing knowledge base we can synthetically generate question|answer|context triplets using a LLM. 
This tool uses the Llama 2 70B model on [Nvidia AI Playground](https://www.nvidia.com/en-us/research/ai-playground/) for data generation. @@ -18,3 +19,16 @@ Using an existing knowledge base we can synthetically generate question|answer|c ### LLM-as-a-Judge We can use LLMs to provide human-like feedback and Likert evaluation scores for full end-to-end RAG pipelines. This tool uses Llama 2 70B as a judge LLM. + +## Deploy +1. Follow steps 1 - 5 in the ["Prepare the environment" section of example 02](../../RetrievalAugmentedGeneration/README.md#21-prepare-the-environment). + +2. Deploy the developer RAG example via Docker compose by following [these steps](../../RetrievalAugmentedGeneration/README.md#22-deploy). + +3. Build and deploy the evaluation service +``` + $ docker compose -f deploy/compose/docker-compose-evaluation.yaml build + $ docker compose -f deploy/compose/docker-compose-evaluation.yaml up -d +``` + +4. Access the notebook server at `http://host-ip:8889` from your web browser and try out the notebooks sequentially starting from [Notebook 1: Synthetic Data Generation for RAG Evaluation](../../tools/evaluation/01_synthetic_data_generation.ipynb) diff --git a/docs/rag/frontend.md b/docs/rag/frontend.md index 425acb3c..8ce78441 100644 --- a/docs/rag/frontend.md +++ b/docs/rag/frontend.md @@ -1,7 +1,7 @@ # Web Frontend ------------ The web frontend provides a UI on top of the [RAG chat server APIs](./chat_server.md). -- Users can chat with the LLM and see responses streamed back. +- Users can chat with the LLM and see responses streamed back for different examples. - By selecting “Use knowledge base,” the chatbot returns responses augmented with the data that’s been stored in the vector database. - To store content in the vector database, change the window to “Knowledge Base” in the upper right corner and upload documents. diff --git a/docs/rag/hf_model_download.md b/docs/rag/hf_model_download.md new file mode 100644 index 00000000..216020c9 --- /dev/null +++ b/docs/rag/hf_model_download.md @@ -0,0 +1,59 @@ +## Downloading Model from huggingface + +- Visit the Hugging Face Models Hub at https://huggingface.co/models + +- Search for the "llama-2" model in search bar. +![Model Search](../rag/images/hf/Slide1.JPG) + +- Choose the specific model you wish to download; for instance, let's select "llama-2-13b-chat-hf." + +- If you haven't already, sign up or log in to your Hugging Face account. +![Signup Page](../rag/images/hf/Slide2.JPG) + +- Agree to the terms and conditions provided. +![T and C Page](../rag/images/hf/Slide3.JPG) + + +- Confirm that your request to access the repository is successful. +![Success](../rag/images/hf/Slide4.JPG) + +- Complete the meta form by clicking on the link `Meta website` link mentioned in the previous steps. +![Meta Form](../rag/images/hf/Slide5.JPG) + +- Navigate to the "Files" section, which displays the available files. If you don't have access, it will be indicated like below. +![Default files](../rag/images/hf/Slide6.JPG) + +- Upon obtaining the necessary permissions, you will see all the files associated with the model on Hugging Face. +![Files list](../rag/images/hf/Slide7.JPG) + +- Click on the three dots (...) next to the train. +![Files list](../rag/images/hf/Slide8.JPG) + +- Select "Clone repository," which will prompt the following: +![Files list](../rag/images/hf/Slide9.JPG) + +- Execute the provided command in your terminal. When prompted, enter your Hugging Face username and token. 
+![Files list](../rag/images/hf/download.png) + +- In the password section, insert your token. If you haven't generated a token, you can do so in the Hugging Face settings. +![Files list](../rag/images/hf/Slide11.JPG) + +- Access the "Access Tokens" section in the right panel. +![Files list](../rag/images/hf/Slide12.JPG) + +- Generate a new token or copy an existing one. +![Files list](../rag/images/hf/Slide13.JPG) + +- Paste the token into your terminal. +![Files list](../rag/images/hf/download.png) + +- You may be asked for your username and password multiple times; provide the required information. + +- The terminal will initiate the download process for the model. This may take some time as it involves downloading checkpoints. + +- Once the download is complete, you will be able to view the contents of the downloaded model. + + + + + diff --git a/docs/rag/images/docker-output.png b/docs/rag/images/docker-output.png deleted file mode 100644 index 6311cc76..00000000 Binary files a/docs/rag/images/docker-output.png and /dev/null differ diff --git a/docs/rag/images/hf/Slide1.JPG b/docs/rag/images/hf/Slide1.JPG new file mode 100644 index 00000000..139333bb Binary files /dev/null and b/docs/rag/images/hf/Slide1.JPG differ diff --git a/docs/rag/images/hf/Slide10.JPG b/docs/rag/images/hf/Slide10.JPG new file mode 100644 index 00000000..d1554438 Binary files /dev/null and b/docs/rag/images/hf/Slide10.JPG differ diff --git a/docs/rag/images/hf/Slide11.JPG b/docs/rag/images/hf/Slide11.JPG new file mode 100644 index 00000000..3e68b016 Binary files /dev/null and b/docs/rag/images/hf/Slide11.JPG differ diff --git a/docs/rag/images/hf/Slide12.JPG b/docs/rag/images/hf/Slide12.JPG new file mode 100644 index 00000000..78974cc0 Binary files /dev/null and b/docs/rag/images/hf/Slide12.JPG differ diff --git a/docs/rag/images/hf/Slide13.JPG b/docs/rag/images/hf/Slide13.JPG new file mode 100644 index 00000000..adf90c98 Binary files /dev/null and b/docs/rag/images/hf/Slide13.JPG differ diff --git a/docs/rag/images/hf/Slide14.JPG b/docs/rag/images/hf/Slide14.JPG new file mode 100644 index 00000000..670b03a5 Binary files /dev/null and b/docs/rag/images/hf/Slide14.JPG differ diff --git a/docs/rag/images/hf/Slide15.JPG b/docs/rag/images/hf/Slide15.JPG new file mode 100644 index 00000000..e7bc4cef Binary files /dev/null and b/docs/rag/images/hf/Slide15.JPG differ diff --git a/docs/rag/images/hf/Slide2.JPG b/docs/rag/images/hf/Slide2.JPG new file mode 100644 index 00000000..6c593e75 Binary files /dev/null and b/docs/rag/images/hf/Slide2.JPG differ diff --git a/docs/rag/images/hf/Slide3.JPG b/docs/rag/images/hf/Slide3.JPG new file mode 100644 index 00000000..e02ccb97 Binary files /dev/null and b/docs/rag/images/hf/Slide3.JPG differ diff --git a/docs/rag/images/hf/Slide4.JPG b/docs/rag/images/hf/Slide4.JPG new file mode 100644 index 00000000..141bcc80 Binary files /dev/null and b/docs/rag/images/hf/Slide4.JPG differ diff --git a/docs/rag/images/hf/Slide5.JPG b/docs/rag/images/hf/Slide5.JPG new file mode 100644 index 00000000..d45203d9 Binary files /dev/null and b/docs/rag/images/hf/Slide5.JPG differ diff --git a/docs/rag/images/hf/Slide6.JPG b/docs/rag/images/hf/Slide6.JPG new file mode 100644 index 00000000..a2a2efb0 Binary files /dev/null and b/docs/rag/images/hf/Slide6.JPG differ diff --git a/docs/rag/images/hf/Slide7.JPG b/docs/rag/images/hf/Slide7.JPG new file mode 100644 index 00000000..0b15cdc7 Binary files /dev/null and b/docs/rag/images/hf/Slide7.JPG differ diff --git a/docs/rag/images/hf/Slide8.JPG 
b/docs/rag/images/hf/Slide8.JPG new file mode 100644 index 00000000..d630091c Binary files /dev/null and b/docs/rag/images/hf/Slide8.JPG differ diff --git a/docs/rag/images/hf/Slide9.JPG b/docs/rag/images/hf/Slide9.JPG new file mode 100644 index 00000000..6c39f266 Binary files /dev/null and b/docs/rag/images/hf/Slide9.JPG differ diff --git a/docs/rag/images/hf/download.png b/docs/rag/images/hf/download.png new file mode 100644 index 00000000..f04ec0d4 Binary files /dev/null and b/docs/rag/images/hf/download.png differ diff --git a/docs/rag/images/image10.png b/docs/rag/images/image10.png new file mode 100644 index 00000000..a1f4a14d Binary files /dev/null and b/docs/rag/images/image10.png differ diff --git a/docs/rag/images/image11.png b/docs/rag/images/image11.png new file mode 100644 index 00000000..86f7ad79 Binary files /dev/null and b/docs/rag/images/image11.png differ diff --git a/docs/rag/images/image12.png b/docs/rag/images/image12.png new file mode 100644 index 00000000..17348fe1 Binary files /dev/null and b/docs/rag/images/image12.png differ diff --git a/docs/rag/images/image7.png b/docs/rag/images/image7.png new file mode 100644 index 00000000..e34f926b Binary files /dev/null and b/docs/rag/images/image7.png differ diff --git a/docs/rag/images/image8.png b/docs/rag/images/image8.png new file mode 100644 index 00000000..78ae35cc Binary files /dev/null and b/docs/rag/images/image8.png differ diff --git a/docs/rag/images/image9.png b/docs/rag/images/image9.png new file mode 100644 index 00000000..1615f89b Binary files /dev/null and b/docs/rag/images/image9.png differ diff --git a/docs/rag/jupyter_server.md b/docs/rag/jupyter_server.md index 87cedecf..82130ba5 100644 --- a/docs/rag/jupyter_server.md +++ b/docs/rag/jupyter_server.md @@ -1,7 +1,7 @@ # Jupyter Notebooks For development and experimentation purposes, the Jupyter notebooks provide guidance to building knowledge augmented chatbots. -The following Jupyter notebooks are provided with the AI workflow: +The following Jupyter notebooks are provided with the AI workflow for the default canonical RAG example: 1. [**LLM Streaming Client**](../../notebooks/01-llm-streaming-client.ipynb) diff --git a/docs/rag/llm_inference_server.md b/docs/rag/llm_inference_server.md index 7d863034..df3d0544 100644 --- a/docs/rag/llm_inference_server.md +++ b/docs/rag/llm_inference_server.md @@ -2,6 +2,8 @@ We use [NeMo Framework Inference Server](https://docs.nvidia.com/nemo-framework/user-guide/latest/deployingthenemoframeworkmodel.html) container which help us to create optimized LLM using TensorRT LLM and deploy using NVIDIA Triton Server for high-performance, cost-effective, and low-latency inference. Within this workflow, We use Llama2 models and LLM Inference Server container contains modules and script required for TRT-LLM conversion of the Llama2 models and deployment using NVIDIA Triton Server. +> ⚠️ **NOTE**: LLM inference server is used by examples which deploys the model on-prem. There are examples in this repository which uses [Nvidia AI foundation models](https://www.nvidia.com/en-in/ai-data-science/foundation-models/) from cloud and may not use this component. + # Running the LLM Inference Server @@ -23,51 +25,3 @@ We use [NeMo Framework Inference Server](https://docs.nvidia.com/nemo-framework/ ``` - Once the optimized Llama2 is deployed in Triton Server, clients can send HTTP/REST or gRPC requests directly to Triton Server. Example implmentation of the client can be found [here](../../integrations/langchain/llms/triton_trt_llm.py). 
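As an illustrative aside (not part of this change set), a quick way to confirm that the deployed Triton endpoint is reachable is a readiness probe such as the sketch below. It assumes the `tritonclient[http]` Python package and the default `8000` HTTP port and `ensemble` model name used in the compose files.

```python
# Minimal readiness probe for the llm-inference-server Triton endpoint.
# Assumes: pip install "tritonclient[http]" and the default 8000 HTTP port.
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")
if client.is_server_ready() and client.is_model_ready("ensemble"):
    print("Triton is ready; the 'ensemble' model can accept requests.")
```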
- - - -### Quantized Llama2 model deployment - -- Download Llama2 Chat Model Weights from [Meta](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or [HuggingFace](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf/). You can check [support matrix](support_matrix.md) for GPU requirements for the deployment. - -- For quantization of the Llama2 model using AWQ, first clone the [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.5.0) repository separately and checkout release/v0.5.0. - - - Also copy the Llama2 model directory downloaded earlier to the TensorRT-LLM repo - -``` - git clone https://github.com/NVIDIA/TensorRT-LLM.git - cp -r TensorRT-LLM/ - cd TensorRT-LLM/ - git checkout release/0.5.0 -``` - -- Now setup the TensorRT-LLM repo seprately using steps [here](https://github.com/NVIDIA/TensorRT-LLM/blob/release/0.5.0/docs/source/installation.md) - -- Once the model is downloaded and TensorRT-LLM repo is setup, we can quantize the model using the TensorRT-LLM container. - - - Follow the steps from [here](https://github.com/NVIDIA/TensorRT-LLM/tree/v0.5.0/examples/llama#awq) to quantize using AWQ, run these commands inside the container. - - - While running the quantization script, make sure to point `--model_dir` to your downloaded Llama2 model directory - - - Once the quantization is completed, copy the generated PyTorch (.pt) file inside the model directory - - ``` - cp .pt - ``` - -- Now, we will come back our repository, follow the steps below to deploy this quantized model using the inference server. - - - Update [compose.env](../../deploy/compose/compose.env) with `MODEL_DIRECTORY` pointing to Llama2 model directory containing the quantized checkpoint. - - - Make sure the qantized PyTorch model (.pt) file generated using above steps is present inside the MODEL_DIRECTORY. - - - - - Uncomment the QUANTIZATION variable which specifies quantization as "int4_awq" inside the [compose.env](../../deploy/compose/compose.env). - ``` - export QUANTIZATION="int4_awq" - ``` - -**Note for checkpoint downloaded using Meta**: - -*When downloading model weights from Meta, you can follow the instructions up to the point of downloading the models using ``download.sh``. Meta will download two additional files, namely `tokenizer.model` and `tokenizer_checklist.chk`, outside of the model checkpoint directory. Ensure that you copy these files into the same directory as the model checkpoint directory.* diff --git a/docs/rag/observability.md b/docs/rag/observability.md new file mode 100644 index 00000000..27d65223 --- /dev/null +++ b/docs/rag/observability.md @@ -0,0 +1,107 @@ +# RAG Observability Tool +## Introduction +Observability is a crucial aspect that facilitates the monitoring and comprehension of the internal state and behavior of a system or application. Applications based on RAG are intricate systems encompassing the interaction of numerous components. To enhance the performance of these RAG-based applications, observability serves as an efficient mechanism for both monitoring and debugging. + +Following diagram shows high level workflow of how traces are captured in the RAG Example +![RAG with Observability](./images/image9.png) + +The observability stack adds following containers on top of the RAG app containers: +1. **OpenTelemetry Collector**: Responsible for receiving, processing and exporting the traces. +2. **Jaeger**: Acts as OpenTelemetry backend providing storage, query service and visualizer. 
You can also configure any other OTLP compatible backend such as [Zipkin](https://zipkin.io/) or [Prometheus](https://prometheus.io/). To configure any other backend, refer to [OpenTelemetry Collector configuration](https://opentelemetry.io/docs/collector/configuration/).
+3. **Cassandra**: Persistent storage for traces. Jaeger supports many other [storage backends](https://www.jaegertracing.io/docs/1.18/deployment/#storage-backends) like ElasticSearch, Kafka, and Badger. Please note that for large-scale production deployments the Jaeger team recommends the Elasticsearch backend over Cassandra.
+
+## Key terms
+1. **Span**: A unit of work within a system, encapsulating information about a specific operation (e.g., an LLM call or embedding generation).
+2. **Traces**: The recording of a request as it goes through a system, tracking every service the request comes in contact with. Multiple spans, logically bound by parent-child relationships, make up a trace.
+3. **Root Span**: The first span in a trace, denoting the beginning and end of the entire operation.
+4. **Span Attributes**: Key-value pairs that a span may carry to provide additional context or metadata.
+5. **Collectors**: Components that process and export telemetry data from instrumented applications.
+6. **Context**: Signifies the current location within the trace hierarchy. It determines whether a new span initiates a trace or connects to an existing parent span.
+7. **Services**: Microservices that generate telemetry data.
+
+The following diagram depicts a typical trace for a user query against the knowledge base in our RAG example.
+![Trace for query from knowledge base](./images/image10.png)
+
+## Deploy
+1. Clone the Generative AI examples Git repository.
+
+> ⚠️ **NOTE**: This example requires Git Large File Support (LFS)
+
+```
+$ sudo apt -y install git-lfs
+$ git clone git@github.com:NVIDIA/GenerativeAIExamples.git
+Cloning into 'GenerativeAIExamples'...
+$ cd GenerativeAIExamples/
+$ git lfs pull
+```
+2. Update the [OpenTelemetry collector configurations](../../deploy/compose/configs/otel-collector-config.yaml) and [Jaeger configurations](../../deploy/compose/configs/jaeger.yaml).
+
+To know more about the available configurations, please refer to [OpenTelemetry Collector configurations](https://opentelemetry.io/docs/collector/configuration/) and [Jaeger configurations](https://github.com/jaegertracing/documentation/blob/main/data/cli/1.52/jaeger-all-in-one-cassandra.yaml).
+
+3. Update the [compose.env](../../deploy/compose/compose.env).
+
+4. For the frontend and query services, set the following environment variables in the [docker compose file](../../deploy/compose/docker-compose.yaml):
+```
+environment:
+  OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
+  OTEL_EXPORTER_OTLP_PROTOCOL: grpc
+  ENABLE_TRACING: true
+```
+
+5. Deploy the developer RAG example via Docker compose.
+```
+$ source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose.yaml build
+
+$ docker compose -f deploy/compose/docker-compose.yaml up -d
+
+$ docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}"
+CONTAINER ID   NAMES               STATUS
+d11e35ee69f4   llm-playground      Up 5 minutes
+68f22b3842cb   chain-server        Up 5 minutes
+751dd4fd80ec   milvus-standalone   Up 5 minutes (healthy)
+b435006c95c1   milvus-minio        Up 6 minutes (healthy)
+9108253d058d   notebook-server     Up 6 minutes
+5315a9dc9eb4   milvus-etcd         Up 6 minutes (healthy)
+```
+
+6.
Deploy the observability services
+```
+$ docker compose -f deploy/compose/docker-compose-observability.yaml build
+
+$ docker compose -f deploy/compose/docker-compose-observability.yaml up -d
+
+$ docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}"
+CONTAINER ID   NAMES               STATUS
+beb1582320d6   jaeger              Up 5 minutes
+674c7bbb367e   cassandra           Up 6 minutes
+d11e35ee69f4   llm-playground      Up 5 minutes
+68f22b3842cb   chain-server        Up 5 minutes
+751dd4fd80ec   milvus-standalone   Up 5 minutes (healthy)
+b435006c95c1   milvus-minio        Up 6 minutes (healthy)
+9108253d058d   notebook-server     Up 6 minutes
+5315a9dc9eb4   milvus-etcd         Up 6 minutes (healthy)
+d314a43074c8   otel-collector      Up 6 minutes
+```
+7. Access the Jaeger UI at `http://host-ip:16686` from your web browser.
+
+Below are screenshots showcasing trace data from the Jaeger UI.
+
+- Upload document trace:
+![upload document trace](./images/image11.png)
+- User query using knowledge base trace:
+![user query using knowledge base](./images/image12.png)
+
+## Implementation Details
+Currently two services, the frontend and the chain-server, are instrumented.
+### frontend
+The [tracing.py](../../RetrievalAugmentedGeneration/frontend/frontend/tracing.py) module in the frontend application code is responsible for instrumentation. At a high level, it does the following:
+- Sets up the OpenTelemetry configuration for the resource name (i.e., frontend), the span processor, and the context propagator
+- Provides instrumentation decorator functions (`instrumentation_wrapper` and `predict_instrumentation_wrapper`) for managing trace context across different services. These decorator functions are used with the API functions in [chat_client.py](../../RetrievalAugmentedGeneration/frontend/frontend/chat_client.py) to create new span contexts (which can then be injected into the headers of requests made to the chain server) and to log span attributes extracted from the API request.
+
+### chain-server
+The [tracing.py](../../RetrievalAugmentedGeneration/common/tracing.py) module in the chain server application code is responsible for instrumentation. At a high level, it does the following:
+- Sets up the OpenTelemetry configuration for the resource name (i.e., chain-server), the span processor, and the context propagator
+- Initializes the [LlamaIndex OpenTelemetry callback handler](../../tools/observability/llamaindex/opentelemetry_callback.py), which uses [LlamaIndex callbacks](https://docs.llamaindex.ai/en/stable/module_guides/observability/callbacks/root.html) to track various events such as LLM calls, chunking, and embedding
+- Provides an instrumentation decorator function (`instrumentation_wrapper`) for managing trace context across different services. This decorator function is used with the API functions in [server.py](../../RetrievalAugmentedGeneration/common/server.py) to extract the trace context present in requests from the frontend service and attach it to the new span created by the chain-server.
+
+**NOTE**: The instrumentation decorator function (`instrumentation_wrapper`) can be used to instrument any LlamaIndex application as long as the [LlamaIndex OpenTelemetry callback handler](../../tools/observability/llamaindex/opentelemetry_callback.py) is set as the global handler in it.
diff --git a/docs/rag/support_matrix.md b/docs/rag/support_matrix.md
index 51b58f30..cc4d9153 100644
--- a/docs/rag/support_matrix.md
+++ b/docs/rag/support_matrix.md
@@ -9,6 +9,8 @@ Llama2-70B-Chat requires about 320GB of GPU memory.
Llama2-7B-Chat AWQ quantized requires about 25GB of GPU memory.
+Nemotron-8B-Chat-SFT requires about 100GB of GPU memory. + These resources can be provided by multiple GPUs on the same machine. To perform retrieval augmentation, another model must be hosted. This model is much smaller and is called an embedding model. It is responsible for converting a sequence of words to a representation in the form of a vector of numbers. This model requires an additional 2GB of GPU memory. @@ -27,6 +29,8 @@ Llama2-13B-Chat requires about 50GB of storage. Llama2-70B-Chat requires about 150GB of storage. +Nemotron-8B-Chat-SFT requires about 50GB of storage. + The file space needed for the vector database varies by how many documents it will store. For development purposes, allocating 10 GB is plenty. You will need additionally about 60GB of storage for docker images. diff --git a/examples/5_mins_rag_no_gpu/main.py b/examples/5_mins_rag_no_gpu/main.py new file mode 100644 index 00000000..0c4207bc --- /dev/null +++ b/examples/5_mins_rag_no_gpu/main.py @@ -0,0 +1,144 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This is a simple standalone implementation showing rag pipeline using Nvidia AI Foundational models. +# It uses a simple Streamlit UI and one file implementation of a minimalistic RAG pipeline. + +############################################ +# Component #1 - Document Loader +############################################ + +import streamlit as st +import os + +st.set_page_config(layout = "wide") + +with st.sidebar: + DOCS_DIR = os.path.abspath("./uploaded_docs") + if not os.path.exists(DOCS_DIR): + os.makedirs(DOCS_DIR) + st.subheader("Add to the Knowledge Base") + with st.form("my-form", clear_on_submit=True): + uploaded_files = st.file_uploader("Upload a file to the Knowledge Base:", accept_multiple_files = True) + submitted = st.form_submit_button("Upload!") + + if uploaded_files and submitted: + for uploaded_file in uploaded_files: + st.success(f"File {uploaded_file.name} uploaded successfully!") + with open(os.path.join(DOCS_DIR, uploaded_file.name),"wb") as f: + f.write(uploaded_file.read()) + +############################################ +# Component #2 - Embedding Model and LLM +############################################ + +from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings + +# make sure to export your NVIDIA AI Playground key as NVIDIA_API_KEY! 
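+# NOTE: the chat model and embedders below call hosted NVIDIA AI Foundation endpoints,
+# so no local GPU is required; the langchain-nvidia-ai-endpoints connectors read the
+# NVIDIA_API_KEY environment variable exported above. Two embedder instances are created:
+# model_type="passage" is used when indexing documents, model_type="query" is intended for user queries.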
+llm = ChatNVIDIA(model="mixtral_8x7b") +document_embedder = NVIDIAEmbeddings(model="nvolveqa_40k", model_type="passage") +query_embedder = NVIDIAEmbeddings(model="nvolveqa_40k", model_type="query") + +############################################ +# Component #3 - Vector Database Store +############################################ + +from langchain.text_splitter import CharacterTextSplitter +from langchain.document_loaders import DirectoryLoader +from langchain.vectorstores import FAISS +import pickle + +with st.sidebar: + # Option for using an existing vector store + use_existing_vector_store = st.radio("Use existing vector store if available", ["Yes", "No"], horizontal=True) + +# Path to the vector store file +vector_store_path = "vectorstore.pkl" + +# Load raw documents from the directory +raw_documents = DirectoryLoader(DOCS_DIR).load() + + +# Check for existing vector store file +vector_store_exists = os.path.exists(vector_store_path) +vectorstore = None +if use_existing_vector_store == "Yes" and vector_store_exists: + with open(vector_store_path, "rb") as f: + vectorstore = pickle.load(f) + with st.sidebar: + st.success("Existing vector store loaded successfully.") +else: + with st.sidebar: + if raw_documents: + with st.spinner("Splitting documents into chunks..."): + text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=200) + documents = text_splitter.split_documents(raw_documents) + + with st.spinner("Adding document chunks to vector database..."): + vectorstore = FAISS.from_documents(documents, document_embedder) + + with st.spinner("Saving vector store"): + with open(vector_store_path, "wb") as f: + pickle.dump(vectorstore, f) + st.success("Vector store created and saved.") + else: + st.warning("No documents available to process!", icon="⚠️") + +############################################ +# Component #4 - LLM Response Generation and Chat +############################################ + +st.subheader("Chat with your AI Assistant, Envie!") + +if "messages" not in st.session_state: + st.session_state.messages = [] + +for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.markdown(message["content"]) + +from langchain_core.output_parsers import StrOutputParser +from langchain_core.prompts import ChatPromptTemplate + +prompt_template = ChatPromptTemplate.from_messages( + [("system", "You are a helpful AI assistant named Envie. You will reply to questions only based on the context that you are provided. 
If something is out of context, you will refrain from replying and politely decline to respond to the user."), ("user", "{input}")]
+)
+user_input = st.chat_input("Can you tell me what NVIDIA is known for?")
+llm = ChatNVIDIA(model="mixtral_8x7b")
+
+chain = prompt_template | llm | StrOutputParser()
+
+if user_input and vectorstore is not None:
+    st.session_state.messages.append({"role": "user", "content": user_input})
+    retriever = vectorstore.as_retriever()
+    docs = retriever.get_relevant_documents(user_input)
+    with st.chat_message("user"):
+        st.markdown(user_input)
+
+    context = ""
+    for doc in docs:
+        context += doc.page_content + "\n\n"
+
+    augmented_user_input = "Context: " + context + "\n\nQuestion: " + user_input + "\n"
+
+    with st.chat_message("assistant"):
+        message_placeholder = st.empty()
+        full_response = ""
+
+        for response in chain.stream({"input": augmented_user_input}):
+            full_response += response
+            message_placeholder.markdown(full_response + "▌")
+        message_placeholder.markdown(full_response)
+    st.session_state.messages.append({"role": "assistant", "content": full_response})
diff --git a/examples/5_mins_rag_no_gpu/requirements.txt b/examples/5_mins_rag_no_gpu/requirements.txt
new file mode 100644
index 00000000..a82c08db
--- /dev/null
+++ b/examples/5_mins_rag_no_gpu/requirements.txt
@@ -0,0 +1,5 @@
+streamlit==1.30.0
+langchain-nvidia-ai-endpoints==0.0.1
+faiss-cpu==1.7.4
+langchain==0.0.352
+unstructured[all-docs]==0.11.2
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 00000000..773c1585
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,37 @@
+# Standalone examples
+
+This directory contains standalone examples, each with its own separate end-to-end workflow (UI, deployment methodology and tools) showcasing a different use case.
+
+
+## RAG in 5 minutes example
+
+This is a simple standalone implementation showing a minimalistic RAG pipeline using models available in the [Nvidia AI Playground](https://catalog.ngc.nvidia.com/ai-foundation-models).
+**NVIDIA AI Foundation** lets developers experience state-of-the-art LLMs accelerated by NVIDIA. Developers get **free credits for 10K requests** to any of the available models.
+It uses [connectors available in Langchain](https://python.langchain.com/docs/integrations/providers/nvidia) to build the workflow. These open source connectors are maintained and tested by NVIDIA engineers.
+This example uses a simple [Streamlit](https://streamlit.io/) based UI and has a one-file implementation. It does not need a GPU to run.
+
+### Steps
+1. Create a Python virtual environment and activate it:
+   ```
+   python3 -m virtualenv genai
+   source genai/bin/activate
+   ```
+
+2. Go to the root of this repository, `GenerativeAIExamples`, and execute the command below to install the requirements:
+   ```
+   pip install -r examples/5_mins_rag_no_gpu/requirements.txt
+   ```
+
+3. Set your NVIDIA_API_KEY. Follow steps 1-4 mentioned [here](../docs/rag/aiplayground.md#prepare-the-environment) to get it.
+   ```
+   export NVIDIA_API_KEY="provide_your_key"
+   ```
+
+4. Run the example using Streamlit:
+   ```
+   streamlit run examples/5_mins_rag_no_gpu/main.py
+   ```
+
+5. Finally, to test the deployed example, go to `http://host-ip:8501` in a web browser. Click `Browse files` and select your knowledge source. After selecting, click the `Upload!` button to complete the ingestion process.
+
+6. You are all set now! Try out queries pertinent to the knowledge base using text from the UI.
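+
+### Pipeline at a glance
+
+For reference, below is a condensed, illustrative sketch of what `examples/5_mins_rag_no_gpu/main.py` does end to end: embed a few passages, index them in FAISS, retrieve context for a question and ask the LLM. It assumes `NVIDIA_API_KEY` is exported and reuses the same connector classes and model names as `main.py`; it is not a separate supported entry point.
+
+```
+from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
+from langchain.vectorstores import FAISS
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+
+# Embed and index a tiny toy knowledge base.
+document_embedder = NVIDIAEmbeddings(model="nvolveqa_40k", model_type="passage")
+vectorstore = FAISS.from_texts(
+    ["NVIDIA designs GPUs.", "RAG augments an LLM prompt with retrieved context."],
+    document_embedder,
+)
+
+# Retrieve the context relevant to the question and pass both to the LLM.
+question = "What does NVIDIA design?"
+docs = vectorstore.as_retriever().get_relevant_documents(question)
+context = "\n\n".join(doc.page_content for doc in docs)
+
+prompt = ChatPromptTemplate.from_messages(
+    [("system", "Answer only from the provided context."),
+     ("user", "Context: {context}\n\nQuestion: {question}")]
+)
+chain = prompt | ChatNVIDIA(model="mixtral_8x7b") | StrOutputParser()
+print(chain.invoke({"context": context, "question": question}))
+```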
diff --git a/experimental/AzureML/02.5_langchain_simple_AzureML.ipynb b/experimental/AzureML/02.5_langchain_simple_AzureML.ipynb
new file mode 100644
index 00000000..4114f479
--- /dev/null
+++ b/experimental/AzureML/02.5_langchain_simple_AzureML.ipynb
@@ -0,0 +1,370 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "77c8ac2e-eb68-4b84-85fe-3a6661eba976",
+   "metadata": {},
+   "source": [
+    "# Notebook 2.5: Document Question-Answering with LangChain and AzureML\n",
+    "This notebook demonstrates how to use LangChain to build a chatbot that references a custom knowledge-base and sends requests to a remote AzureML hosted NVIDIA Nemotron LLM. \n",
+    "\n",
+    "Before proceeding with this notebook, you must have an accessible Nemotron3-8B model hosted as an endpoint in AzureML. The Nemotron-8B models are curated by Microsoft in the ‘nvidia-ai’ Azure Machine Learning (AzureML) registry and show up on the model catalog under the NVIDIA Collection. Explore the model card to learn more about the model architecture, use-cases and limitations. \n",
+    "\n",
+    "![alt text](./images/azureml-github.gif \"Launch Nemotron3-8B LLM Endpoint\")\n",
+    "\n",
+    "Simply sending requests to the Nemotron3-8B LLM will likely not fit your needs, as it is unaware of your proprietary data. Suppose you have some text documents (PDF, blog, Notion pages, etc.) and want to ask questions related to the contents of those documents. LLMs, given their proficiency in understanding text, are a great tool for this. \n",
+    "\n",
+    "### [LangChain](https://python.langchain.com/docs/get_started/introduction)\n",
+    "[**LangChain**](https://python.langchain.com/docs/get_started/introduction) provides a simple framework for connecting LLMs to your own data sources. Since LLMs are both only trained up to a fixed point in time and do not contain knowledge that is proprietary to an Enterprise, they can't answer questions about new or proprietary knowledge. LangChain solves this problem."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "80ca0402-3e14-4414-8977-ba4617f7da74",
+   "metadata": {},
+   "source": [
+    "### Step 1: Integrate TensorRT-LLM to LangChain [*(Model I/O)*](https://python.langchain.com/docs/modules/model_io/)\n",
+    "\n",
+    "#### Custom TRT-LLM Langchain integration.\n",
+    "Langchain allows you to [create custom wrappers for your LLM](https://python.langchain.com/docs/modules/model_io/models/llms/custom_llm) in case you want to use your own LLM or a different wrapper than the one that is supported in LangChain. Since we are using a remote Nemotron-3-8B model hosted on Triton with TRT-LLM, we have written a custom wrapper for our LLM. \n",
+    "\n",
+    "Below is a snippet of the custom wrapper. 
Take a look at ```trt_llm_azureml.py``` for the full implementation.\n", + "```\n", + "class TensorRTLLM(LLM):\n", + " server_url: str = Field(None, alias=\"server_url\")\n", + "\n", + " # some of the optional arguments\n", + " model_name: str = \"ensemble\"\n", + " temperature: Optional[float] = 1.0\n", + " top_p: Optional[float] = 0\n", + "\n", + " @property\n", + " def _llm_type(self) -> str:\n", + " return \"triton_tensorrt\"\n", + "\n", + " def _call(\n", + " self,\n", + " prompt: str,\n", + " run_manager: Optional[CallbackManagerForLLMRun] = None,\n", + " **kwargs,\n", + " ) -> str:\n", + " \"\"\"\n", + " Args:\n", + " prompt: The prompt to pass into the model.\n", + " stop: A list of strings to stop generation when encountered\n", + "\n", + " Returns:\n", + " The string generated by the model\n", + " \"\"\"\n", + "\n", + "```\n", + "\n", + "A ```_call``` method that takes in a string, some optional stop words, and returns a string. Take a look at ```trt_llm_aureml.py``` for the code of LangChain wrapper for a Llama2 model deployed on Triton with TRT-LLM.\n", + "\n", + "``llm = TensorRTLLM( # type: ignore\n", + "server_url=\"tme-demo-ml-zfqjc.eastus.inference.ml.azure.com/\", model_name=\"ensemble\", tokens=500, use_ssl=True, api_key=\"\", extra_headers=extra_headers,)``\n", + "\n", + "
\n", + " \n", + "WARNING! Be sure to replace `extra_headers[\"azureml-model-deployment\"]`, `server_url`, and `api_key` with the AzureML Model Deployment, Endpoint URL, and API-KEY respectively.\n", + "\n", + "![alt text](./images/connection-info.png \"Connection Info\")\n", + "\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7c4cb93-95c7-4665-b091-7719d996acb8", + "metadata": {}, + "outputs": [], + "source": [ + "from trt_llm_azureml import TensorRTLLM\n", + "extra_headers = {}\n", + "extra_headers[\"azureml-model-deployment\"] = \"nemotron-3-8b-chat-rlhf-1\"\n", + "\n", + "# Connect to the TRT-LLM Llama-2 model running on the Triton server at the url below\n", + "llm = TensorRTLLM( # type: ignore\n", + " server_url=\"tme-demo-ml-zfqjc.eastus.inference.ml.azure.com/\",\n", + " model_name=\"ensemble\",\n", + " tokens=500,\n", + " use_ssl=True,\n", + " api_key=\"REPLACE-WITH-API-KEY\",\n", + " extra_headers=extra_headers,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "8d835a3b-a4fd-423e-b594-c8be749f4f39", + "metadata": {}, + "source": [ + "### Step 2: Create a Prompt Template [*(Model I/O)*](https://python.langchain.com/docs/modules/model_io/)\n", + "\n", + "A [**prompt template**](https://python.langchain.com/docs/modules/model_io/prompts/prompt_templates/) is a common paradigm in LLM development. \n", + "\n", + "They are a pre-defined set of instructions provided to the LLM and guide the output produced by the model. They can contain few shot examples and guidance and are a quick way to engineer the responses from the LLM. Nemotron3-8b accepts the [prompt format](https://huggingface.co/nvidia/nemotron-3-8b-chat-4k-rlhf#prompt-format) shown in `GPT_RAG_TEMPLATE`, which we manipulate to be constructed with:\n", + "- The system prompt\n", + "- The context\n", + "- The user's question\n", + "Langchain allows you to [create custom wrappers for your LLM](https://python.langchain.com/docs/modules/model_io/models/llms/custom_llm) in case you want to use your own LLM or a different wrapper than the one that is supported in LangChain. Since we are using a Nemotron3-8b model hosted in AzureML, we have written a custom wrapper for our LLM. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc894491-1239-4a71-83fb-c312a873e2c5", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.prompts import PromptTemplate\n", + "\n", + "GPT_RAG_TEMPLATE = (\n", + " \"System\\n\"\n", + " \"A chat between a curious user and an artificial intelligence assistant.\"\n", + " \"The assistant gives helpful, detailed, and polite answers to the user's questions.\\n\"\n", + " \"User\\n\"\n", + " \"Context: {context}\\n\\n\"\n", + " \"Given the above context, answer the following question: {question}\\n\"\n", + " \"Assistant\\n\"\n", + ")\n", + "\n", + "GPT_PROMPT = PromptTemplate.from_template(GPT_RAG_TEMPLATE)" + ] + }, + { + "cell_type": "markdown", + "id": "3310462b-f215-4d00-9d59-e613921bed0a", + "metadata": {}, + "source": [ + "### Step 3: Load Documents [*(Retrieval)*](https://python.langchain.com/docs/modules/data_connection/)\n", + "LangChain provides a variety of [document loaders](https://python.langchain.com/docs/integrations/document_loaders) that load various types of documents (HTML, PDF, code) from many different sources and locations (private s3 buckets, public websites).\n", + "\n", + "Document loaders load data from a source as **Documents**. A **Document** is a piece of text (the page_content) and associated metadata. Document loaders provide a ``load`` method for loading data as documents from a configured source. 
\n", + "\n", + "In this example, we use a LangChain [`UnstructuredFileLoader`](https://python.langchain.com/docs/integrations/document_loaders/unstructured_file) to load a research paper about Llama2 from Meta.\n", + "\n", + "[Here](https://python.langchain.com/docs/integrations/document_loaders) are some of the other document loaders available from LangChain." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70c92132-4c34-44fc-af28-6aa0769b006c", + "metadata": {}, + "outputs": [], + "source": [ + "! wget -O \"llama2_paper.pdf\" -nc --user-agent=\"Mozilla\" https://arxiv.org/pdf/2307.09288.pdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4382b61", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import UnstructuredFileLoader\n", + "loader = UnstructuredFileLoader(\"llama2_paper.pdf\")\n", + "data = loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "4e0449e4", + "metadata": {}, + "source": [ + "### Step 4: Transform Documents [*(Retrieval)*](https://python.langchain.com/docs/modules/data_connection/)\n", + "Once documents have been loaded, they are often transformed. One method of transformation is known as **chunking**, which breaks down large pieces of text, for example, a long document, into smaller segments. This technique is valuable because it helps [optimize the relevance of the content returned from the vector database](https://www.pinecone.io/learn/chunking-strategies/). \n", + "\n", + "LangChain provides a [variety of document transformers](https://python.langchain.com/docs/integrations/document_transformers/), such as text splitters. In this example, we use a [``SentenceTransformersTokenTextSplitter``](https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.SentenceTransformersTokenTextSplitter.html#langchain.text_splitter.SentenceTransformersTokenTextSplitter). The ``SentenceTransformersTokenTextSplitter`` is a specialized text splitter for use with the sentence-transformer models. The default behaviour is to split the text into chunks that fit the token window of the sentence transformer model that you would like to use. This sentence transformer model is used to generate the embeddings from documents. \n", + "\n", + "There are some nuanced complexities to text splitting since semantically related text, in theory, should be kept together. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21ec0438", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.text_splitter import SentenceTransformersTokenTextSplitter\n", + "TEXT_SPLITTER_MODEL = \"intfloat/e5-large-v2\"\n", + "TEXT_SPLITTER_CHUNCK_SIZE = 510\n", + "TEXT_SPLITTER_CHUNCK_OVERLAP = 200\n", + "\n", + "text_splitter = SentenceTransformersTokenTextSplitter(\n", + " model_name=TEXT_SPLITTER_MODEL,\n", + " chunk_size=TEXT_SPLITTER_CHUNCK_SIZE,\n", + " chunk_overlap=TEXT_SPLITTER_CHUNCK_OVERLAP,\n", + ")\n", + "documents = text_splitter.split_documents(data)" + ] + }, + { + "cell_type": "markdown", + "id": "183aaeeb-7461-4f58-9fc4-2a51fa723714", + "metadata": {}, + "source": [ + "Let's view a sample of content that is chunked together in the documents." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46525e4e", + "metadata": {}, + "outputs": [], + "source": [ + "documents[40].page_content" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3f580c54", + "metadata": {}, + "source": [ + "### Step 5: Generate Embeddings and Store Embeddings in the Vector Store [*(Retrieval)*](https://python.langchain.com/docs/modules/data_connection/)\n", + "\n", + "#### a) Generate Embeddings\n", + "[Embeddings](https://python.langchain.com/docs/modules/data_connection/text_embedding/) for documents are created by vectorizing the document text; this vectorization captures the semantic meaning of the text. This allows you to quickly and efficiently find other pieces of text that are similar. The embedding model used below is [intfloat/e5-large-v2](https://huggingface.co/intfloat/e5-large-v2).\n", + "\n", + "LangChain provides a wide variety of [embedding models](https://python.langchain.com/docs/integrations/text_embedding) from many providers and makes it simple to swap out the models. \n", + "\n", + "When a user sends in their query, the query is also embedded using the same embedding model that was used to embed the documents. As explained earlier, this allows to find similar (relevant) documents to the user's query. \n", + "\n", + "#### b) Store Document Embeddings in the Vector Store\n", + "Once the document embeddings are generated, they are stored in a vector store so that at query time we can:\n", + "1) Embed the user query and\n", + "2) Retrieve the embedding vectors that are most similar to the embedding query.\n", + "\n", + "A vector store takes care of storing the embedded data and performing a vector search.\n", + "\n", + "LangChain provides support for a [great selection of vector stores](https://python.langchain.com/docs/integrations/vectorstores/). \n", + "\n", + "
\n", + " \n", + "⚠️ For this workflow, [Milvus](https://milvus.io/) vector database is running as a microservice. \n", + "\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9bd8b943", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "from langchain.vectorstores import Milvus\n", + "import torch\n", + "\n", + "#In the production deployment (API server shown as part of the 5th notebook we run the model on GPU)\n", + "model_name = \"intfloat/e5-large-v2\"\n", + "model_kwargs = {\"device\": \"cpu\"} #Can run the model on GPU since LLM is remote. e.g. model_kwargs = {\"device\": \"cuda:0\"}\n", + "encode_kwargs = {\"normalize_embeddings\": False}\n", + "hf_embeddings = HuggingFaceEmbeddings(\n", + " model_name=model_name,\n", + " model_kwargs=model_kwargs,\n", + " encode_kwargs=encode_kwargs,\n", + ")\n", + "vectorstore = Milvus.from_documents(documents=documents, embedding=hf_embeddings, connection_args={\"host\": \"milvus\", \"port\": \"19530\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7fa622f", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Simple Example: Retrieve Documents from the Vector Database\n", + "# note: this is just for demonstration purposes of a similarity search\n", + "question = \"Can you talk about safety evaluation of llama2 chat?\"\n", + "docs = vectorstore.similarity_search(question)\n", + "print(docs[2].page_content)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f6960255", + "metadata": {}, + "source": [ + " > ### Simple Example: Retrieve Documents from the Vector Database [*(Retrieval)*](https://python.langchain.com/docs/modules/data_connection/)\n", + ">Given a user query, relevant splits for the question are returned through a **similarity search**. This is also known as a semantic search, and it is done with meaning. It is different from a lexical search, where the search engine looks for literal matches of the query words or variants of them, without understanding the overall meaning of the query. A semantic search tends to generate more relevant results than a lexical search." + ] + }, + { + "cell_type": "markdown", + "id": "9c8148dc", + "metadata": {}, + "source": [ + "### Step 6: Compose a streamed answer using a Chain\n", + "We have already integrated the AzureML hosted Nemotron3-8b LLM into LangChain with a custom wrapper, loaded and transformed documents, and generated and stored document embeddings in a vector database. To finish the pipeline, we need to add a few more LangChain components and combine all the components together with a [chain](https://python.langchain.com/docs/modules/chains/).\n", + "\n", + "A [LangChain chain](https://python.langchain.com/docs/modules/chains/) combines components together. In this case, we use a [RetrievalQA chain](https://js.langchain.com/docs/modules/chains/popular/vector_db_qa/), which is a chain type for question-answering against a vector index. It combines a *Retriever* and a *question answering (QA) chain*.\n", + "\n", + "We pass it 3 of our LangChain components:\n", + "- Our instance of the LLM (from step 1).\n", + "- A [retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/), which is an interface that returns documents given an unstructured query. 
In this case, we use our vector store as the retriever.\n", + "- Our prompt template constructed from the prompt format for Llama2 (from step 2)\n", + "\n", + "```\n", + "qa_chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " retriever=vectorstore.as_retriever(),\n", + " chain_type_kwargs={\"prompt\": GPT_PROMPT}\n", + ")\n", + "```\n", + "\n", + "Lastly, we pass a user query to the chain and stream the result. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69de32a0", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains import RetrievalQA\n", + "\n", + "qa_chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " retriever=vectorstore.as_retriever(),\n", + " chain_type_kwargs={\"prompt\": GPT_PROMPT}\n", + ")\n", + "result = qa_chain({\"query\": question})\n", + "print(result[\"result\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/experimental/AzureML/README.md b/experimental/AzureML/README.md new file mode 100644 index 00000000..c8a65d3e --- /dev/null +++ b/experimental/AzureML/README.md @@ -0,0 +1,41 @@ +# NVIDIA Generative AI with AzureML Example + +## Introduction +This example shows how to modify the canonical RAG example to use a remote NVIDIA Nemotron-8B LLM hosted in AzureML. A custom LangChain connector is used to instantiate the LLM from within a sample notebook. + +### Setup Guide +1. Comment out the `llm`, `query` and `frontend` services from the [docker compose file](../../deploy/compose/docker-compose.yaml) since we will be using a notebook server and milvus vector DB server for this flow. +2. Build and deploy the services using the modified compose file + ``` + $ source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose.yaml build + $ docker compose -f deploy/compose/docker-compose.yaml up -d + + $ docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}" + CONTAINER ID NAMES STATUS + 4a8c4aebe4ad notebook-server Up 1 minutes + 5be2b57bb5c1 milvus-standalone Up 1 minutes (healthy) + a6609c22c171 milvus-minio Up 1 minutes (healthy) + b23c0858c4d4 milvus-etcd Up 1 minutes (healthy) + ``` +3. Upload the `02.5_langchain_simple_AzureML.ipynb` and `trt_llm_azureml.py` files from this directory into the Jupyter environment by going to the URL ``http://host-ip:8888``. +4. Follow the steps mentioned in `02.5_langchain_simple_AzureML.ipynb` after uploading it to Jupyter Lab environment. + +The Nemotron-8B models are curated by Microsoft in the ‘nvidia-ai’ Azure Machine Learning (AzureML) registry and show up on the model catalog under the NVIDIA Collection. Explore the model card to learn more about the model architecture, use-cases and limitations. + +## Large Language Models +NVIDIA LLMs are optimized for building enterprise generative AI applications. 
+ +| Name | Description | Type | Context Length | Example | License | +|---------------|-----------------------|------------|----------------|---------|---------| +| [nemotron-3-8b-qa-4k](https://huggingface.co/nvidia/nemotron-3-8b-qa-4k) | Q&A LLM customized on knowledge bases | Text Generation | 4096 | No | [NVIDIA AI Foundation Models Community License Agreement](https://developer.nvidia.com/downloads/nv-ai-foundation-models-license) | +| [nemotron-3-8b-chat-4k-steerlm](https://huggingface.co/nvidia/nemotron-3-8b-chat-4k-steerlm) | Best out-of-the-box chat model with flexible alignment at inference | Text Generation | 4096 | No | [NVIDIA AI Foundation Models Community License Agreement](https://developer.nvidia.com/downloads/nv-ai-foundation-models-license) | +| [nemotron-3-8b-chat-4k-rlhf](https://huggingface.co/nvidia/nemotron-3-8b-chat-4k-rlhf) | Best out-of-the-box chat model performance| Text Generation | 4096 | No | [NVIDIA AI Foundation Models Community License Agreement](https://developer.nvidia.com/downloads/nv-ai-foundation-models-license) | +| [nemotron-3-8b-chat-sft](https://huggingface.co/nvidia/nemotron-3-8b-chat-4k-sft) | building block for instruction tuning custom models, user-defined alignment, such as RLHF or SteerLM models. | Text Generation | 4096 | No | [NVIDIA AI Foundation Models Community License Agreement](https://developer.nvidia.com/downloads/nv-ai-foundation-models-license) | +| [nemotron-3-8b-base-4k](https://huggingface.co/nvidia/nemotron-3-8b-base-4k) | enables customization, including parameter-efficient fine-tuning and continuous pre-training for domain-adapted LLMs | Text Generation | 4096 | No | [NVIDIA AI Foundation Models Community License Agreement](https://developer.nvidia.com/downloads/nv-ai-foundation-models-license) | + + +## NVIDIA support +This example is experimental and the workflow may not be streamlined with other examples in this repository. + +## Feedback / Contributions +We're posting these examples on GitHub to better support the community, facilitate feedback, as well as collect and implement contributions using GitHub Issues and pull requests. We welcome all contributions! \ No newline at end of file diff --git a/experimental/AzureML/images/azureml-github.gif b/experimental/AzureML/images/azureml-github.gif new file mode 100644 index 00000000..8586e72f Binary files /dev/null and b/experimental/AzureML/images/azureml-github.gif differ diff --git a/experimental/AzureML/images/connection-info.png b/experimental/AzureML/images/connection-info.png new file mode 100644 index 00000000..c63314d4 Binary files /dev/null and b/experimental/AzureML/images/connection-info.png differ diff --git a/experimental/AzureML/trt_llm_azureml.py b/experimental/AzureML/trt_llm_azureml.py new file mode 100644 index 00000000..f56c0dce --- /dev/null +++ b/experimental/AzureML/trt_llm_azureml.py @@ -0,0 +1,362 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +"""A Langchain LLM component for connecting to Triton + TensorRT LLM backend for AzureML hosted endpoints.""" + +# pylint: disable=too-many-lines +import time +from functools import partial +from typing import Any, Callable, Dict, List, Optional, Type + +import gevent.ssl +import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import np_to_triton_dtype + +try: + from langchain.callbacks.manager import CallbackManagerForLLMRun + from langchain.llms.base import LLM + from langchain.pydantic_v1 import Field, root_validator + + USE_LANGCHAIN = True +except ImportError: + USE_LANGCHAIN = False + + +STOP_WORDS = ["
"] +RANDOM_SEED = 0 + +if USE_LANGCHAIN: + # pylint: disable-next=too-few-public-methods # Interface is defined by LangChain + class TensorRTLLM(LLM): # type: ignore # LLM class not typed in langchain + """A custom Langchain LLM class that integrates with TRTLLM triton models. + + Arguments: + server_url: (str) The URL of the Triton inference server to use. + model_name: (str) The name of the Triton TRT model to use. + temperature: (str) Temperature to use for sampling + top_p: (float) The top-p value to use for sampling + top_k: (float) The top k values use for sampling + beam_width: (int) Last n number of tokens to penalize + repetition_penalty: (int) Last n number of tokens to penalize + length_penalty: (float) The penalty to apply repeated tokens + tokens: (int) The maximum number of tokens to generate. + client: The client object used to communicate with the inference server + """ + + server_url: str = Field(None, alias="server_url") + + # # all the optional arguments + model_name: str = "ensemble" + temperature: Optional[float] = 1.0 + top_p: Optional[float] = 0 + top_k: Optional[int] = 1 + tokens: Optional[int] = 100 + beam_width: Optional[int] = 1 + repetition_penalty: Optional[float] = 1.0 + length_penalty: Optional[float] = 1.0 + client: Any + api_key: Optional[str] = None + use_ssl = False + extra_headers: Dict[str, str] = {} + + @root_validator() # type: ignore # typing not declared in langchain + @classmethod + def validate_environment(cls, values: Dict[str, Any]) -> Dict[str, Any]: + """Validate that python package exists in environment.""" + try: + values["client"] = HttpTritonClient( + values["server_url"], + values["use_ssl"], + values["api_key"], + **values["extra_headers"], + ) + + except ImportError as err: + raise ImportError( + "Could not import triton client python package. " + "Please install it with `pip install tritonclient[all]`." + ) from err + return values + + @property + def _get_model_default_parameters(self) -> Dict[str, Any]: + return { + "tokens": self.tokens, + "top_k": self.top_k, + "top_p": self.top_p, + "temperature": self.temperature, + "repetition_penalty": self.repetition_penalty, + "length_penalty": self.length_penalty, + "beam_width": self.beam_width, + } + + @property + def _invocation_params(self, **kwargs: Any) -> Dict[str, Any]: + params = {**self._get_model_default_parameters, **kwargs} + return params + + @property + def _identifying_params(self) -> Dict[str, Any]: + """Get all the identifying parameters.""" + return { + "server_url": self.server_url, + "model_name": self.model_name, + } + + @property + def _llm_type(self) -> str: + return "triton_tensorrt" + + def _call( + self, + prompt: str, + stop: Optional[List[str]] = None, # pylint: disable=unused-argument + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> str: + """ + Execute an inference request. + + Args: + prompt: The prompt to pass into the model. 
+ stop: A list of strings to stop generation when encountered + + Returns: + The string generated by the model + """ + text_callback = None + if run_manager: + text_callback = partial( + run_manager.on_llm_new_token, verbose=self.verbose + ) + + invocation_params = self._get_model_default_parameters + invocation_params.update(kwargs) + invocation_params["prompt"] = [[prompt]] + model_params = self._identifying_params + model_params.update(kwargs) + + #self.client.load_model(model_params["model_name"]) + return self._request(model_params, invocation_params, text_callback) + + def _streaming_request( + self, + model_params: Dict[str, Any], + request_id: str, + invocation_params: Dict[str, Any], + text_callback: Optional[Callable[[str], None]], + ) -> str: + """Request a streaming inference session.""" + result_queue = self.client.request_streaming( + model_params["model_name"], request_id, **invocation_params + ) + + response = "" + for token in result_queue: + if text_callback: + text_callback(token) + response = response + token + return response + + def _request( + self, + model_params: Dict[str, Any], + invocation_params: Dict[str, Any], + text_callback: Optional[Callable[[str], None]], + ) -> str: + """Request a streaming inference session.""" + token: str = self.client.request( + model_params["model_name"], **invocation_params + ) + if text_callback: + text_callback(token) + return token + + +class HttpTritonClient: + """HTTP connection to a triton inference server.""" + + def __init__( + self, + server_url: str, + use_ssl: Optional[bool] = False, + api_key: Optional[str] = None, + **extra_headers, + ) -> None: + """Initialize the client.""" + self._server_url = server_url + + use_ssl = use_ssl or False # ensure use ssl is a bool and not None + # pylint: disable-next=no-member ; false positive + ssl_factory = gevent.ssl._create_default_https_context if use_ssl else None + self._client: httpclient.InferenceServerClient = self._inference_server_client( + server_url, + ssl=use_ssl, + ssl_context_factory=ssl_factory, + ) + self._headers = { + "Content-Type": "application/json", + } + if api_key: + self._headers["Authorization"] = "Bearer " + api_key + if extra_headers: + self._headers.update(extra_headers) + + @property + def _inference_server_client( + self, + ) -> Type[httpclient.InferenceServerClient]: + """Return the prefered InferenceServerClient class.""" + return httpclient.InferenceServerClient # type: ignore + + @property + def _infer_input(self) -> Type[httpclient.InferInput]: + """Return the preferred InferInput.""" + return httpclient.InferInput # type: ignore + + @property + def _infer_output( + self, + ) -> Type[httpclient.InferRequestedOutput]: + """Return the preferred InferRequestedOutput.""" + return httpclient.InferRequestedOutput # type: ignore + + def load_model(self, model_name: str, timeout: int = 1000) -> None: + """Load a model into the server.""" + if self._client.is_model_ready(model_name, "1", headers=self._headers): + return + + #self._client.load_model(model_name, headers=self._headers) + t0 = time.perf_counter() + t1 = t0 + while ( + not self._client.is_model_ready(model_name, headers=self._headers) + and t1 - t0 < timeout + ): + t1 = time.perf_counter() + + if not self._client.is_model_ready(model_name, headers=self._headers): + raise RuntimeError(f"Failed to load {model_name} on Triton in {timeout}s") + + def get_model_list(self) -> List[str]: + """Get a list of models loaded in the triton server.""" + res = 
self._client.get_model_repository_index(headers=self._headers) + return [model["name"] for model in res["models"]] + + def get_model_concurrency(self, model_name: str, timeout: int = 1000) -> int: + """Get the modle concurrency.""" + self.load_model(model_name, timeout) + instances = self._client.get_model_config(model_name, headers=self._headers)[ + "config" + ]["instance_group"] + return sum(instance["count"] * len(instance["gpus"]) for instance in instances) + + def _generate_outputs( + self, + ) -> List[httpclient.InferRequestedOutput]: + """Generate the expected output structure.""" + return [self._infer_output("text_output")] + + def _prepare_tensor(self, name: str, input_data: Any) -> httpclient.InferInput: + """Prepare an input data structure.""" + t = self._infer_input( + name, input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + t.set_data_from_numpy(input_data) + return t + + def _generate_inputs( # pylint: disable=too-many-arguments,too-many-locals + self, + prompt: str, + tokens: int = 300, + temperature: float = 1.0, + top_k: float = 1, + top_p: float = 0, + beam_width: int = 1, + repetition_penalty: float = 1, + length_penalty: float = 1.0, + stream: bool = False, + ) -> List[httpclient.InferInput]: + """Create the input for the triton inference server.""" + query = np.array(prompt).astype(object) + request_output_len = np.array([tokens]).astype(np.uint32).reshape((1, -1)) + runtime_top_k = np.array([top_k]).astype(np.uint32).reshape((1, -1)) + runtime_top_p = np.array([top_p]).astype(np.float32).reshape((1, -1)) + temperature_array = np.array([temperature]).astype(np.float32).reshape((1, -1)) + len_penalty = np.array([length_penalty]).astype(np.float32).reshape((1, -1)) + repetition_penalty_array = ( + np.array([repetition_penalty]).astype(np.float32).reshape((1, -1)) + ) + random_seed = np.array([RANDOM_SEED]).astype(np.uint64).reshape((1, -1)) + beam_width_array = np.array([beam_width]).astype(np.uint32).reshape((1, -1)) + streaming_data = np.array([[stream]], dtype=bool) + + inputs = [ + self._prepare_tensor("text_input", query), + self._prepare_tensor("max_tokens", request_output_len), + self._prepare_tensor("top_k", runtime_top_k), + self._prepare_tensor("top_p", runtime_top_p), + self._prepare_tensor("temperature", temperature_array), + self._prepare_tensor("length_penalty", len_penalty), + self._prepare_tensor("repetition_penalty", repetition_penalty_array), + self._prepare_tensor("random_seed", random_seed), + self._prepare_tensor("beam_width", beam_width_array), + self._prepare_tensor("stream", streaming_data), + ] + return inputs + + def _trim_batch_response(self, result_str: str) -> str: + """Trim the resulting response from a batch request by removing provided prompt and extra generated text.""" + # extract the generated part of the prompt + assistant_block = False + generated = [] + for line in result_str.split("\n"): + if assistant_block: + if line == "User": + break + generated += [line] + continue + + if line == "Assistant": + assistant_block = True + + return "\n".join(generated).strip() + + def request( + self, + model_name: str, + **params: Any, + ) -> str: + """Request inferencing from the triton server.""" + if not self._client.is_model_ready(model_name, headers=self._headers): + raise RuntimeError("Cannot request streaming, model is not loaded") + + # create model inputs and outputs + inputs = self._generate_inputs(stream=False, **params) + #outputs = self._generate_outputs() + + # call the model for inference + result = self._client.infer( + 
model_name, inputs=inputs, headers=self._headers + ) + result_str = "".join( + [val.decode("utf-8") for val in result.as_numpy("text_output").tolist()] + ) + + # extract the generated part of the prompt + # return result_str + return self._trim_batch_response(result_str) diff --git a/notebooks/00-llm-non-streaming-nemotron.ipynb b/notebooks/00-llm-non-streaming-nemotron.ipynb new file mode 100644 index 00000000..e41955ca --- /dev/null +++ b/notebooks/00-llm-non-streaming-nemotron.ipynb @@ -0,0 +1,124 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "a18bc7fc-c0bb-498d-8ac2-56b513645357", + "metadata": {}, + "outputs": [], + "source": [ + "from triton_trt_llm import HttpTritonClient" + ] + }, + { + "cell_type": "markdown", + "id": "6917492a-b287-422a-a787-38fd862339f8", + "metadata": {}, + "source": [ + "#### Step 1: Structure the Query in a Prompt Template" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bd40ebb-45bb-4a4b-b7b4-b6c726cd8ea7", + "metadata": {}, + "outputs": [], + "source": [ + "NEMOTRON_PROMPT_TEMPLATE = (\n", + " \"\"\"System\n", + "{system}\n", + "User\n", + "{prompt}\n", + "Assistant\n", + "\"\"\"\n", + ")\n", + "system = \"You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature.\"\n", + "prompt = 'What is the fastest land animal?'\n", + "prompt = NEMOTRON_PROMPT_TEMPLATE.format(prompt=prompt, system=system)" + ] + }, + { + "cell_type": "markdown", + "id": "c0e7ec48-3865-4811-a175-6a6142240da2", + "metadata": {}, + "source": [ + "#### Step 2: Create the Triton Client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b8476aa-6364-408a-840a-a8afb83ea08e", + "metadata": {}, + "outputs": [], + "source": [ + "triton_url = \"llm:8000\"\n", + "client = HttpTritonClient(triton_url)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9c89aca-1eae-4afc-b520-ed14ebef2606", + "metadata": {}, + "outputs": [], + "source": [ + "pload = {\n", + " 'prompt':[[prompt]],\n", + " 'tokens':64,\n", + " 'temperature':1.0,\n", + " 'top_k':1,\n", + " 'top_p':0,\n", + " 'beam_width':1,\n", + " 'repetition_penalty':1.0,\n", + " 'length_penalty':1.0\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "0c32f8d8-f383-4faf-997a-1c8412412a5a", + "metadata": {}, + "source": [ + "#### Step 3: Load the Model and Generate Response" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b02689bd-36d6-41fe-8636-9deeb52bc68f", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "model_name = \"ensemble\"\n", + "client.load_model(model_name)\n", + "val = client.request(model_name, **pload)\n", + "print(val)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/01-llm-streaming-client.ipynb b/notebooks/01-llm-streaming-client.ipynb index 7168e1cc..13c5e9c1 100644 --- a/notebooks/01-llm-streaming-client.ipynb +++ b/notebooks/01-llm-streaming-client.ipynb @@ -9,9 +9,9 @@ "This notebook demonstrates how to stream responses from the LLM. 
\n", "\n", "### Triton Inference Server\n", - "The LLM has been deployed to [NVIDIA Triton Inference Server](https://developer.nvidia.com/triton-inference-server) and leverages NVIDIA TensorRT-LLM (TRT-LLM), so it's optimized for low latency and high throughput inference. \n", + "The LLM has been deployed to [NVIDIA Triton Inference Server](https://developer.nvidia.com/triton-inference-server) and leverages NVIDIA TensorRT-LLM (TRT-LLM), so it's optimized for low latency and high throughput inference.\n", "\n", - "The **Triton client** is used to communicate with the inference server hosting the LLM. \n", + "The **Triton client** is used to communicate with the inference server hosting the LLM and is available in [Langchain](https://github.com/langchain-ai/langchain/tree/master/libs/partners/nvidia-trt). \n", "\n", "### Streaming LLM Responses\n", "TRT-LLM on its own can provide drastic improvements to LLM response latency, but streaming can take the user-experience to the next level. Instead of waiting for an entire response to be returned from the LLM, chunks of it can be processed as soon as they are available. This helps reduce the perceived latency by the user. " @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "id": "42a2f2cb", "metadata": {}, "outputs": [], @@ -81,20 +81,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "id": "5670011e-f52b-4c16-be4d-8a782b622541", "metadata": {}, "outputs": [], "source": [ - "from triton_trt_llm import GrpcTritonClient\n", + "from langchain_nvidia_trt.llms import TritonTensorRTLLM\n", "\n", "triton_url = \"llm:8001\"\n", - "client = GrpcTritonClient(triton_url)" + "pload = {\n", + " 'tokens':300,\n", + " 'server_url': triton_url,\n", + " 'model_name': \"ensemble\",\n", + " 'temperature':1.0,\n", + " 'top_k':1,\n", + " 'top_p':0,\n", + " 'beam_width':1,\n", + " 'repetition_penalty':1.0,\n", + " 'length_penalty':1.0\n", + "}\n", + "client = TritonTensorRTLLM(**pload)" ] }, { "cell_type": "markdown", - "id": "eec1fda2-e974-4c7d-b656-0b81e452cefb", + "id": "03676629-33d8-46d8-b5fc-557d526609b4", "metadata": {}, "source": [ "Additional inputs to the LLM can be modified:\n", @@ -106,25 +117,6 @@ "- length_penalty: 1 means no penalty for length of generation" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "6907e68d-8ab9-4f94-b6a0-825446261bfc", - "metadata": {}, - "outputs": [], - "source": [ - "pload = {\n", - " 'prompt':[[prompt]],\n", - " 'tokens':300,\n", - " 'temperature':1.0,\n", - " 'top_k':1,\n", - " 'top_p':0,\n", - " 'beam_width':1,\n", - " 'repetition_penalty':1.0,\n", - " 'length_penalty':1.0\n", - "}" - ] - }, { "cell_type": "markdown", "id": "c526b20b-258a-4eb7-87e6-5430d57e32ea", @@ -136,18 +128,17 @@ { "cell_type": "code", "execution_count": null, - "id": "d389f26d-e68d-4466-a7cb-89e62683af0b", + "id": "274e4164-7460-4471-8625-90562237cf11", "metadata": {}, "outputs": [], "source": [ - "model_name = \"ensemble\"\n", - "client.load_model(model_name)\n", "import time\n", "import random\n", "\n", "start_time = time.time()\n", "tokens_generated = 0\n", - "for val in client.request_streaming(model_name, request_id = str(random.getrandbits(64)), **pload):\n", + "\n", + "for val in client.stream(prompt):\n", " tokens_generated += 1\n", " print(val, end=\"\", flush=True)\n", "\n", diff --git a/notebooks/02_langchain_simple.ipynb b/notebooks/02_langchain_simple.ipynb index e4415b22..8b5cc548 100644 --- a/notebooks/02_langchain_simple.ipynb +++ 
b/notebooks/02_langchain_simple.ipynb @@ -34,73 +34,47 @@ "id": "9dd510db", "metadata": {}, "source": [ - "### Step 1: Integrate TensorRT-LLM to LangChain [*(Model I/O)*](https://python.langchain.com/docs/modules/model_io/)\n", - "\n", - "#### Custom TRT-LLM Langchain integration.\n", - "Langchain allows you to [create custom wrappers for your LLM](https://python.langchain.com/docs/modules/model_io/models/llms/custom_llm) in case you want to use your own LLM or a different wrapper than the one that is supported in LangChain. Since we are using a custom Llama2 model hosted on Triton with TRT-LLM, we have written a custom wrapper for our LLM. \n", - "\n", - "Below is a snippet of the custom wrapper. Take a look at ```triton_trt_llm.py``` under `integrations/langchain/llms` of the project root for the full implementation.\n", - "```\n", - "class TensorRTLLM(LLM):\n", - " server_url: str = Field(None, alias=\"server_url\")\n", - "\n", - " # some of the optional arguments\n", - " model_name: str = \"ensemble\"\n", - " temperature: Optional[float] = 1.0\n", - " top_p: Optional[float] = 0\n", - "\n", - " @property\n", - " def _llm_type(self) -> str:\n", - " return \"triton_tensorrt\"\n", - "\n", - " def _call(\n", - " self,\n", - " prompt: str,\n", - " run_manager: Optional[CallbackManagerForLLMRun] = None,\n", - " **kwargs,\n", - " ) -> str:\n", - " \"\"\"\n", - " Args:\n", - " prompt: The prompt to pass into the model.\n", - " stop: A list of strings to stop generation when encountered\n", - "\n", - " Returns:\n", - " The string generated by the model\n", - " \"\"\"\n", - "\n", - "```\n", - "\n", - "A ```_call``` method that takes in a string, some optional stop words, and returns a string. Take a look at ```triton_trt_llm.py``` for the code of LangChain wrapper for a Llama2 model deployed on Triton with TRT-LLM.\n", - "\n", - "``llm = TensorRTLLM(server_url =\"triton_ip:8001\", model_name=\"ensemble\", callbacks=callbacks, tokens=500)``\n", - "\n", - "
\n", - " \n", - "WARNING! Be sure to replace server_url with the address and port of the Triton server hosting the LLM. If you are running this as part of the generative AI Workflow, you don't have to replace the Triton url.\n", - "\n", - "
" + "### Step 1: Integrate TensorRT-LLM to LangChain [*(Connector)*](https://docs.llamaindex.ai/en/stable/examples/llm/nvidia_tensorrt.html)" ] }, { "cell_type": "code", "execution_count": null, + "id": "f7c4cb93-95c7-4665-b091-7719d996acb8", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from langchain_nvidia_trt.llms import TritonTensorRTLLM\n", + "\n", + "# Connect to the TRT-LLM Llama-2 model running on the Triton server at the url below\n", + "# Replace \"llm\" with the url of the system where llama2 is hosted\n", + "triton_url = \"llm:8001\"\n", + "pload = {\n", + " 'tokens':500,\n", + " 'server_url': triton_url,\n", + " 'model_name': \"ensemble\"\n", + "}\n", + "llm = TritonTensorRTLLM(**pload)" + ] + }, + { + "cell_type": "markdown", + "id": "552c7fc8", + "metadata": {}, + "source": [ + "#### Note: Follow this step for nemotron models\n", + "1. In case you have deployed a trt-llm optimized nemotron model following steps [here](../RetrievalAugmentedGeneration/README.md#6-qa-chatbot----nemotron-model), execute the cell below by uncommenting the lines. Here we use a custom wrapper for talking with the model server." + ] }, { "cell_type": "code", "execution_count": null, - "id": "f7c4cb93-95c7-4665-b091-7719d996acb8", + "id": "4f125aea", "metadata": {}, "outputs": [], "source": [ - "from triton_trt_llm import TensorRTLLM\n", - "from langchain.callbacks import streaming_stdout\n", - "\n", - "callbacks = [streaming_stdout.StreamingStdOutCallbackHandler()]\n", - "# Connect to the TRT-LLM Llama-2 model running on the Triton server at the url below\n", - "llm = TensorRTLLM(server_url =\"llm:8001\", model_name=\"ensemble\", callbacks=callbacks, tokens=500)" + "# from triton_trt_llm import TensorRTLLM\n", + "# llm = TensorRTLLM(server_url =\"llm:8000\", model_name=\"ensemble\", tokens=500, streaming=False)" ] }, { @@ -198,12 +172,12 @@ "import time\n", "from langchain.text_splitter import SentenceTransformersTokenTextSplitter\n", "TEXT_SPLITTER_MODEL = \"intfloat/e5-large-v2\"\n", - "TEXT_SPLITTER_CHUNCK_SIZE = 510\n", + "TEXT_SPLITTER_TOKENS_PER_CHUNK = 510\n", "TEXT_SPLITTER_CHUNCK_OVERLAP = 200\n", "\n", "text_splitter = SentenceTransformersTokenTextSplitter(\n", " model_name=TEXT_SPLITTER_MODEL,\n", - " chunk_size=TEXT_SPLITTER_CHUNCK_SIZE,\n", + " tokens_per_chunk=TEXT_SPLITTER_TOKENS_PER_CHUNK,\n", " chunk_overlap=TEXT_SPLITTER_CHUNCK_OVERLAP,\n", ")\n", "start_time = time.time()\n", @@ -320,24 +294,11 @@ "metadata": {}, "source": [ "### Step 6: Compose a streamed answer using a Chain\n", - "We have already integrated the Llama2 TRT LLM into LangChain with a custom wrapper, loaded and transformed documents, and generated and stored document embeddings in a vector database. To finish the pipeline, we need to add a few more LangChain components and combine all the components together with a [chain](https://python.langchain.com/docs/modules/chains/).\n", + "We have already integrated the Llama2 TRT LLM with the help of LangChain connector, loaded and transformed documents, and generated and stored document embeddings in a vector database. To finish the pipeline, we need to add a few more LangChain components and combine all the components together with a [chain](https://python.langchain.com/docs/modules/chains/).\n", "\n", - "A [LangChain chain](https://python.langchain.com/docs/modules/chains/) combines components together. 
In this case, we use a [RetrievalQA chain](https://js.langchain.com/docs/modules/chains/popular/vector_db_qa/), which is a chain type for question-answering against a vector index. It combines a *Retriever* and a *question answering (QA) chain*.\n", - "\n", - "We pass it 3 of our LangChain components:\n", - "- Our instance of the LLM (from step 1).\n", - "- A [retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/), which is an interface that returns documents given an unstructured query. In this case, we use our vector store as the retriever.\n", - "- Our prompt template constructed from the prompt format for Llama2 (from step 2)\n", - "\n", - "```\n", - "qa_chain = RetrievalQA.from_chain_type(\n", - " llm,\n", - " retriever=vectorstore.as_retriever(),\n", - " chain_type_kwargs={\"prompt\": LLAMA_PROMPT}\n", - ")\n", - "```\n", + "A [LangChain chain](https://python.langchain.com/docs/modules/chains/) combines components together. In this case, we use [Langchain Expression Language](https://python.langchain.com/docs/expression_language/why) to build a chain.\n", "\n", - "Lastly, we pass a user query to the chain and stream the result. " + "We formulate the prompt placeholders (context and question) and pipe it to our trt-llm connector as shown below and finally stream the result." ] }, { @@ -347,16 +308,17 @@ "metadata": {}, "outputs": [], "source": [ - "from langchain.chains import RetrievalQA\n", + "from langchain_core.runnables import RunnablePassthrough\n", "import time\n", "\n", - "qa_chain = RetrievalQA.from_chain_type(\n", - " llm,\n", - " retriever=vectorstore.as_retriever(),\n", - " chain_type_kwargs={\"prompt\": LLAMA_PROMPT}\n", + "chain = (\n", + " {\"context\": vectorstore.as_retriever(), \"question\": RunnablePassthrough()}\n", + " | LLAMA_PROMPT\n", + " | llm\n", ")\n", "start_time = time.time()\n", - "result = qa_chain({\"query\": question})\n", + "for token in chain.stream(question):\n", + " print(token, end=\"\", flush=True)\n", "print(f\"\\n--- {time.time() - start_time} seconds ---\")" ] } diff --git a/notebooks/03_llama_index_simple.ipynb b/notebooks/03_llama_index_simple.ipynb index 3c067663..4657b4d7 100644 --- a/notebooks/03_llama_index_simple.ipynb +++ b/notebooks/03_llama_index_simple.ipynb @@ -38,7 +38,7 @@ "source": [ "### Step 1: Integrate TensorRT-LLM to LangChain *and* LlamaIndex\n", "#### Customized LangChain LLM in LlamaIndex\n", - "As noted in the previous notebook, Langchain allows you to [create custom wrappers for your LLM](https://python.langchain.com/docs/modules/model_io/models/llms/custom_llm) in case you want to use your own LLM or a different wrapper than the one that is supported in LangChain. Since we are using a custom Llama2 model hosted on Triton with TRT-LLM, we have written a custom wrapper for our LLM. \n", + "Langchain allows you to create custom wrappers for your LLM in case you want to use your own LLM or a different wrapper than the one that is supported in LangChain. Since we are using LlamaIndex, we have written a custom langchain wrapper compatible with LlamaIndex.\n", "\n", "We can easily take a custom LLM that has been wrapped for LangChain and plug it into [LlamaIndex as an LLM](https://docs.llamaindex.ai/en/stable/understanding/using_llms/using_llms.html#using-llms)! We use the [LlamaIndex LangChainLLM library](https://gpt-index.readthedocs.io/en/latest/api_reference/llms/langchain.html) so the LangChain LLM can be used in LlamaIndex. 
\n", "\n", @@ -206,12 +206,12 @@ "\n", "\n", "TEXT_SPLITTER_MODEL = \"intfloat/e5-large-v2\"\n", - "TEXT_SPLITTER_CHUNCK_SIZE = 510\n", + "TEXT_SPLITTER_TOKENS_PER_CHUNK = 510\n", "TEXT_SPLITTER_CHUNCK_OVERLAP = 200\n", "\n", "text_splitter = SentenceTransformersTokenTextSplitter(\n", " model_name=TEXT_SPLITTER_MODEL,\n", - " chunk_size=TEXT_SPLITTER_CHUNCK_SIZE,\n", + " tokens_per_chunk=TEXT_SPLITTER_TOKENS_PER_CHUNK,\n", " chunk_overlap=TEXT_SPLITTER_CHUNCK_OVERLAP,\n", ")\n", "\n", diff --git a/notebooks/04_llamaindex_hier_node_parser.ipynb b/notebooks/04_llamaindex_hier_node_parser.ipynb index 5f36ec3d..640188f8 100644 --- a/notebooks/04_llamaindex_hier_node_parser.ipynb +++ b/notebooks/04_llamaindex_hier_node_parser.ipynb @@ -316,13 +316,14 @@ "source": [ "from typing import Callable, Optional\n", "\n", - "from llama_index.utils import globals_helper\n", + "from llama_index.utils import globals_helper, get_tokenizer\n", "from llama_index.schema import MetadataMode\n", "\n", "class LimitRetrievedNodesLength:\n", "\n", " def __init__(self, limit: int = 2500, tokenizer: Optional[Callable] = None):\n", - " self._tokenizer = tokenizer or globals_helper.tokenizer\n", + " self._tokenizer = tokenizer or get_tokenizer()\n", + "\n", " self.limit = limit\n", "\n", " def postprocess_nodes(self, nodes, query_bundle):\n", diff --git a/notebooks/06_AI_playground.ipynb b/notebooks/06_AI_playground.ipynb deleted file mode 100644 index 94c401b5..00000000 --- a/notebooks/06_AI_playground.ipynb +++ /dev/null @@ -1,276 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "65d4e77f-cc3d-402d-987b-a9c361ce99e1", - "metadata": {}, - "source": [ - "# Notebook 6: RAG with NVIDIA AI Playground and Langchain\n", - "\n", - "**NVIDIA AI Playground** on NGC allows developers to experience state of the art LLMs accelerated on NVIDIA DGX Cloud with NVIDIA TensorRT nd Triton Inference Server. Developers get **free credits for 10K requests** to any of the available models. Sign up process is easy. Follow the instructions here \n", - "\n", - "This notebook demonstrates how to use LangChain and NVIDIA AI Playground to build a chatbot that references a custom knowledge-base. \n", - "\n", - "Suppose you have some text documents (PDF, blog, Notion pages, etc.) and want to ask questions related to the contents of those documents. LLMs, given their proficiency in understanding text, are a great tool for this. \n", - "\n", - "### [LangChain](https://python.langchain.com/docs/get_started/introduction)\n", - "[**LangChain**](https://python.langchain.com/docs/get_started/introduction) provides a simple framework for connecting LLMs to your own data sources. Since LLMs are both only trained up to a fixed point in time and do not contain knowledge that is proprietary to an Enterprise, they can't answer questions about new or proprietary knowledge. LangChain solves this problem." 
- ] - }, - { - "cell_type": "markdown", - "id": "23110dc7-df11-4413-b8d3-9db2f3e1187a", - "metadata": {}, - "source": [ - "![data_connection](./imgs/data_connection_langchain.jpeg)" - ] - }, - { - "cell_type": "markdown", - "id": "e6806b29-cbc1-4dc8-b378-5c6333eb7229", - "metadata": {}, - "source": [ - "### Step 1: Load Documents [*(Retrieval)*](https://python.langchain.com/docs/modules/data_connection/)\n", - "LangChain provides a variety of [document loaders](https://python.langchain.com/docs/integrations/document_loaders) that load various types of documents (HTML, PDF, code) from many different sources and locations (private s3 buckets, public websites).\n", - "\n", - "Document loaders load data from a source as **Documents**. A **Document** is a piece of text (the page_content) and associated metadata. Document loaders provide a ``load`` method for loading data as documents from a configured source. \n", - "\n", - "In this example, we use a LangChain [`UnstructuredFileLoader`](https://python.langchain.com/docs/integrations/document_loaders/unstructured_file) to load a research paper about Llama2 from Meta.\n", - "\n", - "[Here](https://python.langchain.com/docs/integrations/document_loaders) are some of the other document loaders available from LangChain." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2c09ede8-58cc-44d5-b46c-b1060dd578c5", - "metadata": {}, - "outputs": [], - "source": [ - "!wget -O \"llama2_paper.pdf\" -nc --user-agent=\"Mozilla\" https://arxiv.org/pdf/2307.09288.pdf" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "43793ae9-f25f-4e9d-a7a5-13b64f02b115", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.document_loaders import UnstructuredFileLoader\n", - "loader = UnstructuredFileLoader(\"llama2_paper.pdf\")\n", - "data = loader.load()" - ] - }, - { - "cell_type": "markdown", - "id": "21c8568d-c20e-4d79-a828-c2bcd2b6cd20", - "metadata": {}, - "source": [ - "### Step 2: Transform Documents [*(Retrieval)*](https://python.langchain.com/docs/modules/data_connection/)\n", - "Once documents have been loaded, they are often transformed. One method of transformation is known as **chunking**, which breaks down large pieces of text, for example, a long document, into smaller segments. This technique is valuable because it helps [optimize the relevance of the content returned from the vector database](https://www.pinecone.io/learn/chunking-strategies/). \n", - "\n", - "LangChain provides a [variety of document transformers](https://python.langchain.com/docs/integrations/document_transformers/), such as text splitters. In this example, we use a [``SentenceTransformersTokenTextSplitter``](https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.SentenceTransformersTokenTextSplitter.html#langchain.text_splitter.SentenceTransformersTokenTextSplitter). The ``SentenceTransformersTokenTextSplitter`` is a specialized text splitter for use with the sentence-transformer models. The default behaviour is to split the text into chunks that fit the token window of the sentence transformer model that you would like to use. This sentence transformer model is used to generate the embeddings from documents. \n", - "\n", - "There are some nuanced complexities to text splitting since semantically related text, in theory, should be kept together. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5fea998e-c776-4763-b811-b42b6372fcb9", - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "from langchain.text_splitter import SentenceTransformersTokenTextSplitter\n", - "TEXT_SPLITTER_MODEL = \"intfloat/e5-large-v2\"\n", - "TEXT_SPLITTER_CHUNCK_SIZE = 510\n", - "TEXT_SPLITTER_CHUNCK_OVERLAP = 200\n", - "\n", - "text_splitter = SentenceTransformersTokenTextSplitter(\n", - " model_name=TEXT_SPLITTER_MODEL,\n", - " chunk_size=TEXT_SPLITTER_CHUNCK_SIZE,\n", - " chunk_overlap=TEXT_SPLITTER_CHUNCK_OVERLAP,\n", - ")\n", - "start_time = time.time()\n", - "documents = text_splitter.split_documents(data)\n", - "print(f\"--- {time.time() - start_time} seconds ---\")" - ] - }, - { - "cell_type": "markdown", - "id": "c72a07ff-119a-468c-ab68-f8f6e9f7c6fc", - "metadata": {}, - "source": [ - "### Step 3: Generate Embeddings and Store Embeddings in the Vector Store [*(Retrieval)*](https://python.langchain.com/docs/modules/data_connection/)\n", - "\n", - "#### a) Generate Embeddings\n", - "[Embeddings](https://python.langchain.com/docs/modules/data_connection/text_embedding/) for documents are created by vectorizing the document text; this vectorization captures the semantic meaning of the text. This allows you to quickly and efficiently find other pieces of text that are similar. The embedding model used below is [intfloat/e5-large-v2](https://huggingface.co/intfloat/e5-large-v2).\n", - "\n", - "LangChain provides a wide variety of [embedding models](https://python.langchain.com/docs/integrations/text_embedding) from many providers and makes it simple to swap out the models. \n", - "\n", - "When a user sends in their query, the query is also embedded using the same embedding model that was used to embed the documents. As explained earlier, this allows to find similar (relevant) documents to the user's query. \n", - "\n", - "#### b) Store Document Embeddings in the Vector Store\n", - "Once the document embeddings are generated, they are stored in a vector store so that at query time we can:\n", - "1) Embed the user query and\n", - "2) Retrieve the embedding vectors that are most similar to the embedding query.\n", - "\n", - "A vector store takes care of storing the embedded data and performing a vector search.\n", - "\n", - "LangChain provides support for a [great selection of vector stores](https://python.langchain.com/docs/integrations/vectorstores/). \n", - "\n", - "
\n", - " \n", - "⚠️ For this workflow, [Milvus](https://milvus.io/) vector database is running as a microservice. \n", - "\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3ee3be33-5f78-4937-9e07-a7cc93d5d7df", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.embeddings import HuggingFaceEmbeddings\n", - "from langchain.vectorstores import Milvus\n", - "import torch\n", - "import time\n", - "\n", - "#Running the model on CPU as we want to conserve gpu memory.\n", - "#In the production deployment (API server shown as part of the 5th notebook we run the model on GPU)\n", - "model_name = \"intfloat/e5-large-v2\"\n", - "model_kwargs = {\"device\": \"cuda:0\"}\n", - "encode_kwargs = {\"normalize_embeddings\": False}\n", - "hf_embeddings = HuggingFaceEmbeddings(\n", - " model_name=model_name,\n", - " model_kwargs=model_kwargs,\n", - " encode_kwargs=encode_kwargs,\n", - ")\n", - "start_time = time.time()\n", - "vectorstore = Milvus.from_documents(documents=documents, embedding=hf_embeddings, connection_args={\"host\": \"milvus\", \"port\": \"19530\"})\n", - "print(f\"--- {time.time() - start_time} seconds ---\")" - ] - }, - { - "cell_type": "markdown", - "id": "bc71d0c9-aeab-4a26-959f-5493ca1f2f02", - "metadata": {}, - "source": [ - "### Step 4: Sign up to NVIDIA AI Playground \n", - "\n", - "**NVIDIA AI Playground** on NGC allows developers to experience state of the art LLMs accelerated on NVIDIA DGX Cloud with NVIDIA TensorRT nd Triton Inference Server. Developers get **free credits for 10K requests** to any of the available models. Sign up process is easy. Follow the instructions here and replace the API key below. For this example we will be using the llama2 13B model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8cddceb0-1f58-48c4-aba4-cc1962ed8806", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from nv_aiplay import GeneralLLM\n", - "from langchain.callbacks import streaming_stdout\n", - "\n", - "os.environ['NVAPI_KEY'] = \"REPLACE_WITH_API_KEY\"\n", - "\n", - "callbacks = [streaming_stdout.StreamingStdOutCallbackHandler()]\n", - "\n", - "llm = GeneralLLM(\n", - " temperature=0.2,\n", - " max_tokens=300,\n", - " streaming=True,\n", - " callbacks = callbacks\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "8f25e8af-9aff-460a-a214-87a3e285469d", - "metadata": {}, - "source": [ - "### Step 5: Ask a question without context\n", - "\n", - "Send request to the llm without any context from the vector DB. The answer is generic and irrelvant." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3fced303-d825-4797-9e4a-9a8c6e5f82aa", - "metadata": {}, - "outputs": [], - "source": [ - "question = \"Can you talk about the safety features of llama2 chat?\"\n", - "answer = llm(question)" - ] - }, - { - "cell_type": "markdown", - "id": "225fc935-4bc9-4957-a476-6d23d147e664", - "metadata": {}, - "source": [ - "### Step 6: Compose a streamed answer using a Chain\n", - "We have already integrated the Llama2 TRT LLM into LangChain with a custom wrapper, loaded and transformed documents, and generated and stored document embeddings in a vector database. To finish the pipeline, we need to add a few more LangChain components and combine all the components together with a [chain](https://python.langchain.com/docs/modules/chains/).\n", - "\n", - "A [LangChain chain](https://python.langchain.com/docs/modules/chains/) combines components together. 
In this case, we use a [RetrievalQA chain](https://js.langchain.com/docs/modules/chains/popular/vector_db_qa/), which is a chain type for question-answering against a vector index. It combines a *Retriever* and a *question answering (QA) chain*.\n", - "\n", - "We pass it 3 of our LangChain components:\n", - "- Our instance of the LLM (from step 1).\n", - "- A [retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/), which is an interface that returns documents given an unstructured query. In this case, we use our vector store as the retriever.\n", - "- Our prompt template constructed from the prompt format for Llama2 (from step 2)\n", - "\n", - "```\n", - "qa_chain = RetrievalQA.from_chain_type(\n", - " llm,\n", - " retriever=vectorstore.as_retriever(),\n", - " chain_type_kwargs={\"prompt\": LLAMA_PROMPT}\n", - ")\n", - "```\n", - "\n", - "Lastly, we pass a user query to the chain and stream the result. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "65b7bac7-fda6-4584-b469-43f9eba75468", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.chains import RetrievalQA\n", - "\n", - "qa_chain = RetrievalQA.from_chain_type(\n", - " llm,\n", - " retriever=vectorstore.as_retriever()\n", - ")\n", - "result = qa_chain({\"query\": question})" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/imgs/chrome_flags_fix_media_device_access_error.png b/notebooks/imgs/chrome_flags_fix_media_device_access_error.png new file mode 100755 index 00000000..a7cc6f51 Binary files /dev/null and b/notebooks/imgs/chrome_flags_fix_media_device_access_error.png differ diff --git a/notebooks/imgs/grace_answer.png b/notebooks/imgs/grace_answer.png new file mode 100644 index 00000000..76afc8c0 Binary files /dev/null and b/notebooks/imgs/grace_answer.png differ diff --git a/notebooks/imgs/grace_answer_with_riva.png b/notebooks/imgs/grace_answer_with_riva.png new file mode 100755 index 00000000..458555bf Binary files /dev/null and b/notebooks/imgs/grace_answer_with_riva.png differ diff --git a/notebooks/imgs/grace_noanswer.png b/notebooks/imgs/grace_noanswer.png new file mode 100644 index 00000000..f957762c Binary files /dev/null and b/notebooks/imgs/grace_noanswer.png differ diff --git a/notebooks/imgs/grace_noanswer_with_riva.png b/notebooks/imgs/grace_noanswer_with_riva.png new file mode 100755 index 00000000..5a1e2548 Binary files /dev/null and b/notebooks/imgs/grace_noanswer_with_riva.png differ diff --git a/notebooks/imgs/media_device_access_error.png b/notebooks/imgs/media_device_access_error.png new file mode 100755 index 00000000..d7427477 Binary files /dev/null and b/notebooks/imgs/media_device_access_error.png differ diff --git a/notebooks/requirements.txt b/notebooks/requirements.txt index 6e68824f..b430e057 100644 --- a/notebooks/requirements.txt +++ b/notebooks/requirements.txt @@ -1,13 +1,16 @@ fastapi==0.104.1 uvicorn[standard]==0.24.0 python-multipart==0.0.6 -langchain==0.0.330 -tritonclient[all]==2.39.0 +langchain==0.0.352 unstructured[all-docs]==0.11.2 sentence-transformers==2.2.2 -llama-index==0.9.13 +llama-index==0.9.22 
dataclass-wizard==0.22.2 opencv-python==4.8.0.74 llama-hub==0.0.43 pymilvus==2.3.1 -jupyterlab==4.0.8 \ No newline at end of file +jupyterlab==4.0.8 +langchain-nvidia-trt==0.0.1rc0 +langchain-core==0.1.3 +langchain-nvidia-ai-endpoints==0.0.1 +atlassian-python-api==3.41.4 \ No newline at end of file diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 00000000..e42268fe --- /dev/null +++ b/tools/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/evaluation/01_synthetic_data_generation.ipynb b/tools/evaluation/01_synthetic_data_generation.ipynb similarity index 96% rename from evaluation/01_synthetic_data_generation.ipynb rename to tools/evaluation/01_synthetic_data_generation.ipynb index abbc4eb8..262e8e79 100644 --- a/evaluation/01_synthetic_data_generation.ipynb +++ b/tools/evaluation/01_synthetic_data_generation.ipynb @@ -207,7 +207,7 @@ "source": [ "#### a) AI Playground LLM generator\n", "\n", - "**NVIDIA AI Playground** on NGC allows developers to experience state of the art LLMs accelerated on NVIDIA DGX Cloud with NVIDIA TensorRT nd Triton Inference Server. Developers get **free credits for 10K requests** to any of the available models. Sign up process is easy. follow the steps here. \n", + "**NVIDIA AI Playground** on NGC allows developers to experience state of the art LLMs accelerated on NVIDIA DGX Cloud with NVIDIA TensorRT and Triton Inference Server. Developers get **free credits for 10K requests** to any of the available models. The sign-up process is easy; follow the steps here. \n", "\n", "We are going to use the AI Playground's `llama2-70B` LLM to generate the Question-Answer pairs."
] @@ -219,8 +219,8 @@ "metadata": {}, "outputs": [], "source": [ - "# import the relevant libraries\n", - "from nv_aiplay import GeneralLLM" + "# import the relevant libraries from langchain\n", + "from langchain_nvidia_ai_endpoints import ChatNVIDIA" ] }, { @@ -239,9 +239,9 @@ "outputs": [], "source": [ "import os\n", - "os.environ['NVAPI_KEY'] = \"nvapi-*\"\n", + "os.environ['NVIDIA_API_KEY'] = \"nvapi-*\"\n", "\n", - "llm = GeneralLLM(\n", + "llm = ChatNVIDIA(\n", " model=\"llama2_70b\",\n", " temperature=0.2,\n", " max_tokens=300\n", @@ -256,7 +256,7 @@ "outputs": [], "source": [ "# check the output\n", - "answer = llm(context)" + "answer = llm.invoke(context)" ] }, { @@ -317,7 +317,7 @@ "source": [ "# Synthetic Data Post-processing \n", "\n", - "So far, the generated JSON file structure embeds `gt_context`, `document` and the `question`, `answer` pair.\n", + "So far, the generated JSON file structure embeds `gt_context`, `document`, and the `question`/`gt_answer` pair.\n", "\n", "In order to evaluate Retrieval Augmented Generation (RAG) systems, we need to add the RAG results fields (To be populated in the next notebook):\n", " - `contexts`: Retrieved documents by the retriever \n", diff --git a/evaluation/02_filling_RAG_outputs_for_Evaluation.ipynb b/tools/evaluation/02_filling_RAG_outputs_for_Evaluation.ipynb similarity index 88% rename from evaluation/02_filling_RAG_outputs_for_Evaluation.ipynb rename to tools/evaluation/02_filling_RAG_outputs_for_Evaluation.ipynb index 8e05d6cd..b82497b8 100644 --- a/evaluation/02_filling_RAG_outputs_for_Evaluation.ipynb +++ b/tools/evaluation/02_filling_RAG_outputs_for_Evaluation.ipynb @@ -28,7 +28,7 @@ "#### Define the LLM\n", "Here we are using a local llm on triton and the address and gRPC port that the Triton is available on. \n", "\n", - "***If you are using AI Playground (no local GPU) replace, the code in the cell below with the following: ***\n", + "***If you are using AI Playground (no local GPU), replace the code two cells below with the following:***\n", "\n", "```\n", "import os\n", @@ -43,6 +43,17 @@ "```" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a18dfc7b", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!test -d dataset || unzip dataset.zip" + ] + }, { "cell_type": "code", "execution_count": null, @@ -125,12 +136,12 @@ "\n", "# setup the text splitter\n", "TEXT_SPLITTER_MODEL = \"intfloat/e5-large-v2\"\n", - "TEXT_SPLITTER_CHUNCK_SIZE = 510\n", + "TEXT_SPLITTER_TOKENS_PER_CHUNK = 510\n", "TEXT_SPLITTER_CHUNCK_OVERLAP = 200\n", "\n", "text_splitter = SentenceTransformersTokenTextSplitter(\n", " model_name=TEXT_SPLITTER_MODEL,\n", - " chunk_size=TEXT_SPLITTER_CHUNCK_SIZE,\n", + " tokens_per_chunk=TEXT_SPLITTER_TOKENS_PER_CHUNK,\n", " chunk_overlap=TEXT_SPLITTER_CHUNCK_OVERLAP,\n", ")\n", "\n", @@ -234,6 +245,60 @@ "set_global_service_context(service_context)" ] }, + { + "cell_type": "markdown", + "id": "44e10c13", + "metadata": {}, + "source": [ + "Ingest the dataset using the /uploadDocument endpoint in the chain-server."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acdc51db", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "import mimetypes\n", + "\n", + "def upload_document(file_path, url):\n", + " headers = {\n", + " 'accept': 'application/json'\n", + " }\n", + " mime_type, _ = mimetypes.guess_type(file_path)\n", + " files = {\n", + " 'file': (file_path, open(file_path, 'rb'), mime_type)\n", + " }\n", + " response = requests.post(url, headers=headers, files=files)\n", + "\n", + " return response.text\n", + "\n", + "def upload_pdf_files(folder_path, upload_url):\n", + " for files in os.listdir(folder_path):\n", + " _, ext = os.path.splitext(files)\n", + " # Ingest only pdf files\n", + " if ext.lower() == \".pdf\":\n", + " file_path = os.path.join(folder_path, files)\n", + " print(upload_document(file_path, upload_url))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "823c89f9", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "start_time = time.time()\n", + "upload_pdf_files(\"dataset\", \"http://query:8081/uploadDocument\")\n", + "print(f\"--- {time.time() - start_time} seconds ---\")" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -259,7 +324,13 @@ "from llama_index.vector_stores import MilvusVectorStore\n", "\n", "# store\n", - "vector_store = MilvusVectorStore(uri=\"http://milvus:19530\", dim=1024, overwrite=False)\n", + "vector_store = MilvusVectorStore(uri=\"http://milvus:19530\",\n", + " dim=1024,\n", + " collection_name=\"document_store_ivfflat\",\n", + " index_config={\"index_type\": \"IVF_FLAT\", \"nlist\": 64},\n", + " search_config={\"nprobe\": 16},\n", + " overwrite=False\n", + ")\n", "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", "index = VectorStoreIndex.from_vector_store(vector_store)" ] diff --git a/evaluation/03_eval_ragas.ipynb b/tools/evaluation/03_eval_ragas.ipynb similarity index 100% rename from evaluation/03_eval_ragas.ipynb rename to tools/evaluation/03_eval_ragas.ipynb diff --git a/evaluation/04_Human_Like_RAG_Evaluation-AIP.ipynb b/tools/evaluation/04_Human_Like_RAG_Evaluation-AIP.ipynb similarity index 97% rename from evaluation/04_Human_Like_RAG_Evaluation-AIP.ipynb rename to tools/evaluation/04_Human_Like_RAG_Evaluation-AIP.ipynb index c28e30eb..fb310d08 100644 --- a/evaluation/04_Human_Like_RAG_Evaluation-AIP.ipynb +++ b/tools/evaluation/04_Human_Like_RAG_Evaluation-AIP.ipynb @@ -92,8 +92,9 @@ "invoke_url = \"https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/0e349b44-440a-44e1-93e9-abe8dcb27158\" #Llama 2 70B\n", "fetch_url_format = \"https://api.nvcf.nvidia.com/v2/nvcf/pexec/status/\"\n", "\n", + "# do not remove Bearer from Authorization, replace with api key\n", "headers = {\n", - " \"Authorization\": \"Bearer REPLACE_THIS_WITH_API_KEY\",\n", + " \"Authorization\": \"Bearer \",\n", " \"Accept\": \"application/json\",\n", "}\n" ] @@ -221,7 +222,7 @@ " response_body = response.json()\n", " llama_judge_responses.append(response_body['choices'][0]['message']['content'])\n", " except Exception as e:\n", - " print(\"pass\")\n", + " print(\"Exception:\", e)\n", " llama_judge_responses.append(None)\n" ] }, @@ -256,11 +257,11 @@ "\n", " # Extract and print the rating and explanation\n", " llama_ratings.append(int(rating_match.group(1)) if rating_match else None)\n", - " llama_explanations.append(explanation_match.group(1) if explanation_match else None)\n", + " llama_explanations.append(explanation_match.group(1) if 
explanation_match else response)\n", " except Exception as e:\n", - " print(\"pass\")\n", + " print(\"Exception\", e)\n", " llama_ratings.append(None)\n", - " llama_explanations.append(None)\n" + " llama_explanations.append(response)\n" ] }, { diff --git a/evaluation/Dockerfile.eval b/tools/evaluation/Dockerfile.eval similarity index 78% rename from evaluation/Dockerfile.eval rename to tools/evaluation/Dockerfile.eval index e76734f3..d0fc923a 100644 --- a/evaluation/Dockerfile.eval +++ b/tools/evaluation/Dockerfile.eval @@ -4,21 +4,21 @@ FROM python:3.10-slim WORKDIR /app #COPY notebooks -COPY ./evaluation/*.ipynb . +COPY ./tools/evaluation/*.ipynb . RUN mkdir -p /app/imgs COPY ./notebooks/dataset.zip . -COPY ./evaluation/imgs/* imgs/ +COPY ./tools/evaluation/imgs/* imgs/ COPY ./integrations/langchain/llms/triton_trt_llm.py . COPY ./integrations/langchain/llms/nv_aiplay.py . -COPY ./evaluation/requirements.txt . +COPY ./tools/evaluation/requirements.txt . -COPY ./evaluation/qa_generation.json . +COPY ./tools/evaluation/qa_generation.json . # Run pip dependencies RUN pip3 install -r requirements.txt diff --git a/evaluation/imgs/ragas.png b/tools/evaluation/imgs/ragas.png similarity index 100% rename from evaluation/imgs/ragas.png rename to tools/evaluation/imgs/ragas.png diff --git a/evaluation/imgs/synthetic_data_pipeline.png b/tools/evaluation/imgs/synthetic_data_pipeline.png similarity index 100% rename from evaluation/imgs/synthetic_data_pipeline.png rename to tools/evaluation/imgs/synthetic_data_pipeline.png diff --git a/evaluation/qa_generation.json b/tools/evaluation/qa_generation.json similarity index 100% rename from evaluation/qa_generation.json rename to tools/evaluation/qa_generation.json diff --git a/evaluation/requirements.txt b/tools/evaluation/requirements.txt similarity index 54% rename from evaluation/requirements.txt rename to tools/evaluation/requirements.txt index 001a80ca..7c471e45 100644 --- a/evaluation/requirements.txt +++ b/tools/evaluation/requirements.txt @@ -1,5 +1,4 @@ -langchain==0.0.348 -tritonclient[all]==2.39.0 +langchain==0.0.352 unstructured[all-docs]==0.11.2 sentence-transformers==2.2.2 llama-index==0.9.13 @@ -8,3 +7,7 @@ pymilvus==2.3.1 jupyterlab==4.0.8 ragas==0.0.21 seaborn==0.13.0 +langchain-core==0.1.3 +langchain-nvidia-ai-endpoints==0.0.1 +langchain-nvidia-trt==0.0.1rc0 +atlassian-python-api==3.41.4 diff --git a/tools/observability/__init__.py b/tools/observability/__init__.py new file mode 100644 index 00000000..e42268fe --- /dev/null +++ b/tools/observability/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
\ No newline at end of file diff --git a/tools/observability/llamaindex/__init__.py b/tools/observability/llamaindex/__init__.py new file mode 100644 index 00000000..e42268fe --- /dev/null +++ b/tools/observability/llamaindex/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/tools/observability/llamaindex/opentelemetry_callback.py b/tools/observability/llamaindex/opentelemetry_callback.py new file mode 100644 index 00000000..48d89e85 --- /dev/null +++ b/tools/observability/llamaindex/opentelemetry_callback.py @@ -0,0 +1,198 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from opentelemetry.trace import Tracer, get_tracer, set_span_in_context, Status, StatusCode +from opentelemetry.trace.span import Span +from opentelemetry.context import Context, get_current, attach, detach +from typing import Any, Dict, List, Optional, Callable +from llama_index.callbacks.base_handler import BaseCallbackHandler +from llama_index.callbacks.base import CallbackManager +from llama_index.callbacks.schema import CBEventType, EventPayload, BASE_TRACE_EVENT +from llama_index.callbacks.token_counting import get_llm_token_counts, TokenCountingEvent +from llama_index.utilities.token_counting import TokenCounter +from llama_index.utils import get_tokenizer +from dataclasses import dataclass +from contextvars import ContextVar +import threading + +global_root_trace = ContextVar("trace", default=None) + +@dataclass +class SpanWithContext: + """Object for tracking a span, its context, and its context token""" + span: Span + context: Context + token: object + + def __init__(self, span: Span, context: Context, token: object, thread_identity): + self.span = span + self.context = context + self.token = token + self.thread_identity = thread_identity + +class OpenTelemetryCallbackHandler(BaseCallbackHandler): + """Callback handler for creating OpenTelemetry traces from llamaindex traces and events.""" + + def __init__( + self, + tracer: Optional[Tracer] = get_tracer(__name__), + tokenizer: Optional[Callable[[str], List]] = None, + ) -> None: + """Initializes the OpenTelemetryCallbackHandler. 
+ + Args: + tracer: Optional[Tracer]: A OpenTelemetry tracer used to create OpenTelemetry spans + """ + super().__init__(event_starts_to_ignore=[], event_ends_to_ignore=[]) + self._tracer = tracer + self._event_map: Dict[str, SpanWithContext] = {} + self.tokenizer = tokenizer or get_tokenizer() + self._token_counter = TokenCounter(tokenizer=self.tokenizer) + + def start_trace(self, trace_id: Optional[str] = None) -> None: + trace_name = "llamaindex.trace" + if trace_id is not None: + trace_name = "llamaindex.trace." + trace_id + span = self._tracer.start_span(trace_name) + ctx = set_span_in_context(span) + token = attach(ctx) + global_root_trace.set(SpanWithContext(span=span, context=ctx, token=token, thread_identity=threading.get_ident())) + + def end_trace( + self, + trace_id: Optional[str] = None, + trace_map: Optional[Dict[str, List[str]]] = None, + ) -> None: + root_trace = global_root_trace.get() + if root_trace is not None: + if root_trace.thread_identity == threading.get_ident(): + detach(root_trace.token) + root_trace.span.end() + + def on_event_start( + self, + event_type: CBEventType, + payload: Optional[Dict[str, Any]] = None, + event_id: str = "", + parent_id: str = "", + **kwargs: Any, + ) -> str: + parent_ctx = None + # Case where the parent of this event is another event + if parent_id in self._event_map: + parent_ctx = self._event_map[parent_id].context + # Case where the parent of this event is the root trace, and the root trace exists + elif parent_id is BASE_TRACE_EVENT and global_root_trace.get() is not None: + parent_ctx = global_root_trace.get().context + # Case where the parent of this event is the root trace, but the trace does not exist + else: + return + + span_prefix = "llamaindex.event." + span = self._tracer.start_span(span_prefix + event_type.value, context=parent_ctx) + ctx = set_span_in_context(span) + token = attach(ctx) + self._event_map[event_id] = SpanWithContext(span=span, context=ctx, token=token, thread_identity=threading.get_ident()) + + span.set_attribute("event_id", event_id) + if payload is not None: + if event_type is CBEventType.QUERY: + span.set_attribute("query.text", payload[EventPayload.QUERY_STR]) + elif event_type is CBEventType.RETRIEVE: + pass + elif event_type is CBEventType.EMBEDDING: + span.set_attribute("embedding.model", payload[EventPayload.SERIALIZED]['model_name']) + span.set_attribute("embedding.batch_size", payload[EventPayload.SERIALIZED]['embed_batch_size']) + span.set_attribute("embedding.class_name", payload[EventPayload.SERIALIZED]['class_name']) + elif event_type is CBEventType.SYNTHESIZE: + span.set_attribute("synthesize.query_text", payload[EventPayload.QUERY_STR]) + elif event_type is CBEventType.CHUNKING: + for i, chunk in enumerate(payload[EventPayload.CHUNKS]): + span.set_attribute(f"chunk.{i}", chunk) + elif event_type is CBEventType.TEMPLATING: + if payload[EventPayload.QUERY_WRAPPER_PROMPT]: + span.set_attribute("query_wrapper_prompt", payload[EventPayload.QUERY_WRAPPER_PROMPT]) + if payload[EventPayload.SYSTEM_PROMPT]: + span.set_attribute("system_prompt", payload[EventPayload.SYSTEM_PROMPT]) + if payload[EventPayload.TEMPLATE]: + span.set_attribute("template", payload[EventPayload.TEMPLATE]) + if payload[EventPayload.TEMPLATE_VARS]: + for key, var in payload[EventPayload.TEMPLATE_VARS].items(): + span.set_attribute(f"template_variables.{key}", var) + elif event_type is CBEventType.LLM: + span.set_attribute("llm.class_name", payload[EventPayload.SERIALIZED]['class_name']) + 
span.set_attribute("llm.formatted_prompt", payload[EventPayload.PROMPT]) + span.set_attribute("llm.additional_kwargs", str(payload[EventPayload.ADDITIONAL_KWARGS])) + elif event_type is CBEventType.NODE_PARSING: + span.set_attribute("node_parsing.num_documents", len(payload[EventPayload.DOCUMENTS])) + elif event_type is CBEventType.EXCEPTION: + span.set_status(Status(StatusCode.ERROR)) + span.record_exception(payload[EventPayload.EXCEPTION]) + return event_id + + def on_event_end( + self, + event_type: CBEventType, + payload: Optional[Dict[str, Any]] = None, + event_id: str = "", + **kwargs: Any, + ) -> None: + if event_id in self._event_map: + span = self._event_map[event_id].span + span.set_attribute("event_id", event_id) + if payload is not None: + if event_type is CBEventType.QUERY: + pass + elif event_type is CBEventType.RETRIEVE: + for i, node_with_score in enumerate(payload[EventPayload.NODES]): + node = node_with_score.node + score = node_with_score.score + span.set_attribute(f"query.node.{i}.id", node.hash) + span.set_attribute(f"query.node.{i}.score", score) + span.set_attribute(f"query.node.{i}.text", node.text) + elif event_type is CBEventType.EMBEDDING: + texts = payload[EventPayload.CHUNKS] + vectors = payload[EventPayload.EMBEDDINGS] + total_chunk_tokens = 0 + for text, vector in zip(texts, vectors) : + span.set_attribute(f"embedding_text_{texts.index(text)}", text) + span.set_attribute(f"embedding_vector_{vectors.index(vector)}", vector) + total_chunk_tokens +=self._token_counter.get_string_tokens(text) + span.set_attribute(f"embedding_token_usage", total_chunk_tokens) + elif event_type is CBEventType.SYNTHESIZE: + pass + elif event_type is CBEventType.CHUNKING: + pass + elif event_type is CBEventType.TEMPLATING: + pass + elif event_type is CBEventType.LLM: + span.set_attribute("response.text", str( + payload.get(EventPayload.RESPONSE, "") + ) or str(payload.get(EventPayload.COMPLETION, "")) + ) + token_counts = get_llm_token_counts(self._token_counter, payload, event_id) + span.set_attribute("llm_prompt.token_usage", token_counts.prompt_token_count) + span.set_attribute("llm_completion.token_usage", token_counts.completion_token_count) + span.set_attribute("total_tokens_used", token_counts.total_token_count) + elif event_type is CBEventType.NODE_PARSING: + span.set_attribute("node_parsing.num_nodes", len(payload[EventPayload.NODES])) + elif event_type is CBEventType.EXCEPTION: + span.set_status(Status(StatusCode.ERROR)) + span.record_exception(payload[EventPayload.EXCEPTION]) + if self._event_map[event_id].thread_identity == threading.get_ident(): + detach(self._event_map[event_id].token) + self._event_map.pop(event_id, None) + span.end() + \ No newline at end of file