From 3d29acf677466c5c301370cab5867cb09e04e318 Mon Sep 17 00:00:00 2001
From: Shubhadeep Das <149712532+shubhadeepd@users.noreply.github.com>
Date: Mon, 22 Jan 2024 22:05:15 +0530
Subject: [PATCH] Upstream changes for v0.3.0 release (#29)
- Detailed changes mentioned in CHANGELOG.md file
---
.gitignore | 14 +
CHANGELOG.md | 39 +-
README.md | 69 +-
RetrievalAugmentedGeneration/.gitattributes | 1 -
RetrievalAugmentedGeneration/.gitignore | 25 -
RetrievalAugmentedGeneration/Dockerfile | 12 +-
RetrievalAugmentedGeneration/README.md | 725 +++++++++++++++---
RetrievalAugmentedGeneration/common/base.py | 33 +
.../common/configuration.py | 48 +-
RetrievalAugmentedGeneration/common/server.py | 80 +-
.../common/tracing.py | 69 ++
RetrievalAugmentedGeneration/common/utils.py | 126 ++-
.../examples/developer_rag/chains.py | 177 +++--
.../examples/nvidia_ai_foundation/chains.py | 151 ++++
.../nvidia_ai_foundation/requirements.txt | 1 +
.../query_decomposition_rag/__init__.py | 14 +
.../query_decomposition_rag/chains.py | 341 ++++++++
.../query_decomposition_rag/requirements.txt | 1 +
.../frontend/Dockerfile | 4 +-
.../frontend/frontend/asr_utils.py | 231 ++++++
.../frontend/frontend/assets/kaizen-theme.css | 15 +
.../frontend/frontend/chat_client.py | 31 +-
.../frontend/frontend/pages/converse.py | 135 +++-
.../frontend/frontend/pages/kb.py | 23 +-
.../frontend/frontend/tracing.py | 80 ++
.../frontend/frontend/tts_utils.py | 150 ++++
.../frontend/requirements.txt | 6 +-
.../ensemble_models/gptnext | 1 -
.../ensemble_models/gptnext/ensemble/1/.tmp | 0
.../gptnext/ensemble/config.pbtxt | 228 ++++++
.../gptnext/postprocessing/1/model.py | 158 ++++
.../gptnext/postprocessing/config.pbtxt | 50 ++
.../gptnext/preprocessing/1/model.py | 244 ++++++
.../gptnext/preprocessing/config.pbtxt | 65 ++
.../gptnext/tensorrt_llm/1/.gitkeep | 0
.../gptnext/tensorrt_llm/config.pbtxt.j2 | 208 +++++
.../model_server/__init__.py | 2 +
.../model_server/conversion/nemo.py | 12 +-
.../model_server/server.py | 2 +
.../tools/resize_nemo_model.sh | 40 +
RetrievalAugmentedGeneration/requirements.txt | 17 +-
deploy/compose/compose.env | 36 +-
deploy/compose/config.yaml | 26 +-
deploy/compose/configs/jaeger.yaml | 3 +
.../configs/otel-collector-config.yaml | 17 +
deploy/compose/docker-compose-evaluation.yaml | 22 +
...ound.yaml => docker-compose-nemotron.yaml} | 77 +-
.../docker-compose-nv-ai-foundation.yaml | 55 ++
.../compose/docker-compose-observability.yaml | 47 ++
deploy/compose/docker-compose-pgvector.yaml | 111 +++
deploy/compose/docker-compose.yaml | 45 +-
deploy/compose/nemotron_config.yaml | 1 +
.../templates/milvus-minio.yaml | 16 +-
.../templates/milvus-standalone.yaml | 12 +-
.../pkg/helmer/controller/test.yaml | 8 +-
docs/README.md | 35 +-
docs/developer-llm-operator/install.md | 17 +-
docs/developer-llm-operator/uninstall.md | 50 ++
docs/rag/aiplayground.md | 110 +--
docs/rag/architecture.md | 5 +-
docs/rag/chat_server.md | 2 +-
docs/rag/configuration.md | 20 +-
.../README.md => docs/rag/evaluation.md | 18 +-
docs/rag/frontend.md | 2 +-
docs/rag/hf_model_download.md | 59 ++
docs/rag/images/docker-output.png | Bin 343710 -> 0 bytes
docs/rag/images/hf/Slide1.JPG | Bin 0 -> 87536 bytes
docs/rag/images/hf/Slide10.JPG | Bin 0 -> 79054 bytes
docs/rag/images/hf/Slide11.JPG | Bin 0 -> 49832 bytes
docs/rag/images/hf/Slide12.JPG | Bin 0 -> 37311 bytes
docs/rag/images/hf/Slide13.JPG | Bin 0 -> 39497 bytes
docs/rag/images/hf/Slide14.JPG | Bin 0 -> 109891 bytes
docs/rag/images/hf/Slide15.JPG | Bin 0 -> 15027 bytes
docs/rag/images/hf/Slide2.JPG | Bin 0 -> 122757 bytes
docs/rag/images/hf/Slide3.JPG | Bin 0 -> 138028 bytes
docs/rag/images/hf/Slide4.JPG | Bin 0 -> 118758 bytes
docs/rag/images/hf/Slide5.JPG | Bin 0 -> 49692 bytes
docs/rag/images/hf/Slide6.JPG | Bin 0 -> 124868 bytes
docs/rag/images/hf/Slide7.JPG | Bin 0 -> 80341 bytes
docs/rag/images/hf/Slide8.JPG | Bin 0 -> 69000 bytes
docs/rag/images/hf/Slide9.JPG | Bin 0 -> 91604 bytes
docs/rag/images/hf/download.png | Bin 0 -> 39482 bytes
docs/rag/images/image10.png | Bin 0 -> 61357 bytes
docs/rag/images/image11.png | Bin 0 -> 53104 bytes
docs/rag/images/image12.png | Bin 0 -> 59367 bytes
docs/rag/images/image7.png | Bin 0 -> 78669 bytes
docs/rag/images/image8.png | Bin 0 -> 152618 bytes
docs/rag/images/image9.png | Bin 0 -> 72726 bytes
docs/rag/jupyter_server.md | 2 +-
docs/rag/llm_inference_server.md | 50 +-
docs/rag/observability.md | 107 +++
docs/rag/support_matrix.md | 4 +
examples/5_mins_rag_no_gpu/main.py | 144 ++++
examples/5_mins_rag_no_gpu/requirements.txt | 5 +
examples/README.md | 37 +
.../02.5_langchain_simple_AzureML.ipynb | 370 +++++++++
experimental/AzureML/README.md | 41 +
.../AzureML/images/azureml-github.gif | Bin 0 -> 1540321 bytes
.../AzureML/images/connection-info.png | Bin 0 -> 140864 bytes
experimental/AzureML/trt_llm_azureml.py | 362 +++++++++
notebooks/00-llm-non-streaming-nemotron.ipynb | 124 +++
notebooks/01-llm-streaming-client.ipynb | 51 +-
notebooks/02_langchain_simple.ipynb | 116 +--
notebooks/03_llama_index_simple.ipynb | 6 +-
.../04_llamaindex_hier_node_parser.ipynb | 5 +-
notebooks/06_AI_playground.ipynb | 276 -------
...me_flags_fix_media_device_access_error.png | Bin 0 -> 116826 bytes
notebooks/imgs/grace_answer.png | Bin 0 -> 74659 bytes
notebooks/imgs/grace_answer_with_riva.png | Bin 0 -> 102455 bytes
notebooks/imgs/grace_noanswer.png | Bin 0 -> 81122 bytes
notebooks/imgs/grace_noanswer_with_riva.png | Bin 0 -> 113831 bytes
notebooks/imgs/media_device_access_error.png | Bin 0 -> 106337 bytes
notebooks/requirements.txt | 11 +-
tools/__init__.py | 14 +
.../01_synthetic_data_generation.ipynb | 14 +-
...2_filling_RAG_outputs_for_Evaluation.ipynb | 79 +-
.../evaluation}/03_eval_ragas.ipynb | 0
.../04_Human_Like_RAG_Evaluation-AIP.ipynb | 11 +-
.../evaluation}/Dockerfile.eval | 8 +-
.../evaluation}/imgs/ragas.png | Bin
.../imgs/synthetic_data_pipeline.png | Bin
.../evaluation}/qa_generation.json | 0
.../evaluation}/requirements.txt | 7 +-
tools/observability/__init__.py | 14 +
tools/observability/llamaindex/__init__.py | 14 +
.../llamaindex/opentelemetry_callback.py | 198 +++++
126 files changed, 5459 insertions(+), 1033 deletions(-)
create mode 100644 .gitignore
delete mode 100644 RetrievalAugmentedGeneration/.gitattributes
delete mode 100644 RetrievalAugmentedGeneration/.gitignore
create mode 100644 RetrievalAugmentedGeneration/common/base.py
create mode 100644 RetrievalAugmentedGeneration/common/tracing.py
create mode 100644 RetrievalAugmentedGeneration/examples/nvidia_ai_foundation/chains.py
create mode 100644 RetrievalAugmentedGeneration/examples/nvidia_ai_foundation/requirements.txt
create mode 100644 RetrievalAugmentedGeneration/examples/query_decomposition_rag/__init__.py
create mode 100644 RetrievalAugmentedGeneration/examples/query_decomposition_rag/chains.py
create mode 100644 RetrievalAugmentedGeneration/examples/query_decomposition_rag/requirements.txt
create mode 100644 RetrievalAugmentedGeneration/frontend/frontend/asr_utils.py
create mode 100644 RetrievalAugmentedGeneration/frontend/frontend/tracing.py
create mode 100644 RetrievalAugmentedGeneration/frontend/frontend/tts_utils.py
delete mode 120000 RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext
create mode 100644 RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/ensemble/1/.tmp
create mode 100755 RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/ensemble/config.pbtxt
create mode 100755 RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/postprocessing/1/model.py
create mode 100755 RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/postprocessing/config.pbtxt
create mode 100644 RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/preprocessing/1/model.py
create mode 100644 RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/preprocessing/config.pbtxt
create mode 100644 RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/tensorrt_llm/1/.gitkeep
create mode 100644 RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/tensorrt_llm/config.pbtxt.j2
create mode 100755 RetrievalAugmentedGeneration/llm-inference-server/tools/resize_nemo_model.sh
create mode 100644 deploy/compose/configs/jaeger.yaml
create mode 100644 deploy/compose/configs/otel-collector-config.yaml
create mode 100644 deploy/compose/docker-compose-evaluation.yaml
rename deploy/compose/{docker-compose-playground.yaml => docker-compose-nemotron.yaml} (65%)
create mode 100644 deploy/compose/docker-compose-nv-ai-foundation.yaml
create mode 100644 deploy/compose/docker-compose-observability.yaml
create mode 100644 deploy/compose/docker-compose-pgvector.yaml
create mode 100644 docs/developer-llm-operator/uninstall.md
rename evaluation/README.md => docs/rag/evaluation.md (58%)
create mode 100644 docs/rag/hf_model_download.md
delete mode 100644 docs/rag/images/docker-output.png
create mode 100644 docs/rag/images/hf/Slide1.JPG
create mode 100644 docs/rag/images/hf/Slide10.JPG
create mode 100644 docs/rag/images/hf/Slide11.JPG
create mode 100644 docs/rag/images/hf/Slide12.JPG
create mode 100644 docs/rag/images/hf/Slide13.JPG
create mode 100644 docs/rag/images/hf/Slide14.JPG
create mode 100644 docs/rag/images/hf/Slide15.JPG
create mode 100644 docs/rag/images/hf/Slide2.JPG
create mode 100644 docs/rag/images/hf/Slide3.JPG
create mode 100644 docs/rag/images/hf/Slide4.JPG
create mode 100644 docs/rag/images/hf/Slide5.JPG
create mode 100644 docs/rag/images/hf/Slide6.JPG
create mode 100644 docs/rag/images/hf/Slide7.JPG
create mode 100644 docs/rag/images/hf/Slide8.JPG
create mode 100644 docs/rag/images/hf/Slide9.JPG
create mode 100644 docs/rag/images/hf/download.png
create mode 100644 docs/rag/images/image10.png
create mode 100644 docs/rag/images/image11.png
create mode 100644 docs/rag/images/image12.png
create mode 100644 docs/rag/images/image7.png
create mode 100644 docs/rag/images/image8.png
create mode 100644 docs/rag/images/image9.png
create mode 100644 docs/rag/observability.md
create mode 100644 examples/5_mins_rag_no_gpu/main.py
create mode 100644 examples/5_mins_rag_no_gpu/requirements.txt
create mode 100644 examples/README.md
create mode 100644 experimental/AzureML/02.5_langchain_simple_AzureML.ipynb
create mode 100644 experimental/AzureML/README.md
create mode 100644 experimental/AzureML/images/azureml-github.gif
create mode 100644 experimental/AzureML/images/connection-info.png
create mode 100644 experimental/AzureML/trt_llm_azureml.py
create mode 100644 notebooks/00-llm-non-streaming-nemotron.ipynb
delete mode 100644 notebooks/06_AI_playground.ipynb
create mode 100755 notebooks/imgs/chrome_flags_fix_media_device_access_error.png
create mode 100644 notebooks/imgs/grace_answer.png
create mode 100755 notebooks/imgs/grace_answer_with_riva.png
create mode 100644 notebooks/imgs/grace_noanswer.png
create mode 100755 notebooks/imgs/grace_noanswer_with_riva.png
create mode 100755 notebooks/imgs/media_device_access_error.png
create mode 100644 tools/__init__.py
rename {evaluation => tools/evaluation}/01_synthetic_data_generation.ipynb (96%)
rename {evaluation => tools/evaluation}/02_filling_RAG_outputs_for_Evaluation.ipynb (88%)
rename {evaluation => tools/evaluation}/03_eval_ragas.ipynb (100%)
rename {evaluation => tools/evaluation}/04_Human_Like_RAG_Evaluation-AIP.ipynb (97%)
rename {evaluation => tools/evaluation}/Dockerfile.eval (78%)
rename {evaluation => tools/evaluation}/imgs/ragas.png (100%)
rename {evaluation => tools/evaluation}/imgs/synthetic_data_pipeline.png (100%)
rename {evaluation => tools/evaluation}/qa_generation.json (100%)
rename {evaluation => tools/evaluation}/requirements.txt (54%)
create mode 100644 tools/observability/__init__.py
create mode 100644 tools/observability/llamaindex/__init__.py
create mode 100644 tools/observability/llamaindex/opentelemetry_callback.py
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..9e2aa624
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,14 @@
+# Python Exclusions
+.venv
+**__pycache__**
+
+# Helm Exclusions
+**/charts/*.tgz
+
+# project temp files
+deploy/*.log
+deploy/*.txt
+
+# Docker Compose exclusions
+volumes/
+uploaded_files/
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index fbabf83e..af30477c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,25 +3,52 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [0.3.0] - 2024-01-22
+
+### Added
+
+- [New dedicated example](./docs/rag/aiplayground.md) showcasing Nvidia AI Playground based models using Langchain connectors.
+- [New example](./RetrievalAugmentedGeneration/README.md#5-qa-chatbot-with-task-decomposition-example----a100h100l40s) demonstrating query decomposition.
+- Support for using [PG Vector as a vector database in the developer rag canonical example.](./RetrievalAugmentedGeneration/README.md#deploying-with-pgvector-vector-store)
+- Support for a speech-in, speech-out interface in the sample frontend leveraging RIVA Skills.
+- New tool showcasing [RAG observability support.](./tools/observability/)
+- Support for on-prem deployment of [TRTLLM based nemotron models.](./RetrievalAugmentedGeneration/README.md#6-qa-chatbot----nemotron-model)
+
+### Changed
+
+- Upgraded Langchain and llamaindex dependencies for all containers.
+- Restructured [README](./README.md) files for improved clarity.
+- Added provision to plug in multiple examples using [a common base class](./RetrievalAugmentedGeneration/common/base.py).
+- Changed `minio` service's port to `9010` from `9000` in docker based deployment.
+- Moved `evaluation` directory from top level to under `tools` and created a [dedicated compose file](./deploy/compose/docker-compose-evaluation.yaml).
+- Added an [experimental directory](./experimental/) for plugging in experimental features.
+- Modified notebooks to use TRTLLM and Nvidia AI foundation based connectors from langchain.
+- Changed `ai-playground` model engine name to `nv-ai-foundation` in configurations.
+
+### Fixed
+
+- [Fixed issue #19](https://github.com/NVIDIA/GenerativeAIExamples/issues/19)
+
## [0.2.0] - 2023-12-15
### Added
-- Support for using [Nvidia AI Foundational LLM models](./docs/rag/aiplayground.md#using-nvdia-cloud-based-llms)
-- Support for using [Nvidia AI Foundational embedding models](./docs/rag/aiplayground.md#using-nvidia-cloud-based-embedding-models)
+- Support for using [Nvidia AI Playground based LLM models](./docs/rag/aiplayground.md)
+- Support for using [Nvidia AI Playground based embedding models](./docs/rag/aiplayground.md)
- Support for [deploying and using quantized LLM models](./docs/rag/llm_inference_server.md#quantized-llama2-model-deployment)
-- Support for [evaluating RAG pipeline](./evaluation/README.md)
+- Support for Kubernetes deployment using Helm charts
+- Support for [evaluating RAG pipeline](./tools/evaluation/README.md)
### Changed
- Repository restructing to allow better open source contributions
- [Upgraded dependencies](./RetrievalAugmentedGeneration/Dockerfile) for chain server container
-- [Upgraded NeMo Inference Framework container version](./RetrievalAugmentedGeneration/llm-inference-server/Dockerfile), no seperate sign up needed now for access.
+- [Upgraded NeMo Inference Framework container version](./RetrievalAugmentedGeneration/llm-inference-server/Dockerfile), no separate sign-up needed for access.
- Main [README](./README.md) now provides more details.
- Documentation improvements.
-- Better error handling and reporting mechanism for corner cases.
-- Renamed `triton-inference-server` container and service to `llm-inference-server`
+- Better error handling and reporting mechanism for corner cases
+- Renamed `triton-inference-server` container to `llm-inference-server`
### Fixed
diff --git a/README.md b/README.md
index 06c977c0..f9056109 100644
--- a/README.md
+++ b/README.md
@@ -8,40 +8,67 @@ Generative AI Examples uses resources from the [NVIDIA NGC AI Development Catalo
Sign up for a [free NGC developer account](https://ngc.nvidia.com/signin) to access:
-- The GPU-optimized NVIDIA containers, models, scripts, and tools used in these examples
-- The latest NVIDIA upstream contributions to the respective programming frameworks
-- The latest NVIDIA Deep Learning and LLM software libraries
-- Release notes for each of the NVIDIA optimized containers
-- Links to developer documentation
+- GPU-optimized containers used in these examples
+- Release notes and developer documentation
## Retrieval Augmented Generation (RAG)
-A RAG pipeline embeds multimodal data -- such as documents, images, and video -- into a database connected to a Large Language Model. RAG lets users use an LLM to chat with their own data.
+A RAG pipeline embeds multimodal data -- such as documents, images, and video -- into a database connected to an LLM. RAG lets users chat with their data!
-| Name | Description | LLM | Framework | Multi-GPU | Multi-node | Embedding | TRT-LLM | Triton | VectorDB | K8s |
-|---------------|-----------------------|------------|-------------------------|-----------|------------|-------------|---------|--------|----------|-----|
-| [Linux developer RAG](https://github.com/NVIDIA/GenerativeAIExamples/tree/main/RetrievalAugmentedGeneration) | Single VM, single GPU | llama2-13b | Langchain + Llama Index | No | No | e5-large-v2 | Yes | Yes | Milvus | No |
-| [Windows developer RAG](https://github.com/NVIDIA/trt-llm-rag-windows) | RAG on Windows | llama2-13b | Llama Index | No | No | NA | Yes | No | FAISS | NA |
-| [Developer LLM Operator for Kubernetes](./docs/developer-llm-operator/) | Single node, single GPU | llama2-13b | Langchain + Llama Index | No | No | e5-large-v2 | Yes | Yes | Milvus | Yes |
+### Developer RAG Examples
+The developer RAG examples run on a single VM. They demonstrate how to combine NVIDIA GPU acceleration with popular LLM programming frameworks using NVIDIA's [open source connectors](#open-source-integrations). The examples are easy to deploy via [Docker Compose](https://docs.docker.com/compose/).
-## Large Language Models
-NVIDIA LLMs are optimized for building enterprise generative AI applications.
+Examples support local and remote inference endpoints. If you have a GPU, you can run inference locally via [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). If you don't have a GPU, you can run inference and embedding remotely via [NVIDIA AI Foundations endpoints](https://www.nvidia.com/en-us/ai-data-science/foundation-models/).
-| Name | Description | Type | Context Length | Example | License |
-|---------------|-----------------------|------------|----------------|---------|---------|
-| [nemotron-3-8b-qa-4k](https://huggingface.co/nvidia/nemotron-3-8b-qa-4k) | Q&A LLM customized on knowledge bases | Text Generation | 4096 | No | [NVIDIA AI Foundation Models Community License Agreement](https://developer.nvidia.com/downloads/nv-ai-foundation-models-license) |
-| [nemotron-3-8b-chat-4k-steerlm](https://huggingface.co/nvidia/nemotron-3-8b-chat-4k-steerlm) | Best out-of-the-box chat model with flexible alignment at inference | Text Generation | 4096 | No | [NVIDIA AI Foundation Models Community License Agreement](https://developer.nvidia.com/downloads/nv-ai-foundation-models-license) |
-| [nemotron-3-8b-chat-4k-rlhf](https://huggingface.co/nvidia/nemotron-3-8b-chat-4k-rlhf) | Best out-of-the-box chat model performance| Text Generation | 4096 | No | [NVIDIA AI Foundation Models Community License Agreement](https://developer.nvidia.com/downloads/nv-ai-foundation-models-license) |
+| Model | Embedding | Framework | Description | Multi-GPU | TRT-LLM | NVIDIA AI Foundation | Triton | Vector Database |
+|---------------|-----------------------|------------|-------------------------|-----------|------------|-------------|---------|--------|
+| llama-2 | e5-large-v2 | Llamaindex | Canonical QA Chatbot | [YES](RetrievalAugmentedGeneration/README.md#3-qa-chatbot-multi-gpu----a100h100l40s) | [YES](RetrievalAugmentedGeneration/README.md#2-qa-chatbot----a100h100l40s-gpu) | No | YES | Milvus/[PGVector](RetrievalAugmentedGeneration/README.md#2-qa-chatbot----a100h100l40s-gpu)|
+| mixtral_8x7b | nvolveqa_40k | Langchain | [Nvidia AI foundation based QA Chatbot](RetrievalAugmentedGeneration/README.md#1-qa-chatbot----nvidia-ai-foundation-inference-endpoint) | No | No | YES | YES | FAISS|
+| llama-2 | all-MiniLM-L6-v2 | Llama Index | [QA Chatbot, GeForce, Windows](https://github.com/NVIDIA/trt-llm-rag-windows/tree/release/1.0) | NO | YES | NO | NO | FAISS |
+| llama-2 | nvolveqa_40k | Langchain | [QA Chatbot, Task Decomposition Agent](./RetrievalAugmentedGeneration/README.md#5-qa-chatbot-with-task-decomposition-example----a100h100l40s) | No | No | YES | YES | FAISS |
+| mixtral_8x7b | nvolveqa_40k | Langchain | [Minimalistic example showcasing RAG using Nvidia AI foundation models](./examples/README.md#rag-in-5-minutes-example) | No | No | YES | YES | FAISS |
-## Integration Examples
+
+### Enterprise RAG Examples
+
+The enterprise RAG examples run as microservices distributed across multiple VMs and GPUs. They show how RAG pipelines can be orchestrated with [Kubernetes](https://kubernetes.io/) and deployed with [Helm](https://helm.sh/).
+
+Enterprise RAG examples include a [Kubernetes operator](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/) for LLM lifecycle management. It is compatible with the [NVIDIA GPU operator](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/gpu-operator) that automates GPU discovery and lifecycle management in a Kubernetes cluster.
+
+Enterprise RAG examples also support local and remote inference via [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) and [NVIDIA AI Foundations endpoints](https://www.nvidia.com/en-us/ai-data-science/foundation-models/).
+
+| Model | Embedding | Framework | Description | Multi-GPU | Multi-node | TRT-LLM | NVIDIA AI Foundation | Triton | Vector Database |
+|---------------|-----------------------|------------|--------|-------------------------|-----------|------------|-------------|---------|--------|
+| llama-2 | NV-Embed-QA-003 | Llamaindex | QA Chatbot, Helm, k8s | NO | NO | [YES](./docs/developer-llm-operator/) | NO | YES | Milvus|
+
+## Tools
+
+Example tools and tutorials to enhance LLM development and productivity when using NVIDIA RAG pipelines.
+
+| Name | Description | Deployment | Tutorial |
+|------|-------------|------|--------|
+| Evaluation | Example open source RAG eval tool that uses synthetic data generation and LLM-as-a-judge | [Docker compose file](./deploy/compose/docker-compose-evaluation.yaml) | [README](./docs/rag/evaluation.md) |
+| Observability | Monitoring and debugging support for RAG pipelines (a minimal tracing sketch follows this table) | [Docker compose file](./deploy/compose/docker-compose-observability.yaml) | [README](./docs/rag/observability.md) |
+
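+The observability tooling referenced above is built around OpenTelemetry tracing. As a purely illustrative, minimal sketch (not the repository's actual LlamaIndex callback handler under `tools/observability/llamaindex/`), the snippet below shows the general idea of wrapping the stages of a RAG query in spans. It assumes only that the `opentelemetry-sdk` Python package is installed; all names in it are hypothetical.
+
+```python
+from opentelemetry import trace
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
+
+# Export spans to the console; a real deployment would point an OTLP exporter
+# at the collector configured in docker-compose-observability.yaml.
+trace.set_tracer_provider(TracerProvider())
+trace.get_tracer_provider().add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
+tracer = trace.get_tracer("rag.pipeline")
+
+def answer(question: str) -> str:
+    # One parent span per query, with child spans for retrieval and generation.
+    with tracer.start_as_current_span("rag.query") as span:
+        span.set_attribute("rag.question", question)
+        with tracer.start_as_current_span("rag.retrieve"):
+            context = "retrieved chunks would go here"
+        with tracer.start_as_current_span("rag.generate"):
+            return f"answer grounded in: {context}"
+
+print(answer("How many cores does the Grace superchip contain?"))
+```
+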
+## Open Source Integrations
+
+These are open source connectors for NVIDIA-hosted and self-hosted API endpoints, maintained and tested by NVIDIA engineers. A brief usage sketch follows the table.
+
+| Name | Framework | Chat | Text Embedding | Python | Description |
+|------|-----------|------|-----------|--------|-------------|
+|[NVIDIA AI Foundation Endpoints](https://python.langchain.com/docs/integrations/providers/nvidia) | [Langchain](https://www.langchain.com/) |[YES](https://python.langchain.com/docs/integrations/chat/nvidia_ai_endpoints)|[YES](https://python.langchain.com/docs/integrations/text_embedding/nvidia_ai_endpoints)|[YES](https://pypi.org/project/langchain-nvidia-ai-endpoints/)|Easy access to NVIDIA hosted models. Supports chat, embedding, code generation, steerLM, multimodal, and RAG.|
+|[NVIDIA Triton + TensorRT-LLM](https://github.com/langchain-ai/langchain/tree/master/libs/partners/nvidia-trt) | [Langchain](https://www.langchain.com/) |[YES](https://github.com/langchain-ai/langchain/blob/master/libs/partners/nvidia-trt/docs/llms.ipynb)|[YES](https://github.com/langchain-ai/langchain/blob/master/libs/partners/nvidia-trt/docs/llms.ipynb)|[YES](https://pypi.org/project/langchain-nvidia-trt/)|This connector allows Langchain to remotely interact with a Triton inference server over gRPC or HTTP for optimized LLM inference.|
+|[NVIDIA Triton Inference Server](https://docs.llamaindex.ai/en/stable/examples/llm/nvidia_triton.html) | [LlamaIndex](https://www.llamaindex.ai/) |YES|YES|NO|Triton inference server provides API access to hosted LLM models over gRPC. |
+|[NVIDIA TensorRT-LLM](https://docs.llamaindex.ai/en/stable/examples/llm/nvidia_tensorrt.html) | [LlamaIndex](https://www.llamaindex.ai/) |YES|YES|NO|TensorRT-LLM provides a Python API to build TensorRT engines with state-of-the-art optimizations for LLM inference on NVIDIA GPUs. |
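+
+As a brief, hedged usage sketch of the first connector above: assuming the `langchain-nvidia-ai-endpoints` package is installed and an `NVIDIA_API_KEY` environment variable is set, and with the model names given only as examples, hosted chat and embedding models can be called roughly as follows.
+
+```python
+from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
+
+# Hosted chat and embedding models; the model names are illustrative.
+llm = ChatNVIDIA(model="mixtral_8x7b")
+embedder = NVIDIAEmbeddings(model="nvolveqa_40k")
+
+# Chat returns a message object; embed_query returns a list of floats.
+print(llm.invoke("What is Retrieval Augmented Generation?").content)
+print(len(embedder.embed_query("How many cores does the Grace superchip contain?")))
+```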
+
## NVIDIA support
-In each of the READMEs, we indicate the level of support provided.
+In each example README we indicate the level of support provided.
## Feedback / Contributions
-We're posting these examples on GitHub to better support the community, facilitate feedback, as well as collect and implement contributions using GitHub Issues and pull requests. We welcome all contributions!
+We're posting these examples on GitHub to support the NVIDIA LLM community and facilitate feedback. We invite contributions via GitHub Issues or pull requests!
## Known issues
- In each of the READMEs, we indicate any known issues and encourage the community to provide feedback.
diff --git a/RetrievalAugmentedGeneration/.gitattributes b/RetrievalAugmentedGeneration/.gitattributes
deleted file mode 100644
index c8a8d73b..00000000
--- a/RetrievalAugmentedGeneration/.gitattributes
+++ /dev/null
@@ -1 +0,0 @@
-notebooks/dataset.zip filter=lfs diff=lfs merge=lfs -text
diff --git a/RetrievalAugmentedGeneration/.gitignore b/RetrievalAugmentedGeneration/.gitignore
deleted file mode 100644
index baec5514..00000000
--- a/RetrievalAugmentedGeneration/.gitignore
+++ /dev/null
@@ -1,25 +0,0 @@
-# Python Exclusions
-.venv
-__pycache__
-
-# Sphinx Exclusions
-_build
-
-# Helm Exclusions
-**/charts/*.tgz
-
-# project temp files
-deploy/*.log
-deploy/*.txt
-**/my.*
-**/my-*
-
-# Next JS Exclusions
-**/.next
-frontend/frontend_js/out
-frontend-sdxl/frontend_js/out
-**/node_modules
-
-# Docker Compose exclusions
-volumes/
-uploaded_files/
diff --git a/RetrievalAugmentedGeneration/Dockerfile b/RetrievalAugmentedGeneration/Dockerfile
index 25e879cd..20578559 100644
--- a/RetrievalAugmentedGeneration/Dockerfile
+++ b/RetrievalAugmentedGeneration/Dockerfile
@@ -1,14 +1,22 @@
ARG BASE_IMAGE_URL=nvcr.io/nvidia/pytorch
ARG BASE_IMAGE_TAG=23.08-py3
-
FROM ${BASE_IMAGE_URL}:${BASE_IMAGE_TAG}
+
+ARG EXAMPLE_NAME
COPY RetrievalAugmentedGeneration/__init__.py /opt/RetrievalAugmentedGeneration/
COPY RetrievalAugmentedGeneration/common /opt/RetrievalAugmentedGeneration/common
-COPY RetrievalAugmentedGeneration/examples /opt/RetrievalAugmentedGeneration/examples
+COPY RetrievalAugmentedGeneration/examples/${EXAMPLE_NAME} /opt/RetrievalAugmentedGeneration/example
COPY integrations /opt/integrations
+COPY tools /opt/tools
+RUN apt-get update && apt-get install -y libpq-dev
RUN --mount=type=bind,source=RetrievalAugmentedGeneration/requirements.txt,target=/opt/requirements.txt \
python3 -m pip install --no-cache-dir -r /opt/requirements.txt
+RUN if [ -f "/opt/RetrievalAugmentedGeneration/example/requirements.txt" ] ; then \
+ python3 -m pip install --no-cache-dir -r /opt/RetrievalAugmentedGeneration/example/requirements.txt ; else \
+ echo "Skipping example dependency installation, since requirements.txt was not found" ; \
+ fi
+
WORKDIR /opt
ENTRYPOINT ["uvicorn", "RetrievalAugmentedGeneration.common.server:app"]
diff --git a/RetrievalAugmentedGeneration/README.md b/RetrievalAugmentedGeneration/README.md
index b47d967d..614d19fc 100644
--- a/RetrievalAugmentedGeneration/README.md
+++ b/RetrievalAugmentedGeneration/README.md
@@ -1,205 +1,694 @@
# Retrieval Augmented Generation
-## Project Details
-**Project Goal**: A reference Retrieval Augmented Generation(RAG) workflow for a chatbot to question answer off public press releases & tech blogs. It performs document ingestion & Q&A interface using open source models deployed on any cloud or customer datacenter, leverages the power of GPU-accelerated Milvus for efficient vector storage and retrieval, along with TRT-LLM, to achieve lightning-fast inference speeds with custom LangChain LLM wrapper.
+Retrieval Augmented Generation (RAG) generates up-to-date and domain-specific answers by connecting a Large Language Model (LLM) to your enterprise data.
+
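+As a conceptual illustration only (not one of the packaged examples below), a bare-bones RAG flow embeds documents into a vector store, retrieves the chunks most similar to the question, and asks the LLM to answer from that context. The sketch assumes `langchain`, `langchain-nvidia-ai-endpoints`, and `faiss-cpu` are installed and an `NVIDIA_API_KEY` environment variable is set; the model names and sample text are assumptions.
+
+```python
+from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
+from langchain.vectorstores import FAISS
+
+docs = [
+    "The NVIDIA Grace CPU Superchip has 144 Arm Neoverse V2 cores.",
+    "Grace connects to Hopper GPUs over NVLink-C2C.",
+]
+
+# 1. Embed the enterprise text and index it in a vector store.
+index = FAISS.from_texts(docs, NVIDIAEmbeddings(model="nvolveqa_40k"))
+
+# 2. Retrieve the chunks most relevant to the user's question.
+question = "How many cores does the Grace superchip contain?"
+context = "\n".join(d.page_content for d in index.similarity_search(question, k=2))
+
+# 3. Ask the LLM to answer using only the retrieved context.
+llm = ChatNVIDIA(model="mixtral_8x7b")
+print(llm.invoke(f"Answer from this context:\n{context}\n\nQuestion: {question}").content)
+```
+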
+## Developer RAG Examples
+
+1. [QA Chatbot -- No-GPU using NVIDIA AI Foundation](#1-qa-chatbot----nvidia-ai-foundation-inference-endpoint)
+2. [QA Chatbot -- A100/H100/L40S](#2-qa-chatbot----a100h100l40s-gpu)
+3. [QA Chatbot -- Multi-GPU](#3-qa-chatbot-multi-gpu----a100h100l40s)
+4. [QA Chatbot -- Quantized LLM model](#4-qa-chatbot-with-quantized-llm-model----a100h100l40s)
+5. [QA Chatbot -- Task Decomposition](#5-qa-chatbot-with-task-decomposition-example----a100h100l40s)
+6. [QA Chatbot -- NemoTron Model](#6-qa-chatbot----nemotron-model)
+
+
+
+### 1: QA Chatbot -- NVIDIA AI Foundation inference endpoint
+
+This example deploys a developer RAG pipeline for chat QA and serves inference via the NVIDIA AI Foundation endpoint.
+
+Developers get free credits for 10K requests to any of the available models.
+
+| Model | Embedding | Framework | Description | Multi-GPU | TRT-LLM | NVIDIA AI Foundation | Triton | Vector Database |
+|-------|-----------|-----------|-------------|-----------|---------|----------------------|--------|-----------------|
+| mixtral_8x7b | nvolveqa_40k | Langchain | QA chatbot | NO | NO | YES | NO | FAISS |
+
+#### 1.1 Prepare the environment
+
+This example uses the NVIDIA AI Foundation inference endpoint.
+
+1. Follow steps 1 - 5 in the ["Prepare the environment" section of example 02](#21-prepare-the-environment).
+
+#### 1.2 Deploy
+
+Follow [these instructions](../docs/rag/aiplayground.md) to sign up for an NVIDIA AI Foundation developer account and deploy this example.
+
+
+
+### 2: QA Chatbot -- A100/H100/L40S GPU
+
+This example deploys a developer RAG pipeline for chat QA and serves inference via the NeMo Framework inference container.
+> ⚠️ **NOTE**: This example requires an A100, H100, or L40S GPU. Refer to the [support matrix](../docs/rag/support_matrix.md) to understand memory requirements for the model you are deploying.
+
+| Model | Embedding | Framework | Description | Multi-GPU | TRT-LLM | NVIDIA AI Foundation | Triton | Vector Database |
+|-------|-----------|-----------|-------------|-----------|---------|----------------------|--------|-----------------|
+| llama-2 | e5-large-v2 | Llamaindex | QA chatbot | NO | YES | NO | YES | Milvus |
+| llama-2 | e5-large-v2 | Llamaindex | QA chatbot | NO | YES | NO | YES | pgvector |
+
+#### 2.1 Prepare the environment
+
+1. Install [Docker Engine and Docker Compose.](https://docs.docker.com/engine/install/ubuntu/)
+
+2. Verify NVIDIA GPU driver version 535 or later is installed.
+
+ **Note**: This step is not required for the Nvidia AI Foundation workflow.
+
+```
+$ nvidia-smi --query-gpu=driver_version --format=csv,noheader
+535.129.03
+
+$ nvidia-smi -q -d compute
+
+==============NVSMI LOG==============
+
+Timestamp : Sun Nov 26 21:17:25 2023
+Driver Version : 535.129.03
+CUDA Version : 12.2
+
+Attached GPUs : 1
+GPU 00000000:CA:00.0
+ Compute Mode : Default
+```
+Reference: [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) and [NVIDIA Linux driver installation instructions](https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html)
+
+3. Clone the Generative AI examples Git repository.
+
+> ⚠️ **NOTE**: This example requires Git Large File Support (LFS)
+
+```
+sudo apt -y install git-lfs
+git clone git@github.com:NVIDIA/GenerativeAIExamples.git
+cd GenerativeAIExamples/
+git lfs pull
+```
-## Components
-- **LLM**: [Llama2](https://ai.meta.com/llama/) - 7b-chat, 13b-chat, and 70b-chat all supported. 13b-chat and 70b-chat generate good responses.
-- **LLM Backend**: Nemo framework inference container with Triton inference server & TRT-LLM backend for speed.
-- **Vector DB**: Milvus because it's GPU accelerated.
-- **Embedding Model**: [e5-large-v2](https://huggingface.co/intfloat/e5-large-v2) since it is one of the best embedding model available at the moment.
-- **Framework(s)**: LangChain and LlamaIndex.
+4. Verify the NVIDIA container toolkit is installed and configured as the default container runtime.
-This reference workflow uses a variety of components and services to customize and deploy the RAG based chatbot. The following diagram illustrates how they work together. Refer to the [detailed architecture guide](../docs/rag/architecture.md) to understand more about these components and how they are tied together.
+ **Note**: This step is not required for the Nvidia AI Foundation workflow.
+```
+$ cat /etc/docker/daemon.json
+{
+ "default-runtime": "nvidia",
+ "runtimes": {
+ "nvidia": {
+ "path": "/usr/bin/nvidia-container-runtime",
+ "runtimeArgs": []
+ }
+ }
+}
+
+$ sudo docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi -L
+GPU 0: NVIDIA A100 80GB PCIe (UUID: GPU-d8ce95c1-12f7-3174-6395-e573163a2ace)
+```
+
+5. Create an NGC Account and API Key.
+
+Please refer to [these instructions](https://docs.nvidia.com/ngc/gpu-cloud/ngc-overview/index.html) to create an account and generate an NGC API key.
+
+Log in to `nvcr.io` using the following command:
+
+```
+docker login nvcr.io
+```
+
+6. [Optional] Enable Riva ASR and TTS.
-![Diagram](../docs/rag/images/image3.jpg)
+ a. To launch a Riva server locally, please refer to the instructions in the [Riva Quick Start Guide](https://docs.nvidia.com/deeplearning/riva/user-guide/docs/quick-start-guide.html).
-*Note:*
-We've used [Llama2](https://ai.meta.com/llama/) and [e5-large-v2](https://huggingface.co/intfloat/e5-large-v2) models as example defaults in this workflow, you should ensure that both the LLM and embedding model are appropriate for your use case, and validate that they are secure and have not been tampered with prior to use.
+ - In the provided `config.sh` script, set `service_enabled_asr=true` and `service_enabled_tts=true`, and select the desired ASR and TTS languages by adding the appropriate language codes to `asr_language_code` and `tts_language_code`.
-# Getting Started
-This section covers step by step guide to setup and try out this example workflow.
+ - Once the server is running, assign its IP address (or hostname) and port (50051 by default) to `RIVA_API_URI` in `deploy/compose/compose.env`.
-## Prerequisites
-Before proceeding with this guide, make sure you meet the following prerequisites:
+ b. Alternatively, you can use a hosted Riva API endpoint. You might need to obtain an API key and/or Function ID for access.
-- You should have at least one NVIDIA GPU. For this guide, we used an A100 data center GPU.
+ - In `deploy/compose/compose.env`, make the following assignments as necessary:
+ ```
+ export RIVA_API_URI="<riva-ip-address>:<riva-port>"
+ export RIVA_API_KEY="<riva-api-key>"
+ export RIVA_FUNCTION_ID="<riva-function-id>"
+ ```
- - NVIDIA driver version 535 or newer. To check the driver version run: ``nvidia-smi --query-gpu=driver_version --format=csv,noheader``.
- - If you are running multiple GPUs they must all be set to the same mode (ie Compute vs. Display). You can check compute mode for each GPU using
- ``nvidia-smi -q -d compute``
+Reference:
+- [Docker installation instructions (Ubuntu)](https://docs.docker.com/engine/install/ubuntu/)
+- [NVIDIA Container Toolkit Installation instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
-### Setup the following
+#### 2.2 Deploy
-- Docker and Docker-Compose are essential. Please follow the [installation instructions](https://docs.docker.com/engine/install/ubuntu/).
+##### Downloading the model
+You can download the model from either Hugging Face or Meta.
- Note:
- Please do **not** use Docker that is packaged with Ubuntu as the newer version of Docker is required for proper Docker Compose support.
+The steps below explain how to download from Meta. If you want to download the model checkpoints from Hugging Face instead, follow the steps [here](../docs/rag/hf_model_download.md).
- Make sure your user account is able to execute Docker commands.
+1. Clone the Llama GitHub repository.
+```
+git clone https://github.com/facebookresearch/llama.git
+cd llama/
+```
-- NVIDIA Container Toolkit is also required. Refer to the [installation instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
+2. Fill out Meta's [Llama request access form](https://ai.meta.com/resources/models-and-libraries/llama-downloads/).
+3. Download the model weights.
-- NGC Account and API Key
+- Select the Llama 2 and Llama Chat text boxes.
+- After verifying your email, Meta will email you a download link.
+- Download the llama-2-13b-chat model when prompted.
- - Please refer to [instructions](https://docs.nvidia.com/ngc/gpu-cloud/ngc-overview/index.html) to create account and generate NGC API key.
- - Docker login to `nvcr.io` using the following command:
- ```
- docker login nvcr.io
- ```
+```
+$ ./download.sh
+Enter the URL from email: < https://download.llamameta.net/… etc>
-- git-lfs
- - Make sure you have [git-lfs](https://git-lfs.github.com) installed.
+Enter the list of models to download without spaces (7B,13B,70B,7B-chat,13B-chat,70B-chat), or press Enter for all: 13B-chat
+```
-- You can download Llama2 Chat Model Weights from [Meta](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or [HuggingFace](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf/). You can skip this step [if you are interested in using cloud based LLM's using Nvidia AI Playground](#using-nvdia-cloud-based-llm).
+4. Copy the tokenizer to the model directory.
- **Note for checkpoint downloaded using Meta**:
+```
+$ mv tokenizer* llama-2-13b-chat/
- - When downloading model weights from Meta, you can follow the instructions up to the point of downloading the models using ``download.sh``. There is no need to deploy the model using the steps mentioned in the repository. We will use Triton to deploy the model.
+$ ls ~/git/llama/llama-2-13b-chat/
+checklist.chk consolidated.00.pth consolidated.01.pth params.json tokenizer.model tokenizer_checklist.chk
+```
- - Meta will download two additional files, namely `tokenizer.model` and `tokenizer_checklist.chk`, outside of the model checkpoint directory. Ensure that you copy these files into the same directory as the model checkpoint directory.
+##### Deploying the model
- **Using Cloud based Nvidia AI Foundational models**:
+1. Set the absolute path to the model location in compose.env.
- - Instead of deploying the models on-prem if you will like to use LLM models deployed from NVIDIA AI Playground then follow the instructions from [here.](../docs/rag/aiplayground.md)
+```
+$ cd ~/git/GenerativeAIExamples
- **Using Quantized models**:
+$ grep MODEL deploy/compose/compose.env | grep -v \#
+export MODEL_DIRECTORY="/home/nvidia/git/llama/llama-2-13b-chat/"
+export MODEL_ARCHITECTURE="llama"
+export MODEL_NAME="Llama-2-13b-chat"
+```
- - In this workflow, we will be leveraging a Llama2 (7B parameters) chat model, which requires 38 GB of GPU memory.
- IMPORTANT: For this initial version of the workflow only 7B chat model is supported on A100 and H100 GPUs.
+2. Deploy the developer RAG example via Docker Compose using the Milvus vector store. Steps to deploy the RAG example with the pgvector vector store are [here](#deploying-with-pgvector-vector-store).
- - We also support quantization of LLama2 model using AWQ, which changes model precision to INT4, thereby reducing memory usage. Checkout the steps [here](../docs/rag/llm_inference_server.md) to enable quantization.
+> ⚠️ **NOTE**: It may take up to 5 minutes for the Triton server to start. The `-d` flag starts the services in the background.
+```
+$ source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose.yaml build
-## Install Guide
+$ docker compose -f deploy/compose/docker-compose.yaml up -d
-NVIDIA TensorRT LLM providex state of the art performance for running LLM inference. Follow the below steps from the root of this project to setup the RAG example with TensorRT LLM and Triton deployed locally.
+$ docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}"
+CONTAINER ID NAMES STATUS
+256da0ecdb7b llm-playground Up 48 minutes
+2974aa4fb2ce chain-server Up 48 minutes
+4a8c4aebe4ad notebook-server Up 48 minutes
+5be2b57bb5c1 milvus-standalone Up 48 minutes (healthy)
+ecf674c8139c llm-inference-server Up 48 minutes (healthy)
+a6609c22c171 milvus-minio Up 48 minutes (healthy)
+b23c0858c4d4 milvus-etcd Up 48 minutes (healthy)
+```
-### Step 1: Set Environment Variables
+Reference:
+- [Meta Llama README](https://github.com/facebookresearch/llama/blob/main/README.md)
+- [Meta Llama request access form](https://ai.meta.com/resources/models-and-libraries/llama-downloads/)
-Modify ``compose.env`` in the ``deploy/compose`` directory to set your environment variables. The following variables are required as shown below for using a llama based model.
+#### 2.3 Test
- # full path to the local copy of the model weights
- export MODEL_DIRECTORY="$HOME/src/Llama-2-13b-chat-hf"
+1. Connect to the sample web application at ``http://host-ip:8090``.
- # the architecture of the model. eg: llama
- export MODEL_ARCHITECTURE="llama"
+2. Check **[X] Enable TTS output** to allow the web app to read the answers to your queries aloud.
- # the name of the model being used - only for displaying on frontend
- export MODEL_NAME="llama-2-13b-chat"
+3. Select the desired ASR language (`English (en-US)` for this test), TTS language (`English (en-US)` for this test) and TTS voice from the dropdown menus below the checkboxes to utilize the web app's voice-to-voice interaction capabilities.
- # [OPTIONAL] the config file for chain server
- APP_CONFIG_FILE=/dev/null
+4. In the Converse tab, type "How many cores does the Grace superchip contain?" in the chat box and press Submit. Alternatively, click on the microphone button to the right of the text box and ask your query verbally.
+![Grace query failure](../notebooks/imgs/grace_noanswer_with_riva.png)
-### Step 2: Build and Start Containers
-- Pull lfs files. This will pull large files from repository.
- ```
- git lfs pull
- ```
-- Run the following command to build containers.
- ```
- source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose.yaml build
- ```
+5. If you encounter an error message reading "Media devices could not be accessed" when you first attempt to transcribe a voice query,
-- Run the following command to start containers.
- ```
- source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose.yaml up -d
- ```
- > ⚠️ **NOTE**: It will take a few minutes for the containers to come up and may take up to 5 minutes for the Triton server to be ready. Adding the `-d` flag will have the services run in the background. ⚠️
+![Media device access error](../notebooks/imgs/media_device_access_error.png)
+
+carry out the following steps:
+
+ - Open ``chrome://flags`` in another browser tab.
+
+ - Search for "insecure origins treated as secure".
+
+ - Copy ``http://host-ip:8090`` into the associated text box.
+
+ - Select "Enabled" in the adjacent dropdown menu.
+
+ - Click on the "Relaunch" button at the bottom right of the page.
+
+ - Grant ``http://host-ip:8090`` access to your microphone.
+
+![Fix media device access error in Chrome Flags](../notebooks/imgs/chrome_flags_fix_media_device_access_error.png)
+
+6. Upload the sample data set to the Knowledge Base tab.
+
+> ⚠️ **NOTE**: ``dataset.zip`` is located in the ``notebooks`` directory. Unzip the archive and upload the PDFs.
+
+> There is a timeout of `10 mins` set for the ingestion process. Uploading large files may result in ingestion failures depending on network bandwidth.
+
+7. Return to **Converse** tab and check **[X] Use knowledge base**.
+
+8. Retype (or re-transcribe) the question: "How many cores does the Grace superchip contain?"
+
+![Grace query success](../notebooks/imgs/grace_answer_with_riva.png)
+
+> ⚠️ **NOTE**: Default prompts are optimized for the llama chat model. If you're using a completion model, the prompts need to be fine-tuned accordingly.
+
+#### Learn More
+
+Execute the Jupyter notebooks to explore optional features.
-- Run ``docker ps -a``. When the containers are ready the output should look similar to the image below.
- ![Docker Output](../docs/rag/images/docker-output.png "Docker Output Image")
+Note: The Jupyter notebooks are supported for the [default flow](../deploy/compose/docker-compose.yaml), i.e. TRT-LLM with Milvus.
+1. In a web browser, open Jupyter at ``http://host-ip:8888``.
- **Note**:
- - Default prompts are optimized for llama chat model if you're using completion model then prompts need to be finetuned accordingly.
+2. Execute the notebooks in order:
-#### Multi GPU deployment
+- [Enable streaming responses from the LLM](../notebooks/01-llm-streaming-client.ipynb)
+- [Document QA with LangChain](../notebooks/02_langchain_simple.ipynb)
+- [Document QA with LlamaIndex](../notebooks/03_llama_index_simple.ipynb)
+- [Advanced Document QA with LlamaIndex](../notebooks/04_llamaindex_hier_node_parser.ipynb)
+- [Document QA via REST FastAPI Server](../notebooks/05_dataloader.ipynb)
-By default the LLM model will be deployed using all available GPU's of the system. To use some specific GPU's you can provide the GPU ID(s) in the [docker compose file](../deploy/compose/docker-compose.yaml) under `llm` service's `deploy` section:
+#### 2.4 Uninstall
+To uninstall, stop and remove the running containers.
+```
+cd deploy/compose
+source compose.env
+docker compose down
+docker compose ps -q
+```
+
+#### Deploying with [pgvector](https://github.com/pgvector/pgvector) vector store
+2. Deploy the developer RAG example via Docker compose.
+
+> ⚠️ **NOTE**: It may take up to 5 minutes for the Triton server to start. The `-d` flag starts the services in the background.
+
+```
+$ source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose-pgvector.yaml build
+
+$ docker compose -f deploy/compose/docker-compose-pgvector.yaml up -d
+
+$ docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}"
+CONTAINER ID NAMES STATUS
+0f6f091d892e llm-playground Up 22 hours
+8d0ab09fcb98 chain-server Up 22 hours
+85bd98ba3b24 notebook-server Up 22 hours
+22f0d405b38b llm-inference-server Up 22 hours (healthy)
+cbd3cf65ce7e pgvector Up 22 hours
+```
+
+After the deployment is successful, you can follow the steps in [Test](#23-test) to verify the workflow.
+
+
+
+### 3: QA Chatbot Multi-GPU -- A100/H100/L40S
+
+This example deploys a developer RAG pipeline for chat QA and serves inference via the NeMo Framework inference container across multiple GPUs.
+
+| Model | Embedding | Framework | Description | Multi-GPU | TRT-LLM | NVIDIA AI Foundation | Triton | Vector Database |
+|-------|-----------|-----------|-------------|-----------|---------|----------------------|--------|-----------------|
+| llama-2 | e5-large-v2 | Llamaindex | QA chatbot | YES | YES | NO | YES | Milvus |
+
+#### 3.1 Prepare the environment
+
+1. Follow the steps in the ["Prepare the environment" section of example 02](#21-prepare-the-environment).
+
+#### 3.2 Deploy
+
+1. Follow steps 1 - 4 in the ["Deploy" section of example 02](#downloading-the-model) to stage the model weights.
+
+2. Find the GPU device ID. You can check this using the `nvidia-smi` command.
+
+3. Assign LLM inference to specific GPUs by specifying the GPU ID(s) in the [docker compose file](../deploy/compose/docker-compose.yaml).
+
+```
deploy:
resources:
reservations:
devices:
- driver: nvidia
# count: ${INFERENCE_GPU_COUNT:-all} # Comment this out
- device_ids: ["0"] # Provide the device id of GPU. It can be found using `nvidia-smi` command
+ device_ids: ["0"]
capabilities: [gpu]
+```
+
+4. Follow steps in the ["Deploy the model" section of example 02](#deploying-the-model) to deploy via Docker compose.
+
+#### 3.3 Test
+
+1. Follow steps 1 - 5 in the ["Test" section of example 02](#23-test).
+
+2. Verify the correct GPU is serving the model using `nvidia-smi`.
+
+#### 3.4 Uninstall
+
+1. To uninstall, follow the ["Uninstall" steps in example 02](#24-uninstall).
+
+
+
+
+### 4: QA Chatbot with Quantized LLM model -- A100/H100/L40S
+
+This example deploys a developer RAG pipeline for chat QA and serves inference via the NeMo Framework inference container across multiple GPUs using a quantized version of the Llama2-7b-chat model.
+
+| Model | Embedding | Framework | Description | Multi-GPU | TRT-LLM | NVIDIA AI Foundation | Triton | Vector Database |
+|-------|-----------|-----------|-------------|-----------|---------|----------------------|--------|-----------------|
+| llama-2-7b-chat | e5-large-v2 | Llamaindex | QA chatbot | YES | YES | NO | YES | Milvus |
+
+#### 4.1 Prepare the environment
+
+1. Follow the steps in the ["Prepare the environment" section of example 02](#21-prepare-the-environment).
+
+
+#### 4.2 Deploy
+1. [Download the Llama2-7b-chat model weights](#downloading-the-model) from Hugging Face, as the Meta checkpoint does not include the files required for quantization.
+
+> ⚠️ **NOTE**: For this initial version only 7B chat model is supported on A100/H100/L40 GPUs.
+
+
+2. To quantize the Llama2 model using AWQ, first clone the [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.5.0) repository separately and check out the release/0.5.0 branch.
+
+ - Also copy the Llama2 model directory downloaded earlier to the TensorRT-LLM repo.
+
+```
+ git clone https://github.com/NVIDIA/TensorRT-LLM.git
+ cp -r <path-to-llama2-model-directory> TensorRT-LLM/
+ cd TensorRT-LLM/
+ git checkout release/0.5.0
+```
+
+3. Now set up the TensorRT-LLM repo separately using the steps [here](https://github.com/NVIDIA/TensorRT-LLM/blob/release/0.5.0/docs/source/installation.md).
+
+4. Once the model is downloaded and the TensorRT-LLM repo is set up, quantize the model using the TensorRT-LLM container.
+
+ - Follow the steps from [here](https://github.com/NVIDIA/TensorRT-LLM/tree/v0.5.0/examples/llama#awq) to quantize using AWQ; run these commands inside the container.
+
+ - While running the quantization script, make sure to point `--model_dir` to your downloaded Llama2 model directory.
+
+ - Once the quantization is completed, copy the generated PyTorch (.pt) file into the model directory:
+
+ ```
+ cp <quantized-checkpoint>.pt <path-to-llama2-model-directory>
+ ```
+
+5. Now return to this repository and follow the steps below to deploy the quantized model using the inference server.
+
+ - Update [compose.env](../deploy/compose/compose.env) with `MODEL_DIRECTORY` pointing to the Llama2 model directory containing the quantized checkpoint.
+
+ - Make sure the quantized PyTorch model (.pt) file generated in the steps above is present inside the `MODEL_DIRECTORY`.
+
+ - Uncomment the `QUANTIZATION` variable, which sets quantization to "int4_awq", inside [compose.env](../deploy/compose/compose.env).
+ ```
+ export QUANTIZATION="int4_awq"
+ ```
+
+6. Deploy the developer RAG example via Docker compose.
+
+> ⚠️ **NOTE**: It may take up to 5 minutes for the Triton server to start. The `-d` flag starts the services in the background.
+
+```
+$ source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose.yaml build
+
+$ docker compose -f deploy/compose/docker-compose.yaml up -d
+
+$ docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}"
+CONTAINER ID NAMES STATUS
+256da0ecdb7b llm-playground Up 48 minutes
+2974aa4fb2ce chain-server Up 48 minutes
+4a8c4aebe4ad notebook-server Up 48 minutes
+5be2b57bb5c1 milvus-standalone Up 48 minutes (healthy)
+ecf674c8139c llm-inference-server Up 48 minutes (healthy)
+a6609c22c171 milvus-minio Up 48 minutes (healthy)
+b23c0858c4d4 milvus-etcd Up 48 minutes (healthy)
+```
+
+#### 4.3 Test
+
+1. Follow steps 1 - 5 in the ["Test" section of example 02](#23-test).
+
+#### 4.4 Uninstall
+
+1. To uninstall, follow the ["Uninstall" steps in example 02](#24-uninstall).
+
+
+
+### 5: QA Chatbot with Task Decomposition example -- A100/H100/L40S
+
+This example deploys a recursive Task Decomposition example for chat QA. It uses the llama2-70b chat model (via the NVIDIA AI Foundation endpoint) for inference.
+
+It showcases how to perform RAG when the agent needs to access information from several different files/chunks or perform some computation on the answers. It uses a custom langchain agent that recursively breaks down the user's question into sub-questions that it attempts to answer. It has access to two tools: search (which performs standard RAG on a sub-question) and math (which poses a math question to the LLM). The agent continues to break the question down into sub-questions until it has the answers it needs to formulate the final answer.
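+
+The following is a heavily simplified, hypothetical sketch of that decomposition loop, not the example's actual agent (see `examples/query_decomposition_rag/chains.py` in this directory for the real implementation). The `llm` callable and the `search`/`math` tools are stand-ins you would supply.
+
+```python
+from typing import Callable, Dict, List
+
+def decompose_and_answer(
+    question: str,
+    llm: Callable[[str], str],                 # stand-in for the hosted LLM
+    tools: Dict[str, Callable[[str], str]],    # e.g. {"search": ..., "math": ...}
+    max_steps: int = 5,
+) -> str:
+    facts: List[str] = []
+    for _ in range(max_steps):
+        # Ask the LLM either for the final answer or for the next sub-question
+        # and the tool that should answer it.
+        decision = llm(
+            "Facts so far:\n" + "\n".join(facts)
+            + f"\nQuestion: {question}\n"
+            + "Reply 'FINAL: <answer>' if answerable, otherwise '<tool>: <sub-question>'."
+        )
+        if decision.startswith("FINAL:"):
+            return decision[len("FINAL:"):].strip()
+        tool_name, _, sub_question = decision.partition(":")
+        result = tools[tool_name.strip()](sub_question.strip())
+        facts.append(f"{sub_question.strip()} -> {result}")
+    return llm("Give your best final answer.\n" + "\n".join(facts))
+```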
+
+| Model | Embedding | Framework | Description | Multi-GPU | TRT-LLM | NVIDIA AI Foundation | Triton | Vector Database |
+|-------|-----------|-----------|-------------|-----------|---------|----------------------|--------|-----------------|
+| llama2_70b | nvolveqa_40k | Langchain | QA chatbot | NO | NO | YES | NO | FAISS |
+
+#### 5.1 Prepare the environment
+
+1. Follow the steps in the ["Prepare the environment" section of example 02](#21-prepare-the-environment).
+
+
+#### 5.2 Deploy
+
+1. Follow the ["Deploy" section of example 01](#downloading-the-model) to setup your API key
+
+2. Change the RAG example in `deploy/compose/compose.env`.
+ ```shell
+ export RAG_EXAMPLE="query_decomposition_rag"
+ ```
+
+3. Change the LLM in `deploy/compose/docker-compose-nv-ai-foundation.yaml` to `llama2_70b`.
+ ```yaml
+ query:
+ container_name: chain-server
+ ...
+ environment:
+ APP_LLM_MODELNAME: llama2_70b
+ ...
+ ```
+
+4. Deploy the Query Decomposition RAG example via Docker compose.
+
+```
+$ source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose-nv-ai-foundation.yaml build
+$ docker compose -f deploy/compose/docker-compose-nv-ai-foundation.yaml up -d
-### Step 3: Experiment with RAG in JupyterLab
+$ docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}"
+CONTAINER ID NAMES STATUS
+256da0ecdb7b llm-playground Up 48 minutes
+2974aa4fb2ce chain-server Up 48 minutes
+```
-This AI Workflow includes Jupyter notebooks which allow you to experiment with RAG.
+#### 5.3 Test
-- Using a web browser, type in the following URL to open Jupyter
+1. Connect to the sample web application at ``http://host-ip:8090``.
- ``http://host-ip:8888``
+2. Upload 2 text documents in the Knowledge Base tab. The documents can contain different information - for example, one document can contain a company's revenue analysis for Q3 2023 and the other can contain a similar analysis for Q4 2023.
-- Locate the [LLM Streaming Client notebook](../notebooks/01-llm-streaming-client.ipynb) which demonstrates how to stream responses from the LLM.
+3. Return to the **Converse** tab and check **[X] Use knowledge base**.
-- Proceed with the next 4 notebooks:
+4. Enter the question: "Which is greater - NVIDIA's datacenter revenue for Q4 2023 or the sum of its datacenter and gaming revenues for Q3 2023?" and hit submit to get the answer.
- - [Document Question-Answering with LangChain](../notebooks/02_langchain_simple.ipynb)
+#### 5.4 Uninstall
- - [Document Question-Answering with LlamaIndex](../notebooks/03_llama_index_simple.ipynb)
+1. To uninstall, follow the ["Uninstall" steps in example 02](#24-uninstall).
- - [Advanced Document Question-Answering with LlamaIndex](../notebooks/04_llamaindex_hier_node_parser.ipynb)
+
- - [Interact with REST FastAPI Server](../notebooks/05_dataloader.ipynb)
+### 6: QA Chatbot -- NemoTron Model
-### Step 4: Run the Sample Web Application
-A sample chatbot web application is provided in the workflow. Requests to the chat system are wrapped in FastAPI calls.
+This example deploys a developer RAG pipeline for chat QA, serves inference via the NeMo Framework inference container using the NemoTron model, and showcases inference using a sample notebook.
-- Open the web application at ``http://host-ip:8090``.
-- Type in the following question without using a knowledge base: "How many cores are on the Nvidia Grace superchip?"
+#### 6.1 Prepare the environment
- **Note:** the chatbot mentions the chip doesn't exist.
+1. Follow the steps in the ["Prepare the environment" section of example 02](#21-prepare-the-environment).
-- To use a knowledge base:
+> ⚠️ **NOTE**: This example requires at least 100GB of GPU memory or two A100 GPUs for locally deploying the nemotron model.
- - Click the **Knowledge Base** tab and upload the file [dataset.zip](../notebooks/dataset.zip).
-- Return to **Converse** tab and check **[X] Use knowledge base**.
+#### 6.2 Deploy
-- Retype the question: "How many cores are on the Nvidia Grace superchip?"
+1. Download the [NemoTron chat checkpoint](https://huggingface.co/nvidia/nemotron-3-8b-chat-4k-sft) from Hugging Face.
-# RAG Evaluation
+```
+git-lfs clone https://huggingface.co/nvidia/nemotron-3-8b-chat-4k-sft
+```
-## Prerequisites
-Make sure the corps comm dataset is loaded into the vector database using the [Dataloader](../notebooks/05_dataloader.ipynb) notebook as part of step-3 of setup.
+2. Make sure the absolute path of the nemotron-3-8b-chat-4k-sft model is set in `/GenerativeAIExamples/deploy/compose/compose.env`. Set the following values in the `compose.env` file.
-This workflow include jupyter notebooks which allow you perform evaluation of your RAG application on the sample dataset and they can be extended to other datasets as well.
-Setup the workflow by building and starting the containers by following the steps [outlined here using docker compose.](#step-2-build-and-start-containers)
+```
+export MODEL_DIRECTORY="/home/nvidia/nemotron-3-8b-chat-4k-sft" # Example path
+export MODEL_ARCHITECTURE="gptnext"
+export MODEL_NAME="nemotron-3-8b-chat-4k-sft"
+```
-After setting up the workflow follow these steps:
+3. Build and deploy the NemoTron workflow.
-- Using a web browser, type in the following URL to open Jupyter Labs
+```
+source deploy/compose/compose.env
+docker compose -f deploy/compose/docker-compose-nemotron.yaml build
+docker compose -f deploy/compose/docker-compose-nemotron.yaml up -d
+```
+4. Check the deployment status by printing the logs of the `llm-inference-server` container.
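+
+For example, you can follow the container logs with Docker:
+
+```
+docker logs -f llm-inference-server
+```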
- ``http://host-ip:8889``
+After a successful TRT-LLM conversion and Triton Inference Server deployment, the logs display the following messages:
+```
+I0107 03:03:38.638311 260 http_server.cc:3558] Started HTTPService at 0.0.0.0:8000
+I0107 03:03:38.679626 260 http_server.cc:187] Started Metrics Service at 0.0.0.0:8002
+```
-- Locate the [synthetic data generation](../evaluation/01_synthetic_data_generation.ipynb) which demonstrates how to generate synthetic data of question answer pairs for evaluation
+#### 6.3 Test
-- Proceed with the next 3 notebooks:
+1. Run `02_langchain_simple.ipynb` for LangChain-based Document Question-Answering using the NemoTron model.
- - [Filling generated answers](../evaluation/02_filling_RAG_outputs_for_Evaluation.ipynb)
+[Optional] Run `00-llm-non-streaming-nemotron.ipynb` to send a request to the LLM; a direct REST request against the chain server is sketched after the note below.
- - [Ragas evaluation with NVIDIA AI playground](../evaluation/03_eval_ragas.ipynb)
+> ⚠️ **NOTE**: NemoTron models do not support streaming in this release.
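+
+As a rough sketch, you can also call the chain server's `/generate` endpoint directly; substitute the port that your compose file publishes for the chain server:
+
+```
+curl -X POST "http://host-ip:<chain-server-port>/generate" \
+  -H "Content-Type: application/json" \
+  -d '{"question": "What is the NVIDIA Grace superchip?", "context": "", "use_knowledge_base": false, "num_tokens": 128}'
+```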
- - [LLM as a Judge evaluation with NVIDIA AI playground](../evaluation/04_Human_Like_RAG_Evaluation-AIP.ipynb)
+
+### Learn More
-# Learn More
-1. [Architecture Guide](../docs/rag/architecture.md): Detailed explanation of different components and how they are tried up together.
-2. Component Guides: Component specific features are enlisted in these sections.
- 1. [Chain Server](../docs/rag/chat_server.md)
- 2. [NeMo Framework Inference Server](../docs/rag/llm_inference_server.md)
- 3. [Jupyter Server](../docs/rag/jupyter_server.md)
- 4. [Sample frontend](../docs/rag/frontend.md)
-3. [Configuration Guide](../docs/rag/configuration.md): This guide covers different configurations available for this workflow.
-4. [Support Matrix](../docs/rag/support_matrix.md): This covers GPU, CPU, Memory and Storage requirements for deploying this workflow.
+For a deeper dive into the components and workflows used by these examples, refer to the [Developer Guide](../docs/README.md).
diff --git a/RetrievalAugmentedGeneration/common/base.py b/RetrievalAugmentedGeneration/common/base.py
new file mode 100644
index 00000000..7b61a51a
--- /dev/null
+++ b/RetrievalAugmentedGeneration/common/base.py
@@ -0,0 +1,33 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Base interface that all RAG examples should implement."""
+
+from abc import ABC, abstractmethod
+from typing import Generator
+
+class BaseExample(ABC):
+
+ @abstractmethod
+ def llm_chain(self, context: str, question: str, num_tokens: int) -> Generator[str, None, None]:
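+        """Stream a response generated by the LLM alone, from the given context and question."""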
+ pass
+
+ @abstractmethod
+ def rag_chain(self, prompt: str, num_tokens: int) -> Generator[str, None, None]:
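+        """Stream a response generated with retrieval over previously ingested documents."""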
+ pass
+
+ @abstractmethod
+ def ingest_docs(self, data_dir: str, filename: str) -> None:
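+        """Ingest a document into the example's vector store."""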
+ pass
\ No newline at end of file
diff --git a/RetrievalAugmentedGeneration/common/configuration.py b/RetrievalAugmentedGeneration/common/configuration.py
index 8aa21d2f..23882963 100644
--- a/RetrievalAugmentedGeneration/common/configuration.py
+++ b/RetrievalAugmentedGeneration/common/configuration.py
@@ -18,24 +18,40 @@
@configclass
-class MilvusConfig(ConfigWizard):
- """Configuration class for the Weaviate connection.
+class VectorStoreConfig(ConfigWizard):
+ """Configuration class for the Vector Store connection.
- :cvar url: URL of Milvus DB
+ :cvar name: Name of vector store
+ :cvar url: URL of Vector Store
"""
+ name: str = configfield(
+ "name",
+ default="milvus", # supports pgvector, milvus
+ help_txt="The name of vector store",
+ )
url: str = configfield(
"url",
- default="http://localhost:19530",
- help_txt="The host of the machine running Milvus DB",
+ default="http://milvus:19530", # for pgvector `pgvector:5432`
+ help_txt="The host of the machine running Vector Store DB",
+ )
+ nlist: int = configfield(
+ "nlist",
+ default=64, # IVF Flat milvus
+ help_txt="Number of cluster units",
+ )
+ nprobe: int = configfield(
+ "nprobe",
+ default=16, # IVF Flat milvus
+ help_txt="Number of units to query",
)
@configclass
class LLMConfig(ConfigWizard):
- """Configuration class for the Triton connection.
+ """Configuration class for the llm connection.
- :cvar server_url: The location of the Triton server hosting the llm model.
+ :cvar server_url: The location of the llm server hosting the model.
:cvar model_name: The name of the hosted model.
"""
@@ -60,7 +76,7 @@ class LLMConfig(ConfigWizard):
class TextSplitterConfig(ConfigWizard):
"""Configuration class for the Text Splitter.
- :cvar chunk_size: Chunk size for text splitter.
+ :cvar chunk_size: Chunk size for text splitter. Tokens per chunk in token-based splitters.
:cvar chunk_overlap: Text overlap in text splitter.
"""
@@ -138,10 +154,10 @@ class PromptsConfig(ConfigWizard):
class AppConfig(ConfigWizard):
"""Configuration class for the application.
- :cvar milvus: The configuration of the Milvus vector db connection.
- :type milvus: MilvusConfig
- :cvar triton: The configuration of the backend Triton server.
- :type triton: TritonConfig
+ :cvar vector_store: The configuration of the vector db connection.
+ :type vector_store: VectorStoreConfig
+ :cvar llm: The configuration of the backend llm server.
+ :type llm: LLMConfig
:cvar text_splitter: The configuration for text splitter
:type text_splitter: TextSplitterConfig
:cvar embeddings: The configuration for huggingface embeddings
@@ -150,11 +166,11 @@ class AppConfig(ConfigWizard):
:type prompts: PromptsConfig
"""
- milvus: MilvusConfig = configfield(
- "milvus",
+ vector_store: VectorStoreConfig = configfield(
+ "vector_store",
env=False,
- help_txt="The configuration of the Milvus connection.",
- default=MilvusConfig(),
+ help_txt="The configuration of the vector db connection.",
+ default=VectorStoreConfig(),
)
llm: LLMConfig = configfield(
"llm",
diff --git a/RetrievalAugmentedGeneration/common/server.py b/RetrievalAugmentedGeneration/common/server.py
index c207312f..01f7021b 100644
--- a/RetrievalAugmentedGeneration/common/server.py
+++ b/RetrievalAugmentedGeneration/common/server.py
@@ -20,25 +20,22 @@
import logging
from pathlib import Path
from typing import Any, Dict, List
+import importlib.util
+from inspect import getmembers, isclass
-from fastapi import FastAPI, File, UploadFile
+from fastapi import FastAPI, File, UploadFile, Request
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel, Field
from pymilvus.exceptions import MilvusException, MilvusUnavailableException
-
-from RetrievalAugmentedGeneration.common import utils
-from RetrievalAugmentedGeneration.examples.developer_rag import chains
+from RetrievalAugmentedGeneration.common import utils, tracing
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# create the FastAPI server
app = FastAPI()
-# prestage the embedding model
-_ = utils.get_embedding_model()
-# set the global service context for Llama Index
-utils.set_service_context()
+EXAMPLE_DIR = "RetrievalAugmentedGeneration/example"
class Prompt(BaseModel):
"""Definition of the Prompt API data type."""
@@ -56,14 +53,47 @@ class DocumentSearch(BaseModel):
num_docs: int = Field(description="The maximum number of documents to return in the response.", default=4)
+@app.on_event("startup")
+def import_example() -> None:
+ """
+ Import the example class from the specified example file.
+ The example directory is expected to have a python file where the example class is defined.
+ """
+
+ for root, dirs, files in os.walk(EXAMPLE_DIR):
+ for file in files:
+ if not file.endswith(".py"):
+ continue
+
+ # Import the specified file dynamically
+ spec = importlib.util.spec_from_file_location(name="example", location=os.path.join(root, file))
+ module = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(module)
+
+ # Scan each class in the file to find one with the 3 implemented methods: ingest_docs, rag_chain and llm_chain
+ for name, _ in getmembers(module, isclass):
+ try:
+ cls = getattr(module, name)
+ if set(["ingest_docs", "llm_chain", "rag_chain"]).issubset(set(dir(cls))):
+ if name == "BaseExample":
+ continue
+ example = cls()
+ app.example = cls
+ return
+ except:
+ raise ValueError(f"Class {name} is not implemented and could not be instantiated.")
+
+ raise NotImplementedError(f"Could not find a valid example class in {EXAMPLE_DIR}")
+
+
@app.post("/uploadDocument")
-async def upload_document(file: UploadFile = File(...)) -> JSONResponse:
+@tracing.instrumentation_wrapper
+async def upload_document(request: Request, file: UploadFile = File(...)) -> JSONResponse:
"""Upload a document to the vector store."""
if not file.filename:
return JSONResponse(content={"message": "No files provided"}, status_code=200)
try:
-
upload_folder = "uploaded_files"
upload_file = os.path.basename(file.filename)
if not upload_file:
@@ -75,7 +105,7 @@ async def upload_document(file: UploadFile = File(...)) -> JSONResponse:
with open(file_path, "wb") as f:
shutil.copyfileobj(file.file, f)
- chains.ingest_docs(file_path, upload_file)
+ app.example().ingest_docs(file_path, upload_file)
return JSONResponse(
content={"message": "File uploaded successfully"}, status_code=200
@@ -84,21 +114,23 @@ async def upload_document(file: UploadFile = File(...)) -> JSONResponse:
except Exception as e:
logger.error("Error from /uploadDocument endpoint. Ingestion of file: " + file.filename + " failed with error: " + str(e))
return JSONResponse(
- content={"message": f"Ingestion of file: " + file.filename + " failed with error: " + str(e)}, status_code=500
+ content={"message": str(e)}, status_code=500
)
@app.post("/generate")
-async def generate_answer(prompt: Prompt) -> StreamingResponse:
+@tracing.instrumentation_wrapper
+async def generate_answer(request: Request, prompt: Prompt) -> StreamingResponse:
"""Generate and stream the response to the provided prompt."""
try:
+ example = app.example()
if prompt.use_knowledge_base:
logger.info("Knowledge base is enabled. Using rag chain for response generation.")
- generator = chains.rag_chain(prompt.question, prompt.num_tokens)
+ generator = example.rag_chain(prompt.question, prompt.num_tokens)
return StreamingResponse(generator, media_type="text/event-stream")
- generator = chains.llm_chain(prompt.context, prompt.question, prompt.num_tokens)
+ generator = example.llm_chain(prompt.context, prompt.question, prompt.num_tokens)
return StreamingResponse(generator, media_type="text/event-stream")
except (MilvusException, MilvusUnavailableException) as e:
@@ -111,20 +143,16 @@ async def generate_answer(prompt: Prompt) -> StreamingResponse:
@app.post("/documentSearch")
-def document_search(data: DocumentSearch) -> List[Dict[str, Any]]:
+@tracing.instrumentation_wrapper
+async def document_search(request: Request, data: DocumentSearch) -> List[Dict[str, Any]]:
"""Search for the most relevant documents for the given search parameters."""
try:
- retriever = utils.get_doc_retriever(num_nodes=data.num_docs)
- nodes = retriever.retrieve(data.content)
- output = []
- for node in nodes:
- file_name = nodes[0].metadata["filename"]
- decoded_filename = base64.b64decode(file_name.encode("utf-8")).decode("utf-8")
- entry = {"score": node.score, "source": decoded_filename, "content": node.text}
- output.append(entry)
-
- return output
+ example = app.example()
+ if hasattr(example, "document_search") and callable(example.document_search):
+ return example.document_search(data.content, data.num_docs)
+
+ raise NotImplementedError("Example class has not implemented the document_search method.")
except Exception as e:
logger.error(f"Error from /documentSearch endpoint. Error details: {e}")
diff --git a/RetrievalAugmentedGeneration/common/tracing.py b/RetrievalAugmentedGeneration/common/tracing.py
new file mode 100644
index 00000000..b2b5cb6e
--- /dev/null
+++ b/RetrievalAugmentedGeneration/common/tracing.py
@@ -0,0 +1,69 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Module for configuring objects used to create OpenTelemetry traces."""
+
+import os
+from opentelemetry import trace, context
+from opentelemetry.sdk.resources import SERVICE_NAME, Resource
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
+from opentelemetry.propagate import set_global_textmap, get_global_textmap
+from opentelemetry.propagators.composite import CompositePropagator
+from tools.observability.llamaindex import opentelemetry_callback
+import llama_index
+from llama_index.callbacks.base import CallbackManager
+from functools import wraps
+
+# Configure tracer used by the Chain Server to create spans
+resource = Resource.create({SERVICE_NAME: "chain-server"})
+provider = TracerProvider(resource=resource)
+if os.environ.get("ENABLE_TRACING") == "true":
+ processor = SimpleSpanProcessor(OTLPSpanExporter())
+ provider.add_span_processor(processor)
+trace.set_tracer_provider(provider)
+tracer = trace.get_tracer("chain-server")
+
+# Configure Propagator used for processing trace context received by the Chain Server
+if os.environ.get("ENABLE_TRACING") == "true":
+ propagator = TraceContextTextMapPropagator()
+ # Llamaindex global handler set to pass callbacks into the OpenTelemetry handler
+ llama_index.global_handler = opentelemetry_callback.OpenTelemetryCallbackHandler(tracer)
+else:
+ propagator = CompositePropagator([]) # No-op propagator
+set_global_textmap(propagator)
+
+# Wrapper Function to perform instrumentation
+def instrumentation_wrapper(func):
+ @wraps(func)
+ async def wrapper(*args, **kwargs):
+ request = kwargs.get("request")
+ prompt = kwargs.get("prompt")
+ ctx = get_global_textmap().extract(request.headers)
+ if ctx is not None:
+ context.attach(ctx)
+        if prompt is not None and prompt.use_knowledge_base is False:
+ # Hack to get the LLM event for no knowledge base queries to show up.
+ # A trace is not generated by Llamaindex for these calls so we need to generate it instead.
+ callback_manager = CallbackManager([])
+ with callback_manager.as_trace("query"):
+ result = func(*args, **kwargs)
+ else:
+ result = func(*args, **kwargs)
+ return await result
+
+ return wrapper
diff --git a/RetrievalAugmentedGeneration/common/utils.py b/RetrievalAugmentedGeneration/common/utils.py
index 50853f0c..99fda3f9 100644
--- a/RetrievalAugmentedGeneration/common/utils.py
+++ b/RetrievalAugmentedGeneration/common/utils.py
@@ -20,29 +20,55 @@
from functools import lru_cache
from typing import TYPE_CHECKING, List, Optional
-import torch
-from llama_index.postprocessor.types import BaseNodePostprocessor
-from llama_index.schema import MetadataMode
-from llama_index.utils import globals_helper
-from llama_index.vector_stores import MilvusVectorStore
-from llama_index import VectorStoreIndex, ServiceContext, set_global_service_context
-from llama_index.llms import LangChainLLM
-from llama_index.embeddings import LangchainEmbedding
-from langchain.text_splitter import SentenceTransformersTokenTextSplitter
-from langchain.embeddings import HuggingFaceEmbeddings
+logger = logging.getLogger(__name__)
+
+try:
+ import torch
+except Exception as e:
+ logger.error(f"torch import failed with error: {e}")
+
+try:
+ import psycopg2
+except Exception as e:
+ logger.error(f"psycogp2 import failed with error: {e}")
+
+try:
+ from sqlalchemy import make_url
+except Exception as e:
+ logger.error(f"SQLalchemy import failed with error: {e}")
+
+try:
+ from llama_index.postprocessor.types import BaseNodePostprocessor
+ from llama_index.schema import MetadataMode
+ from llama_index.utils import globals_helper, get_tokenizer
+ from llama_index.vector_stores import MilvusVectorStore, PGVectorStore
+ from llama_index import VectorStoreIndex, ServiceContext, set_global_service_context
+ from llama_index.llms import LangChainLLM
+ from llama_index.embeddings import LangchainEmbedding
+ if TYPE_CHECKING:
+ from llama_index.indices.base_retriever import BaseRetriever
+ from llama_index.indices.query.schema import QueryBundle
+ from llama_index.schema import NodeWithScore
+except Exception as e:
+ logger.error(f"Llamaindex import failed with error: {e}")
+
+try:
+ from langchain.text_splitter import SentenceTransformersTokenTextSplitter
+ from langchain.embeddings import HuggingFaceEmbeddings
+except Exception as e:
+ logger.error(f"Langchain import failed with error: {e}")
+
+try:
+ from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
+except Exception as e:
+ logger.error(f"NVIDIA AI connector import failed with error: {e}")
+
from integrations.langchain.llms.triton_trt_llm import TensorRTLLM
-from integrations.langchain.llms.nv_aiplay import GeneralLLM
-from integrations.langchain.embeddings.nv_aiplay import NVAIPlayEmbeddings
from RetrievalAugmentedGeneration.common import configuration
if TYPE_CHECKING:
- from llama_index.indices.base_retriever import BaseRetriever
- from llama_index.indices.query.schema import QueryBundle
- from llama_index.schema import NodeWithScore
from RetrievalAugmentedGeneration.common.configuration_wizard import ConfigWizard
-logger = logging.getLogger(__name__)
-
DEFAULT_MAX_CONTEXT = 1500
DEFAULT_NUM_TOKENS = 150
TEXT_SPLITTER_EMBEDDING_MODEL = "intfloat/e5-large-v2"
@@ -58,11 +84,12 @@ def _postprocess_nodes(
included_nodes = []
current_length = 0
limit = DEFAULT_MAX_CONTEXT
+ tokenizer = get_tokenizer()
for node in nodes:
current_length += len(
- globals_helper.tokenizer(
- node.node.get_content(metadata_mode=MetadataMode.LLM)
+ tokenizer(
+ node.get_content(metadata_mode=MetadataMode.LLM)
)
)
if current_length > limit:
@@ -95,7 +122,42 @@ def get_config() -> "ConfigWizard":
def get_vector_index() -> VectorStoreIndex:
"""Create the vector db index."""
config = get_config()
- vector_store = MilvusVectorStore(uri=config.milvus.url, dim=config.embeddings.dimensions, overwrite=False)
+ vector_store = None
+
+ logger.info(f"Using {config.vector_store.name} as vector store")
+ if config.vector_store.name == "pgvector":
+ connection_string = f"postgresql://{os.getenv('POSTGRES_USER', '')}:{os.getenv('POSTGRES_PASSWORD', '')}@{config.vector_store.url}"
+ db_name = "vector_db"
+
+ conn = psycopg2.connect(connection_string)
+ conn.autocommit = True
+
+ with conn.cursor() as c:
+ # Check for database existence first
+ c.execute(f"SELECT 1 FROM pg_database WHERE datname = '{db_name}'")
+ if not c.fetchone(): # Database doesn't exist
+ c.execute(f"CREATE DATABASE {db_name}")
+
+ url = make_url(connection_string)
+
+ vector_store = PGVectorStore.from_params(
+ database=db_name,
+ host=url.host,
+ password=url.password,
+ port=url.port,
+ user=url.username,
+ table_name="document_store",
+ embed_dim=config.embeddings.dimensions,
+ )
+ elif config.vector_store.name == "milvus":
+ vector_store = MilvusVectorStore(uri=config.vector_store.url,
+ dim=config.embeddings.dimensions,
+ collection_name="document_store_ivfflat",
+ index_config={"index_type": "IVF_FLAT", "nlist": config.vector_store.nlist},
+ search_config={"nprobe": config.vector_store.nprobe},
+ overwrite=False)
+ else:
+ raise RuntimeError("Unable to find any supported Vector Store DB. Supported engines are milvus and pgvector.")
return VectorStoreIndex.from_vector_store(vector_store)
@@ -111,7 +173,7 @@ def get_llm() -> LangChainLLM:
"""Create the LLM connection."""
settings = get_config()
- logger.info(f"Using {settings.llm.model_engine} as model engine for llm")
+ logger.info(f"Using {settings.llm.model_engine} as model engine for llm. Model name: {settings.llm.model_name}")
if settings.llm.model_engine == "triton-trt-llm":
trtllm = TensorRTLLM( # type: ignore
server_url=settings.llm.server_url,
@@ -119,17 +181,10 @@ def get_llm() -> LangChainLLM:
tokens=DEFAULT_NUM_TOKENS,
)
return LangChainLLM(llm=trtllm)
- elif settings.llm.model_engine == "ai-playground":
- if os.getenv('NVAPI_KEY') is None:
- raise RuntimeError("AI PLayground key is not set")
- aipl_llm = GeneralLLM(
- model=settings.llm.model_name,
- max_tokens=DEFAULT_NUM_TOKENS,
- streaming=True
- )
- return LangChainLLM(llm=aipl_llm)
+ elif settings.llm.model_engine == "nv-ai-foundation":
+ return ChatNVIDIA(model=settings.llm.model_name)
else:
- raise RuntimeError("Unable to find any supported Large Language Model server. Supported engines are triton-trt-llm and ai-playground.")
+ raise RuntimeError("Unable to find any supported Large Language Model server. Supported engines are triton-trt-llm and nv-ai-foundation.")
@lru_cache
@@ -151,11 +206,8 @@ def get_embedding_model() -> LangchainEmbedding:
)
# Load in a specific embedding model
return LangchainEmbedding(hf_embeddings)
- elif settings.embeddings.model_engine == "ai-playground":
- if os.getenv('NVAPI_KEY') is None:
- raise RuntimeError("AI PLayground key is not set")
- embedding = NVAIPlayEmbeddings(model=settings.embeddings.model_name)
- return LangchainEmbedding(embedding)
+ elif settings.embeddings.model_engine == "nv-ai-foundation":
+ return NVIDIAEmbeddings(model=settings.embeddings.model_name, model_type="passage")
else:
raise RuntimeError("Unable to find any supported embedding model. Supported engine is huggingface.")
@@ -179,6 +231,6 @@ def get_text_splitter() -> SentenceTransformersTokenTextSplitter:
"""Return the token text splitter instance from langchain."""
return SentenceTransformersTokenTextSplitter(
model_name=TEXT_SPLITTER_EMBEDDING_MODEL,
- chunk_size=get_config().text_splitter.chunk_size,
+ tokens_per_chunk=get_config().text_splitter.chunk_size,
chunk_overlap=get_config().text_splitter.chunk_overlap,
)
diff --git a/RetrievalAugmentedGeneration/examples/developer_rag/chains.py b/RetrievalAugmentedGeneration/examples/developer_rag/chains.py
index b408cb69..4c9cb8cc 100644
--- a/RetrievalAugmentedGeneration/examples/developer_rag/chains.py
+++ b/RetrievalAugmentedGeneration/examples/developer_rag/chains.py
@@ -18,7 +18,7 @@
import os
import logging
from pathlib import Path
-from typing import Generator
+from typing import Generator, List, Dict, Any
from llama_index import Prompt, download_loader
from llama_index.query_engine import RetrieverQueryEngine
@@ -34,84 +34,111 @@
get_vector_index,
is_base64_encoded,
set_service_context,
+ get_embedding_model,
)
+from RetrievalAugmentedGeneration.common.base import BaseExample
+
+# prestage the embedding model
+_ = get_embedding_model()
+set_service_context()
+
logger = logging.getLogger(__name__)
-def llm_chain(
- context: str, question: str, num_tokens: int
-) -> Generator[str, None, None]:
- """Execute a simple LLM chain using the components defined above."""
-
- logger.info("Using llm to generate response directly without knowledge base.")
- set_service_context()
- prompt = get_config().prompts.chat_template.format(
- context_str=context, query_str=question
- )
-
- logger.info(f"Prompt used for response generation: {prompt}")
- response = get_llm().stream_complete(prompt, tokens=num_tokens)
- gen_response = (resp.delta for resp in response)
- return gen_response
-
-
-def rag_chain(prompt: str, num_tokens: int) -> Generator[str, None, None]:
- """Execute a Retrieval Augmented Generation chain using the components defined above."""
-
- logger.info("Using rag to generate response from document")
-
- set_service_context()
- if get_config().llm.model_engine == "triton-trt-llm":
- get_llm().llm.tokens = num_tokens # type: ignore
- else:
- get_llm().llm.max_tokens = num_tokens
- retriever = get_doc_retriever(num_nodes=4)
- qa_template = Prompt(get_config().prompts.rag_template)
-
- logger.info(f"Prompt used for response generation: {qa_template}")
- query_engine = RetrieverQueryEngine.from_args(
- retriever,
- text_qa_template=qa_template,
- node_postprocessors=[LimitRetrievedNodesLength()],
- streaming=True,
- )
- response = query_engine.query(prompt)
-
- # Properly handle an empty response
- if isinstance(response, StreamingResponse):
- return response.response_gen
-
- logger.warning("No response generated from LLM, make sure you've ingested document.")
- return StreamingResponse(iter(["No response generated from LLM, make sure you have ingested document from the Knowledge Base Tab."])).response_gen # type: ignore
-
-
-def ingest_docs(data_dir: str, filename: str) -> None:
- """Ingest documents to the VectorDB."""
-
- logger.info(f"Ingesting {filename} in vectorDB")
- _, ext = os.path.splitext(filename)
-
- if ext.lower() == ".pdf":
- PDFReader = download_loader("PDFReader")
- loader = PDFReader()
- documents = loader.load_data(file=Path(data_dir))
-
- else:
- unstruct_reader = download_loader("UnstructuredReader")
- loader = unstruct_reader()
- documents = loader.load_data(file=Path(data_dir), split_documents=False)
-
- encoded_filename = filename[:-4]
- if not is_base64_encoded(encoded_filename):
- encoded_filename = base64.b64encode(encoded_filename.encode("utf-8")).decode(
- "utf-8"
+class QAChatbot(BaseExample):
+ def ingest_docs(self, data_dir: str, filename: str):
+ """Ingest documents to the VectorDB."""
+
+ try:
+ logger.info(f"Ingesting {filename} in vectorDB")
+ _, ext = os.path.splitext(filename)
+
+ if ext.lower() == ".pdf":
+ PDFReader = download_loader("PDFReader")
+ loader = PDFReader()
+ documents = loader.load_data(file=Path(data_dir))
+
+ else:
+ unstruct_reader = download_loader("UnstructuredReader")
+ loader = unstruct_reader()
+ documents = loader.load_data(file=Path(data_dir), split_documents=False)
+
+ encoded_filename = filename[:-4]
+ if not is_base64_encoded(encoded_filename):
+ encoded_filename = base64.b64encode(encoded_filename.encode("utf-8")).decode(
+ "utf-8"
+ )
+
+ for document in documents:
+ document.metadata = {"filename": encoded_filename}
+
+ index = get_vector_index()
+ node_parser = LangchainNodeParser(get_text_splitter())
+ nodes = node_parser.get_nodes_from_documents(documents)
+ index.insert_nodes(nodes)
+ logger.info(f"Document {filename} ingested successfully")
+ except Exception as e:
+ logger.error(f"Failed to ingest document due to exception {e}")
+ raise ValueError("Failed to upload document. Please upload an unstructured text document.")
+
+ def llm_chain(self, context: str, question: str, num_tokens: int) -> Generator[str, None, None]:
+ """Execute a simple LLM chain using the components defined above."""
+
+ logger.info("Using llm to generate response directly without knowledge base.")
+ set_service_context()
+ prompt = get_config().prompts.chat_template.format(
+ context_str=context, query_str=question
+ )
+
+ logger.info(f"Prompt used for response generation: {prompt}")
+ response = get_llm().stream_complete(prompt, tokens=num_tokens)
+ gen_response = (resp.delta for resp in response)
+ return gen_response
+
+ def rag_chain(self, prompt: str, num_tokens: int) -> Generator[str, None, None]:
+ """Execute a Retrieval Augmented Generation chain using the components defined above."""
+
+ logger.info("Using rag to generate response from document")
+
+ set_service_context()
+ if get_config().llm.model_engine == "triton-trt-llm":
+ get_llm().llm.tokens = num_tokens # type: ignore
+ else:
+ get_llm().llm.max_tokens = num_tokens
+ retriever = get_doc_retriever(num_nodes=4)
+ qa_template = Prompt(get_config().prompts.rag_template)
+
+ logger.info(f"Prompt used for response generation: {qa_template}")
+ query_engine = RetrieverQueryEngine.from_args(
+ retriever,
+ text_qa_template=qa_template,
+ node_postprocessors=[LimitRetrievedNodesLength()],
+ streaming=True,
)
+ response = query_engine.query(prompt)
+
+ # Properly handle an empty response
+ if isinstance(response, StreamingResponse):
+ return response.response_gen
+
+ logger.warning("No response generated from LLM, make sure you've ingested document.")
+ return StreamingResponse(iter(["No response generated from LLM, make sure you have ingested document from the Knowledge Base Tab."])).response_gen # type: ignore
+
+ def document_search(self, content: str, num_docs: int) -> List[Dict[str, Any]]:
+ """Search for the most relevant documents for the given search parameters."""
+
+ try:
+ retriever = get_doc_retriever(num_nodes=num_docs)
+ nodes = retriever.retrieve(content)
+ output = []
+ for node in nodes:
+                file_name = node.metadata["filename"]
+ decoded_filename = base64.b64decode(file_name.encode("utf-8")).decode("utf-8")
+ entry = {"score": node.score, "source": decoded_filename, "content": node.text}
+ output.append(entry)
- for document in documents:
- document.metadata = {"filename": encoded_filename}
+ return output
- index = get_vector_index()
- node_parser = LangchainNodeParser(get_text_splitter())
- nodes = node_parser.get_nodes_from_documents(documents)
- index.insert_nodes(nodes)
- logger.info(f"Document {filename} ingested successfully")
+ except Exception as e:
+ logger.error(f"Error from /documentSearch endpoint. Error details: {e}")
+ return []
diff --git a/RetrievalAugmentedGeneration/examples/nvidia_ai_foundation/chains.py b/RetrievalAugmentedGeneration/examples/nvidia_ai_foundation/chains.py
new file mode 100644
index 00000000..82886fbb
--- /dev/null
+++ b/RetrievalAugmentedGeneration/examples/nvidia_ai_foundation/chains.py
@@ -0,0 +1,151 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+from functools import lru_cache
+from typing import Generator, List, Dict, Any
+
+from langchain.document_loaders import UnstructuredFileLoader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.vectorstores import FAISS
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
+from RetrievalAugmentedGeneration.common.base import BaseExample
+from RetrievalAugmentedGeneration.common.utils import get_config, get_llm, get_embedding_model
+
+logger = logging.getLogger(__name__)
+DOCS_DIR = os.path.abspath("./uploaded_files")
+vector_store_path = "vectorstore.pkl"
+document_embedder = get_embedding_model()
+vectorstore = None
+settings = get_config()
+
+
+class NvidiaAIFoundation(BaseExample):
+ def ingest_docs(self, file_name: str, filename: str):
+ """Ingest documents to the VectorDB."""
+
+ try:
+            # TODO: Load embeddings created in an older conversation (memory persistence)
+            # The class is initialized on every call, so the vector store is kept global
+ global vectorstore
+ # Load raw documents from the directory
+ # Data is copied to `DOCS_DIR` in common.server:upload_document
+ _path = os.path.join(DOCS_DIR, filename)
+ raw_documents = UnstructuredFileLoader(_path).load()
+
+ if raw_documents:
+ text_splitter = CharacterTextSplitter(chunk_size=settings.text_splitter.chunk_size, chunk_overlap=settings.text_splitter.chunk_overlap)
+ documents = text_splitter.split_documents(raw_documents)
+ if vectorstore:
+ vectorstore.add_documents(documents)
+ else:
+ vectorstore = FAISS.from_documents(documents, document_embedder)
+ logger.info("Vector store created and saved.")
+ else:
+ logger.warning("No documents available to process!")
+ except Exception as e:
+ logger.error(f"Failed to ingest document due to exception {e}")
+ raise ValueError("Failed to upload document. Please upload an unstructured text document.")
+
+ def llm_chain(
+ self, context: str, question: str, num_tokens: str
+ ) -> Generator[str, None, None]:
+ """Execute a simple LLM chain using the components defined above."""
+
+ logger.info("Using llm to generate response directly without knowledge base.")
+ prompt_template = ChatPromptTemplate.from_messages(
+ [
+ (
+ "system",
+ settings.prompts.chat_template,
+ ),
+ ("user", "{input}"),
+ ]
+ )
+
+ llm = get_llm()
+
+ chain = prompt_template | llm | StrOutputParser()
+ augmented_user_input = (
+ "Context: " + context + "\n\nQuestion: " + question + "\n"
+ )
+ return chain.stream({"input": augmented_user_input})
+
+ def rag_chain(self, prompt: str, num_tokens: int) -> Generator[str, None, None]:
+ """Execute a Retrieval Augmented Generation chain using the components defined above."""
+
+ logger.info("Using rag to generate response from document")
+
+ prompt_template = ChatPromptTemplate.from_messages(
+ [
+ (
+ "system",
+ settings.prompts.rag_template,
+ ),
+ ("user", "{input}"),
+ ]
+ )
+ llm = get_llm()
+
+ chain = prompt_template | llm | StrOutputParser()
+
+ try:
+            if vectorstore is not None:
+ retriever = vectorstore.as_retriever()
+ docs = retriever.get_relevant_documents(prompt)
+
+ context = ""
+ for doc in docs:
+ context += doc.page_content + "\n\n"
+
+ augmented_user_input = (
+ "Context: " + context + "\n\nQuestion: " + prompt + "\n"
+ )
+
+ return chain.stream({"input": augmented_user_input})
+ except Exception as e:
+ logger.warning(f"Failed to generate response due to exception {e}")
+ logger.warning(
+ "No response generated from LLM, make sure you've ingested document."
+ )
+ return iter(
+ [
+ "No response generated from LLM, make sure you have ingested document from the Knowledge Base Tab."
+ ]
+ )
+
+ def document_search(self, content: str, num_docs: int) -> List[Dict[str, Any]]:
+ """Search for the most relevant documents for the given search parameters."""
+
+ try:
+            if vectorstore is not None:
+ retriever = vectorstore.as_retriever()
+ docs = retriever.get_relevant_documents(content)
+ result = []
+ for doc in docs:
+ result.append(
+ {
+ "source": os.path.basename(doc.metadata.get('source', '')),
+ "content": doc.page_content
+ }
+ )
+ return result
+ return []
+ except Exception as e:
+ logger.error(f"Error from /documentSearch endpoint. Error details: {e}")
+ return []
diff --git a/RetrievalAugmentedGeneration/examples/nvidia_ai_foundation/requirements.txt b/RetrievalAugmentedGeneration/examples/nvidia_ai_foundation/requirements.txt
new file mode 100644
index 00000000..39556ee6
--- /dev/null
+++ b/RetrievalAugmentedGeneration/examples/nvidia_ai_foundation/requirements.txt
@@ -0,0 +1 @@
+faiss-cpu==1.7.4
\ No newline at end of file
diff --git a/RetrievalAugmentedGeneration/examples/query_decomposition_rag/__init__.py b/RetrievalAugmentedGeneration/examples/query_decomposition_rag/__init__.py
new file mode 100644
index 00000000..a08b2c20
--- /dev/null
+++ b/RetrievalAugmentedGeneration/examples/query_decomposition_rag/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/RetrievalAugmentedGeneration/examples/query_decomposition_rag/chains.py b/RetrievalAugmentedGeneration/examples/query_decomposition_rag/chains.py
new file mode 100644
index 00000000..5d1a63fc
--- /dev/null
+++ b/RetrievalAugmentedGeneration/examples/query_decomposition_rag/chains.py
@@ -0,0 +1,341 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This example showcases recursive task decomposition to perform RAG which requires multiple steps.
+The agent is a langchain custom LLM agent, which uses 2 tools - search and math.
+It uses the deployed LLM for sub-answer formation, tool prediction, math operations, and final answer formation.
+Search tool is a RAG pipeline, whereas the math tool uses an LLM call to perform mathematical calculations.
+"""
+
+from langchain.vectorstores import FAISS
+from langchain.document_loaders import UnstructuredFileLoader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain.chains import LLMChain
+from langchain.prompts import BaseChatPromptTemplate
+from langchain.schema import HumanMessage
+from langchain.agents import LLMSingleActionAgent, AgentOutputParser, AgentExecutor, Tool
+from langchain.schema.agent import AgentFinish, AgentAction
+from typing import Any, Dict, Generator, List, Union
+import json
+import jinja2
+import logging
+import os
+
+from RetrievalAugmentedGeneration.common.utils import (
+ get_config,
+ get_llm,
+ set_service_context,
+ get_embedding_model,
+)
+from RetrievalAugmentedGeneration.common.base import BaseExample
+
+logger = logging.getLogger(__name__)
+
+llm = get_llm()
+DOCS_DIR = os.path.abspath("./uploaded_files")
+vector_store_path = "vectorstore.pkl"
+document_embedder = get_embedding_model()
+vectorstore = None
+settings = get_config()
+
+##### Helper methods and tools #####
+
+class Ledger: # Stores the state of the recursive decomposition
+ def __init__(self):
+ self.question_trace = []
+ self.answer_trace = []
+ self.trace = 0
+ self.done = False
+
+
+##### LLM and Prompt definitions #####
+def fetch_context(ledger: Ledger) -> str:
+ """
+ Create the context for the prompt from the subquestions and answers
+ """
+ context = ""
+ for i in range(len(ledger.question_trace)):
+ context += "Sub-Question: " + ledger.question_trace[i]
+ context += "\nSub-Answer: " + ledger.answer_trace[i] + "\n"
+
+ return context
+
+template = """Your task is to answer questions. If you cannot answer the question, you can request use for a tool and break the question into specific sub questions. Fill with Nil where no action is required. You should only return a JSON containing the tool and the generated sub questions. Consider the contextual information and only ask for information that you do not already have. Do not return any other explanations or text. The output should be a simple JSON structure! You are given two tools:
+- Search tool
+- Math tool
+
+Do not pass sub questions to any tool if they already have an answer in the Contextual Information.
+If you have all the information needed to answer the question, mark the Tool_Request as Nil.
+
+Contextual Information:
+{{ context }}
+
+Question:
+{{ question }}
+
+{"Tool_Request": "", "Generated Sub Questions": []}
+"""
+
+class CustomPromptTemplate(BaseChatPromptTemplate):
+ template: str
+ tools: List[Tool]
+ ledger: Ledger
+
+ def format_messages(self, **kwargs) -> str:
+ kwargs["context"] = fetch_context(self.ledger).strip("\n")
+ env = jinja2.Environment()
+ prompt_template = env.from_string(template)
+ prompt = prompt_template.render(**kwargs)
+ logger.info(prompt)
+ return [HumanMessage(content=prompt)]
+
+
+##### LLM output parser #####
+
+
+class CustomOutputParser(AgentOutputParser):
+ class Config:
+ arbitrary_types_allowed = True
+
+ ledger: Ledger
+
+ def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
+ """
+ Make a decision about the tool to be called based on LLM output.
+ """
+
+ logger.info(f"LLM Response: {llm_output}")
+ local_state = json.loads(llm_output)
+ if (
+ local_state["Generated Sub Questions"][0] == "Nil"
+ or local_state["Tool_Request"] == "Nil"
+ or self.ledger.trace > 3
+ or local_state["Generated Sub Questions"][0] in self.ledger.question_trace
+ ):
+ return AgentFinish(
+ return_values={"output": "success"},
+ log=llm_output,
+ )
+
+ if local_state["Tool_Request"] == "Search tool":
+ self.ledger.trace += 1
+
+ if local_state["Tool_Request"] in ["Search tool", "Math tool"]:
+ return AgentAction(
+ tool=local_state["Tool_Request"],
+ tool_input={"sub_questions": local_state["Generated Sub Questions"]},
+ log=llm_output,
+ )
+ raise ValueError(f"Invalid Tool name: {local_state['Tool_Request']}")
+
+
+class QueryDecompositionChatbot(BaseExample):
+ def ingest_docs(self, file_name: str, filename: str):
+ """Ingest documents to the VectorDB."""
+
+ try:
+            # TODO: Load embeddings created in an older conversation (memory persistence)
+            # The class is initialized on every call, so the vector store is kept global
+ global vectorstore
+ # Load raw documents from the directory
+ # Data is copied to `DOCS_DIR` in common.server:upload_document
+ _path = os.path.join(DOCS_DIR, filename)
+ raw_documents = UnstructuredFileLoader(_path).load()
+
+ if raw_documents:
+ text_splitter = CharacterTextSplitter(chunk_size=settings.text_splitter.chunk_size, chunk_overlap=settings.text_splitter.chunk_overlap)
+ documents = text_splitter.split_documents(raw_documents)
+ if vectorstore:
+ vectorstore.add_documents(documents)
+ else:
+ vectorstore = FAISS.from_documents(documents, document_embedder)
+ logger.info("Vector store created and saved.")
+ else:
+ logger.warning("No documents available to process!")
+ except Exception as e:
+ logger.error(f"Failed to ingest document due to exception {e}")
+ raise ValueError("Failed to upload document. Please upload an unstructured text document.")
+
+
+ def llm_chain(
+ self, context: str, question: str, num_tokens: str
+ ) -> Generator[str, None, None]:
+ """Execute a simple LLM chain using the components defined above."""
+
+ logger.info("Using llm to generate response directly without knowledge base.")
+ prompt_template = ChatPromptTemplate.from_messages(
+ [
+ (
+ "system",
+ settings.prompts.chat_template,
+ ),
+ ("user", "{input}"),
+ ]
+ )
+
+ llm = get_llm()
+
+ chain = prompt_template | llm | StrOutputParser()
+ augmented_user_input = (
+ "Context: " + context + "\n\nQuestion: " + question + "\n"
+ )
+ return chain.stream({"input": augmented_user_input})
+
+ def rag_chain(self, question: str, num_tokens: int) -> Generator[str, None, None]:
+ """Execute a Retrieval Augmented Generation chain using the components defined above."""
+
+ logger.info("Using rag to generate response from document")
+
+ set_service_context()
+ final_context = self.run_agent(question)
+ logger.info(f"Final Answer from agent: {final_context}")
+
+ final_prompt_template = ChatPromptTemplate.from_messages(
+ [
+ ("human", final_context)
+ ]
+ )
+ chain = final_prompt_template | llm | StrOutputParser()
+
+ return chain.stream({})
+
+
+ def create_agent(self) -> AgentExecutor:
+ """
+ Creates the tools, chain, output parser and agent used to fetch the full context.
+ """
+
+ self.ledger = Ledger()
+
+ tools = [
+ Tool(name="Search tool", func=self.search, description="Searches for the answer from a given context."),
+ Tool(name="Math tool", func=self.math, description="Performs mathematical calculations."),
+ ]
+ tool_names = [tool.name for tool in tools]
+
+ prompt = CustomPromptTemplate(template=template, tools=tools, input_variables=["question"], ledger=self.ledger)
+ output_parser = CustomOutputParser(ledger=self.ledger)
+ llm_chain = LLMChain(llm=llm, prompt=prompt)
+
+ recursive_decomposition_agent = LLMSingleActionAgent(
+ llm_chain=llm_chain, output_parser=output_parser, stop=["\n\n"], allowed_tools=tool_names
+ )
+
+ agent_executor = AgentExecutor.from_agent_and_tools(agent=recursive_decomposition_agent, tools=tools, verbose=True)
+ return agent_executor
+
+
+ def run_agent(self, question: str):
+ """
+ Run question on the agent
+ """
+
+ agent_executor = self.create_agent()
+ agent_executor.invoke({"question": question})
+
+ ##### LLM call to get final answer ######
+
+ prompt = "Question: " + question + "\n\n"
+ prompt += "Sub Questions and Answers\n"
+ for i in range(len(self.ledger.question_trace)):
+ prompt += "Sub Question: " + str(self.ledger.question_trace[i]) + "\n"
+ prompt += "Sub Answer: " + str(self.ledger.answer_trace[i]) + "\n"
+ prompt += "\nFinal Answer: "
+
+ return prompt
+
+ def retriever(self, query: str) -> List[str]:
+ """
+ Searches for the answer from a given context.
+ """
+
+ if vectorstore is None:
+ return []
+
+ retriever = vectorstore.as_retriever()
+ result = retriever.get_relevant_documents(query)
+ logger.info(result)
+ return [hit.page_content for hit in result]
+
+
+ def extract_answer(self, chunks: List[str], question: str) -> str:
+ """
+ Find the answer to the query from the retrieved chunks
+ """
+
+ prompt = "Below is a Question and set of Passages that may or may not be relevant. Your task is to Extract the answer for question using only the information available in the passages. Be as concise as possible and only include the answer if present. Do not infer or process the passage in any other way\n\n"
+ prompt += "Question: " + question + "\n\n"
+ for idx, chunk in enumerate(chunks):
+ prompt += f"Passage {idx + 1}:\n"
+ prompt += chunk + "\n"
+
+ answer = llm([HumanMessage(content=prompt)])
+ return answer.content
+
+
+ def search(self, sub_questions: List[str]):
+ """
+ Search for the answer for each subquestion and add them to the ledger.
+ """
+
+ logger.info(f"Entering search with subquestions: {sub_questions}")
+ for sub_question in sub_questions:
+ chunk = self.retriever(sub_question)
+ sub_answer = self.extract_answer(chunk, sub_question)
+
+ self.ledger.question_trace.append(sub_question)
+ self.ledger.answer_trace.append(sub_answer)
+
+
+ def math(self, sub_questions: List[str]):
+ """
+ Places an LLM call to answer mathematical subquestions which do not require search
+ """
+
+ prompt = "Solve this mathematical question:\nQuestion: " + sub_questions[0]
+ prompt += f"Context:\n{fetch_context(self.ledger)}\n"
+ prompt += "Be concise and only return the answer."
+
+ logger.info(f"Performing Math LLM call with prompt: {prompt}")
+ sub_answer = llm([HumanMessage(content=prompt)])
+ self.ledger.question_trace.append(sub_questions[0])
+ self.ledger.answer_trace.append(sub_answer.content)
+
+ self.ledger.done = True
+
+    def document_search(self, content: str, num_docs: int) -> List[Dict[str, Any]]:
+        """Search for the most relevant documents for the given search parameters."""
+
+        try:
+            if vectorstore is None:
+                return []
+            retriever = vectorstore.as_retriever()
+            docs = retriever.get_relevant_documents(content)
+            result = []
+            for doc in docs:
+                result.append(
+                    {
+                        "source": os.path.basename(doc.metadata.get("source", "")),
+                        "content": doc.page_content,
+                    }
+                )
+            return result
+
+        except Exception as e:
+            logger.error(f"Error from /documentSearch endpoint. Error details: {e}")
+            return []
diff --git a/RetrievalAugmentedGeneration/examples/query_decomposition_rag/requirements.txt b/RetrievalAugmentedGeneration/examples/query_decomposition_rag/requirements.txt
new file mode 100644
index 00000000..39556ee6
--- /dev/null
+++ b/RetrievalAugmentedGeneration/examples/query_decomposition_rag/requirements.txt
@@ -0,0 +1 @@
+faiss-cpu==1.7.4
\ No newline at end of file
diff --git a/RetrievalAugmentedGeneration/frontend/Dockerfile b/RetrievalAugmentedGeneration/frontend/Dockerfile
index be5975b6..5f8192db 100644
--- a/RetrievalAugmentedGeneration/frontend/Dockerfile
+++ b/RetrievalAugmentedGeneration/frontend/Dockerfile
@@ -1,12 +1,14 @@
FROM docker.io/library/python:3.11-slim
-COPY frontend /app/frontend
+RUN mkdir /app
COPY requirements.txt /app
RUN apt-get update; \
apt-get upgrade -y; \
python3 -m pip --no-cache-dir install -r /app/requirements.txt; \
+ python3 -m pip --no-cache-dir install nvidia-riva-client==2.14.0; \
apt-get clean
USER 1001
+COPY frontend /app/frontend
WORKDIR /app
ENTRYPOINT ["python3", "-m", "frontend"]
diff --git a/RetrievalAugmentedGeneration/frontend/frontend/asr_utils.py b/RetrievalAugmentedGeneration/frontend/frontend/asr_utils.py
new file mode 100644
index 00000000..a15aefed
--- /dev/null
+++ b/RetrievalAugmentedGeneration/frontend/frontend/asr_utils.py
@@ -0,0 +1,231 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import queue
+from threading import Thread
+
+import os
+import logging
+import grpc
+import pycountry
+import gradio as gr
+import numpy as np
+import riva.client
+import riva.client.proto.riva_asr_pb2 as riva_asr
+import riva.client.proto.riva_asr_pb2_grpc as rasr_srv
+from google.protobuf import text_format
+
+class ASRSession:
+ def __init__(self):
+ self.is_first_buffer = True
+ self.request_queue = None
+ self.response_stream = None
+ self.response_thread = None
+ self.transcript = ""
+
+_LOGGER = logging.getLogger(__name__)
+
+# Extract environmental variables
+RIVA_API_URI = os.getenv("RIVA_API_URI", None)
+RIVA_API_KEY = os.getenv("RIVA_API_KEY", None)
+RIVA_FUNCTION_ID = os.getenv("RIVA_FUNCTION_ID", None)
+
+# Establish a connection to the Riva server
+try:
+ use_ssl = False
+ metadata = []
+ auth = None
+ if RIVA_API_KEY:
+ use_ssl = True
+ metadata.append(("authorization", "Bearer " + RIVA_API_KEY))
+ if RIVA_FUNCTION_ID:
+ use_ssl = True
+ metadata.append(("function-id", RIVA_FUNCTION_ID))
+ auth = riva.client.Auth(
+ None, use_ssl=use_ssl,
+ uri=RIVA_API_URI,
+ metadata_args=metadata
+ )
+ _LOGGER.info('Created riva.client.Auth success')
+except:
+ _LOGGER.info('Error creating riva.client.Auth')
+
+# Obtain the ASR languages available on the Riva server
+ASR_LANGS = dict()
+
+try:
+ _LOGGER.info("Available ASR languages")
+ asr_client = riva.client.ASRService(auth)
+ config_response = asr_client.stub.GetRivaSpeechRecognitionConfig(riva_asr.RivaSpeechRecognitionConfigRequest())
+ for model_config in config_response.model_config:
+ if model_config.parameters["decoder_type"] and model_config.model_name.endswith("streaming"):
+ language_code = model_config.parameters['language_code']
+ language_name = f"{pycountry.languages.get(alpha_2=language_code[:2]).name} ({language_code})"
+ _LOGGER.info(f"{language_name} {model_config.model_name}")
+ ASR_LANGS[language_name] = {"language_code": language_code, "model": model_config.model_name}
+except:
+ ASR_LANGS["No ASR languages available"] = "No ASR languages available"
+ gr.Info('The app could not find any available ASR languages. Thus, none will appear in the "ASR Language" dropdown menu. Check that you are connected to a Riva server with ASR enabled.')
+ _LOGGER.info('The app could not find any available ASR languages. Thus, none will appear in the "ASR Language" dropdown menu. Check that you are connected to a Riva server with ASR enabled.')
+
+ASR_LANGS = dict(sorted(ASR_LANGS.items()))
+
+def print_streaming_response(asr_session):
+ asr_session.transcript = ""
+ final_transcript = ""
+ try:
+ for response in asr_session.response_stream:
+ final = ""
+ partial = ""
+ if not response.results:
+ continue
+ if len(response.results) > 0 and len(response.results[0].alternatives) > 0:
+ for result in response.results:
+ if result.is_final:
+ final += result.alternatives[0].transcript
+ else:
+ partial += result.alternatives[0].transcript
+
+ final_transcript += final
+ asr_session.transcript = final_transcript + partial
+
+ except grpc.RpcError as rpc_error:
+ _LOGGER.error(rpc_error.code(), rpc_error.details())
+ # TODO See if Gradio popup error mechanism can be used.
+        # For now show the error via the transcript text box.
+ asr_session.transcript = rpc_error.details()
+ return
+
+def start_recording(audio, language, asr_session):
+ _LOGGER.info('start_recording')
+ asr_session.is_first_buffer = True
+ asr_session.request_queue = queue.Queue()
+ return "", asr_session
+
+def stop_recording(asr_session):
+ _LOGGER.info('stop_recording')
+ try:
+ asr_session.request_queue.put(None)
+ asr_session.response_thread.join()
+ except:
+ pass
+ return asr_session
+
+def transcribe_streaming(audio, language, asr_session, auth=auth):
+ _LOGGER.info('transcribe_streaming')
+
+    if auth is None:
+ _LOGGER.info('Riva client did not initialize properly. Skipping transcription.')
+ return None, None
+
+ if language == 'No ASR languages available':
+ gr.Info('The app cannot access ASR services. Any attempt to transcribe audio will be unsuccessful. Check that you are connected to a Riva server with ASR enabled.')
+ _LOGGER.info('The app cannot access ASR services. Any attempt to transcribe audio will be unsuccessful. Check that you are connected to a Riva server with ASR enabled.')
+ return None, None
+ rate, data = audio
+ if len(data.shape) > 1:
+ data = np.mean(data, axis=1)
+
+ if not len(data):
+ return asr_session.transcript, asr_session
+
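+    # On the first buffer of a new recording, open the bidirectional streaming
+    # call: requests are drained from asr_session.request_queue, and responses
+    # are consumed by a background thread that keeps asr_session.transcript
+    # up to date.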
+ if asr_session.is_first_buffer:
+
+ streaming_config = riva.client.StreamingRecognitionConfig(
+ config=riva.client.RecognitionConfig(
+ encoding=riva.client.AudioEncoding.LINEAR_PCM,
+ language_code=ASR_LANGS[language]['language_code'],
+ max_alternatives=1,
+ profanity_filter=False,
+ enable_automatic_punctuation=True,
+ verbatim_transcripts=False,
+ sample_rate_hertz=rate,
+ audio_channel_count=1,
+ enable_word_time_offsets=True,
+ model=ASR_LANGS[language]['model'],
+ ),
+ interim_results=True,
+ )
+
+ _LOGGER.info(f'auth.channel = {auth.channel}')
+ rasr_stub = rasr_srv.RivaSpeechRecognitionStub(auth.channel)
+ asr_session.response_stream = rasr_stub.StreamingRecognize(iter(asr_session.request_queue.get, None))
+
+ # First buffer should contain only the config
+ request = riva_asr.StreamingRecognizeRequest(streaming_config=streaming_config)
+ asr_session.request_queue.put(request)
+
+ asr_session.response_thread = Thread(target=print_streaming_response, args=(asr_session,))
+
+ # run the thread
+ asr_session.response_thread.start()
+
+ asr_session.is_first_buffer = False
+
+ request = riva_asr.StreamingRecognizeRequest(audio_content=data.astype(np.int16).tobytes())
+ asr_session.request_queue.put(request)
+
+ return asr_session.transcript, asr_session
+
+def transcribe_offline(audio, language, diarization, auth=auth):
+ _LOGGER.info('transcribe_offline')
+
+    if auth is None:
+ _LOGGER.info('Riva client did not initialize properly. Skipping transcription.')
+ return None, None
+
+ if language == 'No ASR languages available':
+ gr.Info('The app cannot access ASR services. Any attempt to transcribe audio will be unsuccessful. Check that you are connected to a Riva server with ASR enabled.')
+ _LOGGER.info('The app cannot access ASR services. Any attempt to transcribe audio will be unsuccessful. Check that you are connected to a Riva server with ASR enabled.')
+ return None, None
+ rate, data = audio
+ if len(data.shape) > 1:
+ data = np.mean(data, axis=1)
+
+ if not len(data):
+ _LOGGER.info("Empty audio provided")
+ return None, None
+
+ asr_dict = next((d for d in asr_config if d['asr_language_name'] == language), None)
+
+ config = riva.client.RecognitionConfig(
+ encoding=riva.client.AudioEncoding.LINEAR_PCM,
+ sample_rate_hertz=rate,
+ audio_channel_count=1,
+ language_code=ASR_LANGS[language]['language_code'],
+ max_alternatives=1,
+ profanity_filter=False,
+ enable_automatic_punctuation=True,
+ verbatim_transcripts=False,
+ enable_word_time_offsets=True,
+ )
+ riva.client.add_speaker_diarization_to_config(config, diarization)
+
+ asr_client = riva.client.ASRService(auth)
+ try:
+ response = asr_client.offline_recognize(data.astype(np.int16).tobytes(), config)
+ if len(response.results) > 0 and len(response.results[0].alternatives) > 0:
+ final_transcript = ""
+ for res in response.results:
+ final_transcript += res.alternatives[0].transcript
+ return final_transcript, text_format.MessageToString(response, as_utf8=True)
+ except grpc.RpcError as rpc_error:
+ _LOGGER.info(f"{rpc_error.code()}, {rpc_error.details()}")
+ # TODO See if Gradio popup error mechanism can be used.
+        # For now show errors via the transcript text box.
+ latest_transcript = rpc_error.details()
+ return latest_transcript, None
+
+    # No results were returned and no gRPC error was raised.
+    return None, None
\ No newline at end of file
diff --git a/RetrievalAugmentedGeneration/frontend/frontend/assets/kaizen-theme.css b/RetrievalAugmentedGeneration/frontend/frontend/assets/kaizen-theme.css
index 04e93049..237f3ddd 100644
--- a/RetrievalAugmentedGeneration/frontend/frontend/assets/kaizen-theme.css
+++ b/RetrievalAugmentedGeneration/frontend/frontend/assets/kaizen-theme.css
@@ -11,3 +11,18 @@
footer {
visibility: hidden;
}
+
+.record-button {
+ width: 35px !important;
+ overflow: hidden !important;
+}
+.record-button::before {
+ content: "🎤" !important;
+ background-color: var(--block-background-fill) !important;
+}
+.mic-wrap {float: left}
+#microphone{min-width: min(0px, 100%) !important;}
+#microphone div.small {
+ width: 25px;
+ height: 100%;
+}
diff --git a/RetrievalAugmentedGeneration/frontend/frontend/chat_client.py b/RetrievalAugmentedGeneration/frontend/frontend/chat_client.py
index b2cac45a..0b95f957 100644
--- a/RetrievalAugmentedGeneration/frontend/frontend/chat_client.py
+++ b/RetrievalAugmentedGeneration/frontend/frontend/chat_client.py
@@ -20,6 +20,8 @@
import requests
+from frontend import tracing
+
_LOGGER = logging.getLogger(__name__)
@@ -37,12 +39,16 @@ def model_name(self) -> str:
"""Return the friendly model name."""
return self._model_name
+ @tracing.instrumentation_wrapper
def search(
- self, prompt: str
+ self, carrier, prompt: str
) -> typing.List[typing.Dict[str, typing.Union[str, float]]]:
"""Search for relevant documents and return json data."""
data = {"content": prompt, "num_docs": 4}
- headers = {"accept": "application/json", "Content-Type": "application/json"}
+ headers = {
+ **carrier,
+ "accept": "application/json", "Content-Type": "application/json"
+ }
url = f"{self.server_url}/documentSearch"
_LOGGER.debug(
"looking up documents - %s", str({"server_url": url, "post_data": data})
@@ -62,8 +68,9 @@ def search(
)
+ @tracing.predict_instrumentation_wrapper
def predict(
- self, query: str, use_knowledge_base: bool, num_tokens: int
+ self, carrier, query: str, use_knowledge_base: bool, num_tokens: int
) -> typing.Generator[str, None, None]:
"""Make a model prediction."""
data = {
@@ -78,8 +85,7 @@ def predict(
)
try:
- with requests.post(url, stream=True, json=data, timeout=10) as req:
-
+ with requests.post(url, stream=True, json=data, timeout=30, headers=carrier) as req:
req.raise_for_status()
for chunk in req.iter_content(16):
yield chunk.decode("UTF-8")
@@ -87,10 +93,16 @@ def predict(
_LOGGER.error(f"Failed to get response from /generate endpoint of chain-server. Error details: {e}. Refer to chain-server logs for details.")
yield str("Failed to get response from /generate endpoint of chain-server. Check if the fastapi server in chain-server is up. Refer to chain-server logs for details.")
- def upload_documents(self, file_paths: typing.List[str]) -> None:
+ # Send None to indicate end of response
+ yield None
+
+
+ @tracing.instrumentation_wrapper
+ def upload_documents(self, carrier, file_paths: typing.List[str]) -> None:
"""Upload documents to the kb."""
url = f"{self.server_url}/uploadDocument"
headers = {
+ **carrier,
"accept": "application/json",
}
@@ -105,8 +117,11 @@ def upload_documents(self, file_paths: typing.List[str]) -> None:
str({"server_url": url, "file": fpath}),
)
- _ = requests.post(
+ resp = requests.post(
url, headers=headers, files=files, timeout=600 # type: ignore [arg-type]
)
+ if resp.status_code == 500:
+ raise ValueError(f"{resp.json().get('message', 'Failed to upload document')}")
except Exception as e:
- _LOGGER.error(f"Failed to get response from /uploadDocument endpoint of chain-server. Error details: {e}. Refer to chain-server logs for details.")
\ No newline at end of file
+ _LOGGER.error(f"Failed to get response from /uploadDocument endpoint of chain-server. Error details: {e}. Refer to chain-server logs for details.")
+ raise ValueError(f"{e}")
diff --git a/RetrievalAugmentedGeneration/frontend/frontend/pages/converse.py b/RetrievalAugmentedGeneration/frontend/frontend/pages/converse.py
index 2671f5b0..414e6a87 100644
--- a/RetrievalAugmentedGeneration/frontend/frontend/pages/converse.py
+++ b/RetrievalAugmentedGeneration/frontend/frontend/pages/converse.py
@@ -20,7 +20,7 @@
import gradio as gr
-from frontend import assets, chat_client
+from frontend import assets, chat_client, asr_utils, tts_utils
_LOGGER = logging.getLogger(__name__)
PATH = "/converse"
@@ -38,16 +38,21 @@
def build_page(client: chat_client.ChatClient) -> gr.Blocks:
- """Buiild the gradio page to be mounted in the frame."""
+ """Build the gradio page to be mounted in the frame."""
kui_theme, kui_styles = assets.load_theme("kaizen")
with gr.Blocks(title=TITLE, theme=kui_theme, css=kui_styles + _LOCAL_CSS) as page:
+
+ # session specific state across runs
+ state = gr.State(value=asr_utils.ASRSession())
+
# create the page header
gr.Markdown(f"# {TITLE}")
# chat logs
with gr.Row(equal_height=True):
chatbot = gr.Chatbot(scale=2, label=client.model_name)
+ latest_response = gr.Textbox(visible=False)
context = gr.JSON(
scale=1,
label="Knowledge Base Context",
@@ -55,16 +60,82 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks:
elem_id="contextbox",
)
+ # TTS output box
+ # visible so that users can stop or replay playback
+ with gr.Row():
+ output_audio = gr.Audio(
+ label="Synthesized Speech",
+ autoplay=True,
+ interactive=False,
+ streaming=True,
+ visible=True,
+ show_download_button=False
+ )
+
+ # check boxes
with gr.Row():
- with gr.Column(scale=10, min_width=600):
+ with gr.Column(scale=10, min_width=150):
kb_checkbox = gr.Checkbox(
label="Use knowledge base", info="", value=False
)
+ with gr.Column(scale=10, min_width=150):
+ tts_checkbox = gr.Checkbox(
+ label="Enable TTS output", info="", value=False
+ )
+
+ # dropdowns
+ with gr.Accordion("ASR and TTS Settings"):
+ with gr.Row():
+ asr_language_list = list(asr_utils.ASR_LANGS)
+ asr_language_dropdown = gr.components.Dropdown(
+ label="ASR Language",
+ choices=asr_language_list,
+ value=asr_language_list[0],
+ )
+ tts_language_list = list(tts_utils.TTS_MODELS)
+ tts_language_dropdown = gr.components.Dropdown(
+ label="TTS Language",
+ choices=tts_language_list,
+ value=tts_language_list[0],
+ )
+ all_voices = []
+ try:
+ for model in tts_utils.TTS_MODELS:
+ all_voices.extend(tts_utils.TTS_MODELS[model]['voices'])
+ default_voice = tts_utils.TTS_MODELS[tts_language_list[0]]['voices'][0]
+ except:
+ all_voices.append("No TTS voices available")
+ default_voice = "No TTS voices available"
+ tts_voice_dropdown = gr.components.Dropdown(
+ label="TTS Voice",
+ choices=all_voices,
+ value=default_voice,
+ )
+
+ # audio and text input boxes
+ with gr.Row():
+ with gr.Column(scale=10, min_width=500):
msg = gr.Textbox(
show_label=False,
placeholder="Enter text and press ENTER",
container=False,
)
+ # For (at least) Gradio 3.39.0 and lower, the first argument
+ # in the list below is named `source`. If not None, it must
+ # be a single string, namely either "upload" or "microphone".
+ # For more recent Gradio versions (such as 4.4.1), it's named
+ # `sources`, plural. If not None, it must be a list, containing
+ # either "upload", "microphone", or both.
+ audio_mic = gr.Audio(
+ sources=["microphone"],
+ type="numpy",
+ streaming=True,
+ visible=True,
+ label="Transcribe Audio Query",
+ show_label=False,
+ container=False,
+ elem_id="microphone",
+ )
# user feedback
with gr.Row():
@@ -73,7 +144,7 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks:
# _ = gr.Button(value="⚠️ Flag")
submit_btn = gr.Button(value="Submit")
_ = gr.ClearButton(msg)
- _ = gr.ClearButton([msg, chatbot], value="Clear history")
+ _ = gr.ClearButton([msg, chatbot], value="Clear History")
ctx_show = gr.Button(value="Show Context")
ctx_hide = gr.Button(value="Hide Context", visible=False)
@@ -95,10 +166,49 @@ def _toggle_context(btn: str) -> Dict[gr.component, Dict[Any, Any]]:
# form actions
_my_build_stream = functools.partial(_stream_predict, client)
msg.submit(
- _my_build_stream, [kb_checkbox, msg, chatbot], [msg, chatbot, context]
+ _my_build_stream, [kb_checkbox, msg, chatbot], [msg, chatbot, context, latest_response]
)
submit_btn.click(
- _my_build_stream, [kb_checkbox, msg, chatbot], [msg, chatbot, context]
+ _my_build_stream, [kb_checkbox, msg, chatbot], [msg, chatbot, context, latest_response]
+ )
+
+ tts_language_dropdown.change(
+ tts_utils.update_voice_dropdown,
+ [tts_language_dropdown],
+ [tts_voice_dropdown],
+ api_name=False
+ )
+
+ audio_mic.start_recording(
+ asr_utils.start_recording,
+ [audio_mic, asr_language_dropdown, state],
+ [msg, state],
+ api_name=False,
+ )
+ audio_mic.stop_recording(
+ asr_utils.stop_recording,
+ [state],
+ [state],
+ api_name=False
+ )
+ audio_mic.stream(
+ asr_utils.transcribe_streaming,
+ [audio_mic, asr_language_dropdown, state],
+ [msg, state],
+ api_name=False
+ )
+ audio_mic.clear(
+ lambda: "",
+ [],
+ [msg],
+ api_name=False
+ )
+
+ latest_response.change(
+ tts_utils.text_to_speech,
+ [latest_response, tts_language_dropdown, tts_voice_dropdown, tts_checkbox],
+ [output_audio],
+ api_name=False
)
page.queue()
@@ -121,8 +231,11 @@ def _stream_predict(
documents: Union[None, List[Dict[str, Union[str, float]]]] = None
if use_knowledge_base:
- documents = client.search(question)
-
- for chunk in client.predict(question, use_knowledge_base, OUTPUT_TOKENS):
- chunks += chunk
- yield "", chat_history + [[question, chunks]], documents
+        documents = client.search(prompt=question)
+
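+    # chat_client.predict yields None once the stream ends; at that point the
+    # accumulated response is written to the hidden latest_response textbox,
+    # which in turn triggers TTS synthesis.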
+ for chunk in client.predict(query=question, use_knowledge_base=use_knowledge_base, num_tokens=OUTPUT_TOKENS):
+ if chunk:
+ chunks += chunk
+ yield "", chat_history + [[question, chunks]], documents, ""
+ else:
+ yield "", chat_history + [[question, chunks]], documents, chunks
diff --git a/RetrievalAugmentedGeneration/frontend/frontend/pages/kb.py b/RetrievalAugmentedGeneration/frontend/frontend/pages/kb.py
index c38c846c..35327f59 100644
--- a/RetrievalAugmentedGeneration/frontend/frontend/pages/kb.py
+++ b/RetrievalAugmentedGeneration/frontend/frontend/pages/kb.py
@@ -62,16 +62,19 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks:
def upload_file(files: List[Path], client: chat_client.ChatClient) -> List[str]:
"""Use the client to upload a file to the knowledge base."""
- file_paths = [file.name for file in files]
- client.upload_documents(file_paths)
-
- # Save the uploaded file names to the state file
- with open(STATE_FILE, 'a') as file:
- for file_path in file_paths:
- file_path = os.path.basename(file_path)
- file.write(file_path + '\n')
-
- return file_paths
+ try:
+ file_paths = [file.name for file in files]
+        client.upload_documents(file_paths=file_paths)
+
+ # Save the uploaded file names to the state file
+ with open(STATE_FILE, 'a') as file:
+ for file_path in file_paths:
+ file_path = os.path.basename(file_path)
+ file.write(file_path + '\n')
+
+ return file_paths
+ except Exception as e:
+ raise gr.Error(f"{e}")
def get_uploaded_files():
"""Load previously uploaded files if the file exists"""
diff --git a/RetrievalAugmentedGeneration/frontend/frontend/tracing.py b/RetrievalAugmentedGeneration/frontend/frontend/tracing.py
new file mode 100644
index 00000000..945ae00f
--- /dev/null
+++ b/RetrievalAugmentedGeneration/frontend/frontend/tracing.py
@@ -0,0 +1,80 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from opentelemetry import trace
+from opentelemetry.sdk.resources import SERVICE_NAME, Resource
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
+from opentelemetry.propagate import set_global_textmap, get_global_textmap
+from opentelemetry.propagators.composite import CompositePropagator
+
+# Configure tracer used by the Frontend to create spans
+resource = Resource.create({
+ SERVICE_NAME: "frontend"
+})
+provider = TracerProvider(resource=resource)
+if os.environ.get("ENABLE_TRACING") == "true":
+ processor = SimpleSpanProcessor(OTLPSpanExporter())
+ provider.add_span_processor(processor)
+trace.set_tracer_provider(provider)
+tracer = trace.get_tracer("frontend")
+
+# Configure Propagator used for processing trace context received by the Frontend
+if os.environ.get("ENABLE_TRACING") == "true":
+ propagator = TraceContextTextMapPropagator()
+else:
+ propagator = CompositePropagator([]) # No-op propagator
+
+set_global_textmap(propagator)
+
+# Include the contents of carrier in an HTTP header
+# to propagate the span context into another microservice
+def inject_context(ctx):
+ carrier = {}
+ get_global_textmap().inject(carrier, context=ctx)
+ return carrier
+
+# Wrapper Function to perform instrumentation
+def instrumentation_wrapper(func):
+ def wrapper(self, *args, **kwargs):
+ span_name = func.__name__
+ span = tracer.start_span(span_name)
+ span_ctx = trace.set_span_in_context(span)
+ carrier = inject_context(span_ctx)
+        for kw in kwargs:
+            span.set_attribute(kw, kwargs[kw])
+ result = func(self, carrier, *args, **kwargs)
+ span.end()
+ return result
+ return wrapper
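+# Illustrative usage (this mirrors chat_client.py in this change): the
+# decorator starts a span per call and passes the propagation headers to the
+# wrapped method as `carrier`:
+#
+#     @tracing.instrumentation_wrapper
+#     def search(self, carrier, prompt: str): ...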
+
+# Wrapper function for the streaming predict call
+def predict_instrumentation_wrapper(func):
+ def wrapper(self, *args, **kwargs):
+ span_name = func.__name__
+ span = tracer.start_span(span_name)
+ span_ctx = trace.set_span_in_context(span)
+        for kw in kwargs:
+            span.set_attribute(kw, kwargs[kw])
+ carrier = inject_context(span_ctx)
+ constructed_response = ""
+ for chunk in func(self, carrier, *args, **kwargs):
+ if chunk:
+ constructed_response += chunk
+ yield chunk
+ span.set_attribute("response", constructed_response)
+ span.end()
+ return wrapper
\ No newline at end of file
diff --git a/RetrievalAugmentedGeneration/frontend/frontend/tts_utils.py b/RetrievalAugmentedGeneration/frontend/frontend/tts_utils.py
new file mode 100644
index 00000000..ac65a025
--- /dev/null
+++ b/RetrievalAugmentedGeneration/frontend/frontend/tts_utils.py
@@ -0,0 +1,150 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import json
+import logging
+import pycountry
+from pathlib import Path
+from threading import Thread
+from typing import TYPE_CHECKING, Any, List
+import gradio as gr
+import numpy as np
+import riva.client
+import riva.client.proto.riva_tts_pb2 as riva_tts
+
+_LOGGER = logging.getLogger(__name__)
+
+# Extract environmental variables
+RIVA_API_URI = os.getenv("RIVA_API_URI", None)
+RIVA_API_KEY = os.getenv("RIVA_API_KEY", None)
+RIVA_FUNCTION_ID = os.getenv("RIVA_FUNCTION_ID", None)
+
+try:
+ tts_sample_rate = int(os.getenv("TTS_SAMPLE_RATE", 48000))
+except Exception as e:
+ _LOGGER.info('TTS_SAMPLE_RATE is not set to an integer value. Defaulting to 48000.')
+ tts_sample_rate = 48000
+
+# Establish a connection to the Riva server
+try:
+ use_ssl = False
+ metadata = []
+ auth = None
+ if RIVA_API_KEY:
+ use_ssl = True
+ metadata.append(("authorization", "Bearer " + RIVA_API_KEY))
+ if RIVA_FUNCTION_ID:
+ use_ssl = True
+ metadata.append(("function-id", RIVA_FUNCTION_ID))
+ auth = riva.client.Auth(
+ None, use_ssl=use_ssl,
+ uri=RIVA_API_URI,
+ metadata_args=metadata
+ )
+    _LOGGER.info('Successfully created riva.client.Auth')
+except Exception:
+    _LOGGER.info('Error creating riva.client.Auth')
+
+# Obtain the TTS languages and voices available on the Riva server
+TTS_MODELS = dict()
+try:
+ tts_client = riva.client.SpeechSynthesisService(auth)
+ config_response = tts_client.stub.GetRivaSynthesisConfig(riva_tts.RivaSynthesisConfigRequest())
+ for model_config in config_response.model_config:
+ language_code = model_config.parameters['language_code']
+ language_name = f"{pycountry.languages.get(alpha_2=language_code[:2]).name} ({language_code})"
+ voice_name = model_config.parameters['voice_name']
+ subvoices = [voice.split(':')[0] for voice in model_config.parameters['subvoices'].split(',')]
+ full_voice_names = [voice_name + "." + subvoice for subvoice in subvoices]
+
+ if language_name in TTS_MODELS:
+ TTS_MODELS[language_name]['voices'].extend(full_voice_names)
+ else:
+ TTS_MODELS[language_name] = {"language_code": language_code, "voices": full_voice_names}
+
+ TTS_MODELS = dict(sorted(TTS_MODELS.items()))
+
+ _LOGGER.info(json.dumps(TTS_MODELS, indent=4))
+except Exception:
+ TTS_MODELS["No TTS languages available"] = "No TTS languages available"
+ gr.Info('The app could not find any available TTS languages. Thus, none will appear in the "TTS Language" or "TTS Voice" dropdown menus. Check that you are connected to a Riva server with TTS enabled.')
+ _LOGGER.info('The app could not find any available TTS languages. Thus, none will appear in the "TTS Language" or "TTS Voice" dropdown menus. Check that you are connected to a Riva server with TTS enabled.')
+
+# Once the user selects a TTS language, narrow the options in the TTS voice
+# dropdown menu accordingly
+def update_voice_dropdown(language):
+ if language == "No TTS languages available":
+ voice_dropdown = gr.Dropdown(
+ label="Voice", choices="No TTS voices available", value="No TTS voices available"
+ )
+ else:
+ voice_dropdown = gr.Dropdown(
+ label="Voice", choices=TTS_MODELS[language]['voices'], value=TTS_MODELS[language]['voices'][0]
+ )
+ return voice_dropdown
+
+def text_to_speech(text, language, voice, enable_tts, auth=auth):
+ if not enable_tts:
+ return None
+    if auth is None:
+ _LOGGER.info('Riva client did not initialize properly. Skipping text to speech.')
+ return None, None
+ if language == "No TTS languages available":
+ gr.Info('The app cannot access TTS services. Any attempt to synthesize audio will be unsuccessful. Check that you are connected to a Riva server with TTS enabled.')
+ _LOGGER.info('The app cannot access TTS services. Any attempt to synthesize audio will be unsuccessful. Check that you are connected to a Riva server with TTS enabled.')
+ return None, gr.update(interactive=False)
+ if not text:
+ gr.Info('No text from which to synthesize a voice has been provided')
+ return None, gr.update(interactive=False)
+ if not voice:
+ gr.Info('No TTS voice or an invalid TTS voice has been selected')
+ return None, gr.update(interactive=False)
+ if not enable_tts:
+ gr.Info('TTS output is currently disabled. Click on the "Enable TTS output" checkbox to enable it.')
+ return None, gr.update(interactive=False)
+
+ first_buffer = True
+ start_time = time.time()
+
+    # TODO: Gradio Flagging doesn't work with streaming audio output.
+ # See https://github.com/gradio-app/gradio/issues/5806
+ # TODO: Audio download does not work with streaming audio output.
+ # See https://github.com/gradio-app/gradio/issues/6570
+
+ tts_client = riva.client.SpeechSynthesisService(auth)
+
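+    # synthesize_online streams audio back in chunks; each chunk is yielded as
+    # a (sample_rate, int16 array) tuple so Gradio's streaming Audio component
+    # can begin playback before synthesis completes.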
+ response = tts_client.synthesize_online(
+ text=text,
+ voice_name=voice,
+ language_code=TTS_MODELS[language]['language_code'],
+ sample_rate_hz=tts_sample_rate
+ )
+ for result in response:
+ if len(result.audio):
+ if first_buffer:
+ _LOGGER.info(
+ f"TTS request [{result.id.value}] first buffer latency: {time.time() - start_time} sec"
+ )
+ first_buffer = False
+ yield (tts_sample_rate, np.frombuffer(result.audio, dtype=np.int16))
+
+ _LOGGER.info(f"TTS request [{result.id.value}] last buffer latency: {time.time() - start_time} sec")
+
+ yield (tts_sample_rate, np.frombuffer(b'', dtype=np.int16))
diff --git a/RetrievalAugmentedGeneration/frontend/requirements.txt b/RetrievalAugmentedGeneration/frontend/requirements.txt
index a2a85130..78b1c5c8 100644
--- a/RetrievalAugmentedGeneration/frontend/requirements.txt
+++ b/RetrievalAugmentedGeneration/frontend/requirements.txt
@@ -1,8 +1,12 @@
dataclass_wizard==0.22.2
-gradio==3.39.0
+gradio==4.13.0
jinja2==3.1.2
numpy==1.25.2
protobuf==3.20.3
PyYAML==6.0
tritonclient[all]==2.36.0
uvicorn==0.22.0
+opentelemetry-sdk==1.21.0
+opentelemetry-api==1.21.0
+opentelemetry-exporter-otlp-proto-grpc==1.21.0
+pycountry==23.12.11
diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext
deleted file mode 120000
index 056bf100..00000000
--- a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext
+++ /dev/null
@@ -1 +0,0 @@
-llama
\ No newline at end of file
diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/ensemble/1/.tmp b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/ensemble/1/.tmp
new file mode 100644
index 00000000..e69de29b
diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/ensemble/config.pbtxt b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/ensemble/config.pbtxt
new file mode 100755
index 00000000..cbd087ce
--- /dev/null
+++ b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/ensemble/config.pbtxt
@@ -0,0 +1,228 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "ensemble"
+platform: "ensemble"
+max_batch_size: 128
+input [
+ {
+ name: "text_input"
+ data_type: TYPE_STRING
+ dims: [ -1 ]
+ },
+ {
+ name: "max_tokens"
+ data_type: TYPE_UINT32
+ dims: [ -1 ]
+ },
+ {
+ name: "end_id"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ optional: true
+ },
+ {
+ name: "pad_id"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ optional: true
+ },
+ {
+ name: "top_k"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ optional: true
+ },
+ {
+ name: "top_p"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ optional: true
+ },
+ {
+ name: "temperature"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ optional: true
+ },
+ {
+ name: "length_penalty"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ optional: true
+ },
+ {
+ name: "repetition_penalty"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ optional: true
+ },
+ {
+ name: "min_length"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ optional: true
+ },
+ {
+ name: "presence_penalty"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ optional: true
+ },
+ {
+ name: "random_seed"
+ data_type: TYPE_UINT64
+ dims: [ 1 ]
+ optional: true
+ },
+ {
+ name: "beam_width"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ optional: true
+ },
+ {
+ name: "stream"
+ data_type: TYPE_BOOL
+ dims: [ 1 ]
+ optional: true
+ }
+]
+output [
+ {
+ name: "text_output"
+ data_type: TYPE_STRING
+ dims: [ -1, -1 ]
+ }
+]
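+# The ensemble chains three steps: "preprocessing" tokenizes text_input,
+# "tensorrt_llm" generates output token ids, and "postprocessing" decodes
+# them into text_output.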
+ensemble_scheduling {
+ step [
+ {
+ model_name: "preprocessing"
+ model_version: -1
+ input_map {
+ key: "QUERY"
+ value: "text_input"
+ }
+ input_map {
+ key: "REQUEST_OUTPUT_LEN"
+ value: "max_tokens"
+ }
+ output_map {
+ key: "REQUEST_INPUT_LEN"
+ value: "_REQUEST_INPUT_LEN"
+ }
+ output_map {
+ key: "INPUT_ID"
+ value: "_INPUT_ID"
+ }
+ output_map {
+ key: "REQUEST_OUTPUT_LEN"
+ value: "_REQUEST_OUTPUT_LEN"
+ }
+ },
+ {
+ model_name: "tensorrt_llm"
+ model_version: -1
+ input_map {
+ key: "input_ids"
+ value: "_INPUT_ID"
+ }
+ input_map {
+ key: "input_lengths"
+ value: "_REQUEST_INPUT_LEN"
+ }
+ input_map {
+ key: "request_output_len"
+ value: "_REQUEST_OUTPUT_LEN"
+ }
+ input_map {
+ key: "end_id"
+ value: "end_id"
+ }
+ input_map {
+ key: "pad_id"
+ value: "pad_id"
+ }
+ input_map {
+ key: "runtime_top_k"
+ value: "top_k"
+ }
+ input_map {
+ key: "runtime_top_p"
+ value: "top_p"
+ }
+ input_map {
+ key: "temperature"
+ value: "temperature"
+ }
+ input_map {
+ key: "len_penalty"
+ value: "length_penalty"
+ }
+ input_map {
+ key: "repetition_penalty"
+ value: "repetition_penalty"
+ }
+ input_map {
+ key: "min_length"
+ value: "min_length"
+ }
+ input_map {
+ key: "presence_penalty"
+ value: "presence_penalty"
+ }
+ input_map {
+ key: "random_seed"
+ value: "random_seed"
+ }
+ input_map {
+ key: "beam_width"
+ value: "beam_width"
+ }
+ input_map {
+ key: "streaming"
+ value: "stream"
+ }
+ output_map {
+ key: "output_ids"
+ value: "_TOKENS_BATCH"
+ }
+ },
+ {
+ model_name: "postprocessing"
+ model_version: -1
+ input_map {
+ key: "TOKENS_BATCH"
+ value: "_TOKENS_BATCH"
+ }
+ output_map {
+ key: "OUTPUT"
+ value: "text_output"
+ }
+ }
+ ]
+}
diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/postprocessing/1/model.py b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/postprocessing/1/model.py
new file mode 100755
index 00000000..bb8a7378
--- /dev/null
+++ b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/postprocessing/1/model.py
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+import json
+import os
+
+import numpy as np
+import triton_python_backend_utils as pb_utils
+from transformers import LlamaTokenizer
+
+TOKENIZER_DIR = os.environ.get("TOKENIZER_DIR", "/model")
+
+SPACE_CHAR = 9601
+NEWLINE_CHAR = 60
+STOP_TOKEN = 2
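+# 9601 is the SentencePiece word-boundary marker "▁" (U+2581); 60 is "<", the
+# leading character of byte tokens such as "<0x0A>", which _id_to_token below
+# maps to a newline.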
+
+
+class TritonPythonModel:
+ """Your Python model must use the same class name. Every Python model
+ that is created must have "TritonPythonModel" as the class name.
+ """
+
+ def initialize(self, args):
+ """`initialize` is called only once when the model is being loaded.
+ Implementing `initialize` function is optional. This function allows
+ the model to initialize any state associated with this model.
+ Parameters
+ ----------
+ args : dict
+ Both keys and values are strings. The dictionary keys and values are:
+ * model_config: A JSON string containing the model configuration
+ * model_instance_kind: A string containing model instance kind
+ * model_instance_device_id: A string containing model instance device ID
+ * model_repository: Model repository path
+ * model_version: Model version
+ * model_name: Model name
+ """
+ # Parse model configs
+ self.model_config = model_config = json.loads(args["model_config"])
+
+ # Parse model output configs
+ output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT")
+
+ # Convert Triton types to numpy types
+ self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"])
+
+ self.tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_DIR, legacy=False)
+ vocab = self.tokenizer.convert_ids_to_tokens(
+ list(range(self.tokenizer.vocab_size))
+ )
+
+ def execute(self, requests):
+ """`execute` must be implemented in every Python model. `execute`
+ function receives a list of pb_utils.InferenceRequest as the only
+ argument. This function is called when an inference is requested
+ for this model. Depending on the batching configuration (e.g. Dynamic
+ Batching) used, `requests` may contain multiple requests. Every
+ Python model, must create one pb_utils.InferenceResponse for every
+ pb_utils.InferenceRequest in `requests`. If there is an error, you can
+ set the error argument when creating a pb_utils.InferenceResponse.
+ Parameters
+ ----------
+ requests : list
+ A list of pb_utils.InferenceRequest
+ Returns
+ -------
+ list
+ A list of pb_utils.InferenceResponse. The length of this list must
+ be the same as `requests`
+ """
+
+ responses = []
+
+        # Every Python backend must iterate over every one of the requests
+ # and create a pb_utils.InferenceResponse for each of them.
+ for request in requests:
+ # Get input tensors
+ tokens_batch = pb_utils.get_input_tensor_by_name(
+ request, "TOKENS_BATCH"
+ ).as_numpy()
+
+ # Reshape Input
+ # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]])
+ # tokens_batch = tokens_batch.T
+
+ # Postprocessing output data.
+ outputs = self._postprocessing(tokens_batch)
+
+ # Create output tensors. You need pb_utils.Tensor
+ # objects to create pb_utils.InferenceResponse.
+ output_tensor = pb_utils.Tensor(
+ "OUTPUT", np.array(outputs).astype(self.output_dtype)
+ )
+
+ # Create InferenceResponse. You can set an error here in case
+ # there was a problem with handling this inference request.
+ # Below is an example of how you can set errors in inference
+ # response:
+ #
+ # pb_utils.InferenceResponse(
+ # output_tensors=..., TritonError("An error occurred"))
+ inference_response = pb_utils.InferenceResponse(
+ output_tensors=[output_tensor]
+ )
+ responses.append(inference_response)
+
+ # You should return a list of pb_utils.InferenceResponse. Length
+ # of this list must match the length of `requests` list.
+ return responses
+
+ def finalize(self):
+ """`finalize` is called only once when the model is being unloaded.
+        Implementing `finalize` function is optional. This function allows
+ the model to perform any necessary clean ups before exit.
+ """
+ pb_utils.Logger.log("Finalizing the Post-Processing Model.")
+
+ def _id_to_token(self, token_id):
+ # handle special tokens (end of string, unknown, etc)
+ try:
+ special_token_index = self.tokenizer.all_special_ids.index(token_id)
+ return self.tokenizer.all_special_tokens[special_token_index]
+ except ValueError:
+ pass
+
+ # handle typical tokens
+ tokens = self.tokenizer.convert_ids_to_tokens(token_id)
+ if ord(tokens[0]) == SPACE_CHAR:
+ return f" {tokens[1:]}"
+ if ord(tokens[0]) == NEWLINE_CHAR:
+ return "\n"
+ return tokens
+
+ def _postprocessing(self, tokens_batch):
+ tokens_batch = tokens_batch.tolist()
+ return [
+ self._id_to_token(token_id)
+ for beam_tokens in tokens_batch
+ for token_ids in beam_tokens
+ for token_id in token_ids
+ ]
+
+ # for beam_tokens in tokens_batch:
+ # for token_ids in beam_tokens:
+ # for token_id in token_ids:
+ # # handle special tokens (end of string, unknown, etc)
+ # special_token = self.tokenizer.added_tokens_decoder.get(token_id)
+ # if special_token:
+ # tokens = special_token.content
+
+ # # handle typical tokens
+ # else:
+ # tokens = self.tokenizer.convert_ids_to_tokens(token_id)
+ # if ord(tokens[0]) == SPACE_CHAR:
+ # tokens = f" {tokens[1:]}"
+ # elif ord(tokens[0]) == NEWLINE_CHAR:
+ # tokens = "\n"
+
+ # outputs.append(tokens)
+ # return outputs
diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/postprocessing/config.pbtxt b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/postprocessing/config.pbtxt
new file mode 100755
index 00000000..3c3ea10d
--- /dev/null
+++ b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/postprocessing/config.pbtxt
@@ -0,0 +1,50 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "postprocessing"
+backend: "python"
+max_batch_size: 128
+input [
+ {
+ name: "TOKENS_BATCH"
+ data_type: TYPE_INT32
+ dims: [ -1, -1 ]
+ }
+]
+output [
+ {
+ name: "OUTPUT"
+ data_type: TYPE_STRING
+ dims: [ -1, -1 ]
+ }
+]
+
+instance_group [
+ {
+ count: 1
+ kind: KIND_CPU
+ }
+]
diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/preprocessing/1/model.py b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/preprocessing/1/model.py
new file mode 100644
index 00000000..44e8b9c4
--- /dev/null
+++ b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/preprocessing/1/model.py
@@ -0,0 +1,244 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import csv
+import json
+import os
+
+import numpy as np
+import torch
+import triton_python_backend_utils as pb_utils
+from torch.nn.utils.rnn import pad_sequence
+from transformers import LlamaTokenizer
+
+TOKENIZER_DIR = os.environ.get("TOKENIZER_DIR", "/model")
+
+END_ID = 2
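+# 2 is the Llama tokenizer's end-of-sequence id; it is also used as the
+# padding value when batching prompts in _create_request below.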
+
+# SYSTEM_PROMPT = (
+# """You are a helpful, respectful and honest assistant."""
+# """Always answer as helpfully as possible, while being safe."""
+# """Please ensure that your responses are positive in nature."""
+# )
+
+# LLAMA_PROMPT_TEMPLATE = (
+# "[INST] <>"
+# "{system_prompt}"
+# "<>"
+# "[/INST] {context} [INST] {question} [/INST]"
+# )
+
+
+class TritonPythonModel:
+ """Your Python model must use the same class name. Every Python model
+ that is created must have "TritonPythonModel" as the class name.
+ """
+
+ def initialize(self, args):
+ """`initialize` is called only once when the model is being loaded.
+ Implementing `initialize` function is optional. This function allows
+ the model to initialize any state associated with this model.
+ Parameters
+ ----------
+ args : dict
+ Both keys and values are strings. The dictionary keys and values are:
+ * model_config: A JSON string containing the model configuration
+ * model_instance_kind: A string containing model instance kind
+ * model_instance_device_id: A string containing model instance device ID
+ * model_repository: Model repository path
+ * model_version: Model version
+ * model_name: Model name
+ """
+ # Parse model configs
+ self.model_config = model_config = json.loads(args["model_config"])
+
+ # Parse model output configs and convert Triton types to numpy types
+ input_names = ["INPUT_ID", "REQUEST_INPUT_LEN"]
+ for input_name in input_names:
+ setattr(
+ self,
+ input_name.lower() + "_dtype",
+ pb_utils.triton_string_to_numpy(
+ pb_utils.get_output_config_by_name(model_config, input_name)[
+ "data_type"
+ ]
+ ),
+ )
+
+ self.encoder = LlamaTokenizer.from_pretrained(TOKENIZER_DIR, legacy=False)
+
+ def execute(self, requests):
+ """`execute` must be implemented in every Python model. `execute`
+ function receives a list of pb_utils.InferenceRequest as the only
+ argument. This function is called when an inference is requested
+ for this model. Depending on the batching configuration (e.g. Dynamic
+ Batching) used, `requests` may contain multiple requests. Every
+ Python model, must create one pb_utils.InferenceResponse for every
+ pb_utils.InferenceRequest in `requests`. If there is an error, you can
+ set the error argument when creating a pb_utils.InferenceResponse.
+ Parameters
+ ----------
+ requests : list
+ A list of pb_utils.InferenceRequest
+ Returns
+ -------
+ list
+ A list of pb_utils.InferenceResponse. The length of this list must
+ be the same as `requests`
+ """
+
+ responses = []
+
+        # Every Python backend must iterate over every one of the requests
+ # and create a pb_utils.InferenceResponse for each of them.
+ for request in requests:
+ # Get input tensors
+ query = pb_utils.get_input_tensor_by_name(request, "QUERY").as_numpy()
+ request_output_len = pb_utils.get_input_tensor_by_name(
+ request, "REQUEST_OUTPUT_LEN"
+ ).as_numpy()
+
+ input_id, request_input_len = self._create_request(query)
+
+ # Create output tensors. You need pb_utils.Tensor
+ # objects to create pb_utils.InferenceResponse.
+ input_id_tensor = pb_utils.Tensor(
+ "INPUT_ID", np.array(input_id).astype(self.input_id_dtype)
+ )
+ request_input_len_tensor = pb_utils.Tensor(
+ "REQUEST_INPUT_LEN",
+ np.array(request_input_len).astype(self.request_input_len_dtype),
+ )
+ request_output_len_tensor = pb_utils.Tensor(
+ "REQUEST_OUTPUT_LEN", request_output_len
+ )
+
+ # Create InferenceResponse. You can set an error here in case
+ # there was a problem with handling this inference request.
+ # Below is an example of how you can set errors in inference
+ # response:
+ #
+ # pb_utils.InferenceResponse(
+ # output_tensors=..., TritonError("An error occurred"))
+ inference_response = pb_utils.InferenceResponse(
+ output_tensors=[
+ input_id_tensor,
+ request_input_len_tensor,
+ request_output_len_tensor,
+ ]
+ )
+ responses.append(inference_response)
+
+ # You should return a list of pb_utils.InferenceResponse. Length
+ # of this list must match the length of `requests` list.
+ return responses
+
+ def finalize(self):
+ """`finalize` is called only once when the model is being unloaded.
+ Implementing `finalize` function is optional. This function allows
+ the model to perform any necessary clean ups before exit.
+ """
+ pb_utils.Logger.log("Finalizing the Pre-Processing Model.")
+
+ def _create_request(self, prompts):
+ """
+ prompts : batch string (2D numpy array)
+ """
+
+ start_ids = [
+ torch.IntTensor(self.encoder.encode(prompt[0].decode()))
+ for prompt in prompts
+ ]
+
+ start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
+
+ start_ids = pad_sequence(start_ids, batch_first=True, padding_value=END_ID)
+
+ return start_ids, start_lengths
+
+ def _create_word_list(self, word_dict):
+ flat_ids = []
+ offsets = []
+ for word_dict_item in word_dict:
+ item_flat_ids = []
+ item_offsets = []
+
+ words = list(csv.reader([word_dict_item[0].decode()]))[0]
+ for word in words:
+ ids = self._encode(word)
+
+ if len(ids) == 0:
+ continue
+
+ item_flat_ids += ids
+ item_offsets.append(len(ids))
+
+ flat_ids.append(np.array(item_flat_ids))
+ offsets.append(np.cumsum(np.array(item_offsets)))
+
+ pad_to = max(1, max(len(ids) for ids in flat_ids))
+
+ for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
+ flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0)
+ offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1)
+
+ return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
+
+ def to_word_list_format(self, word_dict):
+ flat_ids = []
+ offsets = []
+ for word_dict_item in word_dict:
+ item_flat_ids = []
+ item_offsets = []
+
+ if isinstance(word_dict_item[0], bytes):
+ word_dict_item = [word_dict_item[0].decode()]
+
+ words = list(csv.reader(word_dict_item))[0]
+ for word in words:
+ ids = self.encoder.encode(word)
+
+ if len(ids) == 0:
+ continue
+
+ item_flat_ids += ids
+ item_offsets.append(len(ids))
+
+ flat_ids.append(np.array(item_flat_ids))
+ offsets.append(np.cumsum(np.array(item_offsets)))
+
+ pad_to = max(1, max(len(ids) for ids in flat_ids))
+
+ for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
+ flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0)
+ offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1)
+
+ return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
+
+ def _encode(self, sentence):
+ sentence = sentence.decode() if isinstance(sentence, bytes) else sentence
+ return self.encoder.encode(sentence)
diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/preprocessing/config.pbtxt b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/preprocessing/config.pbtxt
new file mode 100644
index 00000000..d2e3029a
--- /dev/null
+++ b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/preprocessing/config.pbtxt
@@ -0,0 +1,65 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "preprocessing"
+backend: "python"
+max_batch_size: 128
+input [
+ {
+ name: "QUERY"
+ data_type: TYPE_STRING
+ dims: [ -1 ]
+ },
+ {
+ name: "REQUEST_OUTPUT_LEN"
+ data_type: TYPE_UINT32
+ dims: [ -1 ]
+ }
+]
+output [
+ {
+ name: "INPUT_ID"
+ data_type: TYPE_INT32
+ dims: [ -1 ]
+ },
+ {
+ name: "REQUEST_INPUT_LEN"
+ data_type: TYPE_INT32
+ dims: [ 1 ]
+ },
+ {
+ name: "REQUEST_OUTPUT_LEN"
+ data_type: TYPE_UINT32
+ dims: [ -1 ]
+ }
+]
+
+instance_group [
+ {
+ count: 1
+ kind: KIND_CPU
+ }
+]
diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/tensorrt_llm/1/.gitkeep b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/tensorrt_llm/1/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/tensorrt_llm/config.pbtxt.j2 b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/tensorrt_llm/config.pbtxt.j2
new file mode 100644
index 00000000..4b719b04
--- /dev/null
+++ b/RetrievalAugmentedGeneration/llm-inference-server/ensemble_models/gptnext/tensorrt_llm/config.pbtxt.j2
@@ -0,0 +1,208 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "tensorrt_llm"
+backend: "tensorrtllm"
+max_batch_size: 128
+
+model_transaction_policy {
+ decoupled: {{ decoupled_mode }}
+}
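+# Values in {{ ... }} (decoupled_mode, gpt_model_type, engine_dir) are Jinja2
+# template variables; they are expected to be filled in when this config is
+# rendered at deploy time (presumably by the model server).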
+
+input [
+ {
+ name: "input_ids"
+ data_type: TYPE_INT32
+ dims: [ -1 ]
+ },
+ {
+ name: "input_lengths"
+ data_type: TYPE_INT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ },
+ {
+ name: "request_output_len"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ },
+ {
+ name: "end_id"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "pad_id"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "beam_width"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "temperature"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "runtime_top_k"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "runtime_top_p"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "len_penalty"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "repetition_penalty"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "min_length"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "presence_penalty"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "random_seed"
+ data_type: TYPE_UINT64
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "stop"
+ data_type: TYPE_BOOL
+ dims: [ 1 ]
+ optional: true
+ },
+ {
+ name: "streaming"
+ data_type: TYPE_BOOL
+ dims: [ 1 ]
+ optional: true
+ }
+]
+output [
+ {
+ name: "output_ids"
+ data_type: TYPE_INT32
+ dims: [ -1, -1 ]
+ }
+]
+instance_group [
+ {
+ count: 1
+    kind: KIND_CPU
+ }
+]
+parameters: {
+ key: "max_beam_width"
+ value: {
+ string_value: "1"
+ }
+}
+parameters: {
+ key: "FORCE_CPU_ONLY_INPUT_TENSORS"
+ value: {
+ string_value: "no"
+ }
+}
+parameters: {
+ key: "gpt_model_type"
+ value: {
+ string_value: "{{ gpt_model_type }}"
+ }
+}
+parameters: {
+ key: "gpt_model_path"
+ value: {
+ string_value: "{{ engine_dir }}"
+ }
+}
+parameters: {
+ key: "max_tokens_in_paged_kv_cache"
+ value: {
+ string_value: ""
+ }
+}
+parameters: {
+ key: "batch_scheduler_policy"
+ value: {
+ string_value: "guaranteed_completion"
+ }
+}
+parameters: {
+ key: "kv_cache_free_gpu_mem_fraction"
+ value: {
+ string_value: ".75"
+ }
+}
+parameters: {
+ key: "max_num_sequences"
+ value: {
+ string_value: ""
+ }
+}
+parameters: {
+ key: "enable_trt_overlap"
+ value: {
+ string_value: ""
+ }
+}
diff --git a/RetrievalAugmentedGeneration/llm-inference-server/model_server/__init__.py b/RetrievalAugmentedGeneration/llm-inference-server/model_server/__init__.py
index 2c4d3d5a..e3475718 100644
--- a/RetrievalAugmentedGeneration/llm-inference-server/model_server/__init__.py
+++ b/RetrievalAugmentedGeneration/llm-inference-server/model_server/__init__.py
@@ -120,6 +120,8 @@ def main(args: argparse.Namespace) -> int:
# print discovered model parameters
_LOGGER.info("Model file format: %s", model.format.name)
_LOGGER.info("World Size: %d", model.world_size)
+ _LOGGER.info("Max input length: %s", args.max_input_length)
+ _LOGGER.info("Max output length: %s", args.max_output_length)
_LOGGER.info("Compute Capability: %s", model.compute_cap)
_LOGGER.info("Quantization: %s", conversion_opts.quantization)
diff --git a/RetrievalAugmentedGeneration/llm-inference-server/model_server/conversion/nemo.py b/RetrievalAugmentedGeneration/llm-inference-server/model_server/conversion/nemo.py
index d0eb10f3..437f3075 100644
--- a/RetrievalAugmentedGeneration/llm-inference-server/model_server/conversion/nemo.py
+++ b/RetrievalAugmentedGeneration/llm-inference-server/model_server/conversion/nemo.py
@@ -32,7 +32,7 @@
_LOGGER = logging.getLogger(__name__)
-def convert(model: Model, _: ConversionOptions) -> None:
+def convert(model: Model, opts: ConversionOptions) -> None:
"""Convert a .nemo formatted model."""
# find the .nemo model file
model_files = glob(os.path.join(model.model_dir, "*.nemo"))
@@ -52,14 +52,6 @@ def convert(model: Model, _: ConversionOptions) -> None:
config = yaml.safe_load(config_file)
config_file.close()
- if config.get("tensor_model_parallel_size", 1) != model.world_size:
- raise ModelServerException(
- f"The provided model has a tensor parallelism of {config.get('tensor_model_parallel_size', 1)} "
- + f"and the server has been requested to use {model.world_size} "
- + "gpus. Please use the NeMo inference container to rezise the parallelism of the model or change "
- + "the model-server's world size."
- )
-
# run the nemo to trt llm conversion
trt_llm_exporter = TensorRTLLM(model_dir=model.engine_dir)
_LOGGER.info(".nemo to TensorRT Conversion started. This will take a few minutes.")
@@ -68,4 +60,6 @@ def convert(model: Model, _: ConversionOptions) -> None:
nemo_checkpoint_path=model_files[0],
model_type=model.family,
n_gpus=model.world_size,
+ max_input_token=opts.max_input_length,
+ max_output_token=opts.max_output_length
)
diff --git a/RetrievalAugmentedGeneration/llm-inference-server/model_server/server.py b/RetrievalAugmentedGeneration/llm-inference-server/model_server/server.py
index a9077bf9..272234ec 100644
--- a/RetrievalAugmentedGeneration/llm-inference-server/model_server/server.py
+++ b/RetrievalAugmentedGeneration/llm-inference-server/model_server/server.py
@@ -47,6 +47,8 @@ def _decoupled_mode(self) -> str:
@property
def _allow_http(self) -> str:
"""Indicate if Triton should allow http connections."""
+ if self._model.format == ModelFormats.NEMO:
+ return "true"
return "true" if self._http else "false"
@property
diff --git a/RetrievalAugmentedGeneration/llm-inference-server/tools/resize_nemo_model.sh b/RetrievalAugmentedGeneration/llm-inference-server/tools/resize_nemo_model.sh
new file mode 100755
index 00000000..fa35fe16
--- /dev/null
+++ b/RetrievalAugmentedGeneration/llm-inference-server/tools/resize_nemo_model.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+set -x
+
+MODEL_STORE="$1"
+MODEL_IN="$2"
+MODEL_IN_DIR=$(cd $(dirname "$MODEL_IN"); pwd)
+MODEL_OUT="$3"
+MODEL_OUT_DIR=$(cd $(dirname "$MODEL_OUT"); pwd)
+TARGET_SIZE="$4"
+
+TRAINING_CONTAINER="nvcr.io/nvaie/nemo-framework-training:23.08.03"
+
+# init
+echo $MODEL_IN " -> " $MODEL_OUT
+cd "$MODEL_STORE"
+mkdir -p "$MODEL_OUT_DIR"
+
+# find tokenizer
+tar xvf $MODEL_IN model_config.yaml
+mv model_config.yaml "$MODEL_OUT_DIR"
+tokenizer=$(grep "tokenizer_model" "$MODEL_OUT_DIR/model_config.yaml" | awk -F: '{
+  print $3 }')
+tar xvf $MODEL_IN $tokenizer
+mv $tokenizer $MODEL_OUT_DIR
+
+# run conversion
+docker run --rm -it --gpus all --ipc host \
+ -v $MODEL_STORE:$MODEL_STORE \
+ -w $MODEL_STORE \
+ $TRAINING_CONTAINER \
+ /usr/bin/python3 \
+ /opt/NeMo/examples/nlp/language_modeling/megatron_change_num_partitions.py \
+ --model_file $MODEL_IN \
+ --target_file $MODEL_OUT \
+ --tensor_model_parallel_size=-1 \
+ --target_tensor_model_parallel_size=$TARGET_SIZE \
+ --pipeline_model_parallel_size=-1 \
+ --target_pipeline_model_parallel_size=1 \
+ --precision=bf16 \
+ --tokenizer_model_path $MODEL_OUT_DIR/$tokenizer
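The script above takes four positional arguments: the model store directory (mounted into the conversion container), the input `.nemo` file, the output `.nemo` file, and the target tensor-parallel size. A usage sketch with placeholder paths, assuming both checkpoints live under the model store:

```
# resize a .nemo checkpoint to tensor parallelism 2 (all paths are placeholders)
cd RetrievalAugmentedGeneration/llm-inference-server
bash tools/resize_nemo_model.sh \
  /home/nvidia/model-store \
  /home/nvidia/model-store/nemotron-in/model.nemo \
  /home/nvidia/model-store/nemotron-tp2/model.nemo \
  2
```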
diff --git a/RetrievalAugmentedGeneration/requirements.txt b/RetrievalAugmentedGeneration/requirements.txt
index b2f9b81a..2a9f6214 100644
--- a/RetrievalAugmentedGeneration/requirements.txt
+++ b/RetrievalAugmentedGeneration/requirements.txt
@@ -1,12 +1,21 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
python-multipart==0.0.6
-langchain==0.0.330
-tritonclient[all]==2.39.0
+langchain==0.0.352
unstructured[all-docs]==0.11.2
sentence-transformers==2.2.2
-llama-index==0.9.13
+llama-index==0.9.22
pymilvus==2.3.1
dataclass-wizard==0.22.2
opencv-python==4.8.0.74
-minio==7.2.0
\ No newline at end of file
+minio==7.2.0
+asyncpg==0.29.0
+psycopg2-binary==2.9.9
+pgvector==0.2.4
+langchain-core==0.1.3
+langchain-nvidia-ai-endpoints==0.0.1
+langchain-nvidia-trt==0.0.1rc0
+nemollm==0.3.4
+opentelemetry-sdk==1.21.0
+opentelemetry-api==1.21.0
+opentelemetry-exporter-otlp-proto-grpc==1.21.0
diff --git a/deploy/compose/compose.env b/deploy/compose/compose.env
index 3ac2946a..52cb1465 100644
--- a/deploy/compose/compose.env
+++ b/deploy/compose/compose.env
@@ -1,20 +1,24 @@
# full path to the local copy of the model weights
# NOTE: This should be an absolute path and not relative path
export MODEL_DIRECTORY="/home/nvidia/llama2_13b_chat_hf_v1/"
-
+# export MODEL_DIRECTORY="/home/nvidia/nemotron-3-8b-chat-4k-sft"
# Fill this out if you dont have a GPU. Leave this empty if you have a local GPU
-export AI_PLAYGROUND_API_KEY="nvapi-*"
+export NVIDIA_API_KEY="nvapi-*"
# flag to enable activation aware quantization for the LLM
# export QUANTIZATION="int4_awq"
-# the architecture of the model. eg: llama
+# the architecture of the model. e.g.: llama, gptnext (use gptnext for nemotron)
export MODEL_ARCHITECTURE="llama"
+
# the name of the model being used - only for displaying on frontend
export MODEL_NAME="Llama-2-13b-chat-hf"
+# the name of the RAG example being used
+export RAG_EXAMPLE="developer_rag"
+
# [OPTIONAL] the maximum number of input tokens
# export MODEL_MAX_INPUT_LENGTH=3000
@@ -29,3 +33,29 @@ export MODEL_NAME="Llama-2-13b-chat-hf"
# [OPTIONAL] the config file for chain server w.r.t. pwd
export APP_CONFIG_FILE=/dev/null
+
+# parameters for PGVector, update these when using the PGVector vector store
+# export POSTGRES_PASSWORD=password
+# export POSTGRES_USER=postgres
+# export POSTGRES_DB=api
+
+### Riva Parameters:
+
+# Riva Speech API URI: Riva Server IP address/hostname and port
+export RIVA_API_URI=""
+
+# [OPTIONAL] Riva Speech API Key
+# If necessary, enter a key to access the Riva API
+export RIVA_API_KEY=""
+
+# [OPTIONAL] Riva Function ID
+# If necessary, enter a function ID to access the Riva API
+export RIVA_FUNCTION_ID=""
+
+# TTS sample rate (Hz)
+export TTS_SAMPLE_RATE=48000
+
+# the config file for the OpenTelemetry collector
+export OPENTELEMETRY_CONFIG_FILE="./configs/otel-collector-config.yaml"
+# the config file for Jaeger
+export JAEGER_CONFIG_FILE="./configs/jaeger.yaml"
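After editing `compose.env`, the file has to be sourced in the same shell that runs Docker Compose so the exported variables reach the containers. A minimal sketch for the default developer RAG example, assuming the repository root as the working directory:

```
cd GenerativeAIExamples
source deploy/compose/compose.env
docker compose -f deploy/compose/docker-compose.yaml build
docker compose -f deploy/compose/docker-compose.yaml up -d
```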
diff --git a/deploy/compose/config.yaml b/deploy/compose/config.yaml
index 19084ca9..0cac6ae7 100644
--- a/deploy/compose/config.yaml
+++ b/deploy/compose/config.yaml
@@ -1,28 +1,33 @@
-milvus:
- # The configuration of the Milvus connection.
+vector_store:
+ # The configuration of the Vector Store connection.
+
+ name: milvus
+  # The name of the vector store database. Can be pgvector or milvus.
+ # Type: str
+ # ENV Variable: APP_VECTORSTORE_NAME
url: "http://milvus:19530"
- # The location of the Milvus Server.
+ # The location of the VectorStore DB.
# Type: str
- # ENV Variable: APP_MILVUS_URL
+ # ENV Variable: APP_VECTORSTORE_URL
llm:
# The configuration for the server hosting the Large Language models.
model_engine: "triton-trt-llm"
- # The backend name hosting the model. Options currently supported are: triton-trt-llm, ai-playground
+ # The backend name hosting the model. Options currently supported are: triton-trt-llm, nv-ai-foundation
# Type: str
# ENV Variable: APP_LLM_MODELENGINE
server_url: "llm:8001"
- # The location of the server hosting the large language model. Use this option when model engine is
- # set to triton-trt-llm, ignore this option if model_engine is set to "ai-playground"
+ # The location of the server hosting the large language model. Use this option when model engine is
+ # set to triton-trt-llm, ignore this option if model_engine is set to "nv-ai-foundation"
# Type: str
# ENV Variable: APP_LLM_SERVERURL
model_name: "ensemble"
# if model_engine is "triton-trt-llm" set this to "ensemble"
- # if model_engine is "ai-plaground" options are "llama2_13b", "llama2_70b", "mistral_7b"
+  # if model_engine is "nv-ai-foundation" options are "llama2_13b", "llama2_70b", "mistral_7b"
# The name of the hosted model.
# Type: str
# ENV Variable: APP_LLM_MODELNAME
@@ -32,6 +37,7 @@ text_splitter:
chunk_size: 510
# Chunk size for text splitting.
+ # When using a token-based text splitter, this is the number of 'tokens per chunk'
# Type: int
chunk_overlap: 200
@@ -42,7 +48,7 @@ embeddings:
# The configuration embedding models.
model_name: intfloat/e5-large-v2
- # The name embedding search model from huggingface or ai-playground.
+  # The name of the embedding search model from huggingface or nv-ai-foundation.
# Type: str
dimensions: 1024
@@ -50,7 +56,7 @@ embeddings:
# Type: int
model_engine: huggingface
- # The backend name hosting the model, huggingface and ai-playground are supported.
+ # The backend name hosting the model, huggingface and nv-ai-foundation are supported.
# Type: str
prompts:
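Each option above also maps to the environment variable named in its comment, so the same settings can be overridden without editing the file. A sketch, assuming the `pgvector` service defined in `docker-compose-pgvector.yaml`:

```
# switch the chain server to the PGVector vector store via environment variables
export APP_VECTORSTORE_NAME="pgvector"
export APP_VECTORSTORE_URL="pgvector:5432"
```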
diff --git a/deploy/compose/configs/jaeger.yaml b/deploy/compose/configs/jaeger.yaml
new file mode 100644
index 00000000..64d3513c
--- /dev/null
+++ b/deploy/compose/configs/jaeger.yaml
@@ -0,0 +1,3 @@
+query.base-path: /jaeger/ui
+cassandra.keyspace: jaeger_v1_dc1
+cassandra.servers: cassandra
\ No newline at end of file
diff --git a/deploy/compose/configs/otel-collector-config.yaml b/deploy/compose/configs/otel-collector-config.yaml
new file mode 100644
index 00000000..69d1cbe4
--- /dev/null
+++ b/deploy/compose/configs/otel-collector-config.yaml
@@ -0,0 +1,17 @@
+receivers:
+ otlp:
+ protocols:
+ grpc:
+ # endpoint: 0.0.0.0:4317
+ http:
+ # endpoint: 0.0.0.0:4318
+exporters:
+ otlp:
+ endpoint: jaeger:4317
+ tls:
+ insecure: true
+service:
+ pipelines:
+ traces:
+ receivers: [otlp]
+ exporters: [otlp]
\ No newline at end of file
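The collector receives OTLP traffic on the default gRPC (4317) and HTTP (4318) ports and forwards all traces to Jaeger. Services opt in through the `OTEL_*` and `ENABLE_TRACING` variables that the compose files already define; shown here as shell exports purely for illustration (in the compose files they live under each service's `environment:` block):

```
export OTEL_EXPORTER_OTLP_ENDPOINT="http://otel-collector:4317"
export OTEL_EXPORTER_OTLP_PROTOCOL="grpc"
export ENABLE_TRACING=true
```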
diff --git a/deploy/compose/docker-compose-evaluation.yaml b/deploy/compose/docker-compose-evaluation.yaml
new file mode 100644
index 00000000..caab7e8f
--- /dev/null
+++ b/deploy/compose/docker-compose-evaluation.yaml
@@ -0,0 +1,22 @@
+services:
+ evaluation:
+ container_name: evaluation
+    image: evaluation:latest
+ build:
+ context: ../../
+ dockerfile: ./tools/evaluation/Dockerfile.eval
+ ports:
+ - "8889:8889"
+ expose:
+ - "8889"
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ count: 1
+ capabilities: [gpu]
+
+networks:
+ default:
+ name: nvidia-llm
\ No newline at end of file
diff --git a/deploy/compose/docker-compose-playground.yaml b/deploy/compose/docker-compose-nemotron.yaml
similarity index 65%
rename from deploy/compose/docker-compose-playground.yaml
rename to deploy/compose/docker-compose-nemotron.yaml
index 2def0c08..ce00a85a 100644
--- a/deploy/compose/docker-compose-playground.yaml
+++ b/deploy/compose/docker-compose-nemotron.yaml
@@ -1,33 +1,47 @@
services:
- jupyter-server:
- container_name: notebook-server
- image: notebook-server:latest
+ llm:
+ container_name: llm-inference-server
+ image: llm-inference-server:latest
build:
- context: ../../
- dockerfile: ./notebooks/Dockerfile.notebooks
+ context: ../.././RetrievalAugmentedGeneration/llm-inference-server/
+ dockerfile: Dockerfile
+ volumes:
+ - ${MODEL_DIRECTORY:?please update the env file and source it before running}:/model
+ command: ${MODEL_ARCHITECTURE:?please update the env file and source it before running} --http --max-input-length ${MODEL_MAX_INPUT_LENGTH:-3000} --max-output-length ${MODEL_MAX_OUTPUT_LENGTH:-512} --quantization ${QUANTIZATION:-None}
ports:
- - "8888:8888"
+ - "8000:8000"
+ - "8001:8001"
+ - "8002:8002"
expose:
- - "8888"
+ - "8000"
+ - "8001"
+ - "8002"
+ shm_size: 20gb
deploy:
resources:
reservations:
devices:
- driver: nvidia
- count: 1
+ device_ids: ["0", "1"]
capabilities: [gpu]
-
- evaluation:
- container_name: evaluation
- image: evaluation:latest
+ healthcheck:
+ test: ["CMD", "curl", "-f", "http://localhost:8000/v2/health/ready"]
+ interval: 30s
+ timeout: 20s
+ retries: 3
+ start_period: 10m
+
+ jupyter-server:
+ container_name: notebook-server
+ image: notebook-server:latest
build:
context: ../../
- dockerfile: ./evaluation/Dockerfile.eval
+ dockerfile: ./notebooks/Dockerfile.notebooks
ports:
- - "8889:8889"
+ - "8888:8888"
expose:
- - "8889"
+ - "8888"
deploy:
resources:
reservations:
@@ -35,6 +49,8 @@ services:
- driver: nvidia
count: 1
capabilities: [gpu]
+ depends_on:
+ - "llm"
etcd:
container_name: milvus-etcd
@@ -60,13 +76,13 @@ services:
MINIO_ACCESS_KEY: minioadmin
MINIO_SECRET_KEY: minioadmin
ports:
- - "9001:9001"
- - "9000:9000"
+ - "9011:9011"
+ - "9010:9010"
volumes:
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
- command: minio server /minio_data --console-address ":9001"
+ command: minio server /minio_data --console-address ":9011" --address ":9010"
healthcheck:
- test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
+ test: ["CMD", "curl", "-f", "http://localhost:9010/minio/health/live"]
interval: 30s
timeout: 20s
retries: 3
@@ -77,7 +93,7 @@ services:
command: ["milvus", "run", "standalone"]
environment:
ETCD_ENDPOINTS: etcd:2379
- MINIO_ADDRESS: minio:9000
+ MINIO_ADDRESS: minio:9010
KNOWHERE_GPU_MEM_POOL_SIZE: 2048:4096
volumes:
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
@@ -107,13 +123,20 @@ services:
build:
context: ../../
dockerfile: ./RetrievalAugmentedGeneration/Dockerfile
+ args:
+ EXAMPLE_NAME: ${RAG_EXAMPLE}
command: --port 8081 --host 0.0.0.0
environment:
- APP_MILVUS_URL: "http://milvus:19530"
- APP_LLM_MODELNAME: "llama2_13b"
- APP_LLM_MODELENGINE: "ai-playground"
+ APP_VECTORSTORE_URL: "http://milvus:19530"
+ APP_VECTORSTORE_NAME: "milvus"
+ APP_LLM_SERVERURL: "llm:8001"
+ APP_LLM_MODELNAME: ensemble
+ APP_LLM_MODELENGINE: triton-trt-llm
APP_CONFIG_FILE: ${APP_CONFIG_FILE}
NVAPI_KEY: ${AI_PLAYGROUND_API_KEY}
+ OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
+ OTEL_EXPORTER_OTLP_PROTOCOL: grpc
+ ENABLE_TRACING: false
volumes:
- ${APP_CONFIG_FILE}:${APP_CONFIG_FILE}
ports:
@@ -135,6 +158,7 @@ services:
# retries: 3
depends_on:
- "milvus"
+ - "llm"
frontend:
container_name: llm-playground
@@ -147,6 +171,13 @@ services:
APP_SERVERURL: http://query
APP_SERVERPORT: 8081
APP_MODELNAME: ${MODEL_NAME:-${MODEL_ARCHITECTURE}}
+ OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
+ OTEL_EXPORTER_OTLP_PROTOCOL: grpc
+ ENABLE_TRACING: false
+ RIVA_API_URI: ${RIVA_API_URI}
+ RIVA_API_KEY: ${RIVA_API_KEY}
+ RIVA_FUNCTION_ID: ${RIVA_FUNCTION_ID}
+ TTS_SAMPLE_RATE: ${TTS_SAMPLE_RATE}
ports:
- "8090:8090"
expose:
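This compose file pins the inference server to GPUs 0 and 1, adds a Triton readiness healthcheck, and makes the notebook and chain servers depend on the `llm` service. Once the stack is up, the same endpoint the healthcheck polls can be probed by hand; a sketch:

```
source deploy/compose/compose.env
docker compose -f deploy/compose/docker-compose-nemotron.yaml build
docker compose -f deploy/compose/docker-compose-nemotron.yaml up -d
# the readiness endpoint polled by the compose healthcheck
curl -f http://localhost:8000/v2/health/ready
```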
diff --git a/deploy/compose/docker-compose-nv-ai-foundation.yaml b/deploy/compose/docker-compose-nv-ai-foundation.yaml
new file mode 100644
index 00000000..1ec0e74f
--- /dev/null
+++ b/deploy/compose/docker-compose-nv-ai-foundation.yaml
@@ -0,0 +1,55 @@
+services:
+
+ query:
+ container_name: chain-server
+ image: chain-server:latest
+ build:
+ context: ../../
+ dockerfile: ./RetrievalAugmentedGeneration/Dockerfile
+ args:
+ EXAMPLE_NAME: ${RAG_EXAMPLE}
+ command: --port 8081 --host 0.0.0.0
+ environment:
+ APP_LLM_MODELNAME: mixtral_8x7b
+ APP_LLM_MODELENGINE: nv-ai-foundation
+ APP_EMBEDDINGS_MODELNAME: nvolveqa_40k
+ APP_EMBEDDINGS_MODELENGINE: nv-ai-foundation
+ APP_TEXTSPLITTER_CHUNKSIZE: 2000
+ APP_TEXTSPLITTER_CHUNKOVERLAP: 200
+ APP_PROMPTS_CHATTEMPLATE: "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature."
+ APP_PROMPTS_RAGTEMPLATE: "You are a helpful AI assistant named Envie. You will reply to questions only based on the context that you are provided. If something is out of context, you will refrain from replying and politely decline to respond to the user."
+ NVIDIA_API_KEY: ${NVIDIA_API_KEY}
+ APP_CONFIG_FILE: ${APP_CONFIG_FILE}
+ volumes:
+ - ${APP_CONFIG_FILE}:${APP_CONFIG_FILE}
+ ports:
+ - "8081:8081"
+ expose:
+ - "8081"
+ shm_size: 5gb
+
+ frontend:
+ container_name: llm-playground
+ image: llm-playground:latest
+ build:
+ context: ../.././RetrievalAugmentedGeneration/frontend/
+ dockerfile: Dockerfile
+ command: --port 8090
+ environment:
+ APP_SERVERURL: http://query
+ APP_SERVERPORT: 8081
+ APP_MODELNAME: ${MODEL_NAME:-${MODEL_ARCHITECTURE}}
+ RIVA_API_URI: ${RIVA_API_URI}
+ RIVA_API_KEY: ${RIVA_API_KEY}
+ RIVA_FUNCTION_ID: ${RIVA_FUNCTION_ID}
+ TTS_SAMPLE_RATE: ${TTS_SAMPLE_RATE}
+ ports:
+ - "8090:8090"
+ expose:
+ - "8090"
+ depends_on:
+ - query
+
+networks:
+ default:
+ name: nvidia-llm
diff --git a/deploy/compose/docker-compose-observability.yaml b/deploy/compose/docker-compose-observability.yaml
new file mode 100644
index 00000000..cc2e11a0
--- /dev/null
+++ b/deploy/compose/docker-compose-observability.yaml
@@ -0,0 +1,47 @@
+services:
+ otel-collector:
+ container_name: otel-collector
+ image: otel/opentelemetry-collector:0.88.0
+ restart: always
+ command: ["--config=/etc/otel-collector-config.yaml"]
+ volumes:
+ - ${OPENTELEMETRY_CONFIG_FILE}:/etc/otel-collector-config.yaml
+
+ jaeger:
+ image: jaegertracing/all-in-one:1.52
+ container_name: jaeger
+ command:
+ - "--config-file=/etc/jaeger.yaml"
+ environment:
+ - SPAN_STORAGE_TYPE=cassandra
+ deploy:
+ resources:
+ limits:
+ memory: 300M
+ restart: always
+ ports:
+ - "16686:16686"
+ - "4317"
+ - "4318"
+ expose:
+ - "4318"
+ - "4317"
+ volumes:
+ - ${JAEGER_CONFIG_FILE}:/etc/jaeger.yaml
+ depends_on:
+ - cassandra-schema
+
+ cassandra:
+ image: cassandra:4.0
+ container_name: cassandra
+ volumes:
+ - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/cassandra:/var/lib/cassandra
+
+ cassandra-schema:
+ image: jaegertracing/jaeger-cassandra-schema
+ depends_on:
+ - cassandra
+
+networks:
+ default:
+ name: nvidia-llm
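The observability stack attaches to the shared `nvidia-llm` network next to any of the RAG compose files; traces end up in Jaeger, whose UI is published on port 16686 above. A sketch of bringing it up:

```
source deploy/compose/compose.env
docker compose -f deploy/compose/docker-compose-observability.yaml up -d
# the Jaeger UI is then reachable at http://<host-ip>:16686
```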
diff --git a/deploy/compose/docker-compose-pgvector.yaml b/deploy/compose/docker-compose-pgvector.yaml
new file mode 100644
index 00000000..9ad8ff56
--- /dev/null
+++ b/deploy/compose/docker-compose-pgvector.yaml
@@ -0,0 +1,111 @@
+services:
+
+ llm:
+ container_name: llm-inference-server
+ image: llm-inference-server:latest
+ build:
+ context: ../.././RetrievalAugmentedGeneration/llm-inference-server/
+ dockerfile: Dockerfile
+ volumes:
+ - ${MODEL_DIRECTORY:?please update the env file and source it before running}:/model
+ command: ${MODEL_ARCHITECTURE:?please update the env file and source it before running} --max-input-length ${MODEL_MAX_INPUT_LENGTH:-3000} --max-output-length ${MODEL_MAX_OUTPUT_LENGTH:-512} --quantization ${QUANTIZATION:-None}
+ ports:
+ - "8000:8000"
+ - "8001:8001"
+ - "8002:8002"
+ expose:
+ - "8000"
+ - "8001"
+ - "8002"
+ shm_size: 20gb
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ count: ${INFERENCE_GPU_COUNT:-all}
+ capabilities: [gpu]
+ healthcheck:
+ test: ["CMD", "curl", "-f", "http://localhost:8000/v2/health/ready"]
+ interval: 30s
+ timeout: 20s
+ retries: 3
+ start_period: 10m
+
+ pgvector:
+ container_name: pgvector
+ image: ankane/pgvector:v0.5.1
+ ports:
+ - 5432:5432
+ expose:
+ - "5432"
+ volumes:
+ - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/data:/var/lib/postgresql/data
+ environment:
+ - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-password}
+ - POSTGRES_USER=${POSTGRES_USER:-postgres}
+ - POSTGRES_DB=${POSTGRES_DB:-api}
+
+ query:
+ container_name: chain-server
+ image: chain-server:latest
+ build:
+ context: ../../
+ dockerfile: ./RetrievalAugmentedGeneration/Dockerfile
+ args:
+ EXAMPLE_NAME: ${RAG_EXAMPLE}
+ command: --port 8081 --host 0.0.0.0
+ environment:
+ APP_VECTORSTORE_URL: "pgvector:5432"
+ APP_VECTORSTORE_NAME: "pgvector"
+ APP_LLM_SERVERURL: "llm:8001"
+ APP_LLM_MODELNAME: "ensemble"
+ APP_LLM_MODELENGINE: "triton-trt-llm"
+ APP_CONFIG_FILE: ${APP_CONFIG_FILE}
+ NVAPI_KEY: ${AI_PLAYGROUND_API_KEY}
+ POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-password}
+ POSTGRES_USER: ${POSTGRES_USER:-postgres}
+ POSTGRES_DB: ${POSTGRES_DB:-api}
+ volumes:
+ - ${APP_CONFIG_FILE}:${APP_CONFIG_FILE}
+ ports:
+ - "8081:8081"
+ expose:
+ - "8081"
+ shm_size: 5gb
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ count: 1
+ capabilities: [gpu]
+ depends_on:
+ - "pgvector"
+ - "llm"
+
+ frontend:
+ container_name: llm-playground
+ image: llm-playground:latest
+ build:
+ context: ../.././RetrievalAugmentedGeneration/frontend/
+ dockerfile: Dockerfile
+ command: --port 8090
+ environment:
+ APP_SERVERURL: http://query
+ APP_SERVERPORT: 8081
+ APP_MODELNAME: ${MODEL_NAME:-${MODEL_ARCHITECTURE}}
+ RIVA_API_URI: ${RIVA_API_URI}
+ RIVA_API_KEY: ${RIVA_API_KEY}
+ RIVA_FUNCTION_ID: ${RIVA_FUNCTION_ID}
+ TTS_SAMPLE_RATE: ${TTS_SAMPLE_RATE}
+ ports:
+ - "8090:8090"
+ expose:
+ - "8090"
+ depends_on:
+ - query
+
+networks:
+ default:
+ name: nvidia-llm
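A quick way to confirm that the `pgvector` container accepted the credentials from `compose.env` is to list its databases from inside the container; a sketch, assuming the default `postgres`/`api` values:

```
source deploy/compose/compose.env
docker compose -f deploy/compose/docker-compose-pgvector.yaml build
docker compose -f deploy/compose/docker-compose-pgvector.yaml up -d
# sanity check using the defaults from compose.env (POSTGRES_USER=postgres, POSTGRES_DB=api)
docker exec -it pgvector psql -U postgres -d api -c '\l'
```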
diff --git a/deploy/compose/docker-compose.yaml b/deploy/compose/docker-compose.yaml
index 52c675e0..ac60a34b 100644
--- a/deploy/compose/docker-compose.yaml
+++ b/deploy/compose/docker-compose.yaml
@@ -52,26 +52,6 @@ services:
depends_on:
- "llm"
- evaluation:
- container_name: evaluation
- image: evalulation:latest
- build:
- context: ../../
- dockerfile: ./evaluation/Dockerfile.eval
- ports:
- - "8889:8889"
- expose:
- - "8889"
- deploy:
- resources:
- reservations:
- devices:
- - driver: nvidia
- count: 1
- capabilities: [gpu]
- depends_on:
- - "llm"
-
etcd:
container_name: milvus-etcd
image: quay.io/coreos/etcd:v3.5.5
@@ -96,13 +76,13 @@ services:
MINIO_ACCESS_KEY: minioadmin
MINIO_SECRET_KEY: minioadmin
ports:
- - "9001:9001"
- - "9000:9000"
+ - "9011:9011"
+ - "9010:9010"
volumes:
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
- command: minio server /minio_data --console-address ":9001"
+ command: minio server /minio_data --console-address ":9011" --address ":9010"
healthcheck:
- test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
+ test: ["CMD", "curl", "-f", "http://localhost:9010/minio/health/live"]
interval: 30s
timeout: 20s
retries: 3
@@ -113,7 +93,7 @@ services:
command: ["milvus", "run", "standalone"]
environment:
ETCD_ENDPOINTS: etcd:2379
- MINIO_ADDRESS: minio:9000
+ MINIO_ADDRESS: minio:9010
KNOWHERE_GPU_MEM_POOL_SIZE: 2048:4096
volumes:
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
@@ -143,14 +123,20 @@ services:
build:
context: ../../
dockerfile: ./RetrievalAugmentedGeneration/Dockerfile
+ args:
+ EXAMPLE_NAME: ${RAG_EXAMPLE}
command: --port 8081 --host 0.0.0.0
environment:
- APP_MILVUS_URL: "http://milvus:19530"
+ APP_VECTORSTORE_URL: "http://milvus:19530"
+ APP_VECTORSTORE_NAME: "milvus"
APP_LLM_SERVERURL: "llm:8001"
APP_LLM_MODELNAME: ensemble
APP_LLM_MODELENGINE: triton-trt-llm
APP_CONFIG_FILE: ${APP_CONFIG_FILE}
NVAPI_KEY: ${AI_PLAYGROUND_API_KEY}
+ OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
+ OTEL_EXPORTER_OTLP_PROTOCOL: grpc
+ ENABLE_TRACING: false
volumes:
- ${APP_CONFIG_FILE}:${APP_CONFIG_FILE}
ports:
@@ -185,6 +171,13 @@ services:
APP_SERVERURL: http://query
APP_SERVERPORT: 8081
APP_MODELNAME: ${MODEL_NAME:-${MODEL_ARCHITECTURE}}
+ OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
+ OTEL_EXPORTER_OTLP_PROTOCOL: grpc
+ ENABLE_TRACING: false
+ RIVA_API_URI: ${RIVA_API_URI}
+ RIVA_API_KEY: ${RIVA_API_KEY}
+ RIVA_FUNCTION_ID: ${RIVA_FUNCTION_ID}
+ TTS_SAMPLE_RATE: ${TTS_SAMPLE_RATE}
ports:
- "8090:8090"
expose:
diff --git a/deploy/compose/nemotron_config.yaml b/deploy/compose/nemotron_config.yaml
index 53f1e041..658408f2 100644
--- a/deploy/compose/nemotron_config.yaml
+++ b/deploy/compose/nemotron_config.yaml
@@ -29,6 +29,7 @@ text_splitter:
chunk_size: 510
# Chunk size for text splitting.
+ # When using a token-based text splitter, this is the number of 'tokens per chunk'
# Type: int
chunk_overlap: 200
diff --git a/deploy/k8s-operator/kube-trailblazer/helm-charts/staging/rag-llm-pipeline/templates/milvus-minio.yaml b/deploy/k8s-operator/kube-trailblazer/helm-charts/staging/rag-llm-pipeline/templates/milvus-minio.yaml
index 12ad1204..fc302398 100644
--- a/deploy/k8s-operator/kube-trailblazer/helm-charts/staging/rag-llm-pipeline/templates/milvus-minio.yaml
+++ b/deploy/k8s-operator/kube-trailblazer/helm-charts/staging/rag-llm-pipeline/templates/milvus-minio.yaml
@@ -22,15 +22,15 @@ spec:
- server
- /minio_data
- --console-address
- - :9001
- env:
+ - :9011
+ env:
- name: MINIO_ACCESS_KEY
value: minioadmin
- name: MINIO_SECRET_KEY
value: minioadmin
ports:
- - containerPort: 9001
- - containerPort: 9000
+ - containerPort: 9011
+ - containerPort: 9010
volumeMounts:
- mountPath: /minio_data
name: minio-data
@@ -38,8 +38,8 @@ spec:
exec:
command:
- curl
- - -f
- - http://localhost:9000/minio/health/live
+ - -f
+ - http://localhost:9010/minio/health/live
initialDelaySeconds: 20
periodSeconds: 5
volumes:
@@ -57,6 +57,6 @@ spec:
app.kubernetes.io/name: milvus-minio
ports:
- protocol: TCP
- port: 9000
- targetPort: 9000
+ port: 9010
+ targetPort: 9010
diff --git a/deploy/k8s-operator/kube-trailblazer/helm-charts/staging/rag-llm-pipeline/templates/milvus-standalone.yaml b/deploy/k8s-operator/kube-trailblazer/helm-charts/staging/rag-llm-pipeline/templates/milvus-standalone.yaml
index 2873239e..1b9ab847 100644
--- a/deploy/k8s-operator/kube-trailblazer/helm-charts/staging/rag-llm-pipeline/templates/milvus-standalone.yaml
+++ b/deploy/k8s-operator/kube-trailblazer/helm-charts/staging/rag-llm-pipeline/templates/milvus-standalone.yaml
@@ -18,32 +18,32 @@ spec:
- name: milvus-standalone
image: milvusdb/milvus:v2.3.1-gpu
command:
- - /tini
+ - /tini
- --
- milvus
- run
- standalone
- env:
+ env:
- name: ETCD_ENDPOINTS
value: milvus-etcd:2379
- name: KNOWHERE_GPU_MEM_POOL_SIZE
value: 2048:4096
- name: MINIO_ADDRESS
- value: milvus-minio:9000
- ports:
+ value: milvus-minio:9010
+ ports:
- containerPort: 19530
- containerPort: 9091
readinessProbe:
exec:
command:
- curl
- - -f
+ - -f
- http://localhost:9091/healthz
initialDelaySeconds: 20
periodSeconds: 5
resources:
limits:
- {{ .Values.milvus.gpu.type }}: {{ .Values.milvus.gpu.count }}
+ {{ .Values.milvus.gpu.type }}: {{ .Values.milvus.gpu.count }}
---
apiVersion: v1
kind: Service
diff --git a/deploy/k8s-operator/kube-trailblazer/pkg/helmer/controller/test.yaml b/deploy/k8s-operator/kube-trailblazer/pkg/helmer/controller/test.yaml
index 3e9d4d08..7c6f21f2 100644
--- a/deploy/k8s-operator/kube-trailblazer/pkg/helmer/controller/test.yaml
+++ b/deploy/k8s-operator/kube-trailblazer/pkg/helmer/controller/test.yaml
@@ -2,16 +2,12 @@
- repoEntry:
name: "zvonkok"
url: "https://zvonkok.github.io/helm-charts/"
- #username: "zvonkok"
- #password: "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"
- #pass_credentials_all: true
- #insecure_skip_tls_verify: true
chartSpec:
release: "flannel"
chart: "zvonkok/flannel"
namespace: "flannel"
version: "v0.23.0"
-
+
- repoEntry:
name: "nfd"
url: "https://kubernetes-sigs.github.io/node-feature-discovery/charts"
@@ -22,7 +18,7 @@
version: "0.14.3"
chartValues:
kernelVersion: "{{ tpl .Values.runtime.kernelVersiosn }}" # {{ tpl .Values.chartValues.kernelVersion . }}
-
+
- repoEntry:
name: "nvidia"
url: "https://helm.ngc.nvidia.com/nvidia"
diff --git a/docs/README.md b/docs/README.md
index 787a6a8f..47dc3a83 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -6,40 +6,43 @@ The RAG documentation is divided into the following sections:
- [Getting Started](#getting-started)
- [User Guides](#user-guides)
- [Architecture Guide](#architecture-guide)
- - [Evaluation Tools](#evaluation-tools)
- - [Other](#other)
+ - [Evaluation Tool](#evaluation-tool)
+ - [Observability Tool](#observability-tool)
+ - [Others](#others)
## Getting Started
-This section will help you get started quickly with the sample RAG example.
-
-* [Installation guide](../RetrievalAugmentedGeneration/README.md#prerequisites): This guide walks you through the process of setting up your environment and utilizing the
-* [Getting Started guides](../RetrievalAugmentedGeneration/README.md#getting-started): A series of quick start steps that will help you to understand the core concepts and start the pipeline quickly. These guides include Jupyter notebooks that you can experiment with.
+* [Getting Started guides](../RetrievalAugmentedGeneration/README.md): A series of quick start steps that will help you understand the core concepts and start the pipeline quickly for the different examples and use cases provided in this repository. These guides also include Jupyter notebooks that you can experiment with.
## User Guides
-The user guides cover the core details of the provided example and how to configure and use different features to make your own chains.
+The user guides cover the core details of the provided canonical developer RAG example and how to configure and use different features to make your own chains.
* [LLM Inference Server](./rag/llm_inference_server.md): Learn about the service which accelerates LLM inference time using TRT-LLM.
-* [Integration with Nvidia AI Playground](./rag/aiplayground.md): Understand how to access **NVIDIA AI Playground** on NGC which allows developers to experience state of the art LLMs accelerated on NVIDIA DGX Cloud with NVIDIA TensorRT nd Triton Inference Server.
+* [Integration with Nvidia AI Playground](./rag/aiplayground.md): Understand how to access **NVIDIA AI Playground** on NGC which allows developers to experience state of the art LLMs and embedding models accelerated on NVIDIA DGX Cloud with NVIDIA TensorRT and Triton Inference Server.
* [Configuration Guide](./rag/configuration.md): The complete guide to all the configuration options available in the `config.yaml` file.
-* [Frontend](./rag/frontend.md): Learn more about the sample playground provided as part of the workflow.
-* [Chat Server Guide](./rag/chat_server.md): Learn about the chat server which exposes core API's for end user.
-* [Jupyter Server Guide](./rag/jupyter_server.md): Learn about the different notebooks available and the server which can be used to access them.
+* [Frontend](./rag/frontend.md): Learn more about the sample playground provided as part of the workflow used by all the examples.
+* [Chat Server Guide](./rag/chat_server.md): Learn about the chat server which exposes the core APIs for the end user. All the examples are deployed behind these standardized APIs.
+* [Notebooks Guide](./rag/jupyter_server.md): Learn about the different notebooks available and the server which can be used to access them.
## Architecture Guide
-This guide sheds more light on the infrastructure details and the execution flow for a query when the runtime is used:
+This guide sheds more light on the infrastructure details and the execution flow for a query when the runtime is used for the default canonical RAG example:
* [Architecture](./rag/architecture.md): Understand the architecture of the sample RAG workflow.
-## Evaluation Tools
+## Evaluation Tool
-The sample RAG worlflow provides a set of evaluation pipelines via notebooks which developers can use for benchmarking.
+The sample RAG workflow provides a set of evaluation pipelines via notebooks which developers can use for benchmarking the default canonical RAG example.
There are also detailed guides on how to reproduce results and create datasets for the evaluation.
-* [RAG Evaluation](../evaluation/README.md): Understand the different notebooks available.
+* [RAG Evaluation](./rag/evaluation.md): Understand the different notebooks available.
+
+## Observability Tool
+
+Observability is a crucial aspect that facilitates the monitoring and comprehension of the internal state and behavior of a system or application.
+* [Observability tool](./rag/observability.md): Understand the observability tool and the steps to deploy it.
-## Other
+## Others
* [Support Matrix](./rag/support_matrix.md)
* [Open API schema references](./rag/api_reference/openapi_schema.json)
diff --git a/docs/developer-llm-operator/install.md b/docs/developer-llm-operator/install.md
index bafc570b..ba3df516 100644
--- a/docs/developer-llm-operator/install.md
+++ b/docs/developer-llm-operator/install.md
@@ -55,15 +55,15 @@ NVIDIA container runtime on the Kubernetes node.
```console
$ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \
- && helm repo update
+ && helm repo update
```
1. Install the Operator:
```console
$ helm install --wait --generate-name \
- -n gpu-operator --create-namespace \
- nvidia/gpu-operator
+ -n gpu-operator --create-namespace \
+ nvidia/gpu-operator
```
1. Optional: Configure GPU time-slicing if you have fewer than four GPUs.
@@ -106,14 +106,12 @@ NVIDIA container runtime on the Kubernetes node.
- Verify that at least `4` GPUs are allocatable:
```console
- $ kubectl get nodes -l nvidia.com/gpu.present -o json | jq '.items[0].status.allocatable |
- with_entries(select(.key | startswith("nvidia.com/"))) |
- with_entries(select(.value != "0"))'
+ $ kubectl get nodes -l nvidia.com/gpu.present -o json | jq '.items[0].status.allocatable | with_entries(select(.key | startswith("nvidia.com/"))) | with_entries(select(.value != "0"))'
```
*Example Output*
- ```output
+ ```json
{
"nvidia.com/gpu": "4"
}
@@ -242,8 +240,7 @@ in the NVIDIA GPU Operator documentation.
- View the logs from the Operator controller pod:
```console
- $ kubectl logs -n kube-trailblazer-system \
- $(kubectl get pod -n kube-trailblazer-system -o=jsonpath='{.items[0].metadata.name}')
+ $ kubectl logs -n kube-trailblazer-system $(kubectl get pod -n kube-trailblazer-system -o=jsonpath='{.items[0].metadata.name}')
```
- View the pods in the pipeline namespace:
@@ -280,7 +277,7 @@ in the NVIDIA GPU Operator documentation.
llm ClusterIP 10.107.213.112 8001/TCP 22h
milvus ClusterIP 10.102.86.183 19530/TCP 22h
milvus-etcd ClusterIP 10.109.74.142 2379/TCP 22h
- milvus-minio ClusterIP 10.103.238.28 9000/TCP 22h
+ milvus-minio ClusterIP 10.103.238.28 9010/TCP 22h
query ClusterIP 10.110.199.69 8081/TCP 22h
```
diff --git a/docs/developer-llm-operator/uninstall.md b/docs/developer-llm-operator/uninstall.md
new file mode 100644
index 00000000..5cd096a7
--- /dev/null
+++ b/docs/developer-llm-operator/uninstall.md
@@ -0,0 +1,50 @@
+
+
+# Uninstalling the Operator
+
+To uninstall the Operator, perform the following steps:
+
+1. Delete the RAG pipeline:
+
+ ```console
+ $ kubectl delete helmpipeline -n kube-trailblazer-system rag-llm-pipeline
+ ```
+
+ *Example Output*
+
+ ```output
+ helmpipeline.package.nvidia.com "rag-llm-pipeline" deleted
+ ```
+
+1. Optional: Delete the namespace for the RAG pipeline:
+
+ ```console
+ $ kubectl delete namespace rag-llm-pipeline
+ ```
+
+1. Uninstall the Operator:
+
+ ```console
+ $ helm delete -n kube-trailblazer-system $(helm list -n kube-trailblazer-system | grep developer-llm-operator | awk '{print $1}')
+ ```
+
+ *Example Output*
+
+ ```output
+ release "developer-llm-operator-0-1705070979" uninstalled
+ ```
diff --git a/docs/rag/aiplayground.md b/docs/rag/aiplayground.md
index 5ea66bd9..050bc159 100644
--- a/docs/rag/aiplayground.md
+++ b/docs/rag/aiplayground.md
@@ -1,104 +1,62 @@
-# NVIDIA AI Playground
+# NVIDIA AI Foundation
-**NVIDIA AI Playground** on NGC allows developers to experience state of the art LLMs accelerated on NVIDIA DGX Cloud with NVIDIA TensorRT nd Triton Inference Server. Developers get **free credits for 10K requests** to any of the available models. Sign up process is easy.
+**NVIDIA AI Foundation** lets developers experience state-of-the-art LLMs accelerated by NVIDIA. Developers get **free credits for 10K requests** to any of the available models.
-**Setup**
+## Prepare the environment
-Please follow the instruction below to get access to AI playground API key
+1. Navigate to https://catalog.ngc.nvidia.com/ai-foundation-models.
-* Navigate to https://catalog.ngc.nvidia.com/ai-foundation-models
-* Select any of the available models and click on learn more
+2. Find the Mixtral 8x7B model icon and click ``Learn More``.
-![Diagram](./images/image5.png)
+![Diagram](./images/image7.png)
-* Select the ```API``` navigation bar and click on the ```Generate key``` option as shown below.
+3. Select the ```API``` navigation bar and click on the ```Generate key``` option.
-![Diagram](./images/image6.png)
+![Diagram](./images/image8.png)
-* Copy the generated key over to a safe place.
+4. Save the generated API key.
+## Deploy
-## Using Nvdia Cloud based LLM's
+1. Clone the Generative AI examples Git repository.
-#### Step 1: Sign up to AI playground
+> ⚠️ **NOTE**: This example requires Git Large File Support (LFS)
-- Follow the [above](#nvidia-ai-playground) instructions to get access to an API key.
-
-#### Step 2: Set Environment Variables
-
-- Modify ``compose.env`` in the ``deploy/compose`` directory to set your environment variables. The following variable is required.
```
- export AI_PLAYGROUND_API_KEY="nvapi-*"
+$ sudo apt -y install git-lfs
+$ git clone git@github.com:NVIDIA/GenerativeAIExamples.git
+Cloning into 'GenerativeAIExamples'...
+$ cd GenerativeAIExamples/
+$ git lfs pull
```
-#### Step 3: Build and Start Containers
-- Pull lfs files. This will pull large files from repository.
- ```
- git lfs pull
- ```
-- Run the following command to build containers.
- ```
- source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose-playground.yaml build
- ```
-
-- Run the following command to start containers.
- ```
- source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose-playground.yaml up -d
- ```
-
-#### Step 4: Try out queries with the deployed pipeline
-- Interact with the pipeline using UI as as mentioned [here.](../../RetrievalAugmentedGeneration/README.md#step-4-run-the-sample-web-application)
-
-- Example [notebook 6](../../notebooks/06_AI_playground.ipynb) showcases the usage of AI Playground based LLM. You can access the notebook server at `http://host-ip:8888` from your web browser.
-
-
-## Using Nvidia Cloud based Embedding models
+2. Add your NGC API key to compose.env to use the NVIDIA endpoint.
-#### Step 1: Sign up to AI playground
-
-- Follow the [above](#nvidia-ai-playground) instructions to get access to an API key.
+```
+$ cd GenerativeAIExamples
-#### Step 2: Set Environment Variables
+$ grep NVIDIA_API_KEY deploy/compose/compose.env
+ export NVIDIA_API_KEY="nvapi-*"
+```
-- Modify ``compose.env`` in the ``deploy/compose`` directory to set your environment variables. The following variables are required. Provide your API key for NV playground and absolute path to [config.yaml](../../deploy/compose/config.yaml) file.
+3. Set the nv-ai-foundation example in compose.env.
```
- export AI_PLAYGROUND_API_KEY="YOUR_NV_PLAYGROUND_API_KEY"
- export APP_CONFIG_FILE="ABSOLUTE PATH TO config.yaml"
+ export RAG_EXAMPLE="nvidia_ai_foundation"
```
+4. Deploy the RAG example via Docker Compose.
-If you want to use the on-prem deployed LLM model provide the values of below variables as well:
```
- # full path to the local copy of the model weights
- export MODEL_DIRECTORY="PATH TO MODEL CHECKPOINT DIrECTORY"
+$ source deploy/compose/compose.env ; docker compose -f deploy/compose/docker-compose-nv-ai-foundation.yaml build
- # the architecture of the model. eg: llama
- export MODEL_ARCHITECTURE="llama"
+$ docker compose -f deploy/compose/docker-compose-nv-ai-foundation.yaml up -d
- # the name of the model being used - only for displaying on frontend
- export MODEL_NAME="llama-2-13b-chat"
+$ docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}"
+CONTAINER ID NAMES STATUS
+70ef27ae4c91 llm-playground Up 56 seconds
+4aacfbe89464 chain-server Up 56 seconds
```
-#### Step 3: Update Config file
-- Update the embedding model name and model engine in [config.yaml](../../deploy/compose/config.yaml)
-
- ```
- embeddings:
- model_name: nvolve
- model_engine: ai-playground
- ```
+## Test
-#### Step 4: Build and Start Containers
-- Run the following command to build containers and start container if you want to use on-prem LLM model with playground based embedding model.
- ```
- source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose.yaml build
- docker compose -f deploy/compose/docker-compose.yaml up -d
- ```
-
-Alternatively, run the following command to build and start the containers if you want to use playground based LLM model with playground based embedding model.
-```
- source deploy/compose/compose.env; docker compose -f deploy/compose/docker-compose-playground.yaml build
- docker compose -f deploy/compose/docker-compose-playground.yaml up -d
-```
+1. Follow steps 1 - 5 in the ["Test" section of example 02](../../RetrievalAugmentedGeneration/README.md#23-test).
-#### Step 5: Try out queries with the deployed pipeline
-- Interact with the pipeline using UI by following the steps mentioned [here.](../../RetrievalAugmentedGeneration/README.md#step-4-run-the-sample-web-application)
\ No newline at end of file
diff --git a/docs/rag/architecture.md b/docs/rag/architecture.md
index 069c8459..a498e597 100644
--- a/docs/rag/architecture.md
+++ b/docs/rag/architecture.md
@@ -10,7 +10,10 @@ Generative AI starts with foundational models trained on vast quantities of unla
To create true business value from LLMs, these foundational models need to be tailored to your enterprise use case. In this workflow, we use [RAG](https://blog.langchain.dev/tutorial-chatgpt-over-your-data/) with [Llama2](https://github.com/facebookresearch/llama/), an open source model from Meta, to achieve this. Augmenting an existing AI foundational model provides an advanced starting point and a low-cost solution that enterprises can leverage to generate accurate and clear responses to their specific use case.
-This RAG-based reference chatbot workflow contains:
+> ⚠️ **NOTE**:
+This repository contains multiple examples. The architecture for the default canonical developer RAG example is described below.
+
+This default RAG-based reference chatbot workflow contains:
- [NVIDIA NeMo framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/index.html) - part of NVIDIA AI Enterprise solution
- [NVIDIA TensorRT-LLM](https://developer.nvidia.com/tensorrt) - for low latency and high throughput inference for LLMs
diff --git a/docs/rag/chat_server.md b/docs/rag/chat_server.md
index c0c68a18..224b92d9 100644
--- a/docs/rag/chat_server.md
+++ b/docs/rag/chat_server.md
@@ -1,7 +1,7 @@
# Chat Server
A sample fastapi based server is provided in the workflow so that you can test the chat system in an interactive manner.
-This server wraps calls made to different components and orchestrates the entire flow.
+This server wraps calls made to different components and orchestrates the entire flow for all the provided examples.
This API endpoint allows for several actions:
- [Chat Server](#chat-server)
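As a rough illustration of the core APIs this guide documents, the chain server published on port 8081 can be exercised with curl. The route and payload fields below are assumptions for illustration only; the OpenAPI schema referenced from docs/README.md is the authoritative contract:

```
# illustrative only: the endpoint name and fields are assumptions, check the OpenAPI schema
curl -X POST "http://<host-ip>:8081/generate" \
  -H "Content-Type: application/json" \
  -d '{"question": "How do I deploy the RAG example?", "use_knowledge_base": false}'
```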
diff --git a/docs/rag/configuration.md b/docs/rag/configuration.md
index 2d1eaf76..2d7aa6b9 100644
--- a/docs/rag/configuration.md
+++ b/docs/rag/configuration.md
@@ -2,14 +2,16 @@
### Chain Server Configuration
-In this section, we explore the configurations for the [Chain Server](./chat_server.md). Chain server interaction with other components can be controlled by config. Chain Server interacts with components such as the `milvus` vector store and `triton` server, which hosts the Large Language Model (LLM). Additionally, we'll delve into customization options to fine-tune the behavior of the query server. These options include settings for the embedding model, chunk size, and prompts for generating responses.
+In this section, we explore the configurations for the [Chain Server](./chat_server.md) used by the default canonical developer RAG example.
+
+The Chain Server's interaction with other components is controlled through this config. The Chain Server interacts with components such as the `milvus` vector store and the `triton` server, which hosts the Large Language Model (LLM). Additionally, we'll delve into customization options that fine-tune the behavior of the query server, including settings for the embedding model, chunk size, and the prompts used to generate responses.
You can refer to [sample config](../../deploy/compose/config.yaml) to see the structure.
-#### Milvus Configuration
-`Milvus` serves as a vector database for storing embeddings.
+#### Vector Database Configuration
+The configuration of the component that serves as the vector database for storing embeddings.
- url: Configure the HTTP URI where the Milvus server is hosted.
+ url: Configure the HTTP URI where the vector database server is hosted.
#### LLM server Configuration
LLM Inference server hosts the Large Language Model (LLM) with triton backend.
@@ -21,7 +23,7 @@ LLM Inference server hosts the Large Language Model (LLM) with triton backend.
model_engine: An enum specifying the backend name hosting the model. Options currently supported are:
1. `triton-trt-llm` for using locally deployed LLM models. Follow steps [here](../../RetrievalAugmentedGeneration/README.md#local-llm-setup) to understand how to deploy and use on-prem deployed models.
- 2. `ai-playground` for using NV AI Playground based models. Follow steps [here](../../RetrievalAugmentedGeneration/README.md#using-nvdia-cloud-based-llm) to understand how to deploy and use TRT-LLM optimized playground models from cloud.
+    2. `nv-ai-foundation` for using NVIDIA AI Foundation based models. Follow steps [here](../../RetrievalAugmentedGeneration/README.md#1-qa-chatbot----nvidia-ai-foundation-inference-endpoint) to understand how to deploy and use these cloud-hosted, TRT-LLM optimized models.
#### Text Splitter Configuration
This section covers the settings for the Text Splitter component.
@@ -34,7 +36,7 @@ This section covers the settings for the Text Splitter component.
The Embeddings section contains information required for generating embeddings.
model_name: Indicate the name of the model used to generate embeddings.
- model_engine: An enum specifying the backend name hosting the model, Currently huggingface and ai-playground are supported.
+    model_engine: An enum specifying the backend name hosting the model. Currently, `huggingface` and `nv-ai-foundation` are supported.
dimensions: Integer value specifying the dimensions of the embedding search model from huggingface.
Note: Any change in `model_name`` may also necessitate changes in the model's `dimensions`, which can be adjusted using this field.
@@ -46,8 +48,8 @@ Customize prompts used for generating responses.
You set path to use this config file to be used by chain server using enviornment variable `APP_CONFIG_FILE`. You can do the same in [compose.env](../../deploy/compose/compose.env) and source the file.
-### Configuring docker compose file
-In this section, we will look into the environment variables and parameters that can be configured within the [Docker Compose](../../deploy/compose/docker-compose.yaml) YAML file. Our system comprises multiple microservices that interact harmoniously to generate responses. These microservices include LLM Inference Server, Jupyter Server, Milvus, Query/chain server, and Frontend.
+### Configuring the Docker Compose file for the default RAG example
+In this section, we will look into the environment variables and parameters that can be configured within the [Docker Compose](../../deploy/compose/docker-compose.yaml) YAML file for the default canonical example. Our system comprises multiple microservices that interact harmoniously to generate responses. These microservices include LLM Inference Server, Jupyter Server, Milvus, Query/chain server, and Frontend.
#### LLM server Configurations
The LLM Inference Server is used for hosting the Large Language Model (LLM) with triton backend. You can configure the model information using the [compose.env](../../deploy/compose/compose.env) file or by setting the corresponding environment variables. Here is a list of environment variables utilized by the llm inference server:
@@ -72,7 +74,7 @@ The Query service is the core component responsible for interacting with the llm
APP_LLM_MODELNAME: The model name used by the Triton server.
APP_LLM_MODELENGINE: An enum specifying the backend name hosting the model. Options currently supported are:
1. `triton-trt-llm` if you are using locally deployed LLM models.
- 2. `ai-playground` if you are using NV AI Playground based models.
+     2. `nv-ai-foundation` if you are using NVIDIA AI Foundation based models.
APP_CONFIG_FILE: Provides the path to the configuration file used by the Chain Server or this container. Defaults to /dev/null
#### Frontend
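A customized config only takes effect if `APP_CONFIG_FILE` points at it before the containers are started; a sketch, with a placeholder path for the customized copy of `config.yaml`:

```
source deploy/compose/compose.env
# override the default /dev/null with a customized copy of config.yaml (placeholder path)
export APP_CONFIG_FILE="/home/nvidia/my_config.yaml"
docker compose -f deploy/compose/docker-compose.yaml up -d
```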
diff --git a/evaluation/README.md b/docs/rag/evaluation.md
similarity index 58%
rename from evaluation/README.md
rename to docs/rag/evaluation.md
index 5c98ca23..55942463 100644
--- a/evaluation/README.md
+++ b/docs/rag/evaluation.md
@@ -1,6 +1,6 @@
# Evaluation Tool
-## Tool Details
+## Introduction
Evaluation is crucial for retrieval augmented generation (RAG) pipelines as it ensures the accuracy and relevance of information retrieved as well as the generated content.
There are 3 components needed for evaluating the performance of a RAG pipeline:
@@ -8,7 +8,8 @@ There are 3 components needed for evaluating the performance of a RAG pipeline:
2. Automated metrics to measure performance of both the context retrieval and response generation.
3. Human-like evaluation of the generated response from the end-to-end pipeline.
-This tool provides a set of notebooks that show examples of how to address these requirements in an automated fashion.
+> ⚠️ **NOTE**
+This tool provides a set of notebooks that show examples of how to address these requirements in an automated fashion for the default canonical developer RAG example.
### Synthetic Data Generation
Using an existing knowledge base we can synthetically generate question|answer|context triplets using a LLM. This tool uses the Llama 2 70B model on [Nvidia AI Playground](https://www.nvidia.com/en-us/research/ai-playground/) for data generation.
@@ -18,3 +19,16 @@ Using an existing knowledge base we can synthetically generate question|answer|c
### LLM-as-a-Judge
We can use LLMs to provide human-like feedback and Likert evaluation scores for full end-to-end RAG pipelines. This tool uses Llama 2 70B as a judge LLM.
+
+## Deploy
+1. Follow steps 1 - 5 in the ["Prepare the environment" section of example 02](../../RetrievalAugmentedGeneration/README.md#21-prepare-the-environment).
+
+2. Deploy the developer RAG example via Docker compose by following [these steps](../../RetrievalAugmentedGeneration/README.md#22-deploy).
+
+3. Build and deploy the evaluation service
+```
+ $ docker compose -f deploy/compose/docker-compose-evaluation.yaml build
+ $ docker compose -f deploy/compose/docker-compose-evaluation.yaml up -d
+```
+
+4. Access the notebook server at `http://host-ip:8889` from your web browser and try out the notebooks sequentially starting from [Notebook 1: Synthetic Data Generation for RAG Evaluation](../../tools/evaluation/01_synthetic_data_generation.ipynb)
diff --git a/docs/rag/frontend.md b/docs/rag/frontend.md
index 425acb3c..8ce78441 100644
--- a/docs/rag/frontend.md
+++ b/docs/rag/frontend.md
@@ -1,7 +1,7 @@
# Web Frontend
------------
The web frontend provides a UI on top of the [RAG chat server APIs](./chat_server.md).
-- Users can chat with the LLM and see responses streamed back.
+- Users can chat with the LLM and see responses streamed back for different examples.
- By selecting “Use knowledge base,” the chatbot returns responses augmented with the data that’s been stored in the vector database.
- To store content in the vector database, change the window to “Knowledge Base” in the upper right corner and upload documents.
diff --git a/docs/rag/hf_model_download.md b/docs/rag/hf_model_download.md
new file mode 100644
index 00000000..216020c9
--- /dev/null
+++ b/docs/rag/hf_model_download.md
@@ -0,0 +1,59 @@
+## Downloading a model from Hugging Face
+
+- Visit the Hugging Face Models Hub at https://huggingface.co/models
+
+- Search for the "llama-2" model in the search bar.
+![Model Search](../rag/images/hf/Slide1.JPG)
+
+- Choose the specific model you wish to download; for instance, let's select "llama-2-13b-chat-hf."
+
+- If you haven't already, sign up or log in to your Hugging Face account.
+![Signup Page](../rag/images/hf/Slide2.JPG)
+
+- Agree to the terms and conditions provided.
+![T and C Page](../rag/images/hf/Slide3.JPG)
+
+
+- Confirm that your request to access the repository is successful.
+![Success](../rag/images/hf/Slide4.JPG)
+
+- Complete the Meta form by clicking the `Meta website` link mentioned in the previous steps.
+![Meta Form](../rag/images/hf/Slide5.JPG)
+
+- Navigate to the "Files" section, which displays the available files. If you don't have access, it will be indicated as shown below.
+![Default files](../rag/images/hf/Slide6.JPG)
+
+- Upon obtaining the necessary permissions, you will see all the files associated with the model on Hugging Face.
+![Files list](../rag/images/hf/Slide7.JPG)
+
+- Click on the three dots (...) next to the **Train** button.
+![Files list](../rag/images/hf/Slide8.JPG)
+
+- Select "Clone repository," which will prompt the following:
+![Files list](../rag/images/hf/Slide9.JPG)
+
+- Execute the provided command in your terminal. When prompted, enter your Hugging Face username and token.
+![Files list](../rag/images/hf/download.png)
+
+- In the password section, insert your token. If you haven't generated a token, you can do so in the Hugging Face settings.
+![Files list](../rag/images/hf/Slide11.JPG)
+
+- Access the "Access Tokens" section in the right panel.
+![Files list](../rag/images/hf/Slide12.JPG)
+
+- Generate a new token or copy an existing one.
+![Files list](../rag/images/hf/Slide13.JPG)
+
+- Paste the token into your terminal.
+![Files list](../rag/images/hf/download.png)
+
+- You may be asked for your username and password multiple times; provide the required information.
+
+- The terminal will initiate the download process for the model. This may take some time as it involves downloading checkpoints.
+
+- Once the download is complete, you will be able to view the contents of the downloaded model.
+
+
+
+
+
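For reference, the clone shown in the screenshots above boils down to a short command sequence; a sketch, assuming git-lfs is installed and using `meta-llama/Llama-2-13b-chat-hf` as the example (use whatever URL the "Clone repository" dialog shows; you will be prompted for your username and access token):

```
git lfs install
git clone https://huggingface.co/meta-llama/Llama-2-13b-chat-hf
# point the compose environment at the downloaded checkpoint for local deployment
export MODEL_DIRECTORY="$PWD/Llama-2-13b-chat-hf"
```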
diff --git a/docs/rag/images/docker-output.png b/docs/rag/images/docker-output.png
deleted file mode 100644
index 6311cc76028fad842b3ddc395ddb6176dcf4921b..0000000000000000000000000000000000000000
Binary files a/docs/rag/images/docker-output.png and /dev/null differ
z6ZfvV@>u&v2nF9htnLnuR91%-H#8_@hCSm*DdyVC3#_s7@*VzQ7+XrDRoGR#N0$r`
zP|Rn?F6bnE92)A;ACgj5pukZ*?z;`An`4;A5{NMK2^N=^iTuTBh?nlVDNKV8pvH1>
zK6D8Eyo~f4e~zf4qC!T(82XS$WW>>E3w+@897$y3!p4bwfbV&JB2K1${2linE+(JQ
zHx%dBK{ZD-R5XgKzOS;eE^eqFPBZc%wbctFtZjj>kw77}SaIT`HbM?Qup7Q%k#Us3Kw-hB@w{PRW|i&y
zPU>Qb8B7`6;N|Q1HQIJ6bFe}EJ>Cu_mByPWV*C^GmCO_0NiX;#c5g2)88k)DGAv?}
zalcISf}jlgFqC*Te`VJ1ZbU>CN-1+YY91Kmjy^=%mBl_OQzhH+WS?WCI%bdaBjJv#
zhxXDkfz9(KBXUX#V==X*8DV2&>hugrXm3h(2DndF`p}a=g)=(Q)2{dMI*lnV2WVLD
z;_*({37xH5bNuvUT<@CuAxJz@w#Q|H{T%jXGhen^dF@l)PmTUhlx2!^LztM~UaVRg
z*!^Fg;Y;n+epdcO*#dyEg}yl!GSeB>Nh47gt@@x4E{&tbl-3ns2)$$v`Ge3`P$)2y
z6tcOy5EzcCM6e!VTzsX&G)6mx2m%o?KA8lQy0rg~mhTQs#&!K=JjF)Ut>0DCNXk&t
z$RS&^49R0gs);FOJ*wFTf#A<>(nc8^$!diF>j-q8QaJ?)!DX!DWHkH;%)>Y9b+9x(>EKnyZfF0Ec8o!r>Hhc-xQ$U{P*`@tupjrlzV)2SJ
zoD5T$4R<3T{Ni6`;ut9U?z}|ZAFNGX{SvolV6kFTpTSp_lCh947Kn+lyN~sU9l^MZ
z#n@{6>7Xvjg$Ea%NwIkKYe4w*2`nKf+EJR*z=Xms7RBYz5~qAVV5uFET1cp9mP=k@
zTY^jgh?!|E5Qz~lQ-h{Fk59$ljiAi3*5v9L)@}%0e)ir!{EX9zzP4^eEs1+*f2ICo
z6HutmI;Rg*J}aG!#f(w118wwhC{z|D8wyc6@dBIQZEKrsMOUKzx2cZpH;-bB(u5p#
zKg$?Xc5zvPt&Tfiw&Y=rYivnUN%mCf3iK+%^uRF(n4*b$lzEA+pyb+=acVF^x=CoS
zpyT15A~&1jOMrLT7_$o-D>?eHapTo#RC}&*IQ?lS@(b+CP1vsimX|Hf-FI(@NAxgm
zEloG~wzZN}WfVH|lwDHbs|Y~1<+gCcG0Z9Kg~5a&H0({ykY6AvFkfc9l`eg;d+d)Yzq7
zL7@DJtievi9Vhn|sx*1RP4egfN7cI1EWv?{V{&+sJ)y~fr^5_yL6>pa8Q!YrtxH4t
zH~)gr&x4)NbC-QzsxWTQKfi2>#Qd;AP1Dph(!~F+LMEs3;Vv@!Pc(8T9#Bu(<>EL|
z?3E^8A>#Q*p};tww!SY3@~_xy_mLM`?DCctV7Fh>S<}Aw5jBfRCrJgX%H~$po^wRE
z>uJ+{l5iU}l>m{1gv2AGNRYs(;W-{fw*{J^^o_hw197mkN+cM~n33l=tHOG*K7rI0
zxmBwqxSoNtpkg3IYE*3;h(#(4tqGx}Dr4;()l?|mPRQnku?~-r?Om3DQr0Hcn3+*4
zf4A1{!F}g0@4=1PBEW%nu2ubX&c_x@dZ}bF_;#?k%ZX
z{jiEy=Va8zo;IYKQ?2G{}w((a@27j$+|KDEt{|rWAl=H>K%>kL*iYoQ`_=)1Bty^B~Awv%m{D|d#QTzaFowehqPI*
zrvCf#;1(EDyk5aP*+pS
z`r0*PZVopZ#!zef=ZY$8-)1u`E>$yBL=sz9$5{r~(-FVtx&yrORFS0?8MD6Q_Vd2%
z@$=v5Cfi~BCVh^xQB-q_u(5#K5!0*32aELLpsrOC0BAR_amkgA^hO!SCvwQ*)+llN
z22-r>5TCjBdQgtj(FZQT2uM}5@bz8Vmv*_nkup^zRb!xeR311{?TV9gCV{3;n_vd0
zn2x_H@$k68uWUGkrS+|g=YJ?cONj0{Cv_S|WfzS_2H%!eC=EJS@KYC-5E@8Ue7~4=
zCQb6{BC`v@?L$-<5KR;ZVCqv`{2rzRFAqEQpLao@yCsFGUJ(5!LGhSu$eYivzOAq0
z(_yGGa7;F&tbg~Yd3Gme=3*HqYl0e#vV)HrwjtIWbE69a;ub3eWd;8*>qY>3vV6;e
zq}Kerm%Caa6pWek?eR>({gUyq!erQ-Ad)NWlayET6UZe#FXW;hJbuZMT8A}!O9p?Z
zfz;Rrz4v`G#Q$UMz(=C6sFtI>ZXdt8vq*`GSJwD}6Z&8i!U|YK#EBpx%D_7##mI-7
zy5@_XxB@J~yrGpd-R-bMUFyarVQ0rj@6vL!MfXA!EPz=}t<0i{Vem<*Xm;EcI+t`}
zc8o%ph?+re40qWFg1qnRN{6SB89{$su=dq-e#8_Fc-So680hN9PD~h8UtGUJ>Ec3OLkrV!4#{&(YijfQ4Jb>2`ezO0YjBb7UA#C3
z{IbeF|F9Jf{tRTKP^~yK`kTtpMHu$K;>8
zpizk8CyKsNi1|M`u6g$}_%PCSQs_2R!RvS3eWY)99FaFS1$;{XnQ;!J
z-Nh>`*7vRF4J;yB#J(rjNJp7@4Xu!JE-u=;|o`RN<|&QL3Q+cd$$$A8&X5Ua(b
zuMm0Ll~XQ2;@9ut+;Hi+i(_m`alF?A!g>f1RsBI2wp7>+xhDx1rkQH_1-g{o2Nc{p
zszzzVf&m2ee%ZbwnnYmxd0>s)N;R)ulMOb3eva`VeFa}(kASyg^a&J+7
zSbcyvXvH}{$@q0{SURhb!(l-Ii4Ic7c
zI;HM8Vh>m6uu|Hnre1@;2-1AF0s1Eq1csmJ%Qzmz2`TJhdT;}~U96T`FEu-yT-#d@
z$e*44X)3#!*SW;<5Di)j%9L~s!bqw>#!-f|JiBH0Pf?g4%TXK_;|?Us;Ahd>6XE#S
z&-fmaj5&Qz+i6jGLw)`qdnGp7bpxLsAN=|W(+l1cfX5fDug61Um@SYdB0zdaY%23&
zod+$Ur>FAPDBd}-5XBW`FoHY%Py^q(qGek
z`pHigo#Q$)7>h<5$uguR(G1bim>5wPj3oXYt$yX;X
zw@S$inQY#OeP!igcyBH30SL1jADUHax^Kw8mrs0o%U-1sZ}z5tQwuXbV13LDhscGonNcO%y4_mn?jh0ifJE4}|0h`AA>Ob${Z&K2)^)
zy##kFlfk^GAO~2R)50swZf~(wGxQ*!|6W&sFTX{=1EnrO_~2PT5pmK^0&gn^vRW6a
ztQT7?hYFt!0PohLqKpFx#hn$TTfd-nw_NyKX8J~7*gSPaGs9By
zaXIxcx1VNZsr}NL`KF7iHTIco)0=FU#4Ce@IjMoY0K#8`G~j{~7lPSN*Px6(qO8D~
zX%ewhWSPTW9;))+%0p8LGKnKGQw183ft)PohFsf#2hVK0S<&@ut+nOt;KPzv$x$zvPU*KjsBMZX(>jZ$Ur+`
z;`r+0l`eRz3%69K97={^8_`FS>7XU4N_~OW_6dP56%|Xs2TzjB@6nhL`WTO^U#~tA
zK?`)LrdzIqOkLE>!;s2QQ#phAF-fqc=ODCVwsKCttj*ZESc3q3zDde5N6Mor>{8Ou
zrYy9qFBF`geefJnF_`(%Uc0gFd^3nOror&q__g}7?ftZjg)dkwlS_1P3`t6eSa1eDQ|I1yP)BF^o$@Aj_YJoa0
z)ysgf!%8iIN#G51ZtyF%$oZCJz}2w2)?As{9&y5``&X*Tlg-wbca_8zZ+oiQq%Y8Y
z&%D|-i{F`o!f#u@<7#km=|@uZAs_qcSts9q8HyC_$mJ(ICJ*;kM)p=Ynxt9QZ
zU7Y{O1S9hW?Jt#IR%v)oR%rn3+&c1ra-8>bv&?qKX)W7%t`*S8gjSAur0Y@F7ML&0(2-*_BR=IU{4scsp
z2B$@2P?nfEM@|IOEjp$fBIZ5fh@LcL=1*>4XPSk|G3YmcH|MO!;1y!YF{#fxrM
z1sf~{@S-=1QwC1R#>tQeBg5AA4m2sL*YEkof|kmnRdT2r__h*ROUjjRCP5}qy!Jr@
zf5Xio{An?C3Hzj!u)??AnkhImdn0L25!uy%b8GQ(b2DN}3TO!z?^=+lg#i$Ib-b3Z;yT38^
z=NKl1LLg-kNlzHPKYUangoITarA@hOd1X~ik
z1*v0*E
znfd7@%B);#RN?xh*aal$!jme#k>EeHj*N`haPRKm|E!yw_L|cqDxzlI>rfAsQ+m3t$F+B9;XnHfS1>5WL$ChA}n~T(dun5Z4QU$!ZO;_
zwYb8)ndIQ#&G_#$v9mH;k~crC@~H3XnEVpwm|RYgh(B}3x%y{b>}zL@k-^U19DxE-
z>*Ixc`QbDi_H$?`_;?98)L>`1OHdeNFXG`Ul`jdcXaszK(G^%B?JAp?l-AcAK3kb9
zJxs2%rqxh{5`VWO@^L}jNZ-7@Hum5y#u`F
z>hn-ugj7G8P7Sf6NJ)xN@yQ0omuuuGP0r^bg1qU>jr~GW`n(Zu&gD1s`cii&<)M
zW67{K_|+&Wz2XrXJ>eOcs`rfF^-X6JpD$lB=p%rA80G|16-Rc<&k;I^9d6zFf8sNm
zMDZs-KF8b{`=gKRI!On~O&Nm$Ti60nf1uY$;U8+R;`-Kp5j4icyUW$r*hk%Rm?gu_v(N5;3jT2l|MuXAn5+5XyD@Hdz5PnRPP3acBt
zeKkm>hXzpZAo+V0Y;Y(U+Rhf)a
z5OsqH^syoA7GznBX=mIBn}h%O6Z&a$cBzh#_0;8sj%PEx6Ekfnr)~Tj`MaN9xR{
zkCG!F7PKN8W`C*am4%FqlDmRlgex_18kHd^zt
z{*Mfe)VkvGQkg&|Cri9IW!Ewvq!ZaBe=1{aM9w_t@0gs9ut>o8kEL=%J3jb0k5`_l
z99}vo!{v%h`caUy+innYp&!YC$6ZYc!;c7wQZjObQTgE(hJ3>5a)<{fv`W-v^U<|;
zMBxId0lz+TiUur={lmz{elCp=ybq==od!)f&>yTym6tp98?^_x+Iv#2Gw2AG_5B_5?AN5?vhV>j*^UP{huFe$=+_#%aVv`9}$_>5R$)9bvpcH`lZ#
z#M)-xD^=M1BK=<(QA;FC6PlkrixE$L(RKp}Z_OWePTKre6)rt@bdTTqkxiykJloCX
zLX+jY_i9Dl6W{FQgem#Y#@1G(vG-c!4+`11bgUZu?wwaCL#-YQtr9p5PR4DCNFEHF
z&d)YMek~{gHAmP?ooH{29osPNmL>Yy%W3UBr^!xR`(?t)u#zx+dzIL7(-ZD*=)M-IPs+EJA$5eVHpdsF-Uv1f`_;0DUMTV
zuQ-?Md1I^2%zI3`?N6=U6|a3+v&j}+eYFs+A#0cX)A1$jC2ExsKJL{Qp}awy@EjbsEpQagmlxe-vuMOye0!q=n;gCD)_
zPExq;N$AB0Cxari@XXjHhkc-H$@Po#*ZS}(-+!g0sn?~rtmO}QZPU0pb(h_-ZBInn
zGC_QIk?2PN0a{v&J4jsjHXh~HqQ8q`Se%1z6c>#3#j7$@lV4nr_0M9b>C(i(Ubv0r
zc&K-ykah&D_XS9TrtA{<`IvX-NgdTMJt~QcZjgMUUQMnV>rV>#$<}5>gyaroJ_}FT
zcZ>rYafAxXU>zQ?gqP7@j8VSC1ly3(a8Qa{F%8mtT|UAPeqMLl+w&s}q6uJOax90`
ztb(i&e*yx{w6K|eq*dN=&w=2SWAA3VARz2L9;2u>m+WwPL&Lx&TUXfEq-2*aBoLi|
zA*LB(Di#AfIV#7*lK&{MidrvG%AV11l54GH%zY&zQK`>7JX<$;y-(*Ubx1#SA&
z@=MD{XXd(n*)ZR_$S##4h0W+daa}9H4TIANU{fvP(>b&BZ5N2&$4yv$8GqOvo(UR3
zU%5R1G&^XB??{T&j??vT>W4$ZksUHJlpRAwCpa8|0hih0eHW-;qxh)JWRl}mbzr6)
zz6=qG6Zow7=VV(!L~Omic@Wnb*CPz*dqNc9V`GumO`LD#b#N)Olcm;cJb
z|M{3I`FFdWH0qqxk74c_6MKNfS%dH
zJg9)x-h
z16prHz!&{qZ{-$1##N(^maSHhkqDJR|e`(&Dz<6Ok&HV6DJ=r{%}+V7Ga-Te19jF66OMT+Kg~%BP@n
zUvZ-*x9)wjmriuP#HGCvaUQzaRrMMYEO+*}FUMSSbL7WdtMnJ0JLu^+b^G#gYnjB#
zu9*$+Q6_R6EXvU4`hMfqbo(kBogSR$-&ym((P|ol9;wO35
z_b)hkRY2NQ#fkn_1hR-A8NsjV8_V9pr+zya0cVq9cg_|KpCl;wZip1=ZtaVQRQFn(
zz8lm1U7B?a@)7%=tYBc!f^{})muE0)6aDyh_wDdEqpzGF?m?zp&t
z8P0rl&`+@^U_i4?xNJ7^!SkIBU20q1Z%`r8ot)fd1F?^u@AV5=Mp^4t5r8Lss=-T8w1
z*oQ590r0ih$6zgc4%tY8b-n?ybbVbo%qLi~Wu*C;Y_4vezyrkB=lZt5$huwoN&wUQ
zWF#|+!;l_tbPUk@6x0QdmE8U&GfjJSDUBa*R^Ij{1>5$!dub{|0tVbjVLb}DZm7FXQx5n!n)d%(OdqRYHg#rn?r{;D%O
zNeeK7#MDfe8Wb$o{hcQ;k-f0o{|pDo+QKSvH0Zv37+zkL)>JYhF9#bQ+pkD%D0VEyYf
z(jG2%&eWr2k_yd#QO!zUfH?8KraNNmR1nkp`qyjx=%<4J0-aWGz7T4PEH0cqg3i}!
z$7?)6q(|SSZ*K0GIXJK%Elzb(K3EUzG1eTx={bspdH+m0Rjp5ALC)9+$4kKsAR;2-{n7p(u#NS6~SMt4=Q8?mb4EhG!28ot{k`#GodhWPPL3a0_;I*^m}Q$m!uBPra&^_Ay2
zuKlVkNEKW+tTt<`5{F~i05IP8HvF&`gr+e|Kbl?!}X=C1XZ9pAXF0W|v
z3%FJmDgTj(v2zYUxp6Ym^8$@lwJLSI#pCqY#z+5C0#4>6J2&YVA*wy+aALq7{!qCL
zo9TR14>VLw+7vI#2>0o+m&DffK0dFse^VrTdz1*=9bt~8TfWu1$Uad$8oZWxF$_HF
zdS8Cy_R8YnvkZ53ccs~7@R^%lJss`8`-0_ngVr^01fFGYDUu9J)x``?_E6{E^P5n%khZ^dG?9;Oqnje5pBW8*Lv*ap5j7U
zPusJ1nZHFa@lL5fJ(s(>6{h3kc)iR-#8$tnrXd^s+g;@5+U0<8mC~g`JGiE_l9QfR
zjA8air)1yIZ?CbeJ9lrLM8@`Zr`$@8o=s=R^HHp-tt-CyO{b(kw1dbVTyUb4(`{!z
z#Rsu#d*eHetY5Reqtp7?+UR$;ic3b(z5erC1B`~-eZJd@t6j6k703BwU-_zo{ncfH
zcVBg_let%lYSSzsBfjLHh>zMh*aZO7P1!BQfoPvc&kNkHL}!jMF?_oh-$ayN7S;Ga
z5Z$;f>IK{k%f1u2SHCcH_dTrV(3JaXj+MYtRgB79f|~XC2ytX5wm6N~ncLYsi|4t4
zRv4bot`F!;)BezMogIfKYU#(IbyjtOKgA5S9y=j3b~9ER*vD_)3){VHB;C|Q*XW+wq5A-o~%nzt>+dLY}%=7<(LyjPqmzR!#=FcoqXzf7CSbp98{e-B&Uta#@
z{JxW|)2H%I;JX*@@>a^MitQOpTNZegv%ed~DN9~CaFpuz1iT40ib#A0Aldtu
ztrVS_nqtG@F@U;egcc2Q9UAk6$bROI{p3`$@v+spM4vyRUsZl=KbON~aON)S`PxYC=`JI!3MMYlbTr
z4EjX|M>V={YbA{W1P%;xhWA&YX$~@b0W$^}14$i!s=a1o^0lI9D2Oa2*4^$C_Iu<6
z2{(h{H(r$`1>B?5O8S|9R}MZ~=*G5Xe`9VK+{fadj>FTLQVlT7;hHQpl54i*U(X=854##-@8i@LGgdc1r@&cU-gNeyQ#&u=GZw~69y%*_0_7QF7K
zi3iKdpN20J{UrPS0$!%{^!8&tljW@jx_$Xx;5yondWCe};`^iEXCm4LJAkl+nYH29
z#8Jj*A1(FqtlHX##>-b1;q{sQGRjkM#!)Y-s8>jD+$^OvYHted_y2fOT>&MbUT0L`
zZQLjs0$xnu?7W+4KxpC&Z%25uRFW+$b&tgX3(={7Sg@w5N62AsV
zc*Car3jP~W4>POl!&Uv;mKgLtoS+*1wP)fHOe`Yu`^5=LyjKfP?X!JtsC?^OCvD(J
zJF!^39393UKnxoDu);0f*GjL60J}jN)j+P4gY-})@|co;>;eoNKYa>OnQr7$Lz>S0
zcgOJGc-Vhq2xiKunC%yFPybD6{V!hlzt-j4jgeMP|F=K-U%x|oQT;#hyYu_|`_GV0
zJ_)3_p~eVbmS{q_B{-ko
zG*3<%&LVA=X=N>~OG~gT!BJi>+^mNGs{+5#9fQ>dM_G}<=MFH5ZmUFJI?SoNyfAYT
z?`*#4t?gH?F0p51@ZLJghTsFF}Gs2NcR`hJW;lQLo;O96w)OXP>mX
zKh+~vvvo^c?t!SFg4NJ-H@g&y^nN!M(i9jpKz;OobXn|DeMyIbaK(tO1%oCypU(Jv
z;4zko<_`B=-?~VQF8Vs6E*eFGCXR0SV$L_0SiL%E<(ij1*TXt%1D`HA*Sx24V^UMU
zIuQ#YaN6DVGtNWb&$dM0?DZPfh)v%-Lq1XW0#d1ejHXD7O@^Xv0Mv2wGhe2*?#p-~
zM-J52?%R0ziOWOQdDX%1x`u$|_#=~t?~1ER6by{Xzn5=lylkngRW
z?&`TS_DtUA;pv%FX?M6glt1zgIVORa1o-8Szfu
zJHMl_%`I>A9;f;-wmv>}+ihqssjcR{*m_ru(Cm5qB(M9P=@XBzeY7n%_|hHq$F!%{
zlLfLlKoR_fuV-qi#!9NT@N{JW9s0CGo@x?^jUl!%Gus=pyNPsb)tNWXsK3+OQ3xZx
zfH8eBIz{)kqm6p4_`qHo;WzU=CFclr$+|6uFR{1#&ojbC(laBCA(2$E_o+GUr3`VD
z9XyvbYu>?O70Vc2q7kA#g!A!c`uf3wjgMoDxSSC^-8j62dVxVzfaPT){phc?xWb=v
zPZim`zjV~0IVnRHX$m_*^qU8
zu?LjtyI$@8Lyh}t*}Hd4Y1$}Z2-gcrBfJ8Vnn@1aue;>6+h1Kv%Ji0%f0kKfHfEw4T0fhJKH9?EM$F^N;K&gjVdlOTXE7;x&~Rupxo|z-a`#3C6jNwUg%F
zGLST7_`H;gJx(u
z+ZNE{bDLeRSv6wg7334KoH8=49mU;reo!*q?&w0-IvM>5SzlCzxc6FY|CeR3N#^=3
zMwagbUIv#?kky;)KQ|Ynb^Zc|h%*Sbzj{t58Z`gbN#>&7N`kJ?y%
z>^$UL{BCRMMgpK_WU!|gtDcYbAZnr^^2qAd(zB?#`gQlIgORCKh-bIY3P3#y-ojop
ztw%BCp$w}ulHuvK&wCBO!;rxa7MgJg>
zWrB|z#Ud%`?C;;Obd5sB4AYbwtVh>FcC|lsX4Ac^uyJrB9m_*3qY*G@zW+@bpWi=K
z>4`#7qFqVFX6Wx>bC45%2K~uZ-0W@(wRp?9x6?u^+5oO&4p@yScyl@2=BC);rwl-R
zt7r8EX|#@|WkLp;Y4sJXvu1bU)pN^oI|-vjcx?tgt4nJdJQJ%VlkzV9R}^(}Wp9_2
zrT7Bva?zEWBcH#s7qHxU@R3HuO5aCAKEcDF$YU2er%zGiF$!=z=Q?U+_0ea6k_
zt@h}*lA+^6f++>B)z;71MepnA+os$n&ZBemXA`uyM*pz7@m6Nh8{9>R2GQ!vuSuQ&VM`(FRD0d{u)2`jC%7nd
zsg<^Ep|zs$l2QN!XL!Jt^*Lno88$F_+SB)Wg5`r8{*PPf50tp{X1B}}gqYL1L)5s)
zpU*3OX}juthg8Cfk^qOUvW;L2ZQfK5?cur*f9INa%yZrwOpa+dO-l!w7pk^CEj*
zGkwu23!fXxJ8{>(56xu19JT1A|FUdh^u<)Qzq@ym!W}2UF>-Pn$7y5dC;_Y#!7M6>
zUOFY5|KW%C<8!bJR4Rc_X<_1*&;21rv#k@i88i$EClJY!j(&b^p%6tajxP}GHK?ws
z5@f5Ce$^I+-xs!tw)vt|>8RU$A^Up_;n>@%TD)Ihe`IAxF1&CZbav{xlACd*O~6X3
zAiM8D*bL?v`7D(rC1-@_y_9jUHne|N%r&aCwHdh3;=%64@FzlDhuF;dW$~EgS04ZP
z-|cy-+`_MP(*iLgNv{untb+#k3&R_E;I(qlzh|e{L#l|CKXw2bheip#mCF*kQ?;1gy*eiFoob
z3}z{xn{XDd!57NDOS$6g)QbgALu)!ugkIgvi1S;j`FZ;LJ13|Sg69#wtm3p>LndVe
z^}bOS#O166n*dD{UW{bH(OuFcNv+2;#go%QxjG)D#Dg$ZRQ(mod3!43r2ZZnRYCp)
zkgOnL`(kk>i4C&*A)YuKF7H@A+;GRjU|0r^K&O*(ZqRG@z8O`^t_}}2=8<_UUQy9R
z9A#pUs~GMyIZ*5>UvxjjciB-^R@8<;Sd%7ZU%AYs>o5?sq4N>O(9R{1$)4YK!~E(l
zw*48|@6}vK&aN!WCZ@b@X6sNnR?|x_T!LKSTlBaBn3^}oQ{tUQ)!#a2qFCxLj{?so
zb_%vMf?lSLG(NZ=yITIV1BYubA=~Gh
zIWTXMgkl}LKqt@2>Iaeab$Y8803Q|e?xV#@M(I6@HE8(kMKFaf4qVqGw_WY5lsk!0vx+s
zNHkA0vR_yKwau=%iM0)(^1{oFE`**YF+D4!hvw4pQE(q9(-UdBLA-Lgg4gW5hmGDc
z+`UhGoyktDxCf1{28Z%<3^A5&ea^T2@oP0yW_TN2Q8l*ZfkgVe4~>f95ToTwk)@cb
z%W=fS0#np6CAzUDu3jByj-WOtug&8anGawu(A!GUQcif6zUL_WvAacYxIxblAj9*Y
z8L{3&0HPUY*}KiA0e=ujX6W|~)M7w;_)Gdw6*aqhh@_tWJ^N{9Op8Z_hWBddJa{px
zM^dz?kn{~!!V9j>_%gow5F*-arsgX-z(#(lymcFUk0ond!NTo>luy@`Y_-E8><>kj
z;zZO|Z1TdxKFz)Nq2B%a1ANQkovmF-r;r!E?P|Vq7>bn`m*om7GWa83wtKVekyyKfRimwm#
zKSLai*Zd8qJVRe5a%qEs5e(dZ4<1n2lu@q_ZoIzWP8hl(9&v?_yTJ=UW&)HTnW
zYsYctBc(8?&+H8M3wYQ~Fshp)hC44bQ>zM<#(7wnBVi+ahJmfxLw$ZfD`Hm2IF5sM
z=ta8*-3kb$vrc$FkCI}$*A|r9b>sYFHmetDb}_AIX(u___C#{;vr+0V!poGOJ~Kp<
zp|07pr~*y^b+>@i?AFJrY2T0}7^^upAO3;|DR~tDEs(gxDzj%yKWJ|?
zKDRqw-jm|-)|#5(qr-8>W%j^O#ehyl3X|Ie>k4q#^9*`|8X+GXoV>SD_zG>4Q~<+%C7}wn>=OJZjT*Kw~^=9rB~c+
z6J4)_8N}B8*_q9r%$<4mkSo7NC;t4S?(1lcuzAY^Fy03$DLB)J?fz;~mGa!?iI3f;
z$H#qhg3jhQ(7JRI7r@#S;Gtb^CH!(t0Oxm+=D<%Kb}uu*8i!Gf53|Mz6sAl0g=m^_`M3%v-jd0T#cOQ!xotzqLQ}O4=p>F>6ao4SQ?S~m
z7@wruS!i?ZKE!AV{wE*(O^p$7QS>KBKxU%rOeBc8X=nHG1xxS6ck{vsJ9(-MC*&yt
zCpziWs9HjmW9;3^pP#2Is7RRmYd}HW-O|I_5U8gl!KWeIBnNB8=9223scqV^?foZk
zxZzqL5$OJsh%mo!s~q?^jpJW))a!lzCqYVgvN+Vy2!wRA!U_kzQAdgo6q}pF@?7(!
z5M?BEw;~KzWjIYgW``~u!RE}J)O!-YvJX3{Wx?tT{-gkd_My4H_LWRx6
zOrz_W0B+;f=avuWSdVX{<7{lb^=awTXolR@6is?HC0>*N4Fe!&VEIRuBI|&B+M8mf
z9-{1xj;i@hR&`Tt_7g}GjITFAYCI>&E@p6Zn!mNvw5?lN2C3%lF9zmq<`L%zBKT!N
zmonv=;?Z@~B@6I^~<+BPAj1T)t16gO-`h$hgLZ@Gd}24&z`2X`ZU@0
z=AsHcV@y0|+yAM{-xU;+X~&h5>*oR&4=4~WbXoliGsQ8ZD(!7jwVpaBUN-rvU{DF-3@p7)uT;j?xnRVzq+@J0a);<~K$yCnU}
zRhoejHHk{^7rvm&x3^5BXM9tpwx8|wN+t#i+2SxwrVZC?J4HNSFVsDuxWA=A`L_v`
z{qmme?{vf%We;SL$i82WaSjhXI}scj|J
z)OtFp5@r10TYU!}>jASqdXJLxpB}?+UlPZl&+!pvV#dYA>6E4COn$O%EiNpGg?@ayiKYuxr2bPio`qHS${kOrgJcxBd`2?@SpwIR9ri{87|PgYL{z-be0
zF0y?4cVE4<^Zj4d^idYkETxj~QlPt+{>VcSq8R`$aQGc#XO|iJ5bl{jPV+~p*pb~c!%
zQEBP|fy(5~8`t?~RUNWk9~GD@w>OHY%>Ml#e-f=)^;mmktpM2kF8W2CIzE91zP&9@C#ZtZ2)DmrHu1$Kg}
zJX_`*!CXG5UaN0eD~`7-X#*vlGrI9DUpUoO-3hBE3{;XQuRAR#)ey%an%Zb^#ayM>A$Jn?-xt^OdGpggpdKv)@vXGJsZo;<_)NL<}q
z=$yd@0Djv|9_*Lksi~Q$jth8jt@E%mOO=XL6CYgh*tYiKOQb)mP2S2EwwHQDk8xr3?i4?0<|~n>Y&HpQ*h_vsBw#%$u#j*b
z_s^Gm)xxfx3IiEeh7px+;Y~3-Vcds&GMQ2N?Q|R>6(YnYnlIxsC8HGs5pDPk3@*+~
z>$Vrzv2--QtEyf}H(IjM2?3VbS&7bNQ8#-*#P)8c^C8S1r;wUqtzkYqV
z{XhvSX0ZEe;Y=O%?Puv`Wdf!0^grNBC@~o|p0*gqOp5OU*qK%k
z?>P2~U9#Ab3#p1_6b}FsD*i&9>{acq`%%lioYQ?X9#=OZH$g(0RE-C$qP_%UMe@L9
z7CZ;v{?otJIIVYRT+8H52fB`~%b?zGp6qm07E9(cEF{c~C>A$3k0T~W(}4$OI6l`|
zQE|}``h1yi!t;FWIL5E`a$;wWG1Z`_XCHpNinFBoIXCO3ub%##GW}O7URoziG5!0a
z*yzynIU8QBWk1?VjcJpr#l_kk)aYF|pm$`3dE`|=KT@6(?CzO-uFhNWj#&&YPO9w@
z+V@(g`L=s?Zil;GeP7#)D*-P)K8!Y|E|;tSQ4=f4p!-xT`SC4{r>NJ{9Z~Hk8YXx`
z_XaV1u5k_N`?Alc$G{G}?hji;w%p~EPK(#3E=8w5Cnm9E$0E=-?vn!5Dof{nY^MPc
z;Qa{);nIcMYG8O(+3yCz&dA%giaeQ+P*kQp2c2)pA!s-b#g3l>TazB!i%$9*PfB#Y
z+F~>C<1=|d_#}|UV83^^6M4w1PFiC*>aysXc~hg{+&kUVOxw|>|=d`qDEkQF3b|S^xFX~l1N7vwEq{-&&
z@2rRV_+z7T7vZ4Ew`H@ry0!sedwsO;n$BfH%cQ+rZXA2EHB?*+K+}#-dkYaO{Hz>{(wMR-QljPZb!2T=VFf3>dT8+?{ja&Z-SN@44==H
zC0OePh(DjXnf9mqeZKJxUh|cH+g5wLf8kZr#;$kg$AJKhRB2DKJAa98SvOV&pG}r>
zvj!Tj_wCVbb^|@SE5;E^o{vg)n-$uNSLauVW_W8bHu#Gpn7tYD8q3gCxlg;F1<%43
zF}TXNQvdDY0h6KAUorySSK*^q&GLdO;4z)?c>m%4WPo^^SuAKeRem$>7g1-t>q3Af
z7(oiT7I1y6R+8U&zVft#xY?uK_LK9$tt*1RcJr~6ekAm!1NVH3Eb8fQRyNgB*3KB1
zG8A1eas`Z4qb#%_tcR|=x8lgT{vMlme0$iVg^IC9c0vB?mc+_`Oh{zv1@5DIjikfP
z<(`I{{qKlxT3&Il?Gpy+RxDrZAe_n6$f-PFdJnLQn;q&tsfJZp<9>(SN_!8#?J0{-
zwsW0q#~o7KEaHu-m{F1KT{DBGd)~Ai-k)739I|{(?&JMul}dJ@Y4!F9dp>G=lQMS0
zN)+R!M|UzNQ@)&dn0|wtH0DhmV_&+fs4K|PM9fav5^ui!Ps`pISvo@|1%|}@xG_nH
zzojqbI*pu|M)p>TL@&3c7%CFn>lL^fQ#Gj~+ZK7^Xxmc?$S5y_(bd!u*|2H!C-Zf5
zS1!Ba9|JlsA7c`1?y64{1RQI~l+qnD2s5}OQZl-QptWBbu
zvO5W$Mf{^fH)G2xP_RY9&7MWvl
zu0p*RP*xsy)S*JW*}GdxyrQ%yhUqTo5^T+p3e?ypoTI!_B~lq5`4s!4o|LkSD7$^n
z?0rU`sFGX*V-a-)z+TrXr
zZG75BwqZ
z7_RD`b#(E1@Lp7v;k2m{2TXW^@Jl@~m+I({`?D@5Um^j7>%8z)Gof?0-
zJ%%&&%+lwEvfDi4QmvZ#k!S4GxTC8<4_)UKOGB^3)
zZU#FvHh*c@a>O{N(l)LNH{!U5nqBZkt0K>>DN|8}KR!FXQ+OF}r^Qm#
zKH{_yfhqB)M2)ZVON48}hOB~{JG5X#ku}`E(;7Ra&b_jf)GnS*so}irBd0C77Hftg
zIl{F3AasF)wwJPF2aIm)4Q5DxRO&j7b|J+H8y%cvjpvt
z?~Ls}5Q*@%=IZ_y2FQ&)uU2{qSIy3u^M;>it`L*ZVz|(~clKqP8q2S1_%-#rR2=Ki
zA8PxZ=#3rf=)@def;Q;)J|1j+mq2V+Zp3p&X%WV0glhNv>@(71Yk5;7((^7A!BAZO
zS7t-1g_*hV1l;t1p^jK<1ztDes`WW-LQHm$lA0ww!|2ivblO}hc`AgKa;}XU$DW^=
z_RXE7Ve}*|m{vXcz%vix-!C4h8llqht#bl#v2V-?C}XNirbSD_>FeL%Af?j;{W&E~
zTLofEsjpI{{lp@!&O>{+8-=QxqP>WEHqOhrZ%Me+)3|9Fr#eJOhx;dmx)R@4{~2C%
zr%w`*agIzuUif0PH(MLe+YRQXC^4`vuE6r9b%V#MOz-tfY-8qpiuEj@@nmT}|1ml;
zeG9#~a4ERdy7t;&&b_TWXlZH<&JG`O0W9lM1c$Bh@jFpwgLsR<_M
zYk$O%wfIZA7k?1RUv=7l(%SlIMuaAzEPQ*+p(Jmv_zG>+lu(-pZq7~Gt<_C2Cz@Nx
zigvq)=Rp$#+|wW(`n1^2t8jMiq`A*$^2pn$)c5m{YL8tNa7*oU>nQYg^xOGbdC{&U
z_*I59u|NPPgKu~KIv$sL(AkClPuYCF6}qK$kn?eQ4>7eAhBkO?O0v$DuRMYJKfu!c
zFx7yjTVO@{fM)yOQ^WH64|HOiZ?@SaX~fR0ejvXD25+<3Oiax=SUZI%hz)4|%xtr8
zh-?savH6U3QvFm2BtsG|*wB*`O4=tEB~GHA3@n%`i0IW(t@qY6(L8fecRMim7rKC$
zY{WoSg5u-p`uXlsW-+NGQ_#vgWxpsLM(C}9c)KT>7sGhO(2Xb8D;pQT`eg)fb&YE0X5Qxb
z)ZWtXcTlOs#llTB)~kUXQ1;`b9rJ#kD7fYNc!Rt&rM@g_9`y6bK_Z@q0sCe&_jssX1v*s7}R&CNG2
zSZ!WJ#M7Y{L2N#Ju>0L=8mSxdY<{%6SdA}-l^o=>@HH0siiRw_@acL5j=`#DTOkMm
zws_~zCLpawqa3Gl$h4|BhG^7_kSQD4q&gMJ7WSz$Gz-yry?JP&!7lw*iCcHJ|0h6huqkT9f9IVvOXxcW+2
z_T3y|3N!Kix-=PDZva!%Ro4ay8p-ki(U_JCHa1~sLi+QhJlCG`8j>?NU`VM23gXC_G)aRH9>kb
zK-^VlMB0k(;yY;h+W8r`Bhjm`B&g?(>4Tet9yVs)
z;Up})io&T5Z@qag>M9rRhqRaP!=~eB5UQdjv^1rks;ecI@sQnH7sSZ5$a1qCn3xQI
zCfSp^
zsB0Il`B2``Hl1WUM>{vh@qMfLazVI4%!t~OPm0s&qNJka_;%TEExeecMD?Z*!S6bU
zjqcN0=j;{{(N~5#I=94_g_tcrrd_eeG~z4VCdDbpA$MLf@mZ^0g?-kM!x6FjY6h
zX~c1#&!{S#HmuuPi$iNvIp}o*7%k>ii6e?j(7|i&3^A;l&2BdjPm%KRQG&JHNh-mB
z=gf3RK*3$QQrfg1d%crFG(xT&&q%T&PU@0}@wI!c;kkVf3PR_`Rl?R*d}w{*I8uV!
za-*y#LFX(yD3cu9tD3B1G@Ap`-#hs>RLM}ZMk(Dbd=qSoJ0#@{av!0q@=2bYR1w|+qP)nbx>S0GYzTm7zZ?h`H0X8g0MRiC5Aph-}IZl?)MSD^`ERvnc-~D
z^ts2*t$<_B++)DxyqW{y($J#qL)E?CAbXURmg7iGURi}Gg3G>^2h~Iiy*#APOQPy13xUwc>HfnCWmc@5=Xz(F75z{ETEa?N)g_iq`s|
z_ovN7c?is&lvNKiCn){%=mj;o?*$C~Ex+e#+l=ulXJ7?Tig
zUXPi9OT_4wua@=@Or~wmx7zz8p)K2w@Y0nwZUUtj=;3~?V!uP&clW284C{q-7lxuI
ziI?+AFjAY*;XisA&Ry}YE5a~ND0%>RU_|rD=vO~vjHKtzCZCO=J9>IwcNtuJ$jS1h
z&Jd)LA%JqN8K)+9-xThjZ{T*1pGx7YQ2ufLwU-DKHH4U)Azs!R?3Rk7Yi=fiCm>V!&m#dyo<=B51J8~zDhd6mKQ#LV!^aWiP2;lO^Kl4U^v
zSm*AveA>o{)K&UAuEMRci4Bj&b}#u}5UbQ$O-^f{+j+m8S8I-ywdW9m^!P>HP)a{3;=
zPz%TYTI$C@hi4>FyuM?KyatCcw8VOUd&rhiCWbKQArkyW6rst0Y6m7X_=Qz->jOR?
z&T?3VM<4|mO*Mhurx#$ukDl$YywgB3SH1wMrxpiM$PMQ?9L&m5(XV7ePRF8Rz$W}}l&>C_^IiS1kbsw$*5`7W+8kVp%#9vtfxNK{!j1(%0x
z?PWpz9kb(1SQ!_hA1IcWgwGg-+62&O2F0Ju_|NgnDOyP{ai4|7K+{Pndc+n?h90~$f)zVoghhj3PgbTuhc
z1V#75@KI4fMFvvO6}1d-G_ALhYipsD48%PT&OzS*R+i-ABjd7z2M43>+$o&PZ{HsG
zZ^1q2{MMJg!qNq|WA^mx5X8^K#>1J$Z#&ncc338O%pG|BwWXMMD_ee$2M!-ShQf;W
zC1wK+q@JM|F{m#brA;`Wd>n~qOHGa~4=&y@n0eDQ1Sm8}+xr`S+M3pKPxUm9Ybgt;
z3b4aUeUv~fMBg!7nq~uCNpfpicNz0Ioq*>Mm;CXu0`
zq5}1-%-2zBB@pNkFmxjNc`2Y%7huo!L{ylLCWakG&YFX9F@8Yt3HMMJ?
z@1}J=CFc&}Z?k?220Azf^~bnj1K}*JL-NtqwC-Z%@A9BAc=VxL5bVf9+J&A%qY@2qA>@ipwx=VnCQL6fNT>{M(M3
z7;^=Wn_y+-u8o^`rRU=&()QtCX2;_uYO)XE_cr4ur06^LF011v9L>f}n2${`FzHLL
zFda8B6`@`p?Z-`QzJPHPX`Q#jb~bLpy!_sB-rYo~D5<-!aTB*cf`w0ZVcf)_-B#*y
z3AsG9T5Y@j8CC|o@(?g;F7BN(01iSOQZ}u`>b+@}6E>EhIfjkE!}s0-U*zC_Uww>(
zGu05-xpjTOs{;>mM_2fHNpNQWI(+xjAJ}>z3AKEC__3SQJs=1^{4(r5c@7OGiE&Hg
ziH!s~Zd
zn^}V9EJVT41bn}04bIk>Rmti1eK_0qTiL#6cVpjmXsj=TTI`O#k$vDF-~&ZtC9+B?
z(4^zT!8-&~Z+;N}nGy+UZ5DoCxfo1hfCH?6@BzpcZ*lst%S
zU0G)83eSK*i1Jd9aJHaL#j+ykIy`#+WH|DwkhpI>{@VB_HvGLEJ9q8Ek<2^@?LFY@
z?TbFK(GV7=;ACo{g~7g-@F4OTfIDuQif~T}()Vw~@0<6dT-i1;6$pJW^Wg<}?ACE`
z5$jOTGP^w+e#7r<`!};~NH~*=CapL6gm|KO-$o=BHy?W*1P)=io()QyRmKeM_8XQh}pKA@B3BbQU-1&>yq0FaeDJF_+{HM2;9Tr=V}X&
zV3wJyvhd^oR^woPBLc#M;2@F%700pZNVf4uddB>kckSjiZ>rk^UIkJF47~;S-xvpV
zWhM@6TZ2C~uxoYmPV7ubKuTdX>^=S1br*oBzFsIfdKBkakilK``lrIe0fWX2hMPV6CfwuiR12s{7!
z4*&aY1GXJXf!H-1gNMYx+13^^Q4>lt4r9|cO9Ng-@QIGc=pm6%@OhAWgraX~07{eh
z+6WfqnXnGUy5qHC*cyhrsc*}XHFF1`qe)3d@%!2VDh>H$^
zom32guJZw}+`f#NG8-W-LI|5`abo{=9PM<}s?miEcy)-uUHAP5k!}KHChfw;jlbfr
z4coAN`%dghNExlIdIeI+tdj$`jGc8%`biDGdeg1sCd
zYRti(D_7#@zcykA`<(l3=gvLYe>f4jH%cxY;@MPb%yb`2V~?{ujSfq{XrK_!lE`4uZyufevK
zZQp@?2ae%Pb_q3_5kkmipsLEn;k~QhZ@_^=nc#W(!_&!*#U_40;Sn4@Q-s#OcRUEaWAX5U|HFTV2Y}DwkF&e}!f*fk
z9qYDjM`}SW+h=w>TzwG~?2r7E-8fs&Ho$M`3?&r9Co~j})hF@O_upc}mc1-5m$3N8
z2aYmZxVuP^dT1B&OoL8S0x>HCi}mGSi9`bbh!~8y{bAgB-2gc7>XEYUPyBDo0aP@a
z53RTiSRAG3Hy-xE9uSBCP4+<)t^WloVIbrl4iLHq!^RG%-1a4kPh>(A7z(k(5kg%S
zN{=K#+Y1A0G%6)Z^D^|;Hg|0t$*xCGP#|n2jYwnXaOKLMvDtXta52{56Uiv3Fpp07
zuy+f?*r{XS!OFm5i9JI54}r8K6@UN!CyQ@47|P>WtUS&(`ShWq$SCDHyRq^>-1T_o
z@rN-y#2!rz|Iglez&BOxar~E^bR_9W_iibr?7jD14#e?P93ZH;_m-)EsGy)M7l@)D
zL-yXgY)YY}d+(W!?04=>+NNpJT>(Ac_wL7%WZ%?QXu{mnW9NKO`6xF?;aqhD|tn;wp;xWW4#gLMC^E%#efg
zmm*P8W!7m?ke!4}r;f2@yLc-ZPJvyZkZ6&5{Qy>f|10*H%FUMj$Z1?-k~gaYPtEt=
z8`I}+S@e0HP=-;@et_u@cY&0rLHdnT_-)$`(on*X6k@4QS?_koFM9vZ0!`gV~O1&`R
zskiXb{k*P%MUk&tvCp7}4(F
z;)DJphr&~+MSffacAkkdeM9r8F1zwyQTCAj4`6s#KiG-*&}Ll0#)Fq_C%h61t*9U$
z-fep!w2dEp9Yx45%tlsW1w>*ywCgt!&pi1sI(Q3^eCZ&5UcU$FW!15VwgVr)$S(d6
zs?%|3#{t~PDgu|(8*_Rnhgko5y#aioBf57|=7E_|PS|e*Ppb#8%GdH63++7Gk4sAnIl;Qp@>p_I8JFP&oSZ9fV%p
zIwLf+4dnI$=D&vcM0x}5d#>TP=mB<=W*szyv6Cb
z1fCa$O`1UZyeq^OeZD#EZVR5h6NXNC9TPh^Lz!|4Kd$%-3AOA(4m9kX)J>c@euVfv
zM{v`8PHO)q(nq)8AbFj6@5qs(ICbuVl{qOMe8NWKty!;NKpS`P)Wt};aunNt--y47
zPjQ6!U$-)O@b_*57gY+*U5G^GqtuAo*g3(kttXT*N7=Z|o}*`xP{d_72T{0tA=s6V
zOP4I>EUNFvZn{ilh5O~H5+g>tJHLF^L1QM
zp9vV!(VO@JrHDLm7?J4(X34fjL=K%W?tfzu>Ljeu=Nm5PeTkeO(xN&}SD{2w%nd|F
zM&fp^3Jz}W@bnEt*WQEBzgIW3Z{Gp#o-!1q=b}ubV#lr}dVM(x%eZLMzB~N-dr8W^s#}&4+@m)QIdTN$M!~`
z(DD&EQiS&zjp6;i+F%~-!_2Tt8c
zgHYP=dDuoLyjJB+bT{Tr95m)l?Au3tPU1h_$Sj83B5xutr<9FvG$n81)SurpIB!Cr
z2t)5$c@x$WUTfz~bZQfToXFGI^v4$L*tgGEmaJmfS;?E2ZEYSl!eSnl-`aT-!Bu$^
zo3`)AjV9(zZ2WsaqHblu!!mDzCcIMaGOQ)MT0b|N?}olZyTXGAHC;gzb|1ZkGL>}^
z$Q&m<@*yU0EkfiWGWahw@)Q{c4i9`0aAoIeEdF98{@U_4c5GUYWnaw2rnBip=s6&C
z#Qhi=A~oGmi4NL|Vr1v0A+IGm)hz*CMps^ARX5k7T;br5m@4m)2-~WOg
zJ9c2(udDFY(&acAUje?I2L?`>fNpMl(+x)SWf?g5`xkh9+KZUAa0Pz-Z5w+pbN%_l
z3d~xx8W$1@!MFE7-+q1J?p#OxnZXmfVfe&H&@Nd242zF^T;*mWfbJSXdAr!)^;+Er}WvK7CtV+y;xs+r^;n&6Q;5;Me6}
zWA3ueNH0>s!LtX34{QUm$aGG;UW1gFvp9Ys33{FgKJ5o!KsPsHj#VR)g`)rPPGAo0
zFU`4(lc!USZ!qv&&~xZ`bZz6pj@v!!zQR|_f5n!+$!`-m`PFA%;Yg|gPRz{>MIuG0
zpC_(uS%pvLFT&c58*7wh*4OJ0S7J1;#PW8SIO#t4ISEjjL1faGpJVa5Ke3e=r+@y2
zg&!`$M&^WSzCC&jAB}#&Uf|m}BEMO1#c=dIYb&Dih$CQj0qUo6F6C*z6EwnJF&vBVH3iqv=`vg5Df;BHeNC71D3JMxw~
zA6HKu!A{fb#%nJw+)l4CIWKlx&mwE%pUB`fWBQH;8PS(;psLU8z^*;}aWTFg>x3_YlW!Oh
zxd-#+%)|G;{91KC>yOW0LdHNQ8wh6NGh^?Yh&~}Q=qj?XW94EjT=A39cm8`DzFYVe
zmi=}D#CwNF$AK8zzdP)0G(2A{L;jU@m^XhJHf|;M{FdLaYOYoaCn_XhO&P8NM$;?^6(
z$Aq(fR&LZzeD!(dy?%0A1EO`EXt
zs|EOa{eD#KrP?Sz&Zs|I5+>$x;J2p5DkW}xG4a6xa3W&ga=fqoj*R!;Ps2dk%KCWc
z!7Hd2CXMS0xrBq%tNZcQy!lwMaV!2NzrAVwkC^-6GMr8=BDRMoM%;ftLS&5z!u&^Q
zVvPFEVt`IlhCCu`G7GbipP7urxHu%GXG6tzL0GT;c=qMj@z&I_@DdPxVY;iuX@ZWk
zjMx40*V`z&n0%`B%5E=VzQ1_`}*?h>n_p-hO;sKCli;m;Q{Kxiw=C5>gMW(o?HZPAZ4z=!Ox`
ze1z{;t;So=jfMkxTc^uGL7`Er@Z`==Fm+8}7D8lZ691O`Ep%v$9^IU4*QwA2LH!1y
z4P(w(l%2=plC~oTr9n^p?{ao?6-Ae^_-OHJtX}#t`n7ijms5`7k^-ohH$`H|rBWDs
zT63u&e$O==+Pe`yemxs+zW6*Ato#k{E|#_v4~}6y&?CqlLgo$9=iSWu
zd~@2}7H;jwVSF!da$^M|_V2>goa$u7YV;^)=cIOG=T75wqH<1(+?RO$3_B-P`QFZ*
z%$!tItvM;7ojXQPc?x}eoxmy0z@8uG<27bp&*W1u{@1U|=i}p9OK|972H2c9EAYhj
zD7?7?3qSdi-K=QG7Gr-{w)p~zRD5{$urePUQ!;WvxE#U6`IyUcZ51?jgn<@}6#bzDYTCRAGThR_e)`}o5_Y`GRk
z1n+@^(Zkay&&~UM!*yIvE9f{Kin8Ny=ENTSwsJAve`y*%T(AuLBQ7IPV~>vgM_}sn
zui@J@-(b$0&tO1!C=||eB0sA8tX`u+%;CTA?N{I9bdm-=pLiKxe7lCN_n%gMhfiLf
z2rpz~?}qQO^rr*JF*{){X%AG%DdV;8U_{M0EsF^K
z|MvtQ=obnBk#U#*UV(Q$nuld3Uy@lXvT4oN`1GS$_;u%1B9BeCSt4`*w;%#5=P$-;
za_`O^f3yAJi`9Q2p%Ae5=!nr{`oq&!keWiFP$(1%g+ifF?lG$JCVu@3doA)Nw*3B2
z=S{3-{l0DM|KYreZ;W{p%x^RP<9f@yi9w;njW+p#Y~I8<9KV=cN8Uu68hI0^8j&~g
z^=Dt>G3#xQcmAy@(^HFQHV=hl@`;gtc*J->ak~;V6FlZ66{NQ;=I&
z#>^iXF&LmPFGtR$(>NcS36-7;zrYa4ot#V`unn`@W!u4{@Z^1i;LIz*nf)8F`)G9S
z6A#4h=-8tdd|d=k7baup=09-eY7z>`^RxYoJcFU49Qipnv46*JIA*qOAYbN&EQ-Pk%@9mj<^E?#20>>fJWP?r!MXy%U_s=kro-;n4nL$f^`JL{6q9UBS*h
zq};|=8U{y?qMHZs=Z-yyP0xe6vM{EyWaZ{pjVW<(H}nZ&7NBWxWAC5%``m35sdVhF
zq6WPV`I#57W9K;(mT4exZAXmK0Eh(*OOS6IYTQ116sIrWK(XoJ)MdH29dQcr`J_mE
zKAb##YfRS3+IK)lUpMfr-K#3bwTNRlPRi5Jok>*1xrjKt9ox5)Yu^#vVivo$CWY6q
z^|!xq`bG>gbBa)Ic2c&12<^hS8@Ld09eHXFWHL|q2KbrYZ9Xm?`lDZ%f*1*jSH(3L-03ZjK_HM<+EFzx;uIM?S8{}d$3z*qmgSp5ZdZOzJa!OEEH7llr
zy0j1(w6AFLeuLM3WY-X?@_leW8JRPA@AZboJnK!umJu&2OvytT=>HvcJ@S%
zp55Uh;jljDi8JS6%4uz347F@C9kCfRZyd+9bmKM>cJ7`~5Fe=u2N#634~MHgk38Eg
z?AW#)ky*+ryGvD?iwo!WXOOB;)*43b8)q~=#ppejG#cH|I3_O;pO2DM?)!+q9PD+{W@|~N~3Rfm!VgcvZtOH
zbQ}XSBv+&3}(hs?H$QY6{$%t0&Ez87_eTNa5ntxaJ_5V6)SNCA;cc0;%ccx?7v?*-7@2MFx@X?~B
z*m?0Llw2|V+786iFHT0AmRy-%1KV=`db;u7O_uW)8zSdh2qHV;4z0&;`%fXKsDzAt
zlV7RR5&4majNGE?F+j!!do{q1OUP__Kd$ky%Ie1V=|2G76zp_Zjq%Rht1Yc(ypy(Y
z4egFFR{?b8IXJU@1CGV!qs+80RHrUSQrrpbJ`&5erPQw{`i3!a5=u>)7NfqCsEV_2
zYTF7dnD;hj&NR&}Jo5~uzwoE8S|zE;Y9Wt^yMhZ$wbuIL)dcU8uKEBLiv{1Sx)l%HCnU?(t;debq}BrH7AZY|ntO!F@(w{KzB+U0ot`PcB~M@z6_({5Z$&4WCsJ070-
z0ha#o9o~6q26~3nTt3&ZwFlYttUWMNU)hTj$DFP_5101u#HE}MiZZxok!iV6Q^&ancis_&7FrWPLIK%y{7vQ
zokKz8e0m)el@uWMW)xymN+6WFv*U%iycII&^W$*-+*PEeGaJR3Zm>a3Isah2Ixh9%
zA>7Pi0-FMK81X2^^y&tg*ccz+i|rBIWddeA+82(MF^78kd@UU(Z)F`v1~QXmpP7<`
zQ#*dg>^I)Rj92Gl#Sees*tHmFor5rZ^7Ht7#rK%|!JC-aD+nSpe?(9Imz|k_n5Z}u
zXzk$X?Pt^z0se3isgZp15-vq1p;T4I`Z#sb82w(Bn}N%hE~8k(fy}cVCO-5SI(T`o
zvYbKc5r#429>s|6#BVhzsJi5Qv-LaQ9$klwM7LlMXbY2Zc>h5p=GHHtn~bbI2J{MH
z*QqeF@%JtJ5fz__5|b~$a_kv>E!*;O2qdI1-y|vUT
z!n_0|-Yy~{&I}@y@)9p1F_%0jPXKqn0GOZgnsMzc{`_-0E?i7OZf-d$Dk@M>T#Ur?
z$FOesQv9-$2yp`kjvnnXdSI~a38&0R{PgWYeER99?Dg%k<=A&AfeiYgcy8wFcy;Ws^*rREAvL*$6v#H@4bady?l{+b~hG(_8I&8Uwp9~5$OU9xPKZx{ODsmJEkoh
zY(}ulkQf_}e9Jac=OaEg8U;lD3+x=>>F!ZA;m2Yig4e(-C=J7~$dLrO|#t*-2M_Oet*9?)f96`Z>u;=KI
zlM;oqr*EU&>It?6=yh6{e}$uy9Kk{U5b9LOO}vKaJU&EX2_zEpONX?yB;*$nMPlaw
zPd9fkOI#@}Ls?k~G#cX~er5p{`F|DcG9|SJ#ih)6;6NynG8H