diff --git a/ChatQnA/chatqna.yaml b/ChatQnA/chatqna.yaml deleted file mode 100644 index 0344b28317..0000000000 --- a/ChatQnA/chatqna.yaml +++ /dev/null @@ -1,72 +0,0 @@ - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -opea_micro_services: - redis-vector-db: - host: ${REDIS_SERVICE_HOST_IP} - ports: - - "6379:6379" - - "8001:8001" - image: redis/redis-stack:7.2.0-v9 - dataprep-redis-service: - host: ${DATAPREP_SERVICE_HOST_IP} - ports: ${DATAPREP_SERVICE_PORT} - image: opea/dataprep-redis:latest - environment: - REDIS_URL: ${REDIS_URL} - INDEX_NAME: ${INDEX_NAME} - tei-embedding-service: - host: ${TEI_EMBEDDING_SERVICE_IP} - ports: ${TEI_EMBEDDING_SERVICE_PORT} - image: ghcr.io/huggingface/tei-gaudi:1.5.0 - volumes: - - "./data:/data" - runtime: habana - cap_add: - - SYS_NICE - ipc: host - environment: - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - model-id: ${EMBEDDING_MODEL_ID} - retrieval: - host: ${RETRIEVER_SERVICE_HOST_IP} - ports: ${RETRIEVER_SERVICE_PORT} - image: opea/retriever-redis:latest - endpoint: /v1/retrieval - tgi-service: - host: ${TGI_SERVICE_IP} - ports: ${TGI_SERVICE_PORT} - image: ghcr.io/huggingface/tgi-gaudi:2.0.6 - volumes: - - "./data:/data" - runtime: habana - cap_add: - - SYS_NICE - ipc: host - environment: - HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - ENABLE_HPU_GRAPH: true - LIMIT_HPU_GRAPH: true - USE_FLASH_ATTENTION: true - FLASH_ATTENTION_RECOMPUTE: true - model-id: ${LLM_MODEL_ID} - ui: - host: ${UI_SERVICE_HOST_IP} - ports: - - "5173:5173" - image: opea/chatqna-ui:latest - environment: - - CHAT_BASE_URL=${BACKEND_SERVICE_ENDPOINT} - - UPLOAD_FILE_BASE_URL=${DATAPREP_SERVICE_ENDPOINT} - -opea_mega_service: - host: ${MEGA_SERVICE_HOST_IP} - ports: ${MEGA_SERVICE_PORT} - image: opea/chatqna:latest - endpoint: /v1/chatqna - mega_flow: - - embedding >> retrieval >> reranking >> llm diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml index 9b4002b5bf..50e2f00591 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml @@ -97,7 +97,7 @@ services: cap_add: - SYS_NICE ipc: host - command: --enforce-eager --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 + command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 chatqna-gaudi-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} container_name: chatqna-gaudi-backend-server diff --git a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml index 715db8976f..eabff7f865 100644 --- a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml +++ b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml @@ -1286,7 +1286,6 @@ spec: type: RuntimeDefault image: "opea/vllm-gaudi:latest" args: - - "--enforce-eager" - "--model" - "$(MODEL_ID)" - "--tensor-parallel-size" diff --git a/ChatQnA/tests/test_compose_vllm_on_gaudi.sh b/ChatQnA/tests/test_compose_vllm_on_gaudi.sh index 3b1efa8547..263a17a0d5 100644 --- a/ChatQnA/tests/test_compose_vllm_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_vllm_on_gaudi.sh @@ -39,7 +39,7 @@ function start_services() { # Start 
Docker Containers docker compose -f compose_vllm.yaml up -d > ${LOG_PATH}/start_services_with_compose.log n=0 - until [[ "$n" -ge 100 ]]; do + until [[ "$n" -ge 160 ]]; do echo "n=$n" docker logs vllm-gaudi-server > vllm_service_start.log if grep -q "Warmup finished" vllm_service_start.log; then diff --git a/CodeGen/codegen.yaml b/CodeGen/codegen.yaml deleted file mode 100644 index 8dc864f6f6..0000000000 --- a/CodeGen/codegen.yaml +++ /dev/null @@ -1,48 +0,0 @@ - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -opea_micro_services: - tgi-service: - host: ${TGI_SERVICE_IP} - ports: ${TGI_SERVICE_PORT} - image: ghcr.io/huggingface/tgi-gaudi:2.0.6 - volumes: - - "./data:/data" - runtime: habana - cap_add: - - SYS_NICE - ipc: host - environment: - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - ENABLE_HPU_GRAPH: true - LIMIT_HPU_GRAPH: true - USE_FLASH_ATTENTION: true - FLASH_ATTENTION_RECOMPUTE: true - model-id: ${LLM_MODEL_ID} - llm: - host: ${LLM_SERVICE_HOST_IP} - ports: ${LLM_SERVICE_PORT} - image: opea/llm-tgi:latest - endpoint: /v1/chat/completions - environment: - TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - ui: - host: ${UI_SERVICE_HOST_IP} - ports: - - "5173:5173" - image: opea/codegen-ui:latest - environment: - - CHAT_BASE_URL=${BACKEND_SERVICE_ENDPOINT} - -opea_mega_service: - host: ${MEGA_SERVICE_HOST_IP} - ports: ${MEGA_SERVICE_PORT} - endpoint: /v1/codegen - image: opea/codegen:latest - mega_flow: - - llm diff --git a/CodeTrans/codetrans.yaml b/CodeTrans/codetrans.yaml deleted file mode 100644 index c362599788..0000000000 --- a/CodeTrans/codetrans.yaml +++ /dev/null @@ -1,48 +0,0 @@ - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -opea_micro_services: - tgi-service: - host: ${TGI_SERVICE_IP} - ports: ${TGI_SERVICE_PORT} - image: ghcr.io/huggingface/tgi-gaudi:2.0.6 - volumes: - - "./data:/data" - runtime: habana - cap_add: - - SYS_NICE - ipc: host - environment: - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - ENABLE_HPU_GRAPH: true - LIMIT_HPU_GRAPH: true - USE_FLASH_ATTENTION: true - FLASH_ATTENTION_RECOMPUTE: true - model-id: ${LLM_MODEL_ID} - llm: - host: ${LLM_SERVICE_HOST_IP} - ports: ${LLM_SERVICE_PORT} - image: opea/llm-tgi:latest - endpoint: /v1/chat/completions - environment: - TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - ui: - host: ${UI_SERVICE_HOST_IP} - ports: - - "5173:5173" - image: opea/codetrans-ui:latest - environment: - - CHAT_BASE_URL=${BACKEND_SERVICE_ENDPOINT} - -opea_mega_service: - host: ${MEGA_SERVICE_HOST_IP} - ports: ${MEGA_SERVICE_PORT} - endpoint: /v1/codetrans - image: opea/codetrans:latest - mega_flow: - - llm diff --git a/DocIndexRetriever/docker_compose/intel/cpu/xeon/README.md b/DocIndexRetriever/docker_compose/intel/cpu/xeon/README.md index 58354babfa..3ad27345cb 100644 --- a/DocIndexRetriever/docker_compose/intel/cpu/xeon/README.md +++ b/DocIndexRetriever/docker_compose/intel/cpu/xeon/README.md @@ -62,6 +62,17 @@ cd GenAIExamples/DocIndexRetriever/intel/cpu/xoen/ docker compose up -d ``` +Two types of DocRetriever pipeline are supported now: `DocRetriever with/without Rerank`. 
And the `DocRetriever without Rerank` pipeline (including Embedding and Retrieval) is offered for customers who expect to handle all retrieved documents by LLM, and require high performance of DocRetriever. +In that case, start Docker Containers with compose_without_rerank.yaml + +```bash +export host_ip="YOUR IP ADDR" +export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} +export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" +cd GenAIExamples/DocIndexRetriever/intel/cpu/xoen/ +docker compose -f compose_without_rerank.yaml up -d +``` + ## 4. Validation Add Knowledge Base via HTTP Links: diff --git a/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml b/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml new file mode 100644 index 0000000000..986fcb41af --- /dev/null +++ b/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml @@ -0,0 +1,102 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + redis-vector-db: + image: redis/redis-stack:7.2.0-v9 + container_name: redis-vector-db + ports: + - "6379:6379" + - "8001:8001" + dataprep-redis-service: + image: ${REGISTRY:-opea}/dataprep-redis:${TAG:-latest} + container_name: dataprep-redis-server + depends_on: + - redis-vector-db + ports: + - "6007:6007" + - "6008:6008" + - "6009:6009" + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + REDIS_URL: redis://redis-vector-db:6379 + REDIS_HOST: redis-vector-db + INDEX_NAME: ${INDEX_NAME:-rag-redis} + TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80 + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + tei-embedding-service: + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + container_name: tei-embedding-server + ports: + - "6006:80" + volumes: + - "/home/ligang/models:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate + embedding: + image: ${REGISTRY:-opea}/embedding-tei:${TAG:-latest} + container_name: embedding-tei-server + ports: + - "6000:6000" + ipc: host + depends_on: + - tei-embedding-service + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80 + restart: unless-stopped + retriever: + image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} + container_name: retriever-redis-server + depends_on: + - redis-vector-db + ports: + - "7000:7000" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + REDIS_URL: redis://redis-vector-db:6379 + INDEX_NAME: ${INDEX_NAME:-rag-redis} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80 + restart: unless-stopped + doc-index-retriever-server: + image: ${REGISTRY:-opea}/doc-index-retriever:${TAG:-latest} + container_name: doc-index-retriever-server + depends_on: + - redis-vector-db + - tei-embedding-service + - embedding + - retriever + ports: + - "8889:8889" + environment: + no_proxy: ${no_proxy} + https_proxy: ${https_proxy} + http_proxy: ${http_proxy} + MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP:-0.0.0.0} + EMBEDDING_SERVICE_HOST_IP: embedding + EMBEDDING_SERVICE_PORT: ${EMBEDDING_SERVER_PORT:-6000} + 
RETRIEVER_SERVICE_HOST_IP: retriever + LOGFLAG: ${LOGFLAG} + ipc: host + restart: always + command: --without-rerank + +networks: + default: + driver: bridge diff --git a/DocIndexRetriever/retrieval_tool.py b/DocIndexRetriever/retrieval_tool.py index b902b7a20e..9581612a50 100644 --- a/DocIndexRetriever/retrieval_tool.py +++ b/DocIndexRetriever/retrieval_tool.py @@ -1,6 +1,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import argparse import asyncio import os from typing import Union @@ -124,8 +125,37 @@ def start(self): output_datatype=Union[RerankedDoc, LLMParamsDoc], ) + def add_remote_service_without_rerank(self): + embedding = MicroService( + name="embedding", + host=EMBEDDING_SERVICE_HOST_IP, + port=EMBEDDING_SERVICE_PORT, + endpoint="/v1/embeddings", + use_remote_service=True, + service_type=ServiceType.EMBEDDING, + ) + retriever = MicroService( + name="retriever", + host=RETRIEVER_SERVICE_HOST_IP, + port=RETRIEVER_SERVICE_PORT, + endpoint="/v1/retrieval", + use_remote_service=True, + service_type=ServiceType.RETRIEVER, + ) + + self.megaservice.add(embedding).add(retriever) + self.megaservice.flow_to(embedding, retriever) + if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--without-rerank", action="store_true") + + args = parser.parse_args() + chatqna = RetrievalToolService(port=MEGA_SERVICE_PORT) - chatqna.add_remote_service() + if args.without_rerank: + chatqna.add_remote_service_without_rerank() + else: + chatqna.add_remote_service() chatqna.start() diff --git a/DocIndexRetriever/tests/test_compose_without_rerank_on_xeon.sh b/DocIndexRetriever/tests/test_compose_without_rerank_on_xeon.sh new file mode 100644 index 0000000000..0298a8a55b --- /dev/null +++ b/DocIndexRetriever/tests/test_compose_without_rerank_on_xeon.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -e +IMAGE_REPO=${IMAGE_REPO:-"opea"} +IMAGE_TAG=${IMAGE_TAG:-"latest"} +echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" +echo "TAG=IMAGE_TAG=${IMAGE_TAG}" +export REGISTRY=${IMAGE_REPO} +export TAG=${IMAGE_TAG} + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + echo "Building Docker Images...." + cd $WORKPATH/docker_image_build + if [ ! -d "GenAIComps" ] ; then + echo "Cloning GenAIComps repository" + git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ + fi + service_list="dataprep-redis embedding-tei retriever-redis doc-index-retriever" + docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log + + docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + docker pull redis/redis-stack:7.2.0-v9 + docker images && sleep 1s + + echo "Docker images built!" +} + +function start_services() { + echo "Starting Docker Services...." 
+ cd $WORKPATH/docker_compose/intel/cpu/xeon + export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" + export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:6006" + export REDIS_URL="redis://${ip_address}:6379" + export INDEX_NAME="rag-redis" + export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} + export MEGA_SERVICE_HOST_IP=${ip_address} + export EMBEDDING_SERVICE_HOST_IP=${ip_address} + export RETRIEVER_SERVICE_HOST_IP=${ip_address} + + # Start Docker Containers + docker compose -f compose_without_rerank.yaml up -d + sleep 5m + echo "Docker services started!" +} + +function validate() { + local CONTENT="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected: $CONTENT." + echo 0 + else + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + echo 1 + fi +} + +function validate_megaservice() { + echo "===========Ingest data==================" + local CONTENT=$(http_proxy="" curl -X POST "http://${ip_address}:6007/v1/dataprep" \ + -H "Content-Type: multipart/form-data" \ + -F 'link_list=["https://opea.dev/"]') + local EXIT_CODE=$(validate "$CONTENT" "Data preparation succeeded" "dataprep-redis-service-xeon") + echo "$EXIT_CODE" + local EXIT_CODE="${EXIT_CODE:0-1}" + echo "return value is $EXIT_CODE" + if [ "$EXIT_CODE" == "1" ]; then + docker logs dataprep-redis-server | tee -a ${LOG_PATH}/dataprep-redis-service-xeon.log + return 1 + fi + + # Curl the Mega Service + echo "================Testing retriever service: Text Request ================" + cd $WORKPATH/tests + local CONTENT=$(http_proxy="" curl http://${ip_address}:8889/v1/retrievaltool -X POST -H "Content-Type: application/json" -d '{ + "text": "Explain the OPEA project?" 
+ }') + # local CONTENT=$(python test.py --host_ip ${ip_address} --request_type text) + local EXIT_CODE=$(validate "$CONTENT" "OPEA" "doc-index-retriever-service-xeon") + echo "$EXIT_CODE" + local EXIT_CODE="${EXIT_CODE:0-1}" + echo "return value is $EXIT_CODE" + if [ "$EXIT_CODE" == "1" ]; then + echo "=============Embedding container log==================" + docker logs embedding-tei-server | tee -a ${LOG_PATH}/doc-index-retriever-service-xeon.log + echo "=============Retriever container log==================" + docker logs retriever-redis-server | tee -a ${LOG_PATH}/doc-index-retriever-service-xeon.log + echo "=============Doc-index-retriever container log==================" + docker logs doc-index-retriever-server | tee -a ${LOG_PATH}/doc-index-retriever-service-xeon.log + exit 1 + fi + + echo "================Testing retriever service: ChatCompletion Request================" + cd $WORKPATH/tests + local CONTENT=$(python test.py --host_ip ${ip_address} --request_type chat_completion) + local EXIT_CODE=$(validate "$CONTENT" "OPEA" "doc-index-retriever-service-xeon") + echo "$EXIT_CODE" + local EXIT_CODE="${EXIT_CODE:0-1}" + echo "return value is $EXIT_CODE" + if [ "$EXIT_CODE" == "1" ]; then + echo "=============Embedding container log==================" + docker logs embedding-tei-server | tee -a ${LOG_PATH}/doc-index-retriever-service-xeon.log + echo "=============Retriever container log==================" + docker logs retriever-redis-server | tee -a ${LOG_PATH}/doc-index-retriever-service-xeon.log + echo "=============Doc-index-retriever container log==================" + docker logs doc-index-retriever-server | tee -a ${LOG_PATH}/doc-index-retriever-service-xeon.log + exit 1 + fi +} + +function stop_docker() { + cd $WORKPATH/docker_compose/intel/cpu/xeon + container_list=$(cat compose.yaml | grep container_name | cut -d':' -f2) + for container_name in $container_list; do + cid=$(docker ps -aq --filter "name=$container_name") + echo "Stopping container $container_name" + if [[ ! -z "$cid" ]]; then docker rm $cid -f && sleep 1s; fi + done +} + +function main() { + + stop_docker + build_docker_images + echo "Dump current docker ps" + docker ps + start_time=$(date +%s) + start_services + end_time=$(date +%s) + duration=$((end_time-start_time)) + echo "Mega service start duration is $duration s" + validate_megaservice + + stop_docker + echo y | docker system prune + +} + +main diff --git a/DocSum/docker_compose/intel/cpu/xeon/README.md b/DocSum/docker_compose/intel/cpu/xeon/README.md index 82cbcf841b..212f5693dc 100644 --- a/DocSum/docker_compose/intel/cpu/xeon/README.md +++ b/DocSum/docker_compose/intel/cpu/xeon/README.md @@ -67,22 +67,22 @@ docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build- Several UI options are provided. If you need to work with multimedia documents, .doc, or .pdf files, suggested to use Gradio UI. -#### Svelte UI +#### Gradio UI -Build the frontend Docker image via below command: +Build the Gradio UI frontend Docker image using the following command: ```bash cd GenAIExamples/DocSum/ui -docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile . +docker build -t opea/docsum-gradio-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile.gradio . 
``` -#### Gradio UI +#### Svelte UI -Build the Gradio UI frontend Docker image using the following command: +Build the frontend Docker image via below command: ```bash cd GenAIExamples/DocSum/ui -docker build -t opea/docsum-gradio-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile.gradio . +docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile . ``` #### React UI diff --git a/DocSum/docker_compose/intel/cpu/xeon/compose.yaml b/DocSum/docker_compose/intel/cpu/xeon/compose.yaml index 72332a9012..170cdc79b8 100644 --- a/DocSum/docker_compose/intel/cpu/xeon/compose.yaml +++ b/DocSum/docker_compose/intel/cpu/xeon/compose.yaml @@ -95,8 +95,8 @@ services: ipc: host restart: always - docsum-ui: - image: ${REGISTRY:-opea}/docsum-ui:${TAG:-latest} + docsum-gradio-ui: + image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest} container_name: docsum-xeon-ui-server depends_on: - docsum-xeon-backend-server diff --git a/DocSum/docker_compose/intel/hpu/gaudi/README.md b/DocSum/docker_compose/intel/hpu/gaudi/README.md index 172f24d67a..f9f3d6af58 100644 --- a/DocSum/docker_compose/intel/hpu/gaudi/README.md +++ b/DocSum/docker_compose/intel/hpu/gaudi/README.md @@ -51,22 +51,22 @@ docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build- Several UI options are provided. If you need to work with multimedia documents, .doc, or .pdf files, suggested to use Gradio UI. -#### Svelte UI +#### Gradio UI -Build the frontend Docker image via below command: +Build the Gradio UI frontend Docker image using the following command: ```bash cd GenAIExamples/DocSum/ui -docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile . +docker build -t opea/docsum-gradio-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile.gradio . ``` -#### Gradio UI +#### Svelte UI -Build the Gradio UI frontend Docker image using the following command: +Build the frontend Docker image via below command: ```bash cd GenAIExamples/DocSum/ui -docker build -t opea/docsum-gradio-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile.gradio . +docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile . ``` #### React UI diff --git a/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml b/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml index 39bb3d4777..2e8211fdf3 100644 --- a/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml @@ -108,8 +108,8 @@ services: ipc: host restart: always - docsum-ui: - image: ${REGISTRY:-opea}/docsum-ui:${TAG:-latest} + docsum-gradio-ui: + image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest} container_name: docsum-gaudi-ui-server depends_on: - docsum-gaudi-backend-server diff --git a/DocSum/tests/test_compose_on_gaudi.sh b/DocSum/tests/test_compose_on_gaudi.sh index 7b9ff49260..b26bb7eab9 100644 --- a/DocSum/tests/test_compose_on_gaudi.sh +++ b/DocSum/tests/test_compose_on_gaudi.sh @@ -46,7 +46,7 @@ function build_docker_images() { git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="docsum docsum-ui whisper dataprep-multimedia2text dataprep-audio2text dataprep-video2audio llm-docsum-tgi" + service_list="docsum docsum-gradio-ui whisper dataprep-multimedia2text dataprep-audio2text dataprep-video2audio llm-docsum-tgi" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 diff --git a/DocSum/tests/test_compose_on_xeon.sh b/DocSum/tests/test_compose_on_xeon.sh index 60da44a7eb..2b32f4438a 100644 --- a/DocSum/tests/test_compose_on_xeon.sh +++ b/DocSum/tests/test_compose_on_xeon.sh @@ -45,7 +45,7 @@ function build_docker_images() { git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="docsum docsum-ui whisper dataprep-multimedia2text dataprep-audio2text dataprep-video2audio llm-docsum-tgi" + service_list="docsum docsum-gradio-ui whisper dataprep-multimedia2text dataprep-audio2text dataprep-video2audio llm-docsum-tgi" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/text-generation-inference:1.4 diff --git a/FaqGen/faqgen.yaml b/FaqGen/faqgen.yaml deleted file mode 100644 index 5b924a38eb..0000000000 --- a/FaqGen/faqgen.yaml +++ /dev/null @@ -1,47 +0,0 @@ - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -opea_micro_services: - tgi-service: - host: ${TGI_SERVICE_IP} - ports: ${TGI_SERVICE_PORT} - image: ghcr.io/huggingface/tgi-gaudi:2.0.6 - volumes: - - "./data:/data" - runtime: habana - cap_add: - - SYS_NICE - ipc: host - environment: - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - ENABLE_HPU_GRAPH: true - LIMIT_HPU_GRAPH: true - USE_FLASH_ATTENTION: true - FLASH_ATTENTION_RECOMPUTE: true - model-id: ${LLM_MODEL_ID} - llm: - host: ${LLM_SERVICE_HOST_IP} - ports: ${LLM_SERVICE_PORT} - image: opea/llm-tgi:latest - endpoint: /v1/chat/completions - environment: - TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - ui: - host: ${UI_SERVICE_HOST_IP} - ports: - - "5173:5173" - image: opea/faqgen-ui:latest - environment: - - CHAT_BASE_URL=${BACKEND_SERVICE_ENDPOINT} - -opea_mega_service: - host: ${MEGA_SERVICE_HOST_IP} - ports: ${MEGA_SERVICE_PORT} - image: opea/faqgen:latest - endpoint: /v1/faqgen - mega_flow: - - llm diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md b/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md index d0a1c7d279..dcd145bbd7 100644 --- a/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md +++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md @@ -78,6 +78,9 @@ export https_proxy=${your_http_proxy} export EMBEDDER_PORT=6006 export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMBEDDER_PORT/v1/encode" export MM_EMBEDDING_PORT_MICROSERVICE=6000 +export ASR_ENDPOINT=http://$host_ip:7066 +export ASR_SERVICE_PORT=3001 +export ASR_SERVICE_ENDPOINT="http://${host_ip}:${ASR_SERVICE_PORT}/v1/audio/transcriptions" export REDIS_URL="redis://${host_ip}:6379" export REDIS_HOST=${host_ip} export INDEX_NAME="mm-rag-redis" @@ -144,7 +147,21 @@ docker build --no-cache -t opea/lvm-llava-svc:latest --build-arg https_proxy=$ht docker build --no-cache -t opea/dataprep-multimodal-redis:latest --build-arg https_proxy=$https_proxy 
--build-arg http_proxy=$http_proxy -f comps/dataprep/multimodal/redis/langchain/Dockerfile . ``` -### 5. Build MegaService Docker Image +### 5. Build asr images + +Build whisper server image + +```bash +docker build --no-cache -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/dependency/Dockerfile . +``` + +Build asr image + +```bash +docker build --no-cache -t opea/asr:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile . +``` + +### 6. Build MegaService Docker Image To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the [multimodalqna.py](../../../../multimodalqna.py) Python script. Build MegaService Docker image via below command: @@ -155,7 +172,7 @@ docker build --no-cache -t opea/multimodalqna:latest --build-arg https_proxy=$ht cd ../.. ``` -### 6. Build UI Docker Image +### 7. Build UI Docker Image Build frontend Docker image via below command: @@ -165,16 +182,19 @@ docker build --no-cache -t opea/multimodalqna-ui:latest --build-arg https_proxy= cd ../../../ ``` -Then run the command `docker images`, you will have the following 8 Docker Images: +Then run the command `docker images`, you will have the following 11 Docker Images: 1. `opea/dataprep-multimodal-redis:latest` 2. `opea/lvm-llava-svc:latest` 3. `opea/lvm-llava:latest` 4. `opea/retriever-multimodal-redis:latest` -5. `opea/embedding-multimodal:latest` -6. `opea/embedding-multimodal-bridgetower:latest` -7. `opea/multimodalqna:latest` -8. `opea/multimodalqna-ui:latest` +5. `opea/whisper:latest` +6. `opea/asr:latest` +7. `opea/redis-vector-db` +8. `opea/embedding-multimodal:latest` +9. `opea/embedding-multimodal-bridgetower:latest` +10. `opea/multimodalqna:latest` +11. `opea/multimodalqna-ui:latest` ## 🚀 Start Microservices @@ -240,7 +260,16 @@ curl http://${host_ip}:7000/v1/multimodal_retrieval \ -d "{\"text\":\"test\",\"embedding\":${your_embedding}}" ``` -4. lvm-llava +4. asr + +```bash +curl ${ASR_SERVICE_ENDPOINT} \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"byte_str" : "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' +``` + +5. lvm-llava ```bash curl http://${host_ip}:${LLAVA_SERVER_PORT}/generate \ @@ -249,7 +278,7 @@ curl http://${host_ip}:${LLAVA_SERVER_PORT}/generate \ -d '{"prompt":"Describe the image please.", "img_b64_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}' ``` -5. lvm-llava-svc +6. lvm-llava-svc ```bash curl http://${host_ip}:9399/v1/lvm \ @@ -274,7 +303,7 @@ curl http://${host_ip}:9399/v1/lvm \ -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}' ``` -6. dataprep-multimodal-redis +7. dataprep-multimodal-redis Download a sample video, image, and audio file and create a caption @@ -348,7 +377,7 @@ curl -X POST \ ${DATAPREP_DELETE_FILE_ENDPOINT} ``` -7. MegaService +8. 
MegaService ```bash curl http://${host_ip}:8888/v1/multimodalqna \ @@ -357,6 +386,12 @@ curl http://${host_ip}:8888/v1/multimodalqna \ -d '{"messages": "What is the revenue of Nike in 2023?"}' ``` +```bash +curl http://${host_ip}:8888/v1/multimodalqna \ + -H "Content-Type: application/json" \ + -d '{"messages": [{"role": "user", "content": [{"type": "audio", "audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}]}]}' +``` + ```bash curl http://${host_ip}:8888/v1/multimodalqna \ -H "Content-Type: application/json" \ diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml b/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml index eece99da85..4f9900db72 100644 --- a/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml +++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml @@ -2,6 +2,27 @@ # SPDX-License-Identifier: Apache-2.0 services: + whisper-service: + image: ${REGISTRY:-opea}/whisper:${TAG:-latest} + container_name: whisper-service + ports: + - "7066:7066" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + restart: unless-stopped + asr: + image: ${REGISTRY:-opea}/asr:${TAG:-latest} + container_name: asr-service + ports: + - "${ASR_SERVICE_PORT}:9099" + ipc: host + environment: + ASR_ENDPOINT: ${ASR_ENDPOINT} + ASR_SERVICE_PORT: ${ASR_SERVICE_PORT} + ASR_SERVICE_ENDPOINT: ${ASR_SERVICE_ENDPOINT} redis-vector-db: image: redis/redis-stack:7.2.0-v9 container_name: redis-vector-db @@ -102,6 +123,7 @@ services: - embedding-multimodal - retriever-multimodal-redis - lvm-llava-svc + - asr ports: - "8888:8888" environment: @@ -113,6 +135,8 @@ services: MM_EMBEDDING_PORT_MICROSERVICE: ${MM_EMBEDDING_PORT_MICROSERVICE} MM_RETRIEVER_SERVICE_HOST_IP: ${MM_RETRIEVER_SERVICE_HOST_IP} LVM_SERVICE_HOST_IP: ${LVM_SERVICE_HOST_IP} + ASR_SERVICE_PORT: ${ASR_SERVICE_PORT} + ASR_SERVICE_ENDPOINT: ${ASR_SERVICE_ENDPOINT} ipc: host restart: always multimodalqna-ui: diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh b/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh index c77300a132..b9c9e3d5ea 100755 --- a/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh +++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh @@ -12,6 +12,9 @@ export https_proxy=${your_http_proxy} export EMBEDDER_PORT=6006 export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMBEDDER_PORT/v1/encode" export MM_EMBEDDING_PORT_MICROSERVICE=6000 +export ASR_ENDPOINT=http://$host_ip:7066 +export ASR_SERVICE_PORT=3001 +export ASR_SERVICE_ENDPOINT="http://${host_ip}:${ASR_SERVICE_PORT}/v1/audio/transcriptions" export REDIS_URL="redis://${host_ip}:6379" export REDIS_HOST=${host_ip} export INDEX_NAME="mm-rag-redis" diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md b/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md index 9e7db70b79..67e95cb51b 100644 --- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md +++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md @@ -37,6 +37,9 @@ export LVM_MODEL_ID="llava-hf/llava-v1.6-vicuna-13b-hf" export WHISPER_MODEL="base" export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip} export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip} +export ASR_ENDPOINT=http://$host_ip:7066 +export ASR_SERVICE_PORT=3001 +export ASR_SERVICE_ENDPOINT="http://${host_ip}:${ASR_SERVICE_PORT}/v1/audio/transcriptions" export LVM_SERVICE_HOST_IP=${host_ip} export MEGA_SERVICE_HOST_IP=${host_ip} export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/multimodalqna" @@ -95,7 +98,21 
@@ docker build --no-cache -t opea/lvm-tgi:latest --build-arg https_proxy=$https_pr docker build --no-cache -t opea/dataprep-multimodal-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimodal/redis/langchain/Dockerfile . ``` -### 5. Build MegaService Docker Image +### 5. Build asr images + +Build whisper server image + +```bash +docker build --no-cache -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/dependency/Dockerfile . +``` + +Build asr image + +```bash +docker build --no-cache -t opea/asr:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile . +``` + +### 6. Build MegaService Docker Image To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the [multimodalqna.py](../../../../multimodalqna.py) Python script. Build MegaService Docker image via below command: @@ -114,16 +131,19 @@ cd GenAIExamples/MultimodalQnA/ui/ docker build --no-cache -t opea/multimodalqna-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile . ``` -Then run the command `docker images`, you will have the following 8 Docker Images: +Then run the command `docker images`, you will have the following 11 Docker Images: 1. `opea/dataprep-multimodal-redis:latest` 2. `opea/lvm-tgi:latest` 3. `ghcr.io/huggingface/tgi-gaudi:2.0.6` 4. `opea/retriever-multimodal-redis:latest` -5. `opea/embedding-multimodal:latest` -6. `opea/embedding-multimodal-bridgetower:latest` -7. `opea/multimodalqna:latest` -8. `opea/multimodalqna-ui:latest` +5. `opea/whisper:latest` +6. `opea/asr:latest` +7. `opea/redis-vector-db` +8. `opea/embedding-multimodal:latest` +9. `opea/embedding-multimodal-bridgetower:latest` +10. `opea/multimodalqna:latest` +11. `opea/multimodalqna-ui:latest` ## 🚀 Start Microservices @@ -189,7 +209,16 @@ curl http://${host_ip}:7000/v1/multimodal_retrieval \ -d "{\"text\":\"test\",\"embedding\":${your_embedding}}" ``` -4. TGI LLaVA Gaudi Server +4. asr + +```bash +curl ${ASR_SERVICE_ENDPOINT} \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"byte_str" : "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' +``` + +5. TGI LLaVA Gaudi Server ```bash curl http://${host_ip}:${LLAVA_SERVER_PORT}/generate \ @@ -198,7 +227,7 @@ curl http://${host_ip}:${LLAVA_SERVER_PORT}/generate \ -H 'Content-Type: application/json' ``` -5. lvm-tgi +6. lvm-tgi ```bash curl http://${host_ip}:9399/v1/lvm \ @@ -223,7 +252,7 @@ curl http://${host_ip}:9399/v1/lvm \ -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}' ``` -6. Multimodal Dataprep Microservice +7. Multimodal Dataprep Microservice Download a sample video, image, and audio file and create a caption @@ -297,7 +326,7 @@ curl -X POST \ ${DATAPREP_DELETE_FILE_ENDPOINT} ``` -7. MegaService +8. 
MegaService ```bash curl http://${host_ip}:8888/v1/multimodalqna \ diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml index ddaf2b09d1..87a6821c77 100644 --- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml @@ -8,6 +8,27 @@ services: ports: - "6379:6379" - "8001:8001" + whisper-service: + image: ${REGISTRY:-opea}/whisper:${TAG:-latest} + container_name: whisper-service + ports: + - "7066:7066" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + restart: unless-stopped + asr: + image: ${REGISTRY:-opea}/asr:${TAG:-latest} + container_name: asr-service + ports: + - "${ASR_SERVICE_PORT}:9099" + ipc: host + environment: + ASR_ENDPOINT: ${ASR_ENDPOINT} + ASR_SERVICE_PORT: ${ASR_SERVICE_PORT} + ASR_SERVICE_ENDPOINT: ${ASR_SERVICE_ENDPOINT} dataprep-multimodal-redis: image: ${REGISTRY:-opea}/dataprep-multimodal-redis:${TAG:-latest} container_name: dataprep-multimodal-redis @@ -119,6 +140,7 @@ services: - embedding-multimodal - retriever-multimodal-redis - lvm-tgi + - asr ports: - "8888:8888" environment: @@ -130,6 +152,8 @@ services: MM_EMBEDDING_PORT_MICROSERVICE: ${MM_EMBEDDING_PORT_MICROSERVICE} MM_RETRIEVER_SERVICE_HOST_IP: ${MM_RETRIEVER_SERVICE_HOST_IP} LVM_SERVICE_HOST_IP: ${LVM_SERVICE_HOST_IP} + ASR_SERVICE_PORT: ${ASR_SERVICE_PORT} + ASR_SERVICE_ENDPOINT: ${ASR_SERVICE_ENDPOINT} ipc: host restart: always multimodalqna-ui: diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh b/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh index aac8ebdf86..9ca2d6d9fc 100755 --- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh +++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh @@ -12,6 +12,9 @@ export https_proxy=${your_http_proxy} export EMBEDDER_PORT=6006 export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMBEDDER_PORT/v1/encode" export MM_EMBEDDING_PORT_MICROSERVICE=6000 +export ASR_ENDPOINT=http://$host_ip:7066 +export ASR_SERVICE_PORT=3001 +export ASR_SERVICE_ENDPOINT="http://${host_ip}:${ASR_SERVICE_PORT}/v1/audio/transcriptions" export REDIS_URL="redis://${host_ip}:6379" export REDIS_HOST=${host_ip} export INDEX_NAME="mm-rag-redis" diff --git a/MultimodalQnA/docker_image_build/build.yaml b/MultimodalQnA/docker_image_build/build.yaml index 5b19aeffd9..0efa6a37e3 100644 --- a/MultimodalQnA/docker_image_build/build.yaml +++ b/MultimodalQnA/docker_image_build/build.yaml @@ -59,3 +59,15 @@ services: dockerfile: comps/dataprep/multimodal/redis/langchain/Dockerfile extends: multimodalqna image: ${REGISTRY:-opea}/dataprep-multimodal-redis:${TAG:-latest} + whisper: + build: + context: GenAIComps + dockerfile: comps/asr/whisper/dependency/Dockerfile + extends: multimodalqna + image: ${REGISTRY:-opea}/whisper:${TAG:-latest} + asr: + build: + context: GenAIComps + dockerfile: comps/asr/whisper/Dockerfile + extends: multimodalqna + image: ${REGISTRY:-opea}/asr:${TAG:-latest} diff --git a/MultimodalQnA/multimodalqna.py b/MultimodalQnA/multimodalqna.py index bf9b375749..87565a5b8a 100644 --- a/MultimodalQnA/multimodalqna.py +++ b/MultimodalQnA/multimodalqna.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import base64 +import json import os from io import BytesIO @@ -16,7 +17,7 @@ ) from comps.cores.proto.docarray import LLMParams from fastapi import Request -from fastapi.responses import StreamingResponse +from fastapi.responses import JSONResponse, 
StreamingResponse from PIL import Image MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 8888)) @@ -29,6 +30,9 @@ class MultimodalQnAService(Gateway): + asr_port = int(os.getenv("ASR_SERVICE_PORT", 3001)) + asr_endpoint = os.getenv("ASR_SERVICE_ENDPOINT", "http://0.0.0.0:{}/v1/audio/transcriptions".format(asr_port)) + def __init__(self, host="0.0.0.0", port=8000): self.host = host self.port = port @@ -73,7 +77,10 @@ def add_remote_service(self): # this overrides _handle_message method of Gateway def _handle_message(self, messages): images = [] + audios = [] + b64_types = {} messages_dicts = [] + decoded_audio_input = "" if isinstance(messages, str): prompt = messages else: @@ -87,16 +94,26 @@ def _handle_message(self, messages): system_prompt = message["content"] elif msg_role == "user": if type(message["content"]) == list: + # separate each media type and store accordingly text = "" text_list = [item["text"] for item in message["content"] if item["type"] == "text"] text += "\n".join(text_list) image_list = [ item["image_url"]["url"] for item in message["content"] if item["type"] == "image_url" ] - if image_list: - messages_dict[msg_role] = (text, image_list) - else: + audios = [item["audio"] for item in message["content"] if item["type"] == "audio"] + if audios: + # translate audio to text. From this point forward, audio is treated like text + decoded_audio_input = self.convert_audio_to_text(audios) + b64_types["audio"] = decoded_audio_input + + if text and not audios and not image_list: messages_dict[msg_role] = text + elif audios and not text and not image_list: + messages_dict[msg_role] = decoded_audio_input + else: + messages_dict[msg_role] = (text, decoded_audio_input, image_list) + else: messages_dict[msg_role] = message["content"] messages_dicts.append(messages_dict) @@ -108,55 +125,84 @@ def _handle_message(self, messages): if system_prompt: prompt = system_prompt + "\n" - for messages_dict in messages_dicts: - for i, (role, message) in enumerate(messages_dict.items()): + for i, messages_dict in enumerate(messages_dicts): + for role, message in messages_dict.items(): if isinstance(message, tuple): - text, image_list = message + text, decoded_audio_input, image_list = message if i == 0: # do not add role for the very first message. 
# this will be added by llava_server if text: prompt += text + "\n" + elif decoded_audio_input: + prompt += decoded_audio_input + "\n" else: if text: prompt += role.upper() + ": " + text + "\n" + elif decoded_audio_input: + prompt += role.upper() + ": " + decoded_audio_input + "\n" else: prompt += role.upper() + ":" - for img in image_list: - # URL - if img.startswith("http://") or img.startswith("https://"): - response = requests.get(img) - image = Image.open(BytesIO(response.content)).convert("RGBA") - image_bytes = BytesIO() - image.save(image_bytes, format="PNG") - img_b64_str = base64.b64encode(image_bytes.getvalue()).decode() - # Local Path - elif os.path.exists(img): - image = Image.open(img).convert("RGBA") - image_bytes = BytesIO() - image.save(image_bytes, format="PNG") - img_b64_str = base64.b64encode(image_bytes.getvalue()).decode() - # Bytes - else: - img_b64_str = img - images.append(img_b64_str) - else: + if image_list: + for img in image_list: + # URL + if img.startswith("http://") or img.startswith("https://"): + response = requests.get(img) + image = Image.open(BytesIO(response.content)).convert("RGBA") + image_bytes = BytesIO() + image.save(image_bytes, format="PNG") + img_b64_str = base64.b64encode(image_bytes.getvalue()).decode() + # Local Path + elif os.path.exists(img): + image = Image.open(img).convert("RGBA") + image_bytes = BytesIO() + image.save(image_bytes, format="PNG") + img_b64_str = base64.b64encode(image_bytes.getvalue()).decode() + # Bytes + else: + img_b64_str = img + + images.append(img_b64_str) + + elif isinstance(message, str): if i == 0: # do not add role for the very first message. # this will be added by llava_server if message: - prompt += role.upper() + ": " + message + "\n" + prompt += message + "\n" else: if message: prompt += role.upper() + ": " + message + "\n" else: prompt += role.upper() + ":" + if images: - return prompt, images + b64_types["image"] = images + + # If the query has multiple media types, return all types + if prompt and b64_types: + return prompt, b64_types else: return prompt + def convert_audio_to_text(self, audio): + # translate audio to text by passing in base64 encoded audio to ASR + if isinstance(audio, dict): + input_dict = {"byte_str": audio["audio"][0]} + else: + input_dict = {"byte_str": audio[0]} + + response = requests.post(self.asr_endpoint, data=json.dumps(input_dict)) + + if response.status_code != 200: + return JSONResponse( + status_code=503, content={"message": "Unable to convert audio to text. {}".format(response.text)} + ) + + response = response.json() + return response["query"] + async def handle_request(self, request: Request): data = await request.json() stream_opt = bool(data.get("stream", False)) @@ -165,16 +211,35 @@ async def handle_request(self, request: Request): stream_opt = False chat_request = ChatCompletionRequest.model_validate(data) # Multimodal RAG QnA With Videos has not yet accepts image as input during QnA. - prompt_and_image = self._handle_message(chat_request.messages) - if isinstance(prompt_and_image, tuple): - # print(f"This request include image, thus it is a follow-up query. 
Using lvm megaservice") - prompt, images = prompt_and_image + num_messages = len(data["messages"]) if isinstance(data["messages"], list) else 1 + messages = self._handle_message(chat_request.messages) + decoded_audio_input = "" + + if num_messages > 1: + # This is a follow up query, go to LVM cur_megaservice = self.lvm_megaservice - initial_inputs = {"prompt": prompt, "image": images[0]} + if isinstance(messages, tuple): + prompt, b64_types = messages + if "audio" in b64_types: + # for metadata storage purposes + decoded_audio_input = b64_types["audio"] + if "image" in b64_types: + initial_inputs = {"prompt": prompt, "image": b64_types["image"][0]} + else: + initial_inputs = {"prompt": prompt, "image": ""} + else: + prompt = messages + initial_inputs = {"prompt": prompt, "image": ""} else: - # print(f"This is the first query, requiring multimodal retrieval. Using multimodal rag megaservice") - prompt = prompt_and_image + # This is the first query. Ignore image input cur_megaservice = self.megaservice + if isinstance(messages, tuple): + prompt, b64_types = messages + if "audio" in b64_types: + # for metadata storage purposes + decoded_audio_input = b64_types["audio"] + else: + prompt = messages initial_inputs = {"text": prompt} parameters = LLMParams( @@ -207,18 +272,24 @@ async def handle_request(self, request: Request): if "text" in result_dict[last_node].keys(): response = result_dict[last_node]["text"] else: - # text in not response message + # text is not in response message # something wrong, for example due to empty retrieval results if "detail" in result_dict[last_node].keys(): response = result_dict[last_node]["detail"] else: - response = "The server fail to generate answer to your query!" + response = "The server failed to generate an answer to your query!" if "metadata" in result_dict[last_node].keys(): # from retrieval results metadata = result_dict[last_node]["metadata"] + if decoded_audio_input: + metadata["audio"] = decoded_audio_input else: # follow-up question, no retrieval - metadata = None + if decoded_audio_input: + metadata = {"audio": decoded_audio_input} + else: + metadata = None + choices = [] usage = UsageInfo() choices.append( diff --git a/MultimodalQnA/tests/test_compose_on_gaudi.sh b/MultimodalQnA/tests/test_compose_on_gaudi.sh index 5ac1228dbf..0c522dd08f 100644 --- a/MultimodalQnA/tests/test_compose_on_gaudi.sh +++ b/MultimodalQnA/tests/test_compose_on_gaudi.sh @@ -22,7 +22,7 @@ function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower embedding-multimodal retriever-multimodal-redis lvm-tgi dataprep-multimodal-redis" + service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower embedding-multimodal retriever-multimodal-redis lvm-tgi dataprep-multimodal-redis whisper asr" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 @@ -35,6 +35,9 @@ function setup_env() { export EMBEDDER_PORT=6006 export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMBEDDER_PORT/v1/encode" export MM_EMBEDDING_PORT_MICROSERVICE=6000 + export ASR_ENDPOINT=http://$host_ip:7066 + export ASR_SERVICE_PORT=3001 + export ASR_SERVICE_ENDPOINT="http://${host_ip}:${ASR_SERVICE_PORT}/v1/audio/transcriptions" export REDIS_URL="redis://${host_ip}:6379" export REDIS_HOST=${host_ip} export INDEX_NAME="mm-rag-redis" @@ -239,13 +242,29 @@ function validate_megaservice() { "multimodalqna-backend-server" \ '{"messages": "What is the revenue of Nike in 2023?"}' + echo "Validate megaservice with first audio query" + validate_service \ + "http://${host_ip}:8888/v1/multimodalqna" \ + '"time_of_frame_ms":' \ + "multimodalqna" \ + "multimodalqna-backend-server" \ + '{"messages": [{"role": "user", "content": [{"type": "audio", "audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}]}]}' + echo "Validate megaservice with follow-up query" validate_service \ "http://${host_ip}:8888/v1/multimodalqna" \ '"content":"' \ "multimodalqna" \ "multimodalqna-backend-server" \ - '{"messages": [{"role": "user", "content": [{"type": "text", "text": "hello, "}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}, {"role": "assistant", "content": "opea project! "}, {"role": "user", "content": "chao, "}], "max_tokens": 10}' + '{"messages": [{"role": "user", "content": [{"type": "audio", "audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}, {"role": "assistant", "content": "opea project! "}, {"role": "user", "content": [{"type": "text", "text": "goodbye"}]}]}' + + echo "Validate megaservice with multiple text queries" + validate_service \ + "http://${host_ip}:8888/v1/multimodalqna" \ + '"content":"' \ + "multimodalqna" \ + "multimodalqna-backend-server" \ + '{"messages": [{"role": "user", "content": [{"type": "text", "text": "hello, "}]}, {"role": "assistant", "content": "opea project! "}, {"role": "user", "content": [{"type": "text", "text": "goodbye"}]}]}' } diff --git a/MultimodalQnA/tests/test_compose_on_xeon.sh b/MultimodalQnA/tests/test_compose_on_xeon.sh index 7d3ab0faee..0475272206 100644 --- a/MultimodalQnA/tests/test_compose_on_xeon.sh +++ b/MultimodalQnA/tests/test_compose_on_xeon.sh @@ -21,9 +21,8 @@ export caption_fn="apple.txt" function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ - echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower embedding-multimodal retriever-multimodal-redis lvm-llava lvm-llava-svc dataprep-multimodal-redis" + service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower embedding-multimodal retriever-multimodal-redis lvm-llava lvm-llava-svc dataprep-multimodal-redis whisper asr" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker images && sleep 1m @@ -34,6 +33,9 @@ function setup_env() { export EMBEDDER_PORT=6006 export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMBEDDER_PORT/v1/encode" export MM_EMBEDDING_PORT_MICROSERVICE=6000 + export ASR_ENDPOINT=http://$host_ip:7066 + export ASR_SERVICE_PORT=3001 + export ASR_SERVICE_ENDPOINT="http://${host_ip}:${ASR_SERVICE_PORT}/v1/audio/transcriptions" export REDIS_URL="redis://${host_ip}:6379" export REDIS_HOST=${host_ip} export INDEX_NAME="mm-rag-redis" @@ -238,14 +240,29 @@ function validate_megaservice() { "multimodalqna-backend-server" \ '{"messages": "What is the revenue of Nike in 2023?"}' + echo "Validate megaservice with first audio query" + validate_service \ + "http://${host_ip}:8888/v1/multimodalqna" \ + '"time_of_frame_ms":' \ + "multimodalqna" \ + "multimodalqna-backend-server" \ + '{"messages": [{"role": "user", "content": [{"type": "audio", "audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}]}]}' + echo "Validate megaservice with follow-up query" validate_service \ "http://${host_ip}:8888/v1/multimodalqna" \ '"content":"' \ "multimodalqna" \ "multimodalqna-backend-server" \ - '{"messages": [{"role": "user", "content": [{"type": "text", "text": "hello, "}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}, {"role": "assistant", "content": "opea project! "}, {"role": "user", "content": "chao, "}], "max_tokens": 10}' + '{"messages": [{"role": "user", "content": [{"type": "audio", "audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}, {"role": "assistant", "content": "opea project! "}, {"role": "user", "content": [{"type": "text", "text": "goodbye"}]}]}' + echo "Validate megaservice with multiple text queries" + validate_service \ + "http://${host_ip}:8888/v1/multimodalqna" \ + '"content":"' \ + "multimodalqna" \ + "multimodalqna-backend-server" \ + '{"messages": [{"role": "user", "content": [{"type": "text", "text": "hello, "}]}, {"role": "assistant", "content": "opea project! "}, {"role": "user", "content": [{"type": "text", "text": "goodbye"}]}]}' } function validate_delete { diff --git a/MultimodalQnA/ui/gradio/conversation.py b/MultimodalQnA/ui/gradio/conversation.py index 3057e9879d..f080d7a154 100644 --- a/MultimodalQnA/ui/gradio/conversation.py +++ b/MultimodalQnA/ui/gradio/conversation.py @@ -5,7 +5,7 @@ from enum import Enum, auto from typing import List -from utils import get_b64_frame_from_timestamp +from utils import convert_audio_to_base64, get_b64_frame_from_timestamp class SeparatorStyle(Enum): @@ -31,6 +31,7 @@ class Conversation: skip_next: bool = False split_video: str = None image: str = None + audio_query_file: str = None def _template_caption(self): out = "" @@ -41,31 +42,32 @@ def _template_caption(self): def get_prompt(self): messages = self.messages if len(messages) > 1 and messages[1][1] is None: - # Need to do RAG. 
prompt is the query only - ret = messages[0][1] + # Need to do RAG. If the query is text, prompt is the query only + if self.audio_query_file: + ret = [{"role": "user", "content": [{"type": "audio", "audio": self.get_b64_audio_query()}]}] + else: + ret = messages[0][1] else: # No need to do RAG. Thus, prompt of chatcompletion format conv_dict = [] if self.sep_style == SeparatorStyle.SINGLE: for i, (role, message) in enumerate(messages): if message: - if i != 0: - dic = {"role": role, "content": message} + dic = {"role": role} + if self.audio_query_file: + content = [{"type": "audio", "audio": self.get_b64_audio_query()}] else: - dic = {"role": role} - if self.time_of_frame_ms and self.video_file: - content = [{"type": "text", "text": message}] - if self.base64_frame: - base64_frame = self.base64_frame - else: - base64_frame = get_b64_frame_from_timestamp(self.video_file, self.time_of_frame_ms) - self.base64_frame = base64_frame - if base64_frame is None: - base64_frame = "" - content.append({"type": "image_url", "image_url": {"url": base64_frame}}) - else: - content = message - dic["content"] = content + content = [{"type": "text", "text": message}] + if i == 0 and self.time_of_frame_ms and self.video_file: + base64_frame = ( + self.base64_frame + if self.base64_frame + else get_b64_frame_from_timestamp(self.video_file, self.time_of_frame_ms) + ) + if base64_frame is None: + base64_frame = "" + content.append({"type": "image_url", "image_url": {"url": base64_frame}}) + dic["content"] = content conv_dict.append(dic) else: raise ValueError(f"Invalid style: {self.sep_style}") @@ -83,6 +85,12 @@ def get_b64_image(self): b64_img = get_b64_frame_from_timestamp(video_file, time_of_frame_ms) return b64_img + def get_b64_audio_query(self): + b64_audio = None + if self.audio_query_file: + b64_audio = convert_audio_to_base64(self.audio_query_file) + return b64_audio + def to_gradio_chatbot(self): ret = [] for i, (role, msg) in enumerate(self.messages[self.offset :]): @@ -141,6 +149,7 @@ def dict(self): "base64_frame": self.base64_frame, "split_video": self.split_video, "image": self.image, + "audio_query_file": self.audio_query_file, } @@ -157,4 +166,5 @@ def dict(self): base64_frame=None, split_video=None, image=None, + audio_query_file=None, ) diff --git a/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py b/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py index ec6a033ca8..28d3534be5 100644 --- a/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py +++ b/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py @@ -16,6 +16,7 @@ from utils import build_logger, make_temp_image, moderation_msg, server_error_msg, split_video logger = build_logger("gradio_web_server", "gradio_web_server.log") +logflag = os.getenv("LOGFLAG", False) headers = {"Content-Type": "application/json"} @@ -50,21 +51,28 @@ def clear_history(state, request: gr.Request): if state.image and os.path.exists(state.image): os.remove(state.image) state = multimodalqna_conv.copy() - return (state, state.to_gradio_chatbot(), None, None, None) + (disable_btn,) * 1 + return (state, state.to_gradio_chatbot(), None, None, None, None) + (disable_btn,) * 1 -def add_text(state, text, request: gr.Request): +def add_text(state, text, audio, request: gr.Request): logger.info(f"add_text. ip: {request.client.host}. 
len: {len(text)}") - if len(text) <= 0: + if audio: + state.audio_query_file = audio + state.append_message(state.roles[0], "--input placeholder--") + state.append_message(state.roles[1], None) + state.skip_next = False + return (state, state.to_gradio_chatbot(), None, None) + (disable_btn,) * 1 + elif len(text) <= 0: state.skip_next = True - return (state, state.to_gradio_chatbot(), None) + (no_change_btn,) * 1 + return (state, state.to_gradio_chatbot(), None, None) + (no_change_btn,) * 1 text = text[:2000] # Hard cut-off state.append_message(state.roles[0], text) state.append_message(state.roles[1], None) state.skip_next = False - return (state, state.to_gradio_chatbot(), None) + (disable_btn,) * 1 + + return (state, state.to_gradio_chatbot(), None, None) + (disable_btn,) * 1 def http_bot(state, request: gr.Request): @@ -72,6 +80,7 @@ def http_bot(state, request: gr.Request): logger.info(f"http_bot. ip: {request.client.host}") url = gateway_addr is_very_first_query = False + is_audio_query = state.audio_query_file is not None if state.skip_next: # This generate call is skipped due to invalid inputs path_to_sub_videos = state.get_path_to_subvideos() @@ -84,13 +93,13 @@ def http_bot(state, request: gr.Request): new_state = multimodalqna_conv.copy() new_state.append_message(new_state.roles[0], state.messages[-2][1]) new_state.append_message(new_state.roles[1], None) + new_state.audio_query_file = state.audio_query_file state = new_state # Construct prompt prompt = state.get_prompt() # Make requests - pload = { "messages": prompt, } @@ -99,6 +108,7 @@ def http_bot(state, request: gr.Request): logger.info(f"==== url request ====\n{gateway_addr}") state.messages[-1][-1] = "▌" + yield (state, state.to_gradio_chatbot(), state.split_video, state.image) + (disable_btn,) * 1 try: @@ -108,8 +118,9 @@ def http_bot(state, request: gr.Request): json=pload, timeout=100, ) - print(response.status_code) - print(response.json()) + logger.info(response.status_code) + if logflag: + logger.info(response.json()) if response.status_code == 200: response = response.json() @@ -152,6 +163,11 @@ def http_bot(state, request: gr.Request): return state.messages[-1][-1] = message + + if is_audio_query: + state.messages[-2][-1] = metadata.get("audio", "--transcribed audio not available--") + state.audio_query_file = None + yield ( state, state.to_gradio_chatbot(), @@ -188,10 +204,11 @@ def ingest_gen_transcript(filepath, filetype, request: gr.Request): "files": open(dest, "rb"), } response = requests.post(dataprep_gen_transcript_addr, headers=headers, files=files) - print(response.status_code) + logger.info(response.status_code) if response.status_code == 200: response = response.json() - print(response) + if logflag: + logger.info(response) yield (gr.Textbox(visible=True, value=f"The {filetype} ingestion is done. Saving your uploaded {filetype}...")) time.sleep(2) fn_no_ext = Path(dest).stem @@ -242,10 +259,11 @@ def ingest_gen_caption(filepath, filetype, request: gr.Request): "files": open(dest, "rb"), } response = requests.post(dataprep_gen_caption_addr, headers=headers, files=files) - print(response.status_code) + logger.info(response.status_code) if response.status_code == 200: response = response.json() - print(response) + if logflag: + logger.info(response) yield (gr.Textbox(visible=True, value=f"The {filetype} ingestion is done. 
Saving your uploaded {filetype}...")) time.sleep(2) fn_no_ext = Path(dest).stem @@ -299,10 +317,11 @@ def ingest_with_text(filepath, text, request: gr.Request): response = requests.post(dataprep_ingest_addr, headers=headers, files=files) finally: os.remove(text_dest) - print(response.status_code) + logger.info(response.status_code) if response.status_code == 200: response = response.json() - print(response) + if logflag: + logger.info(response) yield (gr.Textbox(visible=True, value="Image ingestion is done. Saving your uploaded image...")) time.sleep(2) fn_no_ext = Path(dest).stem @@ -436,21 +455,26 @@ def select_upload_type(choice, request: gr.Request): with gr.Blocks() as qna: state = gr.State(multimodalqna_conv.copy()) with gr.Row(): - with gr.Column(scale=4): + with gr.Column(scale=2): video = gr.Video(height=512, width=512, elem_id="video", visible=True, label="Media") image = gr.Image(height=512, width=512, elem_id="image", visible=False, label="Media") - with gr.Column(scale=7): + with gr.Column(scale=9): chatbot = gr.Chatbot(elem_id="chatbot", label="MultimodalQnA Chatbot", height=390) with gr.Row(): - with gr.Column(scale=6): - # textbox.render() - textbox = gr.Textbox( - # show_label=False, - # container=False, - label="Query", - info="Enter a text query below", - # submit_btn=False, - ) + with gr.Column(scale=8): + with gr.Tabs(): + with gr.TabItem("Text Query"): + textbox = gr.Textbox( + show_label=False, + container=True, + ) + with gr.TabItem("Audio Query"): + audio = gr.Audio( + type="filepath", + sources=["microphone", "upload"], + show_label=False, + container=False, + ) with gr.Column(scale=1, min_width=100): with gr.Row(): submit_btn = gr.Button(value="Send", variant="primary", interactive=True) @@ -462,13 +486,13 @@ def select_upload_type(choice, request: gr.Request): [ state, ], - [state, chatbot, textbox, video, image, clear_btn], + [state, chatbot, textbox, audio, video, image, clear_btn], ) submit_btn.click( add_text, - [state, textbox], - [state, chatbot, textbox, clear_btn], + [state, textbox, audio], + [state, chatbot, textbox, audio, clear_btn], ).then( http_bot, [ diff --git a/MultimodalQnA/ui/gradio/utils.py b/MultimodalQnA/ui/gradio/utils.py index 7a730a7ed4..3d7be10118 100644 --- a/MultimodalQnA/ui/gradio/utils.py +++ b/MultimodalQnA/ui/gradio/utils.py @@ -163,7 +163,7 @@ def delete_split_video(video_path): def convert_img_to_base64(image): - "Convert image to base64 string" + """Convert image to base64 string.""" _, buffer = cv2.imencode(".png", image) encoded_string = base64.b64encode(buffer) return encoded_string.decode("utf-8") @@ -180,3 +180,9 @@ def get_b64_frame_from_timestamp(video_path, timestamp_in_ms, maintain_aspect_ra b64_img_str = convert_img_to_base64(frame) return b64_img_str return None + + +def convert_audio_to_base64(audio_path): + """Convert .wav file to base64 string.""" + encoded_string = base64.b64encode(open(audio_path, "rb").read()) + return encoded_string.decode("utf-8") diff --git a/Translation/translation.yaml b/Translation/translation.yaml deleted file mode 100644 index f3a07da966..0000000000 --- a/Translation/translation.yaml +++ /dev/null @@ -1,54 +0,0 @@ - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -opea_micro_services: - tgi-service: - host: ${TGI_SERVICE_IP} - ports: ${TGI_SERVICE_PORT} - image: ghcr.io/huggingface/tgi-gaudi:2.0.6 - volumes: - - "./data:/data" - runtime: habana - cap_add: - - SYS_NICE - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - 
https_proxy: ${https_proxy} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - ENABLE_HPU_GRAPH: true - LIMIT_HPU_GRAPH: true - USE_FLASH_ATTENTION: true - FLASH_ATTENTION_RECOMPUTE: true - model-id: ${LLM_MODEL_ID} - llm: - host: ${LLM_SERVICE_HOST_IP} - ports: ${LLM_SERVICE_PORT} - image: opea/llm-tgi:latest - endpoint: /v1/chat/completions - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - ui: - host: ${UI_SERVICE_HOST_IP} - ports: - - "5173:5173" - image: opea/translation-ui:latest - environment: - - CHAT_BASE_URL=${BACKEND_SERVICE_ENDPOINT} - -opea_mega_service: - host: ${MEGA_SERVICE_HOST_IP} - ports: ${MEGA_SERVICE_PORT} - image: opea/translation:latest - endpoint: /v1/translation - mega_flow: - - llm
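
Taken together, the MultimodalQnA changes above introduce audio as a first-class query type: the test script now builds the whisper/asr images and exports ASR_ENDPOINT and ASR_SERVICE_ENDPOINT, the Gradio UI gains an "Audio Query" tab whose recording is base64-encoded via the new convert_audio_to_base64() helper, and the megaservice accepts messages whose content includes {"type": "audio", "audio": "<base64 wav>"}. The sketch below is not part of the diff; it is a minimal client-side illustration of that message format, mirroring the payloads used in validate_megaservice above. The host IP, the 8888 gateway port, and the sample.wav path are assumptions taken from those test payloads and should be adjusted for your deployment.

```python
# Minimal client sketch (not part of this PR) exercising the new audio message
# format against the MultimodalQnA gateway. HOST_IP, the 8888 port, and
# "sample.wav" are assumptions based on the test payloads above.
import base64
import json

import requests

HOST_IP = "localhost"  # assumption: gateway reachable on this host
GATEWAY = f"http://{HOST_IP}:8888/v1/multimodalqna"


def wav_to_base64(audio_path: str) -> str:
    """Read a .wav file and return its base64-encoded contents, mirroring
    convert_audio_to_base64() added to MultimodalQnA/ui/gradio/utils.py."""
    with open(audio_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


def ask_with_audio(audio_path: str) -> dict:
    """Send a first-turn audio query; the payload shape follows the
    '{"type": "audio", "audio": ...}' messages used in validate_megaservice."""
    payload = {
        "messages": [
            {
                "role": "user",
                "content": [{"type": "audio", "audio": wav_to_base64(audio_path)}],
            }
        ]
    }
    response = requests.post(
        GATEWAY,
        headers={"Content-Type": "application/json"},
        data=json.dumps(payload),
        timeout=100,
    )
    response.raise_for_status()
    return response.json()


if __name__ == "__main__":
    # "sample.wav" is a placeholder; any short WAV recording of a question works.
    result = ask_with_audio("sample.wav")
    print(json.dumps(result, indent=2))
```

Per the validate_megaservice checks above, a successful first-turn audio response is expected to contain "time_of_frame_ms" in its JSON body, and the UI's http_bot replaces the on-screen audio placeholder with the transcription returned in the response metadata's "audio" field once is_audio_query is set.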