From 674867354a39501bf65cafebb350ef1b4f9bc469 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Wed, 22 Jan 2025 14:54:15 +0000 Subject: [PATCH] fix smoke tests Signed-off-by: NickLucche --- extras/print_gpu_memory_stats.py | 30 ++++++++++++++++++++++++++++++ extras/smoke-test.sh | 26 +++++++++++++++++--------- 2 files changed, 47 insertions(+), 9 deletions(-) create mode 100644 extras/print_gpu_memory_stats.py diff --git a/extras/print_gpu_memory_stats.py b/extras/print_gpu_memory_stats.py new file mode 100644 index 0000000000000..8af0e74549c20 --- /dev/null +++ b/extras/print_gpu_memory_stats.py @@ -0,0 +1,30 @@ +import torch + + +def print_gpu_memory_stats(): + if not torch.cuda.is_available(): + print("No GPU available") + return + + for i in range(torch.cuda.device_count()): + device_name = torch.cuda.get_device_name(i) + # Convert to GB + total_memory = torch.cuda.get_device_properties(i).total_memory / (1024 + **3) + allocated = torch.cuda.memory_allocated(i) / (1024**3) + reserved = torch.cuda.memory_reserved(i) / (1024**3) + max_allocated = torch.cuda.max_memory_allocated(i) / (1024**3) + max_reserved = torch.cuda.max_memory_reserved(i) / (1024**3) + free_memory = reserved - allocated + + print(f"Device {i}: {device_name}") + print(f" Total Memory: {total_memory:.2f} GB") + print(f" Allocated Memory: {allocated:.2f} GB") + print(f" Reserved Memory: {reserved:.2f} GB") + print(f" Free Memory: {free_memory:.2f} GB") + print(f" Max Allocated: {max_allocated:.2f} GB") + print(f" Max Reserved: {max_reserved:.2f} GB") + print("-" * 40) + + +print_gpu_memory_stats() diff --git a/extras/smoke-test.sh b/extras/smoke-test.sh index 4d982e196e4ec..26de7f53f4ef9 100644 --- a/extras/smoke-test.sh +++ b/extras/smoke-test.sh @@ -16,10 +16,10 @@ function wait_for(){ # shellcheck disable=SC2124 command=$@ - max_retries=10 + max_retries=15 until $command ; do echo "Waiting for $name to be up (retries_left=$max_retries)..." 
- sleep 30 + sleep 20 max_retries=$((max_retries-1)) if [[ max_retries -le 0 ]]; then echo "Timed out waiting for $name server" >&2 @@ -29,11 +29,17 @@ function wait_for(){ done } +function gpu_memory_stats(){ + # In case `nvidia-smi` is missing + script_dir=$(dirname "$(realpath "$0")") + python "$script_dir"/print_gpu_memory_stats.py +} + # stop the server on any errors -trap 'kill -9 $server_pid && exit 1' ERR +trap 'kill $server_pid && exit 1' ERR # spin up the OpenAPI server in the background -python -m vllm.entrypoints.openai.api_server --port $HTTP_PORT & +python -m vllm.entrypoints.openai.api_server --port $HTTP_PORT --model facebook/opt-125m --enforce-eager & server_pid=$! server_url="http://localhost:$HTTP_PORT" @@ -50,14 +56,14 @@ curl -v --no-progress-meter --fail-with-body \ }' \ "${server_url}/v1/completions" | python -m json.tool -echo "OpenAI API success" && kill -9 $server_pid - +# Wait for graceful termination to clean up gpu memory +echo "OpenAI API success" && kill $server_pid && wait $server_pid +gpu_memory_stats # spin up the grpc server in the background -python -m vllm_tgis_adapter --grpc-port $GRPC_PORT & +python -m vllm_tgis_adapter --grpc-port $GRPC_PORT --model facebook/opt-125m --enforce-eager & server_pid=$! server_url="localhost:$GRPC_PORT" - # get grpcurl curl --no-progress-meter --location --output /tmp/grpcurl.tar.gz \ https://github.com/fullstorydev/grpcurl/releases/download/v1.9.1/grpcurl_1.9.1_linux_x86_64.tar.gz @@ -72,4 +78,6 @@ wait_for "grpc_server" grpc_healthcheck # healthcheck is part of vllm_tgis_adapt "$server_url" \ fmaas.GenerationService/Generate -echo "GRPC API success" && kill -9 $server_pid +# Wait for graceful termination to clean up gpu memory +echo "GRPC API success" && kill $server_pid && wait $server_pid +gpu_memory_stats