fix smoke tests
Signed-off-by: NickLucche <[email protected]>
NickLucche authored and dtrifiro committed Jan 23, 2025
1 parent 5ed86b7 commit 9e5a88d
Showing 2 changed files with 47 additions and 9 deletions.
30 changes: 30 additions & 0 deletions extras/print_gpu_memory_stats.py
@@ -0,0 +1,30 @@
+import torch
+
+
+def print_gpu_memory_stats():
+    if not torch.cuda.is_available():
+        print("No GPU available")
+        return
+
+    for i in range(torch.cuda.device_count()):
+        device_name = torch.cuda.get_device_name(i)
+        # Convert to GB
+        total_memory = torch.cuda.get_device_properties(i).total_memory / (1024
+                                                                           **3)
+        allocated = torch.cuda.memory_allocated(i) / (1024**3)
+        reserved = torch.cuda.memory_reserved(i) / (1024**3)
+        max_allocated = torch.cuda.max_memory_allocated(i) / (1024**3)
+        max_reserved = torch.cuda.max_memory_reserved(i) / (1024**3)
+        free_memory = reserved - allocated
+
+        print(f"Device {i}: {device_name}")
+        print(f" Total Memory: {total_memory:.2f} GB")
+        print(f" Allocated Memory: {allocated:.2f} GB")
+        print(f" Reserved Memory: {reserved:.2f} GB")
+        print(f" Free Memory: {free_memory:.2f} GB")
+        print(f" Max Allocated: {max_allocated:.2f} GB")
+        print(f" Max Reserved: {max_reserved:.2f} GB")
+        print("-" * 40)
+
+
+print_gpu_memory_stats()
26 changes: 17 additions & 9 deletions extras/smoke-test.sh
@@ -16,10 +16,10 @@ function wait_for(){
   # shellcheck disable=SC2124
   command=$@

-  max_retries=10
+  max_retries=15
   until $command ; do
     echo "Waiting for $name to be up (retries_left=$max_retries)..."
-    sleep 30
+    sleep 20
     max_retries=$((max_retries-1))
     if [[ max_retries -le 0 ]]; then
       echo "Timed out waiting for $name server" >&2
@@ -29,11 +29,17 @@ function wait_for(){
   done
 }

+function gpu_memory_stats(){
+  # In case `nvidia-smi` is missing
+  script_dir=$(dirname "$(realpath "$0")")
+  python "$script_dir"/print_gpu_memory_stats.py
+}
+
 # stop the server on any errors
-trap 'kill -9 $server_pid && exit 1' ERR
+trap 'kill $server_pid && exit 1' ERR

 # spin up the OpenAPI server in the background
-python -m vllm.entrypoints.openai.api_server --port $HTTP_PORT &
+python -m vllm.entrypoints.openai.api_server --port $HTTP_PORT --model facebook/opt-125m --enforce-eager &
 server_pid=$!
 server_url="http://localhost:$HTTP_PORT"

@@ -50,14 +56,14 @@ curl -v --no-progress-meter --fail-with-body \
 }' \
   "${server_url}/v1/completions" | python -m json.tool

-echo "OpenAI API success" && kill -9 $server_pid
+# Wait for graceful termination to clean up gpu memory
+echo "OpenAI API success" && kill $server_pid && wait $server_pid
+gpu_memory_stats

 # spin up the grpc server in the background
-python -m vllm_tgis_adapter --grpc-port $GRPC_PORT &
+python -m vllm_tgis_adapter --grpc-port $GRPC_PORT --model facebook/opt-125m --enforce-eager &
 server_pid=$!
 server_url="localhost:$GRPC_PORT"

 # get grpcurl
 curl --no-progress-meter --location --output /tmp/grpcurl.tar.gz \
   https://github.com/fullstorydev/grpcurl/releases/download/v1.9.1/grpcurl_1.9.1_linux_x86_64.tar.gz
@@ -72,4 +78,6 @@ wait_for "grpc_server" grpc_healthcheck # healthcheck is part of vllm_tgis_adapter
"$server_url" \
fmaas.GenerationService/Generate

echo "GRPC API success" && kill -9 $server_pid
# Wait for gracious termination to clean up gpu memory
echo "GRPC API success" && kill $server_pid && wait $server_pid
gpu_memory_stats
