From 674867354a39501bf65cafebb350ef1b4f9bc469 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Wed, 22 Jan 2025 14:54:15 +0000 Subject: [PATCH] fix smoke tests Signed-off-by: NickLucche --- extras/print_gpu_memory_stats.py | 30 ++++++++++++++++++++++++++++++ extras/smoke-test.sh | 26 +++++++++++++++++--------- 2 files changed, 47 insertions(+), 9 deletions(-) create mode 100644 extras/print_gpu_memory_stats.py diff --git a/extras/print_gpu_memory_stats.py b/extras/print_gpu_memory_stats.py new file mode 100644 index 0000000000000..8af0e74549c20 --- /dev/null +++ b/extras/print_gpu_memory_stats.py @@ -0,0 +1,30 @@ +import torch + + +def print_gpu_memory_stats(): + if not torch.cuda.is_available(): + print("No GPU available") + return + + for i in range(torch.cuda.device_count()): + device_name = torch.cuda.get_device_name(i) + # Convert to GB + total_memory = torch.cuda.get_device_properties(i).total_memory / (1024 + **3) + allocated = torch.cuda.memory_allocated(i) / (1024**3) + reserved = torch.cuda.memory_reserved(i) / (1024**3) + max_allocated = torch.cuda.max_memory_allocated(i) / (1024**3) + max_reserved = torch.cuda.max_memory_reserved(i) / (1024**3) + free_memory = reserved - allocated + + print(f"Device {i}: {device_name}") + print(f" Total Memory: {total_memory:.2f} GB") + print(f" Allocated Memory: {allocated:.2f} GB") + print(f" Reserved Memory: {reserved:.2f} GB") + print(f" Free Memory: {free_memory:.2f} GB") + print(f" Max Allocated: {max_allocated:.2f} GB") + print(f" Max Reserved: {max_reserved:.2f} GB") + print("-" * 40) + + +print_gpu_memory_stats() diff --git a/extras/smoke-test.sh b/extras/smoke-test.sh index 4d982e196e4ec..26de7f53f4ef9 100644 --- a/extras/smoke-test.sh +++ b/extras/smoke-test.sh @@ -16,10 +16,10 @@ function wait_for(){ # shellcheck disable=SC2124 command=$@ - max_retries=10 + max_retries=15 until $command ; do echo "Waiting for $name to be up (retries_left=$max_retries)..." 
- sleep 30 + sleep 20 max_retries=$((max_retries-1)) if [[ max_retries -le 0 ]]; then echo "Timed out waiting for $name server" >&2 @@ -29,11 +29,17 @@ function wait_for(){ done } +function gpu_memory_stats(){ + # In case `nvidia-smi` is missing + script_dir=$(dirname "$(realpath "$0")") + python "$script_dir"/print_gpu_memory_stats.py +} + # stop the server on any errors -trap 'kill -9 $server_pid && exit 1' ERR +trap 'kill $server_pid && exit 1' ERR # spin up the OpenAPI server in the background -python -m vllm.entrypoints.openai.api_server --port $HTTP_PORT & +python -m vllm.entrypoints.openai.api_server --port $HTTP_PORT --model facebook/opt-125m --enforce-eager & server_pid=$! server_url="http://localhost:$HTTP_PORT" @@ -50,14 +56,14 @@ curl -v --no-progress-meter --fail-with-body \ }' \ "${server_url}/v1/completions" | python -m json.tool -echo "OpenAI API success" && kill -9 $server_pid - +# Wait for graceful termination to clean up gpu memory +echo "OpenAI API success" && kill $server_pid && wait $server_pid +gpu_memory_stats # spin up the grpc server in the background -python -m vllm_tgis_adapter --grpc-port $GRPC_PORT & +python -m vllm_tgis_adapter --grpc-port $GRPC_PORT --model facebook/opt-125m --enforce-eager & server_pid=$! server_url="localhost:$GRPC_PORT" - # get grpcurl curl --no-progress-meter --location --output /tmp/grpcurl.tar.gz \ https://github.com/fullstorydev/grpcurl/releases/download/v1.9.1/grpcurl_1.9.1_linux_x86_64.tar.gz @@ -72,4 +78,6 @@ wait_for "grpc_server" grpc_healthcheck # healthcheck is part of vllm_tgis_adapt "$server_url" \ fmaas.GenerationService/Generate -echo "GRPC API success" && kill -9 $server_pid +# Wait for graceful termination to clean up gpu memory +echo "GRPC API success" && kill $server_pid && wait $server_pid +gpu_memory_stats