From 18eaf8447a4547310c7dc357c3797cec0f37c49f Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Fri, 24 Jan 2025 16:41:06 -0800 Subject: [PATCH] More aggressively tune the GPU and benchmark (#143) Summary: Through the OSS metrics tracking we found the day-to-day run variance is larger than we expected (~4%), whereas we are shooting for 2% max variance. This PR will tune the GPU more aggressively and run gemm and softmax benchmarks with cudagraph to see if the metrics can be stabilized further. Pull Request resolved: https://github.com/pytorch-labs/tritonbench/pull/143 Test Plan: CI Reviewed By: adamomainz Differential Revision: D68644171 Pulled By: xuzhao9 fbshipit-source-id: ea3b34836da536719176d36d1b301f048d8038cd --- .ci/gpu/reset-gcp-h100.sh | 13 +++++++++++++ .ci/gpu/tune-gcp-h100.sh | 16 ++++++++++++++++ .github/workflows/_linux-benchmark-h100.yml | 10 ++++++++-- benchmarks/nightly/run.py | 2 ++ 4 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 .ci/gpu/reset-gcp-h100.sh create mode 100644 .ci/gpu/tune-gcp-h100.sh diff --git a/.ci/gpu/reset-gcp-h100.sh b/.ci/gpu/reset-gcp-h100.sh new file mode 100644 index 00000000..9e8e0639 --- /dev/null +++ b/.ci/gpu/reset-gcp-h100.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# Script to reset NVIDIA H100 GPU settings on GCP +# To restore the GPU to its default status + +# Reset GPU and Memory clocks +sudo nvidia-smi -rgc +sudo nvidia-smi -rmc + +# Restore the default power limit (500W) +sudo nvidia-smi -pl 500 + +# Disable persistence mode +sudo nvidia-smi -pm 0 diff --git a/.ci/gpu/tune-gcp-h100.sh b/.ci/gpu/tune-gcp-h100.sh new file mode 100644 index 00000000..806348af --- /dev/null +++ b/.ci/gpu/tune-gcp-h100.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# Script to tune NVIDIA H100 GPU on GCP +# To stabilize performance + +set -ex + +# Enable persistence mode +sudo nvidia-smi -pm 1 +# Lock power limit to 650W +sudo nvidia-smi -pl 650 + +# Default Memory Frequency: 2619 MHz +# Default Graphics Frequency: 1980 MHz +sudo nvidia-smi -lgc 1980,1980 +sudo 
nvidia-smi -lmc 2619,2619 +sudo nvidia-smi -ac 2619,1980 diff --git a/.github/workflows/_linux-benchmark-h100.yml b/.github/workflows/_linux-benchmark-h100.yml index ccfc4d6e..cc7dd3cb 100644 --- a/.github/workflows/_linux-benchmark-h100.yml +++ b/.github/workflows/_linux-benchmark-h100.yml @@ -36,7 +36,7 @@ jobs: submodules: recursive - name: Tune Nvidia GPU run: | - sudo nvidia-smi -pm 1 + bash .ci/gpu/tune-gcp-h100.sh sudo ldconfig nvidia-smi - name: Benchmarking @@ -52,4 +52,10 @@ jobs: run: | . "${SETUP_SCRIPT}" latest_result_json=$(find ./benchmark-output/ -name "result.json" | sort -r | head -n 1) - python .ci/upload/scribe.py --json ${latest_result_json} + python ./.ci/upload/scribe.py --json ${latest_result_json} + - name: Restore Nvidia GPU + if: always() + run: | + bash .ci/gpu/reset-gcp-h100.sh + sudo ldconfig + nvidia-smi diff --git a/benchmarks/nightly/run.py b/benchmarks/nightly/run.py index 2cae1df8..55252965 100644 --- a/benchmarks/nightly/run.py +++ b/benchmarks/nightly/run.py @@ -46,6 +46,7 @@ def setup_tritonbench_cwd(): "latency,gbps", "--num-inputs", "6", + "--cudagraph", ], "bf16_gemm": [ "--op", @@ -58,6 +59,7 @@ def setup_tritonbench_cwd(): "latency,tflops", "--num-inputs", "4", + "--cudagraph", ], }