From 18eaf8447a4547310c7dc357c3797cec0f37c49f Mon Sep 17 00:00:00 2001
From: Xu Zhao <xzhao9@meta.com>
Date: Fri, 24 Jan 2025 16:41:06 -0800
Subject: [PATCH] More aggressively tune the GPU and benchmark (#143)

Summary:
Through OSS metrics tracking, we found that the day-to-day run variance is larger than we expected (~4%), whereas we are targeting a maximum variance of 2%.

This PR tunes the GPU more aggressively and runs the gemm and softmax benchmarks with cudagraph to see whether the metrics become more stable.

Pull Request resolved: https://github.com/pytorch-labs/tritonbench/pull/143

Test Plan: CI

Reviewed By: adamomainz

Differential Revision: D68644171

Pulled By: xuzhao9

fbshipit-source-id: ea3b34836da536719176d36d1b301f048d8038cd
---
 .ci/gpu/reset-gcp-h100.sh                   | 13 +++++++++++++
 .ci/gpu/tune-gcp-h100.sh                    | 16 ++++++++++++++++
 .github/workflows/_linux-benchmark-h100.yml | 10 ++++++++--
 benchmarks/nightly/run.py                   |  2 ++
 4 files changed, 39 insertions(+), 2 deletions(-)
 create mode 100644 .ci/gpu/reset-gcp-h100.sh
 create mode 100644 .ci/gpu/tune-gcp-h100.sh

diff --git a/.ci/gpu/reset-gcp-h100.sh b/.ci/gpu/reset-gcp-h100.sh
new file mode 100644
index 00000000..9e8e0639
--- /dev/null
+++ b/.ci/gpu/reset-gcp-h100.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# Script to reset the NVIDIA H100 GPU on GCP
+# Resets GPU/memory clocks, power limit, and persistence mode
+
+# Reset GPU and Memory clocks
+sudo nvidia-smi -rgc
+sudo nvidia-smi -rmc
+
+# Restore the default power limit (500W)
+sudo nvidia-smi -pl 500
+
+# Disable persistence mode
+sudo nvidia-smi -pm 0
diff --git a/.ci/gpu/tune-gcp-h100.sh b/.ci/gpu/tune-gcp-h100.sh
new file mode 100644
index 00000000..806348af
--- /dev/null
+++ b/.ci/gpu/tune-gcp-h100.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+# Script to tune the NVIDIA H100 GPU on GCP
+# to stabilize performance
+
+set -ex
+
+# Enable persistence mode
+sudo nvidia-smi -pm 1
+# Lock power limit to 650W
+sudo nvidia-smi -pl 650
+
+# Default Memory Frequency: 2619 MHz
+# Default Graphics Frequency: 1980 MHz
+sudo nvidia-smi -lgc 1980,1980
+sudo nvidia-smi -lmc 2619,2619
+sudo nvidia-smi -ac 2619,1980
diff --git a/.github/workflows/_linux-benchmark-h100.yml b/.github/workflows/_linux-benchmark-h100.yml
index ccfc4d6e..cc7dd3cb 100644
--- a/.github/workflows/_linux-benchmark-h100.yml
+++ b/.github/workflows/_linux-benchmark-h100.yml
@@ -36,7 +36,7 @@ jobs:
           submodules: recursive
       - name: Tune Nvidia GPU
         run: |
-          sudo nvidia-smi -pm 1
+          bash .ci/gpu/tune-gcp-h100.sh
           sudo ldconfig
           nvidia-smi
       - name: Benchmarking
@@ -52,4 +52,10 @@ jobs:
         run: |
           . "${SETUP_SCRIPT}"
           latest_result_json=$(find ./benchmark-output/ -name "result.json"  | sort -r | head -n 1)
-          python .ci/upload/scribe.py --json ${latest_result_json}
+          python ./.ci/upload/scribe.py --json ${latest_result_json}
+      - name: Restore Nvidia GPU
+        if: always()
+        run: |
+          bash .ci/gpu/reset-gcp-h100.sh
+          sudo ldconfig
+          nvidia-smi
diff --git a/benchmarks/nightly/run.py b/benchmarks/nightly/run.py
index 2cae1df8..55252965 100644
--- a/benchmarks/nightly/run.py
+++ b/benchmarks/nightly/run.py
@@ -46,6 +46,7 @@ def setup_tritonbench_cwd():
         "latency,gbps",
         "--num-inputs",
         "6",
+        "--cudagraph",
     ],
     "bf16_gemm": [
         "--op",
@@ -58,6 +59,7 @@ def setup_tritonbench_cwd():
         "latency,tflops",
         "--num-inputs",
         "4",
+        "--cudagraph",
     ],
 }