PR #21163: [GPU] Redefine the flag xla_gpu_cudnn_gemm_fusion_level.
Imported from GitHub PR #21163

The levels defined so far were used for testing and benchmarking. The new definitions support architecture-targeted deployment of the feature.

This change also lets the relevant tests run manually on Ampere+ GPUs; previously they were skipped before Hopper.
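As an illustration of the new semantics (not part of this change; the helper name is hypothetical), a tool or test can opt in through the generated DebugOptions proto, much as the updated tests below do:

#include "xla/xla.pb.h"

// Request cuDNN GEMM fusion probing on all supported GPUs (Ampere+).
// Under the new definitions, level 1 would restrict probing to Blackwell+
// GPUs, and level 0 keeps the cuDNN backend disabled entirely.
xla::DebugOptions MakeCudnnGemmFusionDebugOptions() {
  xla::DebugOptions debug_options;
  debug_options.set_xla_gpu_cudnn_gemm_fusion_level(2);
  return debug_options;
}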
Copybara import of the project:

--
6bcca3c by Ilia Sergachev <[email protected]>:

[GPU] Redefine the flag xla_gpu_cudnn_gemm_fusion_level.

The levels defined so far were used for testing and benchmarking. The new
definitions support architecture-targeted deployment of the feature.

This change also lets the relevant tests run manually on Ampere+ GPUs;
previously they were skipped before Hopper.

--
91ea952 by Ilia Sergachev <[email protected]>:

add missing build dependency

Merging this change closes #21163

COPYBARA_INTEGRATE_REVIEW=#21163 from openxla:cudnn_gemm_redefine_levels 91ea952
PiperOrigin-RevId: 714287649
sergachev authored and Google-ML-Automation committed Jan 11, 2025
1 parent 39b0baa commit fcb4fc0
Showing 5 changed files with 35 additions and 58 deletions.
xla/service/gpu/autotuning/gemm_fusion_autotuner_cuda.cc (6 additions, 3 deletions)
@@ -52,10 +52,13 @@ bool GemmFusionAutotunerImpl::AddLibConfigs(
std::vector<BackendConfig>& configs) {
// Add cuDNN plans, if available.
auto cc = std::get<se::CudaComputeCapability>(GetComputeCapability());
- bool is_hopper = !config_.IsDeviceless() && cc.IsAtLeastHopper();
bool is_cudnn_enabled =
- debug_options_.xla_gpu_cudnn_gemm_fusion_level() > 0 && is_hopper &&
- GetDnnVersionInfoOrDefault(config_.GetExecutor()).major_version() >= 9;
+ !config_.IsDeviceless() &&
+ GetDnnVersionInfoOrDefault(config_.GetExecutor()).major_version() >= 9 &&
+ ((cc.IsAtLeastAmpere() &&
+ debug_options_.xla_gpu_cudnn_gemm_fusion_level() > 1) ||
+ (cc.IsAtLeastBlackwell() &&
+ debug_options_.xla_gpu_cudnn_gemm_fusion_level() > 0));
if ((IsFusionKind(fusion, kCuDnnFusionKind) && IsAutotuningEnabled()) ||
(IsFusionKind(fusion, kTritonGemmFusionKind) && is_cudnn_enabled &&
algorithm_util::IsSupportedByCudnn(
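Read as a standalone predicate, the new gating above is equivalent to the following sketch (illustrative only; the helper and its signature are not part of the change, and the real code additionally skips deviceless autotuning configs):

#include "xla/stream_executor/device_description.h"

namespace se = ::stream_executor;

// Whether cuDNN GEMM fusion configs should be probed for a given GPU,
// flag level, and cuDNN major version, per the redefined flag.
bool ShouldProbeCudnnGemmFusions(const se::CudaComputeCapability& cc,
                                 int fusion_level, int dnn_major_version) {
  if (dnn_major_version < 9) return false;               // cuDNN 9+ is required.
  if (cc.IsAtLeastBlackwell()) return fusion_level > 0;  // Level 1 and above.
  if (cc.IsAtLeastAmpere()) return fusion_level > 1;     // Level 2 and above.
  return false;                                          // Pre-Ampere: never.
}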
xla/service/gpu/fusions/BUILD (1 addition, 0 deletions)
@@ -475,6 +475,7 @@ xla_test(
"//xla/service/gpu/runtime:thunk",
"//xla/service/gpu/tests:gpu_codegen_test",
"//xla/service/gpu/transforms:cudnn_fusion_compiler",
"//xla/stream_executor:device_description",
"//xla/stream_executor:dnn",
"//xla/stream_executor:stream_executor_h",
"//xla/stream_executor:stream_executor_memory_allocator",
xla/service/gpu/fusions/cudnn_test.cc (23 additions, 37 deletions)
@@ -40,6 +40,7 @@ limitations under the License.
#include "xla/service/gpu/transforms/cudnn_fusion_compiler.h"
#include "xla/service/pattern_matcher.h"
#include "xla/service/pattern_matcher_gmock.h"
#include "xla/stream_executor/device_description.h"
#include "xla/stream_executor/dnn.h"
#include "xla/stream_executor/stream_executor.h"
#include "xla/stream_executor/stream_executor_memory_allocator.h"
@@ -63,14 +64,14 @@ class CuDnnFusionTest : public GpuCodegenTest {
// Let this group of tests just use first available plan skipping
// autotuning.
debug_options.set_xla_gpu_autotune_level(0);
- debug_options.set_xla_gpu_cudnn_gemm_fusion_level(1);
+ debug_options.set_xla_gpu_cudnn_gemm_fusion_level(2);
return debug_options;
}
- bool IsAtLeastHopperWithCuDnn9() {
+ bool IsAtLeastAmpereWithCuDnn9() {
se::StreamExecutor* executor = backend().default_stream_executor();
return executor->GetDeviceDescription()
.cuda_compute_capability()
- .IsAtLeastHopper() &&
+ .IsAtLeastAmpere() &&
GetDnnVersionInfoOrDefault(executor).major_version() >= 9;
}
bool IsAtLeastCuDnn91() {
@@ -82,9 +83,9 @@ class CuDnnFusionTest : public GpuCodegenTest {

protected:
void SetUp() override {
- if (!IsAtLeastHopperWithCuDnn9()) {
+ if (!IsAtLeastAmpereWithCuDnn9()) {
GTEST_SKIP()
<< "cuDNN GEMM fusion is not enabled before Hopper / cuDNN 9.";
<< "cuDNN GEMM fusion is not tested before Ampere / cuDNN 9.";
}
}
};
@@ -609,17 +610,7 @@ ENTRY e {
EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- class CuDnnFusionLevel2Test : public CuDnnFusionExecutionTest {
- public:
- DebugOptions GetDebugOptionsForTest() const override {
- DebugOptions debug_options =
- CuDnnFusionExecutionTest::GetDebugOptionsForTest();
- debug_options.set_xla_gpu_cudnn_gemm_fusion_level(2);
- return debug_options;
- }
- };
-
- TEST_F(CuDnnFusionLevel2Test, BroadcastToDim2ExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, BroadcastToDim2ExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
p0 = f16[16,32,128] parameter(0)
@@ -642,7 +633,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel2Test, BroadcastToDim1ExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, BroadcastToDim1ExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
p0 = f16[16,32,128] parameter(0)
@@ -665,7 +656,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel2Test, BroadcastToDim0ExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, BroadcastToDim0ExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
p0 = bf16[32,128] parameter(0)
@@ -685,7 +676,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel2Test, BroadcastTo2DimsExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, BroadcastTo2DimsExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
p0 = f16[16,32,128] parameter(0)
@@ -708,7 +699,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel2Test, BroadcastTo3DimsExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, BroadcastTo3DimsExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
p0 = f16[16,32,128] parameter(0)
@@ -731,7 +722,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel2Test, ConstantExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, ConstantExecutesCorrectly) {
if (!IsAtLeastCuDnn91()) {
GTEST_SKIP() << "Fused scalar constants require cuDNN 9.1+.";
}
@@ -760,7 +751,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel2Test, ClampExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, ClampExecutesCorrectly) {
if (!IsAtLeastCuDnn91()) {
GTEST_SKIP() << "Clamp test requires cuDNN 9.1+.";
}
@@ -789,7 +780,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel2Test, DotF8ExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, DotF8ExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
@@ -814,7 +805,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel2Test, SlicingExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, SlicingExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
p0 = f16[11,23,64] parameter(0)
@@ -834,17 +825,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- class CuDnnFusionLevel3Test : public CuDnnFusionExecutionTest {
- public:
- DebugOptions GetDebugOptionsForTest() const override {
- DebugOptions debug_options =
- CuDnnFusionExecutionTest::GetDebugOptionsForTest();
- debug_options.set_xla_gpu_cudnn_gemm_fusion_level(3);
- return debug_options;
- }
- };
-
- TEST_F(CuDnnFusionLevel3Test,
+ TEST_F(CuDnnFusionExecutionTest,
DotWithSplitNonContractingInputExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
@@ -867,7 +848,7 @@ ENTRY r {
ErrorSpec{/*aabs=*/1, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel3Test,
+ TEST_F(CuDnnFusionExecutionTest,
DotWithSplitNonContractingInOutExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
@@ -1098,7 +1079,6 @@ class CuDnnFusionRewriteTest : public CuDnnFusionTest {
// Reset autotuning level to default.
debug_options.set_xla_gpu_autotune_level(
GetDebugOptionsFromFlags().xla_gpu_autotune_level());
- debug_options.set_xla_gpu_cudnn_gemm_fusion_level(1);
debug_options.set_xla_gpu_cublas_fallback(false);
return debug_options;
}
@@ -1131,6 +1111,12 @@ TEST_F(CuDnnFusionRewriteTest, AutotuningPicksCuDnnForS8BF16OnHopper) {
// The test case relies on measurements by the autotuner and current
// performance comparison of the backends. May need to be updated if
// the situation changes.
+ if (backend()
+ .default_stream_executor()
+ ->GetDeviceDescription()
+ .cuda_compute_capability() != se::CudaComputeCapability::Hopper()) {
+ GTEST_SKIP() << "The test is for Hopper.";
+ }
MatchOptimizedHlo(R"(
e {
p0 = bf16[720,720,720] parameter(0)
xla/service/gpu/transforms/cudnn_fusion_compiler.cc (3 additions, 15 deletions)
@@ -176,13 +176,6 @@ inline std::optional<fe::DataType_t> GetComputeDataType(
return compute_dtype;
}

- int FusionLevel(const HloInstruction& hlo) {
- return hlo.GetModule()
- ->config()
- .debug_options()
- .xla_gpu_cudnn_gemm_fusion_level();
- };
-
// Extracts dimensions and strides from HLO tensors in the format expected by
// cuDNN.
class GemmDimensionAdapter {
@@ -277,9 +270,6 @@ class GemmDimensionAdapter {
if (spec->size() == 1) {
// The dimension is not split, nothing to do.
} else if (spec->size() == 2) {
- if (FusionLevel(hlo) < 3) {
- return std::nullopt;
- }
if (!dims.lhs_batch_dimensions().empty()) {
VLOG(8) << "Noncontracting dimension split is not compatible with "
"batch dimensions.";
@@ -498,8 +488,7 @@ absl::StatusOr<std::optional<se::gpu::CudnnGraph>> HloFusionToCuDnnGraph(
return std::nullopt;
}
continue;
- } else if (FusionLevel(fusion) >= 2 &&
- HloPredicateIsOp<HloOpcode::kConstant>(hlo)) {
+ } else if (HloPredicateIsOp<HloOpcode::kConstant>(hlo)) {
if (const auto const_tensor = HandleConstantHloToCudnnGraph(*hlo, graph);
const_tensor.has_value()) {
hlo_to_cudnn[hlo] = const_tensor.value();
@@ -508,9 +497,8 @@ absl::StatusOr<std::optional<se::gpu::CudnnGraph>> HloFusionToCuDnnGraph(
}
} else if (HloPredicateIsOp<HloOpcode::kReshape, HloOpcode::kBitcast,
HloOpcode::kTranspose, HloOpcode::kCopy>(hlo) ||
- (FusionLevel(fusion) >= 2 &&
- (HloPredicateIsOp<HloOpcode::kBroadcast, HloOpcode::kSlice>(
- hlo)))) {
+ ((HloPredicateIsOp<HloOpcode::kBroadcast, HloOpcode::kSlice>(
+ hlo)))) {
// All these are accounted for separately as transformations of strides.
hlo_to_cudnn[hlo] = operand(0);
} else if (hlo->IsElementwise()) {
xla/xla.proto (2 additions, 3 deletions)
@@ -904,9 +904,8 @@ message DebugOptions {
// Let GEMM fusion autotuning probe cuDNN as a backend.
// Current levels:
// 0: Disabled.
- // 1: Fusions of GEMM, elementwise, transpose/reshape operations.
- // 2: + Broadcasts, slicing.
- // 3: + Nontrivial noncontracting dimension reshapes/transposes.
+ // 1: Enabled on Blackwell+ GPUs.
+ // 2: Enabled on all supported GPUs (Ampere+).
int32 xla_gpu_cudnn_gemm_fusion_level = 285;

// This instructs the runtime whether to use
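Concretely, under the redefined levels a Hopper GPU (at least Ampere, but not Blackwell) only gets cuDNN GEMM fusion candidates at level 2 or higher, a Blackwell GPU already at level 1, and pre-Ampere GPUs never; level 0 keeps the cuDNN backend disabled everywhere.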
