PR #21163: [GPU] Redefine the flag xla_gpu_cudnn_gemm_fusion_level.
Imported from GitHub PR #21163

The levels defined so far were used for testing and benchmarking. The new definitions support architecture-targeted deployment of the feature.

This change also lets the relevant tests run manually on Ampere+ GPUs; previously they were skipped before Hopper.
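As an illustration of the new semantics (not part of this change; the helper name is hypothetical), a tool or test can opt in through the generated DebugOptions proto, much as the updated tests below do:

#include "xla/xla.pb.h"

// Request cuDNN GEMM fusion probing on all supported GPUs (Ampere+).
// Under the new definitions, level 1 would restrict probing to Blackwell+
// GPUs, and level 0 keeps the cuDNN backend disabled entirely.
xla::DebugOptions MakeCudnnGemmFusionDebugOptions() {
  xla::DebugOptions debug_options;
  debug_options.set_xla_gpu_cudnn_gemm_fusion_level(2);
  return debug_options;
}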
Copybara import of the project:

--
6bcca3c by Ilia Sergachev <[email protected]>:

[GPU] Redefine the flag xla_gpu_cudnn_gemm_fusion_level.

The levels defined so far were used for testing and benchmarking. The new
definitions support architecture-targeted deployment of the feature.

This change also lets the relevant tests run manually on Ampere+ GPUs;
previously they were skipped before Hopper.

--
91ea952 by Ilia Sergachev <[email protected]>:

add missing build dependency

Merging this change closes #21163

COPYBARA_INTEGRATE_REVIEW=#21163 from openxla:cudnn_gemm_redefine_levels 91ea952
PiperOrigin-RevId: 714287649
sergachev authored and Google-ML-Automation committed Jan 11, 2025
1 parent 39b0baa commit fcb4fc0
Showing 5 changed files with 35 additions and 58 deletions.
xla/service/gpu/autotuning/gemm_fusion_autotuner_cuda.cc (6 additions, 3 deletions)
@@ -52,10 +52,13 @@ bool GemmFusionAutotunerImpl::AddLibConfigs(
std::vector<BackendConfig>& configs) {
// Add cuDNN plans, if available.
auto cc = std::get<se::CudaComputeCapability>(GetComputeCapability());
- bool is_hopper = !config_.IsDeviceless() && cc.IsAtLeastHopper();
bool is_cudnn_enabled =
- debug_options_.xla_gpu_cudnn_gemm_fusion_level() > 0 && is_hopper &&
- GetDnnVersionInfoOrDefault(config_.GetExecutor()).major_version() >= 9;
+ !config_.IsDeviceless() &&
+ GetDnnVersionInfoOrDefault(config_.GetExecutor()).major_version() >= 9 &&
+ ((cc.IsAtLeastAmpere() &&
+ debug_options_.xla_gpu_cudnn_gemm_fusion_level() > 1) ||
+ (cc.IsAtLeastBlackwell() &&
+ debug_options_.xla_gpu_cudnn_gemm_fusion_level() > 0));
if ((IsFusionKind(fusion, kCuDnnFusionKind) && IsAutotuningEnabled()) ||
(IsFusionKind(fusion, kTritonGemmFusionKind) && is_cudnn_enabled &&
algorithm_util::IsSupportedByCudnn(
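Read as a standalone predicate, the new gating above is equivalent to the following sketch (illustrative only; the helper and its signature are not part of the change, and the real code additionally skips deviceless autotuning configs):

#include "xla/stream_executor/device_description.h"

namespace se = ::stream_executor;

// Whether cuDNN GEMM fusion configs should be probed for a given GPU,
// flag level, and cuDNN major version, per the redefined flag.
bool ShouldProbeCudnnGemmFusions(const se::CudaComputeCapability& cc,
                                 int fusion_level, int dnn_major_version) {
  if (dnn_major_version < 9) return false;               // cuDNN 9+ is required.
  if (cc.IsAtLeastBlackwell()) return fusion_level > 0;  // Level 1 and above.
  if (cc.IsAtLeastAmpere()) return fusion_level > 1;     // Level 2 and above.
  return false;                                          // Pre-Ampere: never.
}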
xla/service/gpu/fusions/BUILD (1 addition, 0 deletions)
@@ -475,6 +475,7 @@ xla_test(
"//xla/service/gpu/runtime:thunk",
"//xla/service/gpu/tests:gpu_codegen_test",
"//xla/service/gpu/transforms:cudnn_fusion_compiler",
"//xla/stream_executor:device_description",
"//xla/stream_executor:dnn",
"//xla/stream_executor:stream_executor_h",
"//xla/stream_executor:stream_executor_memory_allocator",
xla/service/gpu/fusions/cudnn_test.cc (23 additions, 37 deletions)
@@ -40,6 +40,7 @@ limitations under the License.
#include "xla/service/gpu/transforms/cudnn_fusion_compiler.h"
#include "xla/service/pattern_matcher.h"
#include "xla/service/pattern_matcher_gmock.h"
#include "xla/stream_executor/device_description.h"
#include "xla/stream_executor/dnn.h"
#include "xla/stream_executor/stream_executor.h"
#include "xla/stream_executor/stream_executor_memory_allocator.h"
@@ -63,14 +64,14 @@ class CuDnnFusionTest : public GpuCodegenTest {
// Let this group of tests just use first available plan skipping
// autotuning.
debug_options.set_xla_gpu_autotune_level(0);
- debug_options.set_xla_gpu_cudnn_gemm_fusion_level(1);
+ debug_options.set_xla_gpu_cudnn_gemm_fusion_level(2);
return debug_options;
}
- bool IsAtLeastHopperWithCuDnn9() {
+ bool IsAtLeastAmpereWithCuDnn9() {
se::StreamExecutor* executor = backend().default_stream_executor();
return executor->GetDeviceDescription()
.cuda_compute_capability()
- .IsAtLeastHopper() &&
+ .IsAtLeastAmpere() &&
GetDnnVersionInfoOrDefault(executor).major_version() >= 9;
}
bool IsAtLeastCuDnn91() {
@@ -82,9 +83,9 @@ class CuDnnFusionTest : public GpuCodegenTest {

protected:
void SetUp() override {
- if (!IsAtLeastHopperWithCuDnn9()) {
+ if (!IsAtLeastAmpereWithCuDnn9()) {
GTEST_SKIP()
<< "cuDNN GEMM fusion is not enabled before Hopper / cuDNN 9.";
<< "cuDNN GEMM fusion is not tested before Ampere / cuDNN 9.";
}
}
};
@@ -609,17 +610,7 @@ ENTRY e {
EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- class CuDnnFusionLevel2Test : public CuDnnFusionExecutionTest {
- public:
- DebugOptions GetDebugOptionsForTest() const override {
- DebugOptions debug_options =
- CuDnnFusionExecutionTest::GetDebugOptionsForTest();
- debug_options.set_xla_gpu_cudnn_gemm_fusion_level(2);
- return debug_options;
- }
- };
-
- TEST_F(CuDnnFusionLevel2Test, BroadcastToDim2ExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, BroadcastToDim2ExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
p0 = f16[16,32,128] parameter(0)
@@ -642,7 +633,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel2Test, BroadcastToDim1ExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, BroadcastToDim1ExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
p0 = f16[16,32,128] parameter(0)
@@ -665,7 +656,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel2Test, BroadcastToDim0ExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, BroadcastToDim0ExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
p0 = bf16[32,128] parameter(0)
@@ -685,7 +676,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel2Test, BroadcastTo2DimsExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, BroadcastTo2DimsExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
p0 = f16[16,32,128] parameter(0)
@@ -708,7 +699,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel2Test, BroadcastTo3DimsExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, BroadcastTo3DimsExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
p0 = f16[16,32,128] parameter(0)
@@ -731,7 +722,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel2Test, ConstantExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, ConstantExecutesCorrectly) {
if (!IsAtLeastCuDnn91()) {
GTEST_SKIP() << "Fused scalar constants require cuDNN 9.1+.";
}
@@ -760,7 +751,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel2Test, ClampExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, ClampExecutesCorrectly) {
if (!IsAtLeastCuDnn91()) {
GTEST_SKIP() << "Clamp test requires cuDNN 9.1+.";
}
@@ -789,7 +780,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel2Test, DotF8ExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, DotF8ExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
@@ -814,7 +805,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel2Test, SlicingExecutesCorrectly) {
+ TEST_F(CuDnnFusionExecutionTest, SlicingExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
p0 = f16[11,23,64] parameter(0)
@@ -834,17 +825,7 @@ ENTRY e {
ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
}

- class CuDnnFusionLevel3Test : public CuDnnFusionExecutionTest {
- public:
- DebugOptions GetDebugOptionsForTest() const override {
- DebugOptions debug_options =
- CuDnnFusionExecutionTest::GetDebugOptionsForTest();
- debug_options.set_xla_gpu_cudnn_gemm_fusion_level(3);
- return debug_options;
- }
- };
-
- TEST_F(CuDnnFusionLevel3Test,
+ TEST_F(CuDnnFusionExecutionTest,
DotWithSplitNonContractingInputExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
@@ -867,7 +848,7 @@ ENTRY r {
ErrorSpec{/*aabs=*/1, /*arel=*/1e-3}));
}

- TEST_F(CuDnnFusionLevel3Test,
+ TEST_F(CuDnnFusionExecutionTest,
DotWithSplitNonContractingInOutExecutesCorrectly) {
EXPECT_TRUE(RunAndCompare(R"(
fusion1 {
@@ -1098,7 +1079,6 @@ class CuDnnFusionRewriteTest : public CuDnnFusionTest {
// Reset autotuning level to default.
debug_options.set_xla_gpu_autotune_level(
GetDebugOptionsFromFlags().xla_gpu_autotune_level());
- debug_options.set_xla_gpu_cudnn_gemm_fusion_level(1);
debug_options.set_xla_gpu_cublas_fallback(false);
return debug_options;
}
@@ -1131,6 +1111,12 @@ TEST_F(CuDnnFusionRewriteTest, AutotuningPicksCuDnnForS8BF16OnHopper) {
// The test case relies on measurements by the autotuner and current
// performance comparison of the backends. May need to be updated if
// the situation changes.
+ if (backend()
+ .default_stream_executor()
+ ->GetDeviceDescription()
+ .cuda_compute_capability() != se::CudaComputeCapability::Hopper()) {
+ GTEST_SKIP() << "The test is for Hopper.";
+ }
MatchOptimizedHlo(R"(
e {
p0 = bf16[720,720,720] parameter(0)
xla/service/gpu/transforms/cudnn_fusion_compiler.cc (3 additions, 15 deletions)
@@ -176,13 +176,6 @@ inline std::optional<fe::DataType_t> GetComputeDataType(
return compute_dtype;
}

- int FusionLevel(const HloInstruction& hlo) {
- return hlo.GetModule()
- ->config()
- .debug_options()
- .xla_gpu_cudnn_gemm_fusion_level();
- };
-
// Extracts dimensions and strides from HLO tensors in the format expected by
// cuDNN.
class GemmDimensionAdapter {
@@ -277,9 +270,6 @@ class GemmDimensionAdapter {
if (spec->size() == 1) {
// The dimension is not split, nothing to do.
} else if (spec->size() == 2) {
- if (FusionLevel(hlo) < 3) {
- return std::nullopt;
- }
if (!dims.lhs_batch_dimensions().empty()) {
VLOG(8) << "Noncontracting dimension split is not compatible with "
"batch dimensions.";
@@ -498,8 +488,7 @@ absl::StatusOr<std::optional<se::gpu::CudnnGraph>> HloFusionToCuDnnGraph(
return std::nullopt;
}
continue;
- } else if (FusionLevel(fusion) >= 2 &&
- HloPredicateIsOp<HloOpcode::kConstant>(hlo)) {
+ } else if (HloPredicateIsOp<HloOpcode::kConstant>(hlo)) {
if (const auto const_tensor = HandleConstantHloToCudnnGraph(*hlo, graph);
const_tensor.has_value()) {
hlo_to_cudnn[hlo] = const_tensor.value();
@@ -508,9 +497,8 @@ absl::StatusOr<std::optional<se::gpu::CudnnGraph>> HloFusionToCuDnnGraph(
}
} else if (HloPredicateIsOp<HloOpcode::kReshape, HloOpcode::kBitcast,
HloOpcode::kTranspose, HloOpcode::kCopy>(hlo) ||
- (FusionLevel(fusion) >= 2 &&
- (HloPredicateIsOp<HloOpcode::kBroadcast, HloOpcode::kSlice>(
- hlo)))) {
+ ((HloPredicateIsOp<HloOpcode::kBroadcast, HloOpcode::kSlice>(
+ hlo)))) {
// All these are accounted for separately as transformations of strides.
hlo_to_cudnn[hlo] = operand(0);
} else if (hlo->IsElementwise()) {
xla/xla.proto (2 additions, 3 deletions)
@@ -904,9 +904,8 @@ message DebugOptions {
// Let GEMM fusion autotuning probe cuDNN as a backend.
// Current levels:
// 0: Disabled.
- // 1: Fusions of GEMM, elementwise, transpose/reshape operations.
- // 2: + Broadcasts, slicing.
- // 3: + Nontrivial noncontracting dimension reshapes/transposes.
+ // 1: Enabled on Blackwell+ GPUs.
+ // 2: Enabled on all supported GPUs (Ampere+).
int32 xla_gpu_cudnn_gemm_fusion_level = 285;

// This instructs the runtime whether to use
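Concretely, under the redefined levels a Hopper GPU (at least Ampere, but not Blackwell) only gets cuDNN GEMM fusion candidates at level 2 or higher, a Blackwell GPU already at level 1, and pre-Ampere GPUs never; level 0 keeps the cuDNN backend disabled everywhere.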
