ROCm · i-chaochen · Jun 20, 2024 · May 28, 2024 · May 24, 2024 · May 27, 2024
diff --git a/xla/lit.cfg.py b/xla/lit.cfg.py
@@ -43,8 +43,13 @@
     ("%PYTHON", os.getenv("PYTHON", sys.executable)),
 ])
 
+if lit_config.params.get('PTX') == 'GCN':
+    config.available_features.add("IS_ROCM")
+
+
 # Include additional substitutions that may be defined via params
 config.substitutions.extend(
     ("%%{%s}" % key, val)
     for key, val in lit_config.params.items()
 )
+
diff --git a/xla/service/gpu/tests/add_preds.hlo b/xla/service/gpu/tests/add_preds.hlo
@@ -1,6 +1,6 @@
 // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s
 
-// CHECK: define void @fusion({{.*}}%[[ARG0:.*]], {{.*}}%[[ARG1:.*]],
+// CHECK: define{{( amdgpu_kernel)?}} void @fusion({{.*}}%[[ARG0:.*]], {{.*}}%[[ARG1:.*]],
 // CHECK:   %[[A:.*]] = load {{.*}} ptr %[[ARG0]]
 // CHECK:   %[[B:.*]] = load {{.*}} ptr %[[ARG1]]
 // CHECK:   or {{.*}} %[[A]], %[[B]]

diff --git a/xla/service/gpu/tests/dot_bf16.hlo b/xla/service/gpu/tests/dot_bf16.hlo
@@ -1,5 +1,6 @@
-// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/v100.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM70
-// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/a100_80.txtpb --split-input-file --xla_gpu_autotune_level=0 --xla_gpu_enable_triton_gemm=false | FileCheck %s --check-prefixes=CHECK-SM80
+// RUN: %if !IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/v100.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM70 %}
+// RUN: %if !IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/a100_80.txtpb --split-input-file --xla_gpu_autotune_level=0 --xla_gpu_enable_triton_gemm=false | FileCheck %s --check-prefixes=CHECK-SM80 %}
+// RUN: %if IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/mi200.txtpb --split-input-file --xla_gpu_autotune_level=0 --xla_gpu_enable_triton_gemm=false | FileCheck %s --check-prefixes=CHECK-SM80 %}
 
 
 // CHECK-SM70: custom-call(f32

diff --git a/xla/service/gpu/tests/fused_scatter.hlo b/xla/service/gpu/tests/fused_scatter.hlo
@@ -2,7 +2,7 @@
 
 // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
 
-// CHECK:       define void @wrapped_scatter
+// CHECK:       define{{( amdgpu_kernel)?}} void @wrapped_scatter
 // CHECK:         %[[VAL_70:.*]] = alloca i32, align 4
 // CHECK-PTX:     %[[VAL_71:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x
 // CHECK-GCN:     %[[VAL_71:.*]] = call i32 @llvm.amdgcn.workgroup.id.x

diff --git a/xla/service/gpu/tests/launch_dimensions.hlo b/xla/service/gpu/tests/launch_dimensions.hlo
@@ -2,7 +2,7 @@
 // This tests that we do not increase the grid launch size when
 // few_waves is enabled.
 
-// CHECK-LABEL:   define void @wrapped_b
+// CHECK-LABEL:   define{{( amdgpu_kernel)?}} void @wrapped_b
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
 // CHECK-GCN-DAG: call i32 @llvm.amdgcn.workgroup.id.x(), !range ![[ctaid_range:[0-9]+]]
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
@@ -27,7 +27,7 @@ ENTRY main {
 
 // This tests that we cap grid launch code when few_waves is enabled.
 
-// CHECK-LABEL:   define void @wrapped_b
+// CHECK-LABEL:   define{{( amdgpu_kernel)?}} void @wrapped_b
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
 // CHECK-GCN-DAG: call i32 @llvm.amdgcn.workgroup.id.x(), !range ![[ctaid_range:[0-9]+]]
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
@@ -53,7 +53,7 @@ ENTRY main {
 // This tests that we cap grid launch code when few_waves is enabled
 // and scalar broadcast are present.
 
-// CHECK-LABEL:   define void @fusion_3
+// CHECK-LABEL:   define{{( amdgpu_kernel)?}} void @fusion_3
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
 // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 1008}
@@ -84,7 +84,7 @@ ENTRY main {
 // This tests that we enable few_waves in a simple fusion. It is the baseline
 // for the tests below.
 
-// CHECK-LABEL:   define void @fusion
+// CHECK-LABEL:   define{{( amdgpu_kernel)?}} void @fusion
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
 // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 1008}
@@ -113,7 +113,7 @@ ENTRY main {
 
 // This tests that we keep few_waves enabled for large constants.
 
-// CHECK-LABEL:   define void @fusion
+// CHECK-LABEL:   define{{( amdgpu_kernel)?}} void @fusion
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
 // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 1008}
@@ -141,7 +141,7 @@ ENTRY main {
 
 // This tests that we disable few_waves if a non-elementwise op is present.
 
-// CHECK-LABEL:   define void @fusion
+// CHECK-LABEL:   define{{( amdgpu_kernel)?}} void @fusion
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
 // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 195313}
@@ -175,7 +175,7 @@ ENTRY main {
 // - the fusion is not row-vectorizable
 // It serves as a baseline for the tests below.
 
-// CHECK-LABEL:   define void @fusion
+// CHECK-LABEL:   define{{( amdgpu_kernel)?}} void @fusion
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
 // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 7813}
@@ -219,7 +219,7 @@ ENTRY main {
 // - the fusion IS row-vectorizable
 // In this case, the block count is changed from 7813 to 2000.
 
-// CHECK-LABEL:   define void @fusion
+// CHECK-LABEL:   define{{( amdgpu_kernel)?}} void @fusion
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
 // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 2000}
@@ -260,7 +260,7 @@ ENTRY main {
 // - the fusion is not row-vectorizable
 // In this case, the block count is changed from 7813 to 1008.
 
-// CHECK-LABEL:   define void @fusion
+// CHECK-LABEL:   define{{( amdgpu_kernel)?}} void @fusion
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
 // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 1008}
@@ -300,7 +300,7 @@ ENTRY main {
 // This tests the GELU kernel. The original kernel that
 // motivated few_waves implementation.
 
-// CHECK-LABEL:   define void @fusion
+// CHECK-LABEL:   define{{( amdgpu_kernel)?}} void @fusion
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
 // CHECK-GCN-DAG: call i32 @llvm.amdgcn.workgroup.id.x(), !range ![[ctaid_range:[0-9]+]]
 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]