diff --git a/xla/lit.cfg.py b/xla/lit.cfg.py index ebb8bd434f382..d148814cdfc73 100644 --- a/xla/lit.cfg.py +++ b/xla/lit.cfg.py @@ -43,8 +43,13 @@ ("%PYTHON", os.getenv("PYTHON", sys.executable)), ]) +if lit_config.params.get('PTX') == 'GCN': + config.available_features.add("IS_ROCM") + + # Include additional substitutions that may be defined via params config.substitutions.extend( ("%%{%s}" % key, val) for key, val in lit_config.params.items() ) + diff --git a/xla/service/gpu/tests/add_preds.hlo b/xla/service/gpu/tests/add_preds.hlo index 120b6a5ad686b..d86113ae2ad60 100644 --- a/xla/service/gpu/tests/add_preds.hlo +++ b/xla/service/gpu/tests/add_preds.hlo @@ -1,6 +1,6 @@ // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s -// CHECK: define void @fusion({{.*}}%[[ARG0:.*]], {{.*}}%[[ARG1:.*]], +// CHECK: define{{( amdgpu_kernel)?}} void @fusion({{.*}}%[[ARG0:.*]], {{.*}}%[[ARG1:.*]], // CHECK: %[[A:.*]] = load {{.*}} ptr %[[ARG0]] // CHECK: %[[B:.*]] = load {{.*}} ptr %[[ARG1]] // CHECK: or {{.*}} %[[A]], %[[B]] diff --git a/xla/service/gpu/tests/dot_bf16.hlo b/xla/service/gpu/tests/dot_bf16.hlo index a2a2c34c37eb8..ad3792ccd27ba 100644 --- a/xla/service/gpu/tests/dot_bf16.hlo +++ b/xla/service/gpu/tests/dot_bf16.hlo @@ -1,5 +1,6 @@ -// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/v100.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM70 -// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/a100_80.txtpb --split-input-file --xla_gpu_autotune_level=0 --xla_gpu_enable_triton_gemm=false | FileCheck %s --check-prefixes=CHECK-SM80 +// RUN: %if !IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/v100.txtpb --split-input-file | FileCheck %s 
--check-prefixes=CHECK-SM70 %} +// RUN: %if !IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/a100_80.txtpb --split-input-file --xla_gpu_autotune_level=0 --xla_gpu_enable_triton_gemm=false | FileCheck %s --check-prefixes=CHECK-SM80 %} +// RUN: %if IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/mi200.txtpb --split-input-file --xla_gpu_autotune_level=0 --xla_gpu_enable_triton_gemm=false | FileCheck %s --check-prefixes=CHECK-SM80 %} // CHECK-SM70: custom-call(f32 diff --git a/xla/service/gpu/tests/fused_scatter.hlo b/xla/service/gpu/tests/fused_scatter.hlo index 9a30436ebfa38..f8cb266bc4c67 100644 --- a/xla/service/gpu/tests/fused_scatter.hlo +++ b/xla/service/gpu/tests/fused_scatter.hlo @@ -2,7 +2,7 @@ // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py -// CHECK: define void @wrapped_scatter +// CHECK: define{{( amdgpu_kernel)?}} void @wrapped_scatter // CHECK: %[[VAL_70:.*]] = alloca i32, align 4 // CHECK-PTX: %[[VAL_71:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x // CHECK-GCN: %[[VAL_71:.*]] = call i32 @llvm.amdgcn.workgroup.id.x diff --git a/xla/service/gpu/tests/launch_dimensions.hlo b/xla/service/gpu/tests/launch_dimensions.hlo index bcfa37733f7e6..3d05dcf9892ad 100644 --- a/xla/service/gpu/tests/launch_dimensions.hlo +++ b/xla/service/gpu/tests/launch_dimensions.hlo @@ -2,7 +2,7 @@ // This tests that we do not increase the grid launch size when // few_waves is enabled. 
-// CHECK-LABEL: define void @wrapped_b +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @wrapped_b // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-GCN-DAG: call i32 @llvm.amdgcn.workgroup.id.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] @@ -27,7 +27,7 @@ ENTRY main { // This tests that we cap grid launch code when few_waves is enabled. -// CHECK-LABEL: define void @wrapped_b +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @wrapped_b // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-GCN-DAG: call i32 @llvm.amdgcn.workgroup.id.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] @@ -53,7 +53,7 @@ ENTRY main { // This tests that we cap grid launch code when few_waves is enabled // and scalar broadcast are present. -// CHECK-LABEL: define void @fusion_3 +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion_3 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 1008} @@ -84,7 +84,7 @@ ENTRY main { // This tests that we enable few_waves in a simple fusion. It is the baseline // for the tests below. -// CHECK-LABEL: define void @fusion +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 1008} @@ -113,7 +113,7 @@ ENTRY main { // This tests that we keep few_waves enabled for large constants. 
-// CHECK-LABEL: define void @fusion +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 1008} @@ -141,7 +141,7 @@ ENTRY main { // This tests that we disable few_waves if a non-elementwise op is present. -// CHECK-LABEL: define void @fusion +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 195313} @@ -175,7 +175,7 @@ ENTRY main { // - the fusion is not row-vectorizable // It serves as a baseline for the tests below. -// CHECK-LABEL: define void @fusion +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 7813} @@ -219,7 +219,7 @@ ENTRY main { // - the fusion IS row-vectorizable // In this case, the block count is changed from 7813 to 2000. -// CHECK-LABEL: define void @fusion +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 2000} @@ -260,7 +260,7 @@ ENTRY main { // - the fusion is not row-vectorizable // In this case, the block count is changed from 7813 to 1008. 
-// CHECK-LABEL: define void @fusion +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 1008} @@ -300,7 +300,7 @@ ENTRY main { // This tests the GELU kernel. The original kernel that // motivated few_waves implementation. -// CHECK-LABEL: define void @fusion +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-GCN-DAG: call i32 @llvm.amdgcn.workgroup.id.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] diff --git a/xla/service/gpu/tests/reduce_atomic_min.hlo b/xla/service/gpu/tests/reduce_atomic_min.hlo index c7165c9e11763..46e1240f947f8 100644 --- a/xla/service/gpu/tests/reduce_atomic_min.hlo +++ b/xla/service/gpu/tests/reduce_atomic_min.hlo @@ -39,7 +39,7 @@ ENTRY reduce.1 { // CHECK: wrapped_reduce.in_bounds-after: ; preds = %[[VAL_9]], %[[VAL_11:.*]] // CHECK: ret void // CHECK: wrapped_reduce.in_bounds-true: ; preds = %[[VAL_11]] -// CHECK: %[[VAL_12:.*]] = load float, ptr %[[VAL_13:.*]], align 4, !invariant.load !5 +// CHECK: %[[VAL_12:.*]] = load float, ptr %[[VAL_13:.*]], align 4, !invariant.load // CHECK: store float %[[VAL_12]], ptr %[[VAL_14:.*]], align 4 // CHECK: br label %[[VAL_10]] // CHECK: entry: @@ -67,11 +67,12 @@ ENTRY reduce.1 { // CHECK: %[[VAL_36:.*]] = alloca float, align 4 // CHECK: %[[VAL_37:.*]] = alloca float, align 4 // CHECK: %[[VAL_38:.*]] = alloca float, align 4 -// CHECK: %[[LOOP3_I_2:loop3.invar_address.*]] = alloca i32, align 4 +// CHECK: %[[LOOP3_I_2:loop[23].invar_address.*]] = alloca i32, align 4 +// CHECK-GCN: %[[VAL_42:return_buffer.*]] = alloca float, align 4 // CHECK: %[[LOOP2_I_2:loop2.invar_address.*]] = 
alloca i32, align 4 -// CHECK: %[[VAL_42:return_buffer.*]] = alloca float, align 4 -// CHECK: %[[VAL_40:.*]] = alloca i32, align 4 -// CHECK: %[[VAL_43:.*]] = alloca i32, align 4 +// CHECK-PTX: %[[VAL_42:return_buffer.*]] = alloca float, align 4 +// CHECK-PTX: %[[VAL_40:.*]] = alloca i32, align 4 +// CHECK-PTX: %[[VAL_43:.*]] = alloca i32, align 4 // CHECK: %partial_reduction_result = alloca float, align 4 // CHECK: %reduction_input_address = alloca float, align 4 // CHECK-PTX: %[[VAL_47:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !range !4 @@ -81,158 +82,245 @@ ENTRY reduce.1 { // CHECK: reduce-group-0-after: ; preds = %[[VAL_51:.*]], %[[VAL_52:.*]] // CHECK: ret void // CHECK: reduce-group-0-true: ; preds = %[[VAL_52]] -// CHECK: %[[VAL_53:.*]] = load float, ptr %[[VAL_54:.*]], align 4, !invariant.load !5 -// CHECK: store float %[[VAL_53]], ptr %partial_reduction_result, align 4 +// CHECK: %[[VAL_53:.*]] = load float, ptr %[[VAL_54:.*]], align 4, !invariant.load !{{[0-9]}} +// CHECK: store float %[[VAL_53]], ptr{{.*}} %partial_reduction_result, align 4 // CHECK-PTX: %thread.id.x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !6 // CHECK-GCN: %thread.id.x = call i32 @llvm.amdgcn.workitem.id.x // CHECK-PTX: %block.id.x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !7 // CHECK-GCN: %block.id.x = call i32 @llvm.amdgcn.workgroup.id.x // CHECK: %thread.id.2 = urem i32 %thread.id.x, 1024 // CHECK: %lane_id = urem i32 %thread.id.x, 32 -// CHECK: %[[VAL_63:.*]] = udiv i32 %block.id.x, 1 -// CHECK: %[[VECTOR_OFFSET:.*]] = urem i32 %[[VAL_63]], 1 +// CHECK-PTX: %[[VAL_63:.*]] = udiv i32 %block.id.x, 1 +// CHECK-PTX: %[[VECTOR_OFFSET:.*]] = urem i32 %[[VAL_63]], 1 // CHECK: %[[VAL_63_2:.*]] = udiv i32 %block.id.x, 1 // CHECK: %[[VAL_64:.*]] = urem i32 %[[VAL_63_2]], 19 // CHECK: %[[VAL_65:.*]] = udiv i32 %block.id.x, 19 // CHECK: %[[VAL_66:.*]] = urem i32 %[[VAL_65]], 1 // CHECK: %[[VAL_67:.*]] = udiv i32 %block.id.x, 19 // CHECK: %[[VAL_68:.*]] = icmp 
eq i32 %[[VAL_64]], 18 -// CHECK: %tile_bound.2 = select i1 %[[VAL_68]], i32 2544, i32 8192 +// CHECK-PTX: %tile_bound.2 = select i1 %[[VAL_68]], i32 2544, i32 8192 +// CHECK-GCN: %tile_bound.2 = select i1 %[[VAL_68]], i32 5088, i32 16384 // CHECK: %tile_origin.0 = mul i32 %[[VAL_67]], 1 // CHECK: %tile_origin.1 = mul i32 %[[VAL_66]], 1 -// CHECK: %tile_origin.2 = mul i32 %[[VAL_64]], 8192 -// CHECK: %tile_origin.3 = mul i32 %[[VECTOR_OFFSET]], 2 -// CHECK: %[[VAL_81:.*]] = icmp eq i32 8192, %tile_bound.2 +// CHECK-PTX: %tile_origin.2 = mul i32 %[[VAL_64]], 8192 +// CHECK-GCN: %tile_origin.2 = mul i32 %[[VAL_64]], 16384 +// CHECK-PTX: %tile_origin.3 = mul i32 %[[VECTOR_OFFSET]], 2 +// CHECK-PTX: %[[VAL_81:.*]] = icmp eq i32 8192, %tile_bound.2 +// CHECK-GCN: %[[VAL_81:.*]] = icmp eq i32 16384, %tile_bound.2 // CHECK: br i1 %[[VAL_81]], label %[[VAL_82:.*]], label %[[VAL_83:.*]] // CHECK: is_full_tile-after: ; preds = %[[VAL_84:.*]], %[[VAL_85:.*]] -// CHECK: %[[VAL_86:.*]] = load float, ptr %partial_reduction_result, align 4 -// CHECK: %[[VAL_87:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_86]], i32 16, i32 31) -// CHECK: store float %[[VAL_87]], ptr %[[VAL_37]], align 4 -// CHECK: call void @[[MIN:Min.*]](ptr %partial_reduction_result, ptr %[[VAL_37]], ptr %[[VAL_36]]) -// CHECK: %[[VAL_88:.*]] = load float, ptr %[[VAL_36]], align 4 -// CHECK: store float %[[VAL_88]], ptr %partial_reduction_result, align 4 -// CHECK: %[[VAL_89:.*]] = load float, ptr %partial_reduction_result, align 4 -// CHECK: %[[VAL_90:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_89]], i32 8, i32 31) -// CHECK: store float %[[VAL_90]], ptr %[[VAL_35]], align 4 -// CHECK: call void @[[MIN]](ptr %partial_reduction_result, ptr %[[VAL_35]], ptr %[[VAL_34]]) -// CHECK: %[[VAL_91:.*]] = load float, ptr %[[VAL_34]], align 4 -// CHECK: store float %[[VAL_91]], ptr %partial_reduction_result, align 4 -// CHECK: %[[VAL_92:.*]] = load float, ptr 
%partial_reduction_result, align 4 -// CHECK: %[[VAL_93:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_92]], i32 4, i32 31) -// CHECK: store float %[[VAL_93]], ptr %[[VAL_33]], align 4 -// CHECK: call void @[[MIN]](ptr %partial_reduction_result, ptr %[[VAL_33]], ptr %[[VAL_32]]) -// CHECK: %[[VAL_94:.*]] = load float, ptr %[[VAL_32]], align 4 -// CHECK: store float %[[VAL_94]], ptr %partial_reduction_result, align 4 -// CHECK: %[[VAL_95:.*]] = load float, ptr %partial_reduction_result, align 4 -// CHECK: %[[VAL_96:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_95]], i32 2, i32 31) -// CHECK: store float %[[VAL_96]], ptr %[[VAL_31]], align 4 -// CHECK: call void @[[MIN]](ptr %partial_reduction_result, ptr %[[VAL_31]], ptr %[[VAL_30]]) -// CHECK: %[[VAL_97:.*]] = load float, ptr %[[VAL_30]], align 4 -// CHECK: store float %[[VAL_97]], ptr %partial_reduction_result, align 4 -// CHECK: %[[VAL_98:.*]] = load float, ptr %partial_reduction_result, align 4 -// CHECK: %[[VAL_99:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_98]], i32 1, i32 31) -// CHECK: store float %[[VAL_99]], ptr %[[VAL_29]], align 4 -// CHECK: call void @[[MIN]](ptr %partial_reduction_result, ptr %[[VAL_29]], ptr %[[VAL_28]]) -// CHECK: %[[VAL_100:.*]] = load float, ptr %[[VAL_28]], align 4 -// CHECK: store float %[[VAL_100]], ptr %partial_reduction_result, align 4 +// CHECK: %[[VAL_86:.*]] = load float, ptr{{.*}} %partial_reduction_result, align 4 +// CHECK-PTX: %[[VAL_87:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_86]], i32 16, i32 31) +// CHECK-GCN: %[[VAL_87_1:.*]] = bitcast float %[[VAL_86]] to i32 +// CHECK-GCN: %[[VAL_87_2:.*]] = call i32 @__ockl_readuplane_i32 +// CHECK-GCN: %[[VAL_87:.*]] = bitcast i32 %[[VAL_87_2]] to float +// CHECK: store float %[[VAL_87]], ptr{{.*}} %[[VAL_37]], align 4 +// CHECK-GCN: %[[VAL_88_1:.*]] = addrspacecast ptr{{.*}} %partial_reduction_result to ptr +// CHECK-GCN: 
%[[VAL_88_2:.*]] = addrspacecast ptr{{.*}} %[[VAL_37]] to ptr +// CHECK-GCN: %[[VAL_88_3:.*]] = addrspacecast ptr{{.*}} %[[VAL_36]] to ptr +// CHECK-PTX: call void @[[MIN:Min.*]](ptr %partial_reduction_result, ptr %[[VAL_37]], ptr %[[VAL_36]]) +// CHECK-GCN: call void @[[MIN:Min.*]](ptr %[[VAL_88_1]], ptr %[[VAL_88_2]], ptr %[[VAL_88_3]]) +// CHECK: %[[VAL_88:.*]] = load float, ptr{{.*}} %[[VAL_36]], align 4 +// CHECK: store float %[[VAL_88]], ptr{{.*}} %partial_reduction_result, align 4 +// CHECK: %[[VAL_89:.*]] = load float, ptr{{.*}} %partial_reduction_result, align 4 +// CHECK-PTX: %[[VAL_90:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_89]], i32 8, i32 31) +// CHECK-GCN: %[[VAL_90_1:.*]] = bitcast float %[[VAL_89]] to i32 +// CHECK-GCN: %[[VAL_90_2:.*]] = call i32 @__ockl_readuplane_i32 +// CHECK-GCN: %[[VAL_90:.*]] = bitcast i32 %[[VAL_90_2]] to float +// CHECK: store float %[[VAL_90]], ptr{{.*}} %[[VAL_35]], align 4 +// CHECK-GCN: %[[VAL_91_1:.*]] = addrspacecast ptr{{.*}} %partial_reduction_result to ptr +// CHECK-GCN: %[[VAL_91_2:.*]] = addrspacecast ptr{{.*}} %[[VAL_35]] to ptr +// CHECK-GCN: %[[VAL_91_3:.*]] = addrspacecast ptr{{.*}} %[[VAL_34]] to ptr +// CHECK-PTX: call void @[[MIN]](ptr %partial_reduction_result, ptr %[[VAL_35]], ptr %[[VAL_34]]) +// CHECK-GCN: call void @[[MIN]](ptr %[[VAL_91_1]], ptr %[[VAL_91_2]], ptr %[[VAL_91_3]]) +// CHECK: %[[VAL_91:.*]] = load float, ptr{{.*}} %[[VAL_34]], align 4 +// CHECK: store float %[[VAL_91]], ptr{{.*}} %partial_reduction_result, align 4 +// CHECK: %[[VAL_92:.*]] = load float, ptr{{.*}} %partial_reduction_result, align 4 +// CHECK-PTX: %[[VAL_93:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_92]], i32 4, i32 31) +// CHECK-GCN: %[[VAL_93_1:.*]] = bitcast float %[[VAL_92]] to i32 +// CHECK-GCN: %[[VAL_93_2:.*]] = call i32 @__ockl_readuplane_i32 +// CHECK-GCN: %[[VAL_93:.*]] = bitcast i32 %[[VAL_93_2]] to float +// CHECK: store float %[[VAL_93]], ptr{{.*}} 
%[[VAL_33]], align 4 +// CHECK-GCN: %[[VAL_94_1:.*]] = addrspacecast ptr{{.*}} %partial_reduction_result to ptr +// CHECK-GCN: %[[VAL_94_2:.*]] = addrspacecast ptr{{.*}} %[[VAL_33]] to ptr +// CHECK-GCN: %[[VAL_94_3:.*]] = addrspacecast ptr{{.*}} %[[VAL_32]] to ptr +// CHECK-PTX: call void @[[MIN]](ptr %partial_reduction_result, ptr %[[VAL_33]], ptr %[[VAL_32]]) +// CHECK-GCN: call void @[[MIN]](ptr %[[VAL_94_1]], ptr %[[VAL_94_2]], ptr %[[VAL_94_3]]) +// CHECK: %[[VAL_94:.*]] = load float, ptr{{.*}} %[[VAL_32]], align 4 +// CHECK: store float %[[VAL_94]], ptr{{.*}} %partial_reduction_result, align 4 +// CHECK: %[[VAL_95:.*]] = load float, ptr{{.*}} %partial_reduction_result, align 4 +// CHECK-PTX: %[[VAL_96:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_95]], i32 2, i32 31) +// CHECK-GCN: %[[VAL_96_1:.*]] = bitcast float %[[VAL_95]] to i32 +// CHECK-GCN: %[[VAL_96_2:.*]] = call i32 @__ockl_readuplane_i32 +// CHECK-GCN: %[[VAL_96:.*]] = bitcast i32 %[[VAL_96_2]] to float +// CHECK: store float %[[VAL_96]], ptr{{.*}} %[[VAL_31]], align 4 +// CHECK-GCN: %[[VAL_97_1:.*]] = addrspacecast ptr{{.*}} %partial_reduction_result to ptr +// CHECK-GCN: %[[VAL_97_2:.*]] = addrspacecast ptr{{.*}} %[[VAL_31]] to ptr +// CHECK-GCN: %[[VAL_97_3:.*]] = addrspacecast ptr{{.*}} %[[VAL_30]] to ptr +// CHECK-PTX: call void @[[MIN]](ptr %partial_reduction_result, ptr %[[VAL_31]], ptr %[[VAL_30]]) +// CHECK-GCN: call void @[[MIN]](ptr %[[VAL_97_1]], ptr %[[VAL_97_2]], ptr %[[VAL_97_3]]) +// CHECK: %[[VAL_97:.*]] = load float, ptr{{.*}} %[[VAL_30]], align 4 +// CHECK: store float %[[VAL_97]], ptr{{.*}} %partial_reduction_result, align 4 +// CHECK: %[[VAL_98:.*]] = load float, ptr{{.*}} %partial_reduction_result, align 4 +// CHECK-PTX: %[[VAL_99:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_98]], i32 1, i32 31) +// CHECK-GCN: %[[VAL_99_1:.*]] = bitcast float %[[VAL_98]] to i32 +// CHECK-GCN: %[[VAL_99_2:.*]] = call i32 @__ockl_readuplane_i32 +// 
CHECK-GCN: %[[VAL_99:.*]] = bitcast i32 %[[VAL_99_2]] to float +// CHECK: store float %[[VAL_99]], ptr{{.*}} %[[VAL_29]], align 4 +// CHECK-GCN: %[[VAL_100_1:.*]] = addrspacecast ptr{{.*}} %partial_reduction_result to ptr +// CHECK-GCN: %[[VAL_100_2:.*]] = addrspacecast ptr{{.*}} %[[VAL_29]] to ptr +// CHECK-GCN: %[[VAL_100_3:.*]] = addrspacecast ptr{{.*}} %[[VAL_28]] to ptr +// CHECK-PTX: call void @[[MIN]](ptr %partial_reduction_result, ptr %[[VAL_29]], ptr %[[VAL_28]]) +// CHECK-GCN: call void @[[MIN]](ptr %[[VAL_100_1]], ptr %[[VAL_100_2]], ptr %[[VAL_100_3]]) +// CHECK: %[[VAL_100:.*]] = load float, ptr{{.*}} %[[VAL_28]], align 4 +// CHECK: store float %[[VAL_100]], ptr{{.*}} %partial_reduction_result, align 4 // CHECK: %[[VAL_101:.*]] = udiv i32 %thread.id.2, 32 // CHECK: br i1 true, label %[[VAL_105:.*]], label %[[VAL_51]] + // CHECK: thread_in_bounds-after: // CHECK: br label %[[VAL_50]] // CHECK: is_full_tile-true: -// CHECK: store i32 0, ptr %[[VAL_43]], align 4 +// CHECK-PTX: store i32 0, ptr{{.*}} %[[VAL_43]], align 4 +// CHECK-GCN: store i32 0, ptr{{.*}} %[[LOOP2_I_2]], align 4 // CHECK: br label %[[VAL_107:.*]] // CHECK: loop2.loop_header: ; preds = %[[VAL_108:.*]], %[[VAL_82]] -// CHECK: %[[VAL_109:.*]] = load i32, ptr %[[VAL_43]], align 4 -// CHECK: %[[VAL_110:.*]] = icmp uge i32 %[[VAL_109]], 8 +// CHECK-PTX: %[[VAL_109:.*]] = load i32, ptr %[[VAL_43]], align 4 +// CHECK-GCN: %[[VAL_109:.*]] = load i32, ptr{{.*}} %[[LOOP2_I_2]], align 4 +// CHECK: %[[VAL_110:.*]] = icmp uge i32 %[[VAL_109]], // CHECK: br i1 %[[VAL_110]], label %loop2.loop_exit, label %loop2.loop_body + // CHECK: loop2.loop_body: ; preds = %[[VAL_107]] // CHECK: %[[VAL_111:.*]] = add nuw nsw i32 %[[VAL_109]], 1 -// CHECK: store i32 %[[VAL_111]], ptr %[[VAL_43]], align 4 +// CHECK-PTX: store i32 %[[VAL_111]], ptr %[[VAL_43]], align 4 +// CHECK-GCN: store i32 %[[VAL_111]], ptr{{.*}} %[[LOOP2_I_2]], align 4 // CHECK: %[[OFFSET_2:.*]] = add i32 %loop2.indvar, %thread.id.2 -// CHECK: 
store i32 0, ptr %loop3.invar_address, align 4 -// CHECK: br label %loop3.loop_header -// CHECK: loop3.loop_header: -// CHECK: %loop3.indvar = load i32, ptr %loop3.invar_address, align 4 -// CHECK: %[[LOOP3_OOB:.*]] = icmp uge i32 %loop3.indvar, 2 -// CHECK: br i1 %[[LOOP3_OOB]], label %loop3.loop_exit, label %loop3.loop_body -// CHECK: loop3.loop_body: -// CHECK: %[[LOOP3_INC:.*]] = add nuw nsw i32 %loop3.indvar, 1 -// CHECK: store i32 %[[LOOP3_INC]], ptr %loop3.invar_address, align 4 -// CHECK: %[[START_0:.*]] = add i32 %tile_origin.0, 0 -// CHECK: %[[START_1:.*]] = add i32 %tile_origin.1, 0 -// CHECK: %[[START_2:.*]] = add i32 %tile_origin.2, %[[OFFSET_2]] -// CHECK: %[[START_3:.*]] = add i32 %tile_origin.3, %loop3.indvar -// CHECK: %[[VAL_113:.*]] = mul nuw nsw i32 %[[START_3]], 1 -// CHECK: %[[VAL_114:.*]] = add nuw nsw i32 0, %[[VAL_113]] -// CHECK: %[[VAL_115:.*]] = mul nuw nsw i32 %[[START_2]], 2 -// CHECK: %[[VAL_116:.*]] = add nuw nsw i32 %[[VAL_114]], %[[VAL_115]] -// CHECK: %[[VAL_119:.*]] = getelementptr inbounds [300000 x float], ptr %[[VAL_120:.*]], i32 0, i32 %[[VAL_116]] -// CHECK: %[[VAL_121:.*]] = load float, ptr %[[VAL_119]], align 4, !invariant.load !5 -// CHECK: store float %[[VAL_121]], ptr %reduction_input_address, align 4 -// CHECK: call void @[[MIN]](ptr %partial_reduction_result, ptr %reduction_input_address, ptr %[[VAL_42]]) -// CHECK: %[[VAL_123:.*]] = load float, ptr %[[VAL_42]], align 4 -// CHECK: store float %[[VAL_123]], ptr %partial_reduction_result, align 4 -// CHECK: br label %loop3.loop_header -// CHECK: loop3.loop_exit: -// CHECK: br label %loop2.loop_header +// CHECK-GCN: %[[START_0:.*]] = add i32 %tile_origin.0, 0 +// CHECK-GCN: %[[START_1:.*]] = add i32 %tile_origin.1, 0 +// CHECK-GCN: %[[START_2:.*]] = add i32 %tile_origin.2, %[[OFFSET_2]] +// CHECK-GCN: %[[VAL_119:.*]] = getelementptr inbounds [300000 x float], ptr %[[VAL_120:.*]], i32 0, i32 %[[START_2]] +// CHECK-GCN: %[[VAL_121:.*]] = load float, ptr %[[VAL_119]], align 
4, !invariant.load !3 +// CHECK-GCN: store float %[[VAL_121]], ptr{{.*}} %reduction_input_address, align 4 +// CHECK-GCN: %[[VAL_123_1:.*]] = addrspacecast ptr addrspace(5) %partial_reduction_result to ptr +// CHECK-GCN: %[[VAL_123_2:.*]] = addrspacecast ptr addrspace(5) %reduction_input_address to ptr +// CHECK-GCN: %[[VAL_123_3:.*]] = addrspacecast ptr addrspace(5) %[[VAL_42]] to ptr +// CHECK-GCN: call void @[[MIN]](ptr %[[VAL_123_1]], ptr %[[VAL_123_2]], ptr %[[VAL_123_3]]) +// CHECK-GCN: %[[VAL_123:.*]] = load float, ptr{{.*}} %[[VAL_42]], align 4 +// CHECK-GCN: store float %[[VAL_123]], ptr{{.*}} %partial_reduction_result, align 4 +// CHECK-GCN: br label %loop2.loop_header +// CHECK-PTX: store i32 0, ptr %loop3.invar_address, align 4 +// CHECK-PTX: br label %loop3.loop_header + +// CHECK-PTX: loop3.loop_header: +// CHECK-PTX: %loop3.indvar = load i32, ptr %loop3.invar_address, align 4 +// CHECK-PTX: %[[LOOP3_OOB:.*]] = icmp uge i32 %loop3.indvar, 2 +// CHECK-PTX: br i1 %[[LOOP3_OOB]], label %loop3.loop_exit, label %loop3.loop_body +// CHECK-PTX: loop3.loop_body: +// CHECK-PTX: %[[LOOP3_INC:.*]] = add nuw nsw i32 %loop3.indvar, 1 +// CHECK-PTX: store i32 %[[LOOP3_INC]], ptr %loop3.invar_address, align 4 +// CHECK-PTX: %[[START_0:.*]] = add i32 %tile_origin.0, 0 +// CHECK-PTX: %[[START_1:.*]] = add i32 %tile_origin.1, 0 +// CHECK-PTX: %[[START_2:.*]] = add i32 %tile_origin.2, %[[OFFSET_2]] +// CHECK-PTX: %[[START_3:.*]] = add i32 %tile_origin.3, %loop3.indvar +// CHECK-PTX: %[[VAL_113:.*]] = mul nuw nsw i32 %[[START_3]], 1 +// CHECK-PTX: %[[VAL_114:.*]] = add nuw nsw i32 0, %[[VAL_113]] +// CHECK-PTX: %[[VAL_115:.*]] = mul nuw nsw i32 %[[START_2]], 2 +// CHECK-PTX: %[[VAL_116:.*]] = add nuw nsw i32 %[[VAL_114]], %[[VAL_115]] +// CHECK-PTX: %[[VAL_119:.*]] = getelementptr inbounds [300000 x float], ptr %[[VAL_120:.*]], i32 0, i32 %[[VAL_116]] +// CHECK-PTX: %[[VAL_121:.*]] = load float, ptr %[[VAL_119]], align 4, !invariant.load !5 +// CHECK-PTX: store float 
%[[VAL_121]], ptr %reduction_input_address, align 4 +// CHECK-PTX: call void @[[MIN]](ptr %partial_reduction_result, ptr %reduction_input_address, ptr %[[VAL_42]]) +// CHECK-PTX: %[[VAL_123:.*]] = load float, ptr %[[VAL_42]], align 4 +// CHECK-PTX: store float %[[VAL_123]], ptr %partial_reduction_result, align 4 +// CHECK-PTX: br label %loop3.loop_header +// CHECK-PTX: loop3.loop_exit: +// CHECK-PTX: br label %loop2.loop_header + // CHECK: loop2.loop_exit: // CHECK: br label %is_full_tile-after + // CHECK: is_full_tile-false: -// CHECK: store i32 0, ptr %[[LOOP2_I_2]], align 4 +// CHECK-PTX: store i32 0, ptr %[[LOOP2_I_2]], align 4 +// CHECK-GCN: store i32 0, ptr{{.*}} %[[LOOP3_I_2]], align 4 // CHECK: br label %[[VAL_134:.*]] -// CHECK: loop2.loop_header4: -// CHECK: %[[VAL_136:.*]] = load i32, ptr %[[LOOP2_I_2]], align 4 -// CHECK: %[[VAL_137:.*]] = icmp uge i32 %[[VAL_136]], 8 + +// CHECK: loop2.loop_header{{(4|3)}}: +// CHECK-PTX: %[[VAL_136:.*]] = load i32, ptr %[[LOOP2_I_2]], align 4 +// CHECK-GCN: %[[VAL_136:.*]] = load i32, ptr{{.*}} %[[LOOP3_I_2]], align 4 +// CHECK: %[[VAL_137:.*]] = icmp uge i32 %[[VAL_136]], {{(8|16384)}} // CHECK: br i1 %[[VAL_137]], label %[[VAL_84]], label %[[VAL_138:.*]] -// CHECK: loop2.loop_body5: + +// CHECK: loop2.loop_body{{(5|4)}}: // CHECK: %[[VAL_139:.*]] = add nuw nsw i32 %[[VAL_136]], 1 -// CHECK: store i32 %[[VAL_139]], ptr %[[LOOP2_I_2]], align 4 +// CHECK-PTX: store i32 %[[VAL_139]], ptr %[[LOOP2_I_2]], align 4 +// CHECK-GCN: store i32 %[[VAL_139]], ptr{{.*}} %[[LOOP3_I_2]], align 4 // CHECK: %[[VAL_141:.*]] = add i32 %[[VAL_136]], %thread.id.2 // CHECK: %[[VAL_144:.*]] = icmp ult i32 %[[VAL_141]], %tile_bound.2 // CHECK: br i1 %[[VAL_144]], label %x_in_tile-true, label %x_in_tile-after + // CHECK: x_in_tile-after: -// CHECK: br label %loop2.loop_header4 -// CHECK: loop2.loop_exit3: +// CHECK: br label %loop2.loop_header{{(4|3)}} + +// CHECK: loop2.loop_exit{{(3|2)}}: // CHECK: br label %is_full_tile-after + // CHECK: 
x_in_tile-true: ; preds = %[[VAL_138]] -// CHECK: store i32 0, ptr %[[LOOP3_I_2]], align 4 -// CHECK: br label %loop3.loop_header10 -// CHECK: loop3.loop_header10: -// CHECK: %[[VAL_145:.*]] = load i32, ptr %[[LOOP3_I_2]], align 4 -// CHECK: %[[VAL_146:.*]] = icmp uge i32 %[[VAL_145]], 2 -// CHECK: br i1 %[[VAL_146]], label %loop3.loop_exit9, label %loop3.loop_body11 -// CHECK: loop3.loop_body11: -// CHECK: %[[VAL_147:.*]] = add nuw nsw i32 %[[VAL_145]], 1 -// CHECK: store i32 %[[VAL_147]], ptr %[[LOOP3_I_2]], align 4 -// CHECK: %[[IDX0:.*]] = add i32 %tile_origin.0, 0 -// CHECK: %[[IDX1:.*]] = add i32 %tile_origin.1, 0 -// CHECK: %[[IDX2:.*]] = add i32 %tile_origin.2, %[[VAL_141]] -// CHECK: %[[IDX3:.*]] = add i32 %tile_origin.3, %[[VAL_145]] -// CHECK: %[[VAL_148:.*]] = mul nuw nsw i32 %[[IDX3]], 1 -// CHECK: %[[VAL_149:.*]] = add nuw nsw i32 0, %[[VAL_148]] -// CHECK: %[[VAL_150:.*]] = mul nuw nsw i32 %[[IDX2]], 2 -// CHECK: %[[VAL_151:.*]] = add nuw nsw i32 %[[VAL_149]], %[[VAL_150]] -// CHECK: %[[VAL_155:.*]] = getelementptr inbounds [300000 x float], ptr %[[VAL_120]], i32 0, i32 %[[VAL_151]] -// CHECK: %[[VAL_156:.*]] = load float, ptr %[[VAL_155]], align 4, !invariant.load !5 -// CHECK: store float %[[VAL_156]], ptr %reduction_input_address, align 4 -// CHECK: call void @[[MIN]](ptr %partial_reduction_result, ptr %reduction_input_address, ptr %[[VAL_38]]) -// CHECK: %[[VAL_158:.*]] = load float, ptr %[[VAL_38]], align 4 -// CHECK: store float %[[VAL_158]], ptr %partial_reduction_result, align 4 -// CHECK: br label %loop3.loop_header10 -// CHECK: loop3.loop_exit9: -// CHECK: br label %x_in_tile-after +// CHECK-GCN: %[[IDX0:.*]] = add i32 %tile_origin.0, 0 +// CHECK-GCN: %[[IDX1:.*]] = add i32 %tile_origin.1, 0 +// CHECK-GCN: %[[IDX2:.*]] = add i32 %tile_origin.2, %[[VAL_141]] +// CHECK-GCN: %[[VAL_155:.*]] = getelementptr inbounds [300000 x float], ptr %[[VAL_120]], i32 0, i32 %[[IDX2]] +// CHECK-GCN: %[[VAL_156:.*]] = load float, ptr %[[VAL_155]], align 4, 
!invariant.load !3 +// CHECK-GCN: store float %[[VAL_156]], ptr{{.*}} %reduction_input_address, align 4 +// CHECK-GCN: %[[VAL_158_1:.*]] = addrspacecast ptr addrspace(5) %partial_reduction_result to ptr +// CHECK-GCN: %[[VAL_158_2:.*]] = addrspacecast ptr addrspace(5) %reduction_input_address to ptr +// CHECK-GCN: %[[VAL_158_3:.*]] = addrspacecast ptr addrspace(5) %[[VAL_38]] to ptr +// CHECK-GCN: call void @[[MIN]](ptr %[[VAL_158_1]], ptr %[[VAL_158_2]], ptr %[[VAL_158_3]]) +// CHECK-GCN: %[[VAL_158:.*]] = load float, ptr{{.*}} %[[VAL_38]], align 4 +// CHECK-GCN: store float %[[VAL_158]], ptr{{.*}} %partial_reduction_result, align 4 +// CHECK-GCN: br label %x_in_tile-after +// CHECK-PTX: store i32 0, ptr %[[LOOP3_I_2]], align 4 +// CHECK-PTX: br label %loop3.loop_header10 + +// CHECK-PTX: loop3.loop_header10: +// CHECK-PTX: %[[VAL_145:.*]] = load i32, ptr %[[LOOP3_I_2]], align 4 +// CHECK-PTX: %[[VAL_146:.*]] = icmp uge i32 %[[VAL_145]], 2 +// CHECK-PTX: br i1 %[[VAL_146]], label %loop3.loop_exit9, label %loop3.loop_body11 + +// CHECK-PTX: loop3.loop_body11: +// CHECK-PTX: %[[VAL_147:.*]] = add nuw nsw i32 %[[VAL_145]], 1 +// CHECK-PTX: store i32 %[[VAL_147]], ptr %[[LOOP3_I_2]], align 4 +// CHECK-PTX: %[[IDX0:.*]] = add i32 %tile_origin.0, 0 +// CHECK-PTX: %[[IDX1:.*]] = add i32 %tile_origin.1, 0 +// CHECK-PTX: %[[IDX2:.*]] = add i32 %tile_origin.2, %[[VAL_141]] +// CHECK-PTX: %[[IDX3:.*]] = add i32 %tile_origin.3, %[[VAL_145]] +// CHECK-PTX: %[[VAL_148:.*]] = mul nuw nsw i32 %[[IDX3]], 1 +// CHECK-PTX: %[[VAL_149:.*]] = add nuw nsw i32 0, %[[VAL_148]] +// CHECK-PTX: %[[VAL_150:.*]] = mul nuw nsw i32 %[[IDX2]], 2 +// CHECK-PTX: %[[VAL_151:.*]] = add nuw nsw i32 %[[VAL_149]], %[[VAL_150]] +// CHECK-PTX: %[[VAL_155:.*]] = getelementptr inbounds [300000 x float], ptr %[[VAL_120]], i32 0, i32 %[[VAL_151]] +// CHECK-PTX: %[[VAL_156:.*]] = load float, ptr %[[VAL_155]], align 4, !invariant.load !5 +// CHECK-PTX: store float %[[VAL_156]], ptr %reduction_input_address, 
align 4 +// CHECK-PTX: call void @[[MIN]](ptr %partial_reduction_result, ptr %reduction_input_address, ptr %[[VAL_38]]) +// CHECK-PTX: %[[VAL_158:.*]] = load float, ptr %[[VAL_38]], align 4 +// CHECK-PTX: store float %[[VAL_158]], ptr %partial_reduction_result, align 4 +// CHECK-PTX: br label %loop3.loop_header10 + +// CHECK-PTX: loop3.loop_exit9: +// CHECK-PTX: br label %x_in_tile-after + // CHECK: thread_in_bounds-true: // CHECK: %[[VAL_166:.*]] = icmp eq i32 %lane_id, 0 // CHECK: br i1 %[[VAL_166]], label %[[VAL_167:.*]], label %[[VAL_168:.*]] + // CHECK: intra_warp_reduce_write-after: ; preds = %[[VAL_167]], %[[VAL_105]] -// CHECK: call void @llvm.nvvm.barrier0() +// CHECK-GCN: fence syncscope("workgroup") seq_cst +// CHECK-GCN: call void @llvm.amdgcn.s.barrier() +// CHECK-PTX: call void @llvm.nvvm.barrier0() // CHECK: %[[VAL_169:.*]] = icmp eq i32 %[[VAL_101]], 0 // CHECK: br i1 %[[VAL_169]], label %inter_warp_reduce-true, label %inter_warp_reduce-after // CHECK: inter_warp_reduce-after: ; preds = %[[VAL_171:.*]], %[[VAL_168]] // CHECK: br label %[[VAL_51]] // CHECK: intra_warp_reduce_write-true: ; preds = %[[VAL_105]] -// CHECK: %[[VAL_172:.*]] = load float, ptr %partial_reduction_result, align 4 +// CHECK: %[[VAL_172:.*]] = load float, ptr{{.*}} %partial_reduction_result, align 4 // CHECK: %[[VAL_173:.*]] = getelementptr inbounds [1 x [32 x float]], ptr addrspace(3) @shared_cache, i32 0, i32 0, i32 %[[VAL_101]] // CHECK: %[[VAL_174:.*]] = addrspacecast ptr addrspace(3) %[[VAL_173]] to ptr // CHECK: store float %[[VAL_172]], ptr %[[VAL_174]], align 4 @@ -240,38 +328,76 @@ ENTRY reduce.1 { // CHECK: inter_warp_reduce-true: ; preds = %[[VAL_168]] // CHECK: %[[VAL_175:.*]] = getelementptr inbounds [1 x [32 x float]], ptr addrspace(3) @shared_cache, i32 0, i32 0, i32 %lane_id // CHECK: %[[VAL_176:.*]] = addrspacecast ptr addrspace(3) %[[VAL_175]] to ptr -// CHECK: store float %[[VAL_53]], ptr %[[VAL_27]], align 4 +// CHECK-GCN: %[[VAL_176_1:.*]] = addrspacecast 
ptr addrspace(5) %[[VAL_27]] to ptr +// CHECK-GCN: store float %[[VAL_53]], ptr{{.*}} %[[VAL_176_1]], align 4 +// CHECK-PTX: store float %[[VAL_53]], ptr %[[VAL_27]], align 4 // CHECK: %[[VAL_177:.*]] = icmp ult i32 %thread.id.2, 32 -// CHECK: %[[VAL_178:.*]] = select i1 %[[VAL_177]], ptr %[[VAL_176]], ptr %[[VAL_27]] +// CHECK-GCN: %[[VAL_178:.*]] = select i1 %[[VAL_177]], ptr %[[VAL_176]], ptr %[[VAL_176_1]] +// CHECK-PTX: %[[VAL_178:.*]] = select i1 %[[VAL_177]], ptr %[[VAL_176]], ptr %[[VAL_27]] // CHECK: %[[VAL_179:.*]] = load float, ptr %[[VAL_178]], align 4 -// CHECK: %[[VAL_180:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_179]], i32 16, i32 31) -// CHECK: store float %[[VAL_180]], ptr %[[VAL_26]], align 4 -// CHECK: call void @[[MIN]](ptr %[[VAL_178]], ptr %[[VAL_26]], ptr %[[VAL_25]]) -// CHECK: %[[VAL_181:.*]] = load float, ptr %[[VAL_25]], align 4 +// CHECK-GCN: %[[VAL_179_1:.*]] = bitcast float %[[VAL_179]] to i32 +// CHECK-GCN: %[[VAL_180:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_179_1]], i32 16) +// CHECK-GCN: %[[VAL_180_1:.*]] = bitcast i32 %[[VAL_180]] to float +// CHECK-GCN: store float %[[VAL_180_1]], ptr{{.*}} %[[VAL_26]], align 4 +// CHECK-PTX: %[[VAL_180:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_179]], i32 16, i32 31) +// CHECK-PTX: store float %[[VAL_180]], ptr %[[VAL_26]], align 4 +// CHECK-GCN: %[[VAL_181_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_26]] to ptr +// CHECK-GCN: %[[VAL_181_3:.*]] = addrspacecast ptr addrspace(5) %[[VAL_25]] to ptr +// CHECK-GCN: call void @[[MIN]](ptr %[[VAL_178]], ptr %[[VAL_181_2]], ptr %[[VAL_181_3]]) +// CHECK-PTX: call void @[[MIN]](ptr %[[VAL_178]], ptr %[[VAL_26]], ptr %[[VAL_25]]) +// CHECK: %[[VAL_181:.*]] = load float, ptr{{.*}} %[[VAL_25]], align 4 // CHECK: store float %[[VAL_181]], ptr %[[VAL_178]], align 4 // CHECK: %[[VAL_182:.*]] = load float, ptr %[[VAL_178]], align 4 -// CHECK: %[[VAL_183:.*]] = call float 
@llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_182]], i32 8, i32 31) -// CHECK: store float %[[VAL_183]], ptr %[[VAL_24]], align 4 -// CHECK: call void @[[MIN]](ptr %[[VAL_178]], ptr %[[VAL_24]], ptr %[[VAL_23]]) -// CHECK: %[[VAL_184:.*]] = load float, ptr %[[VAL_23]], align 4 +// CHECK-GCN: %[[VAL_182_1:.*]] = bitcast float %[[VAL_182]] to i32 +// CHECK-GCN: %[[VAL_183:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_182_1]], i32 8) +// CHECK-GCN: %[[VAL_183_1:.*]] = bitcast i32 %[[VAL_183]] to float +// CHECK-GCN: store float %[[VAL_183_1]], ptr{{.*}} %[[VAL_24]], align 4 +// CHECK-PTX: %[[VAL_183:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_182]], i32 8, i32 31) +// CHECK-PTX: store float %[[VAL_183]], ptr %[[VAL_24]], align 4 +// CHECK-GCN: %[[VAL_184_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_24]] to ptr +// CHECK-GCN: %[[VAL_184_3:.*]] = addrspacecast ptr addrspace(5) %[[VAL_23]] to ptr +// CHECK-GCN: call void @[[MIN]](ptr %[[VAL_178]], ptr %[[VAL_184_2]], ptr %[[VAL_184_3]]) +// CHECK-PTX: call void @[[MIN]](ptr %[[VAL_178]], ptr %[[VAL_24]], ptr %[[VAL_23]]) +// CHECK: %[[VAL_184:.*]] = load float, ptr{{.*}} %[[VAL_23]], align 4 // CHECK: store float %[[VAL_184]], ptr %[[VAL_178]], align 4 // CHECK: %[[VAL_185:.*]] = load float, ptr %[[VAL_178]], align 4 -// CHECK: %[[VAL_186:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_185]], i32 4, i32 31) -// CHECK: store float %[[VAL_186]], ptr %[[VAL_22]], align 4 -// CHECK: call void @[[MIN]](ptr %[[VAL_178]], ptr %[[VAL_22]], ptr %[[VAL_21]]) -// CHECK: %[[VAL_187:.*]] = load float, ptr %[[VAL_21]], align 4 +// CHECK-GCN: %[[VAL_185_1:.*]] = bitcast float %[[VAL_185]] to i32 +// CHECK-GCN: %[[VAL_186:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_185_1]], i32 4) +// CHECK-GCN: %[[VAL_186_1:.*]] = bitcast i32 %[[VAL_186]] to float +// CHECK-GCN: store float %[[VAL_186_1]], ptr{{.*}} %[[VAL_22]], align 4 +// CHECK-PTX: %[[VAL_186:.*]] = call float 
@llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_185]], i32 4, i32 31) +// CHECK-PTX: store float %[[VAL_186]], ptr %[[VAL_22]], align 4 +// CHECK-GCN: %[[VAL_187_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_22]] to ptr +// CHECK-GCN: %[[VAL_187_3:.*]] = addrspacecast ptr addrspace(5) %[[VAL_21]] to ptr +// CHECK-GCN: call void @[[MIN]](ptr %[[VAL_178]], ptr %[[VAL_187_2]], ptr %[[VAL_187_3]]) +// CHECK-PTX: call void @[[MIN]](ptr %[[VAL_178]], ptr %[[VAL_22]], ptr %[[VAL_21]]) +// CHECK: %[[VAL_187:.*]] = load float, ptr{{.*}} %[[VAL_21]], align 4 // CHECK: store float %[[VAL_187]], ptr %[[VAL_178]], align 4 // CHECK: %[[VAL_188:.*]] = load float, ptr %[[VAL_178]], align 4 -// CHECK: %[[VAL_189:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_188]], i32 2, i32 31) -// CHECK: store float %[[VAL_189]], ptr %[[VAL_20]], align 4 -// CHECK: call void @[[MIN]](ptr %[[VAL_178]], ptr %[[VAL_20]], ptr %[[VAL_19]]) -// CHECK: %[[VAL_190:.*]] = load float, ptr %[[VAL_19]], align 4 +// CHECK-GCN: %[[VAL_188_1:.*]] = bitcast float %[[VAL_188]] to i32 +// CHECK-GCN: %[[VAL_189:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_188_1]], i32 2) +// CHECK-GCN: %[[VAL_189_1:.*]] = bitcast i32 %[[VAL_189]] to float +// CHECK-GCN: store float %[[VAL_189_1]], ptr{{.*}} %[[VAL_20]], align 4 +// CHECK-PTX: %[[VAL_189:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_188]], i32 2, i32 31) +// CHECK-PTX: store float %[[VAL_189]], ptr %[[VAL_20]], align 4 +// CHECK-GCN: %[[VAL_190_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_20]] to ptr +// CHECK-GCN: %[[VAL_190_3:.*]] = addrspacecast ptr addrspace(5) %[[VAL_19]] to ptr +// CHECK-GCN: call void @[[MIN]](ptr %[[VAL_178]], ptr %[[VAL_190_2]], ptr %[[VAL_190_3]]) +// CHECK-PTX: call void @[[MIN]](ptr %[[VAL_178]], ptr %[[VAL_20]], ptr %[[VAL_19]]) +// CHECK: %[[VAL_190:.*]] = load float, ptr{{.*}} %[[VAL_19]], align 4 // CHECK: store float %[[VAL_190]], ptr %[[VAL_178]], align 4 // CHECK: 
%[[VAL_191:.*]] = load float, ptr %[[VAL_178]], align 4 -// CHECK: %[[VAL_192:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_191]], i32 1, i32 31) -// CHECK: store float %[[VAL_192]], ptr %[[VAL_18]], align 4 -// CHECK: call void @[[MIN]](ptr %[[VAL_178]], ptr %[[VAL_18]], ptr %[[VAL_17]]) -// CHECK: %[[VAL_193:.*]] = load float, ptr %[[VAL_17]], align 4 +// CHECK-GCN: %[[VAL_191_1:.*]] = bitcast float %[[VAL_191]] to i32 +// CHECK-GCN: %[[VAL_192:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_191_1]], i32 1) +// CHECK-GCN: %[[VAL_192_1:.*]] = bitcast i32 %[[VAL_192]] to float +// CHECK-GCN: store float %[[VAL_192_1]], ptr{{.*}} %[[VAL_18]], align 4 +// CHECK-PTX: %[[VAL_192:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_191]], i32 1, i32 31) +// CHECK-PTX: store float %[[VAL_192]], ptr %[[VAL_18]], align 4 +// CHECK-GCN: %[[VAL_193_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_18]] to ptr +// CHECK-GCN: %[[VAL_193_3:.*]] = addrspacecast ptr addrspace(5) %[[VAL_17]] to ptr +// CHECK-GCN: call void @[[MIN]](ptr %[[VAL_178]], ptr %[[VAL_193_2]], ptr %[[VAL_193_3]]) +// CHECK-PTX: call void @[[MIN]](ptr %[[VAL_178]], ptr %[[VAL_18]], ptr %[[VAL_17]]) +// CHECK: %[[VAL_193:.*]] = load float, ptr{{.*}} %[[VAL_17]], align 4 // CHECK: store float %[[VAL_193]], ptr %[[VAL_178]], align 4 // CHECK: %[[VAL_194:.*]] = icmp eq i32 %thread.id.2, 0 // CHECK: br i1 %[[VAL_194]], label %[[VAL_195:.*]], label %[[VAL_171]] @@ -280,29 +406,38 @@ ENTRY reduce.1 { // CHECK: reduction_write_output-true: // CHECK: %[[VAL_200:.*]] = load float, ptr %[[VAL_178]], align 4 // CHECK: %[[VAL_201:.*]] = load i32, ptr %[[VAL_202:.*]], align 4 -// CHECK: store i32 %[[VAL_201]], ptr %[[VAL_16]], align 4 +// CHECK: store i32 %[[VAL_201]], ptr{{.*}} %[[VAL_16]], align 4 // CHECK: br label %[[VAL_203:.*]] // CHECK: atomic_op_loop_exit: ; preds = %[[VAL_204:.*]], %[[VAL_203]] // CHECK: br label %[[VAL_171]] // CHECK: atomic_op_loop_body: ; preds = 
%[[VAL_204]], %[[VAL_195]] -// CHECK: %[[VAL_205:.*]] = load i32, ptr %[[VAL_16]], align 4 -// CHECK: store i32 %[[VAL_205]], ptr %[[VAL_15]], align 4 -// CHECK: call void @[[MIN]](ptr %[[VAL_15]], ptr %[[VAL_178]], ptr %[[VAL_15]]) -// CHECK: %[[VAL_206:.*]] = load i32, ptr %[[VAL_15]], align 4 +// CHECK: %[[VAL_205:.*]] = load i32, ptr{{.*}} %[[VAL_16]], align 4 +// CHECK: store i32 %[[VAL_205]], ptr{{.*}} %[[VAL_15]], align 4 +// CHECK-GCN: %[[VAL_206_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_15]] to ptr +// CHECK-GCN: %[[VAL_206_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_15]] to ptr +// CHECK-GCN: call void @[[MIN]](ptr %[[VAL_206_1]], ptr %[[VAL_178]], ptr %[[VAL_206_2]]) +// CHECK-PTX: call void @[[MIN]](ptr %[[VAL_15]], ptr %[[VAL_178]], ptr %[[VAL_15]]) +// CHECK: %[[VAL_206:.*]] = load i32, ptr{{.*}} %[[VAL_15]], align 4 // CHECK: %[[VAL_207:.*]] = icmp eq i32 %[[VAL_205]], %[[VAL_206]] // CHECK: br i1 %[[VAL_207]], label %atomic_op_loop_exit, label %atomic_op_loop_cas // CHECK: atomic_op_loop_cas: ; preds = %[[VAL_203]] -// CHECK: %[[VAL_208:.*]] = cmpxchg ptr %[[VAL_202]], i32 %[[VAL_205]], i32 %[[VAL_206]] seq_cst seq_cst, align 4 +// CHECK: %[[VAL_208:.*]] = cmpxchg ptr %[[VAL_202]], i32 %[[VAL_205]], i32 %[[VAL_206]]{{.*}} seq_cst seq_cst, align 4 // CHECK: %[[VAL_209:.*]] = extractvalue { i32, i1 } %[[VAL_208]], 0 -// CHECK: store i32 %[[VAL_209]], ptr %[[VAL_16]], align 4 +// CHECK: store i32 %[[VAL_209]], ptr{{.*}} %[[VAL_16]], align 4 // CHECK: %[[VAL_210:.*]] = extractvalue { i32, i1 } %[[VAL_208]], 1 // CHECK: br i1 %[[VAL_210]], label %atomic_op_loop_exit, label %atomic_op_loop_body // CHECK: entry: // CHECK: %[[VAL_211:.*]] = alloca float, align 4 // CHECK: %[[VAL_212:.*]] = load float, ptr %[[VAL_213:.*]], align 4 // CHECK: %[[VAL_214:.*]] = load float, ptr %[[VAL_215:.*]], align 4 -// CHECK: %[[VAL_216:.*]] = call float @llvm.minimum.f32(float %[[VAL_212]], float %[[VAL_214]]) -// CHECK: store float %[[VAL_216]], ptr %[[VAL_211]], 
align 4 -// CHECK: %[[VAL_217:.*]] = load float, ptr %[[VAL_211]], align 4 +// CHECK-PTX: %[[VAL_216:.*]] = call float @llvm.minimum.f32(float %[[VAL_212]], float %[[VAL_214]]) +// CHECK-GCN: %[[VAL_216_1:.*]] = fcmp une float %[[VAL_212]], %[[VAL_212]] +// CHECK-GCN: %[[VAL_216_2:.*]] = fcmp oeq float %[[VAL_214]], %[[VAL_214]] +// CHECK-GCN: %[[VAL_216_3:.*]] = fcmp ole float %[[VAL_212]], %[[VAL_214]] +// CHECK-GCN: %[[VAL_216_4:.*]] = and i1 %[[VAL_216_2]], %[[VAL_216_3]] +// CHECK-GCN: %[[VAL_216_5:.*]] = or i1 %[[VAL_216_1]], %[[VAL_216_4]] +// CHECK-GCN: %[[VAL_216:.*]] = select i1 %[[VAL_216_5]], float %[[VAL_212]], float %[[VAL_214]] +// CHECK: store float %[[VAL_216]], ptr{{.*}} %[[VAL_211]], align 4 +// CHECK: %[[VAL_217:.*]] = load float, ptr{{.*}} %[[VAL_211]], align 4 // CHECK: store float %[[VAL_217]], ptr %[[VAL_218:.*]], align 4 // CHECK: ret void diff --git a/xla/service/gpu/tests/reduce_column_layout_change.hlo b/xla/service/gpu/tests/reduce_column_layout_change.hlo index cb30643886de4..122929f3df280 100644 --- a/xla/service/gpu/tests/reduce_column_layout_change.hlo +++ b/xla/service/gpu/tests/reduce_column_layout_change.hlo @@ -43,7 +43,7 @@ ENTRY kernel_entry { // CHECK: ret void // CHECK: reduce-group-0-true: ; preds = %[[VAL_20]] // CHECK: %[[VAL_21:.*]] = load float, ptr @0, align 4 -// CHECK: store float %[[VAL_21]], ptr %[[VAL_13]], align 4 +// CHECK: store float %[[VAL_21]], ptr{{( addrspace\(5\))?}} %[[VAL_13]], align 4 // CHECK-PTX: %thread.id.x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !3 // CHECK-GCN: %thread.id.x = call i32 @llvm.amdgcn.workitem.id.x // CHECK-PTX: %block.id.x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !4 @@ -62,71 +62,84 @@ ENTRY kernel_entry { // CHECK: %tile_origin.0 = mul i32 %[[VAL_27]], 1 // CHECK: %tile_origin.1 = mul i32 %[[VAL_26]], 4096 // CHECK: %tile_origin.2 = mul i32 %[[VAL_24]], 32 -// CHECK: store i32 %thread.id.1, ptr %[[VAL_12]], align 4 +// CHECK: store i32 %thread.id.1, ptr{{( 
addrspace\(5\))?}} %[[VAL_12]], align 4 // CHECK: br label %[[VAL_29:.*]] // CHECK: loop1.loop_header: ; preds = %[[VAL_30:.*]], %[[VAL_17]] -// CHECK: %[[VAL_31:.*]] = load i32, ptr %[[VAL_12]], align 4 +// CHECK: %[[VAL_31:.*]] = load i32, ptr{{( addrspace\(5\))?}} %[[VAL_12]], align 4 // CHECK: %[[VAL_32:.*]] = icmp uge i32 %[[VAL_31]], %tile_bound.1 // CHECK: br i1 %[[VAL_32]], label %[[VAL_33:.*]], label %[[VAL_34:.*]] // CHECK: loop1.loop_body: ; preds = %[[VAL_29]] // CHECK: %[[VAL_35:.*]] = add nuw nsw i32 %[[VAL_31]], 32 -// CHECK: store i32 %[[VAL_35]], ptr %[[VAL_12]], align 4 -// CHECK: store i32 0, ptr %[[VAL_11]], align 4 +// CHECK: store i32 %[[VAL_35]], ptr{{( addrspace\(5\))?}} %[[VAL_12]], align 4 +// CHECK: store i32 0, ptr{{( addrspace\(5\))?}} %[[VAL_11]], align 4 // CHECK: br label %[[VAL_37:.*]] // CHECK: loop2.loop_header: ; preds = %[[VAL_38:.*]], %[[VAL_34]] -// CHECK: %[[VAL_39:.*]] = load i32, ptr %[[VAL_11]], align 4 +// CHECK: %[[VAL_39:.*]] = load i32, ptr{{( addrspace\(5\))?}} %[[VAL_11]], align 4 // CHECK: %[[VAL_40:.*]] = icmp uge i32 %[[VAL_39]], 32 // CHECK: br i1 %[[VAL_40]], label %[[VAL_30]], label %[[VAL_41:.*]] // CHECK: loop2.loop_body: ; preds = %[[VAL_37]] // CHECK: %[[VAL_42:.*]] = add nuw nsw i32 %[[VAL_39]], 32 -// CHECK: store i32 %[[VAL_42]], ptr %[[VAL_11]], align 4 +// CHECK: store i32 %[[VAL_42]], ptr{{( addrspace\(5\))?}} %[[VAL_11]], align 4 // CHECK: %[[VAL_44:.*]] = add i32 %[[VAL_39]], %thread.id.2 // CHECK: %[[VAL_45:.*]] = icmp ult i32 %[[VAL_44]], 32 // CHECK: br i1 %[[VAL_45]], label %[[VAL_46:.*]], label %[[VAL_38]] // CHECK: x_in_tile-after: ; preds = %[[VAL_46]], %[[VAL_41]] -// CHECK: br label %[[VAL_37]], !llvm.loop !5 +// CHECK: br label %[[VAL_37]], !llvm.loop // CHECK: loop2.loop_exit: ; preds = %[[VAL_37]] -// CHECK: br label %[[VAL_29]], !llvm.loop !8 +// CHECK: br label %[[VAL_29]], !llvm.loop // CHECK: loop1.loop_exit: ; preds = %[[VAL_29]] -// CHECK: %[[VAL_47:.*]] = load float, ptr 
%[[VAL_13]], align 4 +// CHECK: %[[VAL_47:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_13]], align 4 // CHECK: %[[VAL_48:.*]] = getelementptr inbounds [32 x [33 x float]], ptr addrspace(3) @shared_cache, i32 0, i32 %thread.id.2, i32 %thread.id.1 // CHECK: %[[VAL_49:.*]] = addrspacecast ptr addrspace(3) %[[VAL_48]] to ptr // CHECK: store float %[[VAL_47]], ptr %[[VAL_49]], align 4 -// CHECK: call void @llvm.nvvm.barrier0() +// CHECK-PTX: call void @llvm.nvvm.barrier0() +// CHECK-GCN: call void @llvm.amdgcn.s.barrier() // CHECK: %[[VAL_50:.*]] = getelementptr inbounds [32 x [33 x float]], ptr addrspace(3) @shared_cache, i32 0, i32 %thread.id.1, i32 %thread.id.2 // CHECK: %[[VAL_51:.*]] = addrspacecast ptr addrspace(3) %[[VAL_50]] to ptr // CHECK: %[[VAL_52:.*]] = load float, ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_53:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_52]], i32 16, i32 31) -// CHECK: store float %[[VAL_53]], ptr %[[VAL_9]], align 4 -// CHECK: call void @[[REDUCTION0:reduction0.*]](ptr %[[VAL_51]], ptr %[[VAL_9]], ptr %[[VAL_8]]) -// CHECK: %[[VAL_54:.*]] = load float, ptr %[[VAL_8]], align 4 +// CHECK-PTX: %[[VAL_53:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_52]], i32 16, i32 31) +// CHECK-GCN: %[[VAL_53_:.*]] = call i32 @__ockl_readuplane_i32 +// CHECK-GCN: %[[VAL_53:.*]] = bitcast i32 +// CHECK: store float %[[VAL_53]], ptr{{( addrspace\(5\))?}} %[[VAL_9]], align 4 +// CHECK-PTX: call void @[[REDUCTION0:reduction0.*]](ptr %[[VAL_51]], ptr %[[VAL_9]], ptr %[[VAL_8]]) +// CHECK: %[[VAL_54:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_8]], align 4 // CHECK: store float %[[VAL_54]], ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_55:.*]] = load float, ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_56:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_55]], i32 8, i32 31) -// CHECK: store float %[[VAL_56]], ptr %[[VAL_7]], align 4 -// CHECK: call void @[[REDUCTION0]](ptr %[[VAL_51]], 
ptr %[[VAL_7]], ptr %[[VAL_6]]) -// CHECK: %[[VAL_57:.*]] = load float, ptr %[[VAL_6]], align 4 -// CHECK: store float %[[VAL_57]], ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_58:.*]] = load float, ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_59:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_58]], i32 4, i32 31) -// CHECK: store float %[[VAL_59]], ptr %[[VAL_5]], align 4 -// CHECK: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_5]], ptr %[[VAL_4]]) -// CHECK: %[[VAL_60:.*]] = load float, ptr %[[VAL_4]], align 4 -// CHECK: store float %[[VAL_60]], ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_61:.*]] = load float, ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_62:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_61]], i32 2, i32 31) -// CHECK: store float %[[VAL_62]], ptr %[[VAL_3]], align 4 -// CHECK: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_3]], ptr %[[VAL_2]]) -// CHECK: %[[VAL_63:.*]] = load float, ptr %[[VAL_2]], align 4 -// CHECK: store float %[[VAL_63]], ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_64:.*]] = load float, ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_65:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_64]], i32 1, i32 31) -// CHECK: store float %[[VAL_65]], ptr %[[VAL_1]], align 4 -// CHECK: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_1]], ptr %[[VAL_0]]) -// CHECK: %[[VAL_66:.*]] = load float, ptr %[[VAL_0]], align 4 -// CHECK: store float %[[VAL_66]], ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_67:.*]] = icmp ult i32 %thread.id.1, 32 -// CHECK: %[[VAL_68:.*]] = icmp ult i32 %thread.id.2, %tile_bound.1 +// CHECK: %[[VAL_55:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK-PTX: %[[VAL_56:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_55]], i32 8, i32 31) +// CHECK-GCN: %[[VAL_56_1_:.*]] = call i32 @__ockl_readuplane_i32 +// CHECK-GCN: %[[VAL_56:.*]] = bitcast i32 +// CHECK: store float %[[VAL_56]], ptr{{( addrspace\(5\))?}} 
%[[VAL_7]], align 4 +// CHECK-PTX: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_7]], ptr %[[VAL_6]]) +// CHECK: %[[VAL_57:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_6]], align 4 +// CHECK: store float %[[VAL_57]], ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK: %[[VAL_58:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK-PTX: %[[VAL_59:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_58]], i32 4, i32 31) +// CHECK-GCN: %[[VAL_59_:.*]] = call i32 @__ockl_readuplane_i32 +// CHECK-GCN: %[[VAL_59:.*]] = bitcast i32 +// CHECK: store float %[[VAL_59]], ptr{{( addrspace\(5\))?}} %[[VAL_5]], align 4 +// CHECK-PTX: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_5]], ptr %[[VAL_4]]) +// CHECK: %[[VAL_60:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_4]], align 4 +// CHECK: store float %[[VAL_60]], ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK: %[[VAL_61:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK-PTX: %[[VAL_62:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_61]], i32 2, i32 31) +// CHECK-GCN: %[[VAL_62_:.*]] = call i32 @__ockl_readuplane_i32 +// CHECK-GCN: %[[VAL_62:.*]] = bitcast i32 +// CHECK: store float %[[VAL_62]], ptr{{( addrspace\(5\))?}} %[[VAL_3]], align 4 +// CHECK-PTX: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_3]], ptr %[[VAL_2]]) +// CHECK: %[[VAL_63:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_2]], align 4 +// CHECK: store float %[[VAL_63]], ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK: %[[VAL_64:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK-PTX: %[[VAL_65:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_64]], i32 1, i32 31) +// CHECK-GCN: %[[VAL_65_:.*]] = call i32 @__ockl_readuplane_i32 +// CHECK-GCN: %[[VAL_65:.*]] = bitcast i32 +// CHECK: store float %[[VAL_65]], ptr{{( addrspace\(5\))?}} %[[VAL_1]], align 4 +// CHECK-PTX: call 
void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_1]], ptr %[[VAL_0]]) +// CHECK: %[[VAL_66:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_0]], align 4 +// CHECK: store float %[[VAL_66]], ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK-PTX: %[[VAL_67:.*]] = icmp ult i32 %thread.id.1, 32 +// CHECK-PTX: %[[VAL_68:.*]] = icmp ult i32 %thread.id.2, %tile_bound.1 +// CHECK-GCN: %[[VAL_68:.*]] = icmp ult i32 %thread.id.2, %tile_bound.1 +// CHECK-GCN: %[[VAL_67:.*]] = icmp ult i32 %thread.id.1, 32 // CHECK: %[[VAL_69:.*]] = and i1 %[[VAL_67]], %[[VAL_68]] // CHECK: %[[VAL_70:.*]] = icmp eq i32 %lane_id, 0 // CHECK: %[[VAL_71:.*]] = and i1 %[[VAL_69]], %[[VAL_70]] @@ -158,11 +171,11 @@ ENTRY kernel_entry { // CHECK: %[[VAL_94:.*]] = mul nuw nsw i32 %[[VAL_73]], 1 // CHECK: %[[VAL_95:.*]] = add nuw nsw i32 0, %[[VAL_94]] // CHECK: %[[VAL_96:.*]] = getelementptr inbounds [12 x [3 x [32 x [16 x [32 x [4 x [3 x [12 x float]]]]]]]], ptr %[[VAL_97:.*]], i32 0, i32 %[[VAL_92]], i32 %[[VAL_91]], i32 %[[VAL_89]], i32 %[[VAL_85]], i32 %[[VAL_84]], i32 %[[VAL_82]], i32 %[[VAL_80]], i32 %[[VAL_78]] -// CHECK: %[[VAL_98:.*]] = load float, ptr %[[VAL_96]], align 4, !invariant.load !9 -// CHECK: store float %[[VAL_98]], ptr %[[VAL_14]], align 4 -// CHECK: call void @[[REDUCTION0]](ptr %[[VAL_13]], ptr %[[VAL_14]], ptr %[[VAL_10]]) -// CHECK: %[[VAL_99:.*]] = load float, ptr %[[VAL_10]], align 4 -// CHECK: store float %[[VAL_99]], ptr %[[VAL_13]], align 4 +// CHECK: %[[VAL_98:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_96]], align 4, !invariant.load +// CHECK: store float %[[VAL_98]], ptr{{( addrspace\(5\))?}} %[[VAL_14]], align 4 +// CHECK-PTX: call void @[[REDUCTION0]](ptr %[[VAL_13]], ptr %[[VAL_14]], ptr %[[VAL_10]]) +// CHECK: %[[VAL_99:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_10]], align 4 +// CHECK: store float %[[VAL_99]], ptr{{( addrspace\(5\))?}} %[[VAL_13]], align 4 // CHECK: br label %[[VAL_38]] // CHECK: reduction_write_output-true: ; preds = 
%[[VAL_33]] // CHECK: %[[VAL_100:.*]] = add i32 %tile_origin.2, %thread.id.1 @@ -180,15 +193,15 @@ ENTRY kernel_entry { // CHECK: %[[VAL_112:.*]] = mul nuw nsw i32 %tile_origin.0, 1 // CHECK: %[[VAL_113:.*]] = add nuw nsw i32 0, %[[VAL_112]] // CHECK: %[[VAL_114:.*]] = getelementptr inbounds [12 x [16 x [4 x [3 x [32 x float]]]]], ptr %[[VAL_115:.*]], i32 0, i32 %[[VAL_103]], i32 %[[VAL_110]], i32 %[[VAL_107]], i32 %[[VAL_105]], i32 %[[VAL_109]] -// CHECK: %[[VAL_116:.*]] = load float, ptr %[[VAL_51]], align 4 -// CHECK: store float %[[VAL_116]], ptr %[[VAL_114]], align 4 +// CHECK: %[[VAL_116:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK: store float %[[VAL_116]], ptr{{( addrspace\(5\))?}} %[[VAL_114]], align 4 // CHECK: br label %[[VAL_19]] // CHECK: entry: // CHECK: %[[VAL_117:.*]] = alloca float, align 4 -// CHECK: %[[VAL_118:.*]] = load float, ptr %[[VAL_119:.*]], align 4 -// CHECK: %[[VAL_120:.*]] = load float, ptr %[[VAL_121:.*]], align 4 +// CHECK: %[[VAL_118:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_119:.*]], align 4 +// CHECK: %[[VAL_120:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_121:.*]], align 4 // CHECK: %[[VAL_122:.*]] = fadd float %[[VAL_118]], %[[VAL_120]] -// CHECK: store float %[[VAL_122]], ptr %[[VAL_117]], align 4 -// CHECK: %[[VAL_123:.*]] = load float, ptr %[[VAL_117]], align 4 -// CHECK: store float %[[VAL_123]], ptr %[[VAL_124:.*]], align 4 +// CHECK: store float %[[VAL_122]], ptr{{( addrspace\(5\))?}} %[[VAL_117]], align 4 +// CHECK: %[[VAL_123:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_117]], align 4 +// CHECK: store float %[[VAL_123]], ptr{{( addrspace\(5\))?}} %[[VAL_124:.*]], align 4 // CHECK: ret void diff --git a/xla/service/gpu/tests/reduce_large_row_to_scalar.hlo b/xla/service/gpu/tests/reduce_large_row_to_scalar.hlo index 736e583ad4c3c..35ba85befe94a 100644 --- a/xla/service/gpu/tests/reduce_large_row_to_scalar.hlo +++ b/xla/service/gpu/tests/reduce_large_row_to_scalar.hlo 
@@ -43,13 +43,13 @@ ENTRY reduce.1 { // CHECK: %[[VAL_20:.*]] = alloca %[[VAL_1]], align 8 // CHECK: %[[VAL_21:.*]] = alloca %[[VAL_1]], align 8 // CHECK: %[[VAL_22:.*]] = alloca %[[VAL_1]], align 8 -// CHECK: %[[VAL_23:.*]] = alloca i32, align 4 -// CHECK: %[[VAL_24:.*]] = alloca i32, align 4 -// CHECK: %[[VAL_25:.*]] = alloca %[[VAL_1]], align 8 -// CHECK: %[[VAL_26:.*]] = alloca i32, align 4 -// CHECK: %[[VAL_27:.*]] = alloca i32, align 4 -// CHECK: %[[VAL_28:.*]] = alloca %[[VAL_1]], align 8 -// CHECK: %[[VAL_29:.*]] = alloca %[[VAL_1]], align 8 +// CHECK-PTX: %[[VAL_23:.*]] = alloca i32, align 4 +// CHECK-PTX: %[[VAL_24:.*]] = alloca i32, align 4 +// CHECK-DAG: %[[VAL_25:.*]] = alloca %[[VAL_1]], align 8 +// CHECK-DAG: %[[VAL_26:.*]] = alloca i32, align 4 +// CHECK-DAG: %[[VAL_27:.*]] = alloca i32, align 4 +// CHECK-DAG: %[[VAL_28:.*]] = alloca %[[VAL_1]], align 8 +// CHECK-DAG: %[[VAL_29:.*]] = alloca %[[VAL_1]], align 8 // CHECK-PTX: %[[VAL_30:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !range !2 // CHECK-GCN: %[[VAL_30:.*]] = call i32 @llvm.amdgcn.workgroup.id.y // CHECK: %[[VAL_31:.*]] = icmp eq i32 %[[VAL_30]], 0 @@ -57,8 +57,8 @@ ENTRY reduce.1 { // CHECK: reduce-group-0-after: ; preds = %thread_in_bounds-after, %[[VAL_34:.*]] // CHECK: ret void // CHECK: reduce-group-0-true: ; preds = %[[VAL_34]] -// CHECK: %[[VAL_35:.*]] = load %[[VAL_1]], ptr %[[VAL_36:.*]], align 1, !invariant.load !3 -// CHECK: store %[[VAL_1]] %[[VAL_35]], ptr %[[VAL_28]], align 1 +// CHECK: %[[VAL_35:.*]] = load %[[VAL_1]], ptr %[[VAL_36:.*]], align 1, !invariant.load !{{[0-9]}} +// CHECK: store %[[VAL_1]] %[[VAL_35]], ptr{{.*}} %[[VAL_28]], align 1 // CHECK-PTX: %thread.id.x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !4 // CHECK-GCN: %thread.id.x = call i32 @llvm.amdgcn.workitem.id.x // CHECK-PTX: %block.id.x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !2 @@ -69,221 +69,313 @@ ENTRY reduce.1 { // CHECK: %[[VAL_38:.*]] = urem i32 %[[VAL_37]], 1 // CHECK: 
%[[VAL_39:.*]] = udiv i32 %block.id.x, 1 // CHECK: %[[VAL_40:.*]] = urem i32 %[[VAL_39]], 1 -// CHECK: %[[VAL_41:.*]] = udiv i32 %block.id.x, 1 -// CHECK: %[[VAL_42:.*]] = urem i32 %[[VAL_41]], 1 +// CHECK-PTX: %[[VAL_41:.*]] = udiv i32 %block.id.x, 1 +// CHECK-PTX: %[[VAL_42:.*]] = urem i32 %[[VAL_41]], 1 // CHECK: %[[VAL_43:.*]] = udiv i32 %block.id.x, 1 -// CHECK: %[[VAL_44:.*]] = icmp eq i32 %[[VAL_40]], 0 -// CHECK: %tile_bound.2 = select i1 %[[VAL_44]], i32 5000, i32 5120 +// CHECK-PTX: %[[VAL_44:.*]] = icmp eq i32 %[[VAL_40]], 0 +// CHECK-GCN: %[[VAL_44:.*]] = icmp eq i32 %[[VAL_38]], 0 +// CHECK-PTX: %tile_bound.2 = select i1 %[[VAL_44]], i32 5000, i32 5120 +// CHECK-GCN: %tile_bound.2 = select i1 %[[VAL_44]], i32 10000, i32 10240 // CHECK: %tile_origin.0 = mul i32 %[[VAL_43]], 1 -// CHECK: %tile_origin.1 = mul i32 %[[VAL_42]], 1 -// CHECK: %tile_origin.2 = mul i32 %[[VAL_40]], 5120 -// CHECK: %tile_origin.3 = mul i32 %[[VAL_38]], 2 -// CHECK: %[[VAL_45:.*]] = icmp eq i32 5120, %tile_bound.2 +// CHECK-PTX: %tile_origin.1 = mul i32 %[[VAL_42]], 1 +// CHECK-GCN: %tile_origin.1 = mul i32 %[[VAL_40]], 1 +// CHECK-PTX: %tile_origin.2 = mul i32 %[[VAL_40]], 5120 +// CHECK-GCN: %tile_origin.2 = mul i32 %[[VAL_38]], 10240 +// CHECK-PTX: %tile_origin.3 = mul i32 %[[VAL_38]], 2 +// CHECK-PTX: %[[VAL_45:.*]] = icmp eq i32 5120, %tile_bound.2 +// CHECK-GCN: %[[VAL_45:.*]] = icmp eq i32 10240, %tile_bound.2 // CHECK: br i1 %[[VAL_45]], label %[[VAL_46:.*]], label %[[VAL_47:.*]] // CHECK: is_full_tile-after: ; preds = %[[VAL_48:.*]], %[[VAL_49:.*]] -// CHECK: %[[VAL_50:.*]] = load i128, ptr %[[VAL_28]], align 16 +// CHECK: %[[VAL_50:.*]] = load i128, ptr{{.*}} %[[VAL_28]], align {{(16|8)}} // CHECK: %[[VAL_51:.*]] = bitcast i128 %[[VAL_50]] to <4 x i32> // CHECK: %[[VAL_52:.*]] = extractelement <4 x i32> %[[VAL_51]], i64 0 -// CHECK: %[[VAL_53:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_52]], i32 16, i32 31) +// CHECK-PTX: %[[VAL_53:.*]] = call i32 
@llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_52]], i32 16, i32 31) +// CHECK-GCN: %[[VAL_53:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_52]], i32 16) // CHECK: %[[VAL_54:.*]] = insertelement <4 x i32> %[[VAL_51]], i32 %[[VAL_53]], i64 0 // CHECK: %[[VAL_55:.*]] = extractelement <4 x i32> %[[VAL_54]], i64 1 -// CHECK: %[[VAL_56:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_55]], i32 16, i32 31) +// CHECK-PTX: %[[VAL_56:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_55]], i32 16, i32 31) +// CHECK-GCN: %[[VAL_56:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_55]], i32 16) // CHECK: %[[VAL_57:.*]] = insertelement <4 x i32> %[[VAL_54]], i32 %[[VAL_56]], i64 1 // CHECK: %[[VAL_58:.*]] = extractelement <4 x i32> %[[VAL_57]], i64 2 -// CHECK: %[[VAL_59:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_58]], i32 16, i32 31) +// CHECK-PTX: %[[VAL_59:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_58]], i32 16, i32 31) +// CHECK-GCN: %[[VAL_59:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_58]], i32 16) // CHECK: %[[VAL_60:.*]] = insertelement <4 x i32> %[[VAL_57]], i32 %[[VAL_59]], i64 2 // CHECK: %[[VAL_61:.*]] = extractelement <4 x i32> %[[VAL_60]], i64 3 -// CHECK: %[[VAL_62:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_61]], i32 16, i32 31) +// CHECK-PTX: %[[VAL_62:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_61]], i32 16, i32 31) +// CHECK-GCN: %[[VAL_62:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_61]], i32 16) // CHECK: %[[VAL_63:.*]] = insertelement <4 x i32> %[[VAL_60]], i32 %[[VAL_62]], i64 3 // CHECK: %[[VAL_64:.*]] = bitcast <4 x i32> %[[VAL_63]] to i128 -// CHECK: store i128 %[[VAL_64]], ptr %[[VAL_21]], align 16 -// CHECK: call void @[[SUM:Sum.*]](ptr %[[VAL_28]], ptr %[[VAL_21]], ptr %[[VAL_20]]) -// CHECK: %[[VAL_65:.*]] = load %[[VAL_1]], ptr %[[VAL_20]], align 1 -// CHECK: store %[[VAL_1]] %[[VAL_65]], ptr %[[VAL_28]], align 1 
-// CHECK: %[[VAL_66:.*]] = load i128, ptr %[[VAL_28]], align 16 +// CHECK: store i128 %[[VAL_64]], ptr{{.*}} %[[VAL_21]], align {{(16|8)}} +// CHECK-GCN: %[[VAL_65_1:.*]] = addrspacecast ptr{{.*}} %[[VAL_28]] to ptr +// CHECK-GCN: %[[VAL_65_2:.*]] = addrspacecast ptr{{.*}} %[[VAL_21]] to ptr +// CHECK-GCN: %[[VAL_65_3:.*]] = addrspacecast ptr{{.*}} %[[VAL_20]] to ptr +// CHECK-GCN: call void @[[SUM:Sum.*]](ptr %[[VAL_65_1]], ptr %[[VAL_65_2]], ptr %[[VAL_65_3]]) +// CHECK-PTX: call void @[[SUM:Sum.*]](ptr %[[VAL_28]], ptr %[[VAL_21]], ptr %[[VAL_20]]) +// CHECK: %[[VAL_65:.*]] = load %[[VAL_1]], ptr{{.*}} %[[VAL_20]], align 1 +// CHECK: store %[[VAL_1]] %[[VAL_65]], ptr{{.*}} %[[VAL_28]], align 1 +// CHECK: %[[VAL_66:.*]] = load i128, ptr{{.*}} %[[VAL_28]], align {{(16|8)}} // CHECK: %[[VAL_67:.*]] = bitcast i128 %[[VAL_66]] to <4 x i32> // CHECK: %[[VAL_68:.*]] = extractelement <4 x i32> %[[VAL_67]], i64 0 -// CHECK: %[[VAL_69:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_68]], i32 8, i32 31) +// CHECK-PTX: %[[VAL_69:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_68]], i32 8, i32 31) +// CHECK-GCN: %[[VAL_69:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_68]], i32 8) // CHECK: %[[VAL_70:.*]] = insertelement <4 x i32> %[[VAL_67]], i32 %[[VAL_69]], i64 0 // CHECK: %[[VAL_71:.*]] = extractelement <4 x i32> %[[VAL_70]], i64 1 -// CHECK: %[[VAL_72:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_71]], i32 8, i32 31) +// CHECK-PTX: %[[VAL_72:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_71]], i32 8, i32 31) +// CHECK-GCN: %[[VAL_72:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_71]], i32 8) // CHECK: %[[VAL_73:.*]] = insertelement <4 x i32> %[[VAL_70]], i32 %[[VAL_72]], i64 1 // CHECK: %[[VAL_74:.*]] = extractelement <4 x i32> %[[VAL_73]], i64 2 -// CHECK: %[[VAL_75:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_74]], i32 8, i32 31) +// CHECK-PTX: %[[VAL_75:.*]] = call i32 
@llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_74]], i32 8, i32 31) +// CHECK-GCN: %[[VAL_75:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_74]], i32 8) // CHECK: %[[VAL_76:.*]] = insertelement <4 x i32> %[[VAL_73]], i32 %[[VAL_75]], i64 2 // CHECK: %[[VAL_77:.*]] = extractelement <4 x i32> %[[VAL_76]], i64 3 -// CHECK: %[[VAL_78:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_77]], i32 8, i32 31) +// CHECK-PTX: %[[VAL_78:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_77]], i32 8, i32 31) +// CHECK-GCN: %[[VAL_78:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_77]], i32 8) // CHECK: %[[VAL_79:.*]] = insertelement <4 x i32> %[[VAL_76]], i32 %[[VAL_78]], i64 3 // CHECK: %[[VAL_80:.*]] = bitcast <4 x i32> %[[VAL_79]] to i128 -// CHECK: store i128 %[[VAL_80]], ptr %[[VAL_19]], align 16 -// CHECK: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_19]], ptr %[[VAL_18]]) -// CHECK: %[[VAL_81:.*]] = load %[[VAL_1]], ptr %[[VAL_18]], align 1 -// CHECK: store %[[VAL_1]] %[[VAL_81]], ptr %[[VAL_28]], align 1 -// CHECK: %[[VAL_82:.*]] = load i128, ptr %[[VAL_28]], align 16 +// CHECK: store i128 %[[VAL_80]], ptr{{.*}} %[[VAL_19]], align {{(16|8)}} +// CHECK-GCN: %[[VAL_81_1:.*]] = addrspacecast ptr{{.*}} %[[VAL_28]] to ptr +// CHECK-GCN: %[[VAL_81_2:.*]] = addrspacecast ptr{{.*}} %[[VAL_19]] to ptr +// CHECK-GCN: %[[VAL_81_3:.*]] = addrspacecast ptr{{.*}} %[[VAL_18]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_81_1]], ptr %[[VAL_81_2]], ptr %[[VAL_81_3]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_19]], ptr %[[VAL_18]]) +// CHECK: %[[VAL_81:.*]] = load %[[VAL_1]], ptr{{.*}} %[[VAL_18]], align 1 +// CHECK: store %[[VAL_1]] %[[VAL_81]], ptr{{.*}} %[[VAL_28]], align 1 +// CHECK: %[[VAL_82:.*]] = load i128, ptr{{.*}} %[[VAL_28]], align {{(16|8)}} // CHECK: %[[VAL_83:.*]] = bitcast i128 %[[VAL_82]] to <4 x i32> // CHECK: %[[VAL_84:.*]] = extractelement <4 x i32> %[[VAL_83]], i64 0 -// CHECK: %[[VAL_85:.*]] = call i32 
@llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_84]], i32 4, i32 31) +// CHECK-PTX: %[[VAL_85:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_84]], i32 4, i32 31) +// CHECK-GCN: %[[VAL_85:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_84]], i32 4) // CHECK: %[[VAL_86:.*]] = insertelement <4 x i32> %[[VAL_83]], i32 %[[VAL_85]], i64 0 // CHECK: %[[VAL_87:.*]] = extractelement <4 x i32> %[[VAL_86]], i64 1 -// CHECK: %[[VAL_88:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_87]], i32 4, i32 31) +// CHECK-PTX: %[[VAL_88:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_87]], i32 4, i32 31) +// CHECK-GCN: %[[VAL_88:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_87]], i32 4) // CHECK: %[[VAL_89:.*]] = insertelement <4 x i32> %[[VAL_86]], i32 %[[VAL_88]], i64 1 // CHECK: %[[VAL_90:.*]] = extractelement <4 x i32> %[[VAL_89]], i64 2 -// CHECK: %[[VAL_91:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_90]], i32 4, i32 31) +// CHECK-PTX: %[[VAL_91:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_90]], i32 4, i32 31) +// CHECK-GCN: %[[VAL_91:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_90]], i32 4) // CHECK: %[[VAL_92:.*]] = insertelement <4 x i32> %[[VAL_89]], i32 %[[VAL_91]], i64 2 // CHECK: %[[VAL_93:.*]] = extractelement <4 x i32> %[[VAL_92]], i64 3 -// CHECK: %[[VAL_94:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_93]], i32 4, i32 31) +// CHECK-PTX: %[[VAL_94:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_93]], i32 4, i32 31) +// CHECK-GCN: %[[VAL_94:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_93]], i32 4) // CHECK: %[[VAL_95:.*]] = insertelement <4 x i32> %[[VAL_92]], i32 %[[VAL_94]], i64 3 // CHECK: %[[VAL_96:.*]] = bitcast <4 x i32> %[[VAL_95]] to i128 -// CHECK: store i128 %[[VAL_96]], ptr %[[VAL_17]], align 16 -// CHECK: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_17]], ptr %[[VAL_16]]) -// CHECK: %[[VAL_97:.*]] = load %[[VAL_1]], 
ptr %[[VAL_16]], align 1 -// CHECK: store %[[VAL_1]] %[[VAL_97]], ptr %[[VAL_28]], align 1 -// CHECK: %[[VAL_98:.*]] = load i128, ptr %[[VAL_28]], align 16 +// CHECK: store i128 %[[VAL_96]], ptr{{.*}} %[[VAL_17]], align {{(16|8)}} +// CHECK-GCN: %[[VAL_98_1:.*]] = addrspacecast ptr{{.*}} %[[VAL_28]] to ptr +// CHECK-GCN: %[[VAL_98_2:.*]] = addrspacecast ptr{{.*}} %[[VAL_17]] to ptr +// CHECK-GCN: %[[VAL_98_3:.*]] = addrspacecast ptr{{.*}} %[[VAL_16]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_98_1]], ptr %[[VAL_98_2]], ptr %[[VAL_98_3]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_17]], ptr %[[VAL_16]]) +// CHECK: %[[VAL_97:.*]] = load %[[VAL_1]], ptr{{.*}} %[[VAL_16]], align 1 +// CHECK: store %[[VAL_1]] %[[VAL_97]], ptr{{.*}} %[[VAL_28]], align 1 +// CHECK: %[[VAL_98:.*]] = load i128, ptr{{.*}} %[[VAL_28]], align {{(16|8)}} // CHECK: %[[VAL_99:.*]] = bitcast i128 %[[VAL_98]] to <4 x i32> // CHECK: %[[VAL_100:.*]] = extractelement <4 x i32> %[[VAL_99]], i64 0 -// CHECK: %[[VAL_101:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_100]], i32 2, i32 31) +// CHECK-PTX: %[[VAL_101:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_100]], i32 2, i32 31) +// CHECK-GCN: %[[VAL_101:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_100]], i32 2) // CHECK: %[[VAL_102:.*]] = insertelement <4 x i32> %[[VAL_99]], i32 %[[VAL_101]], i64 0 // CHECK: %[[VAL_103:.*]] = extractelement <4 x i32> %[[VAL_102]], i64 1 -// CHECK: %[[VAL_104:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_103]], i32 2, i32 31) +// CHECK-PTX: %[[VAL_104:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_103]], i32 2, i32 31) +// CHECK-GCN: %[[VAL_104:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_103]], i32 2) // CHECK: %[[VAL_105:.*]] = insertelement <4 x i32> %[[VAL_102]], i32 %[[VAL_104]], i64 1 // CHECK: %[[VAL_106:.*]] = extractelement <4 x i32> %[[VAL_105]], i64 2 -// CHECK: %[[VAL_107:.*]] = call i32 
@llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_106]], i32 2, i32 31) +// CHECK-PTX: %[[VAL_107:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_106]], i32 2, i32 31) +// CHECK-GCN: %[[VAL_107:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_106]], i32 2) // CHECK: %[[VAL_108:.*]] = insertelement <4 x i32> %[[VAL_105]], i32 %[[VAL_107]], i64 2 // CHECK: %[[VAL_109:.*]] = extractelement <4 x i32> %[[VAL_108]], i64 3 -// CHECK: %[[VAL_110:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_109]], i32 2, i32 31) +// CHECK-PTX: %[[VAL_110:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_109]], i32 2, i32 31) +// CHECK-GCN: %[[VAL_110:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_109]], i32 2) // CHECK: %[[VAL_111:.*]] = insertelement <4 x i32> %[[VAL_108]], i32 %[[VAL_110]], i64 3 // CHECK: %[[VAL_112:.*]] = bitcast <4 x i32> %[[VAL_111]] to i128 -// CHECK: store i128 %[[VAL_112]], ptr %[[VAL_15]], align 16 -// CHECK: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_15]], ptr %[[VAL_14]]) -// CHECK: %[[VAL_113:.*]] = load %[[VAL_1]], ptr %[[VAL_14]], align 1 -// CHECK: store %[[VAL_1]] %[[VAL_113]], ptr %[[VAL_28]], align 1 -// CHECK: %[[VAL_114:.*]] = load i128, ptr %[[VAL_28]], align 16 +// CHECK: store i128 %[[VAL_112]], ptr{{.*}} %[[VAL_15]], align {{(16|8)}} +// CHECK-GCN: %[[VAL_113_1:.*]] = addrspacecast ptr{{.*}} %[[VAL_28]] to ptr +// CHECK-GCN: %[[VAL_113_2:.*]] = addrspacecast ptr{{.*}} %[[VAL_15]] to ptr +// CHECK-GCN: %[[VAL_113_3:.*]] = addrspacecast ptr{{.*}} %[[VAL_14]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_113_1]], ptr %[[VAL_113_2]], ptr %[[VAL_113_3]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_15]], ptr %[[VAL_14]]) +// CHECK: %[[VAL_113:.*]] = load %[[VAL_1]], ptr{{.*}} %[[VAL_14]], align 1 +// CHECK: store %[[VAL_1]] %[[VAL_113]], ptr{{.*}} %[[VAL_28]], align 1 +// CHECK: %[[VAL_114:.*]] = load i128, ptr{{.*}} %[[VAL_28]], align {{(16|8)}} // CHECK: %[[VAL_115:.*]] = 
bitcast i128 %[[VAL_114]] to <4 x i32> // CHECK: %[[VAL_116:.*]] = extractelement <4 x i32> %[[VAL_115]], i64 0 -// CHECK: %[[VAL_117:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_116]], i32 1, i32 31) +// CHECK-PTX: %[[VAL_117:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_116]], i32 1, i32 31) +// CHECK-GCN: %[[VAL_117:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_116]], i32 1) // CHECK: %[[VAL_118:.*]] = insertelement <4 x i32> %[[VAL_115]], i32 %[[VAL_117]], i64 0 // CHECK: %[[VAL_119:.*]] = extractelement <4 x i32> %[[VAL_118]], i64 1 -// CHECK: %[[VAL_120:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_119]], i32 1, i32 31) +// CHECK-PTX: %[[VAL_120:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_119]], i32 1, i32 31) +// CHECK-GCN: %[[VAL_120:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_119]], i32 1) // CHECK: %[[VAL_121:.*]] = insertelement <4 x i32> %[[VAL_118]], i32 %[[VAL_120]], i64 1 // CHECK: %[[VAL_122:.*]] = extractelement <4 x i32> %[[VAL_121]], i64 2 -// CHECK: %[[VAL_123:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_122]], i32 1, i32 31) +// CHECK-PTX: %[[VAL_123:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_122]], i32 1, i32 31) +// CHECK-GCN: %[[VAL_123:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_122]], i32 1) // CHECK: %[[VAL_124:.*]] = insertelement <4 x i32> %[[VAL_121]], i32 %[[VAL_123]], i64 2 // CHECK: %[[VAL_125:.*]] = extractelement <4 x i32> %[[VAL_124]], i64 3 -// CHECK: %[[VAL_126:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_125]], i32 1, i32 31) +// CHECK-PTX: %[[VAL_126:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_125]], i32 1, i32 31) +// CHECK-GCN: %[[VAL_126:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_125]], i32 1) // CHECK: %[[VAL_127:.*]] = insertelement <4 x i32> %[[VAL_124]], i32 %[[VAL_126]], i64 3 // CHECK: %[[VAL_128:.*]] = bitcast <4 x i32> %[[VAL_127]] 
to i128 -// CHECK: store i128 %[[VAL_128]], ptr %[[VAL_13]], align 16 -// CHECK: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_13]], ptr %[[VAL_12]]) -// CHECK: %[[VAL_129:.*]] = load %[[VAL_1]], ptr %[[VAL_12]], align 1 -// CHECK: store %[[VAL_1]] %[[VAL_129]], ptr %[[VAL_28]], align 1 +// CHECK: store i128 %[[VAL_128]], ptr{{.*}} %[[VAL_13]], align {{(16|8)}} +// CHECK-GCN: %[[VAL_129_1:.*]] = addrspacecast ptr{{.*}} %[[VAL_28]] to ptr +// CHECK-GCN: %[[VAL_129_2:.*]] = addrspacecast ptr{{.*}} %[[VAL_13]] to ptr +// CHECK-GCN: %[[VAL_129_3:.*]] = addrspacecast ptr{{.*}} %[[VAL_12]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_129_1]], ptr %[[VAL_129_2]], ptr %[[VAL_129_3]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_13]], ptr %[[VAL_12]]) +// CHECK: %[[VAL_129:.*]] = load %[[VAL_1]], ptr{{.*}} %[[VAL_12]], align 1 +// CHECK: store %[[VAL_1]] %[[VAL_129]], ptr{{.*}} %[[VAL_28]], align 1 // CHECK: %[[VAL_130:.*]] = udiv i32 %thread.id.2, 32 // CHECK: br i1 true, label %thread_in_bounds-true, label %thread_in_bounds-after + // CHECK: thread_in_bounds-after: ; preds = %[[VAL_131:.*]], %[[VAL_132:.*]] // CHECK: br label %[[VAL_33]] + // CHECK: is_full_tile-true: ; preds = %[[VAL_32]] -// CHECK: store i32 0, ptr %[[VAL_27]], align 4 +// CHECK: store i32 0, ptr{{.*}} %[[VAL_27]], align 4 // CHECK: br label %[[VAL_133:.*]] + // CHECK: loop2.loop_header: ; preds = %[[VAL_134:.*]], %[[VAL_46]] -// CHECK: %[[VAL_135:.*]] = load i32, ptr %[[VAL_27]], align 4 -// CHECK: %[[VAL_136:.*]] = icmp uge i32 %[[VAL_135]], 5120 +// CHECK: %[[VAL_135:.*]] = load i32, ptr{{.*}} %[[VAL_27]], align 4 +// CHECK-PTX: %[[VAL_136:.*]] = icmp uge i32 %[[VAL_135]], 5120 +// CHECK-GCN: %[[VAL_136:.*]] = icmp uge i32 %[[VAL_135]], 10240 // CHECK: br i1 %[[VAL_136]], label %[[VAL_49]], label %[[VAL_137:.*]] + // CHECK: loop2.loop_body: ; preds = %[[VAL_133]] // CHECK: %[[VAL_138:.*]] = add nuw nsw i32 %[[VAL_135]], 640 -// CHECK: store i32 %[[VAL_138]], ptr %[[VAL_27]], 
align 4 +// CHECK: store i32 %[[VAL_138]], ptr{{.*}} %[[VAL_27]], align 4 // CHECK: %[[VAL_140:.*]] = add i32 %[[VAL_135]], %thread.id.2 -// CHECK: store i32 0, ptr %[[VAL_26]], align 4 +// CHECK-GCN: %[[VAL_147:.*]] = add i32 %tile_origin.0, 0 +// CHECK-GCN: %[[VAL_148:.*]] = add i32 %tile_origin.1, 0 +// CHECK-GCN: %[[VAL_149:.*]] = add i32 %tile_origin.2, %[[VAL_140]] +// CHECK-GCN: %[[VAL_160:.*]] = getelementptr inbounds [10000 x %[[VAL_1]]], ptr %[[VAL_161:.*]], i32 0, i32 %[[VAL_149]] +// CHECK-GCN: %[[VAL_162:.*]] = load %[[VAL_1]], ptr %[[VAL_160]], align 1, !invariant.load !2 +// CHECK-GCN: store %[[VAL_1]] %[[VAL_162]], ptr{{.*}} %[[VAL_29]], align 1 +// CHECK-GCN: %[[VAL_163_1:.*]] = addrspacecast ptr{{.*}} %[[VAL_28]] to ptr +// CHECK-GCN: %[[VAL_163_2:.*]] = addrspacecast ptr{{.*}} %[[VAL_29]] to ptr +// CHECK-GCN: %[[VAL_163_3:.*]] = addrspacecast ptr{{.*}} %[[VAL_25]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_163_1]], ptr %[[VAL_163_2]], ptr %[[VAL_163_3]]) +// CHECK-GCN: %[[VAL_163:.*]] = load %[[VAL_1]], ptr{{.*}} %[[VAL_25]], align 1 +// CHECK-GCN: store %[[VAL_1]] %[[VAL_163]], ptr{{.*}} %[[VAL_28]], align 1 +// CHECK-PTX: store i32 0, ptr %[[VAL_26]], align 4 // CHECK: br label %[[VAL_141:.*]] -// CHECK: loop3.loop_header: ; preds = %[[VAL_142:.*]], %[[VAL_137]] -// CHECK: %[[VAL_143:.*]] = load i32, ptr %[[VAL_26]], align 4 -// CHECK: %[[VAL_144:.*]] = icmp uge i32 %[[VAL_143]], 2 -// CHECK: br i1 %[[VAL_144]], label %[[VAL_134]], label %[[VAL_142]] -// CHECK: loop3.loop_body: ; preds = %[[VAL_141]] -// CHECK: %[[VAL_145:.*]] = add nuw nsw i32 %[[VAL_143]], 1 -// CHECK: store i32 %[[VAL_145]], ptr %[[VAL_26]], align 4 -// CHECK: %[[VAL_147:.*]] = add i32 %tile_origin.0, 0 -// CHECK: %[[VAL_148:.*]] = add i32 %tile_origin.1, 0 -// CHECK: %[[VAL_149:.*]] = add i32 %tile_origin.2, %[[VAL_140]] -// CHECK: %[[VAL_150:.*]] = add i32 %tile_origin.3, %[[VAL_143]] -// CHECK: %[[VAL_151:.*]] = mul nuw nsw i32 %[[VAL_150]], 1 -// CHECK: 
%[[VAL_152:.*]] = add nuw nsw i32 0, %[[VAL_151]] -// CHECK: %[[VAL_153:.*]] = mul nuw nsw i32 %[[VAL_149]], 2 -// CHECK: %[[VAL_154:.*]] = add nuw nsw i32 %[[VAL_152]], %[[VAL_153]] -// CHECK: %[[VAL_155:.*]] = udiv i32 %[[VAL_154]], 10000 -// CHECK: %[[VAL_156:.*]] = mul nuw nsw i32 %[[VAL_148]], 1 -// CHECK: %[[VAL_157:.*]] = add nuw nsw i32 0, %[[VAL_156]] -// CHECK: %[[VAL_158:.*]] = mul nuw nsw i32 %[[VAL_147]], 1 -// CHECK: %[[VAL_159:.*]] = add nuw nsw i32 0, %[[VAL_158]] -// CHECK: %[[VAL_160:.*]] = getelementptr inbounds [10000 x %[[VAL_1]]], ptr %[[VAL_161:.*]], i32 0, i32 %[[VAL_154]] -// CHECK: %[[VAL_162:.*]] = load %[[VAL_1]], ptr %[[VAL_160]], align 1, !invariant.load !3 -// CHECK: store %[[VAL_1]] %[[VAL_162]], ptr %[[VAL_29]], align 1 -// CHECK: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_29]], ptr %[[VAL_25]]) -// CHECK: %[[VAL_163:.*]] = load %[[VAL_1]], ptr %[[VAL_25]], align 1 -// CHECK: store %[[VAL_1]] %[[VAL_163]], ptr %[[VAL_28]], align 1 -// CHECK: br label %[[VAL_141]], !llvm.loop !5 -// CHECK: loop3.loop_exit: ; preds = %[[VAL_141]] -// CHECK: br label %[[VAL_133]], !llvm.loop !7 + +// CHECK-PTX: loop3.loop_header: ; preds = %[[VAL_142:.*]], %[[VAL_137]] +// CHECK-PTX: %[[VAL_143:.*]] = load i32, ptr %[[VAL_26]], align 4 +// CHECK-PTX: %[[VAL_144:.*]] = icmp uge i32 %[[VAL_143]], 2 +// CHECK-PTX: br i1 %[[VAL_144]], label %[[VAL_134]], label %[[VAL_142]] + +// CHECK-PTX: loop3.loop_body: ; preds = %[[VAL_141]] +// CHECK-PTX: %[[VAL_145:.*]] = add nuw nsw i32 %[[VAL_143]], 1 +// CHECK-PTX: store i32 %[[VAL_145]], ptr %[[VAL_26]], align 4 +// CHECK-PTX: %[[VAL_147:.*]] = add i32 %tile_origin.0, 0 +// CHECK-PTX: %[[VAL_148:.*]] = add i32 %tile_origin.1, 0 +// CHECK-PTX: %[[VAL_149:.*]] = add i32 %tile_origin.2, %[[VAL_140]] +// CHECK-PTX: %[[VAL_150:.*]] = add i32 %tile_origin.3, %[[VAL_143]] +// CHECK-PTX: %[[VAL_151:.*]] = mul nuw nsw i32 %[[VAL_150]], 1 +// CHECK-PTX: %[[VAL_152:.*]] = add nuw nsw i32 0, %[[VAL_151]] +// CHECK-PTX: 
%[[VAL_153:.*]] = mul nuw nsw i32 %[[VAL_149]], 2 +// CHECK-PTX: %[[VAL_154:.*]] = add nuw nsw i32 %[[VAL_152]], %[[VAL_153]] +// CHECK-PTX: %[[VAL_155:.*]] = udiv i32 %[[VAL_154]], 10000 +// CHECK-PTX: %[[VAL_156:.*]] = mul nuw nsw i32 %[[VAL_148]], 1 +// CHECK-PTX: %[[VAL_157:.*]] = add nuw nsw i32 0, %[[VAL_156]] +// CHECK-PTX: %[[VAL_158:.*]] = mul nuw nsw i32 %[[VAL_147]], 1 +// CHECK-PTX: %[[VAL_159:.*]] = add nuw nsw i32 0, %[[VAL_158]] +// CHECK-PTX: %[[VAL_160:.*]] = getelementptr inbounds [10000 x %[[VAL_1]]], ptr %[[VAL_161:.*]], i32 0, i32 %[[VAL_154]] +// CHECK-PTX: %[[VAL_162:.*]] = load %[[VAL_1]], ptr %[[VAL_160]], align 1, !invariant.load !3 +// CHECK-PTX: store %[[VAL_1]] %[[VAL_162]], ptr %[[VAL_29]], align 1 +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_29]], ptr %[[VAL_25]]) +// CHECK-PTX: %[[VAL_163:.*]] = load %[[VAL_1]], ptr %[[VAL_25]], align 1 +// CHECK-PTX: store %[[VAL_1]] %[[VAL_163]], ptr %[[VAL_28]], align 1 +// CHECK-PTX: br label %[[VAL_141]], !llvm.loop !5 + +// CHECK-PTX: loop3.loop_exit: ; preds = %[[VAL_141]] +// CHECK-PTX: br label %[[VAL_133]], !llvm.loop !7 + // CHECK: loop2.loop_exit: ; preds = %[[VAL_133]] // CHECK: br label %[[VAL_132]] + // CHECK: is_full_tile-false: ; preds = %[[VAL_32]] -// CHECK: store i32 0, ptr %[[VAL_24]], align 4 +// CHECK-PTX: store i32 0, ptr %[[VAL_24]], align 4 +// CHECK-GCN: store i32 0, ptr{{.*}} %[[VAL_26]], align 4 // CHECK: br label %[[VAL_164:.*]] -// CHECK: loop2.loop_header4: ; preds = %[[VAL_165:.*]], %[[VAL_47]] -// CHECK: %[[VAL_166:.*]] = load i32, ptr %[[VAL_24]], align 4 -// CHECK: %[[VAL_167:.*]] = icmp uge i32 %[[VAL_166]], 5120 + +// CHECK: loop2.loop_header{{4|3}}: ; preds = %[[VAL_165:.*]], %[[VAL_47]] +// CHECK-PTX: %[[VAL_166:.*]] = load i32, ptr %[[VAL_24]], align 4 +// CHECK-PTX: %[[VAL_167:.*]] = icmp uge i32 %[[VAL_166]], 5120 +// CHECK-GCN: %[[VAL_166:.*]] = load i32, ptr{{.*}} %[[VAL_26]], align 4 +// CHECK-GCN: %[[VAL_167:.*]] = icmp uge i32 
%[[VAL_166]], 10240 // CHECK: br i1 %[[VAL_167]], label %[[VAL_48]], label %[[VAL_168:.*]] -// CHECK: loop2.loop_body5: ; preds = %[[VAL_164]] + +// CHECK: loop2.loop_body{{5|4}}: ; preds = %[[VAL_164]] // CHECK: %[[VAL_169:.*]] = add nuw nsw i32 %[[VAL_166]], 640 -// CHECK: store i32 %[[VAL_169]], ptr %[[VAL_24]], align 4 +// CHECK-PTX: store i32 %[[VAL_169]], ptr %[[VAL_24]], align 4 +// CHECK-GCN: store i32 %[[VAL_169]], ptr{{.*}} %[[VAL_26]], align 4 // CHECK: %[[VAL_171:.*]] = add i32 %[[VAL_166]], %thread.id.2 // CHECK: %[[VAL_172:.*]] = icmp ult i32 %[[VAL_171]], %tile_bound.2 // CHECK: br i1 %[[VAL_172]], label %[[VAL_173:.*]], label %[[VAL_165]] + // CHECK: x_in_tile-after: ; preds = %[[VAL_174:.*]], %[[VAL_168]] -// CHECK: br label %[[VAL_164]], !llvm.loop !9 -// CHECK: loop2.loop_exit3: ; preds = %[[VAL_164]] +// CHECK: br label %[[VAL_164]], !llvm.loop !{{9|7}} + +// CHECK: loop2.loop_exit{{3|2}}: ; preds = %[[VAL_164]] // CHECK: br label %[[VAL_132]] + // CHECK: x_in_tile-true: ; preds = %[[VAL_168]] -// CHECK: store i32 0, ptr %[[VAL_23]], align 4 -// CHECK: br label %[[VAL_175:.*]] -// CHECK: loop3.loop_header10: ; preds = %[[VAL_176:.*]], %[[VAL_173]] -// CHECK: %[[VAL_177:.*]] = load i32, ptr %[[VAL_23]], align 4 -// CHECK: %[[VAL_178:.*]] = icmp uge i32 %[[VAL_177]], 2 -// CHECK: br i1 %[[VAL_178]], label %[[VAL_174]], label %[[VAL_176]] -// CHECK: loop3.loop_body11: ; preds = %[[VAL_175]] -// CHECK: %[[VAL_179:.*]] = add nuw nsw i32 %[[VAL_177]], 1 -// CHECK: store i32 %[[VAL_179]], ptr %[[VAL_23]], align 4 -// CHECK: %[[VAL_181:.*]] = add i32 %tile_origin.0, 0 -// CHECK: %[[VAL_182:.*]] = add i32 %tile_origin.1, 0 -// CHECK: %[[VAL_183:.*]] = add i32 %tile_origin.2, %[[VAL_171]] -// CHECK: %[[VAL_184:.*]] = add i32 %tile_origin.3, %[[VAL_177]] -// CHECK: %[[VAL_185:.*]] = mul nuw nsw i32 %[[VAL_184]], 1 -// CHECK: %[[VAL_186:.*]] = add nuw nsw i32 0, %[[VAL_185]] -// CHECK: %[[VAL_187:.*]] = mul nuw nsw i32 %[[VAL_183]], 2 -// CHECK: 
%[[VAL_188:.*]] = add nuw nsw i32 %[[VAL_186]], %[[VAL_187]] -// CHECK: %[[VAL_189:.*]] = udiv i32 %[[VAL_188]], 10000 -// CHECK: %[[VAL_190:.*]] = mul nuw nsw i32 %[[VAL_182]], 1 -// CHECK: %[[VAL_191:.*]] = add nuw nsw i32 0, %[[VAL_190]] -// CHECK: %[[VAL_192:.*]] = mul nuw nsw i32 %[[VAL_181]], 1 -// CHECK: %[[VAL_193:.*]] = add nuw nsw i32 0, %[[VAL_192]] -// CHECK: %[[VAL_194:.*]] = getelementptr inbounds [10000 x %[[VAL_1]]], ptr %[[VAL_161]], i32 0, i32 %[[VAL_188]] -// CHECK: %[[VAL_195:.*]] = load %[[VAL_1]], ptr %[[VAL_194]], align 1, !invariant.load !3 -// CHECK: store %[[VAL_1]] %[[VAL_195]], ptr %[[VAL_29]], align 1 -// CHECK: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_29]], ptr %[[VAL_22]]) -// CHECK: %[[VAL_196:.*]] = load %[[VAL_1]], ptr %[[VAL_22]], align 1 -// CHECK: store %[[VAL_1]] %[[VAL_196]], ptr %[[VAL_28]], align 1 -// CHECK: br label %[[VAL_175]], !llvm.loop !10 -// CHECK: loop3.loop_exit9: ; preds = %[[VAL_175]] -// CHECK: br label %[[VAL_165]] +// CHECK-GCN: %[[VAL_181:.*]] = add i32 %tile_origin.0, 0 +// CHECK-GCN: %[[VAL_182:.*]] = add i32 %tile_origin.1, 0 +// CHECK-GCN: %[[VAL_183:.*]] = add i32 %tile_origin.2, %[[VAL_171]] +// CHECK-GCN: %[[VAL_194:.*]] = getelementptr inbounds [10000 x %[[VAL_1]]], ptr %[[VAL_161]], i32 0, i32 %[[VAL_183]] +// CHECK-GCN: %[[VAL_195:.*]] = load %[[VAL_1]], ptr %[[VAL_194]], align 1, !invariant.load !2 +// CHECK-GCN: store %[[VAL_1]] %[[VAL_195]], ptr{{.*}} %[[VAL_29]], align 1 +// CHECK-GCN: %[[VAL_196_1:.*]] = addrspacecast ptr{{.*}} %[[VAL_28]] to ptr +// CHECK-GCN: %[[VAL_196_2:.*]] = addrspacecast ptr{{.*}} %[[VAL_29]] to ptr +// CHECK-GCN: %[[VAL_196_3:.*]] = addrspacecast ptr{{.*}} %[[VAL_22]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_196_1]], ptr %[[VAL_196_2]], ptr %[[VAL_196_3]]) +// CHECK-GCN: %[[VAL_196:.*]] = load %[[VAL_1]], ptr{{.*}} %[[VAL_22]], align 1 +// CHECK-GCN: store %[[VAL_1]] %[[VAL_196]], ptr{{.*}} %[[VAL_28]], align 1 +// CHECK-PTX: store i32 0, ptr 
%[[VAL_23]], align 4 +// CHECK: br label %[[VAL_175:.*]] + +// CHECK-PTX: loop3.loop_header10: ; preds = %[[VAL_176:.*]], %[[VAL_173]] +// CHECK-PTX: %[[VAL_177:.*]] = load i32, ptr %[[VAL_23]], align 4 +// CHECK-PTX: %[[VAL_178:.*]] = icmp uge i32 %[[VAL_177]], 2 +// CHECK-PTX: br i1 %[[VAL_178]], label %[[VAL_174]], label %[[VAL_176]] +// CHECK-PTX: loop3.loop_body11: ; preds = %[[VAL_175]] +// CHECK-PTX: %[[VAL_179:.*]] = add nuw nsw i32 %[[VAL_177]], 1 +// CHECK-PTX: store i32 %[[VAL_179]], ptr %[[VAL_23]], align 4 +// CHECK-PTX: %[[VAL_181:.*]] = add i32 %tile_origin.0, 0 +// CHECK-PTX: %[[VAL_182:.*]] = add i32 %tile_origin.1, 0 +// CHECK-PTX: %[[VAL_183:.*]] = add i32 %tile_origin.2, %[[VAL_171]] +// CHECK-PTX: %[[VAL_184:.*]] = add i32 %tile_origin.3, %[[VAL_177]] +// CHECK-PTX: %[[VAL_185:.*]] = mul nuw nsw i32 %[[VAL_184]], 1 +// CHECK-PTX: %[[VAL_186:.*]] = add nuw nsw i32 0, %[[VAL_185]] +// CHECK-PTX: %[[VAL_187:.*]] = mul nuw nsw i32 %[[VAL_183]], 2 +// CHECK-PTX: %[[VAL_188:.*]] = add nuw nsw i32 %[[VAL_186]], %[[VAL_187]] +// CHECK-PTX: %[[VAL_189:.*]] = udiv i32 %[[VAL_188]], 10000 +// CHECK-PTX: %[[VAL_190:.*]] = mul nuw nsw i32 %[[VAL_182]], 1 +// CHECK-PTX: %[[VAL_191:.*]] = add nuw nsw i32 0, %[[VAL_190]] +// CHECK-PTX: %[[VAL_192:.*]] = mul nuw nsw i32 %[[VAL_181]], 1 +// CHECK-PTX: %[[VAL_193:.*]] = add nuw nsw i32 0, %[[VAL_192]] +// CHECK-PTX: %[[VAL_194:.*]] = getelementptr inbounds [10000 x %[[VAL_1]]], ptr %[[VAL_161]], i32 0, i32 %[[VAL_188]] +// CHECK-PTX: %[[VAL_195:.*]] = load %[[VAL_1]], ptr %[[VAL_194]], align 1, !invariant.load !3 +// CHECK-PTX: store %[[VAL_1]] %[[VAL_195]], ptr %[[VAL_29]], align 1 +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_29]], ptr %[[VAL_22]]) +// CHECK-PTX: %[[VAL_196:.*]] = load %[[VAL_1]], ptr %[[VAL_22]], align 1 +// CHECK-PTX: store %[[VAL_1]] %[[VAL_196]], ptr %[[VAL_28]], align 1 +// CHECK-PTX: br label %[[VAL_175]], !llvm.loop !10 +// CHECK-PTX: loop3.loop_exit9: ; preds = 
%[[VAL_175]] +// CHECK-PTX: br label %[[VAL_165]] + // CHECK: thread_in_bounds-true: ; preds = %[[VAL_132]] // CHECK: %[[VAL_197:.*]] = icmp eq i32 %lane_id, 0 // CHECK: br i1 %[[VAL_197]], label %[[VAL_198:.*]], label %[[VAL_199:.*]] // CHECK: intra_warp_reduce_write-after: ; preds = %[[VAL_198]], %thread_in_bounds-true -// CHECK: call void @llvm.nvvm.barrier0() +// CHECK-GCN: fence syncscope("workgroup") seq_cst +// CHECK-GCN: call void @llvm.amdgcn.s.barrier() +// CHECK-PTX: call void @llvm.nvvm.barrier0() // CHECK: %[[VAL_200:.*]] = icmp eq i32 %[[VAL_130]], 0 // CHECK: br i1 %[[VAL_200]], label %[[VAL_201:.*]], label %[[VAL_131]] // CHECK: inter_warp_reduce-after: ; preds = %[[VAL_202:.*]], %[[VAL_199]] // CHECK: br label %thread_in_bounds-after // CHECK: intra_warp_reduce_write-true: ; preds = %thread_in_bounds-true -// CHECK: %[[VAL_203:.*]] = load %[[VAL_1]], ptr %[[VAL_28]], align 1 +// CHECK: %[[VAL_203:.*]] = load %[[VAL_1]], ptr{{.*}} %[[VAL_28]], align 1 // CHECK: %[[VAL_204:.*]] = getelementptr inbounds [1 x [20 x %[[VAL_1]]]], ptr addrspace(3) @shared_cache, i32 0, i32 0, i32 %[[VAL_130]] // CHECK: %[[VAL_205:.*]] = addrspacecast ptr addrspace(3) %[[VAL_204]] to ptr // CHECK: store %[[VAL_1]] %[[VAL_203]], ptr %[[VAL_205]], align 1 @@ -291,103 +383,141 @@ ENTRY reduce.1 { // CHECK: inter_warp_reduce-true: ; preds = %[[VAL_199]] // CHECK: %[[VAL_206:.*]] = getelementptr inbounds [1 x [20 x %[[VAL_1]]]], ptr addrspace(3) @shared_cache, i32 0, i32 0, i32 %lane_id // CHECK: %[[VAL_207:.*]] = addrspacecast ptr addrspace(3) %[[VAL_206]] to ptr -// CHECK: store %[[VAL_1]] %[[VAL_35]], ptr %[[VAL_11]], align 1 +// CHECK-GCN: %[[VAL_207_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_11]] to ptr +// CHECK-GCN: store %[[VAL_1]] %[[VAL_35]], ptr %[[VAL_207_1]], align 1 +// CHECK-PTX: store %[[VAL_1]] %[[VAL_35]], ptr %[[VAL_11]], align 1 // CHECK: %[[VAL_208:.*]] = icmp ult i32 %thread.id.2, 20 -// CHECK: %[[VAL_209:.*]] = select i1 %[[VAL_208]], ptr 
%[[VAL_207]], ptr %[[VAL_11]] -// CHECK: %[[VAL_210:.*]] = load i128, ptr %[[VAL_209]], align 16 +// CHECK-GCN: %[[VAL_209:.*]] = select i1 %[[VAL_208]], ptr %[[VAL_207]], ptr %[[VAL_207_1]] +// CHECK-PTX: %[[VAL_209:.*]] = select i1 %[[VAL_208]], ptr %[[VAL_207]], ptr %[[VAL_11]] +// CHECK: %[[VAL_210:.*]] = load i128, ptr %[[VAL_209]], align {{(16|8)}} // CHECK: %[[VAL_211:.*]] = bitcast i128 %[[VAL_210]] to <4 x i32> // CHECK: %[[VAL_212:.*]] = extractelement <4 x i32> %[[VAL_211]], i64 0 -// CHECK: %[[VAL_213:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_212]], i32 16, i32 31) +// CHECK-GCN: %[[VAL_213:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_212]], i32 16) +// CHECK-PTX: %[[VAL_213:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_212]], i32 16, i32 31) // CHECK: %[[VAL_214:.*]] = insertelement <4 x i32> %[[VAL_211]], i32 %[[VAL_213]], i64 0 // CHECK: %[[VAL_215:.*]] = extractelement <4 x i32> %[[VAL_214]], i64 1 -// CHECK: %[[VAL_216:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_215]], i32 16, i32 31) +// CHECK-GCN: %[[VAL_216:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_215]], i32 16) +// CHECK-PTX: %[[VAL_216:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_215]], i32 16, i32 31) // CHECK: %[[VAL_217:.*]] = insertelement <4 x i32> %[[VAL_214]], i32 %[[VAL_216]], i64 1 // CHECK: %[[VAL_218:.*]] = extractelement <4 x i32> %[[VAL_217]], i64 2 -// CHECK: %[[VAL_219:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_218]], i32 16, i32 31) +// CHECK-GCN: %[[VAL_219:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_218]], i32 16) +// CHECK-PTX: %[[VAL_219:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_218]], i32 16, i32 31) // CHECK: %[[VAL_220:.*]] = insertelement <4 x i32> %[[VAL_217]], i32 %[[VAL_219]], i64 2 // CHECK: %[[VAL_221:.*]] = extractelement <4 x i32> %[[VAL_220]], i64 3 -// CHECK: %[[VAL_222:.*]] = call i32 
@llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_221]], i32 16, i32 31) +// CHECK-GCN: %[[VAL_222:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_221]], i32 16) +// CHECK-PTX: %[[VAL_222:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_221]], i32 16, i32 31) // CHECK: %[[VAL_223:.*]] = insertelement <4 x i32> %[[VAL_220]], i32 %[[VAL_222]], i64 3 // CHECK: %[[VAL_224:.*]] = bitcast <4 x i32> %[[VAL_223]] to i128 -// CHECK: store i128 %[[VAL_224]], ptr %[[VAL_10]], align 16 -// CHECK: call void @[[SUM]](ptr %[[VAL_209]], ptr %[[VAL_10]], ptr %[[VAL_9]]) -// CHECK: %[[VAL_225:.*]] = load %[[VAL_1]], ptr %[[VAL_9]], align 1 +// CHECK: store i128 %[[VAL_224]], ptr{{.*}} %[[VAL_10]], align {{(16|8)}} +// CHECK-GCN: %[[VAL_225_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_10]] to ptr +// CHECK-GCN: %[[VAL_225_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_9]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_209]], ptr %[[VAL_225_1]], ptr %[[VAL_225_2]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_209]], ptr %[[VAL_10]], ptr %[[VAL_9]]) +// CHECK: %[[VAL_225:.*]] = load %[[VAL_1]], ptr{{.*}} %[[VAL_9]], align 1 // CHECK: store %[[VAL_1]] %[[VAL_225]], ptr %[[VAL_209]], align 1 -// CHECK: %[[VAL_226:.*]] = load i128, ptr %[[VAL_209]], align 16 +// CHECK: %[[VAL_226:.*]] = load i128, ptr %[[VAL_209]], align {{(16|8)}} // CHECK: %[[VAL_227:.*]] = bitcast i128 %[[VAL_226]] to <4 x i32> // CHECK: %[[VAL_228:.*]] = extractelement <4 x i32> %[[VAL_227]], i64 0 -// CHECK: %[[VAL_229:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_228]], i32 8, i32 31) +// CHECK-GCN: %[[VAL_229:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_228]], i32 8) +// CHECK-PTX: %[[VAL_229:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_228]], i32 8, i32 31) // CHECK: %[[VAL_230:.*]] = insertelement <4 x i32> %[[VAL_227]], i32 %[[VAL_229]], i64 0 // CHECK: %[[VAL_231:.*]] = extractelement <4 x i32> %[[VAL_230]], i64 1 -// CHECK: %[[VAL_232:.*]] = 
call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_231]], i32 8, i32 31) +// CHECK-GCN: %[[VAL_232:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_231]], i32 8) +// CHECK-PTX: %[[VAL_232:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_231]], i32 8, i32 31) // CHECK: %[[VAL_233:.*]] = insertelement <4 x i32> %[[VAL_230]], i32 %[[VAL_232]], i64 1 // CHECK: %[[VAL_234:.*]] = extractelement <4 x i32> %[[VAL_233]], i64 2 -// CHECK: %[[VAL_235:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_234]], i32 8, i32 31) +// CHECK-GCN: %[[VAL_235:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_234]], i32 8) +// CHECK-PTX: %[[VAL_235:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_234]], i32 8, i32 31) // CHECK: %[[VAL_236:.*]] = insertelement <4 x i32> %[[VAL_233]], i32 %[[VAL_235]], i64 2 // CHECK: %[[VAL_237:.*]] = extractelement <4 x i32> %[[VAL_236]], i64 3 -// CHECK: %[[VAL_238:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_237]], i32 8, i32 31) +// CHECK-GCN: %[[VAL_238:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_237]], i32 8) +// CHECK-PTX: %[[VAL_238:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_237]], i32 8, i32 31) // CHECK: %[[VAL_239:.*]] = insertelement <4 x i32> %[[VAL_236]], i32 %[[VAL_238]], i64 3 // CHECK: %[[VAL_240:.*]] = bitcast <4 x i32> %[[VAL_239]] to i128 -// CHECK: store i128 %[[VAL_240]], ptr %[[VAL_8]], align 16 -// CHECK: call void @[[SUM]](ptr %[[VAL_209]], ptr %[[VAL_8]], ptr %[[VAL_7]]) -// CHECK: %[[VAL_241:.*]] = load %[[VAL_1]], ptr %[[VAL_7]], align 1 +// CHECK: store i128 %[[VAL_240]], ptr{{.*}} %[[VAL_8]], align {{(16|8)}} +// CHECK-GCN: %[[VAL_241_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_8]] to ptr +// CHECK-GCN: %[[VAL_241_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_7]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_209]], ptr %[[VAL_241_1]], ptr %[[VAL_241_2]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_209]], ptr 
%[[VAL_8]], ptr %[[VAL_7]]) +// CHECK: %[[VAL_241:.*]] = load %[[VAL_1]], ptr{{.*}} %[[VAL_7]], align 1 // CHECK: store %[[VAL_1]] %[[VAL_241]], ptr %[[VAL_209]], align 1 -// CHECK: %[[VAL_242:.*]] = load i128, ptr %[[VAL_209]], align 16 +// CHECK: %[[VAL_242:.*]] = load i128, ptr %[[VAL_209]], align {{(16|8)}} // CHECK: %[[VAL_243:.*]] = bitcast i128 %[[VAL_242]] to <4 x i32> // CHECK: %[[VAL_244:.*]] = extractelement <4 x i32> %[[VAL_243]], i64 0 -// CHECK: %[[VAL_245:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_244]], i32 4, i32 31) +// CHECK-GCN: %[[VAL_245:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_244]], i32 4) +// CHECK-PTX: %[[VAL_245:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_244]], i32 4, i32 31) // CHECK: %[[VAL_246:.*]] = insertelement <4 x i32> %[[VAL_243]], i32 %[[VAL_245]], i64 0 // CHECK: %[[VAL_247:.*]] = extractelement <4 x i32> %[[VAL_246]], i64 1 -// CHECK: %[[VAL_248:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_247]], i32 4, i32 31) +// CHECK-GCN: %[[VAL_248:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_247]], i32 4) +// CHECK-PTX: %[[VAL_248:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_247]], i32 4, i32 31) // CHECK: %[[VAL_249:.*]] = insertelement <4 x i32> %[[VAL_246]], i32 %[[VAL_248]], i64 1 // CHECK: %[[VAL_250:.*]] = extractelement <4 x i32> %[[VAL_249]], i64 2 -// CHECK: %[[VAL_251:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_250]], i32 4, i32 31) +// CHECK-GCN: %[[VAL_251:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_250]], i32 4) +// CHECK-PTX: %[[VAL_251:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_250]], i32 4, i32 31) // CHECK: %[[VAL_252:.*]] = insertelement <4 x i32> %[[VAL_249]], i32 %[[VAL_251]], i64 2 // CHECK: %[[VAL_253:.*]] = extractelement <4 x i32> %[[VAL_252]], i64 3 -// CHECK: %[[VAL_254:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_253]], i32 4, i32 31) +// 
CHECK-GCN: %[[VAL_254:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_253]], i32 4) +// CHECK-PTX: %[[VAL_254:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_253]], i32 4, i32 31) // CHECK: %[[VAL_255:.*]] = insertelement <4 x i32> %[[VAL_252]], i32 %[[VAL_254]], i64 3 // CHECK: %[[VAL_256:.*]] = bitcast <4 x i32> %[[VAL_255]] to i128 -// CHECK: store i128 %[[VAL_256]], ptr %[[VAL_6]], align 16 -// CHECK: call void @[[SUM]](ptr %[[VAL_209]], ptr %[[VAL_6]], ptr %[[VAL_5]]) -// CHECK: %[[VAL_257:.*]] = load %[[VAL_1]], ptr %[[VAL_5]], align 1 -// CHECK: store %[[VAL_1]] %[[VAL_257]], ptr %[[VAL_209]], align 1 -// CHECK: %[[VAL_258:.*]] = load i128, ptr %[[VAL_209]], align 16 +// CHECK: store i128 %[[VAL_256]], ptr{{.*}} %[[VAL_6]], align {{(16|8)}} +// CHECK-GCN: %[[VAL_257_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_6]] to ptr +// CHECK-GCN: %[[VAL_257_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_5]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_209]], ptr %[[VAL_257_1]], ptr %[[VAL_257_2]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_209]], ptr %[[VAL_6]], ptr %[[VAL_5]]) +// CHECK: %[[VAL_257:.*]] = load %[[VAL_1]], ptr{{.*}} %[[VAL_5]], align 1 +// CHECK: store %[[VAL_1]] %[[VAL_257]], ptr{{.*}} %[[VAL_209]], align 1 +// CHECK: %[[VAL_258:.*]] = load i128, ptr %[[VAL_209]], align {{(16|8)}} // CHECK: %[[VAL_259:.*]] = bitcast i128 %[[VAL_258]] to <4 x i32> // CHECK: %[[VAL_260:.*]] = extractelement <4 x i32> %[[VAL_259]], i64 0 -// CHECK: %[[VAL_261:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_260]], i32 2, i32 31) +// CHECK-GCN: %[[VAL_261:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_260]], i32 2) +// CHECK-PTX: %[[VAL_261:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_260]], i32 2, i32 31) // CHECK: %[[VAL_262:.*]] = insertelement <4 x i32> %[[VAL_259]], i32 %[[VAL_261]], i64 0 // CHECK: %[[VAL_263:.*]] = extractelement <4 x i32> %[[VAL_262]], i64 1 -// CHECK: %[[VAL_264:.*]] = call i32 
@llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_263]], i32 2, i32 31) +// CHECK-GCN: %[[VAL_264:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_263]], i32 2) +// CHECK-PTX: %[[VAL_264:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_263]], i32 2, i32 31) // CHECK: %[[VAL_265:.*]] = insertelement <4 x i32> %[[VAL_262]], i32 %[[VAL_264]], i64 1 // CHECK: %[[VAL_266:.*]] = extractelement <4 x i32> %[[VAL_265]], i64 2 -// CHECK: %[[VAL_267:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_266]], i32 2, i32 31) +// CHECK-GCN: %[[VAL_267:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_266]], i32 2) +// CHECK-PTX: %[[VAL_267:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_266]], i32 2, i32 31) // CHECK: %[[VAL_268:.*]] = insertelement <4 x i32> %[[VAL_265]], i32 %[[VAL_267]], i64 2 // CHECK: %[[VAL_269:.*]] = extractelement <4 x i32> %[[VAL_268]], i64 3 -// CHECK: %[[VAL_270:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_269]], i32 2, i32 31) +// CHECK-GCN: %[[VAL_270:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_269]], i32 2) +// CHECK-PTX: %[[VAL_270:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_269]], i32 2, i32 31) // CHECK: %[[VAL_271:.*]] = insertelement <4 x i32> %[[VAL_268]], i32 %[[VAL_270]], i64 3 // CHECK: %[[VAL_272:.*]] = bitcast <4 x i32> %[[VAL_271]] to i128 -// CHECK: store i128 %[[VAL_272]], ptr %[[VAL_4]], align 16 -// CHECK: call void @[[SUM]](ptr %[[VAL_209]], ptr %[[VAL_4]], ptr %[[VAL_3]]) -// CHECK: %[[VAL_273:.*]] = load %[[VAL_1]], ptr %[[VAL_3]], align 1 +// CHECK: store i128 %[[VAL_272]], ptr{{.*}} %[[VAL_4]], align {{(16|8)}} +// CHECK-GCN: %[[VAL_273_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_4]] to ptr +// CHECK-GCN: %[[VAL_273_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_3]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_209]], ptr %[[VAL_273_1]], ptr %[[VAL_273_2]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_209]], ptr %[[VAL_4]], ptr 
%[[VAL_3]]) +// CHECK: %[[VAL_273:.*]] = load %[[VAL_1]], ptr{{.*}} %[[VAL_3]], align 1 // CHECK: store %[[VAL_1]] %[[VAL_273]], ptr %[[VAL_209]], align 1 -// CHECK: %[[VAL_274:.*]] = load i128, ptr %[[VAL_209]], align 16 +// CHECK: %[[VAL_274:.*]] = load i128, ptr %[[VAL_209]], align {{(16|8)}} // CHECK: %[[VAL_275:.*]] = bitcast i128 %[[VAL_274]] to <4 x i32> // CHECK: %[[VAL_276:.*]] = extractelement <4 x i32> %[[VAL_275]], i64 0 -// CHECK: %[[VAL_277:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_276]], i32 1, i32 31) +// CHECK-GCN: %[[VAL_277:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_276]], i32 1) +// CHECK-PTX: %[[VAL_277:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_276]], i32 1, i32 31) // CHECK: %[[VAL_278:.*]] = insertelement <4 x i32> %[[VAL_275]], i32 %[[VAL_277]], i64 0 // CHECK: %[[VAL_279:.*]] = extractelement <4 x i32> %[[VAL_278]], i64 1 -// CHECK: %[[VAL_280:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_279]], i32 1, i32 31) +// CHECK-GCN: %[[VAL_280:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_279]], i32 1) +// CHECK-PTX: %[[VAL_280:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_279]], i32 1, i32 31) // CHECK: %[[VAL_281:.*]] = insertelement <4 x i32> %[[VAL_278]], i32 %[[VAL_280]], i64 1 // CHECK: %[[VAL_282:.*]] = extractelement <4 x i32> %[[VAL_281]], i64 2 -// CHECK: %[[VAL_283:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_282]], i32 1, i32 31) +// CHECK-GCN: %[[VAL_283:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_282]], i32 1) +// CHECK-PTX: %[[VAL_283:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_282]], i32 1, i32 31) // CHECK: %[[VAL_284:.*]] = insertelement <4 x i32> %[[VAL_281]], i32 %[[VAL_283]], i64 2 // CHECK: %[[VAL_285:.*]] = extractelement <4 x i32> %[[VAL_284]], i64 3 -// CHECK: %[[VAL_286:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_285]], i32 1, i32 31) +// CHECK-GCN: 
%[[VAL_286:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_285]], i32 1) +// CHECK-PTX: %[[VAL_286:.*]] = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %[[VAL_285]], i32 1, i32 31) // CHECK: %[[VAL_287:.*]] = insertelement <4 x i32> %[[VAL_284]], i32 %[[VAL_286]], i64 3 // CHECK: %[[VAL_288:.*]] = bitcast <4 x i32> %[[VAL_287]] to i128 -// CHECK: store i128 %[[VAL_288]], ptr %[[VAL_2]], align 16 -// CHECK: call void @[[SUM]](ptr %[[VAL_209]], ptr %[[VAL_2]], ptr %[[VAL_0]]) -// CHECK: %[[VAL_289:.*]] = load %[[VAL_1]], ptr %[[VAL_0]], align 1 +// CHECK: store i128 %[[VAL_288]], ptr{{.*}} %[[VAL_2]], align {{(16|8)}} +// CHECK-GCN: %[[VAL_289_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_2]] to ptr +// CHECK-GCN: %[[VAL_289_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_0]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_209]], ptr %[[VAL_289_1]], ptr %[[VAL_289_2]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_209]], ptr %[[VAL_2]], ptr %[[VAL_0]]) +// CHECK: %[[VAL_289:.*]] = load %[[VAL_1]], ptr{{.*}} %[[VAL_0]], align 1 // CHECK: store %[[VAL_1]] %[[VAL_289]], ptr %[[VAL_209]], align 1 // CHECK: %[[VAL_290:.*]] = icmp eq i32 %thread.id.2, 0 // CHECK: br i1 %[[VAL_290]], label %[[VAL_291:.*]], label %[[VAL_202]] @@ -402,15 +532,23 @@ ENTRY reduce.1 { // CHECK: %[[VAL_298:.*]] = alloca %[[VAL_299:.*]], align 8 // CHECK: %[[VAL_300:.*]] = load %[[VAL_299]], ptr %[[VAL_301:.*]], align 1 // CHECK: %[[VAL_302:.*]] = load %[[VAL_299]], ptr %[[VAL_303:.*]], align 1 -// CHECK: %[[VAL_304:.*]] = extractvalue %[[VAL_299]] %[[VAL_300]], 0 -// CHECK: %[[VAL_305:.*]] = extractvalue %[[VAL_299]] %[[VAL_302]], 0 -// CHECK: %[[VAL_306:.*]] = fadd double %[[VAL_304]], %[[VAL_305]] -// CHECK: %[[VAL_307:.*]] = extractvalue %[[VAL_299]] %[[VAL_300]], 1 -// CHECK: %[[VAL_308:.*]] = extractvalue %[[VAL_299]] %[[VAL_302]], 1 -// CHECK: %[[VAL_309:.*]] = fadd double %[[VAL_307]], %[[VAL_308]] -// CHECK: %[[VAL_310:.*]] = insertvalue %[[VAL_299]] zeroinitializer, double 
%[[VAL_306]], 0 -// CHECK: %[[VAL_311:.*]] = insertvalue %[[VAL_299]] %[[VAL_310]], double %[[VAL_309]], 1 -// CHECK: store %[[VAL_299]] %[[VAL_311]], ptr %[[VAL_298]], align 1 -// CHECK: %[[VAL_312:.*]] = load %[[VAL_299]], ptr %[[VAL_298]], align 1 +// CHECK-GCN: %[[VAL_304:.*]] = extractvalue %[[VAL_299]] %[[VAL_302]], 1 +// CHECK-GCN: %[[VAL_305:.*]] = extractvalue %[[VAL_299]] %[[VAL_300]], 1 +// CHECK-PTX: %[[VAL_304:.*]] = extractvalue %[[VAL_299]] %[[VAL_300]], 0 +// CHECK-PTX: %[[VAL_305:.*]] = extractvalue %[[VAL_299]] %[[VAL_302]], 0 +// CHECK-GCN: %[[VAL_306:.*]] = fadd double %[[VAL_305]], %[[VAL_304]] +// CHECK-PTX: %[[VAL_306:.*]] = fadd double %[[VAL_304]], %[[VAL_305]] +// CHECK-GCN: %[[VAL_307:.*]] = extractvalue %[[VAL_299]] %[[VAL_302]], 0 +// CHECK-GCN: %[[VAL_308:.*]] = extractvalue %[[VAL_299]] %[[VAL_300]], 0 +// CHECK-PTX: %[[VAL_307:.*]] = extractvalue %[[VAL_299]] %[[VAL_300]], 1 +// CHECK-PTX: %[[VAL_308:.*]] = extractvalue %[[VAL_299]] %[[VAL_302]], 1 +// CHECK-GCN: %[[VAL_309:.*]] = fadd double %[[VAL_308]], %[[VAL_307]] +// CHECK-PTX: %[[VAL_309:.*]] = fadd double %[[VAL_307]], %[[VAL_308]] +// CHECK-GCN: %[[VAL_310:.*]] = insertvalue %[[VAL_299]] zeroinitializer, double %[[VAL_309]], 0 +// CHECK-GCN: %[[VAL_311:.*]] = insertvalue %[[VAL_299]] %[[VAL_310]], double %[[VAL_306]], 1 +// CHECK-PTX: %[[VAL_310:.*]] = insertvalue %[[VAL_299]] zeroinitializer, double %[[VAL_306]], 0 +// CHECK-PTX: %[[VAL_311:.*]] = insertvalue %[[VAL_299]] %[[VAL_310]], double %[[VAL_309]], 1 +// CHECK: store %[[VAL_299]] %[[VAL_311]], ptr{{.*}} %[[VAL_298]], align 1 +// CHECK: %[[VAL_312:.*]] = load %[[VAL_299]], ptr{{.*}} %[[VAL_298]], align 1 // CHECK: store %[[VAL_299]] %[[VAL_312]], ptr %[[VAL_313:.*]], align 1 // CHECK: ret void diff --git a/xla/service/gpu/tests/reduce_row_vectorized.hlo b/xla/service/gpu/tests/reduce_row_vectorized.hlo index bba7986d830fb..c4976c5fc2b3a 100644 --- a/xla/service/gpu/tests/reduce_row_vectorized.hlo +++ 
b/xla/service/gpu/tests/reduce_row_vectorized.hlo @@ -44,10 +44,10 @@ ENTRY reduce.1 { // CHECK: %[[VAL_19:.*]] = alloca float, align 4 // CHECK: %[[VAL_20:.*]] = alloca float, align 4 // CHECK: %[[VAL_21:.*]] = alloca float, align 4 -// CHECK: %[[VAL_22:.*]] = alloca i32, align 4 +// CHECK-PTX: %[[VAL_22:.*]] = alloca i32, align 4 // CHECK: %[[VAL_23:.*]] = alloca i32, align 4 // CHECK: %[[VAL_24:.*]] = alloca float, align 4 -// CHECK: %[[VAL_25:.*]] = alloca i32, align 4 +// CHECK-PTX: %[[VAL_25:.*]] = alloca i32, align 4 // CHECK: %[[VAL_26:.*]] = alloca i32, align 4 // CHECK: %[[VAL_27:.*]] = alloca i32, align 4 // CHECK: %[[VAL_28:.*]] = alloca float, align 4 @@ -60,7 +60,7 @@ ENTRY reduce.1 { // CHECK: ret void // CHECK: reduce-group-0-true: ; preds = %[[VAL_34]] // CHECK: %[[VAL_35:.*]] = load float, ptr @0, align 4 -// CHECK: store float %[[VAL_35]], ptr %[[VAL_28]], align 4 +// CHECK: store float %[[VAL_35]], ptr{{.*}} %[[VAL_28]], align 4 // CHECK-PTX: %thread.id.x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !3 // CHECK-GCN: %thread.id.x = call i32 @llvm.amdgcn.workitem.id.x // CHECK-PTX: %block.id.x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !4 @@ -71,211 +71,335 @@ ENTRY reduce.1 { // CHECK: %lane_id = urem i32 %thread.id.x, 32 // CHECK: %[[VAL_37:.*]] = udiv i32 %block.id.x, 1 // CHECK: %[[VAL_38:.*]] = urem i32 %[[VAL_37]], 1 -// CHECK: %[[VAL_39:.*]] = udiv i32 %block.id.x, 1 -// CHECK: %[[VAL_40:.*]] = urem i32 %[[VAL_39]], 1 +// CHECK-PTX: %[[VAL_39:.*]] = udiv i32 %block.id.x, 1 +// CHECK-PTX: %[[VAL_40:.*]] = urem i32 %[[VAL_39]], 1 // CHECK: %[[VAL_41:.*]] = udiv i32 %block.id.x, 1 // CHECK: %[[VAL_42:.*]] = urem i32 %[[VAL_41]], 32768 // CHECK: %[[VAL_43:.*]] = udiv i32 %block.id.x, 32768 // CHECK: %tile_origin.0 = mul i32 %[[VAL_43]], 1 // CHECK: %tile_origin.1 = mul i32 %[[VAL_42]], 4 -// CHECK: %tile_origin.2 = mul i32 %[[VAL_40]], 512 -// CHECK: %tile_origin.3 = mul i32 %[[VAL_38]], 2 -// CHECK: store i32 %thread.id.1, 
ptr %[[VAL_27]], align 4 +// CHECK-PTX: %tile_origin.2 = mul i32 %[[VAL_40]], 512 +// CHECK-GCN: %tile_origin.2 = mul i32 %[[VAL_38]], 1024 +// CHECK-PTX: %tile_origin.3 = mul i32 %[[VAL_38]], 2 +// CHECK: store i32 %thread.id.1, ptr{{.*}} %[[VAL_27]], align 4 // CHECK: br label %[[VAL_44:.*]] + // CHECK: loop1.loop_header: ; preds = %[[VAL_45:.*]], %[[VAL_32]] -// CHECK: %[[VAL_46:.*]] = load i32, ptr %[[VAL_27]], align 4 +// CHECK: %[[VAL_46:.*]] = load i32, ptr{{.*}} %[[VAL_27]], align 4 // CHECK: %[[VAL_47:.*]] = icmp uge i32 %[[VAL_46]], 4 // CHECK: br i1 %[[VAL_47]], label %[[VAL_48:.*]], label %[[VAL_49:.*]] + // CHECK: loop1.loop_body: ; preds = %[[VAL_44]] // CHECK: %[[VAL_50:.*]] = add nuw nsw i32 %[[VAL_46]], 4 -// CHECK: store i32 %[[VAL_50]], ptr %[[VAL_27]], align 4 +// CHECK: store i32 %[[VAL_50]], ptr{{.*}} %[[VAL_27]], align 4 // CHECK: br i1 true, label %[[VAL_52:.*]], label %[[VAL_53:.*]] + // CHECK: is_full_tile-after: ; preds = %[[VAL_54:.*]], %[[VAL_55:.*]] -// CHECK: br label %[[VAL_44]], !llvm.loop !5 +// CHECK: br label %[[VAL_44]], !llvm.loop !{{(5|4)}} + // CHECK: loop1.loop_exit: ; preds = %[[VAL_44]] -// CHECK: %[[VAL_56:.*]] = load float, ptr %[[VAL_28]], align 4 -// CHECK: %[[VAL_57:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_56]], i32 16, i32 31) -// CHECK: store float %[[VAL_57]], ptr %[[VAL_20]], align 4 -// CHECK: call void @[[SUM:Sum.*]](ptr %[[VAL_28]], ptr %[[VAL_20]], ptr %[[VAL_19]]) -// CHECK: %[[VAL_58:.*]] = load float, ptr %[[VAL_19]], align 4 -// CHECK: store float %[[VAL_58]], ptr %[[VAL_28]], align 4 -// CHECK: %[[VAL_59:.*]] = load float, ptr %[[VAL_28]], align 4 -// CHECK: %[[VAL_60:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_59]], i32 8, i32 31) -// CHECK: store float %[[VAL_60]], ptr %[[VAL_18]], align 4 -// CHECK: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_18]], ptr %[[VAL_17]]) -// CHECK: %[[VAL_61:.*]] = load float, ptr %[[VAL_17]], align 4 -// CHECK: store 
float %[[VAL_61]], ptr %[[VAL_28]], align 4 -// CHECK: %[[VAL_62:.*]] = load float, ptr %[[VAL_28]], align 4 -// CHECK: %[[VAL_63:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_62]], i32 4, i32 31) -// CHECK: store float %[[VAL_63]], ptr %[[VAL_16]], align 4 -// CHECK: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_16]], ptr %[[VAL_15]]) -// CHECK: %[[VAL_64:.*]] = load float, ptr %[[VAL_15]], align 4 -// CHECK: store float %[[VAL_64]], ptr %[[VAL_28]], align 4 -// CHECK: %[[VAL_65:.*]] = load float, ptr %[[VAL_28]], align 4 -// CHECK: %[[VAL_66:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_65]], i32 2, i32 31) -// CHECK: store float %[[VAL_66]], ptr %[[VAL_14]], align 4 -// CHECK: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_14]], ptr %[[VAL_13]]) -// CHECK: %[[VAL_67:.*]] = load float, ptr %[[VAL_13]], align 4 -// CHECK: store float %[[VAL_67]], ptr %[[VAL_28]], align 4 -// CHECK: %[[VAL_68:.*]] = load float, ptr %[[VAL_28]], align 4 -// CHECK: %[[VAL_69:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_68]], i32 1, i32 31) -// CHECK: store float %[[VAL_69]], ptr %[[VAL_12]], align 4 -// CHECK: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_12]], ptr %[[VAL_11]]) -// CHECK: %[[VAL_70:.*]] = load float, ptr %[[VAL_11]], align 4 -// CHECK: store float %[[VAL_70]], ptr %[[VAL_28]], align 4 +// CHECK: %[[VAL_56:.*]] = load float, ptr{{.*}} %[[VAL_28]], align 4 +// CHECK-GCN: %[[VAL_57_1:.*]] = bitcast float %[[VAL_56]] to i32 +// CHECK-GCN: %[[VAL_57_2:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_57_1]], i32 16) +// CHECK-GCN: %[[VAL_57:.*]] = bitcast i32 %[[VAL_57_2]] to float +// CHECK-PTX: %[[VAL_57:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_56]], i32 16, i32 31) +// CHECK: store float %[[VAL_57]], ptr{{.*}} %[[VAL_20]], align 4 +// CHECK-GCN: %[[VAL_58_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_28]] to ptr +// CHECK-GCN: %[[VAL_58_2:.*]] = addrspacecast ptr 
addrspace(5) %[[VAL_20]] to ptr +// CHECK-GCN: %[[VAL_58_3:.*]] = addrspacecast ptr addrspace(5) %[[VAL_19]] to ptr +// CHECK-GCN: call void @[[SUM:Sum.*]](ptr %[[VAL_58_1]], ptr %[[VAL_58_2]], ptr %[[VAL_58_3]]) +// CHECK-PTX: call void @[[SUM:Sum.*]](ptr %[[VAL_28]], ptr %[[VAL_20]], ptr %[[VAL_19]]) +// CHECK: %[[VAL_58:.*]] = load float, ptr{{.*}} %[[VAL_19]], align 4 +// CHECK: store float %[[VAL_58]], ptr{{.*}} %[[VAL_28]], align 4 +// CHECK: %[[VAL_59:.*]] = load float, ptr{{.*}} %[[VAL_28]], align 4 +// CHECK-GCN: %[[VAL_60_1:.*]] = bitcast float %[[VAL_59]] to i32 +// CHECK-GCN: %[[VAL_60_2:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_60_1]], i32 8) +// CHECK-GCN: %[[VAL_60:.*]] = bitcast i32 %[[VAL_60_2]] to float +// CHECK-PTX: %[[VAL_60:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_59]], i32 8, i32 31) +// CHECK: store float %[[VAL_60]], ptr{{.*}} %[[VAL_18]], align 4 +// CHECK-GCN: %[[VAL_61_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_28]] to ptr +// CHECK-GCN: %[[VAL_61_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_18]] to ptr +// CHECK-GCN: %[[VAL_61_3:.*]] = addrspacecast ptr addrspace(5) %[[VAL_17]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_61_1]], ptr %[[VAL_61_2]], ptr %[[VAL_61_3]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_18]], ptr %[[VAL_17]]) +// CHECK: %[[VAL_61:.*]] = load float, ptr{{.*}} %[[VAL_17]], align 4 +// CHECK: store float %[[VAL_61]], ptr{{.*}} %[[VAL_28]], align 4 +// CHECK: %[[VAL_62:.*]] = load float, ptr{{.*}} %[[VAL_28]], align 4 +// CHECK-GCN: %[[VAL_63_1:.*]] = bitcast float %[[VAL_62]] to i32 +// CHECK-GCN: %[[VAL_63_2:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_63_1]], i32 4) +// CHECK-GCN: %[[VAL_63:.*]] = bitcast i32 %[[VAL_63_2]] to float +// CHECK-PTX: %[[VAL_63:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_62]], i32 4, i32 31) +// CHECK: store float %[[VAL_63]], ptr{{.*}} %[[VAL_16]], align 4 +// CHECK-GCN: %[[VAL_64_1:.*]] = 
addrspacecast ptr addrspace(5) %[[VAL_28]] to ptr +// CHECK-GCN: %[[VAL_64_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_16]] to ptr +// CHECK-GCN: %[[VAL_64_3:.*]] = addrspacecast ptr addrspace(5) %[[VAL_15]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_64_1]], ptr %[[VAL_64_2]], ptr %[[VAL_64_3]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_16]], ptr %[[VAL_15]]) +// CHECK: %[[VAL_64:.*]] = load float, ptr{{.*}} %[[VAL_15]], align 4 +// CHECK: store float %[[VAL_64]], ptr{{.*}} %[[VAL_28]], align 4 +// CHECK: %[[VAL_65:.*]] = load float, ptr{{.*}} %[[VAL_28]], align 4 +// CHECK-GCN: %[[VAL_66_1:.*]] = bitcast float %[[VAL_65]] to i32 +// CHECK-GCN: %[[VAL_66_2:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_66_1]], i32 2) +// CHECK-GCN: %[[VAL_66:.*]] = bitcast i32 %[[VAL_66_2]] to float +// CHECK-PTX: %[[VAL_66:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_65]], i32 2, i32 31) +// CHECK: store float %[[VAL_66]], ptr{{.*}} %[[VAL_14]], align 4 +// CHECK-GCN: %[[VAL_67_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_28]] to ptr +// CHECK-GCN: %[[VAL_67_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_14]] to ptr +// CHECK-GCN: %[[VAL_67_3:.*]] = addrspacecast ptr addrspace(5) %[[VAL_13]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_67_1]], ptr %[[VAL_67_2]], ptr %[[VAL_67_3]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_14]], ptr %[[VAL_13]]) +// CHECK: %[[VAL_67:.*]] = load float, ptr{{.*}} %[[VAL_13]], align 4 +// CHECK: store float %[[VAL_67]], ptr{{.*}} %[[VAL_28]], align 4 +// CHECK: %[[VAL_68:.*]] = load float, ptr{{.*}} %[[VAL_28]], align 4 +// CHECK-GCN: %[[VAL_69_1:.*]] = bitcast float %[[VAL_68]] to i32 +// CHECK-GCN: %[[VAL_69_2:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_69_1]], i32 1) +// CHECK-GCN: %[[VAL_69:.*]] = bitcast i32 %[[VAL_69_2]] to float +// CHECK-PTX: %[[VAL_69:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_68]], i32 1, i32 31) +// CHECK: 
store float %[[VAL_69]], ptr{{.*}} %[[VAL_12]], align 4 +// CHECK-GCN: %[[VAL_70_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_28]] to ptr +// CHECK-GCN: %[[VAL_70_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_12]] to ptr +// CHECK-GCN: %[[VAL_70_3:.*]] = addrspacecast ptr addrspace(5) %[[VAL_11]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_70_1]], ptr %[[VAL_70_2]], ptr %[[VAL_70_3]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_12]], ptr %[[VAL_11]]) +// CHECK: %[[VAL_70:.*]] = load float, ptr{{.*}} %[[VAL_11]], align 4 +// CHECK: store float %[[VAL_70]], ptr{{.*}} %[[VAL_28]], align 4 // CHECK: %[[VAL_71:.*]] = udiv i32 %thread.id.2, 32 // CHECK: %[[VAL_72:.*]] = icmp ult i32 %thread.id.1, 4 // CHECK: br i1 %[[VAL_72]], label %thread_in_bounds-true, label %thread_in_bounds-after + // CHECK: thread_in_bounds-after: ; preds = %[[VAL_73:.*]], %[[VAL_48]] // CHECK: br label %[[VAL_33]] + // CHECK: is_full_tile-true: ; preds = %[[VAL_49]] -// CHECK: store i32 0, ptr %[[VAL_26]], align 4 +// CHECK: store i32 0, ptr{{.*}} %[[VAL_26]], align 4 // CHECK: br label %[[VAL_74:.*]] + // CHECK: loop2.loop_header: ; preds = %[[VAL_75:.*]], %[[VAL_52]] -// CHECK: %[[VAL_76:.*]] = load i32, ptr %[[VAL_26]], align 4 -// CHECK: %[[VAL_77:.*]] = icmp uge i32 %[[VAL_76]], 512 +// CHECK: %[[VAL_76:.*]] = load i32, ptr{{.*}} %[[VAL_26]], align 4 +// CHECK-PTX: %[[VAL_77:.*]] = icmp uge i32 %[[VAL_76]], 512 +// CHECK-GCN: %[[VAL_77:.*]] = icmp uge i32 %[[VAL_76]], 1024 // CHECK: br i1 %[[VAL_77]], label %[[VAL_55]], label %[[VAL_78:.*]] + // CHECK: loop2.loop_body: ; preds = %[[VAL_74]] // CHECK: %[[VAL_79:.*]] = add nuw nsw i32 %[[VAL_76]], 64 -// CHECK: store i32 %[[VAL_79]], ptr %[[VAL_26]], align 4 +// CHECK: store i32 %[[VAL_79]], ptr{{.*}} %[[VAL_26]], align 4 // CHECK: %[[VAL_81:.*]] = add i32 %[[VAL_76]], %thread.id.2 -// CHECK: store i32 0, ptr %[[VAL_25]], align 4 +// CHECK-GCN: %[[VAL_88:.*]] = add i32 %tile_origin.0, 0 +// CHECK-GCN: 
%[[VAL_89:.*]] = add i32 %tile_origin.1, %[[VAL_46]] +// CHECK-GCN: %[[VAL_90:.*]] = add i32 %tile_origin.2, %[[VAL_81]] +// CHECK-GCN: %[[VAL_102:.*]] = getelementptr inbounds [131072 x [1024 x float]], ptr %[[VAL_103:.*]], i32 0, i32 %[[VAL_89]], i32 %[[VAL_90]] +// CHECK-GCN: %[[VAL_104:.*]] = load float, ptr %[[VAL_102]], align 4, !invariant.load !6 +// CHECK-GCN: store float %[[VAL_104]], ptr{{.*}} %[[VAL_29]], align 4 +// CHECK-GCN: %[[VAL_105_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_28]] to ptr +// CHECK-GCN: %[[VAL_105_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_29]] to ptr +// CHECK-GCN: %[[VAL_105_3:.*]] = addrspacecast ptr addrspace(5) %[[VAL_24]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_105_1]], ptr %[[VAL_105_2]], ptr %[[VAL_105_3]]) +// CHECK-GCN: %[[VAL_105:.*]] = load float, ptr{{.*}} %[[VAL_24]], align 4 +// CHECK-GCN: store float %[[VAL_105]], ptr{{.*}} %[[VAL_28]], align 4 +// CHECK-PTX: store i32 0, ptr %[[VAL_25]], align 4 // CHECK: br label %[[VAL_82:.*]] -// CHECK: loop3.loop_header: ; preds = %[[VAL_83:.*]], %[[VAL_78]] -// CHECK: %[[VAL_84:.*]] = load i32, ptr %[[VAL_25]], align 4 -// CHECK: %[[VAL_85:.*]] = icmp uge i32 %[[VAL_84]], 2 -// CHECK: br i1 %[[VAL_85]], label %[[VAL_75]], label %[[VAL_83]] -// CHECK: loop3.loop_body: ; preds = %[[VAL_82]] -// CHECK: %[[VAL_86:.*]] = add nuw nsw i32 %[[VAL_84]], 1 -// CHECK: store i32 %[[VAL_86]], ptr %[[VAL_25]], align 4 -// CHECK: %[[VAL_88:.*]] = add i32 %tile_origin.0, 0 -// CHECK: %[[VAL_89:.*]] = add i32 %tile_origin.1, %[[VAL_46]] -// CHECK: %[[VAL_90:.*]] = add i32 %tile_origin.2, %[[VAL_81]] -// CHECK: %[[VAL_91:.*]] = add i32 %tile_origin.3, %[[VAL_84]] -// CHECK: %[[VAL_92:.*]] = mul nuw nsw i32 %[[VAL_91]], 1 -// CHECK: %[[VAL_93:.*]] = add nuw nsw i32 0, %[[VAL_92]] -// CHECK: %[[VAL_94:.*]] = mul nuw nsw i32 %[[VAL_90]], 2 -// CHECK: %[[VAL_95:.*]] = add nuw nsw i32 %[[VAL_93]], %[[VAL_94]] -// CHECK: %[[VAL_96:.*]] = udiv i32 %[[VAL_95]], 1024 -// CHECK: 
%[[VAL_97:.*]] = mul nuw nsw i32 %[[VAL_89]], 1 -// CHECK: %[[VAL_98:.*]] = add nuw nsw i32 0, %[[VAL_97]] -// CHECK: %[[VAL_99:.*]] = udiv i32 %[[VAL_98]], 131072 -// CHECK: %[[VAL_100:.*]] = mul nuw nsw i32 %[[VAL_88]], 1 -// CHECK: %[[VAL_101:.*]] = add nuw nsw i32 0, %[[VAL_100]] -// CHECK: %[[VAL_102:.*]] = getelementptr inbounds [131072 x [1024 x float]], ptr %[[VAL_103:.*]], i32 0, i32 %[[VAL_98]], i32 %[[VAL_95]] -// CHECK: %[[VAL_104:.*]] = load float, ptr %[[VAL_102]], align 4, !invariant.load !7 -// CHECK: store float %[[VAL_104]], ptr %[[VAL_29]], align 4 -// CHECK: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_29]], ptr %[[VAL_24]]) -// CHECK: %[[VAL_105:.*]] = load float, ptr %[[VAL_24]], align 4 -// CHECK: store float %[[VAL_105]], ptr %[[VAL_28]], align 4 -// CHECK: br label %[[VAL_82]], !llvm.loop !8 -// CHECK: loop3.loop_exit: ; preds = %[[VAL_82]] -// CHECK: br label %[[VAL_74]], !llvm.loop !9 + +// CHECK-PTX: loop3.loop_header: ; preds = %[[VAL_83:.*]], %[[VAL_78]] +// CHECK-PTX: %[[VAL_84:.*]] = load i32, ptr %[[VAL_25]], align 4 +// CHECK-PTX: %[[VAL_85:.*]] = icmp uge i32 %[[VAL_84]], 2 +// CHECK-PTX: br i1 %[[VAL_85]], label %[[VAL_75]], label %[[VAL_83]] + +// CHECK-PTX: loop3.loop_body: ; preds = %[[VAL_82]] +// CHECK-PTX: %[[VAL_86:.*]] = add nuw nsw i32 %[[VAL_84]], 1 +// CHECK-PTX: store i32 %[[VAL_86]], ptr %[[VAL_25]], align 4 +// CHECK-PTX: %[[VAL_88:.*]] = add i32 %tile_origin.0, 0 +// CHECK-PTX: %[[VAL_89:.*]] = add i32 %tile_origin.1, %[[VAL_46]] +// CHECK-PTX: %[[VAL_90:.*]] = add i32 %tile_origin.2, %[[VAL_81]] +// CHECK-PTX: %[[VAL_91:.*]] = add i32 %tile_origin.3, %[[VAL_84]] +// CHECK-PTX: %[[VAL_92:.*]] = mul nuw nsw i32 %[[VAL_91]], 1 +// CHECK-PTX: %[[VAL_93:.*]] = add nuw nsw i32 0, %[[VAL_92]] +// CHECK-PTX: %[[VAL_94:.*]] = mul nuw nsw i32 %[[VAL_90]], 2 +// CHECK-PTX: %[[VAL_95:.*]] = add nuw nsw i32 %[[VAL_93]], %[[VAL_94]] +// CHECK-PTX: %[[VAL_96:.*]] = udiv i32 %[[VAL_95]], 1024 +// CHECK-PTX: %[[VAL_97:.*]] = mul 
nuw nsw i32 %[[VAL_89]], 1 +// CHECK-PTX: %[[VAL_98:.*]] = add nuw nsw i32 0, %[[VAL_97]] +// CHECK-PTX: %[[VAL_99:.*]] = udiv i32 %[[VAL_98]], 131072 +// CHECK-PTX: %[[VAL_100:.*]] = mul nuw nsw i32 %[[VAL_88]], 1 +// CHECK-PTX: %[[VAL_101:.*]] = add nuw nsw i32 0, %[[VAL_100]] +// CHECK-PTX: %[[VAL_102:.*]] = getelementptr inbounds [131072 x [1024 x float]], ptr %[[VAL_103:.*]], i32 0, i32 %[[VAL_98]], i32 %[[VAL_95]] +// CHECK-PTX: %[[VAL_104:.*]] = load float, ptr %[[VAL_102]], align 4, !invariant.load !7 +// CHECK-PTX: store float %[[VAL_104]], ptr %[[VAL_29]], align 4 +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_29]], ptr %[[VAL_24]]) +// CHECK-PTX: %[[VAL_105:.*]] = load float, ptr %[[VAL_24]], align 4 +// CHECK-PTX: store float %[[VAL_105]], ptr %[[VAL_28]], align 4 +// CHECK-PTX: br label %[[VAL_82]], !llvm.loop !8 + +// CHECK-PTX: loop3.loop_exit: ; preds = %[[VAL_82]] +// CHECK-PTX: br label %[[VAL_74]], !llvm.loop !9 + // CHECK: loop2.loop_exit: ; preds = %[[VAL_74]] // CHECK: br label %[[VAL_45]] // CHECK: is_full_tile-false: ; preds = %[[VAL_49]] -// CHECK: store i32 0, ptr %[[VAL_23]], align 4 +// CHECK: store i32 0, ptr{{.*}} %[[VAL_23]], align 4 // CHECK: br label %[[VAL_106:.*]] -// CHECK: loop2.loop_header5: ; preds = %[[VAL_107:.*]], %[[VAL_53]] -// CHECK: %[[VAL_108:.*]] = load i32, ptr %[[VAL_23]], align 4 -// CHECK: %[[VAL_109:.*]] = icmp uge i32 %[[VAL_108]], 512 + +// CHECK: loop2.loop_header{{(5|4)}}: ; preds = %[[VAL_107:.*]], %[[VAL_53]] +// CHECK: %[[VAL_108:.*]] = load i32, ptr{{.*}} %[[VAL_23]], align 4 +// CHECK-PTX: %[[VAL_109:.*]] = icmp uge i32 %[[VAL_108]], 512 +// CHECK-GCN: %[[VAL_109:.*]] = icmp uge i32 %[[VAL_108]], 1024 // CHECK: br i1 %[[VAL_109]], label %[[VAL_54]], label %[[VAL_110:.*]] -// CHECK: loop2.loop_body6: ; preds = %[[VAL_106]] + +// CHECK: loop2.loop_body{{(6|5)}}: ; preds = %[[VAL_106]] // CHECK: %[[VAL_111:.*]] = add nuw nsw i32 %[[VAL_108]], 64 -// CHECK: store i32 %[[VAL_111]], ptr 
%[[VAL_23]], align 4 +// CHECK: store i32 %[[VAL_111]], ptr{{.*}} %[[VAL_23]], align 4 // CHECK: %[[VAL_113:.*]] = add i32 %[[VAL_108]], %thread.id.2 -// CHECK: %[[VAL_114:.*]] = icmp ult i32 %[[VAL_113]], 512 +// CHECK-PTX: %[[VAL_114:.*]] = icmp ult i32 %[[VAL_113]], 512 +// CHECK-GCN: %[[VAL_114:.*]] = icmp ult i32 %[[VAL_113]], 1024 // CHECK: br i1 %[[VAL_114]], label %[[VAL_115:.*]], label %[[VAL_107]] + // CHECK: x_in_tile-after: ; preds = %[[VAL_116:.*]], %[[VAL_110]] -// CHECK: br label %[[VAL_106]], !llvm.loop !11 -// CHECK: loop2.loop_exit4: ; preds = %[[VAL_106]] +// CHECK: br label %[[VAL_106]], !llvm.loop !{{(11|9)}} + +// CHECK: loop2.loop_exit{{(4|3)}}: ; preds = %[[VAL_106]] // CHECK: br label %[[VAL_45]] + // CHECK: x_in_tile-true: ; preds = %[[VAL_110]] -// CHECK: store i32 0, ptr %[[VAL_22]], align 4 +// CHECK-GCN: %[[VAL_123:.*]] = add i32 %tile_origin.0, 0 +// CHECK-GCN: %[[VAL_124:.*]] = add i32 %tile_origin.1, %[[VAL_46]] +// CHECK-GCN: %[[VAL_125:.*]] = add i32 %tile_origin.2, %[[VAL_113]] +// CHECK-GCN: %[[VAL_137:.*]] = getelementptr inbounds [131072 x [1024 x float]], ptr %[[VAL_103]], i32 0, i32 %[[VAL_124]], i32 %[[VAL_125]] +// CHECK-GCN: %[[VAL_138:.*]] = load float, ptr %[[VAL_137]], align 4, !invariant.load !6 +// CHECK-GCN: store float %[[VAL_138]], ptr{{.*}} %[[VAL_29]], align 4 +// CHECK-GCN: %[[VAL_139_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_28]] to ptr +// CHECK-GCN: %[[VAL_139_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_29]] to ptr +// CHECK-GCN: %[[VAL_139_3:.*]] = addrspacecast ptr addrspace(5) %[[VAL_21]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_139_1]], ptr %[[VAL_139_2]], ptr %[[VAL_139_3]]) +// CHECK-GCN: %[[VAL_139:.*]] = load float, ptr{{.*}} %[[VAL_21]], align 4 +// CHECK-GCN: store float %[[VAL_139]], ptr{{.*}} %[[VAL_28]], align 4 +// CHECK-PTX: store i32 0, ptr %[[VAL_22]], align 4 // CHECK: br label %[[VAL_117:.*]] -// CHECK: loop3.loop_header11: ; preds = %[[VAL_118:.*]], %[[VAL_115]] -// 
CHECK: %[[VAL_119:.*]] = load i32, ptr %[[VAL_22]], align 4 -// CHECK: %[[VAL_120:.*]] = icmp uge i32 %[[VAL_119]], 2 -// CHECK: br i1 %[[VAL_120]], label %[[VAL_116]], label %[[VAL_118]] -// CHECK: loop3.loop_body12: ; preds = %[[VAL_117]] -// CHECK: %[[VAL_121:.*]] = add nuw nsw i32 %[[VAL_119]], 1 -// CHECK: store i32 %[[VAL_121]], ptr %[[VAL_22]], align 4 -// CHECK: %[[VAL_123:.*]] = add i32 %tile_origin.0, 0 -// CHECK: %[[VAL_124:.*]] = add i32 %tile_origin.1, %[[VAL_46]] -// CHECK: %[[VAL_125:.*]] = add i32 %tile_origin.2, %[[VAL_113]] -// CHECK: %[[VAL_126:.*]] = add i32 %tile_origin.3, %[[VAL_119]] -// CHECK: %[[VAL_127:.*]] = mul nuw nsw i32 %[[VAL_126]], 1 -// CHECK: %[[VAL_128:.*]] = add nuw nsw i32 0, %[[VAL_127]] -// CHECK: %[[VAL_129:.*]] = mul nuw nsw i32 %[[VAL_125]], 2 -// CHECK: %[[VAL_130:.*]] = add nuw nsw i32 %[[VAL_128]], %[[VAL_129]] -// CHECK: %[[VAL_131:.*]] = udiv i32 %[[VAL_130]], 1024 -// CHECK: %[[VAL_132:.*]] = mul nuw nsw i32 %[[VAL_124]], 1 -// CHECK: %[[VAL_133:.*]] = add nuw nsw i32 0, %[[VAL_132]] -// CHECK: %[[VAL_134:.*]] = udiv i32 %[[VAL_133]], 131072 -// CHECK: %[[VAL_135:.*]] = mul nuw nsw i32 %[[VAL_123]], 1 -// CHECK: %[[VAL_136:.*]] = add nuw nsw i32 0, %[[VAL_135]] -// CHECK: %[[VAL_137:.*]] = getelementptr inbounds [131072 x [1024 x float]], ptr %[[VAL_103]], i32 0, i32 %[[VAL_133]], i32 %[[VAL_130]] -// CHECK: %[[VAL_138:.*]] = load float, ptr %[[VAL_137]], align 4, !invariant.load !7 -// CHECK: store float %[[VAL_138]], ptr %[[VAL_29]], align 4 -// CHECK: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_29]], ptr %[[VAL_21]]) -// CHECK: %[[VAL_139:.*]] = load float, ptr %[[VAL_21]], align 4 -// CHECK: store float %[[VAL_139]], ptr %[[VAL_28]], align 4 -// CHECK: br label %[[VAL_117]], !llvm.loop !12 -// CHECK: loop3.loop_exit10: ; preds = %[[VAL_117]] -// CHECK: br label %[[VAL_107]] + +// CHECK-PTX: loop3.loop_header11: ; preds = %[[VAL_118:.*]], %[[VAL_115]] +// CHECK-PTX: %[[VAL_119:.*]] = load i32, ptr %[[VAL_22]], 
align 4 +// CHECK-PTX: %[[VAL_120:.*]] = icmp uge i32 %[[VAL_119]], 2 +// CHECK-PTX: br i1 %[[VAL_120]], label %[[VAL_116]], label %[[VAL_118]] + +// CHECK-PTX: loop3.loop_body12: ; preds = %[[VAL_117]] +// CHECK-PTX: %[[VAL_121:.*]] = add nuw nsw i32 %[[VAL_119]], 1 +// CHECK-PTX: store i32 %[[VAL_121]], ptr %[[VAL_22]], align 4 +// CHECK-PTX: %[[VAL_123:.*]] = add i32 %tile_origin.0, 0 +// CHECK-PTX: %[[VAL_124:.*]] = add i32 %tile_origin.1, %[[VAL_46]] +// CHECK-PTX: %[[VAL_125:.*]] = add i32 %tile_origin.2, %[[VAL_113]] +// CHECK-PTX: %[[VAL_126:.*]] = add i32 %tile_origin.3, %[[VAL_119]] +// CHECK-PTX: %[[VAL_127:.*]] = mul nuw nsw i32 %[[VAL_126]], 1 +// CHECK-PTX: %[[VAL_128:.*]] = add nuw nsw i32 0, %[[VAL_127]] +// CHECK-PTX: %[[VAL_129:.*]] = mul nuw nsw i32 %[[VAL_125]], 2 +// CHECK-PTX: %[[VAL_130:.*]] = add nuw nsw i32 %[[VAL_128]], %[[VAL_129]] +// CHECK-PTX: %[[VAL_131:.*]] = udiv i32 %[[VAL_130]], 1024 +// CHECK-PTX: %[[VAL_132:.*]] = mul nuw nsw i32 %[[VAL_124]], 1 +// CHECK-PTX: %[[VAL_133:.*]] = add nuw nsw i32 0, %[[VAL_132]] +// CHECK-PTX: %[[VAL_134:.*]] = udiv i32 %[[VAL_133]], 131072 +// CHECK-PTX: %[[VAL_135:.*]] = mul nuw nsw i32 %[[VAL_123]], 1 +// CHECK-PTX: %[[VAL_136:.*]] = add nuw nsw i32 0, %[[VAL_135]] +// CHECK-PTX: %[[VAL_137:.*]] = getelementptr inbounds [131072 x [1024 x float]], ptr %[[VAL_103]], i32 0, i32 %[[VAL_133]], i32 %[[VAL_130]] +// CHECK-PTX: %[[VAL_138:.*]] = load float, ptr %[[VAL_137]], align 4, !invariant.load !7 +// CHECK-PTX: store float %[[VAL_138]], ptr %[[VAL_29]], align 4 +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_28]], ptr %[[VAL_29]], ptr %[[VAL_21]]) +// CHECK-PTX: %[[VAL_139:.*]] = load float, ptr %[[VAL_21]], align 4 +// CHECK-PTX: store float %[[VAL_139]], ptr %[[VAL_28]], align 4 +// CHECK-PTX: br label %[[VAL_117]], !llvm.loop !12 + +// CHECK-PTX: loop3.loop_exit10: ; preds = %[[VAL_117]] +// CHECK-PTX: br label %[[VAL_107]] + // CHECK: thread_in_bounds-true: ; preds = %[[VAL_48]] // CHECK: 
%[[VAL_140:.*]] = icmp eq i32 %lane_id, 0 // CHECK: br i1 %[[VAL_140]], label %[[VAL_141:.*]], label %[[VAL_142:.*]] + // CHECK: intra_warp_reduce_write-after: ; preds = %[[VAL_141]], %thread_in_bounds-true -// CHECK: call void @llvm.nvvm.barrier0() +// CHECK-PTX: call void @llvm.nvvm.barrier0() +// CHECK-GCN: fence syncscope("workgroup") seq_cst +// CHECK-GCN: call void @llvm.amdgcn.s.barrier() // CHECK: %[[VAL_143:.*]] = icmp eq i32 %[[VAL_71]], 0 // CHECK: br i1 %[[VAL_143]], label %[[VAL_144:.*]], label %[[VAL_73]] + // CHECK: inter_warp_reduce-after: ; preds = %[[VAL_145:.*]], %[[VAL_142]] // CHECK: br label %thread_in_bounds-after + // CHECK: intra_warp_reduce_write-true: ; preds = %thread_in_bounds-true -// CHECK: %[[VAL_146:.*]] = load float, ptr %[[VAL_28]], align 4 +// CHECK: %[[VAL_146:.*]] = load float, ptr{{.*}} %[[VAL_28]], align 4 // CHECK: %[[VAL_147:.*]] = getelementptr inbounds [4 x [2 x float]], ptr addrspace(3) @shared_cache, i32 0, i32 %thread.id.1, i32 %[[VAL_71]] // CHECK: %[[VAL_148:.*]] = addrspacecast ptr addrspace(3) %[[VAL_147]] to ptr // CHECK: store float %[[VAL_146]], ptr %[[VAL_148]], align 4 // CHECK: br label %[[VAL_142]] + // CHECK: inter_warp_reduce-true: ; preds = %[[VAL_142]] // CHECK: %[[VAL_149:.*]] = getelementptr inbounds [4 x [2 x float]], ptr addrspace(3) @shared_cache, i32 0, i32 %thread.id.1, i32 %lane_id // CHECK: %[[VAL_150:.*]] = addrspacecast ptr addrspace(3) %[[VAL_149]] to ptr -// CHECK: store float %[[VAL_35]], ptr %[[VAL_10]], align 4 +// CHECK-GCN: %[[VAL_150_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_10]] to ptr +// CHECK-GCN: store float %[[VAL_35]], ptr %[[VAL_150_1]], align 4 +// CHECK-PTX: store float %[[VAL_35]], ptr %[[VAL_10]], align 4 // CHECK: %[[VAL_151:.*]] = icmp ult i32 %thread.id.2, 2 -// CHECK: %[[VAL_152:.*]] = select i1 %[[VAL_151]], ptr %[[VAL_150]], ptr %[[VAL_10]] +// CHECK-GCN: %[[VAL_152:.*]] = select i1 %[[VAL_151]], ptr %[[VAL_150]], ptr %[[VAL_150_1]] +// CHECK-PTX: %[[VAL_152:.*]] 
= select i1 %[[VAL_151]], ptr %[[VAL_150]], ptr %[[VAL_10]] // CHECK: %[[VAL_153:.*]] = load float, ptr %[[VAL_152]], align 4 -// CHECK: %[[VAL_154:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_153]], i32 16, i32 31) -// CHECK: store float %[[VAL_154]], ptr %[[VAL_9]], align 4 -// CHECK: call void @[[SUM]](ptr %[[VAL_152]], ptr %[[VAL_9]], ptr %[[VAL_8]]) -// CHECK: %[[VAL_155:.*]] = load float, ptr %[[VAL_8]], align 4 +// CHECK-GCN: %[[VAL_154_1:.*]] = bitcast float %[[VAL_153]] to i32 +// CHECK-GCN: %[[VAL_154_2:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_154_1]], i32 16) +// CHECK-GCN: %[[VAL_154:.*]] = bitcast i32 %[[VAL_154_2]] to float +// CHECK-PTX: %[[VAL_154:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_153]], i32 16, i32 31) +// CHECK: store float %[[VAL_154]], ptr{{.*}} %[[VAL_9]], align 4 +// CHECK-GCN: %[[VAL_155_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_9]] to ptr +// CHECK-GCN: %[[VAL_155_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_8]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_152]], ptr %[[VAL_155_1]], ptr %[[VAL_155_2]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_152]], ptr %[[VAL_9]], ptr %[[VAL_8]]) +// CHECK: %[[VAL_155:.*]] = load float, ptr{{.*}} %[[VAL_8]], align 4 // CHECK: store float %[[VAL_155]], ptr %[[VAL_152]], align 4 // CHECK: %[[VAL_156:.*]] = load float, ptr %[[VAL_152]], align 4 -// CHECK: %[[VAL_157:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_156]], i32 8, i32 31) -// CHECK: store float %[[VAL_157]], ptr %[[VAL_7]], align 4 -// CHECK: call void @[[SUM]](ptr %[[VAL_152]], ptr %[[VAL_7]], ptr %[[VAL_6]]) -// CHECK: %[[VAL_158:.*]] = load float, ptr %[[VAL_6]], align 4 +// CHECK-GCN: %[[VAL_157_1:.*]] = bitcast float %[[VAL_156]] to i32 +// CHECK-GCN: %[[VAL_157_2:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_157_1]], i32 8) +// CHECK-GCN: %[[VAL_157:.*]] = bitcast i32 %[[VAL_157_2]] to float +// CHECK-PTX: %[[VAL_157:.*]] = call 
float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_156]], i32 8, i32 31) +// CHECK: store float %[[VAL_157]], ptr{{.*}} %[[VAL_7]], align 4 +// CHECK-GCN: %[[VAL_158_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_7]] to ptr +// CHECK-GCN: %[[VAL_158_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_6]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_152]], ptr %[[VAL_158_1]], ptr %[[VAL_158_2]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_152]], ptr %[[VAL_7]], ptr %[[VAL_6]]) +// CHECK: %[[VAL_158:.*]] = load float, ptr{{.*}} %[[VAL_6]], align 4 // CHECK: store float %[[VAL_158]], ptr %[[VAL_152]], align 4 // CHECK: %[[VAL_159:.*]] = load float, ptr %[[VAL_152]], align 4 -// CHECK: %[[VAL_160:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_159]], i32 4, i32 31) -// CHECK: store float %[[VAL_160]], ptr %[[VAL_5]], align 4 -// CHECK: call void @[[SUM]](ptr %[[VAL_152]], ptr %[[VAL_5]], ptr %[[VAL_4]]) -// CHECK: %[[VAL_161:.*]] = load float, ptr %[[VAL_4]], align 4 +// CHECK-GCN: %[[VAL_160_1:.*]] = bitcast float %[[VAL_159]] to i32 +// CHECK-GCN: %[[VAL_160_2:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_160_1]], i32 4) +// CHECK-GCN: %[[VAL_160:.*]] = bitcast i32 %[[VAL_160_2]] to float +// CHECK-PTX: %[[VAL_160:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_159]], i32 4, i32 31) +// CHECK: store float %[[VAL_160]], ptr{{.*}} %[[VAL_5]], align 4 +// CHECK-GCN: %[[VAL_161_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_5]] to ptr +// CHECK-GCN: %[[VAL_161_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_4]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_152]], ptr %[[VAL_161_1]], ptr %[[VAL_161_2]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_152]], ptr %[[VAL_5]], ptr %[[VAL_4]]) +// CHECK: %[[VAL_161:.*]] = load float, ptr{{.*}} %[[VAL_4]], align 4 // CHECK: store float %[[VAL_161]], ptr %[[VAL_152]], align 4 // CHECK: %[[VAL_162:.*]] = load float, ptr %[[VAL_152]], align 4 -// CHECK: %[[VAL_163:.*]] = 
call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_162]], i32 2, i32 31) -// CHECK: store float %[[VAL_163]], ptr %[[VAL_3]], align 4 -// CHECK: call void @[[SUM]](ptr %[[VAL_152]], ptr %[[VAL_3]], ptr %[[VAL_2]]) -// CHECK: %[[VAL_164:.*]] = load float, ptr %[[VAL_2]], align 4 +// CHECK-GCN: %[[VAL_163_1:.*]] = bitcast float %[[VAL_162]] to i32 +// CHECK-GCN: %[[VAL_163_2:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_163_1]], i32 2) +// CHECK-GCN: %[[VAL_163:.*]] = bitcast i32 %[[VAL_163_2]] to float +// CHECK-PTX: %[[VAL_163:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_162]], i32 2, i32 31) +// CHECK: store float %[[VAL_163]], ptr{{.*}} %[[VAL_3]], align 4 +// CHECK-GCN: %[[VAL_164_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_3]] to ptr +// CHECK-GCN: %[[VAL_164_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_2]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_152]], ptr %[[VAL_164_1]], ptr %[[VAL_164_2]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_152]], ptr %[[VAL_3]], ptr %[[VAL_2]]) +// CHECK: %[[VAL_164:.*]] = load float, ptr{{.*}} %[[VAL_2]], align 4 // CHECK: store float %[[VAL_164]], ptr %[[VAL_152]], align 4 // CHECK: %[[VAL_165:.*]] = load float, ptr %[[VAL_152]], align 4 -// CHECK: %[[VAL_166:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_165]], i32 1, i32 31) -// CHECK: store float %[[VAL_166]], ptr %[[VAL_1]], align 4 -// CHECK: call void @[[SUM]](ptr %[[VAL_152]], ptr %[[VAL_1]], ptr %[[VAL_0]]) -// CHECK: %[[VAL_167:.*]] = load float, ptr %[[VAL_0]], align 4 +// CHECK-GCN: %[[VAL_166_1:.*]] = bitcast float %[[VAL_165]] to i32 +// CHECK-GCN: %[[VAL_166_2:.*]] = call i32 @__ockl_readuplane_i32(i32 %[[VAL_166_1]], i32 1) +// CHECK-GCN: %[[VAL_166:.*]] = bitcast i32 %[[VAL_166_2]] to float +// CHECK-PTX: %[[VAL_166:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_165]], i32 1, i32 31) +// CHECK: store float %[[VAL_166]], ptr{{.*}} %[[VAL_1]], align 4 +// CHECK-GCN: 
%[[VAL_167_1:.*]] = addrspacecast ptr addrspace(5) %[[VAL_1]] to ptr +// CHECK-GCN: %[[VAL_167_2:.*]] = addrspacecast ptr addrspace(5) %[[VAL_0]] to ptr +// CHECK-GCN: call void @[[SUM]](ptr %[[VAL_152]], ptr %[[VAL_167_1]], ptr %[[VAL_167_2]]) +// CHECK-PTX: call void @[[SUM]](ptr %[[VAL_152]], ptr %[[VAL_1]], ptr %[[VAL_0]]) +// CHECK: %[[VAL_167:.*]] = load float, ptr{{.*}} %[[VAL_0]], align 4 // CHECK: store float %[[VAL_167]], ptr %[[VAL_152]], align 4 // CHECK: %[[VAL_168:.*]] = icmp eq i32 %thread.id.2, 0 // CHECK: br i1 %[[VAL_168]], label %[[VAL_169:.*]], label %[[VAL_145]] + // CHECK: reduction_write_output-after: ; preds = %[[VAL_169]], %[[VAL_144]] // CHECK: br label %[[VAL_73]] // CHECK: reduction_write_output-true: ; preds = %[[VAL_144]] @@ -289,7 +413,7 @@ ENTRY reduce.1 { // CHECK: %[[VAL_179:.*]] = load float, ptr %[[VAL_180:.*]], align 4 // CHECK: %[[VAL_181:.*]] = load float, ptr %[[VAL_182:.*]], align 4 // CHECK: %[[VAL_183:.*]] = fadd float %[[VAL_179]], %[[VAL_181]] -// CHECK: store float %[[VAL_183]], ptr %[[VAL_178]], align 4 -// CHECK: %[[VAL_184:.*]] = load float, ptr %[[VAL_178]], align 4 +// CHECK: store float %[[VAL_183]], ptr{{.*}} %[[VAL_178]], align 4 +// CHECK: %[[VAL_184:.*]] = load float, ptr{{.*}} %[[VAL_178]], align 4 // CHECK: store float %[[VAL_184]], ptr %[[VAL_185:.*]], align 4 // CHECK: ret void