Merge pull request #57979 from ROCmSoftwarePlatform:fixed_gpu_kernel_tiling_test_2

PiperOrigin-RevId: 480958124

Merge pull request #57979 from ROCmSoftwarePlatform:fixed_gpu_kernel_tiling_test_2
PiperOrigin-RevId: 480958124
df1d9f90 · TensorFlower Gardener · f7d6896e · 46731284 · df1d9f90 · df1d9f90
2 changed files
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
@@ -68,8 +68,17 @@ std::string GpuCodegenTest::MakePlatformSpecificLlvm(absl::string_view input) {
        is_built_with_rocm_ ? "@llvm.amdgcn.s.barrier" : "@llvm.nvvm.barrier0"},
       {"SHUFFLE", is_built_with_rocm_ ? "i32 @llvm.amdgcn.ds.bpermute"
                                       : "float @llvm.nvvm.shfl.sync.down.f32"},
-       {"TIDX", is_built_with_rocm_ ? "llvm.amdgcn.workitem.id.x"
-                                    : "@llvm.nvvm.read.ptx.sreg.tid.x"}});
+       {"TIDX", is_built_with_rocm_ ? "@llvm.amdgcn.workitem.id.x"
+                                    : "@llvm.nvvm.read.ptx.sreg.tid.x"},
+       {"LCAL", is_built_with_rocm_ ? "%[[LOGICAL_T1:.*]] = call { i1, i64 } "
+                                      "@llvm.amdgcn.if.i64(i1 %[[LOGICAL_T0]])"
+                                    : "0"},
+       {"EXTV",
+        is_built_with_rocm_
+            ? "%[[LOGICAL_T2:.*]] = extractvalue { i1, i64 } %[[LOGICAL_T1]], 0"
+            : "0"},
+       {"BR_CAL", is_built_with_rocm_ ? "br i1 %[[LOGICAL_T2]],"
+                                      : "br i1 %[[LOGICAL_T0]]"}});
 }

 }  // namespace gpu

--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
@@ -535,7 +535,9 @@ TEST_F(GpuKernelTilingTest, RowReductionTwoRowsPerWarp) {
 ; CHECK: %[[TID_LOGICAL:.*]] = and i32 %[[TID_X]], 15
 ; CHECK: call SHUFFLE
 ; CHECK: %[[LOGICAL_T0:.*]] = icmp eq i32 %[[TID_LOGICAL]], 0
-; CHECK: br i1 %[[LOGICAL_T0]],
+; CHECK: LCAL
+; CHECK: EXTV
+; CHECK: BR_CAL
 )";
  CompileAndVerifyIr(std::move(hlo_module),
                     MakePlatformSpecificLlvm(expected_ir),
@@ -572,8 +574,11 @@ TEST_F(GpuKernelTilingTest, RowReductionFourRowsPerWarp) {
 ; CHECK: %[[TID_LOGICAL:.*]] = and i32 %[[TID_X]], 7
 ; CHECK: call SHUFFLE
 ; CHECK: %[[LOGICAL_T0:.*]] = icmp eq i32 %[[TID_LOGICAL]], 0
-; CHECK: br i1 %[[LOGICAL_T0]],
+; CHECK: LCAL
+; CHECK: EXTV
+; CHECK: BR_CAL
 )";
+
  CompileAndVerifyIr(std::move(hlo_module),
                     MakePlatformSpecificLlvm(expected_ir),
                     /*match_optimized_ir=*/true);