[ROCM] fix reduce op, test=develop (#31478)

b85c8e03 · Qi Li · GitHub · 39a5424e · b85c8e03 · b85c8e03
5 changed file
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
@@ -82,7 +82,7 @@ __device__ __forceinline__ void PrintNanInfKernel(const T* value,
  }
  __syncthreads;

-#ifdef PADDLE_WITH_HIP
+#ifdef __HIPCC__
  if (true && hipThreadIdx_x == 0) {
    printf("In block %d, there has %u,%u,%u nan,inf,num\n", hipBlockIdx_x,
           nan_count, inf_count, num_count);
@@ -156,7 +156,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
                            "op_var2gpu_str, but now failed",
                            op_var));

-#ifdef PADDLE_WITH_HIP
+#ifdef __HIPCC__
      PADDLE_ENFORCE_CUDA_SUCCESS(
          hipMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
                         hipMemcpyHostToDevice, dev_ctx->stream()));
@@ -176,11 +176,16 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
    }
  }

+#ifdef __HIPCC__
+  // HIP will throw GPU memory access fault if threads > 256
+  const size_t threads = 256;
+#else
  const size_t threads = 1024;
+#endif
  size_t blocks =
      std::min(static_cast<size_t>(128),
               static_cast<size_t>((tensor_.numel() + threads - 1) / threads));
-#ifdef PADDLE_WITH_HIP
+#ifdef __HIPCC__
  hipLaunchKernelGGL(CheckNanInfKernel, dim3(blocks), dim3(threads), 0,
                     dev_ctx->stream(), tensor_.data<T>(), tensor_.numel(),
                     print_num, gpu_str_ptr);

--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -675,7 +675,7 @@ void Reducer::MarkGroupReady(size_t group_index) {
        cv_.notify_all();
      }
    });
-#elif defined(PADDLE_WITH_NCCL)
+#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
    FusedAllReduceSchedule(run_order, group);
 #else
    PADDLE_THROW(platform::errors::PreconditionNotMet(

--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
@@ -14,6 +14,17 @@

 #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"

+#ifdef __HIPCC__
+// Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922
+// do not support double in HIPCC platform (Eigen3 to be fixed)
+REGISTER_OP_CUDA_KERNEL(reduce_prod,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::ProdFunctor>);
+#else
 REGISTER_OP_CUDA_KERNEL(reduce_prod,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          float, ops::ProdFunctor>,
@@ -23,3 +34,4 @@ REGISTER_OP_CUDA_KERNEL(reduce_prod,
                                          int, ops::ProdFunctor>,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          int64_t, ops::ProdFunctor>);
+#endif
--- a/paddle/fluid/platform/gpu_launch_config.h
+++ b/paddle/fluid/platform/gpu_launch_config.h
@@ -42,6 +42,7 @@ struct GpuLaunchConfig {
 inline GpuLaunchConfig GetGpuLaunchConfig1D(
    const platform::CUDADeviceContext& context, int element_count,
 #ifdef PADDLE_WITH_HIP
+    // HIP will throw GPU memory access fault if threads > 256
    int max_threads = 256) {
 #else
    int max_threads = 1024) {

--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -156,9 +156,14 @@ class TestMin8DOp(OpTest):
 class TestProdOp(OpTest):
    def setUp(self):
        self.op_type = "reduce_prod"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.init_data_type()
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.data_type)}
        self.outputs = {'Out': self.inputs['X'].prod(axis=0)}

+    def init_data_type(self):
+        self.data_type = "float32" if core.is_compiled_with_rocm(
+        ) else "float64"
+
    def test_check_output(self):
        self.check_output()

@@ -169,14 +174,19 @@ class TestProdOp(OpTest):
 class TestProd6DOp(OpTest):
    def setUp(self):
        self.op_type = "reduce_prod"
+        self.init_data_type()
        self.inputs = {
-            'X': np.random.random((5, 6, 2, 3, 4, 2)).astype("float64")
+            'X': np.random.random((5, 6, 2, 3, 4, 2)).astype(self.data_type)
        }
        self.attrs = {'dim': [2, 3, 4]}
        self.outputs = {
            'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim']))
        }

+    def init_data_type(self):
+        self.data_type = "float32" if core.is_compiled_with_rocm(
+        ) else "float64"
+
    def test_check_output(self):
        self.check_output()

@@ -187,14 +197,20 @@ class TestProd6DOp(OpTest):
 class TestProd8DOp(OpTest):
    def setUp(self):
        self.op_type = "reduce_prod"
+        self.init_data_type()
        self.inputs = {
-            'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64")
+            'X': np.random.random(
+                (2, 5, 3, 2, 2, 3, 4, 2)).astype(self.data_type)
        }
        self.attrs = {'dim': [2, 3, 4]}
        self.outputs = {
            'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim']))
        }

+    def init_data_type(self):
+        self.data_type = "float32" if core.is_compiled_with_rocm(
+        ) else "float64"
+
    def test_check_output(self):
        self.check_output()