From b85c8e03befe0152db6079daf54a92338604fc4f Mon Sep 17 00:00:00 2001
From: Qi Li
Date: Tue, 9 Mar 2021 11:02:08 +0800
Subject: [PATCH] [ROCM] fix reduce op, test=develop (#31478)

---
 .../framework/details/nan_inf_utils_detail.cu | 11 +++++++---
 paddle/fluid/imperative/reducer.cc            |  2 +-
 .../operators/reduce_ops/reduce_prod_op.cu    | 12 ++++++++++
 paddle/fluid/platform/gpu_launch_config.h     |  1 +
 .../fluid/tests/unittests/test_reduce_op.py   | 22 ++++++++++++++++---
 5 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
index 55261cf7cde..96d1a9fb949 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
@@ -82,7 +82,7 @@ __device__ __forceinline__ void PrintNanInfKernel(const T* value,
   }
   __syncthreads;
 
-#ifdef PADDLE_WITH_HIP
+#ifdef __HIPCC__
   if (true && hipThreadIdx_x == 0) {
     printf("In block %d, there has %u,%u,%u nan,inf,num\n", hipBlockIdx_x,
            nan_count, inf_count, num_count);
@@ -156,7 +156,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
                                         "op_var2gpu_str, but now failed",
                                         op_var));
 
-#ifdef PADDLE_WITH_HIP
+#ifdef __HIPCC__
       PADDLE_ENFORCE_CUDA_SUCCESS(
           hipMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
                          hipMemcpyHostToDevice, dev_ctx->stream()));
@@ -176,11 +176,16 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
     }
   }
 
+#ifdef __HIPCC__
+  // HIP will throw GPU memory access fault if threads > 256
+  const size_t threads = 256;
+#else
   const size_t threads = 1024;
+#endif
   size_t blocks =
       std::min(static_cast<size_t>(128),
               static_cast<size_t>((tensor_.numel() + threads - 1) / threads));
-#ifdef PADDLE_WITH_HIP
+#ifdef __HIPCC__
   hipLaunchKernelGGL(CheckNanInfKernel, dim3(blocks), dim3(threads), 0,
                      dev_ctx->stream(), tensor_.data<T>(), tensor_.numel(),
                      print_num, gpu_str_ptr);
diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index 5dd7e2d8213..e8b531d35ca 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -675,7 +675,7 @@ void Reducer::MarkGroupReady(size_t group_index) {
         cv_.notify_all();
       }
     });
-#elif defined(PADDLE_WITH_NCCL)
+#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
     FusedAllReduceSchedule(run_order, group);
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
index 4434937f753..44e76c78b1f 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
@@ -14,6 +14,17 @@
 
 #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
 
+#ifdef __HIPCC__
+// Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922
+// do not support double in HIPCC platform (Eigen3 to be fixed)
+REGISTER_OP_CUDA_KERNEL(reduce_prod,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::ProdFunctor>);
+#else
 REGISTER_OP_CUDA_KERNEL(reduce_prod,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::ProdFunctor>,
@@ -23,3 +34,4 @@ REGISTER_OP_CUDA_KERNEL(reduce_prod,
                                           int, ops::ProdFunctor>,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           int64_t, ops::ProdFunctor>);
+#endif
diff --git a/paddle/fluid/platform/gpu_launch_config.h b/paddle/fluid/platform/gpu_launch_config.h
index e94bf6d89da..6c265677d63 100644
--- a/paddle/fluid/platform/gpu_launch_config.h
+++ b/paddle/fluid/platform/gpu_launch_config.h
@@ -42,6 +42,7 @@ struct GpuLaunchConfig {
 inline GpuLaunchConfig GetGpuLaunchConfig1D(
     const platform::CUDADeviceContext& context, int element_count,
 #ifdef PADDLE_WITH_HIP
+    // HIP will throw GPU memory access fault if threads > 256
     int max_threads = 256) {
 #else
     int max_threads = 1024) {
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index e549a2eca2d..912df563fcd 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -156,9 +156,14 @@ class TestMin8DOp(OpTest):
 class TestProdOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_prod"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.init_data_type()
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.data_type)}
         self.outputs = {'Out': self.inputs['X'].prod(axis=0)}
 
+    def init_data_type(self):
+        self.data_type = "float32" if core.is_compiled_with_rocm(
+        ) else "float64"
+
     def test_check_output(self):
         self.check_output()
 
@@ -169,14 +174,19 @@ class TestProdOp(OpTest):
 class TestProd6DOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_prod"
+        self.init_data_type()
         self.inputs = {
-            'X': np.random.random((5, 6, 2, 3, 4, 2)).astype("float64")
+            'X': np.random.random((5, 6, 2, 3, 4, 2)).astype(self.data_type)
         }
         self.attrs = {'dim': [2, 3, 4]}
         self.outputs = {
             'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim']))
         }
 
+    def init_data_type(self):
+        self.data_type = "float32" if core.is_compiled_with_rocm(
+        ) else "float64"
+
     def test_check_output(self):
         self.check_output()
 
@@ -187,14 +197,20 @@ class TestProd8DOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_prod"
+        self.init_data_type()
         self.inputs = {
-            'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64")
+            'X': np.random.random(
+                (2, 5, 3, 2, 2, 3, 4, 2)).astype(self.data_type)
         }
         self.attrs = {'dim': [2, 3, 4]}
         self.outputs = {
             'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim']))
         }
 
+    def init_data_type(self):
+        self.data_type = "float32" if core.is_compiled_with_rocm(
+        ) else "float64"
+
     def test_check_output(self):
         self.check_output()
-- 
GitLab
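
Note (not part of the patch): a minimal sketch of the dtype selection the updated tests rely on, assuming a Paddle 2.x build where paddle.prod, paddle.to_tensor, and paddle.fluid.core are available. ROCm builds fall back to float32 because the Eigen GPU reduction path used by reduce_prod does not support double under HIP.

    import numpy as np
    import paddle
    from paddle.fluid import core

    # Same selection as init_data_type() in the tests above: skip float64 on ROCm.
    dtype = "float32" if core.is_compiled_with_rocm() else "float64"

    x = paddle.to_tensor(np.random.random((5, 6, 10)).astype(dtype))
    out = paddle.prod(x, axis=0)  # dispatches to the reduce_prod kernel (GPU kernel on CUDA/ROCm builds)
    print(out.dtype, out.shape)   # keeps the input dtype, shape [6, 10]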