Unverified commit b85c8e03, authored by Qi Li, committed by GitHub

[ROCM] fix reduce op, test=develop (#31478)

Parent: 39a5424e
@@ -82,7 +82,7 @@ __device__ __forceinline__ void PrintNanInfKernel(const T* value,
   }
   __syncthreads;
-#ifdef PADDLE_WITH_HIP
+#ifdef __HIPCC__
   if (true && hipThreadIdx_x == 0) {
     printf("In block %d, there has %u,%u,%u nan,inf,num\n", hipBlockIdx_x,
            nan_count, inf_count, num_count);
@@ -156,7 +156,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
             "op_var2gpu_str, but now failed",
             op_var));
-#ifdef PADDLE_WITH_HIP
+#ifdef __HIPCC__
     PADDLE_ENFORCE_CUDA_SUCCESS(
         hipMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
                        hipMemcpyHostToDevice, dev_ctx->stream()));
@@ -176,11 +176,16 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
     }
   }
+#ifdef __HIPCC__
+  // HIP will throw GPU memory access fault if threads > 256
+  const size_t threads = 256;
+#else
   const size_t threads = 1024;
+#endif
   size_t blocks =
       std::min(static_cast<size_t>(128),
                static_cast<size_t>((tensor_.numel() + threads - 1) / threads));
-#ifdef PADDLE_WITH_HIP
+#ifdef __HIPCC__
   hipLaunchKernelGGL(CheckNanInfKernel, dim3(blocks), dim3(threads), 0,
                      dev_ctx->stream(), tensor_.data<T>(), tensor_.numel(),
                      print_num, gpu_str_ptr);
...
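Side note on the hunk above: under HIP the per-block thread count is capped at 256 because larger blocks triggered GPU memory access faults, while CUDA builds keep 1024. A minimal stand-alone sketch of that launch-size selection, with hypothetical names (`LaunchDims`, `PickLaunchDims`) rather than the Paddle source, would look like this:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>

// Hypothetical helper (not Paddle's API) mirroring the #ifdef above: pick the
// (blocks, threads) pair for a 1-D kernel such as CheckNanInfKernel.
struct LaunchDims {
  std::size_t blocks;
  std::size_t threads;
};

inline LaunchDims PickLaunchDims(std::int64_t numel) {
#ifdef __HIPCC__
  // HIP reportedly faults with more than 256 threads per block here.
  const std::size_t threads = 256;
#else
  const std::size_t threads = 1024;  // CUDA keeps the usual upper bound
#endif
  const std::size_t blocks =
      std::min(static_cast<std::size_t>(128),
               static_cast<std::size_t>((numel + threads - 1) / threads));
  return {blocks, threads};
}
```

The same 256-thread cap is documented for the generic launch helper in gpu_launch_config.h further down in this commit.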
@@ -675,7 +675,7 @@ void Reducer::MarkGroupReady(size_t group_index) {
         cv_.notify_all();
       }
     });
-#elif defined(PADDLE_WITH_NCCL)
+#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
     FusedAllReduceSchedule(run_order, group);
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
...
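The reducer change only widens a preprocessor guard: the fused all-reduce path is now compiled in when Paddle is built with either NCCL (CUDA) or RCCL (ROCm). A hedged, stand-alone illustration of that dispatch pattern follows; only the macros come from the diff, and `ScheduleFusedAllReduce` is a made-up name:

```cpp
#include <stdexcept>

// Illustration only: compile the collective path when either backend's
// communication library is available, otherwise fail loudly at runtime.
void ScheduleFusedAllReduce() {
#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
  // ... enqueue the fused all-reduce on the communication stream ...
#else
  throw std::runtime_error(
      "Fused all-reduce requires a build with NCCL (CUDA) or RCCL (ROCm).");
#endif
}
```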
@@ -14,6 +14,17 @@
 #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
 
+#ifdef __HIPCC__
+// Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922
+// do not support double in HIPCC platform (Eigen3 to be fixed)
+REGISTER_OP_CUDA_KERNEL(reduce_prod,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::ProdFunctor>);
+#else
 REGISTER_OP_CUDA_KERNEL(reduce_prod,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::ProdFunctor>,
@@ -23,3 +34,4 @@ REGISTER_OP_CUDA_KERNEL(reduce_prod,
                                           int, ops::ProdFunctor>,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           int64_t, ops::ProdFunctor>);
+#endif
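Net effect of this registration change: ROCm (HIPCC) builds register reduce_prod GPU kernels only for float, int, and int64_t, since Eigen's GPU tensor reduction does not yet support double there; other builds keep the double kernel. A compact sketch of the same intent, using a hypothetical type list rather than Paddle's registration macro:

```cpp
#include <cstdint>
#include <tuple>

// Hypothetical type list (ReduceProdGpuTypes is not Paddle's API): double is
// dropped from the reduce_prod GPU kernel types when compiling with HIPCC,
// because of the Eigen limitation noted in the comment above.
#ifdef __HIPCC__
using ReduceProdGpuTypes = std::tuple<float, int, std::int64_t>;
#else
using ReduceProdGpuTypes = std::tuple<float, double, int, std::int64_t>;
#endif
```

This is also why the unit tests at the end of the commit switch the reduce_prod input dtype to float32 when core.is_compiled_with_rocm() is true.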
@@ -42,6 +42,7 @@ struct GpuLaunchConfig {
 inline GpuLaunchConfig GetGpuLaunchConfig1D(
     const platform::CUDADeviceContext& context, int element_count,
 #ifdef PADDLE_WITH_HIP
+    // HIP will throw GPU memory access fault if threads > 256
     int max_threads = 256) {
 #else
     int max_threads = 1024) {
...
@@ -156,9 +156,14 @@ class TestMin8DOp(OpTest):
 class TestProdOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_prod"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.init_data_type()
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.data_type)}
         self.outputs = {'Out': self.inputs['X'].prod(axis=0)}
 
+    def init_data_type(self):
+        self.data_type = "float32" if core.is_compiled_with_rocm(
+        ) else "float64"
+
     def test_check_output(self):
         self.check_output()
 
@@ -169,14 +174,19 @@ class TestProdOp(OpTest):
 class TestProd6DOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_prod"
+        self.init_data_type()
         self.inputs = {
-            'X': np.random.random((5, 6, 2, 3, 4, 2)).astype("float64")
+            'X': np.random.random((5, 6, 2, 3, 4, 2)).astype(self.data_type)
         }
         self.attrs = {'dim': [2, 3, 4]}
         self.outputs = {
             'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim']))
         }
 
+    def init_data_type(self):
+        self.data_type = "float32" if core.is_compiled_with_rocm(
+        ) else "float64"
+
     def test_check_output(self):
         self.check_output()
 
@@ -187,14 +197,20 @@ class TestProd6DOp(OpTest):
 class TestProd8DOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_prod"
+        self.init_data_type()
         self.inputs = {
-            'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64")
+            'X': np.random.random(
+                (2, 5, 3, 2, 2, 3, 4, 2)).astype(self.data_type)
         }
         self.attrs = {'dim': [2, 3, 4]}
         self.outputs = {
             'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim']))
         }
 
+    def init_data_type(self):
+        self.data_type = "float32" if core.is_compiled_with_rocm(
+        ) else "float64"
+
     def test_check_output(self):
         self.check_output()
...