Unverified commit b85c8e03 authored by Qi Li, committed by GitHub

[ROCM] fix reduce op, test=develop (#31478)

Parent 39a5424e
@@ -82,7 +82,7 @@ __device__ __forceinline__ void PrintNanInfKernel(const T* value,
}
__syncthreads;
-#ifdef PADDLE_WITH_HIP
+#ifdef __HIPCC__
if (true && hipThreadIdx_x == 0) {
printf("In block %d, there has %u,%u,%u nan,inf,num\n", hipBlockIdx_x,
nan_count, inf_count, num_count);
@@ -156,7 +156,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
"op_var2gpu_str, but now failed",
op_var));
-#ifdef PADDLE_WITH_HIP
+#ifdef __HIPCC__
PADDLE_ENFORCE_CUDA_SUCCESS(
hipMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
hipMemcpyHostToDevice, dev_ctx->stream()));
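The two hunks above switch the device-side guard from the build flag PADDLE_WITH_HIP to the compiler macro __HIPCC__, so the HIP branch is taken exactly when hipcc is the compiler processing the .cu file. A minimal, self-contained sketch of the same guard pattern; the helper name and arguments are illustrative, not Paddle's API:

```cpp
#ifdef __HIPCC__
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif
#include <cstddef>

// Hypothetical helper: copy a host-side debug string to device memory using
// whichever runtime the active device compiler targets.
inline void CopyDebugStringToDevice(char* dst, const char* src, size_t len,
                                    void* raw_stream) {
#ifdef __HIPCC__
  // hipcc is compiling this translation unit: call the HIP runtime.
  hipMemcpyAsync(dst, src, len, hipMemcpyHostToDevice,
                 static_cast<hipStream_t>(raw_stream));
#else
  // nvcc compiles this branch and calls the CUDA runtime instead.
  cudaMemcpyAsync(dst, src, len, cudaMemcpyHostToDevice,
                  static_cast<cudaStream_t>(raw_stream));
#endif
}
```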
@@ -176,11 +176,16 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
}
}
+#ifdef __HIPCC__
+  // HIP will throw GPU memory access fault if threads > 256
+  const size_t threads = 256;
+#else
  const size_t threads = 1024;
+#endif
size_t blocks =
std::min(static_cast<size_t>(128),
static_cast<size_t>((tensor_.numel() + threads - 1) / threads));
-#ifdef PADDLE_WITH_HIP
+#ifdef __HIPCC__
hipLaunchKernelGGL(CheckNanInfKernel, dim3(blocks), dim3(threads), 0,
dev_ctx->stream(), tensor_.data<T>(), tensor_.numel(),
print_num, gpu_str_ptr);
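The hunk above also caps the nan/inf checking kernel at 256 threads per block on the HIP build (the added comment notes a GPU memory access fault above that), while the grid stays capped at 128 blocks. A short sketch of that launch-shape computation, with an illustrative function name:

```cpp
#include <algorithm>
#include <cstddef>

// Illustrative only: compute the (blocks, threads) pair used above for the
// nan/inf checking kernel launch.
struct LaunchShape {
  size_t blocks;
  size_t threads;
};

inline LaunchShape NanInfCheckLaunchShape(size_t numel) {
#ifdef __HIPCC__
  // HIP build: more than 256 threads per block reportedly faults here.
  const size_t threads = 256;
#else
  const size_t threads = 1024;
#endif
  const size_t blocks =
      std::min(static_cast<size_t>(128), (numel + threads - 1) / threads);
  return {blocks, threads};
}
```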
@@ -675,7 +675,7 @@ void Reducer::MarkGroupReady(size_t group_index) {
cv_.notify_all();
}
});
-#elif defined(PADDLE_WITH_NCCL)
+#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
FusedAllReduceSchedule(run_order, group);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
@@ -14,6 +14,17 @@
#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
+#ifdef __HIPCC__
+// Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922
+// do not support double in HIPCC platform (Eigen3 to be fixed)
+REGISTER_OP_CUDA_KERNEL(reduce_prod,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::ProdFunctor>);
+#else
REGISTER_OP_CUDA_KERNEL(reduce_prod,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::ProdFunctor>,
@@ -23,3 +34,4 @@ REGISTER_OP_CUDA_KERNEL(reduce_prod,
int, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::ProdFunctor>);
+#endif
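This hunk registers reduce_prod GPU kernels only for float, int, and int64_t when compiling with hipcc, because the Eigen GPU tensor-reduction path cited in the comment does not yet handle double there; the CUDA build keeps the double kernel. A hedged stand-in showing the same compile-time exclusion (the macro and type list below are illustrative, not Paddle's registration machinery):

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative stand-in for the registration above: the macro expands one
// "registration" per element type, and the HIP build simply leaves double out.
#ifdef __HIPCC__
#define REDUCE_PROD_GPU_TYPES(APPLY) APPLY(float) APPLY(int) APPLY(int64_t)
#else
#define REDUCE_PROD_GPU_TYPES(APPLY) \
  APPLY(float) APPLY(double) APPLY(int) APPLY(int64_t)
#endif

#define PRINT_REGISTERED(T) std::printf("reduce_prod registered for %s\n", #T);

int main() {
  // Prints one line per element type that would get a GPU kernel.
  REDUCE_PROD_GPU_TYPES(PRINT_REGISTERED)
  return 0;
}
```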
@@ -42,6 +42,7 @@ struct GpuLaunchConfig {
inline GpuLaunchConfig GetGpuLaunchConfig1D(
const platform::CUDADeviceContext& context, int element_count,
#ifdef PADDLE_WITH_HIP
+    // HIP will throw GPU memory access fault if threads > 256
int max_threads = 256) {
#else
int max_threads = 1024) {
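Here only a comment is added: the HIP build already lowers GetGpuLaunchConfig1D's default max_threads from 1024 to 256 for the same access-fault reason. A hedged sketch of a 1-D launch helper with such a platform-dependent default argument; the struct and function below are illustrative rather than Paddle's actual declarations:

```cpp
#include <algorithm>

// Illustrative 1-D launch configuration, not Paddle's GpuLaunchConfig.
struct LaunchConfig1D {
  int threads_per_block;
  int blocks_per_grid;
};

inline LaunchConfig1D GetLaunchConfig1D(int element_count,
#ifdef PADDLE_WITH_HIP
                                        // Per the comment above, HIP kernels
                                        // here fault above 256 threads per
                                        // block, so lower the default cap.
                                        int max_threads = 256) {
#else
                                        int max_threads = 1024) {
#endif
  const int threads = std::max(1, std::min(max_threads, element_count));
  const int blocks = (element_count + threads - 1) / threads;
  return {threads, blocks};
}

// Example call: shape a grid for a 1M-element elementwise kernel.
// LaunchConfig1D cfg = GetLaunchConfig1D(1 << 20);
```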
@@ -156,9 +156,14 @@ class TestMin8DOp(OpTest):
class TestProdOp(OpTest):
def setUp(self):
self.op_type = "reduce_prod"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.init_data_type()
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.data_type)}
self.outputs = {'Out': self.inputs['X'].prod(axis=0)}
+    def init_data_type(self):
+        self.data_type = "float32" if core.is_compiled_with_rocm(
+        ) else "float64"
def test_check_output(self):
self.check_output()
@@ -169,14 +174,19 @@ class TestProdOp(OpTest):
class TestProd6DOp(OpTest):
def setUp(self):
self.op_type = "reduce_prod"
+        self.init_data_type()
self.inputs = {
-            'X': np.random.random((5, 6, 2, 3, 4, 2)).astype("float64")
+            'X': np.random.random((5, 6, 2, 3, 4, 2)).astype(self.data_type)
}
self.attrs = {'dim': [2, 3, 4]}
self.outputs = {
'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim']))
}
+    def init_data_type(self):
+        self.data_type = "float32" if core.is_compiled_with_rocm(
+        ) else "float64"
def test_check_output(self):
self.check_output()
@@ -187,14 +197,20 @@ class TestProd6DOp(OpTest):
class TestProd8DOp(OpTest):
def setUp(self):
self.op_type = "reduce_prod"
+        self.init_data_type()
self.inputs = {
-            'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64")
+            'X': np.random.random(
+                (2, 5, 3, 2, 2, 3, 4, 2)).astype(self.data_type)
}
self.attrs = {'dim': [2, 3, 4]}
self.outputs = {
'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim']))
}
+    def init_data_type(self):
+        self.data_type = "float32" if core.is_compiled_with_rocm(
+        ) else "float64"
def test_check_output(self):
self.check_output()