diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 9cebf1bdfea6711c8f147734222829700e6025e5..70c9d7e836dd8b166178c96379633a6052c7b846 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -20,15 +20,6 @@ #include #include -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif - -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif - #include "paddle/fluid/framework/tensor.h" #include "paddle/pten/core/dense_tensor.h" @@ -46,7 +37,7 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, gpuStream_t stream) { y->mutable_data(x.place()); - pten::kernels::TensorReduceFunctorImpl( + pten::kernels::TensorReduceImpl( static_cast(dev_ctx), x, y, transform, origin_reduce_dims, stream); } diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index 947f969b077d001ad3d4b4b35be07347d8831079..c9181a1fdfd6e32459da17a187f9b47b386c43a3 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -2016,10 +2016,7 @@ void default_elementwise_add_grad(const GPUContext &ctx, std::vector reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); gpuStream_t stream = ctx.stream(); - kernels::TensorReduceFunctorImpl>( + kernels::TensorReduceImpl>( ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); } } @@ -2034,10 +2031,7 @@ void default_elementwise_add_grad(const GPUContext &ctx, std::vector reduce_dims = funcs::GetReduceDim(y.dims(), out.dims(), axis); gpuStream_t stream = ctx.stream(); - kernels::TensorReduceFunctorImpl>( + kernels::TensorReduceImpl>( ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); } } @@ -2133,10 +2127,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, std::vector reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); gpuStream_t stream = ctx.stream(); - kernels::TensorReduceFunctorImpl>( + kernels::TensorReduceImpl>( ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); } } @@ -2157,10 +2148,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, std::vector reduce_dims = funcs::GetReduceDim(y.dims(), out.dims(), axis); gpuStream_t stream = ctx.stream(); - kernels::TensorReduceFunctorImpl>( + kernels::TensorReduceImpl>( ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); } } diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index b0fbef9a18e8882d6383ea5ef93eb301fa56a220..ad988317d571f68e719e0cb372228ff81f597ed2 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -1007,12 +1007,12 @@ template ::value, void>::type - CubTensorReduceFunctorImpl(const Tx* x_data, - Ty* y_data, - const TransformOp& transform, - int reduce_num, - const paddle::platform::Place& place, - gpuStream_t stream) { + CubTensorReduceImpl(const Tx* x_data, + Ty* y_data, + const TransformOp& transform, + int reduce_num, + const paddle::platform::Place& place, + gpuStream_t stream) { auto reducer = ReduceOp(); cub::TransformInputIterator trans_x(x_data, transform); @@ -1051,12 +1051,12 @@ template ::value, void>::type - CubTensorReduceFunctorImpl(const Tx* x_data, - Ty* y_data, - const TransformOp& transform, - int reduce_num, - const paddle::platform::Place& place, - gpuStream_t stream) { + CubTensorReduceImpl(const Tx* x_data, + Ty* y_data, + const TransformOp& transform, + int reduce_num, + const paddle::platform::Place& place, + gpuStream_t stream) { PADDLE_THROW(pten::errors::InvalidArgument( "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); } @@ -1065,12 +1065,12 @@ template class ReduceOp, typename TransformOp> -void TensorReduceFunctorImpl(const pten::GPUContext& dev_ctx, - const pten::DenseTensor& x, - pten::DenseTensor* y, - const TransformOp& transform, - const std::vector& origin_reduce_dims, - gpuStream_t stream) { +void TensorReduceImpl(const pten::GPUContext& dev_ctx, + const pten::DenseTensor& x, + pten::DenseTensor* y, + const TransformOp& transform, + const std::vector& origin_reduce_dims, + gpuStream_t stream) { y->mutable_data(x.place()); auto x_dim = pten::framework::vectorize(x.dims()); @@ -1102,7 +1102,7 @@ void TensorReduceFunctorImpl(const pten::GPUContext& dev_ctx, constexpr bool kIsTxFP16 = std::is_same::value; bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16; if (use_cub_reduce) { - CubTensorReduceFunctorImpl( + CubTensorReduceImpl( x_data, y_data, transform, config.reduce_num, x.place(), stream); return; } @@ -1239,13 +1239,13 @@ void Reduce(const GPUContext& dev_ctx, pten::DataType::INT64, pten::DataType::FLOAT16, out_dtype, - "TensorReduceFunctorImpl", + "TensorReduceImpl", ([&] { using MPType = typename kps::details::MPTypeTrait::Type; - pten::kernels::TensorReduceFunctorImpl>( + pten::kernels::TensorReduceImpl>( dev_ctx, tmp_tensor, out, @@ -1255,14 +1255,13 @@ void Reduce(const GPUContext& dev_ctx, })); } else { using MPType = typename kps::details::MPTypeTrait::Type; - pten::kernels:: - TensorReduceFunctorImpl>( - dev_ctx, - x, - out, - TransformOp(reduce_num), - reduce_dims, - stream); + pten::kernels::TensorReduceImpl>( + dev_ctx, + x, + out, + TransformOp(reduce_num), + reduce_dims, + stream); } } } // namespace pten diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h index 226cfd89b13271dfdcf62f8f4879a39e43df0450..b346acb6e25c6ab2936980034794cf750a248468 100644 --- a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h @@ -60,10 +60,7 @@ struct ReduceSumForMatmulGrad { DenseTensor* output, const std::vector& reduce_dims) { auto stream = dev_ctx.stream(); - kernels::TensorReduceFunctorImpl>( + kernels::TensorReduceImpl>( dev_ctx, input, output, kps::IdentityFunctor(), reduce_dims, stream); } };