From 688743bf7ce7846873481dc5fdc2454c6e2de4f6 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Tue, 8 Mar 2022 21:22:17 +0800 Subject: [PATCH] Rename phi::func::TensorReduceImpl to phi::func::ReduceKernel. (#40183) --- .../fluid/operators/reduce_ops/reduce_op.cu.h | 4 +-- paddle/phi/kernels/funcs/matrix_reduce.cu | 9 ++---- paddle/phi/kernels/funcs/reduce_function.h | 12 ++++---- .../gpu/broadcast_tensors_grad_kernel.cu | 5 ++-- paddle/phi/kernels/gpu/compare_kernel.cu | 4 +-- paddle/phi/kernels/gpu/elementwise_grad.h | 29 +++++++------------ paddle/phi/kernels/gpu/reduce.h | 24 +++++---------- ...d_cross_entropy_with_logits_grad_kernel.cu | 17 +++-------- ...igmoid_cross_entropy_with_logits_kernel.cu | 18 +++--------- paddle/phi/kernels/gpu/trace_kernel.cu | 5 ++-- .../kernels/impl/matmul_grad_kernel_impl.h | 5 ++-- 11 files changed, 44 insertions(+), 88 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index eb76eee1048..16061769533 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -36,9 +36,9 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, gpuStream_t stream) { y->mutable_data(x.place()); - phi::funcs::TensorReduceImpl( + phi::funcs::ReduceKernel( static_cast(dev_ctx), x, y, transform, - origin_reduce_dims, stream); + origin_reduce_dims); } } // namespace operators diff --git a/paddle/phi/kernels/funcs/matrix_reduce.cu b/paddle/phi/kernels/funcs/matrix_reduce.cu index 5e288c6e9c2..5c3ebd6bb01 100644 --- a/paddle/phi/kernels/funcs/matrix_reduce.cu +++ b/paddle/phi/kernels/funcs/matrix_reduce.cu @@ -45,13 +45,8 @@ class MatrixReduceSumFunctor { out_reduce_dims.push_back(idx); } } - TensorReduceImpl>( - dev_ctx, - in, - out, - kps::IdentityFunctor(), - out_reduce_dims, - dev_ctx.stream()); + ReduceKernel>( + dev_ctx, in, out, kps::IdentityFunctor(), out_reduce_dims); } }; diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index ce6bb0d559c..5834f091d9a 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -1087,12 +1087,12 @@ template class ReduceOp, typename TransformOp> -void TensorReduceImpl(const phi::GPUContext& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* y, - const TransformOp& transform, - const std::vector& origin_reduce_dims, - KPStream stream) { +void ReduceKernel(const phi::GPUContext& dev_ctx, + const phi::DenseTensor& x, + phi::DenseTensor* y, + const TransformOp& transform, + const std::vector& origin_reduce_dims) { + auto stream = dev_ctx.stream(); dev_ctx.Alloc(y); auto x_dim = phi::vectorize(x.dims()); diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu index 926dffc7450..d4850b74477 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -87,13 +87,12 @@ void BroadcastTensorsGradKernel(const Context& ctx, *input_tensor, ctx.GetPlace(), ctx, output_tensor); } else { // reduce_sum implementation on CUDA - funcs::TensorReduceImpl>( + funcs::ReduceKernel>( ctx, *input_tensor, output_tensor, kps::IdentityFunctor(), - reduce_dims_vec, - ctx.stream()); + reduce_dims_vec); } } } diff --git a/paddle/phi/kernels/gpu/compare_kernel.cu b/paddle/phi/kernels/gpu/compare_kernel.cu index 9c02627e546..225164687b7 100644 --- a/paddle/phi/kernels/gpu/compare_kernel.cu +++ b/paddle/phi/kernels/gpu/compare_kernel.cu @@ -80,8 +80,8 @@ inline void CompareAllKernelImpl(const Context& ctx, for (int i = 0; i < reduce_dims.size(); ++i) { reduce_dims[i] = i; } - funcs::TensorReduceImpl>( - ctx, tmp, out, kps::IdentityFunctor(), reduce_dims, ctx.stream()); + funcs::ReduceKernel>( + ctx, tmp, out, kps::IdentityFunctor(), reduce_dims); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h index b356f19555f..98df65c92f3 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad.h +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -29,13 +29,8 @@ void ReduceWrapper(const GPUContext &dev_ctx, DenseTensor *dst) { std::vector reduce_dims = funcs::GetReduceDim(dst->dims(), src->dims(), axis); - funcs::TensorReduceImpl>( - dev_ctx, - *src, - dst, - kps::IdentityFunctor(), - reduce_dims, - dev_ctx.stream()); + funcs::ReduceKernel>( + dev_ctx, *src, dst, kps::IdentityFunctor(), reduce_dims); } template @@ -172,9 +167,8 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx, } std::vector reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - funcs::TensorReduceImpl>( - ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims); } } // dy @@ -187,9 +181,8 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx, } else { std::vector reduce_dims = funcs::GetReduceDim(y.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - funcs::TensorReduceImpl>( - ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + ctx, dout, dy, kps::IdentityFunctor(), reduce_dims); } } } @@ -285,9 +278,8 @@ void default_elementwise_sub_grad(const GPUContext &ctx, } std::vector reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - funcs::TensorReduceImpl>( - ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims); } } // dy @@ -306,9 +298,8 @@ void default_elementwise_sub_grad(const GPUContext &ctx, } else { std::vector reduce_dims = funcs::GetReduceDim(y.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - funcs::TensorReduceImpl>( - ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + ctx, dout, dy, kps::InverseFunctor(), reduce_dims); } } } diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h index 0319de7558e..da5315f3447 100644 --- a/paddle/phi/kernels/gpu/reduce.h +++ b/paddle/phi/kernels/gpu/reduce.h @@ -39,8 +39,6 @@ void Reduce(const KPDevice& dev_ctx, reduce_num *= (x.dims())[i]; } - KPStream stream = dev_ctx.stream(); - if (out_dtype != phi::DataType::UNDEFINED && out_dtype != x.dtype()) { auto tmp_tensor = phi::Cast(dev_ctx, x, out_dtype); PD_VISIT_BOOL_AND_FLOATING_AND_COMPLEX_AND_3_TYPES( @@ -48,29 +46,23 @@ void Reduce(const KPDevice& dev_ctx, phi::DataType::INT64, phi::DataType::FLOAT16, out_dtype, - "TensorReduceImpl", + "ReduceKernel", ([&] { using MPType = typename kps::details::MPTypeTrait::Type; - phi::funcs::TensorReduceImpl>( + phi::funcs::ReduceKernel>( dev_ctx, tmp_tensor, out, TransformOp(reduce_num), - reduce_dims, - stream); + reduce_dims); })); } else { using MPType = typename kps::details::MPTypeTrait::Type; - phi::funcs::TensorReduceImpl>( - dev_ctx, - x, - out, - TransformOp(reduce_num), - reduce_dims, - stream); + phi::funcs::ReduceKernel>( + dev_ctx, x, out, TransformOp(reduce_num), reduce_dims); } } } // namespace phi diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu index 598b0138fb3..6fc65006ae2 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu @@ -69,17 +69,12 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, dev_ctx.template Alloc(counts_tensor); counts_tensor->Resize(in_grad->dims()); - int limit = in_grad->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; std::vector ins = {&x, &label, &out_grad}; std::vector outs = {in_grad, counts_tensor}; auto functor = SigmoidBwdFunctor(ignore_index); - constexpr int Size = 2; - phi::funcs::ElementwiseKernel( + phi::funcs::ElementwiseKernel( dev_ctx, ins, &outs, functor); if (normalize) { - T *counts = dev_ctx.template Alloc(counts_tensor); DenseTensor *norm_tensor = new DenseTensor(); norm_tensor->Resize({sizeof(T)}); dev_ctx.template Alloc(norm_tensor); @@ -89,13 +84,8 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, reduce_dim.push_back(i); } - funcs::TensorReduceImpl>( - dev_ctx, - *counts_tensor, - norm_tensor, - NonzeroFunctor(), - reduce_dim, - dev_ctx.stream()); + funcs::ReduceKernel>( + dev_ctx, *counts_tensor, norm_tensor, NonzeroFunctor(), reduce_dim); T *norm = dev_ctx.template Alloc(norm_tensor); auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T)); T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); @@ -114,6 +104,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, div_functor); delete norm_tensor; } + delete counts_tensor; } } // namespace phi diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu index 13d63f8d97e..4b6e5628c72 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu @@ -69,17 +69,12 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx, dev_ctx.template Alloc(counts_tensor); counts_tensor->Resize(out->dims()); - int limit = out->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; std::vector ins = {&x, &label}; std::vector outs = {out, counts_tensor}; auto functor = SigmoidFwdFunctor(ignore_index); - constexpr int Size = 2; - phi::funcs::ElementwiseKernel( + phi::funcs::ElementwiseKernel( dev_ctx, ins, &outs, functor); if (normalize) { - T *counts = dev_ctx.template Alloc(counts_tensor); DenseTensor *norm_tensor = new DenseTensor(); norm_tensor->Resize({sizeof(T)}); dev_ctx.template Alloc(norm_tensor); @@ -89,13 +84,8 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx, reduce_dim.push_back(i); } - funcs::TensorReduceImpl>( - dev_ctx, - *counts_tensor, - norm_tensor, - NonzeroFunctor(), - reduce_dim, - dev_ctx.stream()); + funcs::ReduceKernel>( + dev_ctx, *counts_tensor, norm_tensor, NonzeroFunctor(), reduce_dim); T *norm = dev_ctx.template Alloc(norm_tensor); auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T)); T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); @@ -114,8 +104,8 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx, phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, div_functor); delete norm_tensor; - delete counts_tensor; } + delete counts_tensor; } } // namespace phi diff --git a/paddle/phi/kernels/gpu/trace_kernel.cu b/paddle/phi/kernels/gpu/trace_kernel.cu index 4266f0174ff..4a749c5b334 100644 --- a/paddle/phi/kernels/gpu/trace_kernel.cu +++ b/paddle/phi/kernels/gpu/trace_kernel.cu @@ -31,11 +31,10 @@ void TraceKernel(const Context& ctx, T* out_data = ctx.template Alloc(out); auto diag = funcs::Diagonal(ctx, &x, offset, axis1, axis2); if (diag.numel() > 0) { - auto stream = ctx.stream(); std::vector reduce_dims; reduce_dims.push_back(out->dims().size()); - funcs::TensorReduceImpl>( - ctx, diag, out, kps::IdentityFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + ctx, diag, out, kps::IdentityFunctor(), reduce_dims); } else { phi::funcs::SetConstant functor; functor(ctx, out, static_cast(0)); diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h index d06bdc55030..495b93f2a4e 100644 --- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h @@ -59,9 +59,8 @@ struct ReduceSumForMatmulGrad { const DenseTensor& input, DenseTensor* output, const std::vector& reduce_dims) { - auto stream = dev_ctx.stream(); - funcs::TensorReduceImpl>( - dev_ctx, input, output, kps::IdentityFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + dev_ctx, input, output, kps::IdentityFunctor(), reduce_dims); } }; #endif -- GitLab