diff --git a/paddle/fluid/operators/broadcast_tensors_op.cu b/paddle/fluid/operators/broadcast_tensors_op.cu
index 5a835c7678fa261dc9b806068def552b198b9ec4..5882258317d7daa6c62905f8a76d5c68060787a8 100644
--- a/paddle/fluid/operators/broadcast_tensors_op.cu
+++ b/paddle/fluid/operators/broadcast_tensors_op.cu
@@ -89,7 +89,7 @@ class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
       } else {
         // reduce_sum implementation on CUDA
         auto stream = context.cuda_device_context().stream();
-        TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+        TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
             context.cuda_device_context(), *input_tensor, output_tensor,
             kps::IdentityFunctor<T>(), reduce_dims_vec, stream);
       }
diff --git a/paddle/fluid/operators/cholesky_solve_op.cu b/paddle/fluid/operators/cholesky_solve_op.cu
index a5d5baf19dad0d621122c3a71f77ff84c216e5cb..cce30cc29857d2b5c6fd7e7b8fd666aa0a509dc6 100644
--- a/paddle/fluid/operators/cholesky_solve_op.cu
+++ b/paddle/fluid/operators/cholesky_solve_op.cu
@@ -114,7 +114,7 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
       }
     }
     gpuStream_t stream = ctx.cuda_device_context().stream();
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
         ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
         out_reduce_dims, stream);
   }
diff --git a/paddle/fluid/operators/clip_by_norm_op.cu b/paddle/fluid/operators/clip_by_norm_op.cu
index f2714d1378510e262271fb19efc382db43be20d7..0b7167844a3f0d7b616758bfdf6351189c331d91 100644
--- a/paddle/fluid/operators/clip_by_norm_op.cu
+++ b/paddle/fluid/operators/clip_by_norm_op.cu
@@ -75,8 +75,8 @@ class ClipByNormKernel
     }
     Tensor tmp = context.AllocateTmpTensor<T, platform::CUDADeviceContext>(
         {1}, dev_ctx);
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor,
-                            kps::SquareFunctor<T, T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor,
+                     kps::SquareFunctor<T, T>>(
         dev_ctx, *input, &tmp, kps::SquareFunctor<T, T>(), reduce_dims,
         dev_ctx.stream());
     auto tmp_eigen = EigenVector<T>::Flatten(tmp);
diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cu b/paddle/fluid/operators/controlflow/compare_all_op.cu
index 6526d774cafe884889354f80a1e29819b17a854e..1dfa7f44279adce410ea95b3f18f06cdc1ea8833 100644
--- a/paddle/fluid/operators/controlflow/compare_all_op.cu
+++ b/paddle/fluid/operators/controlflow/compare_all_op.cu
@@ -63,8 +63,7 @@ class CompareReduceOpKernel
       reduce_dims.resize(tmp.dims().size());
       for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
       auto stream = context.cuda_device_context().stream();
-      TensorReduceFunctorImpl<bool, bool, BitwiseAdd,
-                              kps::IdentityFunctor<bool>>(
+      TensorReduceImpl<bool, bool, BitwiseAdd, kps::IdentityFunctor<bool>>(
          context.cuda_device_context(), tmp, z, kps::IdentityFunctor<bool>(),
          reduce_dims, stream);
     }
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h
index 74e74870b8e119d5b84e001245a7208082326db0..34d40c741f038f795a5abb701d270b1a4d7984f9 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -1188,7 +1188,7 @@ template <typename T>
 void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis,
                    framework::Tensor *src, framework::Tensor *dst) {
   std::vector<int> reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis);
-  TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+  TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
       dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims,
       dev_ctx.stream());
 }
diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h
index 1128997fd259a1a4553d2668863c7da35bb5bcd9..48f520d60b73588f7c15cc33d5bdfbd06cd571b4 100644
--- a/paddle/fluid/operators/fused/attn_gemm.h
+++ b/paddle/fluid/operators/fused/attn_gemm.h
@@ -165,7 +165,7 @@ class AttnMatMul {
                            (input_dims[2] == output_dims[0]));
     if (support_case_1 || support_case_2) {
       gpuStream_t stream = dev_ctx_.stream();
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
           dev_ctx_, *d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1},
           stream);
     } else {
diff --git a/paddle/fluid/operators/kron_op.h b/paddle/fluid/operators/kron_op.h
index ff3baf4d70f10383ac71989f65501870a8980af2..65f60bc19ad123e1e6397a59a9cb43bff2b6210a 100644
--- a/paddle/fluid/operators/kron_op.h
+++ b/paddle/fluid/operators/kron_op.h
@@ -305,11 +305,11 @@ struct KronGradOpFunctor {
 #if defined(__NVCC__) || defined(__HIPCC__)
     auto stream = dev_ctx.stream();  // it is a cuda device_context
     if (dx) {
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
           dev_ctx, dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
     }
     if (dy) {
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
           dev_ctx, dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
     }
 #else
diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu
index a8a3390c0020705322b2b0427b551d27509b3d38..51776f2166dd5a4cb4187073bf04f7be30269c9e 100644
--- a/paddle/fluid/operators/margin_cross_entropy_op.cu
+++ b/paddle/fluid/operators/margin_cross_entropy_op.cu
@@ -298,7 +298,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
     logits_max =
         ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
     T* logits_max_buff = logits_max.mutable_data<T>(place);
-    TensorReduceFunctorImpl<T, T, kps::MaxFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::MaxFunctor, kps::IdentityFunctor<T>>(
         dev_ctx, softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
         dev_ctx.stream());
 
@@ -320,7 +320,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
     sum_exp_logits =
         ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
     T* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::ExpFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::ExpFunctor<T>>(
         dev_ctx, softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
         dev_ctx.stream());
 
diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
index db8f586297c54221fd792605dd36d13da2fac740..01a5632a960c3611e0638200e7130ed8de879426 100644
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -65,7 +65,7 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
     for (decltype(rank) i = 0; i < rank; ++i) {
       reduce_dims.push_back(i);
     }
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, Div>(
+    TensorReduceImpl<T, T, kps::AddFunctor, Div>(
         context.cuda_device_context(), *input, output, Div(numel),
         reduce_dims, stream);
   }
diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu
index e11fe478106f945dad496de2a669e91ea3e1a0a4..abbbffb6331f582fc09ec05712b40376a2fa1f56 100644
--- a/paddle/fluid/operators/p_norm_op.cu
+++ b/paddle/fluid/operators/p_norm_op.cu
@@ -105,19 +105,19 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
     using MT = typename details::MPTypeTrait<T>::Type;
     if (porder == 0) {
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
           ctx.cuda_device_context(), *in_x, out_norm, NonzeroFunctor<T>(),
           reduce_axis, stream);
     } else if (porder == INFINITY) {
-      TensorReduceFunctorImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>(
+      TensorReduceImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>(
           ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
           reduce_axis, stream);
     } else if (porder == -INFINITY) {
-      TensorReduceFunctorImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
+      TensorReduceImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
           ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
           reduce_axis, stream);
     } else {
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
           ctx.cuda_device_context(), *in_x, out_norm,
           UnsignedPowFunctor<T>(porder), reduce_axis, stream);
diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h
index f7f60e82216aedad7631e8dc8931198c071eeab3..9e2f6cf223b085c67b8d9b57ef5977c1e9aaa631 100644
--- a/paddle/fluid/operators/pool_op.h
+++ b/paddle/fluid/operators/pool_op.h
@@ -206,8 +206,7 @@ class PoolKernel : public framework::OpKernel<T> {
             adaptive) {  // for adaptive_avg_pool2d && output_size == 1
 #if defined(__HIPCC__) || defined(__NVCC__)
           auto stream = dev_ctx.stream();
-          TensorReduceFunctorImpl<T, T, kps::AddFunctor,
-                                  kps::DivideFunctor<T>>(
+          TensorReduceImpl<T, T, kps::AddFunctor, kps::DivideFunctor<T>>(
               dev_ctx, *in_x, out, kps::DivideFunctor<T>(reduce_num),
               reduce_dim, stream);
 #else  // for cpu
diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu
index 9493b6d4391d53def2efee51b04578ad8ccb895b..0a0afffee9da01d5e26e1f171db0dbd369ec8278 100644
--- a/paddle/fluid/operators/prelu_op.cu
+++ b/paddle/fluid/operators/prelu_op.cu
@@ -185,7 +185,7 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
       reduce_dims.push_back(i);
     }
 
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
         context.cuda_device_context(), dalpha_tmp, dalpha,
         kps::IdentityFunctor<T>(), reduce_dims, stream);
   }
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
index f741c5941eb26f3fc410b3cc96295741583ed615..9cebf1bdfea6711c8f147734222829700e6025e5 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
@@ -39,11 +39,11 @@ namespace operators {
 template <typename Tx, typename Ty, template <typename> class ReduceOp,
           typename TransformOp>
-void TensorReduceFunctorImpl(const platform::CUDADeviceContext& dev_ctx,
-                             const framework::Tensor& x, framework::Tensor* y,
-                             const TransformOp& transform,
-                             const std::vector<int>& origin_reduce_dims,
-                             gpuStream_t stream) {
+void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx,
+                      const framework::Tensor& x, framework::Tensor* y,
+                      const TransformOp& transform,
+                      const std::vector<int>& origin_reduce_dims,
+                      gpuStream_t stream) {
   y->mutable_data<Ty>(x.place());
   pten::kernels::TensorReduceFunctorImpl<Tx, Ty, ReduceOp, TransformOp>(
diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu
index ca6169d0410f1171b56df3edb63032e6521887de..ec1b4f6d5b2657cc8aaa76d089b494160aba64d7 100644
--- a/paddle/fluid/operators/renorm_op.cu
+++ b/paddle/fluid/operators/renorm_op.cu
@@ -155,7 +155,7 @@ class CUDARenormKernel : public framework::OpKernel<T> {
         ElementwiseType::kUnary, MT, T, UnsignedPowFunctor<MT, T>>(
         cuda_ctx, ins, &outs, func);
     std::vector<int> reduce_axis = {0, 2};
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
         cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor<T>(),
         reduce_axis, stream);
     RenormKernelFunc3<T><<<grid2, block2, 0, stream>>>(
@@ -213,10 +213,10 @@ class CUDAGradRenormKernel : public framework::OpKernel<T> {
         mul_value.mutable_data<T>(ctx.GetPlace()), numel, dimension_each, p,
         dim_divisor);
     std::vector<int> reduce_axis = {0, 2};
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
         ctx.cuda_device_context(), pow_value, &dim_value,
         kps::IdentityFunctor<T>(), reduce_axis, stream);
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
         ctx.cuda_device_context(), mul_value, &weight_derivative,
         kps::IdentityFunctor<T>(), reduce_axis, stream);
     RenormGradKernelFunc2<T><<<grid, block, 0, stream>>>(
diff --git a/paddle/fluid/operators/solve_op.h b/paddle/fluid/operators/solve_op.h
index ea3a5aa5af9b221f6f57aad2288baadbfbba8590..7893b5da12c470cbcfc964b5cb77acbbe89c2cb6 100644
--- a/paddle/fluid/operators/solve_op.h
+++ b/paddle/fluid/operators/solve_op.h
@@ -45,7 +45,7 @@ void ReduceSumForSolve(const Tensor* input, Tensor* output,
                        const paddle::framework::ExecutionContext& ctx) {
 #if defined(__NVCC__) || defined(__HIPCC__)
   auto stream = ctx.cuda_device_context().stream();
-  TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+  TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
      ctx.cuda_device_context(), *input, output,
      kps::IdentityFunctor<T>(), reduce_dims, stream);
 #else
diff --git a/paddle/fluid/operators/trace_op.cu b/paddle/fluid/operators/trace_op.cu
index 0b9e615eece8fbe149b61afd0f9e2839c44e77e6..3d8a60dd65fc6fe3cf85b4c507ffffff647c414c 100644
--- a/paddle/fluid/operators/trace_op.cu
+++ b/paddle/fluid/operators/trace_op.cu
@@ -39,7 +39,7 @@ class TraceCUDAKernel : public framework::OpKernel<T> {
       auto stream = context.cuda_device_context().stream();
       std::vector<int> reduce_dims;
       reduce_dims.push_back(out->dims().size());
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
          context.cuda_device_context(), diag, out, kps::IdentityFunctor<T>(),
          reduce_dims, stream);
     } else {
diff --git a/paddle/fluid/operators/triangular_solve_op.cu b/paddle/fluid/operators/triangular_solve_op.cu
index 28cdc56e2aec1460f30399aa6891044360b648bd..098ce74c4a5bab7b17345c4dcb355b45041cdea2 100644
--- a/paddle/fluid/operators/triangular_solve_op.cu
+++ b/paddle/fluid/operators/triangular_solve_op.cu
@@ -44,7 +44,7 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
       }
     }
     gpuStream_t stream = ctx.cuda_device_context().stream();
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
        out_reduce_dims, stream);
   }
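
Note: this patch is a pure rename. Every call site keeps its argument list unchanged; only the fluid-side wrapper TensorReduceFunctorImpl becomes TensorReduceImpl, while the pten::kernels implementation it forwards to keeps the old name. As a minimal sketch of the call pattern the renamed wrapper expects (the helper SumReduce below is hypothetical and not part of this patch; the template parameters <Tx, Ty, ReduceOp, TransformOp> follow the declaration in reduce_op.cu.h above):

  // Hypothetical helper, sketched against the wrapper declared in
  // paddle/fluid/operators/reduce_ops/reduce_op.cu.h; not part of this patch.
  template <typename T>
  void SumReduce(const platform::CUDADeviceContext& dev_ctx,
                 const framework::Tensor& x, framework::Tensor* y,
                 const std::vector<int>& reduce_dims) {
    // <Tx, Ty, ReduceOp, TransformOp>: reduce T inputs to a T output with
    // kps::AddFunctor, feeding each element through kps::IdentityFunctor<T>.
    // The wrapper allocates y itself via y->mutable_data<Ty>(x.place()).
    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        dev_ctx, x, y, kps::IdentityFunctor<T>(), reduce_dims,
        dev_ctx.stream());
  }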