Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)
Unverified commit f71241b9
Authored on Feb 08, 2022 by Yiqun Liu; committed via GitHub on Feb 08, 2022.
Rename partial function name TensorReduceFunctorImpl to TensorReduceImpl. (#39388)
Parent commit: e4d475ea
Showing 17 changed files, with 29 additions and 31 deletions (+29 -31):
  paddle/fluid/operators/broadcast_tensors_op.cu                (+1 -1)
  paddle/fluid/operators/cholesky_solve_op.cu                   (+1 -1)
  paddle/fluid/operators/clip_by_norm_op.cu                     (+2 -2)
  paddle/fluid/operators/controlflow/compare_all_op.cu          (+1 -2)
  paddle/fluid/operators/elementwise/elementwise_op_function.h  (+1 -1)
  paddle/fluid/operators/fused/attn_gemm.h                      (+1 -1)
  paddle/fluid/operators/kron_op.h                              (+2 -2)
  paddle/fluid/operators/margin_cross_entropy_op.cu             (+2 -2)
  paddle/fluid/operators/mean_op.cu                             (+1 -1)
  paddle/fluid/operators/p_norm_op.cu                           (+4 -4)
  paddle/fluid/operators/pool_op.h                              (+1 -2)
  paddle/fluid/operators/prelu_op.cu                            (+1 -1)
  paddle/fluid/operators/reduce_ops/reduce_op.cu.h              (+5 -5)
  paddle/fluid/operators/renorm_op.cu                           (+3 -3)
  paddle/fluid/operators/solve_op.h                             (+1 -1)
  paddle/fluid/operators/trace_op.cu                            (+1 -1)
  paddle/fluid/operators/triangular_solve_op.cu                 (+1 -1)
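The change is purely a rename; no template arguments or call arguments move. A minimal before/after sketch of the pattern repeated at every call site below (the tensor, dimension, and stream variable names here are illustrative, not taken from any single file):

    // Before the rename (illustrative call site):
    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        dev_ctx, *input, output, kps::IdentityFunctor<T>(), reduce_dims, stream);

    // After the rename: only the function name changes.
    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        dev_ctx, *input, output, kps::IdentityFunctor<T>(), reduce_dims, stream);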
paddle/fluid/operators/broadcast_tensors_op.cu
@@ -89,7 +89,7 @@ class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
     } else {
       // reduce_sum implementation on CUDA
       auto stream = context.cuda_device_context().stream();
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
           context.cuda_device_context(), *input_tensor, output_tensor,
           kps::IdentityFunctor<T>(), reduce_dims_vec, stream);
     }
paddle/fluid/operators/cholesky_solve_op.cu
@@ -114,7 +114,7 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
       }
     }
     gpuStream_t stream = ctx.cuda_device_context().stream();
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
        out_reduce_dims, stream);
   }
paddle/fluid/operators/clip_by_norm_op.cu
@@ -75,7 +75,7 @@ class ClipByNormKernel<platform::CUDADeviceContext, platform::float16>
       }
       Tensor tmp = context.AllocateTmpTensor<float, platform::CUDADeviceContext>(
           {1}, dev_ctx);
-      TensorReduceFunctorImpl<platform::float16, float, kps::AddFunctor,
-                              kps::SquareFunctor<platform::float16, float>>(
+      TensorReduceImpl<platform::float16, float, kps::AddFunctor,
+                       kps::SquareFunctor<platform::float16, float>>(
           dev_ctx, *input, &tmp, kps::SquareFunctor<platform::float16, float>(),
           reduce_dims, dev_ctx.stream());
paddle/fluid/operators/controlflow/compare_all_op.cu
@@ -63,8 +63,7 @@ class CompareReduceOpKernel
     reduce_dims.resize(tmp.dims().size());
     for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
     auto stream = context.cuda_device_context().stream();
-    TensorReduceFunctorImpl<bool, bool, BitwiseAdd,
-                            kps::IdentityFunctor<bool>>(
+    TensorReduceImpl<bool, bool, BitwiseAdd, kps::IdentityFunctor<bool>>(
        context.cuda_device_context(), tmp, z, kps::IdentityFunctor<bool>(),
        reduce_dims, stream);
   }
paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -1188,7 +1188,7 @@ template <typename T>
 void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis,
                    framework::Tensor *src, framework::Tensor *dst) {
   std::vector<int> reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis);
-  TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+  TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
      dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims,
      dev_ctx.stream());
 }
paddle/fluid/operators/fused/attn_gemm.h
@@ -165,7 +165,7 @@ class AttnMatMul {
                           (input_dims[2] == output_dims[0]));
     if (support_case_1 || support_case_2) {
       gpuStream_t stream = dev_ctx_.stream();
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
          dev_ctx_, *d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1},
          stream);
     } else {
paddle/fluid/operators/kron_op.h
@@ -305,11 +305,11 @@ struct KronGradOpFunctor {
 #if defined(__NVCC__) || defined(__HIPCC__)
     auto stream = dev_ctx.stream();  // it is a cuda device_context
     if (dx) {
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
          dev_ctx, dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
     }
     if (dy) {
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
          dev_ctx, dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
     }
 #else
paddle/fluid/operators/margin_cross_entropy_op.cu
@@ -298,7 +298,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
     logits_max =
         ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
     T* logits_max_buff = logits_max.mutable_data<T>(place);
-    TensorReduceFunctorImpl<T, T, kps::MaxFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::MaxFunctor, kps::IdentityFunctor<T>>(
        dev_ctx, softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
        dev_ctx.stream());
@@ -320,7 +320,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
     sum_exp_logits =
         ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
     T* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::ExpFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::ExpFunctor<T>>(
        dev_ctx, softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
        dev_ctx.stream());
paddle/fluid/operators/mean_op.cu
@@ -65,7 +65,7 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
       for (decltype(rank) i = 0; i < rank; ++i) {
         reduce_dims.push_back(i);
       }
-      TensorReduceFunctorImpl<T, T, kernel_primitives::AddFunctor, Div>(
+      TensorReduceImpl<T, T, kernel_primitives::AddFunctor, Div>(
          context.cuda_device_context(), *input, output, Div(numel), reduce_dims,
          stream);
     }
paddle/fluid/operators/p_norm_op.cu
@@ -105,19 +105,19 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
     using MT = typename details::MPTypeTrait<T>::Type;
     if (porder == 0) {
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
          ctx.cuda_device_context(), *in_x, out_norm, NonzeroFunctor<T>(),
          reduce_axis, stream);
     } else if (porder == INFINITY) {
-      TensorReduceFunctorImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>(
+      TensorReduceImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>(
          ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
          reduce_axis, stream);
     } else if (porder == -INFINITY) {
-      TensorReduceFunctorImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
+      TensorReduceImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
          ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
          reduce_axis, stream);
     } else {
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
          ctx.cuda_device_context(), *in_x, out_norm,
          UnsignedPowFunctor<T>(porder), reduce_axis, stream);
paddle/fluid/operators/pool_op.h
@@ -206,8 +206,7 @@ class PoolKernel : public framework::OpKernel<T> {
             adaptive) {  // for adaptive_avg_pool2d && output_size == 1
 #if defined(__HIPCC__) || defined(__NVCC__)
           auto stream = dev_ctx.stream();
-          TensorReduceFunctorImpl<T, T, kps::AddFunctor,
-                                  kps::DivideFunctor<T>>(
+          TensorReduceImpl<T, T, kps::AddFunctor, kps::DivideFunctor<T>>(
              dev_ctx, *in_x, out, kps::DivideFunctor<T>(reduce_num),
              reduce_dim, stream);
 #else  // for cpu
paddle/fluid/operators/prelu_op.cu
@@ -185,7 +185,7 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
       reduce_dims.push_back(i);
     }
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        context.cuda_device_context(), dalpha_tmp, dalpha,
        kps::IdentityFunctor<T>(), reduce_dims, stream);
   }
paddle/fluid/operators/reduce_ops/reduce_op.cu.h
@@ -39,7 +39,7 @@ namespace operators {
 template <typename Tx, typename Ty, template <typename> class ReduceOp,
           typename TransformOp>
-void TensorReduceFunctorImpl(const platform::CUDADeviceContext& dev_ctx,
-                             const framework::Tensor& x, framework::Tensor* y,
-                             const TransformOp& transform,
-                             const std::vector<int>& origin_reduce_dims,
+void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx,
+                      const framework::Tensor& x, framework::Tensor* y,
+                      const TransformOp& transform,
+                      const std::vector<int>& origin_reduce_dims,
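The hunk above renames the declaration itself. Its four template parameters appear to map onto the call sites in the other files as follows: Tx is the input element type, Ty the output/accumulation type, ReduceOp the reduction functor, and TransformOp the per-element transform applied before reducing. A hedged sketch of a mixed-precision instantiation in the style of clip_by_norm_op.cu (variable names illustrative, not a verbatim line from the diff):

    // Square float16 inputs and accumulate the sum in float, as clip_by_norm_op.cu does.
    TensorReduceImpl<platform::float16, float, kps::AddFunctor,
                     kps::SquareFunctor<platform::float16, float>>(
        dev_ctx, *input, &tmp, kps::SquareFunctor<platform::float16, float>(),
        reduce_dims, dev_ctx.stream());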
paddle/fluid/operators/renorm_op.cu
@@ -155,7 +155,7 @@ class CUDARenormKernel : public framework::OpKernel<T> {
         ElementwiseType::kUnary, MT, T, UnsignedPowFunctor<MT, T>>(
         cuda_ctx, ins, &outs, func);
     std::vector<int> reduce_axis = {0, 2};
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis,
        stream);
     RenormKernelFunc3<T><<<grid2, block2, 0, stream>>>(
@@ -213,10 +213,10 @@ class CUDAGradRenormKernel : public framework::OpKernel<T> {
         mul_value.mutable_data<T>(ctx.GetPlace()), numel, dimension_each, p,
         dim_divisor);
     std::vector<int> reduce_axis = {0, 2};
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        ctx.cuda_device_context(), pow_value, &dim_value,
        kps::IdentityFunctor<T>(), reduce_axis, stream);
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        ctx.cuda_device_context(), mul_value, &weight_derivative,
        kps::IdentityFunctor<T>(), reduce_axis, stream);
     RenormGradKernelFunc2<T><<<grid, block, 0, stream>>>(
paddle/fluid/operators/solve_op.h
@@ -45,7 +45,7 @@ void ReduceSumForSolve(const Tensor* input, Tensor* output,
                        const paddle::framework::ExecutionContext& ctx) {
 #if defined(__NVCC__) || defined(__HIPCC__)
   auto stream = ctx.cuda_device_context().stream();
-  TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+  TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
      ctx.cuda_device_context(), *input, output, kps::IdentityFunctor<T>(),
      reduce_dims, stream);
 #else
paddle/fluid/operators/trace_op.cu
@@ -39,7 +39,7 @@ class TraceCUDAKernel : public framework::OpKernel<T> {
       auto stream = context.cuda_device_context().stream();
       std::vector<int> reduce_dims;
       reduce_dims.push_back(out->dims().size());
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
          context.cuda_device_context(), diag, out, kps::IdentityFunctor<T>(),
          reduce_dims, stream);
     } else {
paddle/fluid/operators/triangular_solve_op.cu
@@ -44,7 +44,7 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
       }
     }
     gpuStream_t stream = ctx.cuda_device_context().stream();
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
        out_reduce_dims, stream);
   }