[Phi] move reduce_grad kernel into phi (#40522)

* move reduce_mean_grad kernel into phi * move reduce_max/min_grad into phi * remove raw max/min grad kernel * fix bug * fix max/min grad error * move all reduce_grad kernel into one file * add prod grad kernel * add infermeta for prod kernel

[Phi] move reduce_grad kernel into phi (#40522)
* move reduce_mean_grad kernel into phi * move reduce_max/min_grad into phi * remove raw max/min grad kernel * fix bug * fix max/min grad error * move all reduce_grad kernel into one file * add prod grad kernel * add infermeta for prod kernel
70726696 · chentianyu03 · GitHub · 1a13fa0f · 70726696 · 70726696
23 changed file
--- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
@@ -38,7 +38,7 @@ USE_OP(softmax_with_cross_entropy);
 USE_OP_ITSELF(reduce_mean);
 USE_OP_ITSELF(reduce_sum);
 USE_OP_ITSELF(reduce_sum_grad);
-USE_OP(reduce_mean_grad);
+USE_OP_ITSELF(reduce_mean_grad);
 USE_OP_ITSELF(reshape2_grad);
 USE_OP(softmax_with_cross_entropy_grad);
 USE_OP_ITSELF(elementwise_add_grad);

--- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc
@@ -35,13 +35,3 @@ REGISTER_OPERATOR(
    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
    ReduceMaxInferShapeFunctor);
 REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp)
-REGISTER_OP_CPU_KERNEL(
-    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                           float, ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
-REGISTER_OP_CUDA_KERNEL(
-    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                           float, ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
@@ -107,12 +107,3 @@ REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp,
                  ops::ReduceMeanDoubleGradDescMaker,
                  ops::ReduceMeanDoubleGradOpBaseMaker,
                  ops::ReduceMeanGradNoNeedBufferVarInferer);
-template <typename T>
-using CPUReduceMeanGradKernel =
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, T,
-                          ops::MeanGradFunctor, true>;
-REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel<bool>,
-                       CPUReduceMeanGradKernel<float>,
-                       CPUReduceMeanGradKernel<double>);
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// .part used to speed up nvcc compile
-#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
-template <typename T>
-using CUDAReduceMeanGradKernel =
-    ops::ReduceCudaGradKernel<T, kps::DivideFunctor>;
-REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel<bool>,
-                        CUDAReduceMeanGradKernel<paddle::platform::float16>,
-                        CUDAReduceMeanGradKernel<float>,
-                        CUDAReduceMeanGradKernel<double>);
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc
@@ -35,13 +35,3 @@ REGISTER_OPERATOR(
    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
    ReduceMinInferShapeFunctor);
 REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp)
-REGISTER_OP_CPU_KERNEL(
-    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                           float, ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
-REGISTER_OP_CUDA_KERNEL(
-    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                           float, ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc
@@ -14,6 +14,10 @@
 #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 namespace paddle {
 namespace framework {
 class OpDesc;
@@ -26,14 +30,20 @@ class CPUDeviceContext;
 }  // namespace platform
 }  // namespace paddle
-REGISTER_REDUCE_OP(reduce_prod);
+namespace ops = paddle::operators;
+class ReduceProdOpMaker : public ops::ReduceOpMaker {
+ protected:
+  virtual std::string GetName() const { return "reduce_prod"; }
+  virtual std::string GetOpType() const { return "Reduce reduce_prod"; }
+};
+DECLARE_INFER_SHAPE_FUNCTOR(reduce_prod, ReduceProdInferShapeFunctor,
+                            PD_INFER_META(phi::ReduceInferMetaBase));
-REGISTER_OP_CPU_KERNEL(reduce_prod_grad,
+REGISTER_OPERATOR(
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+    reduce_prod, ops::ReduceOp, ReduceProdOpMaker,
-                                             float, ops::ProdGradFunctor>,
+    paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
-                                             double, ops::ProdGradFunctor>,
+    ReduceProdInferShapeFunctor);
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+REGISTER_OPERATOR(reduce_prod_grad, ops::ReduceGradOp);
-                                             int, ops::ProdGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int64_t, ops::ProdGradFunctor>);
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
-REGISTER_OP_CUDA_KERNEL(
-    reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                            float, ops::ProdGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::ProdGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::ProdGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::ProdGradFunctor>);
--- a/paddle/phi/core/compat/op_utils.h
+++ b/paddle/phi/core/compat/op_utils.h
@@ -47,8 +47,13 @@ const std::unordered_set<std::string> deprecated_op_names({"diag",
                                                           "matmul_grad",
                                                           "matmul_grad_grad",
                                                           "mean",
+                                                           "mean_grad",
                                                           "max",
+                                                           "max_grad",
                                                           "min",
+                                                           "min_grad",
+                                                           "prod",
+                                                           "prod_grad",
                                                           "any",
                                                           "all",
                                                           "reshape",

--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -31,10 +31,11 @@ set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_k
    matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel
    put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel
    softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel
-    triangular_solve_grad_kernel determinant_grad_kernel)
+    triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel)
 kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function)
 kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax)
 kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax)
+kernel_library(reduce_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel)
 kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse)
 kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse)
 kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting)

--- a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc
@@ -12,33 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/phi/kernels/reduce_sum_grad_kernel.h"
+#include "paddle/phi/kernels/reduce_grad_kernel.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/cast_kernel.h"
-#include "paddle/phi/kernels/cpu/reduce_grad.h"
 #include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/reduce_functor.h"
+#include "paddle/phi/kernels/impl/reduce_grad.h"
+#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h"
+#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h"
+#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h"
 namespace phi {
-struct SumGradFunctor {
-  template <typename DeviceContext,
-            typename X,
-            typename Y,
-            typename DX,
-            typename DY,
-            typename Dim>
-  void operator()(const DeviceContext& place,
-                  X* x,
-                  Y* y,
-                  DX* dx,
-                  DY* dy,
-                  const Dim& dim,
-                  int size) {
-    dx->device(place) = dy->broadcast(dim);
-  }
-};
 template <typename T, typename Context>
 void ComputeFromInput(const Context& dev_ctx,
                      const DenseTensor& x,
@@ -111,16 +97,38 @@ void ReduceSumGradKernel(const Context& dev_ctx,
    }
  }
-  ReduceGradKernel<Context, T, SumGradFunctor, true>(dev_ctx,
+  ReduceGradKernel<Context, T, funcs::SumGradFunctor, true>(dev_ctx,
-                                                     x,
+                                                            x,
-                                                     out_grad,
+                                                            out_grad,
-                                                     paddle::none,
+                                                            paddle::none,
-                                                     dims,
+                                                            dims,
-                                                     keep_dim,
+                                                            keep_dim,
-                                                     reduce_all,
+                                                            reduce_all,
-                                                     in_dtype,
+                                                            in_dtype,
-                                                     out_dtype,
+                                                            out_dtype,
-                                                     x_grad);
+                                                            x_grad);
+}
+template <typename T, typename Context>
+void ReduceMeanGradKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& out_grad,
+                          const std::vector<int64_t>& dims,
+                          bool keep_dim,
+                          bool reduce_all,
+                          DataType in_dtype,
+                          DataType out_dtype,
+                          DenseTensor* x_grad) {
+  ReduceGradKernel<Context, T, funcs::MeanGradFunctor, true>(dev_ctx,
+                                                             x,
+                                                             out_grad,
+                                                             paddle::none,
+                                                             dims,
+                                                             keep_dim,
+                                                             reduce_all,
+                                                             in_dtype,
+                                                             out_dtype,
+                                                             x_grad);
 }
 }  // namespace phi
@@ -137,3 +145,38 @@ PD_REGISTER_KERNEL(sum_grad,
                   int64_t,
                   phi::dtype::complex<float>,
                   phi::dtype::complex<double>) {}
+PD_REGISTER_KERNEL(mean_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ReduceMeanGradKernel,
+                   bool,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(prod_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ReduceProdGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+PD_REGISTER_KERNEL(max_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ReduceMaxGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+PD_REGISTER_KERNEL(min_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ReduceMinGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
--- a/paddle/phi/kernels/funcs/reduce_functor.h
+++ b/paddle/phi/kernels/funcs/reduce_functor.h
@@ -73,5 +73,82 @@ struct AnyFunctor {
  }
 };
+struct MeanGradFunctor {
+  template <typename DeviceContext,
+            typename X,
+            typename Y,
+            typename DX,
+            typename DY,
+            typename Dim>
+  void operator()(const DeviceContext& place,
+                  X* x,
+                  Y* y,
+                  DX* dx,
+                  DY* dy,
+                  const Dim& dim,
+                  int size) {
+    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
+  }
+};
+struct SumGradFunctor {
+  template <typename DeviceContext,
+            typename X,
+            typename Y,
+            typename DX,
+            typename DY,
+            typename Dim>
+  void operator()(const DeviceContext& place,
+                  X* x,
+                  Y* y,
+                  DX* dx,
+                  DY* dy,
+                  const Dim& dim,
+                  int size) {
+    dx->device(place) = dy->broadcast(dim);
+  }
+};
+struct ProdGradFunctor {
+  template <typename DeviceContext,
+            typename X,
+            typename Y,
+            typename DX,
+            typename DY,
+            typename Dim>
+  void operator()(const DeviceContext& place,
+                  X* x,
+                  Y* y,
+                  DX* dx,
+                  DY* dy,
+                  const Dim& dim,
+                  int size) {
+    dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
+  }
+};
+struct MaxOrMinGradFunctor {
+  template <typename DeviceContext,
+            typename X,
+            typename Y,
+            typename DX,
+            typename DY,
+            typename Dim>
+  void operator()(const DeviceContext& place,
+                  X* x,
+                  Y* y,
+                  DX* dx,
+                  DY* dy,
+                  const Dim& dim,
+                  int size) {
+    auto equals = (*x) == y->broadcast(dim);
+    auto ones = dx->constant(1);
+    auto zeros = dx->constant(0);
+    // If there are multiple minimum or maximum elements, the subgradient of
+    // each is the set [0, 1], and we pass gradient to all of them here.
+    dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
+  }
+};
 }  // namespace funcs
 }  // namespace phi
--- a/paddle/phi/kernels/funcs/reduce_grad_functions.h
+++ b/paddle/phi/kernels/funcs/reduce_grad_functions.h
@@ -41,14 +41,14 @@ void ReduceGradFunctor(const Context& dev_ctx,
  Eigen::array<int, D> broadcast_dim;
  for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
-  int broad_cats_times = 1;
+  int broad_cast_times = 1;
  for (size_t i = 0; i < dims_ref.size(); ++i) {
    if (dims_ref[i] < 0) {
      dims_ref[i] = x_rank + dims_ref[i];
    }
    reduced_dims_v[dims_ref[i]] = 1;
    broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]];
-    broad_cats_times *= x_dims[dims_ref[i]];
+    broad_cast_times *= x_dims[dims_ref[i]];
  }
  auto reduced_dims = phi::make_ddim(reduced_dims_v);
  auto x_reduce = EigenTensor<T, D>::From(input1, reduced_dims);
@@ -62,7 +62,7 @@ void ReduceGradFunctor(const Context& dev_ctx,
          &x_grad,
          &x_reduce_grad,
          broadcast_dim,
-          broad_cats_times);
+          broad_cast_times);
 }
 inline void GetOriginDimFromShuffled(const DDim& src_dim,

--- a/paddle/phi/kernels/gpu/reduce_grad.h
+++ b/paddle/phi/kernels/gpu/reduce_grad.h
@@ -43,5 +43,59 @@ void ReduceGrad(const GPUContext& dev_ctx,
      }));
 }
+template <typename T,
+          typename Context,
+          template <typename, typename> class TransformOp>
+void ReduceGradKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& out_grad,
+                      const std::vector<int64_t>& dims,
+                      bool keep_dim,
+                      bool reduce_all,
+                      DataType in_dtype,
+                      DataType out_dtype,
+                      DenseTensor* x_grad) {
+  auto* in_x = &x;
+  auto* d_out = &out_grad;
+  auto* d_x = x_grad;
+  auto pt_out_dtype = in_dtype;
+  // get reduce_dim and reduce_num for reduce_mean_grad
+  int dim_size = in_x->dims().size();
+  std::vector<int> reduce_dims =
+      funcs::details::GetReduceDim(dims, dim_size, reduce_all);
+  auto update_dims = vectorize(d_x->dims());
+  int reduce_num = 1;
+  for (auto i : reduce_dims) {
+    reduce_num *= (in_x->dims())[i];
+    update_dims[i] = 1;
+  }
+  // make new tensor
+  DenseTensor new_d_out(d_out->dtype());
+  new_d_out.ShareDataWith(*d_out);
+  new_d_out.Resize(phi::make_ddim(update_dims));
+  if (in_dtype != DataType::UNDEFINED) {
+    dev_ctx.Alloc(d_x, in_dtype);
+  } else {
+    dev_ctx.Alloc(d_x, d_out->dtype());
+  }
+  auto pt_d_out = new_d_out;
+  auto pt_d_x = *d_x;
+  if (in_dtype == DataType::UNDEFINED) {
+    pt_out_dtype = d_out->dtype();
+  }
+  using MPType = typename kps::details::MPTypeTrait<T>::Type;
+  phi::ReduceGrad<T, TransformOp<T, MPType>>(
+      dev_ctx,
+      &pt_d_out,
+      &pt_d_x,
+      pt_out_dtype,
+      TransformOp<T, MPType>(reduce_num));
+}
 }  // namespace phi
 #endif
--- a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu
@@ -12,12 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/phi/kernels/reduce_sum_grad_kernel.h"
+#include "paddle/phi/kernels/reduce_grad_kernel.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/gpu/reduce_grad.h"
+#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h"
+#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h"
+#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h"
 namespace phi {
@@ -31,46 +34,36 @@ void ReduceSumGradKernel(const Context& dev_ctx,
                         DataType in_dtype,
                         DataType out_dtype,
                         DenseTensor* x_grad) {
-  auto* in_x = &x;
+  ReduceGradKernel<T, Context, kps::IdentityFunctor>(dev_ctx,
-  auto* d_out = &out_grad;
+                                                     x,
-  auto* d_x = x_grad;
+                                                     out_grad,
+                                                     dims,
-  auto pt_out_dtype = in_dtype;
+                                                     keep_dim,
+                                                     reduce_all,
-  // get reduce_dim and reduce_num for reduce_mean_grad
+                                                     in_dtype,
-  int dim_size = in_x->dims().size();
+                                                     out_dtype,
-  std::vector<int> reduce_dims =
+                                                     x_grad);
-      funcs::details::GetReduceDim(dims, dim_size, reduce_all);
+}
-  auto update_dims = vectorize(d_x->dims());
-  int reduce_num = 1;
-  for (auto i : reduce_dims) {
-    reduce_num *= (in_x->dims())[i];
-    update_dims[i] = 1;
-  }
-  // make new tensor
-  DenseTensor new_d_out(d_out->dtype());
-  new_d_out.ShareDataWith(*d_out);
-  new_d_out.Resize(phi::make_ddim(update_dims));
-  if (in_dtype != DataType::UNDEFINED) {
-    dev_ctx.Alloc(d_x, in_dtype);
-  } else {
-    dev_ctx.Alloc(d_x, d_out->dtype());
-  }
-  auto pt_d_out = new_d_out;
-  auto pt_d_x = *d_x;
-  if (in_dtype == DataType::UNDEFINED) {
-    pt_out_dtype = d_out->dtype();
-  }
-  using MPType = typename kps::details::MPTypeTrait<T>::Type;
-  phi::ReduceGrad<T, kps::IdentityFunctor<T, MPType>>(
+template <typename T, typename Context>
-      dev_ctx,
+void ReduceMeanGradKernel(const Context& dev_ctx,
-      &pt_d_out,
+                          const DenseTensor& x,
-      &pt_d_x,
+                          const DenseTensor& out_grad,
-      pt_out_dtype,
+                          const std::vector<int64_t>& dims,
-      kps::IdentityFunctor<T, MPType>(reduce_num));
+                          bool keep_dim,
+                          bool reduce_all,
+                          DataType in_dtype,
+                          DataType out_dtype,
+                          DenseTensor* x_grad) {
+  ReduceGradKernel<T, Context, kps::DivideFunctor>(dev_ctx,
+                                                   x,
+                                                   out_grad,
+                                                   dims,
+                                                   keep_dim,
+                                                   reduce_all,
+                                                   in_dtype,
+                                                   out_dtype,
+                                                   x_grad);
 }
 }  // namespace phi
@@ -88,3 +81,39 @@ PD_REGISTER_KERNEL(sum_grad,
                   int64_t,
                   phi::dtype::complex<float>,
                   phi::dtype::complex<double>) {}
+PD_REGISTER_KERNEL(mean_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReduceMeanGradKernel,
+                   bool,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(prod_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReduceProdGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+PD_REGISTER_KERNEL(max_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReduceMaxGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+PD_REGISTER_KERNEL(min_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReduceMinGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
--- a/paddle/phi/kernels/cpu/reduce_grad.h
+++ b/paddle/phi/kernels/cpu/reduce_grad.h
--- a/paddle/phi/kernels/reduce_sum_grad_kernel.h
+++ b/paddle/phi/kernels/reduce_sum_grad_kernel.h
@@ -14,19 +14,34 @@
 #pragma once
-#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/kernels/reduce_grad_kernel.h"
-#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/reduce_functor.h"
+#include "paddle/phi/kernels/impl/reduce_grad.h"
 namespace phi {
 template <typename T, typename Context>
-void ReduceSumGradKernel(const Context& dev_ctx,
+void ReduceMaxGradKernel(const Context& dev_ctx,
                         const DenseTensor& x,
                         const DenseTensor& out_grad,
+                         const DenseTensor& out,
                         const std::vector<int64_t>& dims,
                         bool keep_dim,
                         bool reduce_all,
                         DataType in_dtype,
                         DataType out_dtype,
-                         DenseTensor* x_grad);
+                         DenseTensor* x_grad) {
+  ReduceGradKernel<Context, T, funcs::MaxOrMinGradFunctor>(dev_ctx,
+                                                           x,
+                                                           out_grad,
+                                                           out,
+                                                           dims,
+                                                           keep_dim,
+                                                           reduce_all,
+                                                           in_dtype,
+                                                           out_dtype,
+                                                           x_grad);
+}
 }  // namespace phi
--- a/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/phi/kernels/reduce_grad_kernel.h"
+#include "paddle/phi/kernels/funcs/reduce_functor.h"
+#include "paddle/phi/kernels/impl/reduce_grad.h"
+namespace phi {
+template <typename T, typename Context>
+void ReduceMinGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out_grad,
+                         const DenseTensor& out,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DataType in_dtype,
+                         DataType out_dtype,
+                         DenseTensor* x_grad) {
+  ReduceGradKernel<Context, T, funcs::MaxOrMinGradFunctor>(dev_ctx,
+                                                           x,
+                                                           out_grad,
+                                                           out,
+                                                           dims,
+                                                           keep_dim,
+                                                           reduce_all,
+                                                           in_dtype,
+                                                           out_dtype,
+                                                           x_grad);
+}
+}  // namespace phi
--- a/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/phi/kernels/reduce_grad_kernel.h"
+#include "paddle/phi/kernels/funcs/reduce_functor.h"
+#include "paddle/phi/kernels/impl/reduce_grad.h"
+namespace phi {
+template <typename T, typename Context>
+void ReduceProdGradKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& out_grad,
+                          const DenseTensor& out,
+                          const std::vector<int64_t>& dims,
+                          bool keep_dim,
+                          bool reduce_all,
+                          DataType in_dtype,
+                          DataType out_dtype,
+                          DenseTensor* x_grad) {
+  ReduceGradKernel<Context, T, funcs::ProdGradFunctor>(dev_ctx,
+                                                       x,
+                                                       out_grad,
+                                                       out,
+                                                       dims,
+                                                       keep_dim,
+                                                       reduce_all,
+                                                       in_dtype,
+                                                       out_dtype,
+                                                       x_grad);
+}
+}  // namespace phi
--- a/paddle/phi/kernels/reduce_grad_kernel.h
+++ b/paddle/phi/kernels/reduce_grad_kernel.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/core/dense_tensor.h"
+namespace phi {
+template <typename T, typename Context>
+void ReduceSumGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out_grad,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DataType in_dtype,
+                         DataType out_dtype,
+                         DenseTensor* x_grad);
+template <typename T, typename Context>
+void ReduceMeanGradKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& out_grad,
+                          const std::vector<int64_t>& dims,
+                          bool keep_dim,
+                          bool reduce_all,
+                          DataType in_dtype,
+                          DataType out_dtype,
+                          DenseTensor* x_grad);
+template <typename T, typename Context>
+void ReduceProdGradKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& out_grad,
+                          const DenseTensor& out,
+                          const std::vector<int64_t>& dims,
+                          bool keep_dim,
+                          bool reduce_all,
+                          DataType in_dtype,
+                          DataType out_dtype,
+                          DenseTensor* x_grad);
+template <typename T, typename Context>
+void ReduceMaxGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out_grad,
+                         const DenseTensor& out,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DataType in_dtype,
+                         DataType out_dtype,
+                         DenseTensor* x_grad);
+template <typename T, typename Context>
+void ReduceMinGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out_grad,
+                         const DenseTensor& out,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DataType in_dtype,
+                         DataType out_dtype,
+                         DenseTensor* x_grad);
+}  // namespace phi
--- a/paddle/phi/kernels/reduce_kernel.h
+++ b/paddle/phi/kernels/reduce_kernel.h
@@ -16,7 +16,6 @@
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/infermeta/unary.h"
-#include "paddle/phi/kernels/empty_kernel.h"
 namespace phi {
 template <typename T, typename Context>

--- a/paddle/phi/ops/compat/reduce_sig.cc
+++ b/paddle/phi/ops/compat/reduce_sig.cc
@@ -136,6 +136,42 @@ KernelSignature ReduceSumGradOpArgumentMapping(
      {GradVarName("X")});
 }
+KernelSignature ReduceMeanGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "mean_grad",
+      {"X", GradVarName("Out")},
+      {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
+      {GradVarName("X")});
+}
+KernelSignature ReduceMaxGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "max_grad",
+      {"X", GradVarName("Out"), "Out"},
+      {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
+      {GradVarName("X")});
+}
+KernelSignature ReduceMinGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "min_grad",
+      {"X", GradVarName("Out"), "Out"},
+      {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
+      {GradVarName("X")});
+}
+KernelSignature ReduceProdGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "prod_grad",
+      {"X", GradVarName("Out"), "Out"},
+      {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
+      {GradVarName("X")});
+}
 }  // namespace phi
 PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum);
@@ -147,6 +183,10 @@ PD_REGISTER_BASE_KERNEL_NAME(reduce_all, all);
 PD_REGISTER_BASE_KERNEL_NAME(reduce_any, any);
 PD_REGISTER_BASE_KERNEL_NAME(reduce_sum_grad, sum_grad);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_mean_grad, mean_grad);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_prod_grad, prod_grad);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_max_grad, max_grad);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_min_grad, min_grad);
 PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping);
@@ -158,3 +198,11 @@ PD_REGISTER_ARG_MAPPING_FN(reduce_any, phi::ReduceAnyOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(reduce_sum_grad,
                           phi::ReduceSumGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_mean_grad,
+                           phi::ReduceMeanGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_prod_grad,
+                           phi::ReduceProdGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_max_grad,
+                           phi::ReduceMaxGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_min_grad,
+                           phi::ReduceMinGradOpArgumentMapping);