diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu
index 18c846bc2b4699ab0fd7b91fe43d1c9f3fcd1c14..ed6df1e558bed673e495a4fd455049dad08fc5ee 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu
@@ -12,15 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
 
-REGISTER_OP_CUDA_KERNEL(
-    reduce_amax_grad,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
+template <typename T>
+using CUDAReduceMaxGradKernel =
+    ops::ReduceCudaAMaxAMinGradKernel<T, kps::IdentityFunctor>;
+REGISTER_OP_CUDA_KERNEL(reduce_amax_grad, CUDAReduceMaxGradKernel<int>,
+                        CUDAReduceMaxGradKernel<int64_t>,
+                        CUDAReduceMaxGradKernel<float>,
+                        CUDAReduceMaxGradKernel<double>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu
index c7a26049634ce685cde29fbb7d3c77d72b4ecc22..69854da3c4f2590eb3f148f3674daf06e900d1f8 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu
@@ -12,15 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
 
-REGISTER_OP_CUDA_KERNEL(
-    reduce_amin_grad,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
+template <typename T>
+using CUDAReduceMinGradKernel =
+    ops::ReduceCudaAMaxAMinGradKernel<T, kps::IdentityFunctor>;
+REGISTER_OP_CUDA_KERNEL(reduce_amin_grad, CUDAReduceMinGradKernel<int>,
+                        CUDAReduceMinGradKernel<int64_t>,
+                        CUDAReduceMinGradKernel<float>,
+                        CUDAReduceMinGradKernel<double>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index 322ef1fdff67abd861c6603c3e7c4fc6b5d19f39..ff7429f75ebe3a02e3f75083a1c70240a0de837a 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -24,7 +24,6 @@ limitations under the License.
*/ #include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" #include "paddle/phi/kernels/funcs/math_function.h" - // only can include the headers in paddle/phi/api dirs #include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" @@ -655,6 +654,7 @@ class ReduceCudaGradKernel : public framework::OpKernel { bool reduce_all = context.Attr("reduce_all"); std::vector dims = context.Attr>("dim"); auto* in_x = context.Input("X"); + auto* d_out = context.Input(framework::GradVarName("Out")); auto* d_x = context.Output(framework::GradVarName("X")); @@ -685,12 +685,106 @@ class ReduceCudaGradKernel : public framework::OpKernel { if (out_dtype <= 0) { pt_out_dtype = d_out->dtype(); } + using MPType = typename kps::details::MPTypeTrait::Type; phi::ReduceGrad>( dev_ctx, pt_d_out.get(), pt_d_x.get(), pt_out_dtype, TransformOp(reduce_num)); } }; + +template +struct EqualFunctor { + inline T initial() { return static_cast(0.0f); } + + inline HOSTDEVICE T operator()(const T a, const T b) const { + return static_cast(a == b); + } +}; + +template +struct DivideFunctor { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } +}; + +template class TransformOp> +class ReduceCudaAMaxAMinGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + std::vector dims = context.Attr>("dim"); + auto* in_x = context.Input("X"); + auto* out_y = context.Input("Out"); + auto* d_out = + context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + auto out_dtype = context.Attr("in_dtype"); + auto pt_out_dtype = framework::TransToPhiDataType( + static_cast(out_dtype)); + // get reduce_dim and reduce_num for reduce_mean_grad + int dim_size = in_x->dims().size(); + std::vector reduce_dims = GetReduceDim(dims, dim_size, reduce_all); + auto update_dims = vectorize(d_x->dims()); + int reduce_num = 1; + for (auto i : reduce_dims) { + reduce_num *= (in_x->dims())[i]; + update_dims[i] = 1; + } + auto& dev_ctx = context.cuda_device_context(); + + // make new tensor reduce_out + phi::DenseTensor new_y(out_y->type()); + new_y.ShareDataWith(*out_y); + new_y.Resize(phi::make_ddim(update_dims)); + + // make new tensor d_out + phi::DenseTensor new_dout(d_out->type()); + new_dout.ShareDataWith(*d_out); + new_dout.Resize(phi::make_ddim(update_dims)); + d_x->mutable_data(dev_ctx.GetPlace(), d_out->dtype()); + + auto new_in = paddle::experimental::MakePhiDenseTensor(*in_x); + auto new_in_tensor = new_in.get(); + + auto new_dx = paddle::experimental::MakePhiDenseTensor(*d_x); + auto new_dx_tensor = new_dx.get(); + + // make equal_out + phi::DenseTensor* equal_out = new phi::DenseTensor(); + equal_out->Resize(in_x->dims()); + dev_ctx.template Alloc(equal_out); + auto equal_out_tensor = *equal_out; + + // make new tensor equal_count + phi::DenseTensor* equal_count = new phi::DenseTensor(); + equal_count->Resize(phi::make_ddim(update_dims)); + dev_ctx.template Alloc(equal_count); + + // compute + // 1. equal_out = Equal(x, y) + std::vector equal_inputs = {&new_y, new_in_tensor}; + std::vector equal_outputs = {&equal_out_tensor}; + phi::funcs::BroadcastKernel( + dev_ctx, equal_inputs, &equal_outputs, 0, EqualFunctor()); + // 2. 
+    using MPType = typename kps::details::MPTypeTrait<T>::Type;
+    phi::funcs::ReduceKernel<T, T, kps::AddFunctor,
+                             kps::IdentityFunctor<T, MPType>>(
+        dev_ctx, equal_out_tensor, equal_count,
+        kps::IdentityFunctor<T, MPType>(), reduce_dims, false);
+
+    // 3. dx = Div(equal_out, equal_count)
+    std::vector<const phi::DenseTensor*> grad_inputs = {&equal_out_tensor,
+                                                        equal_count};
+    std::vector<phi::DenseTensor*> grad_outputs = {new_dx_tensor};
+    phi::funcs::BroadcastKernel<phi::ElementwiseType::kBinary, T, T>(
+        dev_ctx, grad_inputs, &grad_outputs, 0, DivideFunctor<T>());
+    delete equal_out;
+    delete equal_count;
+  }
+};
 
 #endif
 #endif
diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h
index 88b87c07c7615ccef3a20e3441854bbb6b940394..74e48f39185485fac9d55e778645686955b6d606 100644
--- a/paddle/phi/kernels/funcs/broadcast_function.h
+++ b/paddle/phi/kernels/funcs/broadcast_function.h
@@ -605,7 +605,22 @@ void ElementwiseCompute(const GPUContext &dev_ctx,
       dev_ctx, ins, &outs, axis, func);
 }
 
-#endif
+template <typename DeviceContext,
+          typename Functor,
+          typename InverseFunctor,
+          typename T,
+          typename OutType = T>
+void DefaultElementwiseOperator(const DeviceContext &dev_ctx,
+                                const DenseTensor &x,
+                                const DenseTensor &y,
+                                DenseTensor *z,
+                                int axis = -1) {
+  auto x_dims = x.dims();
+  auto y_dims = y.dims();
+  dev_ctx.template Alloc<OutType>(z);
+  funcs::ElementwiseCompute<Functor, T, OutType>(dev_ctx, x, y, axis, Functor(), z);
+}
+
+#else
 
 template <typename DeviceContext,
           typename Functor,
diff --git a/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu b/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu
--- a/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu
+template <typename T, typename Context>
+void FrobeniusNormKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<T, kps::AddFunctor, kps::SquareFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+  std::vector<const DenseTensor*> ins = {out};
+  std::vector<DenseTensor*> outs = {out};
+  auto functor = funcs::CudaSqrtFunctor<T>();
+  funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
+}
+
+}  // namespace phi
 
 PD_REGISTER_KERNEL(
     frobenius_norm, GPU, ALL_LAYOUT, phi::FrobeniusNormKernel, float, double) {}
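
A quick standalone check of the gradient rule that ReduceCudaAMaxAMinGradKernel composes out of BroadcastKernel/ReduceKernel calls above: every element tied for the extreme value receives an equal share of the upstream gradient. This is a minimal host-side sketch with made-up data, not Paddle code; it also folds in the multiplication by dout that the chain rule requires.

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // x has a tied maximum (two 3.0 entries); reduce over the whole array.
  std::vector<float> x = {1.0f, 3.0f, 2.0f, 3.0f};
  float y = 3.0f;     // forward output: amax(x)
  float dout = 1.0f;  // upstream gradient d(loss)/d(y)

  // 1. equal_out = Equal(x, y): mask the elements equal to the extreme.
  std::vector<float> equal_out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i)
    equal_out[i] = (x[i] == y) ? 1.0f : 0.0f;

  // 2. equal_count = reduceSum(equal_out): how many elements are tied (2).
  float equal_count = 0.0f;
  for (float e : equal_out) equal_count += e;

  // 3. dx = dout * equal_out / equal_count: the shares sum to dout.
  for (std::size_t i = 0; i < x.size(); ++i)
    std::printf("dx[%zu] = %g\n", i, dout * equal_out[i] / equal_count);
  // Prints 0, 0.5, 0, 0.5.
  return 0;
}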
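
The #if/#else split around the new DefaultElementwiseOperator suggests the GPU broadcast path accepts its operands in either order, while the pre-existing CPU overload (whose template header survives as trailing context above) must put the higher-rank operand first and therefore falls back to an InverseFunctor with the argument order reversed. A standalone sketch of that inverse-functor trick; the Divide/InverseDivide functors and the Compute helper here are ours for illustration, not Paddle's:

#include <cstdio>

struct Divide {
  float operator()(float a, float b) const { return a / b; }
};
struct InverseDivide {  // the same operation with its arguments swapped
  float operator()(float a, float b) const { return b / a; }
};

// Stand-in for ElementwiseCompute: the first operand plays the role of the
// higher-rank tensor that the real CPU API insists on.
template <typename F>
float Compute(float hi, float lo, F f) {
  return f(hi, lo);
}

int main() {
  float x = 1.0f, y = 8.0f;
  // Computing x / y when x may arrive first (GPU path) or second (CPU path
  // after the rank-based swap): both orders agree.
  std::printf("%g\n", Compute(x, y, Divide()));         // 0.125
  std::printf("%g\n", Compute(y, x, InverseDivide()));  // 0.125
  return 0;
}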
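
The new FrobeniusNormKernel expresses ||x||_F = sqrt(sum_i x_i^2) as a squared-sum reduction (phi::Reduce with kps::SquareFunctor) followed by an in-place elementwise sqrt on the reduced tensor. A standalone arithmetic check, plain C++ rather than Paddle code:

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> x = {3.0f, 4.0f, 12.0f};
  float out = 0.0f;
  for (float v : x) out += v * v;  // reduce-sum of squares: 9 + 16 + 144
  out = std::sqrt(out);            // elementwise sqrt applied to "out"
  std::printf("frobenius_norm = %g\n", out);  // sqrt(169) = 13
  return 0;
}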