diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 4231c75748167a196a9b929bc3ef65548b191b4a..e7f18bdc88a110583f499a987fdda223d9d0663f 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -54,7 +54,7 @@ USE_OP(sum); USE_OP_ITSELF(slice_grad); USE_OP_ITSELF(lookup_table_grad); USE_OP(sqrt); -USE_OP(elementwise_max); +USE_OP_ITSELF(elementwise_max); USE_OP_ITSELF(elementwise_div); USE_OP_ITSELF(sgd); USE_OP(squared_l2_norm); diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 54931d99292f9d1453e2a3deb72e75ed63c9f46f..5dfb7eece9ec77b3f10196654e976e335af890e9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -70,75 +70,29 @@ struct InverseFloorDivFunctor { // Maximum template -struct MaxFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { - return a > b ? a : b; - } -}; +using MaxFunctor = phi::funcs::MaximumFunctor; // Minmum template -struct MinFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { - return a < b ? a : b; - } -}; +using MinFunctor = phi::funcs::MinimumFunctor; template using Complex = paddle::platform::complex; +// Ternary compare template -struct MinGradXFunctor { - inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { - return dout * static_cast(x < y); - } -}; +using MaxGradXFunctor = phi::funcs::MaxGradXFunctor; template -struct MinGradYFunctor { - inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { - return dout * static_cast(x >= y); - } -}; - +using MaxGradYFunctor = phi::funcs::MaxGradYFunctor; template -struct MinGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT x, const InT y, - const InT dout) { - phi::Array outs; - // dx = dout * (x < y) - outs[0] = static_cast(dout * static_cast(x < y)); - // dy = dout * (x >= y) - outs[1] = static_cast(dout * static_cast(x >= y)); - return outs; - } -}; +using MaxGradXYFunctor = phi::funcs::MaxGradXYFunctor; -// Ternary compare template -struct MaxGradXFunctor { - inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { - return dout * static_cast(x > y); - } -}; +using MinGradXFunctor = phi::funcs::MinGradXFunctor; template -struct MaxGradYFunctor { - inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { - return dout * static_cast(x <= y); - } -}; - +using MinGradYFunctor = phi::funcs::MinGradYFunctor; template -struct MaxGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT x, const InT y, - const InT dout) { - phi::Array outs; - // dx = dout * (x > y) - outs[0] = static_cast(dout * static_cast(x > y)); - // dy = dout * (x <= y) - outs[1] = static_cast(dout * static_cast(x <= y)); - return outs; - } -}; +using MinGradXYFunctor = phi::funcs::MinGradXYFunctor; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc index d91315cc511aa80c0e9c44ccc688b2746eac764e..f11c22f24555a4bae1d275766f964d4cb5388817 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/elementwise/elementwise_max_op.h" - #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" @@ -119,23 +117,6 @@ REGISTER_OPERATOR(elementwise_max, ops::ElementwiseOp, REGISTER_OPERATOR(elementwise_max_grad, ops::ElementwiseOpGrad); -REGISTER_OP_CPU_KERNEL( - elementwise_max, - ops::ElementwiseMaxKernel, - ops::ElementwiseMaxKernel, - ops::ElementwiseMaxKernel, - ops::ElementwiseMaxKernel, - ops::ElementwiseMaxKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_max_grad, - ops::ElementwiseMaxGradKernel, - ops::ElementwiseMaxGradKernel, - ops::ElementwiseMaxGradKernel, - ops::ElementwiseMaxGradKernel, - ops::ElementwiseMaxGradKernel); - REGISTER_OP_VERSION(elementwise_max) .AddCheckpoint( R"ROC(Register elementwise_max for adding the attribute of Scale_y)ROC", diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu deleted file mode 100644 index 0d5f56fda17322d86ef13990e9fc2432816dc9cb..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/elementwise/elementwise_max_op.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseMaxKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - std::vector ins; - std::vector outs; - const auto& dev_ctx = - ctx.template device_context(); - - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, axis, - MaxFunctor()); - } -}; - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMaxGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = - ctx.template device_context(); - const auto place = ctx.GetPlace(); - if (dx != nullptr && dy != nullptr) { - std::vector ins = {x, y, dout}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, MaxGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {x, y, dout}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dx, MaxGradXFunctor()); - } else if (dx == nullptr && dy != nullptr) { - std::vector ins = {x, y, dout}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dy, MaxGradYFunctor()); - } -} - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - elementwise_max, - ops::ElementwiseMaxKernel, - ops::ElementwiseMaxKernel, - ops::ElementwiseMaxKernel, - ops::ElementwiseMaxKernel, - ops::ElementwiseMaxKernel, - ops::ElementwiseMaxKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_max_grad, - ops::ElementwiseMaxGradKernel, - ops::ElementwiseMaxGradKernel, - ops::ElementwiseMaxGradKernel, - ops::ElementwiseMaxGradKernel, - ops::ElementwiseMaxGradKernel, - ops::ElementwiseMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.h b/paddle/fluid/operators/elementwise/elementwise_max_op.h deleted file mode 100644 index afe1073d89a06618af95490ac6d264073bd930d4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "paddle/fluid/operators/elementwise/elementwise_op.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - MaxFunctor(), z); - } -}; - -template -struct MaxGradDx { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast(x > y); - } -}; - -template -struct MaxGradDy { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast(x <= y); - } -}; - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMaxGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, MaxGradDy>( - ctx, *x, *y, *out, *dout, axis, dx, dy, MaxGradDx(), MaxGradDy()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMaxGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseMaxGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = dout; // out is not necessary - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseMaxGrad(ctx, x, y, out, dout, dx, dy); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc index 87a0264c3ce7b219bb437c8187aaa19a3c30150f..176d1328192eaf2c3c582c5d68a0acde3d07b770 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/elementwise/elementwise_max_op.h" #include "paddle/fluid/operators/elementwise/elementwise_npu.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_max_op_xpu.cc index 9fb3c9d493dc04a3dda6bd4a25c0d759e0b44c20..f8cc6d2b985be30ee9b2a2e98680b55c888ff08f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op_xpu.cc @@ -14,7 +14,6 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/elementwise/elementwise_max_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_xpu.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc index dad80a2c33f3abfde457a6d750f89e47374fae13..5f552460efd42c9a95c57ad576890ce798dbd0a0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/elementwise/elementwise_min_op.h" - #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" @@ -119,19 +117,6 @@ REGISTER_OPERATOR(elementwise_min, ops::ElementwiseOp, REGISTER_OPERATOR(elementwise_min_grad, ops::ElementwiseOpGrad); -REGISTER_OP_CPU_KERNEL( - elementwise_min, - ops::ElementwiseMinKernel, - ops::ElementwiseMinKernel, - ops::ElementwiseMinKernel, - ops::ElementwiseMinKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_min_grad, - ops::ElementwiseMinGradKernel, - ops::ElementwiseMinGradKernel, - ops::ElementwiseMinGradKernel, - ops::ElementwiseMinGradKernel); - REGISTER_OP_VERSION(elementwise_min) .AddCheckpoint( R"ROC(Register elementwise_min for adding the attribute of Scale_y)ROC", diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu deleted file mode 100644 index fb8bc9ac7f83c8dd99e40685acc68eec4c77b3ce..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/elementwise/elementwise_min_op.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseMinKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - std::vector ins; - std::vector outs; - const auto& dev_ctx = - ctx.template device_context(); - - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, axis, - MinFunctor()); - } -}; - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMinGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = - ctx.template device_context(); - const auto place = ctx.GetPlace(); - if (dx != nullptr && dy != nullptr) { - std::vector ins = {x, y, dout}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, MinGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {x, y, dout}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dx, MinGradXFunctor()); - } else if (dx == nullptr && dy != nullptr) { - std::vector ins = {x, y, dout}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dy, MinGradYFunctor()); - } -} - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - elementwise_min, - ops::ElementwiseMinKernel, - ops::ElementwiseMinKernel, - ops::ElementwiseMinKernel, - ops::ElementwiseMinKernel, - ops::ElementwiseMinKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_min_grad, - ops::ElementwiseMinGradKernel, - ops::ElementwiseMinGradKernel, - ops::ElementwiseMinGradKernel, - ops::ElementwiseMinGradKernel, - ops::ElementwiseMinGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h deleted file mode 100644 index 283ad2adde978680d4d0c3a579d55e588368a28e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.h +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "paddle/fluid/operators/elementwise/elementwise_op.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseMinKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - MinFunctor(), z); - } -}; - -template -struct MinGradDx { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * (x < y); - } -}; - -template -struct MinGradDy { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * (x >= y); - } -}; - -#ifdef PADDLE_CUDA_FP16 -template <> -struct MinGradDx { - HOSTDEVICE platform::float16 operator()(platform::float16 x, - platform::float16 y, - platform::float16 out, - platform::float16 dout) const { - return x < y ? dout : static_cast(0); - } -}; - -template <> -struct MinGradDy { - HOSTDEVICE platform::float16 operator()(platform::float16 x, - platform::float16 y, - platform::float16 out, - platform::float16 dout) const { - return x >= y ? dout : static_cast(0); - } -}; -#endif - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMinGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, MinGradDy>( - ctx, *x, *y, *out, *dout, axis, dx, dy, MinGradDx(), MinGradDy()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMinGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseMinGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* out = dout; // Fake out, not used - ElementwiseMinGrad(ctx, x, y, out, dout, dx, dy); - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc index d790a8f6969b5a1734f1ba8012ef916b0dde57ed..fa94f85ec6a31243ce2b2dc2a1497e98d263dbd2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc @@ -16,7 +16,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/elementwise/elementwise_min_op.h" #include "paddle/fluid/operators/elementwise/elementwise_npu.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_min_op_xpu.cc index 5d4ff69b06dffcf8e1391cb0bddfe4f01124c421..b501ff48c73d2c593c655727c4b34923b332c202 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op_xpu.cc @@ -14,7 +14,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/elementwise/elementwise_max_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_xpu.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc index 28cd57618ee3baddae73d93f9657881e53784c8f..e693925748807ffed1c3a7cf7207ce81b8646ef8 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h" - #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" @@ -62,13 +60,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(elementwise_mod, ops::ElementwiseOp, ops::ElementwiseModOpMaker); -REGISTER_OP_CPU_KERNEL( - elementwise_mod, - ops::ElementwiseModKernel, - ops::ElementwiseModKernel, - ops::ElementwiseModKernel, - ops::ElementwiseModKernel); - REGISTER_OP_VERSION(elementwise_mod) .AddCheckpoint( R"ROC(Register elementwise_mod for adding the attribute of Scale_y)ROC", diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu deleted file mode 100644 index 6ecff73257cdf7785b4ac45fd9ac0de581418e02..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseModKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - std::vector ins; - std::vector outs; - const auto& cuda_ctx = - ctx.template device_context(); - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - paddle::operators::LaunchElementwiseCudaKernel(cuda_ctx, ins, &outs, - axis, ModFunctor()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - elementwise_mod, ops::ElementwiseModKernel, - ops::ElementwiseModKernel, - ops::ElementwiseModKernel, - ops::ElementwiseModKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.h b/paddle/fluid/operators/elementwise/elementwise_mod_op.h deleted file mode 100644 index 2c001ceda4f1934ed16396ce0c450294e89c4070..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.h +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/operators/elementwise/elementwise_op.h" - -namespace paddle { -namespace operators { - -template -struct ModFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { - T res = a % b; - - // Accoding to #PR26732: in dividen % divsor - // remainder shall have the same sign as divsor. - if ((res != 0) && ((b ^ res) < 0)) res += b; - return res; - } -}; - -template -struct ModFunctor::value>> { - inline HOSTDEVICE T operator()(const T a, const T b) const { - T res = fmod(a, b); - - // Accoding to #PR26732: in dividen % divsor - // remainder shall have the same sign as divsor. 
- if ((res != 0) && ((res < 0) != (b < 0))) res += b; - return res; - } -}; - -template -struct InverseModFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { - T res = b % a; - if ((res != 0) && ((res < 0) != (a < 0))) res += a; - return res; - } -}; - -template -struct InverseModFunctor< - T, typename std::enable_if_t::value>> { - inline HOSTDEVICE T operator()(const T a, const T b) const { - T res = fmod(b, a); - if ((res != 0) && ((a < 0) != (res < 0))) res += a; - return res; - } -}; - -template -void elementwise_mod(const framework::ExecutionContext &ctx, - const framework::Tensor *x, const framework::Tensor *y, - framework::Tensor *z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - ModFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseModFunctor(), z); - } -} - -template -class ElementwiseModKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Y"); - auto *z = ctx.Output("Out"); - - z->mutable_data(ctx.GetPlace()); - - // dtype of x and y is int64 or int32 - elementwise_mod(ctx, x, y, z); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc index 900bd5d6c29268f3a5af95cd606698416a5787b2..049b0710145ec627990fb75481cf1667bff84134 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h" #include "paddle/fluid/operators/elementwise/elementwise_npu.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index d5b78909e9287ee0c6cf93164a19b49733a2d76d..b617064987507339743ab145ef94bdf15bdbdca2 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -135,6 +135,32 @@ void MultiplyGradKernel(const Context& dev_ctx, dev_ctx, x, y, *out, dout, axis, dx, dy, MulGradDX(), MulGradDY()); } +template +void MaximumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + phi::funcs::ElemwiseGradCompute, MaxGradDy>( + dev_ctx, x, y, dout, dout, axis, dx, dy, MaxGradDx(), MaxGradDy()); +} + +template +void MinimumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + phi::funcs::ElemwiseGradCompute, MinGradDy>( + dev_ctx, x, y, dout, dout, axis, dx, dy, MinGradDx(), MinGradDy()); +} + } // namespace phi PD_REGISTER_KERNEL(add_grad, @@ -259,6 +285,7 @@ PD_REGISTER_KERNEL(multiply_triple_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} + PD_REGISTER_KERNEL(fmax_grad, CPU, ALL_LAYOUT, @@ -276,3 +303,23 @@ PD_REGISTER_KERNEL(fmin_grad, double, int, int64_t) {} + +PD_REGISTER_KERNEL(maximum_grad, + CPU, + ALL_LAYOUT, + phi::MaximumGradKernel, + float, + double, + int, + int64_t, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(minimum_grad, + CPU, + ALL_LAYOUT, + phi::MinimumGradKernel, + float, + double, + int, + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc index 004f40ddedadf5e2609868478c7b0d4169b73a63..1de40cb9466b295e5656ae231cb754990e004195 100644 --- a/paddle/phi/kernels/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -70,6 +70,49 @@ void DivideRawKernel(const Context& dev_ctx, } } +template +void MaximumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + // allocate memory for out + dev_ctx.template Alloc(out); + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::MaximumFunctor(), out); +} + +template +void MinimumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + // allocate memory for out + dev_ctx.template Alloc(out); + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::MinimumFunctor(), out); +} + +template +void ModuloRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + // allocate memory for out + dev_ctx.template Alloc(out); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + if (x_dims.size() >= y_dims.size()) { + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::ModuloFunctor(), out); + } else { + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::InverseModuloFunctor(), out); + } +} + // Create the definition of Add DEFINE_CPU_ELEMENTWISE_OP(Add) @@ -138,3 +181,29 @@ PD_REGISTER_KERNEL(multiply_raw, complex64, complex128, 
phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(maximum_raw, + CPU, + ALL_LAYOUT, + phi::MaximumRawKernel, + float, + double, + int, + int64_t, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(minimum_raw, + CPU, + ALL_LAYOUT, + phi::MinimumRawKernel, + float, + double, + int, + int64_t, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(modulo_raw, + CPU, + ALL_LAYOUT, + phi::ModuloRawKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/elementwise_grad_kernel.h b/paddle/phi/kernels/elementwise_grad_kernel.h index fb2633cc9fcea7c619193ad964ad62247ed654dd..95832013caff5d366f264123262d3c05fb36b35f 100644 --- a/paddle/phi/kernels/elementwise_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_grad_kernel.h @@ -142,4 +142,21 @@ void ElementwiseFMinGradKernel(const Context& dev_ctx, DenseTensor* x_grad, DenseTensor* y_grad); +template +void MaximumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void MinimumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); } // namespace phi diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc index 9d10a48c9e0795d8914c0c6cfb49b7686575cfac..019d4fed5b28eaed72a370bfb51c1a75807964fd 100644 --- a/paddle/phi/kernels/elementwise_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -55,6 +55,32 @@ void MultiplyKernel(const Context& dev_ctx, MultiplyRawKernel(dev_ctx, x, y, axis, out); } +template +void MaximumKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + MaximumRawKernel(dev_ctx, x, y, axis, out); +} + +template +void MinimumKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + MinimumRawKernel(dev_ctx, x, y, axis, out); +} + +template +void ModuloKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + ModuloRawKernel(dev_ctx, x, y, axis, out); +} } // namespace phi using complex64 = ::phi::dtype::complex; @@ -105,6 +131,26 @@ PD_REGISTER_KERNEL(multiply, complex64, complex128, phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(maximum, + CPU, + ALL_LAYOUT, + phi::MaximumKernel, + float, + double, + int, + int64_t, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(minimum, + CPU, + ALL_LAYOUT, + phi::MinimumKernel, + float, + double, + int, + int64_t, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL( + modulo, CPU, ALL_LAYOUT, phi::ModuloKernel, float, double, int, int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -158,4 +204,26 @@ PD_REGISTER_KERNEL(multiply, phi::dtype::float16, complex64, complex128) {} +PD_REGISTER_KERNEL(maximum, + GPU, + ALL_LAYOUT, + phi::MaximumKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(minimum, + GPU, + ALL_LAYOUT, + phi::MinimumKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL( + modulo, GPU, ALL_LAYOUT, phi::ModuloKernel, float, double, int, int64_t) {} #endif diff --git a/paddle/phi/kernels/elementwise_kernel.h b/paddle/phi/kernels/elementwise_kernel.h index a6ba7bdac5829f88c153496c908a6e7ac14f91d2..f9c9c7f7139f3ad2298ba0ebcae026746ab450dc 100644 --- a/paddle/phi/kernels/elementwise_kernel.h 
+++ b/paddle/phi/kernels/elementwise_kernel.h @@ -85,6 +85,45 @@ void MultiplyKernel(const Context& dev_ctx, const DenseTensor& y, DenseTensor* out); +template +void MaximumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void MaximumKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +void MinimumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void MinimumKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +void ModuloRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void ModuloKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + template DenseTensor Add(const Context& dev_ctx, const DenseTensor& x, @@ -129,4 +168,36 @@ DenseTensor Multiply(const Context& dev_ctx, return dense_out; } +template +DenseTensor Maximum(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + MaximumKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +template +DenseTensor Minimum(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + MinimumKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +template +DenseTensor Modulo(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + ModuloKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} } // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index ac262fe2d571e587e3bdfa6a2d4e58bd5b865e68..1e39cf55035129cc0df92e689cb44a32aeba5562 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -422,5 +422,121 @@ struct MultiplyGradXYFunctor, ComplexType> { } }; +// Maximum +template +struct MaximumFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { + return a > b ? a : b; + } +}; + +template +struct MaxGradXFunctor { + inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { + return dout * static_cast(x > y); + } +}; + +template +struct MaxGradYFunctor { + inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { + return dout * static_cast(x <= y); + } +}; + +template +struct MaxGradXYFunctor { + inline HOSTDEVICE phi::Array operator()(const InT x, + const InT y, + const InT dout) { + phi::Array outs; + // dx = dout * (x > y) + outs[0] = static_cast(dout * static_cast(x > y)); + // dy = dout * (x <= y) + outs[1] = static_cast(dout * static_cast(x <= y)); + return outs; + } +}; + +// Minimum +template +struct MinimumFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { + return a < b ? 
a : b; + } +}; +template +struct MinGradXFunctor { + inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { + return dout * static_cast(x < y); + } +}; +template +struct MinGradYFunctor { + inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { + return dout * static_cast(x >= y); + } +}; + +template +struct MinGradXYFunctor { + inline HOSTDEVICE phi::Array operator()(const InT x, + const InT y, + const InT dout) { + phi::Array outs; + // dx = dout * (x < y) + outs[0] = static_cast(dout * static_cast(x < y)); + // dy = dout * (x >= y) + outs[1] = static_cast(dout * static_cast(x >= y)); + return outs; + } +}; + +// Modulo +template +struct ModuloFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { + T res = a % b; + + // Accoding to #PR26732: in dividen % divsor + // remainder shall have the same sign as divsor. + if ((res != 0) && ((b ^ res) < 0)) res += b; + return res; + } +}; + +template +struct ModuloFunctor< + T, + typename std::enable_if_t::value>> { + inline HOSTDEVICE T operator()(const T a, const T b) const { + T res = fmod(a, b); + + // Accoding to #PR26732: in dividen % divsor + // remainder shall have the same sign as divsor. + if ((res != 0) && ((res < 0) != (b < 0))) res += b; + return res; + } +}; + +template +struct InverseModuloFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { + T res = b % a; + if ((res != 0) && ((res < 0) != (a < 0))) res += a; + return res; + } +}; + +template +struct InverseModuloFunctor< + T, + typename std::enable_if_t::value>> { + inline HOSTDEVICE T operator()(const T a, const T b) const { + T res = fmod(b, a); + if ((res != 0) && ((a < 0) != (res < 0))) res += a; + return res; + } +}; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 3392a3cec4ecad08b0442a54c3c3dbc652ebd0b6..52819fd3dee8c5a0314f4e95e563e3f3db32a136 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -148,6 +148,67 @@ void MultiplyGradKernel(const Context& dev_ctx, ElementwiseMulGrad(dev_ctx, x, y, dout, dx, dy, axis); } +template +void MaximumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + const auto place = dev_ctx.GetPlace(); + if (dx != nullptr && dy != nullptr) { + std::vector ins = {&x, &y, &dout}; + GetGradXAndYOut( + dev_ctx, + place, + axis, + ins, + dout, + dx, + dy, + funcs::MaxGradXYFunctor()); + } else if (dx != nullptr && dy == nullptr) { + std::vector ins = {&x, &y, &dout}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dx, funcs::MaxGradXFunctor()); + } else if (dy != nullptr && dx == nullptr) { + std::vector ins = {&x, &y, &dout}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dy, funcs::MaxGradYFunctor()); + } +} + +template +void MinimumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + const auto place = dev_ctx.GetPlace(); + if (dx != nullptr && dy != nullptr) { + std::vector ins = {&x, &y, &dout}; + GetGradXAndYOut( + dev_ctx, + place, + axis, + ins, + dout, + dx, + dy, + funcs::MinGradXYFunctor()); + } else if (dx != nullptr && dy == nullptr) { + std::vector ins = {&x, &y, &dout}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dx, funcs::MinGradXFunctor()); 
+ } else if (dy != nullptr && dx == nullptr) { + std::vector ins = {&x, &y, &dout}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dy, funcs::MinGradYFunctor()); + } +} } // namespace phi PD_REGISTER_KERNEL(add_grad, @@ -299,3 +360,25 @@ PD_REGISTER_KERNEL(fmin_grad, double, int, int64_t) {} + +PD_REGISTER_KERNEL(maximum_grad, + GPU, + ALL_LAYOUT, + phi::MaximumGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(minimum_grad, + GPU, + ALL_LAYOUT, + phi::MinimumGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/elementwise_kernel.cu b/paddle/phi/kernels/gpu/elementwise_kernel.cu index 8de55e8a412d36c615ed923984c1a3fadc073d0b..bd6995cb1353c71ad9a0cbda857c25f0beeccf7e 100644 --- a/paddle/phi/kernels/gpu/elementwise_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_kernel.cu @@ -49,6 +49,12 @@ DEFINE_CUDA_ELEMENTWISE_OP(Subtract) DEFINE_CUDA_ELEMENTWISE_OP(Multiply) // Create the definition of Divide DEFINE_CUDA_ELEMENTWISE_OP(Divide) +// Create the definition of Maximum +DEFINE_CUDA_ELEMENTWISE_OP(Maximum) +// Create the definition of Minimum +DEFINE_CUDA_ELEMENTWISE_OP(Minimum) +// Create the definition of Modulo +DEFINE_CUDA_ELEMENTWISE_OP(Modulo) } // namespace phi @@ -114,3 +120,31 @@ PD_REGISTER_KERNEL(multiply_raw, complex64, complex128, bfloat16) {} +PD_REGISTER_KERNEL(maximum_raw, + GPU, + ALL_LAYOUT, + phi::MaximumRawKernel, + float, + double, + int, + int64_t, + float16, + bfloat16) {} +PD_REGISTER_KERNEL(minimum_raw, + GPU, + ALL_LAYOUT, + phi::MinimumRawKernel, + float, + double, + int, + int64_t, + float16, + bfloat16) {} +PD_REGISTER_KERNEL(modulo_raw, + GPU, + ALL_LAYOUT, + phi::ModuloRawKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 0b7a5d3bcb26a360eb5f7f664ead7932f428cc64..07e5bf9ae040e10e1f339b0100ba7f8daccd438f 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -628,4 +628,42 @@ void MultiplyTripleGradKernel(const Context& dev_ctx, } } +/* +****************************** + Maximum Grad +****************************** +*/ + +template +struct MaxGradDx { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast(x > y); + } +}; + +template +struct MaxGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast(x <= y); + } +}; + +/* +****************************** + Minimum Grad +****************************** +*/ +template +struct MinGradDx { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast(x < y); + } +}; + +template +struct MinGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast(x >= y); + } +}; } // namespace phi diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index bb05689dee1d31e2a81bfa15793ee6de52f63120..7f00af6f9af86f35709c2d120a7a47917d8d8431 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -55,6 +55,33 @@ KernelSignature ElementwiseDivOpArgumentMapping( return KernelSignature("divide_raw", {"X", "Y"}, {"axis"}, {"Out"}); } +KernelSignature ElementwiseMaxOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int axis = 
paddle::any_cast(ctx.Attr("axis")); + if (axis == -1) { + return KernelSignature("maximum", {"X", "Y"}, {}, {"Out"}); + } + return KernelSignature("maximum_raw", {"X", "Y"}, {"axis"}, {"Out"}); +} + +KernelSignature ElementwiseMinOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int axis = paddle::any_cast(ctx.Attr("axis")); + if (axis == -1) { + return KernelSignature("minimum", {"X", "Y"}, {}, {"Out"}); + } + return KernelSignature("minimum_raw", {"X", "Y"}, {"axis"}, {"Out"}); +} + +KernelSignature ElementwiseModOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int axis = paddle::any_cast(ctx.Attr("axis")); + if (axis == -1) { + return KernelSignature("modulo", {"X", "Y"}, {}, {"Out"}); + } + return KernelSignature("modulo_raw", {"X", "Y"}, {"axis"}, {"Out"}); +} + KernelSignature ElementwiseAddGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("add_grad", @@ -158,12 +185,30 @@ KernelSignature ElementwiseMulTripleGradOpArgumentMapping( {"D_X", "D_Y", "D_DOut", "D_DDX", "D_DDY"}); } +KernelSignature ElementwiseMaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("maximum_grad", + {"X", "Y", GradVarName("Out")}, + {"axis"}, + {GradVarName("X"), GradVarName("Y")}); +} + +KernelSignature ElementwiseMinGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("minimum_grad", + {"X", "Y", GradVarName("Out")}, + {"axis"}, + {GradVarName("X"), GradVarName("Y")}); +} } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(elementwise_add, add); PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub, subtract); PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul, multiply); PD_REGISTER_BASE_KERNEL_NAME(elementwise_div, divide); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_max, maximum); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_min, minimum); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_mod, modulo); PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad, add_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad_grad, add_double_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_triple_grad, add_triple_grad); @@ -178,6 +223,8 @@ PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmax, fmax); PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmin, fmin); PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmax_grad, fmax_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmin_grad, fmin_grad); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_max_grad, maximum_grad); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_min_grad, minimum_grad); PD_REGISTER_ARG_MAPPING_FN(elementwise_add, phi::ElementwiseAddOpArgumentMapping); @@ -187,6 +234,12 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_mul, phi::ElementwiseMulOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_div, phi::ElementwiseDivOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_max, + phi::ElementwiseMaxOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_min, + phi::ElementwiseMinOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_mod, + phi::ElementwiseModOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_add_grad, phi::ElementwiseAddGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_add_grad_grad, @@ -211,8 +264,11 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_fmax, phi::ElementwiseFMaxOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_fmin, phi::ElementwiseFMinOpArgumentMapping); - PD_REGISTER_ARG_MAPPING_FN(elementwise_fmax_grad, phi::ElementwiseFMaxGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_fmin_grad, 
phi::ElementwiseFMinGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_max_grad, + phi::ElementwiseMaxGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_min_grad, + phi::ElementwiseMinGradOpArgumentMapping);
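// ---------------------------------------------------------------------------
// Note (not part of the patch): the ModuloFunctor added to
// paddle/phi/kernels/funcs/elementwise_functor.h keeps the convention that the
// remainder takes the sign of the divisor (Python-style), which differs from
// the raw C++ % / std::fmod result. A minimal standalone sketch of that rule,
// using hypothetical helper names and assuming only the standard library:

#include <cassert>
#include <cmath>

// Integral case, mirroring the patch's ModuloFunctor for integer types.
int mod_follow_divisor(int a, int b) {
  int res = a % b;                      // C++ remainder follows the dividend
  if ((res != 0) && ((b ^ res) < 0)) {  // signs differ -> shift toward divisor
    res += b;
  }
  return res;
}

// Floating-point case, mirroring the patch's ModuloFunctor for float types.
double fmod_follow_divisor(double a, double b) {
  double res = std::fmod(a, b);
  if ((res != 0) && ((res < 0) != (b < 0))) res += b;
  return res;
}

int main() {
  assert(-7 % 3 == -1);                       // plain C++ remainder
  assert(mod_follow_divisor(-7, 3) == 2);     // elementwise_mod convention
  assert(mod_follow_divisor(7, -3) == -2);
  assert(fmod_follow_divisor(-7.5, 3.0) == 1.5);
  return 0;
}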
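// ---------------------------------------------------------------------------
// Note (not part of the patch): the maximum_grad functors route the upstream
// gradient to whichever input won the comparison; on ties (x == y) both
// MaxGradXFunctor (dout * (x > y)) and MaxGradYFunctor (dout * (x <= y)) send
// the whole gradient to y. A standalone sketch with a hypothetical helper:

#include <cassert>

struct MaxGrad {
  float dx;
  float dy;
};

// Per-element equivalent of funcs::MaxGradXFunctor / funcs::MaxGradYFunctor.
MaxGrad max_grad(float x, float y, float dout) {
  return {dout * static_cast<float>(x > y), dout * static_cast<float>(x <= y)};
}

int main() {
  MaxGrad g1 = max_grad(2.0f, 1.0f, 0.5f);
  assert(g1.dx == 0.5f && g1.dy == 0.0f);  // x is the maximum -> grad to x
  MaxGrad g2 = max_grad(1.0f, 1.0f, 0.5f);
  assert(g2.dx == 0.0f && g2.dy == 0.5f);  // tie -> grad routed to y
  return 0;
}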