Unverified Commit ca871957 authored by H hong, committed by GitHub

Move meshgrid to phi (#40994)

* move momentum, rmsprop to phi; test=develop

* update

* update

* update

* update

* update; test=develop

* fix xpu npu bugs; test=develop

* fix npu bug; test=develop

* fix windows compile error; test=develop

* fix windows compile error; test=develop

* polish code; test=develop

* fix conflict; test=develop

* add meshgrid;

* update

* polish code

* polish code;

* fix bug

* format; remove useless code

* fix npu bug

* fix bug
Parent e77a947e
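
For context on the title: moving meshgrid to phi means the operator's compute logic is registered as phi kernels rather than through the fluid OpKernel registrations removed below. A rough, hedged sketch of the kernel interface this migration targets (parameter names are illustrative and may differ from the actual phi headers):

template <typename T, typename Context>
void MeshgridKernel(const Context& dev_ctx,
                    const std::vector<const DenseTensor*>& inputs,
                    std::vector<DenseTensor*> outputs);

template <typename T, typename Context>
void MeshgridGradKernel(const Context& dev_ctx,
                        const std::vector<const DenseTensor*>& inputs,
                        const std::vector<const DenseTensor*>& outputs_grad,
                        std::vector<DenseTensor*> inputs_grad);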
......@@ -280,6 +280,46 @@ struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
}
};
template <typename T>
struct SelectedRowsAddToTensor<phi::CPUContext, T> {
void operator()(const phi::CPUContext& context,
const phi::SelectedRows& input1, framework::Tensor* input2) {
if (UNLIKELY(input1.rows().size() == 0)) {
LOG(WARNING) << "input selected rows is empty!";
return;
}
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>();
for (size_t i = 0; i < in1_rows.size(); i++) {
for (int64_t j = 0; j < in1_row_numel; j++) {
input2_data[in1_rows[i] * in1_row_numel + j] +=
in1_data[i * in1_row_numel + j];
}
}
}
};
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, float>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int>;
......@@ -287,6 +327,11 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext,
platform::bfloat16>;
template struct SelectedRowsAddToTensor<phi::CPUContext, float>;
template struct SelectedRowsAddToTensor<phi::CPUContext, double>;
template struct SelectedRowsAddToTensor<phi::CPUContext, int>;
template struct SelectedRowsAddToTensor<phi::CPUContext, int64_t>;
template struct SelectedRowsAddToTensor<phi::CPUContext, platform::bfloat16>;
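
With these phi::CPUContext specializations in place, a phi-based kernel can reuse the same functor that the fluid kernels use. A minimal hedged usage sketch (variable names such as cpu_ctx, selected_rows_grad, and dense_moment are illustrative, not from this diff):

// Accumulate each selected row of a SelectedRows gradient into the matching
// row of a dense tensor on CPU.
paddle::operators::math::SelectedRowsAddToTensor<phi::CPUContext, float> add_to;
add_to(cpu_ctx, selected_rows_grad, &dense_moment);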
// This is a separate namespace for manipulating SelectedRows typed
// data, e.g. merging duplicated rows, adding two SelectedRows, etc.
//
......
......@@ -174,12 +174,77 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
}
};
template <typename T>
struct SelectedRowsAddTensor<phi::GPUContext, T> {
void operator()(const phi::GPUContext& context,
const phi::SelectedRows& input1,
const framework::Tensor& input2, framework::Tensor* output) {
auto in1_height = input1.height();
auto in2_dims = input2.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument(
"The two inputs height must be equal."
"But recieved first input height = [%d], first input height = [%d]",
in1_height, in2_dims[0]));
PADDLE_ENFORCE_EQ(
in1_height, out_dims[0],
platform::errors::InvalidArgument(
"The input and output height must be equal."
"But recieved input height = [%d], output height = [%d]",
in1_height, out_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(
in1_row_numel, input2.numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2.numel() / in1_height));
PADDLE_ENFORCE_EQ(
in1_row_numel, output->numel() / in1_height,
platform::errors::InvalidArgument(
"The input and output width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, output->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* in2_data = input2.data<T>();
auto* out_data = output->data<T>();
phi::funcs::SetConstant<phi::GPUContext, T> functor;
functor(context, output, static_cast<T>(0));
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(in1_rows.size(), 1);
paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
SelectedRowsAddTensorKernel<
T, block_size><<<grid, threads, 0, context.stream()>>>(
in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data,
in1_row_numel);
auto out_eigen = framework::EigenVector<T>::Flatten(*output);
auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
}
};
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
platform::float16>;
template struct SelectedRowsAddTensor<phi::GPUContext, float>;
template struct SelectedRowsAddTensor<phi::GPUContext, double>;
template struct SelectedRowsAdd<phi::GPUContext, platform::float16>;
template struct SelectedRowsAddTensor<phi::GPUContext, platform::float16>;
template <typename T>
struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
......@@ -285,12 +350,54 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
}
};
template <typename T>
struct SelectedRowsAddToTensor<phi::GPUContext, T> {
void operator()(const phi::GPUContext& context,
const phi::SelectedRows& input1, framework::Tensor* input2) {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* in2_data = input2->data<T>();
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(in1_rows.size(), 1);
paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
SelectedRowsAddToTensorKernel<
T, block_size><<<grid, threads, 0, context.stream()>>>(
in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data,
in1_row_numel);
}
};
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
platform::float16>;
template struct SelectedRowsAddToTensor<phi::GPUContext, float>;
template struct SelectedRowsAddToTensor<phi::GPUContext, double>;
template struct SelectedRowsAddToTensor<phi::GPUContext, int>;
template struct SelectedRowsAddToTensor<phi::GPUContext, int64_t>;
template struct SelectedRowsAddToTensor<phi::GPUContext, platform::float16>;
namespace scatter {
......
......@@ -12,12 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/meshgrid_op.h"
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace operators {
......@@ -145,29 +146,3 @@ REGISTER_OPERATOR(meshgrid, ops::MeshgridOp, ops::MeshgridOpMaker,
ops::MeshgridGradOpMaker<paddle::framework::OpDesc>,
ops::MeshgridGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(meshgrid_grad, ops::MeshgridGradOp);
REGISTER_OP_CPU_KERNEL(
meshgrid, ops::MeshgridKernel<paddle::platform::CPUDeviceContext, float>,
ops::MeshgridKernel<paddle::platform::CPUDeviceContext, double>,
ops::MeshgridKernel<paddle::platform::CPUDeviceContext, int>,
ops::MeshgridKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
meshgrid_grad,
ops::MeshgridGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::MeshgridGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::MeshgridGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::MeshgridGradKernel<paddle::platform::CPUDeviceContext, double>);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
REGISTER_OP_CUDA_KERNEL(
meshgrid, ops::MeshgridKernel<paddle::platform::CUDADeviceContext, float>,
ops::MeshgridKernel<paddle::platform::CUDADeviceContext, double>,
ops::MeshgridKernel<paddle::platform::CUDADeviceContext, int>,
ops::MeshgridKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::MeshgridKernel<paddle::platform::CUDADeviceContext, bool>);
REGISTER_OP_CUDA_KERNEL(
meshgrid_grad,
ops::MeshgridGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::MeshgridGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::MeshgridGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::MeshgridGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
#endif
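
The fluid registrations removed above are superseded by phi kernel registrations. A hedged sketch of the replacement form (the exact file, backend list, and dtype list may differ in the actual commit):

PD_REGISTER_KERNEL(
    meshgrid, CPU, ALL_LAYOUT, phi::MeshgridKernel, float, double, int, int64_t) {}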
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/platform/errors.h"
#define MAX_RANK_SUPPORTED 6
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class MeshgridKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto ins = context.MultiInput<framework::Tensor>("X");
auto rank = ins.size();
switch (rank) {
case 1:
MeshgridForward<1>(context);
break;
case 2:
MeshgridForward<2>(context);
break;
case 3:
MeshgridForward<3>(context);
break;
case 4:
MeshgridForward<4>(context);
break;
case 5:
MeshgridForward<5>(context);
break;
case 6:
MeshgridForward<6>(context);
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Excepted Tensor numbers between 1 and 6, but only received d% .",
rank));
}
}
protected:
template <int Rank>
void MeshgridForward(const framework::ExecutionContext& context) const {
auto ins = context.MultiInput<framework::Tensor>("X");
auto outs = context.MultiOutput<framework::Tensor>("Out");
PADDLE_ENFORCE_EQ(
ins.size() > 1, true,
platform::errors::InvalidArgument(
"Expected at least 2 input tensors, but only received d%.",
ins.size()));
int64_t size = ins.size();
std::vector<int64_t> shape(size);
for (int64_t i = 0; i < size; i++) {
switch (ins[i]->dims().size()) {
case 0:
shape[i] = 1;
break;
case 1:
shape[i] = ins[i]->dims()[0];
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected scalar or 1D tensor in the tensor list but got tensor "
"%d: ",
i));
}
}
for (int64_t i = 0; i < size; i++) {
std::vector<int64_t> view_shape(size, 1);
view_shape[i] = shape[i];
framework::Tensor reshape_ins_tensor;
paddle::framework::TensorCopy(*ins[i], context.GetPlace(),
context.device_context(),
&reshape_ins_tensor);
framework::DDim out_dims_reshape = phi::make_ddim(view_shape);
reshape_ins_tensor.Resize(out_dims_reshape);
framework::DDim out_dims = phi::make_ddim(shape);
Eigen::DSizes<Eigen::DenseIndex, Rank> bcast_dims;
for (int64_t j = 0; j < size; j++) {
bcast_dims[j] = shape[j];
}
bcast_dims[i] = 1;
outs[i]->Resize(out_dims);
auto x = framework::EigenTensor<T, Rank>::From(
static_cast<const framework::Tensor>(reshape_ins_tensor));
outs[i]->mutable_data<T>(context.GetPlace());
auto y = framework::EigenTensor<T, Rank>::From(*outs[i]);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
EigenBroadcast<std::decay_t<decltype(place)>, T, Rank>::Eval(place, y, x,
bcast_dims);
}
}
};
template <typename DeviceContext, typename T>
class MeshgridGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto out_grad =
context.MultiInput<framework::Tensor>(framework::GradVarName("Out"));
int n = out_grad.size();
switch (n) {
case 1:
MeshgridBackward<1>(context);
break;
case 2:
MeshgridBackward<2>(context);
break;
case 3:
MeshgridBackward<3>(context);
break;
case 4:
MeshgridBackward<4>(context);
break;
case 5:
MeshgridBackward<5>(context);
break;
case 6:
MeshgridBackward<6>(context);
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Excepted Tensor numbers between 1 and 6, but only received d% .",
n));
}
}
protected:
template <int Rank>
void MeshgridBackward(const framework::ExecutionContext& context) const {
auto out_grad =
context.MultiInput<framework::Tensor>(framework::GradVarName("Out"));
auto ins = context.MultiInput<framework::Tensor>("X");
auto outs =
context.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
int n = out_grad.size();
auto out_dims = out_grad[0]->dims();
for (int i = 0; i < n; i++) {
outs[i]->mutable_data<T>(context.GetPlace());
auto out_grad_tmp = framework::EigenVector<T>::Flatten(*out_grad[i]);
auto in_grad = framework::EigenVector<T>::Flatten(*outs[i]);
std::vector<int> reduce_dims_vec;
std::vector<int> reshape_dims_vec;
for (int j = 0; j < n; j++) {
reduce_dims_vec.push_back(reshape_dims_vec.size());
if (j == i) {
reshape_dims_vec.push_back(1);
reshape_dims_vec.push_back(out_dims[j]);
} else {
reshape_dims_vec.push_back(out_dims[j]);
reshape_dims_vec.push_back(1);
}
}
Eigen::DSizes<Eigen::DenseIndex, Rank> reduce_dims;
for (int k = 0; k < n; k++) {
reduce_dims[k] = reduce_dims_vec[k];
}
Eigen::DSizes<Eigen::DenseIndex, Rank * 2> reshape_dims;
for (int k = 0; k < n * 2; k++) {
reshape_dims[k] = reshape_dims_vec[k];
}
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
EigenBroadcastGrad<std::decay_t<decltype(place)>, T, Rank>::Eval(
place, in_grad, out_grad_tmp, reduce_dims, reshape_dims);
}
}
};
} // namespace operators
} // namespace paddle
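
To make the broadcast in MeshgridForward and the matching reduction in MeshgridBackward concrete, here is a small self-contained sketch for two inputs of sizes 2 and 3. It is plain C++ independent of Paddle and only illustrates the arithmetic the kernels perform:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> x = {1, 2}, y = {10, 20, 30};
  // Forward: every output has the full shape [2, 3]; input i varies only along dim i.
  std::vector<std::vector<int>> out0(2, std::vector<int>(3)), out1(2, std::vector<int>(3));
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 3; ++j) {
      out0[i][j] = x[i];  // x broadcast along dim 1
      out1[i][j] = y[j];  // y broadcast along dim 0
    }
  // Backward: the gradient of x sums the upstream gradient over every dim except dim 0.
  std::vector<std::vector<int>> dout0(2, std::vector<int>(3, 1));  // pretend the upstream grad is all ones
  std::vector<int> dx(2, 0);
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 3; ++j) dx[i] += dout0[i][j];
  std::printf("out0[1][2]=%d out1[1][2]=%d dx[0]=%d\n", out0[1][2], out1[1][2], dx[0]);  // 2 30 3
  return 0;
}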
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/meshgrid_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
......
......@@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/adagrad_op.h"
#include <vector>
#include <cmath>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -102,54 +101,8 @@ for numerical stability to avoid the division by zero error.
}
};
namespace {
size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
return std::find(rows.begin(), rows.end(), value) - rows.begin();
}
} // namespace
template <typename T>
struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const phi::SelectedRows& grad,
const framework::Tensor& learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param) {
// 1. g_m.rows = set(g.rows)
auto grad_width = grad.value().dims()[1];
math::scatter::MergeAdd<platform::CPUDeviceContext, T> merge_func;
auto grad_merge = merge_func(context, grad);
auto& merge_rows = grad_merge.rows();
auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
// 2. m += g_m * g_m
auto grad_square =
SquareSelectedRows<platform::CPUDeviceContext, T>(context, grad_merge);
math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
functor(context, grad_square, moment);
// 3. update parameter
auto* lr = learning_rate.data<T>();
auto* param_data = param->data<T>();
auto* moment_data = moment->data<T>();
for (size_t i = 0; i < merge_rows.size(); i++) {
for (int64_t j = 0; j < grad_width; j++) {
param_data[merge_rows[i] * grad_width + j] -=
lr[0] * grad_merge_data[i * grad_width + j] /
(std::sqrt(moment_data[merge_rows[i] * grad_width + j]) + epsilon);
}
}
}
};
template struct SparseAdagradFunctor<platform::CPUDeviceContext, float>;
template struct SparseAdagradFunctor<platform::CPUDeviceContext, double>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker);
REGISTER_OP_CPU_KERNEL(
adagrad, ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
struct SparseAdagradFunctor {
void operator()(const DeviceContext &context, const phi::SelectedRows &grad,
const framework::Tensor &learning_rate, T epsilon,
framework::Tensor *moment, framework::Tensor *param);
};
template <typename DeviceContext, typename T>
phi::SelectedRows SquareSelectedRows(const DeviceContext &context,
const phi::SelectedRows &input) {
phi::SelectedRows out;
out.set_rows(input.rows());
out.set_height(input.height());
out.mutable_value()->mutable_data<T>(input.value().dims(),
context.GetPlace());
auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
auto e_in = framework::EigenVector<T>::Flatten(input.value());
e_out.device(*context.eigen_device()) = e_in.square();
return out;
}
template <typename DeviceContext, typename T>
class AdagradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
auto *param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto *moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
param_out_tensor->mutable_data<T>(ctx.GetPlace());
moment_out_tensor->mutable_data<T>(ctx.GetPlace());
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto *grad_var = ctx.InputVar("Grad");
if (grad_var->IsType<framework::LoDTensor>()) {
auto param = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Param"));
auto grad = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Grad"));
auto moment = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Moment"));
auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
auto *place = ctx.template device_context<DeviceContext>().eigen_device();
moment_out.device(*place) = moment + grad * grad;
Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
if (platform::is_cpu_place(ctx.GetPlace())) {
auto *lr = learning_rate->data<T>();
param_out.device(*place) =
param - lr[0] * grad / (moment_out.sqrt() + epsilon);
} else {
auto lr = framework::EigenVector<T>::Flatten(*learning_rate);
param_out.device(*place) =
param -
lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
}
} else if (grad_var->IsType<phi::SelectedRows>()) {
auto *param_tensor = ctx.Input<framework::Tensor>("Param");
PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor,
platform::errors::InvalidArgument(
"the input tensor not euqal with output tensor"));
auto *moment_tensor = ctx.Input<framework::Tensor>("Moment");
PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor,
platform::errors::InvalidArgument(
"the input moment not eual with output moment"));
SparseAdagradFunctor<DeviceContext, T> functor;
functor(ctx.template device_context<DeviceContext>(),
*ctx.Input<phi::SelectedRows>("Grad"),
*ctx.Input<framework::Tensor>("LearningRate"), epsilon,
moment_out_tensor, param_out_tensor);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Unsupported Variable Type of Grad"));
}
}
};
} // namespace operators
} // namespace paddle
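
For reference, the dense path of AdagradOpKernel above implements the standard Adagrad update. A hedged scalar sketch of a single element's update (plain C++, not the Paddle API):

#include <cmath>
#include <cstdio>

int main() {
  float param = 1.0f, grad = 0.5f, moment = 0.0f;
  const float lr = 0.1f, epsilon = 1e-6f;
  moment += grad * grad;                               // MomentOut = Moment + Grad^2
  param -= lr * grad / (std::sqrt(moment) + epsilon);  // ParamOut = Param - lr * Grad / (sqrt(MomentOut) + eps)
  std::printf("param=%f moment=%f\n", param, moment);  // param ~ 0.9, moment = 0.25
  return 0;
}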
......@@ -17,6 +17,7 @@
#include <memory>
#include "paddle/fluid/operators/optimizers/momentum_op.h"
#include "paddle/phi/kernels/momentum_kernel.h"
#include "paddle/phi/kernels/sgd_kernel.h"
namespace paddle {
......@@ -25,8 +26,7 @@ namespace operators {
template <typename DeviceContext, typename T>
class DGCMomentumKernel : public framework::OpKernel<T> {
public:
DGCMomentumKernel()
: _momentum_op_kernel(new MomentumOpKernel<DeviceContext, T>()) {}
DGCMomentumKernel() {}
void Compute(const framework::ExecutionContext& context) const override {
auto rampup_begin_step = context.Attr<float>("rampup_begin_step");
......@@ -60,15 +60,56 @@ class DGCMomentumKernel : public framework::OpKernel<T> {
VLOG(10) << "current_step:" << *current_step
<< ", rampup_begin_step:" << rampup_begin_step;
const auto* grad_var = context.InputVar("Grad");
if (static_cast<int>(*current_step) < static_cast<int>(rampup_begin_step)) {
VLOG(10) << " so use momentum optimizer";
return _momentum_op_kernel->Compute(context);
auto* learning_rate = context.Input<framework::Tensor>("LearningRate");
bool multi_precision = context.Attr<bool>("multi_precision");
auto* param = context.Input<framework::Tensor>("Param");
auto* velocity = context.Input<framework::Tensor>("Velocity");
auto* param_out = context.Output<framework::Tensor>("ParamOut");
auto* velocity_out = context.Output<framework::Tensor>("VelocityOut");
auto* master_param_out =
context.Output<framework::Tensor>("MasterParamOut");
paddle::optional<const framework::Tensor&> master_param_opt =
paddle::none;
float mu = context.Attr<float>("mu");
bool use_nesterov = context.Attr<bool>("use_nesterov");
std::string regularization_method =
context.Attr<std::string>("regularization_method");
float regularization_coeff = context.Attr<float>("regularization_coeff");
float rescale_grad = context.Attr<float>("rescale_grad");
if (grad_var->IsType<framework::Tensor>()) {
// dense gradient: momentum update via the phi dense kernel
auto* grad = context.Input<framework::Tensor>("Grad");
phi::MomentumDenseKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*param, *grad, *velocity, *learning_rate, master_param_opt, mu,
use_nesterov, regularization_method, regularization_coeff,
multi_precision, rescale_grad, param_out, velocity_out,
master_param_out);
} else {
// dense parameter, sparse (SelectedRows) gradient: momentum update via the phi sparse kernel
auto* grad = context.Input<phi::SelectedRows>("Grad");
phi::MomentumSparseKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*param, *grad, *velocity, *learning_rate, master_param_opt, mu,
use_nesterov, regularization_method, regularization_coeff,
multi_precision, rescale_grad, param_out, velocity_out,
master_param_out);
}
return;
}
VLOG(10) << " so use sgd optimizer";
const auto* param_var = context.InputVar("Param");
const auto* grad_var = context.InputVar("Grad");
auto* learning_rate = context.Input<framework::Tensor>("LearningRate");
bool multi_precision = context.Attr<bool>("multi_precision");
if (param_var->IsType<framework::LoDTensor>()) {
......@@ -125,9 +166,6 @@ class DGCMomentumKernel : public framework::OpKernel<T> {
PADDLE_THROW("gdc not support yet");
}
}
private:
std::unique_ptr<MomentumOpKernel<DeviceContext, T>> _momentum_op_kernel;
};
} // namespace operators
......
......@@ -18,13 +18,16 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/optimizers/momentum_op.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
namespace paddle {
namespace operators {
template <typename T>
using MultiPrecisionType = typename details::MPTypeTrait<T>::Type;
template <typename MT, uint32_t kParamNum, bool kHasMasterParams>
struct MergedMomentumMasterParams {
MT *PADDLE_RESTRICT master_params[kParamNum];
......@@ -259,11 +262,11 @@ class MergedMomentumOpKernel : public framework::OpKernel<T> {
#undef PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL
} else {
for (size_t idx = 0; idx < n; idx++) {
RegularizationType regularization_flag =
phi::RegularizationType regularization_flag =
regularization_methods.size() > 0 &&
regularization_methods[idx] == "l2_decay"
? RegularizationType::kL2DECAY
: RegularizationType::kNONE;
? phi::RegularizationType::kL2DECAY
: phi::RegularizationType::kNONE;
MT regularization_coeff = static_cast<MT>(0.0);
if (regularization_coeffs.size() != 0) {
......@@ -276,7 +279,7 @@ class MergedMomentumOpKernel : public framework::OpKernel<T> {
MT *master_out_data =
multi_precision ? master_params_out[idx]->data<MT>() : nullptr;
if (platform::is_cpu_place(ctx.GetPlace())) {
CPUDenseMomentumFunctor<MT> functor;
phi::CPUDenseMomentumFunctor<MT> functor;
functor(params[idx], grads[idx], velocitys[idx], lr_temp,
static_cast<MT>(mu), use_nesterov, regularization_flag,
regularization_coeff, params_out[idx], velocitys_out[idx]);
......@@ -286,7 +289,7 @@ class MergedMomentumOpKernel : public framework::OpKernel<T> {
static_cast<const DeviceContext &>(ctx.device_context()),
params[idx]->numel());
#define PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(__nesterov, __reg_type) \
DenseMomentumFunctor<T, MT, __reg_type, __nesterov> functor( \
phi::DenseMomentumFunctor<T, MT, __reg_type, __nesterov> functor( \
params[idx]->data<T>(), grads[idx]->data<T>(), \
velocitys[idx]->data<MT>(), lr_temp->data<MPType>(), master_in_data, \
static_cast<MT>(mu), static_cast<MT>(rescale_grad), \
......@@ -294,26 +297,26 @@ class MergedMomentumOpKernel : public framework::OpKernel<T> {
velocitys_out[idx]->data<MT>(), master_out_data); \
for_range(functor);
if (use_nesterov) {
if (regularization_flag == RegularizationType::kL2DECAY) {
if (regularization_flag == phi::RegularizationType::kL2DECAY) {
PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(
UseNesterov, RegularizationType::kL2DECAY);
phi::UseNesterov, phi::RegularizationType::kL2DECAY);
VLOG(10)
<< "Launch MergedMomentum gpu kernel use_nesterov kL2DECAY.";
} else {
PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(UseNesterov,
RegularizationType::kNONE);
PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(
phi::UseNesterov, phi::RegularizationType::kNONE);
VLOG(10)
<< "Launch MergedMomentum gpu kernel use_nesterov kNONE.";
}
} else {
if (regularization_flag == RegularizationType::kL2DECAY) {
if (regularization_flag == phi::RegularizationType::kL2DECAY) {
PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(
NoNesterov, RegularizationType::kL2DECAY);
phi::NoNesterov, phi::RegularizationType::kL2DECAY);
VLOG(10)
<< "Launch MergedMomentum gpu kernel no_nesterov kL2DECAY.";
} else {
PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(NoNesterov,
RegularizationType::kNONE);
PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(
phi::NoNesterov, phi::RegularizationType::kNONE);
VLOG(10) << "Launch MergedMomentum gpu kernel no_nesterov kNONE.";
}
}
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/operators/optimizers/merged_momentum_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
namespace paddle {
namespace operators {
......@@ -118,11 +119,11 @@ class NPUMergedMomentumOpKernel : public framework::OpKernel<T> {
FillNpuTensorWithConstant<T>(&mu_tensor, mu);
for (size_t idx = 0; idx < n; ++idx) {
RegularizationType regularization_flag =
phi::RegularizationType regularization_flag =
regularization_methods.size() > 0 &&
regularization_methods[idx] == "l2_decay"
? RegularizationType::kL2DECAY
: RegularizationType::kNONE;
? phi::RegularizationType::kL2DECAY
: phi::RegularizationType::kNONE;
float regularization_coeff = 0.0;
if (regularization_coeffs.size() != 0) {
regularization_coeff = regularization_coeffs[idx];
......@@ -136,7 +137,7 @@ class NPUMergedMomentumOpKernel : public framework::OpKernel<T> {
auto grad = grads[idx];
Tensor regularized_grad;
if (regularization_flag == RegularizationType::kL2DECAY) {
if (regularization_flag == phi::RegularizationType::kL2DECAY) {
regularized_grad.mutable_data<T>(grad->dims(), ctx.GetPlace());
const auto& runner1 = NpuOpRunner("Muls", {*param}, {regularized_grad},
{{"value", regularization_coeff}});
......
......@@ -108,9 +108,6 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
ops::MomentumOpInferVarType);
REGISTER_OP_CPU_KERNEL(
momentum, ops::MomentumOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::MomentumOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_VERSION(momentum)
.AddCheckpoint(
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/optimizers/momentum_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
momentum, ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::MomentumOpKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>);
......@@ -26,44 +26,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
using framework::Tensor;
using phi::SelectedRows;
struct NoNesterov;
struct UseNesterov;
namespace details {
template <typename T>
struct CPUDenseUpdater {
template <typename G>
void operator()(const Tensor& param, const Tensor& velocity, const T& mu,
const T& lr, const bool use_nesterov, G&& grad,
Tensor* param_out, Tensor* velocity_out) const {
auto param_out_vec = framework::EigenVector<T>::Flatten(*param_out);
auto velocity_out_vec = framework::EigenVector<T>::Flatten(*velocity_out);
auto param_vec = framework::EigenVector<T>::Flatten(param);
auto velocity_vec = framework::EigenVector<T>::Flatten(velocity);
velocity_out_vec = velocity_vec * mu + grad;
if (use_nesterov) {
param_out_vec = param_vec - (grad + velocity_out_vec * mu) * lr;
} else {
param_out_vec = param_vec - lr * velocity_out_vec;
}
}
};
} // namespace details
template <typename T>
using MultiPrecisionType = typename details::MPTypeTrait<T>::Type;
enum class RegularizationType {
kNONE = 0,
kL1DECAY = 1, // do not need support right now
kL2DECAY = 2,
};
class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
......@@ -148,460 +110,5 @@ class MomentumOp : public framework::OperatorWithKernel {
}
};
template <typename T>
class CPUDenseMomentumFunctor {
public:
void operator()(const Tensor* param, const Tensor* grad,
const Tensor* velocity, const Tensor* learning_rate,
const T mu, const bool use_nesterov,
const RegularizationType regularization_flag,
const T regularization_coeff, Tensor* param_out,
Tensor* velocity_out) {
auto grad_vec = framework::EigenVector<T>::Flatten(*grad);
auto* lr = learning_rate->data<MultiPrecisionType<T>>();
details::CPUDenseUpdater<T> updater;
if (regularization_flag == RegularizationType::kL2DECAY) {
auto param_vec = framework::EigenVector<T>::Flatten(*param);
updater(*param, *velocity, mu, static_cast<T>(lr[0]), use_nesterov,
param_vec * regularization_coeff + grad_vec, param_out,
velocity_out);
} else {
updater(*param, *velocity, mu, static_cast<T>(lr[0]), use_nesterov,
grad_vec, param_out, velocity_out);
}
}
};
template <typename T, typename MT, RegularizationType kRegType,
typename UpdateMethod>
class DenseMomentumFunctor;
// NOTE(dzh): for performance, avoid if/else inside the kernel; implement GPU
// UseNesterov/NoNesterov as two functors.
template <typename T, typename MT, RegularizationType kRegType>
class DenseMomentumFunctor<T, MT, kRegType, UseNesterov> {
private:
const T* param_;
const T* grad_;
const MT* velocity_;
const MultiPrecisionType<MT>* lr_;
const MT* master_param_;
const MT mu_;
const MT rescale_grad_;
const int64_t num_;
T* param_out_;
MT* velocity_out_;
MT* master_param_out_;
const MT regularization_coeff_;
public:
DenseMomentumFunctor(const T* param, const T* grad, const MT* velocity,
const MultiPrecisionType<MT>* learning_rate,
const MT* master_param, const MT mu,
const MT rescale_grad, const int64_t num,
const MT regularization_coeff, T* param_out,
MT* velocity_out, MT* master_param_out)
: param_(param),
grad_(grad),
velocity_(velocity),
lr_(learning_rate),
master_param_(master_param),
mu_(mu),
rescale_grad_(rescale_grad),
num_(num),
param_out_(param_out),
velocity_out_(velocity_out),
master_param_out_(master_param_out),
regularization_coeff_(regularization_coeff) {}
inline HOSTDEVICE void operator()(size_t i) const {
// keep memory accesses in registers
const MT param =
master_param_ ? master_param_[i] : static_cast<MT>(param_[i]);
MT grad = static_cast<MT>(grad_[i]) * rescale_grad_;
const MT lr = static_cast<MT>(lr_[0]);
const MT velocity = velocity_[i];
if (kRegType == RegularizationType::kL2DECAY) {
grad += regularization_coeff_ * param;
}
MT velocity_out = velocity * mu_ + grad;
MT param_out = param - (grad + velocity_out * mu_) * lr;
// write registers back to memory
velocity_out_[i] = velocity_out;
param_out_[i] = static_cast<T>(param_out);
if (master_param_out_) {
master_param_out_[i] = param_out;
}
}
};
template <typename T, typename MT, RegularizationType kRegType>
class DenseMomentumFunctor<T, MT, kRegType, NoNesterov> {
private:
const T* param_;
const T* grad_;
const MT* velocity_;
const MultiPrecisionType<MT>* lr_;
const MT* master_param_;
const MT mu_;
const MT rescale_grad_;
const int64_t num_;
T* param_out_;
MT* velocity_out_;
MT* master_param_out_;
const MT regularization_coeff_;
public:
DenseMomentumFunctor(const T* param, const T* grad, const MT* velocity,
const MultiPrecisionType<MT>* learning_rate,
const MT* master_param, const MT mu,
const MT rescale_grad, const int64_t num,
const MT regularization_coeff, T* param_out,
MT* velocity_out, MT* master_param_out)
: param_(param),
grad_(grad),
velocity_(velocity),
lr_(learning_rate),
master_param_(master_param),
mu_(mu),
rescale_grad_(rescale_grad),
num_(num),
param_out_(param_out),
velocity_out_(velocity_out),
master_param_out_(master_param_out),
regularization_coeff_(regularization_coeff) {}
inline HOSTDEVICE void operator()(size_t i) const {
// keep memory accesses in registers
const MT param =
master_param_ ? master_param_[i] : static_cast<MT>(param_[i]);
MT grad = static_cast<MT>(grad_[i]) * rescale_grad_;
const MT lr = static_cast<MT>(lr_[0]);
const MT velocity = velocity_[i];
if (kRegType == RegularizationType::kL2DECAY) {
grad += regularization_coeff_ * param;
}
MT velocity_out = velocity * mu_ + grad;
MT param_out = param - lr * velocity_out;
// write registers back to memory
velocity_out_[i] = velocity_out;
param_out_[i] = static_cast<T>(param_out);
if (master_param_out_) {
master_param_out_[i] = param_out;
}
}
};
template <typename T, typename MT, typename UpdateMethod>
class SparseMomentumFunctor;
template <typename T, typename MT>
class SparseMomentumFunctor<T, MT, UseNesterov> {
private:
const T* param_;
const T* grad_;
const MT* velocity_;
const MultiPrecisionType<MT>* lr_;
const MT* master_param_;
const MT mu_;
const MT rescale_grad_;
const int64_t* rows_;
const int64_t row_numel_;
const int64_t row_height_;
T* param_out_;
MT* velocity_out_;
MT* master_param_out_;
const RegularizationType regularization_flag_;
const MT regularization_coeff_;
public:
SparseMomentumFunctor(const T* param, const T* grad, const MT* velocity,
const MultiPrecisionType<MT>* lr,
const MT* master_param, const MT mu,
const MT rescale_grad, const int64_t* rows,
int64_t row_numel, int64_t row_height,
const RegularizationType regularization_flag,
const MT regularization_coeff, T* param_out,
MT* velocity_out, MT* master_param_out)
: param_(param),
grad_(grad),
velocity_(velocity),
lr_(lr),
master_param_(master_param),
mu_(mu),
rescale_grad_(rescale_grad),
rows_(rows),
row_numel_(row_numel),
row_height_(row_height),
param_out_(param_out),
velocity_out_(velocity_out),
master_param_out_(master_param_out),
regularization_flag_(regularization_flag),
regularization_coeff_(regularization_coeff) {}
inline HOSTDEVICE void operator()(size_t i) {
auto row_idx =
phi::funcs::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
MT grad =
row_idx >= 0
? static_cast<MT>(grad_[row_idx * row_numel_ + i % row_numel_]) *
rescale_grad_
: static_cast<MT>(0);
// keep memory accesses in registers
const MT param =
master_param_ ? master_param_[i] : static_cast<MT>(param_[i]);
const MT lr = static_cast<MT>(lr_[0]);
const MT velocity = velocity_[i];
grad = regularization_flag_ == RegularizationType::kL2DECAY
? grad + regularization_coeff_ * param
: grad;
MT velocity_out = velocity * mu_ + grad;
MT param_out = param - (grad + velocity_out * mu_) * lr;
// write registers back to memory
velocity_out_[i] = velocity_out;
param_out_[i] = static_cast<T>(param_out);
if (master_param_out_) {
master_param_out_[i] = param_out;
}
}
};
template <typename T, typename MT>
class SparseMomentumFunctor<T, MT, NoNesterov> {
private:
const T* param_;
const T* grad_;
const MT* velocity_;
const MultiPrecisionType<MT>* lr_;
const MT* master_param_;
const MT mu_;
const MT rescale_grad_;
const int64_t* rows_;
const int64_t row_numel_;
const int64_t row_height_;
T* param_out_;
MT* velocity_out_;
MT* master_param_out_;
const RegularizationType regularization_flag_;
const MT regularization_coeff_;
public:
SparseMomentumFunctor(const T* param, const T* grad, const MT* velocity,
const MultiPrecisionType<MT>* lr,
const MT* master_param, const MT mu,
const MT rescale_grad, const int64_t* rows,
int64_t row_numel, int64_t row_height,
const RegularizationType regularization_flag,
const MT regularization_coeff, T* param_out,
MT* velocity_out, MT* master_param_out)
: param_(param),
grad_(grad),
velocity_(velocity),
lr_(lr),
master_param_(master_param),
mu_(mu),
rescale_grad_(rescale_grad),
rows_(rows),
row_numel_(row_numel),
row_height_(row_height),
param_out_(param_out),
velocity_out_(velocity_out),
master_param_out_(master_param_out),
regularization_flag_(regularization_flag),
regularization_coeff_(regularization_coeff) {}
inline HOSTDEVICE void operator()(size_t i) {
auto row_idx =
phi::funcs::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
MT grad =
row_idx >= 0
? static_cast<MT>(grad_[row_idx * row_numel_ + i % row_numel_]) *
rescale_grad_
: static_cast<MT>(0);
// keep memory accesses in registers
const MT param =
master_param_ ? master_param_[i] : static_cast<MT>(param_[i]);
const MT lr = static_cast<MT>(lr_[0]);
const MT velocity = velocity_[i];
grad = regularization_flag_ == RegularizationType::kL2DECAY
? grad + regularization_coeff_ * param
: grad;
MT velocity_out = velocity * mu_ + grad;
MT param_out = param - velocity_out * lr;
// write registers back to memory
velocity_out_[i] = velocity_out;
param_out_[i] = static_cast<T>(param_out);
if (master_param_out_) {
master_param_out_[i] = param_out;
}
}
};
template <typename DeviceContext, typename T>
class MomentumOpKernel : public framework::OpKernel<T> {
using MPDType = MultiPrecisionType<T>;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const bool multi_precision = ctx.Attr<bool>("multi_precision");
if (multi_precision) {
InnerCompute<MPDType>(ctx, multi_precision);
} else {
InnerCompute<T>(ctx, multi_precision);
}
}
private:
template <typename MT>
void InnerCompute(const framework::ExecutionContext& ctx,
const bool multi_precision) const {
std::string regularization_method =
ctx.Attr<std::string>("regularization_method");
MT regularization_coeff =
static_cast<MT>(ctx.Attr<float>("regularization_coeff"));
RegularizationType regularization_flag{
RegularizationType::kNONE}; // disable regularization
if (regularization_method == "l2_decay") {
regularization_flag = RegularizationType::kL2DECAY;
}
MT mu = static_cast<MT>(ctx.Attr<float>("mu"));
MT rescale_grad = static_cast<MT>(ctx.Attr<float>("rescale_grad"));
bool use_nesterov = ctx.Attr<bool>("use_nesterov");
auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
auto param = ctx.Input<framework::Tensor>("Param");
auto param_out = ctx.Output<framework::Tensor>("ParamOut");
auto velocity = ctx.Input<framework::Tensor>("Velocity");
auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
const framework::Tensor* master_param = nullptr;
framework::Tensor* master_param_out = nullptr;
if (multi_precision) {
bool has_master =
ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut");
PADDLE_ENFORCE_EQ(has_master, true,
platform::errors::InvalidArgument(
"The Input(MasterParam) and Output(MasterParamOut) "
"should not be null when "
"the attr `multi_precision` is true"));
master_param = ctx.Input<framework::Tensor>("MasterParam");
master_param_out = ctx.Output<framework::Tensor>("MasterParamOut");
}
param_out->mutable_data<T>(ctx.GetPlace());
velocity_out->mutable_data<MT>(ctx.GetPlace());
const MT* master_in_data =
multi_precision ? master_param->data<MT>() : nullptr;
MT* master_out_data =
multi_precision ? master_param_out->mutable_data<MT>(ctx.GetPlace())
: nullptr;
auto* grad_var = ctx.InputVar("Grad");
if (grad_var->IsType<framework::LoDTensor>()) {
auto grad = ctx.Input<framework::Tensor>("Grad");
if (platform::is_cpu_place(ctx.GetPlace())) {
CPUDenseMomentumFunctor<MT> functor;
functor(param, grad, velocity, learning_rate, mu, use_nesterov,
regularization_flag, regularization_coeff, param_out,
velocity_out);
} else if (platform::is_gpu_place(ctx.GetPlace())) {
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()),
param->numel());
#define PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(__nesterov, __reg_type) \
DenseMomentumFunctor<T, MT, __reg_type, __nesterov> functor( \
param->data<T>(), grad->data<T>(), velocity->data<MT>(), \
learning_rate->data<MPDType>(), master_in_data, mu, rescale_grad, \
param->numel(), regularization_coeff, \
param_out->mutable_data<T>(ctx.GetPlace()), \
velocity_out->mutable_data<MT>(ctx.GetPlace()), master_out_data); \
for_range(functor);
if (use_nesterov) {
if (regularization_flag == RegularizationType::kL2DECAY) {
PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(UseNesterov,
RegularizationType::kL2DECAY);
} else {
PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(UseNesterov,
RegularizationType::kNONE);
}
} else {
if (regularization_flag == RegularizationType::kL2DECAY) {
PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(NoNesterov,
RegularizationType::kL2DECAY);
} else {
PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(NoNesterov,
RegularizationType::kNONE);
}
}
}
} else if (grad_var->IsType<phi::SelectedRows>()) {
// sparse update of an embedding with SelectedRows
auto grad = ctx.Input<phi::SelectedRows>("Grad");
// sparse update may be empty.
if (grad->rows().size() == 0) {
VLOG(3) << "Grad SelectedRows contains no data!";
return;
}
phi::SelectedRows tmp_merged_grad;
phi::SelectedRows* merged_grad = &tmp_merged_grad;
math::scatter::MergeAdd<DeviceContext, T> merge_func;
merge_func(ctx.template device_context<DeviceContext>(), *grad,
merged_grad);
auto* grad_merge_rows = merged_grad->mutable_rows();
paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(
grad_merge_rows);
const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace());
int64_t row_numel =
merged_grad->value().numel() / merged_grad->rows().size();
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()),
param->numel());
if (use_nesterov) {
SparseMomentumFunctor<T, MT, UseNesterov> functor(
param->data<T>(), merged_grad->value().data<T>(),
velocity->data<MT>(), learning_rate->data<MPDType>(),
master_in_data, mu, rescale_grad, rows, row_numel,
static_cast<int64_t>(merged_grad->rows().size()),
regularization_flag, regularization_coeff,
param_out->mutable_data<T>(ctx.GetPlace()),
velocity_out->mutable_data<MT>(ctx.GetPlace()), master_out_data);
for_range(functor);
} else {
SparseMomentumFunctor<T, MT, NoNesterov> functor(
param->data<T>(), merged_grad->value().data<T>(),
velocity->data<MT>(), learning_rate->data<MPDType>(),
master_in_data, mu, rescale_grad, rows, row_numel,
static_cast<int64_t>(merged_grad->rows().size()),
regularization_flag, regularization_coeff,
param_out->mutable_data<T>(ctx.GetPlace()),
velocity_out->mutable_data<MT>(ctx.GetPlace()), master_out_data);
for_range(functor);
}
} else {
PADDLE_ENFORCE_EQ(false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad "
"in MomentumOp. Excepted LodTensor "
"or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
}
};
} // namespace operators
} // namespace paddle
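
The four momentum functors above share one core update; only the Nesterov correction and the gradient source differ. A hedged scalar sketch (plain C++) of the two dense variants, assuming L2 decay folds into the gradient as grad += coeff * param:

#include <cstdio>

int main() {
  const float mu = 0.9f, lr = 0.1f, coeff = 0.01f;
  float param = 1.0f, velocity = 0.0f, grad = 0.5f;
  grad += coeff * param;                                      // RegularizationType::kL2DECAY
  float velocity_out = velocity * mu + grad;                  // shared velocity update
  float plain = param - lr * velocity_out;                    // NoNesterov
  float nesterov = param - (grad + velocity_out * mu) * lr;   // UseNesterov
  std::printf("velocity_out=%f plain=%f nesterov=%f\n", velocity_out, plain, nesterov);
  return 0;
}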
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/optimizers/sgd_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
namespace paddle {
namespace operators {
......@@ -28,10 +29,10 @@ class NPUMomentumOpKernel : public framework::OpKernel<T> {
std::string regularization_method =
ctx.Attr<std::string>("regularization_method");
auto regularization_coeff = ctx.Attr<float>("regularization_coeff");
RegularizationType regularization_flag{
RegularizationType::kNONE}; // disable regularization
phi::RegularizationType regularization_flag{
phi::RegularizationType::kNONE}; // disable regularization
if (regularization_method == "l2_decay") {
regularization_flag = RegularizationType::kL2DECAY;
regularization_flag = phi::RegularizationType::kL2DECAY;
}
T mu = static_cast<T>(ctx.Attr<float>("mu"));
......@@ -55,7 +56,7 @@ class NPUMomentumOpKernel : public framework::OpKernel<T> {
FillNpuTensorWithConstant<T>(&mu_tensor, mu);
Tensor regularized_grad;
if (regularization_flag == RegularizationType::kL2DECAY) {
if (regularization_flag == phi::RegularizationType::kL2DECAY) {
regularized_grad.mutable_data<T>(grad->dims(), ctx.GetPlace());
const auto& runner1 = NpuOpRunner("Muls", {*param}, {regularized_grad},
{{"value", regularization_coeff}});
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/rmsprop_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
......@@ -170,6 +170,3 @@ http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker);
REGISTER_OP_CPU_KERNEL(
rmsprop, ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/rmsprop_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
rmsprop, ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <math.h>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/kernels/funcs/algorithm.h"
namespace paddle {
namespace operators {
template <typename T>
struct DenseRmspropGradFunctor {
inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {}
HOSTDEVICE inline T operator()(int64_t idx) const { return grad_[idx]; }
const T *grad_;
};
template <typename T>
struct SparseRmspropGradFunctor {
inline SparseRmspropGradFunctor(const T *grad, const int64_t *rows,
int64_t row_numel, int64_t row_count)
: grad_(grad),
rows_(rows),
row_numel_(row_numel),
row_count_(row_count) {}
HOSTDEVICE inline T operator()(int64_t idx) const {
auto row_idx =
phi::funcs::BinarySearch(rows_, row_count_, idx / row_numel_);
return row_idx >= 0 ? grad_[row_idx * row_numel_ + idx % row_numel_] : 0;
}
const T *grad_;
const int64_t *rows_;
int64_t row_numel_;
int64_t row_count_;
};
template <typename T, typename GradFunctor>
struct UncenteredRmspropFunctor {
UncenteredRmspropFunctor(T *param, T *ms, T *mom, const T *lr, T rho,
T epsilon, T momentum,
const GradFunctor &grad_functor)
: param_(param),
ms_(ms),
mom_(mom),
lr_(lr),
rho_(rho),
epsilon_(epsilon),
momentum_(momentum),
grad_functor_(grad_functor) {}
HOSTDEVICE inline void operator()(int64_t idx) const {
T g = grad_functor_(idx);
T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g;
T mom_out = momentum_ * mom_[idx] + lr_[0] * g / sqrt(ms_out + epsilon_);
param_[idx] -= mom_out;
ms_[idx] = ms_out;
mom_[idx] = mom_out;
}
T *param_;
T *ms_;
T *mom_;
const T *lr_;
T rho_;
T epsilon_;
T momentum_;
GradFunctor grad_functor_;
};
template <typename T, typename GradFunctor>
struct CenteredRmspropFunctor {
CenteredRmspropFunctor(T *param, T *ms, T *mom, T *mean_grad, const T *lr,
T rho, T epsilon, T momentum,
const GradFunctor &grad_functor)
: param_(param),
ms_(ms),
mom_(mom),
mean_grad_(mean_grad),
lr_(lr),
rho_(rho),
epsilon_(epsilon),
momentum_(momentum),
grad_functor_(grad_functor) {}
HOSTDEVICE inline void operator()(int64_t idx) const {
T g = grad_functor_(idx);
T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g;
T mg_out = rho_ * mean_grad_[idx] + (1 - rho_) * g;
T mom_out = momentum_ * mom_[idx] +
lr_[0] * g / sqrt(ms_out - mg_out * mg_out + epsilon_);
param_[idx] -= mom_out;
ms_[idx] = ms_out;
mom_[idx] = mom_out;
mean_grad_[idx] = mg_out;
}
T *param_;
T *ms_;
T *mom_;
T *mean_grad_;
const T *lr_;
T rho_;
T epsilon_;
T momentum_;
GradFunctor grad_functor_;
};
template <typename DeviceContext, typename T>
class RmspropOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
using LoDTensor = framework::LoDTensor;
auto *grad_var = ctx.InputVar("Grad");
auto *param_out = ctx.Output<LoDTensor>("ParamOut");
auto *moment_out = ctx.Output<LoDTensor>("MomentOut");
auto *mean_square_out = ctx.Output<LoDTensor>("MeanSquareOut");
auto epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto rho = static_cast<T>(ctx.Attr<float>("decay"));
auto momentum = static_cast<T>(ctx.Attr<float>("momentum"));
bool centered = ctx.Attr<bool>("centered");
auto &p_tensor = *ctx.Input<LoDTensor>("Param");
auto &ms_tensor = *ctx.Input<LoDTensor>("MeanSquare");
auto &lr_tensor = *ctx.Input<LoDTensor>("LearningRate");
auto &mom_tensor = *ctx.Input<LoDTensor>("Moment");
PADDLE_ENFORCE_EQ(p_tensor.IsSharedBufferWith(*param_out), true,
platform::errors::InvalidArgument(
"Param and ParamOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(mom_tensor.IsSharedBufferWith(*moment_out), true,
platform::errors::InvalidArgument(
"Moment and MomentOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(
ms_tensor.IsSharedBufferWith(*mean_square_out), true,
platform::errors::InvalidArgument(
"MeanSquare and MeanSquareOut must be the same Tensor"));
auto &dev_ctx = ctx.template device_context<DeviceContext>();
size_t limit = static_cast<size_t>(ms_tensor.numel());
if (grad_var->IsType<LoDTensor>()) {
auto &grad_tensor = grad_var->Get<LoDTensor>();
if (std::is_same<DeviceContext, platform::CPUDeviceContext>::value) {
auto &place =
*ctx.template device_context<DeviceContext>().eigen_device();
auto lr_value = lr_tensor.data<T>()[0];
auto p = framework::EigenVector<T>::Flatten(p_tensor);
auto ms = framework::EigenVector<T>::Flatten(ms_tensor);
auto g = framework::EigenVector<T>::Flatten(grad_tensor);
auto mom = framework::EigenVector<T>::Flatten(mom_tensor);
auto p_out = framework::EigenVector<T>::Flatten(*param_out);
auto mom_out = framework::EigenVector<T>::Flatten(*moment_out);
auto ms_out = framework::EigenVector<T>::Flatten(*mean_square_out);
ms_out.device(place) = rho * ms + (1 - rho) * g * g;
if (centered) {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto mg = framework::EigenVector<T>::Flatten(mg_tensor);
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE_EQ(
&mg_tensor, mean_grad_out,
platform::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
auto mg_out = framework::EigenVector<T>::Flatten(*mean_grad_out);
mg_out.device(place) = rho * mg + (1 - rho) * g;
mom_out.device(place) =
momentum * mom +
lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt();
} else {
mom_out.device(place) =
momentum * mom + lr_value * g / (ms_out + epsilon).sqrt();
}
p_out.device(place) = p - mom_out;
} else {
DenseRmspropGradFunctor<T> grad_func(grad_tensor.data<T>());
platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
if (centered) {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE_EQ(
&mg_tensor, mean_grad_out,
platform::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()),
moment_out->mutable_data<T>(ctx.GetPlace()),
mean_grad_out->mutable_data<T>(ctx.GetPlace()),
lr_tensor.data<T>(), rho, epsilon, momentum, grad_func));
} else {
for_range(UncenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()),
moment_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
rho, epsilon, momentum, grad_func));
}
}
} else if (grad_var->IsType<phi::SelectedRows>()) {
auto &grad = grad_var->Get<phi::SelectedRows>();
phi::SelectedRows tmp_merged_grad;
phi::SelectedRows *merged_grad = &tmp_merged_grad;
math::scatter::MergeAdd<DeviceContext, T> merge_func;
merge_func(dev_ctx, grad, merged_grad);
platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
auto &grad_merge_rows = merged_grad->rows();
paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(
&grad_merge_rows);
const int64_t *rows = mixv_grad_merge_rows.Data(ctx.GetPlace());
auto &merged_tensor = merged_grad->value();
int64_t row_count = merged_grad->rows().size();
int64_t row_numel = merged_tensor.numel() / row_count;
SparseRmspropGradFunctor<T> grad_func(merged_tensor.data<T>(), rows,
row_numel, row_count);
if (centered) {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE_EQ(
&mg_tensor, mean_grad_out,
platform::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()),
moment_out->mutable_data<T>(ctx.GetPlace()),
mean_grad_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
rho, epsilon, momentum, grad_func));
} else {
for_range(UncenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()),
moment_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
rho, epsilon, momentum, grad_func));
}
} else {
PADDLE_ENFORCE_EQ(false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad "
"in RmspropOp. Excepted LodTensor "
"or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
}
};
} // namespace operators
} // namespace paddle
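For reference, both branches of the kernel above (the Eigen expressions on CPU and the ForRange functors elsewhere) evaluate the same element-wise RMSProp rule. A minimal standalone sketch of one uncentered update step, with made-up values that are not part of this patch:

#include <cmath>
#include <cstdio>

// One scalar RMSProp step, mirroring UncenteredRmspropFunctor:
//   ms  <- rho * ms + (1 - rho) * g * g
//   mom <- momentum * mom + lr * g / sqrt(ms + epsilon)
//   p   <- p - mom
int main() {
  double p = 1.0, ms = 0.0, mom = 0.0;  // parameter, mean square, moment
  const double g = 0.5, lr = 0.1;       // gradient, learning rate (illustrative)
  const double rho = 0.9, momentum = 0.0, epsilon = 1e-10;
  ms = rho * ms + (1 - rho) * g * g;
  mom = momentum * mom + lr * g / std::sqrt(ms + epsilon);
  p -= mom;
  std::printf("p=%f ms=%f mom=%f\n", p, ms, mom);
  return 0;
}

The centered variant additionally tracks a mean gradient mg <- rho * mg + (1 - rho) * g and divides by sqrt(ms - mg^2 + epsilon) instead.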
......@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/rmsprop_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
......
......@@ -14,9 +14,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/optimizers/rmsprop_op.h"
#include <gflags/gflags.h>
#include <iostream>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
......
......@@ -11,7 +11,7 @@ set_property(GLOBAL PROPERTY PHI_KERNELS "")
# [ 1. Common kernel compilation dependencies ]
set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor selected_rows_functor )
# remove this dep after removing fluid deps on tensor creation
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/selected_rows.h"
namespace phi {
template <typename T, typename Context>
void AdagradDenseKernel(const Context& dev_ctx,
const DenseTensor& param,
const DenseTensor& grad,
const DenseTensor& moment,
const DenseTensor& learning_rate,
float epsilon,
DenseTensor* param_out,
DenseTensor* moment_out);
template <typename T, typename Context>
void AdagradSparseKernel(const Context& dev_ctx,
const DenseTensor& param,
const SelectedRows& grad,
const DenseTensor& moment,
const DenseTensor& learning_rate,
float epsilon,
DenseTensor* param_out,
DenseTensor* moment_out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/adagrad_kernel.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/impl/adagrad_kernel_impl.h"
namespace phi {
namespace {
size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
return std::find(rows.begin(), rows.end(), value) - rows.begin();
}
} // namespace
template <typename T>
struct SparseAdagradFunctor<phi::CPUContext, T> {
void operator()(const phi::CPUContext& context,
const phi::SelectedRows& grad,
const DenseTensor& learning_rate,
T epsilon,
DenseTensor* moment,
DenseTensor* param) {
// 1. g_m.rows = set(g.rows)
auto grad_width = grad.value().dims()[1];
paddle::operators::math::scatter::MergeAdd<phi::CPUContext, T> merge_func;
auto grad_merge = merge_func(context, grad);
auto& merge_rows = grad_merge.rows();
auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
// 2. m += g_m * g_m
auto grad_square =
SquareSelectedRows<phi::CPUContext, T>(context, grad_merge);
paddle::operators::math::SelectedRowsAddToTensor<phi::CPUContext, T>
functor;
functor(context, grad_square, moment);
// 3. update parameter
auto* lr = learning_rate.data<T>();
auto* param_data = param->data<T>();
auto* moment_data = moment->data<T>();
for (size_t i = 0; i < merge_rows.size(); i++) {
for (int64_t j = 0; j < grad_width; j++) {
param_data[merge_rows[i] * grad_width + j] -=
lr[0] * grad_merge_data[i * grad_width + j] /
(std::sqrt(moment_data[merge_rows[i] * grad_width + j]) + epsilon);
}
}
}
};
template struct SparseAdagradFunctor<phi::CPUContext, float>;
template struct SparseAdagradFunctor<phi::CPUContext, double>;
} // namespace phi
PD_REGISTER_KERNEL(
adagrad, CPU, ALL_LAYOUT, phi::AdagradDenseKernel, float, double) {}
PD_REGISTER_KERNEL(adagrad_dense_param_sparse_grad,
CPU,
ALL_LAYOUT,
phi::AdagradSparseKernel,
float,
double) {}
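The sparse code paths above (RMSProp earlier and SparseAdagradFunctor here) first run scatter::MergeAdd so that duplicate row indices in the SelectedRows gradient are summed into a single row before the update is applied. A rough illustration of that merge using plain STL containers rather than the real SelectedRows API:

#include <cstdio>
#include <map>
#include <vector>

int main() {
  // A SelectedRows-style gradient: row indices may repeat (row_numel = 2).
  std::vector<long long> rows = {3, 1, 3};
  std::vector<std::vector<double>> values = {{1, 1}, {2, 2}, {4, 4}};
  // MergeAdd: sum all values that share a row index.
  std::map<long long, std::vector<double>> merged;
  for (size_t i = 0; i < rows.size(); ++i) {
    auto& dst = merged[rows[i]];
    if (dst.empty()) dst.assign(values[i].size(), 0.0);
    for (size_t j = 0; j < values[i].size(); ++j) dst[j] += values[i][j];
  }
  for (const auto& kv : merged) {
    std::printf("row %lld: [%g, %g]\n", kv.first, kv.second[0], kv.second[1]);
  }
  return 0;  // prints: row 1: [2, 2]  and  row 3: [5, 5]
}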
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/meshgrid_grad_kernel.h"
#include "paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
PD_REGISTER_KERNEL(meshgrid_grad,
CPU,
ALL_LAYOUT,
phi::MeshgridGradKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/meshgrid_kernel.h"
#include "paddle/phi/kernels/impl/meshgrid_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
PD_REGISTER_KERNEL(meshgrid,
CPU,
ALL_LAYOUT,
phi::MeshgridKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/momentum_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
PD_REGISTER_KERNEL(
momentum, CPU, ALL_LAYOUT, phi::MomentumDenseKernel, float, double) {}
PD_REGISTER_KERNEL(momentum_dense_param_sparse_grad,
CPU,
ALL_LAYOUT,
phi::MomentumSparseKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/rmsprop_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/rmsprop_kernel_impl.h"
PD_REGISTER_KERNEL(
rmsprop, CPU, ALL_LAYOUT, phi::RmspropDenseKernel, float, double) {}
PD_REGISTER_KERNEL(rmsprop_dense_param_sparse_grad,
CPU,
ALL_LAYOUT,
phi::RmspropSparseKernel,
float,
double) {}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/adagrad_kernel.h"
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/operators/optimizers/adagrad_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/impl/adagrad_kernel_impl.h"
namespace paddle {
namespace operators {
namespace {
namespace phi {
template <typename T, int block_size>
__global__ void MergeGradKernel(const T* grad, const int64_t* grad_rows,
T* grad_merge, const int64_t* grad_merge_rows,
__global__ void MergeGradKernel(const T* grad,
const int64_t* grad_rows,
T* grad_merge,
const int64_t* grad_merge_rows,
size_t grad_merge_rows_size,
int64_t row_numel) {
const int ty = blockIdx.y;
......@@ -48,9 +52,12 @@ __global__ void MergeGradKernel(const T* grad, const int64_t* grad_rows,
}
template <typename T, int block_size>
__global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows,
const T* learning_rate, T* param,
T* moment, int64_t row_numel,
__global__ void SparseAdagradFunctorKernel(const T* grad,
const int64_t* rows,
const T* learning_rate,
T* param,
T* moment,
int64_t row_numel,
T epsilon) {
const int ty = blockIdx.y;
int tid = threadIdx.x;
......@@ -67,25 +74,27 @@ __global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows,
(sqrt(moment[index]) + epsilon));
}
}
} // namespace
template <typename T>
struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
struct SparseAdagradFunctor<phi::GPUContext, T> {
void operator()(const phi::GPUContext& context,
const phi::SelectedRows& grad,
const framework::Tensor& learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param) {
const DenseTensor& learning_rate,
T epsilon,
DenseTensor* moment,
DenseTensor* param) {
// 1. g_m.rows = set(g.rows)
auto grad_width = grad.value().dims()[1];
math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
paddle::operators::math::scatter::MergeAdd<phi::GPUContext, T> merge_func;
auto grad_merge = merge_func(context, grad);
auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
framework::Vector<int64_t> merge_rows(grad_merge.rows());
paddle::framework::Vector<int64_t> merge_rows(grad_merge.rows());
// 2. m += g_m * g_m
auto grad_square =
SquareSelectedRows<platform::CUDADeviceContext, T>(context, grad_merge);
SquareSelectedRows<phi::GPUContext, T>(context, grad_merge);
math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
paddle::operators::math::SelectedRowsAddToTensor<phi::GPUContext, T>
functor;
functor(context, grad_square, moment);
// 3. update parameter
......@@ -98,22 +107,33 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
dim3 grid2(1, merge_rows.size());
paddle::framework::MixVector<int64_t> mixv_merge_rows(&merge_rows);
SparseAdagradFunctorKernel<
T, 256><<<grid2, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
grad_merge_data, mixv_merge_rows.CUDAMutableData(context.GetPlace()),
lr, param_data, moment_data, grad_width, epsilon);
T,
256><<<grid2,
threads,
0,
reinterpret_cast<const phi::GPUContext&>(context).stream()>>>(
grad_merge_data,
mixv_merge_rows.CUDAMutableData(context.GetPlace()),
lr,
param_data,
moment_data,
grad_width,
epsilon);
mixv_merge_rows.CopyToCPU();
}
};
template struct SparseAdagradFunctor<platform::CUDADeviceContext, float>;
template struct SparseAdagradFunctor<platform::CUDADeviceContext, double>;
template struct SparseAdagradFunctor<phi::GPUContext, float>;
template struct SparseAdagradFunctor<phi::GPUContext, double>;
} // namespace phi
} // namespace operators
} // namespace paddle
PD_REGISTER_KERNEL(
adagrad, GPU, ALL_LAYOUT, phi::AdagradDenseKernel, float, double) {}
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
adagrad, ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, double>);
PD_REGISTER_KERNEL(adagrad_dense_param_sparse_grad,
GPU,
ALL_LAYOUT,
phi::AdagradSparseKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/meshgrid_grad_kernel.h"
#include "paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
PD_REGISTER_KERNEL(meshgrid_grad,
GPU,
ALL_LAYOUT,
phi::MeshgridGradKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/meshgrid_kernel.h"
#include "paddle/phi/kernels/impl/meshgrid_kernel_impl.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
PD_REGISTER_KERNEL(meshgrid,
GPU,
ALL_LAYOUT,
phi::MeshgridKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/momentum_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
PD_REGISTER_KERNEL(momentum,
GPU,
ALL_LAYOUT,
phi::MomentumDenseKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(momentum_dense_param_sparse_grad,
GPU,
ALL_LAYOUT,
phi::MomentumSparseKernel,
float,
double,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/rmsprop_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/rmsprop_kernel_impl.h"
PD_REGISTER_KERNEL(
rmsprop, GPU, ALL_LAYOUT, phi::RmspropDenseKernel, float, double) {}
PD_REGISTER_KERNEL(rmsprop_dense_param_sparse_grad,
GPU,
ALL_LAYOUT,
phi::RmspropSparseKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/adagrad_kernel.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename DeviceContext, typename T>
struct SparseAdagradFunctor {
void operator()(const DeviceContext& context,
const phi::SelectedRows& grad,
const DenseTensor& learning_rate,
T epsilon,
DenseTensor* moment,
DenseTensor* param);
};
template <typename DeviceContext, typename T>
phi::SelectedRows SquareSelectedRows(const DeviceContext& context,
const phi::SelectedRows& input) {
phi::SelectedRows out;
out.set_rows(input.rows());
out.set_height(input.height());
out.mutable_value()->Resize(input.value().dims());
context.template Alloc<T>(out.mutable_value());
auto e_out = EigenVector<T>::Flatten(*(out.mutable_value()));
auto e_in = EigenVector<T>::Flatten(input.value());
e_out.device(*context.eigen_device()) = e_in.square();
return out;
}
template <typename T, typename Context>
void AdagradDenseKernel(const Context& ctx,
const DenseTensor& param_t,
const DenseTensor& grad_t,
const DenseTensor& moment_t,
const DenseTensor& learning_rate,
float epsilon_t,
DenseTensor* param_out_tensor,
DenseTensor* moment_out_tensor) {
ctx.template Alloc<T>(param_out_tensor);
ctx.template Alloc<T>(moment_out_tensor);
T epsilon = static_cast<T>(epsilon_t);
auto param = EigenVector<T>::Flatten(param_t);
auto grad = EigenVector<T>::Flatten(grad_t);
auto moment = EigenVector<T>::Flatten(moment_t);
auto param_out = EigenVector<T>::Flatten(*param_out_tensor);
auto moment_out = EigenVector<T>::Flatten(*moment_out_tensor);
auto place = *ctx.eigen_device();
moment_out.device(place) = moment + grad * grad;
Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
if (paddle::platform::is_cpu_place(ctx.GetPlace())) {
auto* lr = learning_rate.data<T>();
param_out.device(place) =
param - lr[0] * grad / (moment_out.sqrt() + epsilon);
} else {
auto lr = EigenVector<T>::Flatten(learning_rate);
param_out.device(place) =
param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
}
}
template <typename T, typename Context>
void AdagradSparseKernel(const Context& ctx,
const DenseTensor& param_t,
const SelectedRows& grad_t,
const DenseTensor& moment_t,
const DenseTensor& learning_rate,
float epsilon_t,
DenseTensor* param_out,
DenseTensor* moment_out) {
auto* param_out_tensor = param_out;
auto* moment_out_tensor = moment_out;
ctx.template Alloc<T>(param_out_tensor);
ctx.template Alloc<T>(moment_out_tensor);
T epsilon = static_cast<T>(epsilon_t);
auto* param_tensor = &param_t;
PADDLE_ENFORCE_EQ(param_tensor,
param_out_tensor,
phi::errors::InvalidArgument(
"the input tensor not euqal with output tensor"));
auto* moment_tensor = &moment_t;
PADDLE_ENFORCE_EQ(moment_tensor,
moment_out_tensor,
phi::errors::InvalidArgument(
"the input moment not eual with output moment"));
SparseAdagradFunctor<Context, T> functor;
functor(
ctx, grad_t, learning_rate, epsilon, moment_out_tensor, param_out_tensor);
}
} // namespace phi
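AdagradDenseKernel above is the textbook Adagrad rule applied element-wise; a scalar sketch of what the Eigen expressions compute, with illustrative values only:

#include <cmath>
#include <cstdio>

// Dense Adagrad, as in AdagradDenseKernel:
//   moment <- moment + g * g
//   param  <- param - lr * g / (sqrt(moment) + epsilon)
int main() {
  double param = 1.0, moment = 0.0;
  const double g = 0.5, lr = 0.1, epsilon = 1e-6;
  moment += g * g;
  param -= lr * g / (std::sqrt(moment) + epsilon);
  std::printf("param=%f moment=%f\n", param, moment);
  return 0;
}

The sparse variant performs the same update, but only on the rows produced by MergeAdd, via SquareSelectedRows and SelectedRowsAddToTensor.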
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/meshgrid_grad_kernel.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
namespace phi {
template <typename T, typename Context, int Rank>
void MeshgridBackward(const Context& ctx,
const std::vector<const DenseTensor*>& ins,
const std::vector<const DenseTensor*>& out_grad,
std::vector<DenseTensor*> outs) {
int n = out_grad.size();
auto out_dims = out_grad[0]->dims();
for (int i = 0; i < n; i++) {
ctx.template Alloc<T>(outs[i]);
auto out_grad_tmp = EigenVector<T>::Flatten(*out_grad[i]);
auto in_grad = EigenVector<T>::Flatten(*outs[i]);
std::vector<int> reduce_dims_vec;
std::vector<int> reshape_dims_vec;
for (int j = 0; j < n; j++) {
reduce_dims_vec.push_back(reshape_dims_vec.size());
if (j == i) {
reshape_dims_vec.push_back(1);
reshape_dims_vec.push_back(out_dims[j]);
} else {
reshape_dims_vec.push_back(out_dims[j]);
reshape_dims_vec.push_back(1);
}
}
Eigen::DSizes<Eigen::DenseIndex, Rank> reduce_dims;
for (int k = 0; k < n; k++) {
reduce_dims[k] = reduce_dims_vec[k];
}
Eigen::DSizes<Eigen::DenseIndex, Rank * 2> reshape_dims;
for (int k = 0; k < n * 2; k++) {
reshape_dims[k] = reshape_dims_vec[k];
}
auto& place = *ctx.eigen_device();
funcs::EigenBroadcastGrad<std::decay_t<decltype(place)>, T, Rank>::Eval(
place, in_grad, out_grad_tmp, reduce_dims, reshape_dims);
}
}
template <typename T, typename Context>
void MeshgridGradKernel(const Context& ctx,
const std::vector<const DenseTensor*>& inputs,
const std::vector<const DenseTensor*>& outputs_grad,
std::vector<DenseTensor*> inputs_grad) {
int n = outputs_grad.size();
switch (n) {
case 1:
MeshgridBackward<T, Context, 1>(ctx, inputs, outputs_grad, inputs_grad);
break;
case 2:
MeshgridBackward<T, Context, 2>(ctx, inputs, outputs_grad, inputs_grad);
break;
case 3:
MeshgridBackward<T, Context, 3>(ctx, inputs, outputs_grad, inputs_grad);
break;
case 4:
MeshgridBackward<T, Context, 4>(ctx, inputs, outputs_grad, inputs_grad);
break;
case 5:
MeshgridBackward<T, Context, 5>(ctx, inputs, outputs_grad, inputs_grad);
break;
case 6:
MeshgridBackward<T, Context, 6>(ctx, inputs, outputs_grad, inputs_grad);
break;
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"Excepted Tensor numbers between 1 and 6, but only received d% .",
n));
}
}
} // namespace phi
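In MeshgridBackward, the gradient with respect to the i-th 1-D input is the broadcast output gradient summed over every axis except axis i; the reshape_dims/reduce_dims pair built above encodes exactly that for Eigen. The same reduction for two inputs, written with plain loops as an illustrative sketch:

#include <cstdio>
#include <vector>

int main() {
  // meshgrid produced outputs of shape [N, M] from inputs x (len N) and y (len M).
  const int N = 2, M = 3;
  std::vector<double> dout(N * M, 1.0);  // upstream gradient, all ones for simplicity
  std::vector<double> dx(N, 0.0);        // dx[i] = sum_j dout[i][j]
  std::vector<double> dy(M, 0.0);        // dy[j] = sum_i dout[i][j]
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < M; ++j) {
      dx[i] += dout[i * M + j];
      dy[j] += dout[i * M + j];
    }
  }
  std::printf("dx = [%g, %g], dy = [%g, %g, %g]\n", dx[0], dx[1], dy[0], dy[1], dy[2]);
  return 0;  // dx = [3, 3], dy = [2, 2, 2]
}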
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/meshgrid_kernel.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
namespace phi {
template <typename T, typename Context, int Rank>
void MeshgridForward(const Context& ctx,
const std::vector<const DenseTensor*>& ins,
std::vector<DenseTensor*> outs) {
PADDLE_ENFORCE_EQ(
ins.size() > 1,
true,
phi::errors::InvalidArgument(
"Expected at least 2 input tensors, but only received d%.",
ins.size()));
int64_t size = ins.size();
std::vector<int64_t> shape(size);
for (int64_t i = 0; i < size; i++) {
switch (ins[i]->dims().size()) {
case 0:
shape[i] = 1;
break;
case 1:
shape[i] = ins[i]->dims()[0];
break;
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"Expected scalar or 1D tensor in the tensor list but got tensor "
"%d: ",
i));
}
}
for (int64_t i = 0; i < size; i++) {
std::vector<int64_t> view_shape(size, 1);
view_shape[i] = shape[i];
DenseTensor reshape_ins_tensor;
paddle::framework::TensorCopy(
*ins[i], ctx.GetPlace(), ctx, &reshape_ins_tensor);
DDim out_dims_reshape = phi::make_ddim(view_shape);
reshape_ins_tensor.Resize(out_dims_reshape);
DDim out_dims = phi::make_ddim(shape);
Eigen::DSizes<Eigen::DenseIndex, Rank> bcast_dims;
for (int64_t j = 0; j < size; j++) {
bcast_dims[j] = shape[j];
}
bcast_dims[i] = 1;
outs[i]->Resize(out_dims);
auto x = EigenTensor<T, Rank>::From(
static_cast<const DenseTensor>(reshape_ins_tensor));
ctx.template Alloc<T>(outs[i]);
auto y = EigenTensor<T, Rank>::From(*outs[i]);
auto& place = *ctx.eigen_device();
funcs::EigenBroadcast<std::decay_t<decltype(place)>, T, Rank>::Eval(
place, y, x, bcast_dims);
}
}
template <typename T, typename Context>
void MeshgridKernel(const Context& ctx,
const std::vector<const DenseTensor*>& inputs,
std::vector<DenseTensor*> outputs) {
int rank = inputs.size();
switch (rank) {
case 1:
MeshgridForward<T, Context, 1>(ctx, inputs, outputs);
break;
case 2:
MeshgridForward<T, Context, 2>(ctx, inputs, outputs);
break;
case 3:
MeshgridForward<T, Context, 3>(ctx, inputs, outputs);
break;
case 4:
MeshgridForward<T, Context, 4>(ctx, inputs, outputs);
break;
case 5:
MeshgridForward<T, Context, 5>(ctx, inputs, outputs);
break;
case 6:
MeshgridForward<T, Context, 6>(ctx, inputs, outputs);
break;
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"Excepted Tensor numbers between 1 and 6, but only received d% .",
rank));
}
}
} // namespace phi
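MeshgridForward reshapes the i-th input so its length sits on axis i (all other axes are 1) and broadcasts it to the full grid shape. For two 1-D inputs the result is equivalent to the nested loops below, shown only to illustrate the broadcast semantics:

#include <cstdio>
#include <vector>

int main() {
  std::vector<double> x = {1, 2};        // shape [2]
  std::vector<double> y = {10, 20, 30};  // shape [3]
  const int N = static_cast<int>(x.size()), M = static_cast<int>(y.size());
  // meshgrid yields two [N, M] tensors: x broadcast along axis 1, y along axis 0.
  std::vector<double> X(N * M), Y(N * M);
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < M; ++j) {
      X[i * M + j] = x[i];
      Y[i * M + j] = y[j];
    }
  }
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < M; ++j) std::printf("(%g,%g) ", X[i * M + j], Y[i * M + j]);
    std::printf("\n");
  }
  return 0;
}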
This diff has been collapsed.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <math.h>
#include "paddle/phi/kernels/rmsprop_kernel.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/kernels/funcs/algorithm.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/for_range.h"
namespace phi {
template <typename T>
struct DenseRmspropGradFunctor {
inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {}
HOSTDEVICE inline T operator()(int64_t idx) const { return grad_[idx]; }
const T *grad_;
};
template <typename T>
struct SparseRmspropGradFunctor {
inline SparseRmspropGradFunctor(const T *grad,
const int64_t *rows,
int64_t row_numel,
int64_t row_count)
: grad_(grad),
rows_(rows),
row_numel_(row_numel),
row_count_(row_count) {}
HOSTDEVICE inline T operator()(int64_t idx) const {
auto row_idx =
phi::funcs::BinarySearch(rows_, row_count_, idx / row_numel_);
return row_idx >= 0 ? grad_[row_idx * row_numel_ + idx % row_numel_] : 0;
}
const T *grad_;
const int64_t *rows_;
int64_t row_numel_;
int64_t row_count_;
};
template <typename T, typename GradFunctor>
struct UncenteredRmspropFunctor {
UncenteredRmspropFunctor(T *param,
T *ms,
T *mom,
const T *lr,
T rho,
T epsilon,
T momentum,
const GradFunctor &grad_functor)
: param_(param),
ms_(ms),
mom_(mom),
lr_(lr),
rho_(rho),
epsilon_(epsilon),
momentum_(momentum),
grad_functor_(grad_functor) {}
HOSTDEVICE inline void operator()(int64_t idx) const {
T g = grad_functor_(idx);
T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g;
T mom_out = momentum_ * mom_[idx] + lr_[0] * g / sqrt(ms_out + epsilon_);
param_[idx] -= mom_out;
ms_[idx] = ms_out;
mom_[idx] = mom_out;
}
T *param_;
T *ms_;
T *mom_;
const T *lr_;
T rho_;
T epsilon_;
T momentum_;
GradFunctor grad_functor_;
};
template <typename T, typename GradFunctor>
struct CenteredRmspropFunctor {
CenteredRmspropFunctor(T *param,
T *ms,
T *mom,
T *mean_grad,
const T *lr,
T rho,
T epsilon,
T momentum,
const GradFunctor &grad_functor)
: param_(param),
ms_(ms),
mom_(mom),
mean_grad_(mean_grad),
lr_(lr),
rho_(rho),
epsilon_(epsilon),
momentum_(momentum),
grad_functor_(grad_functor) {}
HOSTDEVICE inline void operator()(int64_t idx) const {
T g = grad_functor_(idx);
T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g;
T mg_out = rho_ * mean_grad_[idx] + (1 - rho_) * g;
T mom_out = momentum_ * mom_[idx] +
lr_[0] * g / sqrt(ms_out - mg_out * mg_out + epsilon_);
param_[idx] -= mom_out;
ms_[idx] = ms_out;
mom_[idx] = mom_out;
mean_grad_[idx] = mg_out;
}
T *param_;
T *ms_;
T *mom_;
T *mean_grad_;
const T *lr_;
T rho_;
T epsilon_;
T momentum_;
GradFunctor grad_functor_;
};
template <typename T, typename Context>
void RmspropDenseKernel(const Context &ctx,
const DenseTensor &param,
const DenseTensor &mean_square,
const DenseTensor &grad,
const DenseTensor &moment,
const DenseTensor &learning_rate,
paddle::optional<const DenseTensor &> mean_grad_opt,
float epsilon_t,
float decay_t,
float momentum_t,
bool centered,
DenseTensor *param_out,
DenseTensor *moment_out,
DenseTensor *mean_square_out,
DenseTensor *mean_grad_out) {
auto epsilon = static_cast<T>(epsilon_t);
auto rho = static_cast<T>(decay_t);
auto momentum = static_cast<T>(momentum_t);
auto &p_tensor = param;
auto &ms_tensor = mean_square;
auto &lr_tensor = learning_rate;
auto &mom_tensor = moment;
PADDLE_ENFORCE_EQ(p_tensor.IsSharedBufferWith(*param_out),
true,
phi::errors::InvalidArgument(
"Param and ParamOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(mom_tensor.IsSharedBufferWith(*moment_out),
true,
phi::errors::InvalidArgument(
"Moment and MomentOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(
ms_tensor.IsSharedBufferWith(*mean_square_out),
true,
phi::errors::InvalidArgument(
"MeanSquare and MeanSquareOut must be the same Tensor"));
size_t limit = static_cast<size_t>(ms_tensor.numel());
auto &grad_tensor = grad;
if (paddle::platform::is_cpu_place(ctx.GetPlace())) {
auto &place = *ctx.eigen_device();
auto lr_value = lr_tensor.data<T>()[0];
auto p = EigenVector<T>::Flatten(p_tensor);
auto ms = EigenVector<T>::Flatten(ms_tensor);
auto g = EigenVector<T>::Flatten(grad_tensor);
auto mom = EigenVector<T>::Flatten(mom_tensor);
auto p_out = EigenVector<T>::Flatten(*param_out);
auto mom_out = EigenVector<T>::Flatten(*moment_out);
auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
ms_out.device(place) = rho * ms + (1 - rho) * g * g;
if (centered) {
auto mg_tensor = mean_grad_opt.get_ptr();
auto mg = EigenVector<T>::Flatten(*mg_tensor);
PADDLE_ENFORCE_EQ(
mg_tensor,
mean_grad_out,
phi::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
mg_out.device(place) = rho * mg + (1 - rho) * g;
mom_out.device(place) =
momentum * mom +
lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt();
} else {
mom_out.device(place) =
momentum * mom + lr_value * g / (ms_out + epsilon).sqrt();
}
p_out.device(place) = p - mom_out;
} else {
DenseRmspropGradFunctor<T> grad_func(grad_tensor.data<T>());
funcs::ForRange<Context> for_range(ctx, limit);
if (centered) {
auto mg_tensor = mean_grad_opt.get_ptr();
PADDLE_ENFORCE_EQ(
mg_tensor,
mean_grad_out,
phi::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
ctx.template Alloc<T>(param_out),
ctx.template Alloc<T>(mean_square_out),
ctx.template Alloc<T>(moment_out),
ctx.template Alloc<T>(mean_grad_out),
lr_tensor.data<T>(),
rho,
epsilon,
momentum,
grad_func));
} else {
for_range(UncenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
ctx.template Alloc<T>(param_out),
ctx.template Alloc<T>(mean_square_out),
ctx.template Alloc<T>(moment_out),
lr_tensor.data<T>(),
rho,
epsilon,
momentum,
grad_func));
}
}
}
template <typename T, typename Context>
void RmspropSparseKernel(const Context &ctx,
const DenseTensor &param,
const DenseTensor &mean_square,
const SelectedRows &grad,
const DenseTensor &moment,
const DenseTensor &learning_rate,
paddle::optional<const DenseTensor &> mean_grad_opt,
float epsilon_t,
float decay_t,
float momentum_t,
bool centered,
DenseTensor *param_out,
DenseTensor *moment_out,
DenseTensor *mean_square_out,
DenseTensor *mean_grad_out) {
auto epsilon = static_cast<T>(epsilon_t);
auto rho = static_cast<T>(decay_t);
auto momentum = static_cast<T>(momentum_t);
auto &p_tensor = param;
auto &ms_tensor = mean_square;
auto &lr_tensor = learning_rate;
auto &mom_tensor = moment;
PADDLE_ENFORCE_EQ(p_tensor.IsSharedBufferWith(*param_out),
true,
phi::errors::InvalidArgument(
"Param and ParamOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(mom_tensor.IsSharedBufferWith(*moment_out),
true,
phi::errors::InvalidArgument(
"Moment and MomentOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(
ms_tensor.IsSharedBufferWith(*mean_square_out),
true,
phi::errors::InvalidArgument(
"MeanSquare and MeanSquareOut must be the same Tensor"));
size_t limit = static_cast<size_t>(ms_tensor.numel());
phi::SelectedRows tmp_merged_grad;
phi::SelectedRows *merged_grad = &tmp_merged_grad;
paddle::operators::math::scatter::MergeAdd<Context, T> merge_func;
merge_func(ctx, grad, merged_grad);
funcs::ForRange<Context> for_range(ctx, limit);
auto &grad_merge_rows = merged_grad->rows();
paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(&grad_merge_rows);
const int64_t *rows = mixv_grad_merge_rows.Data(ctx.GetPlace());
auto &merged_tensor = merged_grad->value();
int64_t row_count = merged_grad->rows().size();
int64_t row_numel = merged_tensor.numel() / row_count;
SparseRmspropGradFunctor<T> grad_func(
merged_tensor.data<T>(), rows, row_numel, row_count);
if (centered) {
auto mg_tensor = mean_grad_opt.get_ptr();
PADDLE_ENFORCE_EQ(mg_tensor,
mean_grad_out,
phi::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
ctx.template Alloc<T>(param_out),
ctx.template Alloc<T>(mean_square_out),
ctx.template Alloc<T>(moment_out),
ctx.template Alloc<T>(mean_grad_out),
lr_tensor.data<T>(),
rho,
epsilon,
momentum,
grad_func));
} else {
for_range(UncenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
ctx.template Alloc<T>(param_out),
ctx.template Alloc<T>(mean_square_out),
ctx.template Alloc<T>(moment_out),
lr_tensor.data<T>(),
rho,
epsilon,
momentum,
grad_func));
}
}
} // namespace phi
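SparseRmspropGradFunctor above reads the merged SelectedRows gradient as if it were dense: for a flat element index it binary-searches the row list (assumed sorted after MergeAdd) and returns zero when the row is absent. A minimal sketch of that lookup using std::lower_bound in place of phi::funcs::BinarySearch, with hypothetical data:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Merged sparse gradient: only rows 1 and 3 exist; each row holds 2 values.
  const std::vector<long long> rows = {1, 3};     // sorted row indices
  const std::vector<double> grad = {2, 2, 5, 5};  // row-major values, row_numel = 2
  const long long row_numel = 2;
  auto dense_grad_at = [&](long long idx) -> double {
    long long row = idx / row_numel;
    auto it = std::lower_bound(rows.begin(), rows.end(), row);
    if (it == rows.end() || *it != row) return 0.0;  // absent row -> zero gradient
    long long row_idx = it - rows.begin();
    return grad[row_idx * row_numel + idx % row_numel];
  };
  for (long long idx = 0; idx < 8; ++idx) std::printf("%g ", dense_grad_at(idx));
  std::printf("\n");  // 0 0 2 2 0 0 5 5
  return 0;
}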
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void MeshgridGradKernel(const Context& ctx,
const std::vector<const DenseTensor*>& inputs,
const std::vector<const DenseTensor*>& outputs_grad,
std::vector<DenseTensor*> inputs_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void MeshgridKernel(const Context& ctx,
const std::vector<const DenseTensor*>& inputs,
std::vector<DenseTensor*> outputs);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/selected_rows.h"
namespace phi {
template <typename T, typename Context>
void MomentumDenseKernel(const Context& dev_ctx,
const DenseTensor& param,
const DenseTensor& grad,
const DenseTensor& velocity,
const DenseTensor& learning_rate,
paddle::optional<const DenseTensor&> master_param,
float mu,
bool use_nesterov,
const std::string& regularization_method,
float regularization_coeff,
bool multi_precision,
float rescale_grad,
DenseTensor* param_out,
DenseTensor* velocity_out,
DenseTensor* master_param_out);
template <typename T, typename Context>
void MomentumSparseKernel(const Context& dev_ctx,
const DenseTensor& param,
const SelectedRows& grad,
const DenseTensor& velocity,
const DenseTensor& learning_rate,
paddle::optional<const DenseTensor&> master_param,
float mu,
bool use_nesterov,
const std::string& regularization_method,
float regularization_coeff,
bool multi_precision,
float rescale_grad,
DenseTensor* param_out,
DenseTensor* velocity_out,
DenseTensor* master_param_out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/selected_rows.h"
namespace phi {
template <typename T, typename Context>
void RmspropDenseKernel(const Context& dev_ctx,
const DenseTensor& param,
const DenseTensor& mean_square,
const DenseTensor& grad,
const DenseTensor& moment,
const DenseTensor& learning_rate,
paddle::optional<const DenseTensor&> mean_grad,
float epsilon,
float decay,
float momentum,
bool centered,
DenseTensor* param_out,
DenseTensor* moment_out,
DenseTensor* mean_square_out,
DenseTensor* mean_grad_out);
template <typename T, typename Context>
void RmspropSparseKernel(const Context& dev_ctx,
const DenseTensor& param,
const DenseTensor& mean_square,
const SelectedRows& grad,
const DenseTensor& moment,
const DenseTensor& learning_rate,
paddle::optional<const DenseTensor&> mean_grad,
float epsilon,
float decay,
float momentum,
bool centered,
DenseTensor* param_out,
DenseTensor* moment_out,
DenseTensor* mean_square_out,
DenseTensor* mean_grad_out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature AdagradOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.IsDenseTensorInput("Grad")) {
return KernelSignature("adagrad",
{"Param", "Grad", "Moment", "LearningRate"},
{"epsilon"},
{"ParamOut", "MomentOut"});
} else if (ctx.IsSelectedRowsInput("Grad")) {
return KernelSignature("adagrad_dense_param_sparse_grad",
{"Param", "Grad", "Moment", "LearningRate"},
{"epsilon"},
{"ParamOut", "MomentOut"});
}
return KernelSignature("unregistered", {}, {}, {});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(adagrad, phi::AdagradOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature MeshgridOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("meshgrid", {"X"}, {}, {"Out"});
}
KernelSignature MeshgridGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"meshgrid_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(meshgrid, phi::MeshgridOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(meshgrid_grad, phi::MeshgridGradOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature MomentumOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.IsDenseTensorInput("Grad")) {
return KernelSignature(
"momentum",
{"Param", "Grad", "Velocity", "LearningRate", "MasterParam"},
{"mu",
"use_nesterov",
"regularization_method",
"regularization_coeff",
"multi_precision",
"rescale_grad"},
{"ParamOut", "VelocityOut", "MasterParamOut"});
} else if (ctx.IsSelectedRowsInput("Grad")) {
return KernelSignature(
"momentum_dense_param_sparse_grad",
{"Param", "Grad", "Velocity", "LearningRate", "MasterParam"},
{"mu",
"use_nesterov",
"regularization_method",
"regularization_coeff",
"multi_precision",
"rescale_grad"},
{"ParamOut", "VelocityOut", "MasterParamOut"});
}
return KernelSignature("unregistered", {}, {}, {});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(momentum, phi::MomentumOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature RmspropOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.IsDenseTensorInput("Grad")) {
return KernelSignature(
"rmsprop",
{"Param", "MeanSquare", "Grad", "Moment", "LearningRate", "MeanGrad"},
{"epsilon", "decay", "momentum", "centered"},
{"ParamOut", "MomentOut", "MeanSquareOut", "MeanGradOut"});
} else if (ctx.IsSelectedRowsInput("Grad")) {
return KernelSignature(
"rmsprop_dense_param_sparse_grad",
{"Param", "MeanSquare", "Grad", "Moment", "LearningRate", "MeanGrad"},
{"epsilon", "decay", "momentum", "centered"},
{"ParamOut", "MomentOut", "MeanSquareOut", "MeanGradOut"});
}
return KernelSignature("unregistered", {}, {}, {});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(rmsprop, phi::RmspropOpArgumentMapping);
......@@ -20,6 +20,7 @@ import paddle.fluid.core as core
from paddle.fluid.op import Operator
from op_test import OpTest
import math
import paddle
class TestAdagradOp1(OpTest):
......@@ -189,4 +190,5 @@ class TestSparseAdagradOp(unittest.TestCase):
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
......@@ -258,6 +258,7 @@ class TestMergedMomentum(unittest.TestCase):
def setUp(self):
paddle.enable_static()
self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]]
self.seed = 10
def gen_rand_data(self, shapes, dtype):
......@@ -391,4 +392,5 @@ class TestMergedMomentum2(unittest.TestCase):
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
......@@ -84,7 +84,6 @@ class TestMeshgridOp3(unittest.TestCase):
feed={'x': input_1,
'y': input_2},
fetch_list=[grid_x, grid_y])
assert np.array_equal(res_1, out_1)
assert np.array_equal(res_2, out_2)
......@@ -180,4 +179,5 @@ class TestMeshgridOp8(unittest.TestCase):
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
......@@ -872,6 +872,7 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
place=place, use_amp=use_amp, use_multi_tensor=True)
output2, params2 = self._momentum_optimize_dygraph(
place=place, use_amp=use_amp, use_multi_tensor=False)
self.assertEqual(np.allclose(output1, output2, rtol=1e-05), True)
for idx in range(len(params1)):
self.assertEqual(
......@@ -991,4 +992,5 @@ class TestMultiTensorMomentumStatic(unittest.TestCase):
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
......@@ -316,4 +316,5 @@ class TestRMSPropV2Group(TestRMSPropV2):
if __name__ == "__main__":
paddle.enable_static()
unittest.main()