diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cc b/paddle/fluid/operators/collective/partial_allgather_op.cc index 006107680592309a6518ceb032b8259a8cb2be18..7f9e5f3f3e37f0c07b4fae8469e199bbca3cd56a 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cc @@ -85,9 +85,12 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, ops::PartialAllGatherOpInplaceInferer) -REGISTER_OP_CPU_KERNEL(partial_allgather, - ops::PartialAllGatherOpCPUKernel, - ops::PartialAllGatherOpCPUKernel, - ops::PartialAllGatherOpCPUKernel, - ops::PartialAllGatherOpCPUKernel, - ops::PartialAllGatherOpCPUKernel); +PD_REGISTER_STRUCT_KERNEL(partial_allgather, + CPU, + ALL_LAYOUT, + ops::PartialAllGatherOpCPUKernel, + float, + double, + int, + int64_t, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc index ce5a5438eff55515a28dec9d7f633d27df5e390f..2374f4a4aed8239053a4ccb51803377c0d75b596 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class PartialAllGatherOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -102,12 +102,16 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(partial_allgather, - ops::PartialAllGatherOpCUDAKernel, +PD_REGISTER_STRUCT_KERNEL(partial_allgather, + GPU, + ALL_LAYOUT, + ops::PartialAllGatherOpCUDAKernel, + float, + double, #if NCCL_VERSION_CODE >= 21000 - ops::PartialAllGatherOpCUDAKernel, + plat::bfloat16, #endif - ops::PartialAllGatherOpCUDAKernel, - ops::PartialAllGatherOpCUDAKernel, - ops::PartialAllGatherOpCUDAKernel, - ops::PartialAllGatherOpCUDAKernel); + int, + int64_t, + plat::float16) { +} diff --git a/paddle/fluid/operators/collective/partial_allgather_op.h b/paddle/fluid/operators/collective/partial_allgather_op.h index 7e9c85214cf318db390985ac47c743808c151717..6b827a2656f29edd891ff4156ac718144d77beed 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.h +++ b/paddle/fluid/operators/collective/partial_allgather_op.h @@ -26,7 +26,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class PartialAllGatherOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/collective/partial_recv_op.cc b/paddle/fluid/operators/collective/partial_recv_op.cc index 14cca68cf16ab5dcadf2a97f817a3ba2780c6b3a..5cd4a72ea7ea9f5c8e2311e51d73221361e36e0e 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cc @@ -129,9 +129,12 @@ REGISTER_OP_WITHOUT_GRADIENT(partial_recv, ops::PartialRecvOp, ops::PartialRecvOpMaker); -REGISTER_OP_CPU_KERNEL(partial_recv, - ops::PartialRecvOpCPUKernel, - ops::PartialRecvOpCPUKernel, - ops::PartialRecvOpCPUKernel, - ops::PartialRecvOpCPUKernel, - ops::PartialRecvOpCPUKernel); +PD_REGISTER_STRUCT_KERNEL(partial_recv, + CPU, + ALL_LAYOUT, + ops::PartialRecvOpCPUKernel, + float, + double, + int, + int64_t, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index 306175d1ca7af8d5e4768b488bfa0c7a417ed344..b0df94194e4f87801b38a1a6df65236e8f9944a2 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class PartialRecvOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -118,12 +118,16 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(partial_recv, - ops::PartialRecvOpCUDAKernel, +PD_REGISTER_STRUCT_KERNEL(partial_recv, + GPU, + ALL_LAYOUT, + ops::PartialRecvOpCUDAKernel, + float, + double, #if NCCL_VERSION_CODE >= 21000 - ops::PartialRecvOpCUDAKernel, + plat::bfloat16, #endif - ops::PartialRecvOpCUDAKernel, - ops::PartialRecvOpCUDAKernel, - ops::PartialRecvOpCUDAKernel, - ops::PartialRecvOpCUDAKernel); + int, + int64_t, + plat::float16) { +} diff --git a/paddle/fluid/operators/collective/partial_recv_op.h b/paddle/fluid/operators/collective/partial_recv_op.h index d64fa39939c2d6e85a709874f45977c15b26230a..fdf3f02b0d679f00541743728f66c2f7dd9a6054 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.h +++ b/paddle/fluid/operators/collective/partial_recv_op.h @@ -24,7 +24,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class PartialRecvOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/collective/partial_send_op.cc b/paddle/fluid/operators/collective/partial_send_op.cc index a45cc6ddde6438fb6b6165c0963ec550e9299a4c..936336ce74ad5257a88908a8792e9bb16f24f4db 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cc @@ -94,9 +94,12 @@ REGISTER_OP_WITHOUT_GRADIENT(partial_send, ops::PartialSendOp, ops::PartialSendMaker); -REGISTER_OP_CPU_KERNEL(partial_send, - ops::PartialSendOpCPUKernel, - ops::PartialSendOpCPUKernel, - ops::PartialSendOpCPUKernel, - ops::PartialSendOpCPUKernel, - ops::PartialSendOpCPUKernel); +PD_REGISTER_STRUCT_KERNEL(partial_send, + CPU, + ALL_LAYOUT, + ops::PartialSendOpCPUKernel, + float, + double, + int, + int64_t, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc index afac7f963fa0dc6107938db7b67c5164e512039e..dc24ea01fc98e96f59409f5a0628ba36642cc6c7 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -24,7 +24,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class PartialSendCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -117,12 +117,16 @@ class PartialSendCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(partial_send, - ops::PartialSendCUDAKernel, - ops::PartialSendCUDAKernel, +PD_REGISTER_STRUCT_KERNEL(partial_send, + GPU, + ALL_LAYOUT, + ops::PartialSendCUDAKernel, + float, + double, #if NCCL_VERSION_CODE >= 21000 - ops::PartialSendCUDAKernel, + plat::bfloat16, #endif - ops::PartialSendCUDAKernel, - ops::PartialSendCUDAKernel, - ops::PartialSendCUDAKernel); + int, + int64_t, + plat::float16) { +} diff --git a/paddle/fluid/operators/collective/partial_send_op.h b/paddle/fluid/operators/collective/partial_send_op.h index 7550ac40078c40c12f21c9193fc4244058a3b362..773125be7d40f0c3c7330a16efd0736623e70225 100644 --- a/paddle/fluid/operators/collective/partial_send_op.h +++ b/paddle/fluid/operators/collective/partial_send_op.h @@ -25,7 +25,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class PartialSendOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc index c331cdc97f00058a46a9d781bc80d8685d123b90..936480a9e23ddb22bceb1e71b0de6aa3e5c88c36 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class PolygonBoxTransformCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -111,7 +111,10 @@ REGISTER_OPERATOR( ops::PolygonBoxTransformOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - polygon_box_transform, - ops::PolygonBoxTransformCPUKernel, - ops::PolygonBoxTransformCPUKernel); + +PD_REGISTER_STRUCT_KERNEL(polygon_box_transform, + CPU, + ALL_LAYOUT, + ops::PolygonBoxTransformCPUKernel, + float, + double) {} diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index de43f2d62b45547176d1f13adf6240117501d202..4f182464f77b5015ce9e3639a63551aa2a06c8e4 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -38,7 +38,7 @@ __global__ void PolygonBoxTransformKernel( } } -template +template class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -73,7 +73,10 @@ class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL( - polygon_box_transform, - paddle::operators::PolygonBoxTransformOpCUDAKernel, - paddle::operators::PolygonBoxTransformOpCUDAKernel); +namespace ops = paddle::operators; +PD_REGISTER_STRUCT_KERNEL(polygon_box_transform, + GPU, + ALL_LAYOUT, + ops::PolygonBoxTransformOpCUDAKernel, + float, + double) {} diff --git a/paddle/fluid/operators/metrics/precision_recall_op.cc b/paddle/fluid/operators/metrics/precision_recall_op.cc index 0652151320d819a9a9129a88b27a0b13c384b19b..413cd8546011beee2faa54ca9dde5002f01555d5 100644 --- a/paddle/fluid/operators/metrics/precision_recall_op.cc +++ b/paddle/fluid/operators/metrics/precision_recall_op.cc @@ -242,7 +242,9 @@ REGISTER_OPERATOR( ops::PrecisionRecallOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - precision_recall, - ops::PrecisionRecallKernel, - ops::PrecisionRecallKernel); +PD_REGISTER_STRUCT_KERNEL(precision_recall, + CPU, + ALL_LAYOUT, + ops::PrecisionRecallKernel, + float, + double) {} diff --git a/paddle/fluid/operators/metrics/precision_recall_op.h b/paddle/fluid/operators/metrics/precision_recall_op.h index bec8bba09ad1a1a98f6e6883cea1b3145d73a932..6eef5658c5c00749dd1d6dc61d8f1d2ddd9de21d 100644 --- a/paddle/fluid/operators/metrics/precision_recall_op.h +++ b/paddle/fluid/operators/metrics/precision_recall_op.h @@ -26,7 +26,7 @@ using EigenMatrix = framework::EigenMatrix; enum StateVariable { TP = 0, FP, TN, FN }; -template +template class PrecisionRecallKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc index d328329e1c24a8011b05eadc19ce7d425fc0ebbd..7dae16afafdf11e08df9799adea32d5657ea5474 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc @@ -52,7 +52,7 @@ static ncclRedOp_t str_to_nccl_red_type(std::string reduction) { return it->second; } -template +template class NCCLAllReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -87,7 +87,7 @@ class 
NCCLAllReduceKernel : public framework::OpKernel { } }; -template +template class NCCLReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -128,7 +128,7 @@ class NCCLReduceKernel : public framework::OpKernel { } }; -template +template class NCCLBcastKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -172,6 +172,9 @@ class NCCLBcastKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); -REGISTER_OP_CUDA_KERNEL(ncclBcast, ops::NCCLBcastKernel); -REGISTER_OP_CUDA_KERNEL(ncclReduce, ops::NCCLReduceKernel); +PD_REGISTER_STRUCT_KERNEL( + ncclAllReduce, GPU, ALL_LAYOUT, ops::NCCLAllReduceKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + ncclBcast, GPU, ALL_LAYOUT, ops::NCCLBcastKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + ncclReduce, GPU, ALL_LAYOUT, ops::NCCLReduceKernel, float) {} diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc index 8d5528716f4a92819736474b56e4c50e41da5391..87c0708e12d398588ccfb3e4bdca53067be86e59 100644 --- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc @@ -31,9 +31,12 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" USE_NO_KERNEL_OP(ncclInit); -USE_CUDA_ONLY_OP(ncclAllReduce); -USE_CUDA_ONLY_OP(ncclReduce); -USE_CUDA_ONLY_OP(ncclBcast); +USE_OP_ITSELF(ncclAllReduce); +USE_OP_ITSELF(ncclReduce); +USE_OP_ITSELF(ncclBcast); +PD_DECLARE_KERNEL(ncclAllReduce, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(ncclReduce, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(ncclBcast, GPU, ALL_LAYOUT); namespace f = paddle::framework; namespace p = paddle::platform; diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 286c851278117979e34adff517c3727e8371e0f4..9c9055d1987e1202e6bcc4cee113799841ee44b7 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -320,9 +320,8 @@ REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad, ops::NCEOpGradVarTypeInference, ops::NCEGradOpNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL(nce, - ops::NCEKernel, - ops::NCEKernel); -REGISTER_OP_CPU_KERNEL(nce_grad, - ops::NCEGradKernel, - ops::NCEGradKernel); + +PD_REGISTER_STRUCT_KERNEL(nce, CPU, ALL_LAYOUT, ops::NCEKernel, float, double) { +} +PD_REGISTER_STRUCT_KERNEL( + nce_grad, CPU, ALL_LAYOUT, ops::NCEGradKernel, float, double) {} diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 4b9fe86b225653c0df3a8d4a1d94278c1449e828..188568ec323ba3f3db9ce2f8e799f4b910d43a1c 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -75,7 +75,7 @@ void PrepareSamples(const framework::ExecutionContext &context, } } -template +template class NCEKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -245,7 +245,7 @@ class NCEKernel : public framework::OpKernel { } }; -template +template class NCEGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { diff --git a/paddle/fluid/operators/nop_op.cc b/paddle/fluid/operators/nop_op.cc index 709b1f4f1f020901d659b35b69b8329391faa64a..69f0bfb2abcd312b1a3fd8ebd567a44d4fb355a7 100644 --- a/paddle/fluid/operators/nop_op.cc +++ b/paddle/fluid/operators/nop_op.cc @@ -45,7 
+45,7 @@ establish the dependency between input and output tensors. } }; -template +template class NopKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override {} @@ -58,8 +58,8 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(nop, ops::NopOp, ops::NopOpMaker); -REGISTER_OP_CPU_KERNEL(nop, ops::NopKernel); +PD_REGISTER_STRUCT_KERNEL(nop, CPU, ALL_LAYOUT, ops::NopKernel, float) {} -REGISTER_OP_CUDA_KERNEL(nop, ops::NopKernel); - -REGISTER_OP_NPU_KERNEL(nop, ops::NopKernel); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_STRUCT_KERNEL(nop, GPU, ALL_LAYOUT, ops::NopKernel, float) {} +#endif diff --git a/paddle/fluid/operators/number_count_op.cc b/paddle/fluid/operators/number_count_op.cc index e636bc98bfca5a695de2de345e6589d68d23618a..bc566ca5fbfa751c86727cf48cc7dae37e833e9b 100644 --- a/paddle/fluid/operators/number_count_op.cc +++ b/paddle/fluid/operators/number_count_op.cc @@ -58,10 +58,9 @@ class NumberCountOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CPU_KERNEL(number_count, - ops::NumberCountOpCPUKernel, - ops::NumberCountOpCPUKernel); - REGISTER_OP_WITHOUT_GRADIENT(number_count, ops::NumberCountOp, ops::NumberCountOpMaker); + +PD_REGISTER_STRUCT_KERNEL( + number_count, CPU, ALL_LAYOUT, ops::NumberCountOpCPUKernel, int, int64_t) {} diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu index fdab03698711c35a3de4508aa38b0480b56fc24a..b9afffd7887d49a841469dd4d2c7424768c036c6 100644 --- a/paddle/fluid/operators/number_count_op.cu +++ b/paddle/fluid/operators/number_count_op.cu @@ -79,7 +79,7 @@ __global__ void NumberCount(const T* numbers, } } -template +template class NumberCountOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -111,4 +111,5 @@ class NumberCountOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(number_count, ops::NumberCountOpCUDAKernel); +PD_REGISTER_STRUCT_KERNEL( + number_count, GPU, ALL_LAYOUT, ops::NumberCountOpCUDAKernel, int64_t) {} diff --git a/paddle/fluid/operators/number_count_op.h b/paddle/fluid/operators/number_count_op.h index ded7ea6eec54f7ce08ae610274febdbb4f82d292..e95336ae2a3a8e8c7f32049fd03682cb5b42d77e 100644 --- a/paddle/fluid/operators/number_count_op.h +++ b/paddle/fluid/operators/number_count_op.h @@ -24,7 +24,7 @@ namespace paddle { namespace operators { -template +template class NumberCountOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc index 076f5137cab92f7cc34aed27e95e63da783219f0..3261e96cbbeca4200da4d89ba753477ed9ec250c 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc @@ -133,5 +133,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, ops::ProximalAdagradOp, ops::ProximalAdagradOpMaker); -REGISTER_OP_CPU_KERNEL(proximal_adagrad, - ops::ProximalAdagradOpKernel); +PD_REGISTER_STRUCT_KERNEL( + proximal_adagrad, CPU, ALL_LAYOUT, ops::ProximalAdagradOpKernel, float) {} diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu 
b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu index c338f4cc717a5747c76ad2336628b9835ad97058..0a79dcd425f1281159754c2f0ded47a64bd3f750 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu @@ -13,5 +13,5 @@ specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(proximal_adagrad, - ops::ProximalAdagradOpKernel); +PD_REGISTER_STRUCT_KERNEL( + proximal_adagrad, GPU, ALL_LAYOUT, ops::ProximalAdagradOpKernel, float) {} diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h index 72eccd17e4489e6d6a3617f5bdfcb23dfdfb1471..973d870d14f31b2736be289ce067c10cc55c9c66 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class ProximalAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cc b/paddle/fluid/operators/optimizers/proximal_gd_op.cc index d7e01aa07109ea030a6995684dc131750c6cd982..08cc29ce9eb8db2353e58173ba42ef4fb030c6f9 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cc @@ -106,5 +106,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(proximal_gd, ops::ProximalGDOp, ops::ProximalGDOpMaker); -REGISTER_OP_CPU_KERNEL(proximal_gd, - ops::ProximalGDOpKernel); + +PD_REGISTER_STRUCT_KERNEL( + proximal_gd, CPU, ALL_LAYOUT, ops::ProximalGDOpKernel, float) {} diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cu b/paddle/fluid/operators/optimizers/proximal_gd_op.cu index edc911134c7293f34c16900337568af48877ff88..ef1edfc2ee458ffb4208a029e38f1cdfb719aa1f 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cu +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cu @@ -13,5 +13,5 @@ specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/optimizers/proximal_gd_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(proximal_gd, - ops::ProximalGDOpKernel); +PD_REGISTER_STRUCT_KERNEL( + proximal_gd, GPU, ALL_LAYOUT, ops::ProximalGDOpKernel, float) {} diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.h b/paddle/fluid/operators/optimizers/proximal_gd_op.h index 49cf7b68bd32afe7bbc330005aa6d81531a98469..1945ef5bf6b778ef3750f7615af7ef97123c2cdf 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.h +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class ProximalGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index 91eeed0e9008eccddb95e308d5982828699d8963..e29981d35b41f9b4ef34f66ba203f62c7131b53f 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -402,7 +402,7 @@ static inline void GetPaddings(int* paddings, } } -template +template class Pad2dCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -520,7 +520,7 @@ class Pad2dCPUKernel : public framework::OpKernel { } }; -template +template class Pad2dGradCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -873,11 +873,8 @@ REGISTER_OPERATOR(pad2d, REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad, ops::Pad2dOpGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(pad2d, - ops::Pad2dCPUKernel, - ops::Pad2dCPUKernel, - ops::Pad2dCPUKernel, - ops::Pad2dCPUKernel); -REGISTER_OP_CPU_KERNEL(pad2d_grad, - ops::Pad2dGradCPUKernel, - ops::Pad2dGradCPUKernel); + +PD_REGISTER_STRUCT_KERNEL( + pad2d, CPU, ALL_LAYOUT, ops::Pad2dCPUKernel, float, double, int, int64_t) {} +PD_REGISTER_STRUCT_KERNEL( + pad2d_grad, CPU, ALL_LAYOUT, ops::Pad2dGradCPUKernel, float, double) {} diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index 7b0dd2149dead520f83b248d941062986568ac16..b8263ea6bb16929accfae163af9590535f455689 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -361,7 +361,7 @@ static inline void GetPaddings(int* paddings, } } -template +template class Pad2dCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -489,7 +489,7 @@ class Pad2dCUDAKernel : public framework::OpKernel { } }; -template +template class Pad2dGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -618,13 +618,19 @@ class Pad2dGradCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(pad2d, - ops::Pad2dCUDAKernel, - ops::Pad2dCUDAKernel, - ops::Pad2dCUDAKernel, - ops::Pad2dCUDAKernel, - ops::Pad2dCUDAKernel); -REGISTER_OP_CUDA_KERNEL(pad2d_grad, - ops::Pad2dGradCUDAKernel, - ops::Pad2dGradCUDAKernel, - ops::Pad2dGradCUDAKernel); +PD_REGISTER_STRUCT_KERNEL(pad2d, + GPU, + ALL_LAYOUT, + ops::Pad2dCUDAKernel, + float, + double, + int, + int64_t, + plat::float16) {} +PD_REGISTER_STRUCT_KERNEL(pad2d_grad, + GPU, + ALL_LAYOUT, + ops::Pad2dGradCUDAKernel, + float, + double, + plat::float16) {} diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 9b08bb3fc1e1c6c5451f979e8bbdc9ddbafdf259..d00cefab45045436a686dd3f41132e609e1c7c2a 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -243,26 +243,38 @@ REGISTER_OPERATOR(pad_constant_like, ops::PadConstantLikeOpGradMaker); REGISTER_OPERATOR(pad_constant_like_grad, ops::PadConstantLikeOpGrad); -REGISTER_OP_CPU_KERNEL(pad_constant_like, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel); 
-REGISTER_OP_CPU_KERNEL( - pad_constant_like_grad, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel); - -REGISTER_OP_CUDA_KERNEL(pad_constant_like, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel); -REGISTER_OP_CUDA_KERNEL( - pad_constant_like_grad, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel); +PD_REGISTER_STRUCT_KERNEL(pad_constant_like, + CPU, + ALL_LAYOUT, + ops::PadConstantLikeKernel, + float, + double, + int, + int64_t) {} +PD_REGISTER_STRUCT_KERNEL(pad_constant_like_grad, + CPU, + ALL_LAYOUT, + ops::PadConstantLikeGradKernel, + float, + double, + int, + int64_t) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_STRUCT_KERNEL(pad_constant_like, + GPU, + ALL_LAYOUT, + ops::PadConstantLikeKernel, + float, + double, + int, + int64_t) {} +PD_REGISTER_STRUCT_KERNEL(pad_constant_like_grad, + GPU, + ALL_LAYOUT, + ops::PadConstantLikeGradKernel, + float, + double, + int, + int64_t) {} +#endif diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h index ba87bd3ef18182288182d06c4ebd909c501043ed..f6162037fbd56fe335217b1ede9f7f49b9ab5e8b 100644 --- a/paddle/fluid/operators/pad_constant_like_op.h +++ b/paddle/fluid/operators/pad_constant_like_op.h @@ -26,7 +26,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class PadConstantLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -61,7 +61,7 @@ class PadConstantLikeKernel : public framework::OpKernel { } }; -template +template class PadConstantLikeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { diff --git a/paddle/fluid/operators/partial_concat_op.cc b/paddle/fluid/operators/partial_concat_op.cc index 1fb9dceb4150c035a5d6d84b0e09b5eb852943aa..f2f3da9f0511f1c89c221d564db213788904a628 100644 --- a/paddle/fluid/operators/partial_concat_op.cc +++ b/paddle/fluid/operators/partial_concat_op.cc @@ -202,14 +202,19 @@ REGISTER_OPERATOR(partial_concat, REGISTER_OPERATOR(partial_concat_grad, ops::PartialConcatGradOp); -REGISTER_OP_CPU_KERNEL(partial_concat, - ops::PartialConcatKernel, - ops::PartialConcatKernel, - ops::PartialConcatKernel, - ops::PartialConcatKernel); - -REGISTER_OP_CPU_KERNEL(partial_concat_grad, - ops::PartialConcatGradientOpKernel, - ops::PartialConcatGradientOpKernel, - ops::PartialConcatGradientOpKernel, - ops::PartialConcatGradientOpKernel); +PD_REGISTER_STRUCT_KERNEL(partial_concat, + CPU, + ALL_LAYOUT, + ops::PartialConcatKernel, + float, + double, + int, + int64_t) {} +PD_REGISTER_STRUCT_KERNEL(partial_concat_grad, + CPU, + ALL_LAYOUT, + ops::PartialConcatGradientOpKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/fluid/operators/partial_concat_op.cu b/paddle/fluid/operators/partial_concat_op.cu index f4acf68dcbc70847f67dfd67a1eaf5d70403fcba..ffef094fa96dd047c265957c095a4102d42980d6 100644 --- a/paddle/fluid/operators/partial_concat_op.cu +++ b/paddle/fluid/operators/partial_concat_op.cu @@ -65,7 +65,7 @@ __global__ void ConcatPartialGradCUDAKernel(T **in, } } -template +template class PartialConcatOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext 
&ctx) const override { @@ -146,7 +146,7 @@ class PartialConcatOpCUDAKernel : public framework::OpKernel { } }; -template +template class PartialConcatGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -231,16 +231,22 @@ class PartialConcatGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(partial_concat, - ops::PartialConcatOpCUDAKernel, - ops::PartialConcatOpCUDAKernel, - ops::PartialConcatOpCUDAKernel, - ops::PartialConcatOpCUDAKernel, - ops::PartialConcatOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(partial_concat_grad, - ops::PartialConcatGradOpCUDAKernel, - ops::PartialConcatGradOpCUDAKernel, - ops::PartialConcatGradOpCUDAKernel, - ops::PartialConcatGradOpCUDAKernel, - ops::PartialConcatGradOpCUDAKernel); + +PD_REGISTER_STRUCT_KERNEL(partial_concat, + GPU, + ALL_LAYOUT, + ops::PartialConcatOpCUDAKernel, + float, + double, + int, + int64_t, + plat::float16) {} +PD_REGISTER_STRUCT_KERNEL(partial_concat_grad, + GPU, + ALL_LAYOUT, + ops::PartialConcatGradOpCUDAKernel, + float, + double, + int, + int64_t, + plat::float16) {} diff --git a/paddle/fluid/operators/partial_concat_op.h b/paddle/fluid/operators/partial_concat_op.h index 407b57e3a8281404c46ffec2c4271429427d6b8b..fb0d17aa97b842de7a5896cca72848e10b8717c2 100644 --- a/paddle/fluid/operators/partial_concat_op.h +++ b/paddle/fluid/operators/partial_concat_op.h @@ -39,7 +39,7 @@ static inline int64_t ComputeStartIndex(int64_t start_index, int64_t size) { return start_index; } -template +template class PartialConcatKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -84,7 +84,7 @@ class PartialConcatKernel : public framework::OpKernel { } }; -template +template class PartialConcatGradientOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/partial_sum_op.cc b/paddle/fluid/operators/partial_sum_op.cc index 9ef7ac0a21a4813d71658f3c0a63a03eba37810d..4b130306825c678d7885716a0629feb73fd95230 100644 --- a/paddle/fluid/operators/partial_sum_op.cc +++ b/paddle/fluid/operators/partial_sum_op.cc @@ -204,14 +204,19 @@ REGISTER_OPERATOR(partial_sum, REGISTER_OPERATOR(partial_sum_grad, ops::PartialSumGradOp); -REGISTER_OP_CPU_KERNEL(partial_sum, - ops::PartialSumKernel, - ops::PartialSumKernel, - ops::PartialSumKernel, - ops::PartialSumKernel); - -REGISTER_OP_CPU_KERNEL(partial_sum_grad, - ops::PartialSumGradientOpKernel, - ops::PartialSumGradientOpKernel, - ops::PartialSumGradientOpKernel, - ops::PartialSumGradientOpKernel); +PD_REGISTER_STRUCT_KERNEL(partial_sum, + CPU, + ALL_LAYOUT, + ops::PartialSumKernel, + float, + double, + int, + int64_t) {} +PD_REGISTER_STRUCT_KERNEL(partial_sum_grad, + CPU, + ALL_LAYOUT, + ops::PartialSumGradientOpKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/fluid/operators/partial_sum_op.cu b/paddle/fluid/operators/partial_sum_op.cu index 093e0032b3cb9b53ce4ff876ad3ccd5a3f1c6464..a38ec4c83946918d336468fd6bf039810025124e 100644 --- a/paddle/fluid/operators/partial_sum_op.cu +++ b/paddle/fluid/operators/partial_sum_op.cu @@ -70,7 +70,7 @@ __global__ void PartialSumGradCUDAKernel(T **res_grad, } } -template +template class PartialSumOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -144,7 +144,7 @@ class 
PartialSumOpCUDAKernel : public framework::OpKernel { } }; -template +template class PartialSumGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -233,18 +233,3 @@ class PartialSumGradOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(partial_sum, - ops::PartialSumOpCUDAKernel, - ops::PartialSumOpCUDAKernel, - ops::PartialSumOpCUDAKernel, - ops::PartialSumOpCUDAKernel, - ops::PartialSumOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(partial_sum_grad, - ops::PartialSumGradOpCUDAKernel, - ops::PartialSumGradOpCUDAKernel, - ops::PartialSumGradOpCUDAKernel, - ops::PartialSumGradOpCUDAKernel, - ops::PartialSumGradOpCUDAKernel); diff --git a/paddle/fluid/operators/partial_sum_op.h b/paddle/fluid/operators/partial_sum_op.h index fa4cc19d5e2c3fe9904afc4773fa0840c1f6af54..1b88eafae77db865d70f1d19a7056706d16d383c 100644 --- a/paddle/fluid/operators/partial_sum_op.h +++ b/paddle/fluid/operators/partial_sum_op.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class PartialSumKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -57,7 +57,7 @@ class PartialSumKernel : public framework::OpKernel { } }; -template +template class PartialSumGradientOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc index 3f4d8125671e4b5d1a4352af82dd894db9438411..72236c012c357ccdf79f02c6b6d71d7a0350e7f9 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.cc +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -253,7 +253,10 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(positive_negative_pair, ops::PositiveNegativePairOp, ops::PositiveNegativePairOpMaker); -REGISTER_OP_CPU_KERNEL( - positive_negative_pair, - ops::PositiveNegativePairKernel, - ops::PositiveNegativePairKernel); + +PD_REGISTER_STRUCT_KERNEL(positive_negative_pair, + CPU, + ALL_LAYOUT, + ops::PositiveNegativePairKernel, + float, + double) {} diff --git a/paddle/fluid/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h index 745b793f51147a00bc26734eff55d61a101a1e42..0cddbcc3abf8530bb95868d555373bb0d011f4a1 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.h +++ b/paddle/fluid/operators/positive_negative_pair_op.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class PositiveNegativePairKernel : public framework::OpKernel { public: struct PredictionResult { diff --git a/paddle/fluid/operators/prroi_pool_op.cc b/paddle/fluid/operators/prroi_pool_op.cc index d1c455331b4e780ca7651c8e92deffb49e3ffc55..0f0dbf3c6888a8b7e53619876e27661c9a1890bc 100644 --- a/paddle/fluid/operators/prroi_pool_op.cc +++ b/paddle/fluid/operators/prroi_pool_op.cc @@ -195,13 +195,20 @@ REGISTER_OPERATOR(prroi_pool, ops::PRROIPoolGradMaker, ops::PRROIPoolGradMaker); REGISTER_OPERATOR(prroi_pool_grad, ops::PRROIPoolGradOp); -REGISTER_OP_CPU_KERNEL(prroi_pool, - ops::CPUPRROIPoolOpKernel, - ops::CPUPRROIPoolOpKernel, - ops::CPUPRROIPoolOpKernel, - ops::CPUPRROIPoolOpKernel); -REGISTER_OP_CPU_KERNEL(prroi_pool_grad, - ops::CPUPRROIPoolGradOpKernel, - ops::CPUPRROIPoolGradOpKernel, - ops::CPUPRROIPoolGradOpKernel, - ops::CPUPRROIPoolGradOpKernel); + +PD_REGISTER_STRUCT_KERNEL(prroi_pool, + CPU, + ALL_LAYOUT, + ops::CPUPRROIPoolOpKernel, + float, + double, + int, + int64_t) {} +PD_REGISTER_STRUCT_KERNEL(prroi_pool_grad, + CPU, + ALL_LAYOUT, + ops::CPUPRROIPoolGradOpKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu index d1aa1d37d0479a5347ecd53fe3ce36b02329770a..5d1243964279b082f608da52614190d1b9b478d0 100644 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -211,7 +211,7 @@ __global__ void GPUPRROIPoolBackward(const int nthreads, } } -template +template class GPUPRROIPoolOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -314,7 +314,7 @@ class GPUPRROIPoolOpKernel : public framework::OpKernel { } }; -template +template class GPUPRROIPoolGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -428,9 +428,12 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(prroi_pool, - ops::GPUPRROIPoolOpKernel, - ops::GPUPRROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL(prroi_pool_grad, - ops::GPUPRROIPoolGradOpKernel, - ops::GPUPRROIPoolGradOpKernel); + +PD_REGISTER_STRUCT_KERNEL( + prroi_pool, GPU, ALL_LAYOUT, ops::GPUPRROIPoolOpKernel, float, double) {} +PD_REGISTER_STRUCT_KERNEL(prroi_pool_grad, + GPU, + ALL_LAYOUT, + ops::GPUPRROIPoolGradOpKernel, + float, + double) {} diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index 07a2bde7e94e46681c81712794dc5fa53f13491b..e2417a071ce88658b8687b81c0ce70978a270216 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -327,7 +327,7 @@ inline HOSTDEVICE void PrRoIPoolingCoorBackward(int s_w, (*this_out_grad)); } -template +template class CPUPRROIPoolOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -481,7 +481,7 @@ class CPUPRROIPoolOpKernel : public framework::OpKernel { } }; -template +template class CPUPRROIPoolGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cc b/paddle/fluid/operators/prune_gate_by_capacity_op.cc index 388b65f3dd67436e10a744ce7a2ff3a76e2059b8..c1112b13feb50c9018f9fa53b425d795df88cbfe 100644 --- 
a/paddle/fluid/operators/prune_gate_by_capacity_op.cc +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cc @@ -126,7 +126,9 @@ REGISTER_OP_WITHOUT_GRADIENT(prune_gate_by_capacity, ops::PruneGateByCapacityOp, ops::PruneGateByCapacityOpMaker); -REGISTER_OP_CPU_KERNEL( - prune_gate_by_capacity, - ops::PruneGateByCapacityCPUKernel, - ops::PruneGateByCapacityCPUKernel); +PD_REGISTER_STRUCT_KERNEL(prune_gate_by_capacity, + CPU, + ALL_LAYOUT, + ops::PruneGateByCapacityCPUKernel, + int, + int64_t) {} diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu b/paddle/fluid/operators/prune_gate_by_capacity_op.cu index 38baaeb809c11c2c72b030443b2513a5ecea5230..510de11029f0c011547b0bbf0ff7de49bd1a5af6 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu @@ -105,7 +105,7 @@ static void VisitDataType(phi::DataType type, Visitor visitor) { } } -template +template class PruneGateByCapacityCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -127,6 +127,8 @@ class PruneGateByCapacityCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL( - prune_gate_by_capacity, - ops::PruneGateByCapacityCUDAKernel); +PD_REGISTER_STRUCT_KERNEL(prune_gate_by_capacity, + GPU, + ALL_LAYOUT, + ops::PruneGateByCapacityCUDAKernel, + int64_t) {} diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.h b/paddle/fluid/operators/prune_gate_by_capacity_op.h index d7a00bd40d786f669f2d8d0cca68938b7285ac5f..4420fae6ef5e3ed2695a75bae60d5bd5dc77b8c3 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.h +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class PruneGateByCapacityCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cc b/paddle/fluid/operators/pull_box_extended_sparse_op.cc index 7b949fa4338c72c1379fcd71866ff23e41779e9e..f0799f75862bc4cf2a018c244ac7befea9facf7d 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.cc +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cc @@ -151,10 +151,15 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(push_box_extended_sparse, ops::PushBoxExtendedSparseOp); -REGISTER_OP_CPU_KERNEL(pull_box_extended_sparse, - ops::PullBoxExtendedSparseCPUKernel, - ops::PullBoxExtendedSparseCPUKernel); - -REGISTER_OP_CPU_KERNEL(push_box_extended_sparse, - ops::PushBoxExtendedSparseCPUKernel, - ops::PushBoxExtendedSparseCPUKernel); +PD_REGISTER_STRUCT_KERNEL(pull_box_extended_sparse, + CPU, + ALL_LAYOUT, + ops::PullBoxExtendedSparseCPUKernel, + float, + double) {} +PD_REGISTER_STRUCT_KERNEL(push_box_extended_sparse, + CPU, + ALL_LAYOUT, + ops::PushBoxExtendedSparseCPUKernel, + float, + double) {} diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cu b/paddle/fluid/operators/pull_box_extended_sparse_op.cu index cfa317a3d392fb0bbaffccc3ae539cdf4aeda68d..570c367c93182d794c707ead0039c2e1895b788d 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.cu +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cu @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class PullBoxExtendedSparseCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -27,7 
+27,7 @@ class PullBoxExtendedSparseCUDAKernel : public framework::OpKernel { } }; -template +template class PushBoxExtendedSparseCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -38,9 +38,16 @@ class PushBoxExtendedSparseCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(pull_box_extended_sparse, - ops::PullBoxExtendedSparseCUDAKernel, - ops::PullBoxExtendedSparseCUDAKernel); -REGISTER_OP_CUDA_KERNEL(push_box_extended_sparse, - ops::PushBoxExtendedSparseCUDAKernel, - ops::PushBoxExtendedSparseCUDAKernel); + +PD_REGISTER_STRUCT_KERNEL(pull_box_extended_sparse, + GPU, + ALL_LAYOUT, + ops::PullBoxExtendedSparseCUDAKernel, + float, + double) {} +PD_REGISTER_STRUCT_KERNEL(push_box_extended_sparse, + GPU, + ALL_LAYOUT, + ops::PushBoxExtendedSparseCUDAKernel, + float, + double) {} diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.h b/paddle/fluid/operators/pull_box_extended_sparse_op.h index eff3bfd2a5f3c3bd721712e5cc82aba309a59632..b9508a279505ea389f9957963ee21094b48dce85 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.h +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.h @@ -108,7 +108,7 @@ static void PushBoxExtendedSparseFunctor( #endif } -template +template class PullBoxExtendedSparseCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -116,7 +116,7 @@ class PullBoxExtendedSparseCPUKernel : public framework::OpKernel { } }; -template +template class PushBoxExtendedSparseCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { diff --git a/paddle/fluid/operators/pull_box_sparse_op.cc b/paddle/fluid/operators/pull_box_sparse_op.cc index c58a176d5263558fc422ab2d08909930d5e1ca13..a8f91c85485c7ca1078b0416fc6f8b9d103c0108 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.cc +++ b/paddle/fluid/operators/pull_box_sparse_op.cc @@ -135,5 +135,8 @@ REGISTER_OPERATOR(pull_box_sparse, ops::PushBoxSparseOpMaker, ops::PushBoxSparseOpMaker); REGISTER_OPERATOR(push_box_sparse, ops::PushBoxSparseOp); -REGISTER_OP_CPU_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel); -REGISTER_OP_CPU_KERNEL(push_box_sparse, ops::PushBoxSparseKernel); + +PD_REGISTER_STRUCT_KERNEL( + pull_box_sparse, CPU, ALL_LAYOUT, ops::PullBoxSparseKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + push_box_sparse, CPU, ALL_LAYOUT, ops::PushBoxSparseKernel, float) {} diff --git a/paddle/fluid/operators/pull_box_sparse_op.h b/paddle/fluid/operators/pull_box_sparse_op.h index dd41fd6ff0f4f262526b0887916adb14e7bafa32..1ebfa11a2b2e6519473af595d97a807ade7b28c5 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.h +++ b/paddle/fluid/operators/pull_box_sparse_op.h @@ -113,7 +113,7 @@ static void PushBoxSparseFunctor(const framework::ExecutionContext &ctx) { #endif } -template +template class PullBoxSparseKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -121,7 +121,7 @@ class PullBoxSparseKernel : public framework::OpKernel { } }; -template +template class PushBoxSparseKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { diff --git a/paddle/fluid/operators/pull_box_sparse_op.kps b/paddle/fluid/operators/pull_box_sparse_op.kps index 
4b0580c5e1ab5cbb8d5b3d2dbd5929c03014aee7..1e4a3640bdac3f75753cd9f6ca383dc94d841a73 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.kps +++ b/paddle/fluid/operators/pull_box_sparse_op.kps @@ -45,16 +45,7 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -#ifdef PADDLE_WITH_XPU_KP -REGISTER_OP_KERNEL(pull_box_sparse, - KP, - plat::XPUPlace, - ops::PullBoxSparseKernel); -REGISTER_OP_KERNEL(push_box_sparse, - KP, - plat::XPUPlace, - ops::PushBoxSparseKernel); -#else -REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel); -REGISTER_OP_CUDA_KERNEL(push_box_sparse, ops::PushBoxSparseKernel); -#endif +PD_REGISTER_STRUCT_KERNEL( + pull_box_sparse, KPS, ALL_LAYOUT, ops::PullBoxSparseKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + push_box_sparse, KPS, ALL_LAYOUT, ops::PushBoxSparseKernel, float) {} diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.cc b/paddle/fluid/operators/pull_gpups_sparse_op.cc index 821cfdab6f10c17bf70fb24fa329ebf9d138d07d..afaa9af3fda20a9b9e93ec28aa8fbe1fc7c4bc74 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.cc +++ b/paddle/fluid/operators/pull_gpups_sparse_op.cc @@ -145,9 +145,16 @@ REGISTER_OPERATOR(pull_gpups_sparse, ops::PushGpuPSSparseOpMaker, ops::PushGpuPSSparseOpMaker); REGISTER_OPERATOR(push_gpups_sparse, ops::PushGpuPSSparseOp); -REGISTER_OP_CPU_KERNEL(pull_gpups_sparse, - ops::PullGpuPSSparseCPUKernel, - ops::PullGpuPSSparseCPUKernel) -REGISTER_OP_CPU_KERNEL(push_gpups_sparse, - ops::PushGpuPSSparseCPUKernel, - ops::PushGpuPSSparseCPUKernel) + +PD_REGISTER_STRUCT_KERNEL(pull_gpups_sparse, + CPU, + ALL_LAYOUT, + ops::PullGpuPSSparseCPUKernel, + float, + double) {} +PD_REGISTER_STRUCT_KERNEL(push_gpups_sparse, + CPU, + ALL_LAYOUT, + ops::PushGpuPSSparseCPUKernel, + float, + double) {} diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.cu b/paddle/fluid/operators/pull_gpups_sparse_op.cu index ff68c42c8eb1b1fc7f8d8158975831c216248e10..a936d810216e612782594a7af2b3552b40a2e5c0 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.cu +++ b/paddle/fluid/operators/pull_gpups_sparse_op.cu @@ -20,7 +20,7 @@ namespace paddle { namespace operators { using phi::PADDLE_CUDA_NUM_THREADS; -template +template class PullGpuPSSparseCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -28,7 +28,7 @@ class PullGpuPSSparseCUDAKernel : public framework::OpKernel { } }; -template +template class PushGpuPSSparseCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -39,9 +39,15 @@ class PushGpuPSSparseCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(pull_gpups_sparse, - ops::PullGpuPSSparseCUDAKernel, - ops::PullGpuPSSparseCUDAKernel) -REGISTER_OP_CUDA_KERNEL(push_gpups_sparse, - ops::PushGpuPSSparseCUDAKernel, - ops::PushGpuPSSparseCUDAKernel) +PD_REGISTER_STRUCT_KERNEL(pull_gpups_sparse, + GPU, + ALL_LAYOUT, + ops::PullGpuPSSparseCUDAKernel, + float, + double) {} +PD_REGISTER_STRUCT_KERNEL(push_gpups_sparse, + GPU, + ALL_LAYOUT, + ops::PushGpuPSSparseCUDAKernel, + float, + double) {} diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.h b/paddle/fluid/operators/pull_gpups_sparse_op.h index 2d844a4ce2bf09cf6e1f71345caf9216d2f20a67..d8fdadd99cbd46123394295956e18044282723e5 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.h +++ 
b/paddle/fluid/operators/pull_gpups_sparse_op.h @@ -97,7 +97,7 @@ static void PushGpuPSSparseFunctor(const framework::ExecutionContext &ctx) { #endif } -template +template class PullGpuPSSparseCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -105,7 +105,7 @@ class PullGpuPSSparseCPUKernel : public framework::OpKernel { } }; -template +template class PushGpuPSSparseCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { diff --git a/paddle/fluid/operators/pull_sparse_op.cc b/paddle/fluid/operators/pull_sparse_op.cc index 7dc9ae98e0e41cfd94e39bdfb19997ef016cc785..4850bf33ae89cd757d0343812c056feef9b25a64 100644 --- a/paddle/fluid/operators/pull_sparse_op.cc +++ b/paddle/fluid/operators/pull_sparse_op.cc @@ -143,5 +143,7 @@ REGISTER_OPERATOR(pull_sparse, ops::PushSparseOpMaker, ops::PushSparseOpMaker); REGISTER_OPERATOR(push_sparse, ops::PushSparseOp); -REGISTER_OP_CPU_KERNEL(pull_sparse, ops::PullSparseCPUKernel) -REGISTER_OP_CPU_KERNEL(push_sparse, ops::PushSparseCPUKernel) +PD_REGISTER_STRUCT_KERNEL( + pull_sparse, CPU, ALL_LAYOUT, ops::PullSparseCPUKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + push_sparse, CPU, ALL_LAYOUT, ops::PushSparseCPUKernel, float) {} diff --git a/paddle/fluid/operators/pull_sparse_op.h b/paddle/fluid/operators/pull_sparse_op.h index ecc3a5e1021dee8819335d4bd1aeaed82b3a93e7..263511b65180da0d347323b2cd2ef557841f979d 100644 --- a/paddle/fluid/operators/pull_sparse_op.h +++ b/paddle/fluid/operators/pull_sparse_op.h @@ -66,7 +66,7 @@ void PushSparseFunctor(const framework::ExecutionContext& ctx) { &grads); } -template +template class PullSparseCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -74,7 +74,7 @@ class PullSparseCPUKernel : public framework::OpKernel { } }; -template +template class PushSparseCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/pull_sparse_v2_op.cc b/paddle/fluid/operators/pull_sparse_v2_op.cc index 88a0ac86c2532dfaaa340dbddb9b2ec41eebc640..993950c360c12cdbe8f5ad6c2a9ed37a3897a2d2 100644 --- a/paddle/fluid/operators/pull_sparse_v2_op.cc +++ b/paddle/fluid/operators/pull_sparse_v2_op.cc @@ -135,5 +135,7 @@ REGISTER_OPERATOR(pull_sparse_v2, ops::PushSparseV2OpMaker, ops::PushSparseV2OpMaker); REGISTER_OPERATOR(push_sparse_v2, ops::PushSparseV2Op); -REGISTER_OP_CPU_KERNEL(pull_sparse_v2, ops::PullSparseV2CPUKernel) -REGISTER_OP_CPU_KERNEL(push_sparse_v2, ops::PushSparseV2CPUKernel) +PD_REGISTER_STRUCT_KERNEL( + pull_sparse_v2, CPU, ALL_LAYOUT, ops::PullSparseV2CPUKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + push_sparse_v2, CPU, ALL_LAYOUT, ops::PushSparseV2CPUKernel, float) {} diff --git a/paddle/fluid/operators/pull_sparse_v2_op.h b/paddle/fluid/operators/pull_sparse_v2_op.h index c24d0a4f338e7d95a7f700a9470bd824da6415a2..95ce71838578076ad3ff29094e3f10ba0d0ae72c 100644 --- a/paddle/fluid/operators/pull_sparse_v2_op.h +++ b/paddle/fluid/operators/pull_sparse_v2_op.h @@ -25,7 +25,7 @@ namespace paddle { namespace operators { -template +template class PullSparseV2CPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -33,7 +33,7 @@ class PullSparseV2CPUKernel : public framework::OpKernel { } }; -template +template class PushSparseV2CPUKernel : public framework::OpKernel { 
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 8f9a2f92814d5d0b022bf7a36a656c9a6baa0fa9..7ca431e8ea5d10346cd219c377887225a002caf4 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -202,7 +202,6 @@ register_unity_group(
   pad_op.cc)
 register_unity_group(
   cc
-  modified_huber_loss_op.cc
   partial_sum_op.cc
   pixel_shuffle_op.cc
   pool_op.cc
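
Note on the pattern applied above: each kernel class template is adjusted to the `template <typename T, typename DeviceContext>` form the struct-kernel macro expects, and every per-type `REGISTER_OP_CPU_KERNEL` / `REGISTER_OP_CUDA_KERNEL` list is collapsed into a single `PD_REGISTER_STRUCT_KERNEL` call that names the backend (CPU/GPU/KPS), the layout, the kernel template, and the supported data types. GPU registrations keep the `#if NCCL_VERSION_CODE >= 21000` guard so `plat::bfloat16` is only registered when NCCL supports it, and the NCCL op test switches from `USE_CUDA_ONLY_OP` to `USE_OP_ITSELF` plus `PD_DECLARE_KERNEL`. The sketch below restates the before/after shape on a hypothetical `example_op`; the op name, kernel class, kernel body, and header path are illustrative assumptions, not part of this diff.

```cpp
// Minimal sketch of the migration pattern, assuming a hypothetical example_op.
// The header path is indicative only; the real op files above rely on their
// own includes to pull in the registration macros.
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

// New-style struct kernels take the data type first and a DeviceContext
// second, which is the signature PD_REGISTER_STRUCT_KERNEL instantiates.
template <typename T, typename DeviceContext>
class ExampleOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // kernel body elided in this sketch
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

// Old style (removed above): one fully instantiated kernel class per type,
//   REGISTER_OP_CPU_KERNEL(example_op,
//                          ops::ExampleOpKernel<float>,
//                          ops::ExampleOpKernel<double>);
//
// New style (added above): the backend and layout are named once, the data
// types are listed bare, and the trailing {} supplies an (empty) function
// body the macro expects.
PD_REGISTER_STRUCT_KERNEL(
    example_op, CPU, ALL_LAYOUT, ops::ExampleOpKernel, float, double) {}
```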