Unverified commit 63efdaee authored by huangjiyi, committed by GitHub

register fluid kernels to phi [part 4] (#52116)

* update

* fix bug

* fix bug

* revert diag_op

* revert expand_op and expand_as_op

* fix bug

* fix bug
Parent 34069c46
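Every hunk below applies the same mechanical migration: kernels registered through the legacy fluid macros (REGISTER_OP_CPU_KERNEL / REGISTER_OP_CUDA_KERNEL) are re-registered through phi's PD_REGISTER_STRUCT_KERNEL, which takes the op name, backend, layout, kernel class, and supported data types; correspondingly, the kernel class templates swap their parameter order from <DeviceContext, T> to <T, DeviceContext>. A minimal sketch of the before/after shape, using a hypothetical foo operator that is not part of this commit:

// Illustrative sketch only: "foo" and FooKernel are placeholder names,
// not operators touched by this commit.
namespace paddle {
namespace operators {

// phi order: data type first, device context second
// (fluid used: template <typename DeviceContext, typename T>).
template <typename T, typename DeviceContext>
class FooKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // The kernel body itself is unchanged by the migration.
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

// Old fluid registration, one explicit template instantiation per data type:
//   REGISTER_OP_CPU_KERNEL(foo,
//                          ops::FooKernel<phi::CPUContext, float>,
//                          ops::FooKernel<phi::CPUContext, double>);

// New phi registration: op name, backend, layout, kernel, data types.
PD_REGISTER_STRUCT_KERNEL(
    foo, CPU, ALL_LAYOUT, ops::FooKernel, float, double) {}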
@@ -267,7 +267,7 @@ The required data format for this layer is one of the following:
};
template <typename T>
class DataNormKernel<phi::CPUContext, T> : public framework::OpKernel<T> {
class DataNormKernel<T, phi::CPUContext> : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
// const bool is_test = ctx.Attr<bool>("is_test");
@@ -509,7 +509,7 @@ class DataNormGradOp : public framework::OperatorWithKernel {
};
template <typename T>
class DataNormGradKernel<phi::CPUContext, T> : public framework::OpKernel<T> {
class DataNormGradKernel<T, phi::CPUContext> : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<phi::DenseTensor>("X");
@@ -764,12 +764,11 @@ REGISTER_OPERATOR(data_norm,
ops::DataNormGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(data_norm_grad, ops::DataNormGradOp);
REGISTER_OP_CPU_KERNEL(data_norm,
ops::DataNormKernel<phi::CPUContext, float>,
ops::DataNormKernel<phi::CPUContext, double>);
REGISTER_OP_CPU_KERNEL(data_norm_grad,
ops::DataNormGradKernel<phi::CPUContext, float>,
ops::DataNormGradKernel<phi::CPUContext, double>);
PD_REGISTER_STRUCT_KERNEL(
data_norm, CPU, ALL_LAYOUT, ops::DataNormKernel, float, double) {}
PD_REGISTER_STRUCT_KERNEL(
data_norm_grad, CPU, ALL_LAYOUT, ops::DataNormGradKernel, float, double) {}
REGISTER_OP_VERSION(data_norm).AddCheckpoint(
R"ROC(
upgrade data_norm op by adding scale_w to support scale and shift.)ROC",
......
@@ -102,7 +102,7 @@ __global__ void KernelUpdateParam(int C,
}
template <typename T>
class DataNormKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
class DataNormKernel<T, phi::GPUContext> : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<phi::DenseTensor>("X");
@@ -154,7 +154,7 @@ class DataNormKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
};
template <typename T>
class DataNormGradKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
class DataNormGradKernel<T, phi::GPUContext> : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<phi::DenseTensor>("X");
@@ -267,9 +267,8 @@ class DataNormGradKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(data_norm,
ops::DataNormKernel<phi::GPUContext, float>,
ops::DataNormKernel<phi::GPUContext, double>);
REGISTER_OP_CUDA_KERNEL(data_norm_grad,
ops::DataNormGradKernel<phi::GPUContext, float>,
ops::DataNormGradKernel<phi::GPUContext, double>);
PD_REGISTER_STRUCT_KERNEL(
data_norm, GPU, ALL_LAYOUT, ops::DataNormKernel, float, double) {}
PD_REGISTER_STRUCT_KERNEL(
data_norm_grad, GPU, ALL_LAYOUT, ops::DataNormGradKernel, float, double) {}
@@ -19,13 +19,13 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class DataNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class DataNormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
......
@@ -348,7 +348,6 @@ class DeformablePSROIPoolGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
using CPU = phi::CPUContext;
REGISTER_OPERATOR(
deformable_psroi_pooling,
ops::DeformablePSROIPoolOp,
@@ -357,9 +356,16 @@ REGISTER_OPERATOR(
ops::DeformablePSROIPoolGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(deformable_psroi_pooling_grad,
ops::DeformablePSROIPoolGradOp);
REGISTER_OP_CPU_KERNEL(deformable_psroi_pooling,
ops::DeformablePSROIPoolCPUKernel<CPU, float>,
ops::DeformablePSROIPoolCPUKernel<CPU, double>);
REGISTER_OP_CPU_KERNEL(deformable_psroi_pooling_grad,
ops::DeformablePSROIPoolGradCPUKernel<CPU, float>,
ops::DeformablePSROIPoolGradCPUKernel<CPU, double>);
PD_REGISTER_STRUCT_KERNEL(deformable_psroi_pooling,
CPU,
ALL_LAYOUT,
ops::DeformablePSROIPoolCPUKernel,
float,
double) {}
PD_REGISTER_STRUCT_KERNEL(deformable_psroi_pooling_grad,
CPU,
ALL_LAYOUT,
ops::DeformablePSROIPoolGradCPUKernel,
float,
double) {}
@@ -178,7 +178,7 @@ __global__ void DeformablePSROIPoolForwardKernel(const int count,
}
}
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class DeformablePSROIPoolCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@@ -479,7 +479,7 @@ __global__ void DeformablePSROIPoolBackwardAccKernel(
}
}
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@@ -626,10 +626,16 @@ class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = phi::GPUContext;
REGISTER_OP_CUDA_KERNEL(deformable_psroi_pooling,
ops::DeformablePSROIPoolCUDAKernel<CUDA, float>,
ops::DeformablePSROIPoolCUDAKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(deformable_psroi_pooling_grad,
ops::DeformablePSROIPoolGradCUDAKernel<CUDA, float>,
ops::DeformablePSROIPoolGradCUDAKernel<CUDA, double>);
PD_REGISTER_STRUCT_KERNEL(deformable_psroi_pooling,
GPU,
ALL_LAYOUT,
ops::DeformablePSROIPoolCUDAKernel,
float,
double) {}
PD_REGISTER_STRUCT_KERNEL(deformable_psroi_pooling_grad,
GPU,
ALL_LAYOUT,
ops::DeformablePSROIPoolGradCUDAKernel,
float,
double) {}
@@ -166,7 +166,7 @@ void DeformablePSROIPoolForwardCPUKernel(const int count,
}
}
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class DeformablePSROIPoolCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@@ -468,7 +468,7 @@ void DeformablePSROIPoolBackwardAccCPUKernel(const int count,
}
}
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class DeformablePSROIPoolGradCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......
@@ -101,7 +101,6 @@ $$Out = \frac{scale*X}{ max\_range }$$
} // namespace paddle
namespace ops = paddle::operators;
using CPU = phi::CPUContext;
REGISTER_OPERATOR(
dequantize_abs_max,
@@ -109,6 +108,10 @@ REGISTER_OPERATOR(
ops::DequantizeMaxAbsOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(dequantize_abs_max,
ops::DequantizeMaxAbsKernel<CPU, int8_t>,
ops::DequantizeMaxAbsKernel<CPU, int16_t>);
PD_REGISTER_STRUCT_KERNEL(dequantize_abs_max,
CPU,
ALL_LAYOUT,
ops::DequantizeMaxAbsKernel,
int8_t,
int16_t) {}
@@ -53,7 +53,10 @@ template struct DequantizeFunctor<phi::GPUContext, int16_t>;
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = phi::GPUContext;
REGISTER_OP_CUDA_KERNEL(dequantize_abs_max,
ops::DequantizeMaxAbsKernel<CUDA, int8_t>,
ops::DequantizeMaxAbsKernel<CUDA, int16_t>);
PD_REGISTER_STRUCT_KERNEL(dequantize_abs_max,
GPU,
ALL_LAYOUT,
ops::DequantizeMaxAbsKernel,
int8_t,
int16_t) {}
@@ -36,7 +36,7 @@ struct DequantizeFunctor {
phi::DenseTensor* out);
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class DequantizeMaxAbsKernel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& ctx) const {
......
@@ -107,7 +107,6 @@ This calculation is an opposite operation of QuantizeLogOp:
} // namespace paddle
namespace ops = paddle::operators;
using CPU = phi::CPUContext;
REGISTER_OPERATOR(
dequantize_log,
@@ -115,4 +114,6 @@ REGISTER_OPERATOR(
ops::DequantizeLogOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(dequantize_log, ops::DequantizeLogKernel<CPU, int8_t>);
PD_REGISTER_STRUCT_KERNEL(
dequantize_log, CPU, ALL_LAYOUT, ops::DequantizeLogKernel, int8_t) {}
@@ -60,5 +60,6 @@ template struct DequantizeFunctor<phi::GPUContext, int8_t>;
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = phi::GPUContext;
REGISTER_OP_CUDA_KERNEL(dequantize_log, ops::DequantizeLogKernel<CUDA, int8_t>);
PD_REGISTER_STRUCT_KERNEL(
dequantize_log, GPU, ALL_LAYOUT, ops::DequantizeLogKernel, int8_t) {}
@@ -34,7 +34,7 @@ struct DequantizeFunctor {
phi::DenseTensor* out);
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class DequantizeLogKernel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& ctx) const {
......
@@ -262,6 +262,9 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(density_prior_box,
ops::DensityPriorBoxOpKernel<float>,
ops::DensityPriorBoxOpKernel<double>);
PD_REGISTER_STRUCT_KERNEL(density_prior_box,
CPU,
ALL_LAYOUT,
ops::DensityPriorBoxOpKernel,
float,
double) {}
@@ -83,7 +83,7 @@ static __global__ void GenDensityPriorBox(const int height,
}
}
template <typename T>
template <typename T, typename DeviceContext>
class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@@ -188,6 +188,10 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(density_prior_box,
ops::DensityPriorBoxOpCUDAKernel<float>,
ops::DensityPriorBoxOpCUDAKernel<double>);
PD_REGISTER_STRUCT_KERNEL(density_prior_box,
GPU,
ALL_LAYOUT,
ops::DensityPriorBoxOpCUDAKernel,
float,
double) {}
@@ -18,7 +18,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
template <typename T, typename DeviceContext>
class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......
@@ -224,7 +224,6 @@ REGISTER_OPERATOR(
ops::DetectionMAPOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
detection_map,
ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, float>,
ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, double>);
PD_REGISTER_STRUCT_KERNEL(
detection_map, CPU, ALL_LAYOUT, ops::DetectionMAPOpKernel, float, double) {}
@@ -56,7 +56,7 @@ inline void GetAccumulation(std::vector<std::pair<T, int>> in_pairs,
}
}
template <typename Place, typename T>
template <typename T, typename DeviceContext>
class DetectionMAPOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......
@@ -68,5 +68,5 @@ REGISTER_OP_WITHOUT_GRADIENT(dgc_clip_by_norm,
ops::DGCClipByNormOp,
ops::DGCClipByNormOpMaker);
REGISTER_OP_CPU_KERNEL(dgc_clip_by_norm,
ops::DGCClipByNormKernel<phi::CPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(
dgc_clip_by_norm, CPU, ALL_LAYOUT, ops::DGCClipByNormKernel, float) {}
@@ -15,5 +15,5 @@ limitations under the License. */
#include "paddle/fluid/operators/dgc_clip_by_norm_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(dgc_clip_by_norm,
ops::DGCClipByNormKernel<phi::GPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(
dgc_clip_by_norm, GPU, ALL_LAYOUT, ops::DGCClipByNormKernel, float) {}
@@ -21,7 +21,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class DGCClipByNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......
@@ -16,4 +16,4 @@ limitations under the License. */
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(dgc, ops::DGCOpKernel<phi::GPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(dgc, GPU, ALL_LAYOUT, ops::DGCOpKernel, float) {}
@@ -49,7 +49,7 @@ inline float get_period_sparcity(const std::vector<float>& sparsity,
return sparsity[idx];
}
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class DGCOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......
@@ -277,9 +277,12 @@ REGISTER_OPERATOR(
ops::FakeDequantizeMaxAbsOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs,
ops::FakeDequantizeMaxAbsKernel<CPU, float>,
ops::FakeDequantizeMaxAbsKernel<CPU, double>);
PD_REGISTER_STRUCT_KERNEL(fake_dequantize_max_abs,
CPU,
ALL_LAYOUT,
ops::FakeDequantizeMaxAbsKernel,
float,
double) {}
REGISTER_OPERATOR(
fake_channel_wise_dequantize_max_abs,
@@ -287,9 +290,12 @@ REGISTER_OPERATOR(
ops::FakeChannelWiseDequantizeMaxAbsOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, float>,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, double>);
PD_REGISTER_STRUCT_KERNEL(fake_channel_wise_dequantize_max_abs,
CPU,
ALL_LAYOUT,
ops::FakeChannelWiseDequantizeMaxAbsKernel,
float,
double) {}
REGISTER_OP_VERSION(fake_channel_wise_dequantize_max_abs)
.AddCheckpoint(
......
@@ -16,14 +16,19 @@ limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.cu.h"
namespace ops = paddle::operators;
using CUDA = phi::GPUContext;
using float16 = paddle::platform::float16;
REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,
ops::FakeDequantizeMaxAbsKernel<CUDA, float>,
ops::FakeDequantizeMaxAbsKernel<CUDA, double>,
ops::FakeDequantizeMaxAbsKernel<CUDA, float16>);
REGISTER_OP_CUDA_KERNEL(
fake_channel_wise_dequantize_max_abs,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CUDA, float>,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CUDA, double>,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CUDA, float16>);
PD_REGISTER_STRUCT_KERNEL(fake_dequantize_max_abs,
GPU,
ALL_LAYOUT,
ops::FakeDequantizeMaxAbsKernel,
float,
double,
float16) {}
PD_REGISTER_STRUCT_KERNEL(fake_channel_wise_dequantize_max_abs,
GPU,
ALL_LAYOUT,
ops::FakeChannelWiseDequantizeMaxAbsKernel,
float,
double,
float16) {}
@@ -44,7 +44,7 @@ struct ChannelDequantizeFunctor {
phi::DenseTensor* out);
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& ctx) const {
@@ -62,7 +62,7 @@ class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& ctx) const {
......
@@ -878,8 +878,11 @@ REGISTER_OPERATOR(
ops::FakeQuantOrWithDequantAbsMaxOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(fake_quantize_abs_max,
ops::FakeQuantizeAbsMaxKernel<CPU, float>);
PD_REGISTER_STRUCT_KERNEL(fake_quantize_abs_max,
CPU,
ALL_LAYOUT,
ops::FakeQuantizeAbsMaxKernel,
float) {}
REGISTER_OPERATOR(
fake_quantize_dequantize_abs_max,
@@ -887,8 +890,11 @@ REGISTER_OPERATOR(
ops::FakeQuantOrWithDequantAbsMaxOpMaker,
ops::StrightThroughEstimatorMaker<paddle::framework::OpDesc>,
ops::StrightThroughEstimatorMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_abs_max,
ops::FakeQuantizeDequantizeAbsMaxKernel<CPU, float>);
PD_REGISTER_STRUCT_KERNEL(fake_quantize_dequantize_abs_max,
CPU,
ALL_LAYOUT,
ops::FakeQuantizeDequantizeAbsMaxKernel,
float) {}
REGISTER_OPERATOR(
fake_quantize_range_abs_max,
@@ -896,8 +902,11 @@ REGISTER_OPERATOR(
ops::FakeQuantizeRangeAbsMaxOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max,
ops::FakeQuantizeRangeAbsMaxKernel<CPU, float>);
PD_REGISTER_STRUCT_KERNEL(fake_quantize_range_abs_max,
CPU,
ALL_LAYOUT,
ops::FakeQuantizeRangeAbsMaxKernel,
float) {}
REGISTER_OPERATOR(
fake_quantize_moving_average_abs_max,
@@ -905,8 +914,11 @@ REGISTER_OPERATOR(
ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(fake_quantize_moving_average_abs_max,
ops::FakeQuantizeMovingAverageAbsMaxKernel<CPU, float>);
PD_REGISTER_STRUCT_KERNEL(fake_quantize_moving_average_abs_max,
CPU,
ALL_LAYOUT,
ops::FakeQuantizeMovingAverageAbsMaxKernel,
float) {}
REGISTER_OPERATOR(
fake_quantize_dequantize_moving_average_abs_max,
@@ -914,9 +926,11 @@ REGISTER_OPERATOR(
ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker,
ops::StrightThroughEstimatorMaker<paddle::framework::OpDesc>,
ops::StrightThroughEstimatorMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
fake_quantize_dequantize_moving_average_abs_max,
ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CPU, float>);
PD_REGISTER_STRUCT_KERNEL(fake_quantize_dequantize_moving_average_abs_max,
CPU,
ALL_LAYOUT,
ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel,
float) {}
REGISTER_OPERATOR(
fake_channel_wise_quantize_abs_max,
@@ -924,8 +938,11 @@ REGISTER_OPERATOR(
ops::FakeChannelWiseQuantizeAbsMaxOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxKernel<CPU, float>);
PD_REGISTER_STRUCT_KERNEL(fake_channel_wise_quantize_abs_max,
CPU,
ALL_LAYOUT,
ops::FakeChannelWiseQuantizeAbsMaxKernel,
float) {}
REGISTER_OPERATOR(
moving_average_abs_max_scale,
@@ -933,13 +950,19 @@ REGISTER_OPERATOR(
ops::MovingAverageAbsMaxScaleOpMaker,
ops::StrightThroughEstimatorMaker<paddle::framework::OpDesc>,
ops::StrightThroughEstimatorMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale,
ops::MovingAverageAbsMaxScaleKernel<CPU, float>);
PD_REGISTER_STRUCT_KERNEL(moving_average_abs_max_scale,
CPU,
ALL_LAYOUT,
ops::MovingAverageAbsMaxScaleKernel,
float) {}
REGISTER_OPERATOR(stright_throuth_estimator_grad,
ops::StrightThroughEstimatorGradOp);
REGISTER_OP_CPU_KERNEL(stright_throuth_estimator_grad,
ops::StrightThroughEstimatorGradKernel<CPU, float>);
PD_REGISTER_STRUCT_KERNEL(stright_throuth_estimator_grad,
CPU,
ALL_LAYOUT,
ops::StrightThroughEstimatorGradKernel,
float) {}
REGISTER_OPERATOR(
fake_channel_wise_quantize_dequantize_abs_max,
@@ -947,9 +970,11 @@ REGISTER_OPERATOR(
ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker,
ops::StrightThroughEstimatorMaker<paddle::framework::OpDesc>,
ops::StrightThroughEstimatorMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CPU, float>);
PD_REGISTER_STRUCT_KERNEL(fake_channel_wise_quantize_dequantize_abs_max,
CPU,
ALL_LAYOUT,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel,
float) {}
REGISTER_OP_VERSION(fake_channel_wise_quantize_abs_max)
.AddCheckpoint(
......
@@ -16,35 +16,58 @@ limitations under the License. */
#include "paddle/fluid/operators/fake_quantize_op.cu.h"
namespace ops = paddle::operators;
using CUDA = phi::GPUContext;
using float16 = paddle::platform::float16;
REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max,
ops::FakeQuantizeAbsMaxKernel<CUDA, float>,
ops::FakeQuantizeAbsMaxKernel<CUDA, float16>);
REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_abs_max,
ops::FakeQuantizeDequantizeAbsMaxKernel<CUDA, float>,
ops::FakeQuantizeDequantizeAbsMaxKernel<CUDA, float16>);
REGISTER_OP_CUDA_KERNEL(
fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxKernel<CUDA, float>,
ops::FakeChannelWiseQuantizeAbsMaxKernel<CUDA, float16>);
REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max,
ops::FakeQuantizeRangeAbsMaxKernel<CUDA, float>,
ops::FakeQuantizeRangeAbsMaxKernel<CUDA, float16>);
REGISTER_OP_CUDA_KERNEL(
fake_quantize_moving_average_abs_max,
ops::FakeQuantizeMovingAverageAbsMaxKernel<CUDA, float>,
ops::FakeQuantizeMovingAverageAbsMaxKernel<CUDA, float16>);
REGISTER_OP_CUDA_KERNEL(moving_average_abs_max_scale,
ops::MovingAverageAbsMaxScaleKernel<CUDA, float>,
ops::MovingAverageAbsMaxScaleKernel<CUDA, float16>);
REGISTER_OP_CUDA_KERNEL(
fake_quantize_dequantize_moving_average_abs_max,
ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CUDA, float>,
ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CUDA, float16>);
REGISTER_OP_CUDA_KERNEL(stright_throuth_estimator_grad,
ops::StrightThroughEstimatorGradKernel<CUDA, float>,
ops::StrightThroughEstimatorGradKernel<CUDA, float16>);
REGISTER_OP_CUDA_KERNEL(
fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CUDA, float>);
PD_REGISTER_STRUCT_KERNEL(fake_quantize_abs_max,
GPU,
ALL_LAYOUT,
ops::FakeQuantizeAbsMaxKernel,
float,
float16) {}
PD_REGISTER_STRUCT_KERNEL(fake_quantize_dequantize_abs_max,
GPU,
ALL_LAYOUT,
ops::FakeQuantizeDequantizeAbsMaxKernel,
float,
float16) {}
PD_REGISTER_STRUCT_KERNEL(fake_channel_wise_quantize_abs_max,
GPU,
ALL_LAYOUT,
ops::FakeChannelWiseQuantizeAbsMaxKernel,
float,
float16) {}
PD_REGISTER_STRUCT_KERNEL(fake_quantize_range_abs_max,
GPU,
ALL_LAYOUT,
ops::FakeQuantizeRangeAbsMaxKernel,
float,
float16) {}
PD_REGISTER_STRUCT_KERNEL(fake_quantize_moving_average_abs_max,
GPU,
ALL_LAYOUT,
ops::FakeQuantizeMovingAverageAbsMaxKernel,
float,
float16) {}
PD_REGISTER_STRUCT_KERNEL(moving_average_abs_max_scale,
GPU,
ALL_LAYOUT,
ops::MovingAverageAbsMaxScaleKernel,
float,
float16) {}
PD_REGISTER_STRUCT_KERNEL(fake_quantize_dequantize_moving_average_abs_max,
GPU,
ALL_LAYOUT,
ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel,
float,
float16) {}
PD_REGISTER_STRUCT_KERNEL(stright_throuth_estimator_grad,
GPU,
ALL_LAYOUT,
ops::StrightThroughEstimatorGradKernel,
float,
float16) {}
PD_REGISTER_STRUCT_KERNEL(fake_channel_wise_quantize_dequantize_abs_max,
GPU,
ALL_LAYOUT,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel,
float) {}
@@ -176,7 +176,7 @@ class FakeAbsMaxKernelBase : public framework::OpKernel<T> {
phi::DenseTensor *out) const = 0;
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FakeQuantizeAbsMaxKernel : public FakeAbsMaxKernelBase<DeviceContext, T> {
protected:
void RunClipFunctor(const DeviceContext &dev_ctx,
@@ -190,7 +190,7 @@ class FakeQuantizeAbsMaxKernel : public FakeAbsMaxKernelBase<DeviceContext, T> {
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FakeQuantizeDequantizeAbsMaxKernel
: public FakeAbsMaxKernelBase<DeviceContext, T> {
protected:
@@ -205,7 +205,7 @@ class FakeQuantizeDequantizeAbsMaxKernel
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
@@ -232,7 +232,7 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FakeChannelWiseQuantizeDequantizeAbsMaxKernel
: public framework::OpKernel<T> {
public:
@@ -257,7 +257,7 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxKernel
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
@@ -304,7 +304,7 @@ class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
@@ -367,9 +367,9 @@ class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel<T> {
phi::DenseTensor *out) const = 0;
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FakeQuantizeMovingAverageAbsMaxKernel
: public FakeMovingAverageAbsMaxKernelBase<DeviceContext, T> {
: public FakeMovingAverageAbsMaxKernelBase<T, DeviceContext> {
protected:
void RunClipFunctor(const DeviceContext &dev_ctx,
const phi::DenseTensor &in,
@@ -382,9 +382,9 @@ class FakeQuantizeMovingAverageAbsMaxKernel
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FakeQuantizeDequantizeMovingAverageAbsMaxKernel
: public FakeMovingAverageAbsMaxKernelBase<DeviceContext, T> {
: public FakeMovingAverageAbsMaxKernelBase<T, DeviceContext> {
protected:
void RunClipFunctor(const DeviceContext &dev_ctx,
const phi::DenseTensor &in,
@@ -397,7 +397,7 @@ class FakeQuantizeDequantizeMovingAverageAbsMaxKernel
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class MovingAverageAbsMaxScaleKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
@@ -445,7 +445,7 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class StrightThroughEstimatorGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
......
@@ -130,5 +130,6 @@ namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad,
ops::DecayedAdagradOp,
ops::DecayedAdagradOpMaker);
REGISTER_OP_CPU_KERNEL(decayed_adagrad,
ops::DecayedAdagradOpKernel<phi::CPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(
decayed_adagrad, CPU, ALL_LAYOUT, ops::DecayedAdagradOpKernel, float) {}
@@ -14,5 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/optimizers/decayed_adagrad_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(decayed_adagrad,
ops::DecayedAdagradOpKernel<phi::GPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(
decayed_adagrad, GPU, ALL_LAYOUT, ops::DecayedAdagradOpKernel, float) {}
@@ -19,7 +19,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class DecayedAdagradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......
@@ -76,5 +76,5 @@ REGISTER_OP_WITHOUT_GRADIENT(dgc_momentum,
ops::DGCMomentumOp,
ops::DGCMomentumOpMaker);
REGISTER_OP_CPU_KERNEL(dgc_momentum,
ops::DGCMomentumKernel<phi::CPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(
dgc_momentum, CPU, ALL_LAYOUT, ops::DGCMomentumKernel, float) {}
@@ -15,5 +15,6 @@
#include "paddle/fluid/operators/optimizers/dgc_momentum_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(dgc_momentum,
ops::DGCMomentumKernel<phi::GPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(
dgc_momentum, GPU, ALL_LAYOUT, ops::DGCMomentumKernel, float) {}
@@ -23,7 +23,7 @@
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class DGCMomentumKernel : public framework::OpKernel<T> {
public:
DGCMomentumKernel() {}
......
@@ -118,6 +118,8 @@ REGISTER_OP_WITHOUT_GRADIENT(distributed_fused_lamb_init,
ops::DistributedFusedLambInitOp,
ops::DistributedFusedLambInitOpMaker);
REGISTER_OP_CPU_KERNEL(
distributed_fused_lamb_init,
ops::DistributedFusedLambInitOpKernel<phi::CPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(distributed_fused_lamb_init,
CPU,
ALL_LAYOUT,
ops::DistributedFusedLambInitOpKernel,
float) {}
@@ -340,7 +340,7 @@ static T ClipByBound(T x, T low_value, T high_value) {
}
template <typename T>
class DistributedFusedLambInitOpKernel<phi::GPUContext, T>
class DistributedFusedLambInitOpKernel<T, phi::GPUContext>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
@@ -790,6 +790,8 @@ class DistributedFusedLambInitOpKernel<phi::GPUContext, T>
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
distributed_fused_lamb_init,
ops::DistributedFusedLambInitOpKernel<phi::GPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(distributed_fused_lamb_init,
GPU,
ALL_LAYOUT,
ops::DistributedFusedLambInitOpKernel,
float) {}
@@ -20,7 +20,7 @@
namespace paddle {
namespace operators {
template <typename DevCtx, typename T>
template <typename T, typename DevCtx>
class DistributedFusedLambInitOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
......
@@ -170,6 +170,8 @@ REGISTER_OP_WITHOUT_GRADIENT(distributed_fused_lamb,
ops::DistributedFusedLambOp,
ops::DistributedFusedLambOpMaker);
REGISTER_OP_CPU_KERNEL(
distributed_fused_lamb,
ops::DistributedFusedLambOpKernel<phi::CPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(distributed_fused_lamb,
CPU,
ALL_LAYOUT,
ops::DistributedFusedLambOpKernel,
float) {}
@@ -1330,7 +1330,7 @@ static void LaunchElementwiseAddWithCastKernel(const phi::GPUContext &dev_ctx,
}
template <typename T>
class DistributedFusedLambOpKernel<phi::GPUContext, T>
class DistributedFusedLambOpKernel<T, phi::GPUContext>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
@@ -2300,6 +2300,8 @@ class DistributedFusedLambOpKernel<phi::GPUContext, T>
namespace plat = paddle::platform;
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
distributed_fused_lamb,
ops::DistributedFusedLambOpKernel<phi::GPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(distributed_fused_lamb,
GPU,
ALL_LAYOUT,
ops::DistributedFusedLambOpKernel,
float) {}
@@ -21,7 +21,7 @@
namespace paddle {
namespace operators {
template <typename DevCtx, typename T>
template <typename T, typename DevCtx>
class DistributedFusedLambOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
......
@@ -131,6 +131,6 @@ CCS16 - Deep Learning with Differential Privacy.
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(dpsgd, ops::DpsgdOp, ops::DpsgdOpMaker);
REGISTER_OP_CPU_KERNEL(dpsgd,
ops::DpsgdOpKernel<phi::CPUContext, float>,
ops::DpsgdOpKernel<phi::CPUContext, double>);
PD_REGISTER_STRUCT_KERNEL(
dpsgd, CPU, ALL_LAYOUT, ops::DpsgdOpKernel, float, double) {}
@@ -24,7 +24,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class DpsgdOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
......
@@ -151,6 +151,8 @@ REGISTER_OPERATOR(distributed_lookup_table,
ops::DistributedLookupTableOp,
ops::DistributedLookupTableOpMaker);
REGISTER_OP_CPU_KERNEL(
distributed_lookup_table,
ops::DistributedLookupTableKernel<phi::CPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(distributed_lookup_table,
CPU,
ALL_LAYOUT,
ops::DistributedLookupTableKernel,
float) {}
@@ -17,6 +17,8 @@
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
distributed_lookup_table,
ops::DistributedLookupTableKernel<phi::GPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(distributed_lookup_table,
GPU,
ALL_LAYOUT,
ops::DistributedLookupTableKernel,
float) {}
@@ -23,7 +23,7 @@
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class DistributedLookupTableKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
......
@@ -134,7 +134,9 @@ REGISTER_OPERATOR(distributed_push_sparse,
ops::DistributedPushSparseOp,
ops::DistributedPushSparseOpMaker);
REGISTER_OP_CPU_KERNEL(
distributed_push_sparse,
ops::DistributedPushSparseKernel<phi::CPUContext, float>,
ops::DistributedPushSparseKernel<phi::CPUContext, double>);
PD_REGISTER_STRUCT_KERNEL(distributed_push_sparse,
CPU,
ALL_LAYOUT,
ops::DistributedPushSparseKernel,
float,
double) {}
@@ -17,7 +17,9 @@
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
distributed_push_sparse,
ops::DistributedPushSparseKernel<phi::GPUContext, float>,
ops::DistributedPushSparseKernel<phi::GPUContext, double>);
PD_REGISTER_STRUCT_KERNEL(distributed_push_sparse,
GPU,
ALL_LAYOUT,
ops::DistributedPushSparseKernel,
float,
double) {}
@@ -23,7 +23,7 @@
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class DistributedPushSparseKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
......