From fecbc9584ebbf88b80c097504ca034b688fefd6e Mon Sep 17 00:00:00 2001 From: QingshuChen Date: Fri, 29 Jul 2022 16:46:12 +0800 Subject: [PATCH] add some fp16 op for kunlun resnet50 model (#44672) * add some fp16 op for kunlun resnet50 model *test=kunlun * tmp *test=kunlun --- .../operators/fused/resnet_unit_op_xpu.cc | 121 +++-- .../optimizers/lars_momentum_op_xpu.cc | 29 +- .../fluid/platform/device/xpu/xpu2_op_list.h | 19 +- .../phi/kernels/xpu/elementwise_add_kernel.cc | 15 +- .../kernels/xpu/log_softmax_grad_kernel.cc | 36 +- paddle/phi/kernels/xpu/log_softmax_kernel.cc | 14 +- .../xpu/test_update_loss_scaling_op_xpu.py | 451 +++++++++--------- 7 files changed, 375 insertions(+), 310 deletions(-) diff --git a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc index cce506c67a..e9ad179960 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc @@ -23,6 +23,8 @@ using Tensor = framework::Tensor; template class ResNetUnitXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { auto place = ctx.GetPlace(); @@ -63,9 +65,12 @@ class ResNetUnitXPUKernel : public framework::OpKernel { std::string act_type = ctx.Attr("act_type"); auto &dev_ctx = ctx.template device_context(); - std::vector x_list = {input_x->data()}; - std::vector w_list = {filter_x->data()}; - std::vector conv_y_list = {conv_out_x->mutable_data(place)}; + std::vector x_list = { + reinterpret_cast(input_x->data())}; + std::vector w_list = { + reinterpret_cast(filter_x->data())}; + std::vector conv_y_list = { + reinterpret_cast(conv_out_x->mutable_data(place))}; std::vector> x_shape_list = { phi::vectorize(input_x->dims())}; @@ -107,9 +112,10 @@ class ResNetUnitXPUKernel : public framework::OpKernel { Tensor *running_mean_z = ctx.Output("RunningMeanZ"); Tensor *running_var_z = ctx.Output("RunningVarZ"); - x_list.push_back(input_z->data()); - w_list.push_back(filter_z->data()); - conv_y_list.push_back(conv_out_z->mutable_data(place)); + x_list.push_back(reinterpret_cast(input_z->data())); + w_list.push_back(reinterpret_cast(filter_z->data())); + conv_y_list.push_back( + reinterpret_cast(conv_out_z->mutable_data(place))); x_shape_list.push_back(phi::vectorize(input_z->dims())); @@ -133,17 +139,17 @@ class ResNetUnitXPUKernel : public framework::OpKernel { if (fuse_add) { const Tensor *input_z = ctx.Input("Z"); auto input_z_shape = phi::vectorize(input_z->dims()); - x_list.push_back(input_z->data()); + x_list.push_back(reinterpret_cast(input_z->data())); x_shape_list.push_back(input_z_shape); x_maxlist.push_back(nullptr); } } - int r = xpu::resnet_unit_fusion( + int r = xpu::resnet_unit_fusion( dev_ctx.x_context(), x_list, w_list, conv_y_list, - output->mutable_data(place), + reinterpret_cast(output->mutable_data(place)), x_shape_list, filter_x_shape[0], ksize_list, @@ -172,6 +178,8 @@ class ResNetUnitXPUKernel : public framework::OpKernel { template class ResNetUnitGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { auto place = ctx.GetPlace(); @@ -208,11 +216,16 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); - std::vector x_list = {x->data()}; - std::vector w_list = {filter_x->data()}; - std::vector conv_y_list = 
{conv_out_x->data()}; - std::vector dx_list = {x_grad->mutable_data(place)}; - std::vector dw_list = {filter_x_grad->mutable_data(place)}; + std::vector x_list = { + reinterpret_cast(x->data())}; + std::vector w_list = { + reinterpret_cast(filter_x->data())}; + std::vector conv_y_list = { + reinterpret_cast(conv_out_x->data())}; + std::vector dx_list = { + reinterpret_cast(x_grad->mutable_data(place))}; + std::vector dw_list = { + reinterpret_cast(filter_x_grad->mutable_data(place))}; std::vector> x_shape_list = { phi::vectorize(x->dims())}; @@ -262,11 +275,14 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { Tensor *scale_z_grad = ctx.Output(framework::GradVarName("ScaleZ")); Tensor *bias_z_grad = ctx.Output(framework::GradVarName("BiasZ")); - x_list.push_back(z->data()); - w_list.push_back(filter_z->data()); - conv_y_list.push_back(conv_out_z->data()); - dx_list.push_back(z_grad->mutable_data(place)); - dw_list.push_back(filter_z_grad->mutable_data(place)); + x_list.push_back(reinterpret_cast(z->data())); + w_list.push_back(reinterpret_cast(filter_z->data())); + conv_y_list.push_back( + reinterpret_cast(conv_out_z->data())); + dx_list.push_back( + reinterpret_cast(z_grad->mutable_data(place))); + dw_list.push_back( + reinterpret_cast(filter_z_grad->mutable_data(place))); x_shape_list.push_back(phi::vectorize(z->dims())); auto filter_z_shape = phi::vectorize(filter_z->dims()); @@ -288,38 +304,39 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { } else { if (fuse_add) { auto z_grad = ctx.Output(framework::GradVarName("Z")); - dx_list.push_back(z_grad->mutable_data(place)); + dx_list.push_back( + reinterpret_cast(z_grad->mutable_data(place))); } } - int r = - xpu::resnet_unit_grad_fusion(dev_ctx.x_context(), - x_list, - w_list, - y_grad->data(), - output->data(), - conv_y_list, - dx_list, - dw_list, - x_shape_list, - filter_x_shape[0], - ksize_list, - stride_list, - paddings, - dilations, - group, - x_maxlist, - w_maxlist, - scale_list, - batch_mean_list, - batch_invstd_list, - dscale_list, - dbias_list, - xpu::Activation_t::RELU, - eps, - is_nchw, - has_shortcut, - fuse_add); + int r = xpu::resnet_unit_grad_fusion( + dev_ctx.x_context(), + x_list, + w_list, + reinterpret_cast(y_grad->data()), + reinterpret_cast(output->data()), + conv_y_list, + dx_list, + dw_list, + x_shape_list, + filter_x_shape[0], + ksize_list, + stride_list, + paddings, + dilations, + group, + x_maxlist, + w_maxlist, + scale_list, + batch_mean_list, + batch_invstd_list, + dscale_list, + dbias_list, + xpu::Activation_t::RELU, + eps, + is_nchw, + has_shortcut, + fuse_add); PADDLE_ENFORCE_XDNN_SUCCESS(r, "resnet_unit_grad_fusion"); } }; @@ -329,5 +346,9 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(resnet_unit, ops::ResNetUnitXPUKernel); -REGISTER_OP_XPU_KERNEL(resnet_unit_grad, ops::ResNetUnitGradXPUKernel); +REGISTER_OP_XPU_KERNEL(resnet_unit, + ops::ResNetUnitXPUKernel, + ops::ResNetUnitXPUKernel); +REGISTER_OP_XPU_KERNEL(resnet_unit_grad, + ops::ResNetUnitGradXPUKernel, + ops::ResNetUnitGradXPUKernel); diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc b/paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc index 626e071c20..1f9a9eb251 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc @@ -22,6 +22,8 @@ namespace operators { template class LarsMomentumOpXPUKernel : public 
framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { bool multi_precision = ctx.Attr("multi_precision"); @@ -35,14 +37,14 @@ class LarsMomentumOpXPUKernel : public framework::OpKernel { auto master_param = ctx.MultiInput("MasterParam"); auto master_param_out = ctx.MultiOutput("MasterParamOut"); - T mu = static_cast(ctx.Attr("mu")); - T lars_coeff = ctx.Attr("lars_coeff"); - T epsilon = ctx.Attr("epsilon"); - T rescale_grad = ctx.Attr("rescale_grad"); + float mu = static_cast(ctx.Attr("mu")); + float lars_coeff = ctx.Attr("lars_coeff"); + float epsilon = ctx.Attr("epsilon"); + float rescale_grad = ctx.Attr("rescale_grad"); - std::vector param_list; - std::vector grad_list; - std::vector param_out_list; + std::vector param_list; + std::vector grad_list; + std::vector param_out_list; std::vector velocity_list; std::vector velocity_out_list; std::vector lrs; @@ -52,9 +54,12 @@ class LarsMomentumOpXPUKernel : public framework::OpKernel { std::vector master_param_out_list; int op_num = param.size(); for (int i = 0; i < op_num; ++i) { - param_list.push_back(const_cast(param[i]->data())); - grad_list.push_back(const_cast(grad[i]->data())); - param_out_list.push_back(param_out[i]->mutable_data(ctx.GetPlace())); + param_list.push_back( + reinterpret_cast(const_cast((param[i]->data())))); + grad_list.push_back( + reinterpret_cast(const_cast(grad[i]->data()))); + param_out_list.push_back(reinterpret_cast( + param_out[i]->mutable_data(ctx.GetPlace()))); velocity_list.push_back(const_cast(velocity[i]->data())); velocity_out_list.push_back( velocity_out[i]->mutable_data(ctx.GetPlace())); @@ -111,5 +116,7 @@ class LarsMomentumOpXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL(lars_momentum, ops::LarsMomentumOpXPUKernel); +REGISTER_OP_XPU_KERNEL(lars_momentum, + ops::LarsMomentumOpXPUKernel, + ops::LarsMomentumOpXPUKernel); #endif diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 28ff2bfba5..e3c46ae5b7 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -231,7 +231,9 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"generate_proposals_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"grad_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"grad_add", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"greater_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -254,9 +256,8 @@ XPUOpMap& get_kl2_ops() { {"label_smooth", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"lars_momentum", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"layer_norm_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"layer_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, @@ -380,9 +381,12 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"resnet_unit", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"resnet_unit", + 
XPUKernelSet({pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"resnet_unit_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -502,6 +506,9 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"top_k_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"update_loss_scaling", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"unsqueeze2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), diff --git a/paddle/phi/kernels/xpu/elementwise_add_kernel.cc b/paddle/phi/kernels/xpu/elementwise_add_kernel.cc index 34d39b0a83..9c5b521849 100644 --- a/paddle/phi/kernels/xpu/elementwise_add_kernel.cc +++ b/paddle/phi/kernels/xpu/elementwise_add_kernel.cc @@ -24,13 +24,15 @@ void GradAddXPUKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, DenseTensor* out) { + using XPUType = typename XPUTypeTrait::Type; + dev_ctx.template Alloc(out); auto x_shape = phi::vectorize(x.dims()); auto y_shape = phi::vectorize(y.dims()); int r = xpu::broadcast_add(dev_ctx.x_context(), - x.data(), - y.data(), - out->data(), + reinterpret_cast(x.data()), + reinterpret_cast(y.data()), + reinterpret_cast(out->data()), x_shape, y_shape); PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); @@ -38,4 +40,9 @@ void GradAddXPUKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL(grad_add, XPU, ALL_LAYOUT, phi::GradAddXPUKernel, float) {} +PD_REGISTER_KERNEL(grad_add, + XPU, + ALL_LAYOUT, + phi::GradAddXPUKernel, + phi::dtype::float16, + float) {} diff --git a/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc b/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc index c9165f3ef7..26f532f17b 100644 --- a/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc @@ -26,6 +26,7 @@ void LogSoftmaxGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, int axis, DenseTensor* x_grad) { + using XPUType = typename XPUTypeTrait::Type; const int rank = out.dims().size(); axis = funcs::CanonicalAxis(axis, rank); @@ -40,24 +41,29 @@ void LogSoftmaxGradKernel(const Context& dev_ctx, PADDLE_ENFORCE_NE( tmp2_ptr, nullptr, phi::errors::External("no enough memory in xpu")); - int r = - xpu::exp(dev_ctx.x_context(), out.data(), tmp_ptr, out_grad.numel()); + int r = xpu::exp(dev_ctx.x_context(), + reinterpret_cast(out.data()), + reinterpret_cast(tmp_ptr), + out_grad.numel()); PADDLE_ENFORCE_XDNN_SUCCESS(r, "exp"); - r = xpu::reciprocal( - dev_ctx.x_context(), tmp_ptr, tmp2_ptr, out_grad.numel()); + r = xpu::reciprocal(dev_ctx.x_context(), + reinterpret_cast(tmp_ptr), + reinterpret_cast(tmp2_ptr), + out_grad.numel()); PADDLE_ENFORCE_XDNN_SUCCESS(r, "reciprocal"); - r = xpu::mul(dev_ctx.x_context(), - tmp2_ptr, - out_grad.data(), - tmp2_ptr, - out_grad.numel()); + r = xpu::mul(dev_ctx.x_context(), + reinterpret_cast(tmp2_ptr), + reinterpret_cast(out_grad.data()), + reinterpret_cast(tmp2_ptr), + out_grad.numel()); PADDLE_ENFORCE_XDNN_SUCCESS(r, "mul"); - r = xpu::softmax_grad(dev_ctx.x_context(), - 
tmp_ptr, - tmp2_ptr, - x_grad->data(), - out_shape, - axis); + r = xpu::softmax_grad( + dev_ctx.x_context(), + reinterpret_cast(tmp_ptr), + reinterpret_cast(tmp2_ptr), + reinterpret_cast(x_grad->data()), + out_shape, + axis); PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax_grad"); } } diff --git a/paddle/phi/kernels/xpu/log_softmax_kernel.cc b/paddle/phi/kernels/xpu/log_softmax_kernel.cc index 1f084d0e6c..0250b08e50 100644 --- a/paddle/phi/kernels/xpu/log_softmax_kernel.cc +++ b/paddle/phi/kernels/xpu/log_softmax_kernel.cc @@ -25,6 +25,7 @@ void LogSoftmaxKernel(const Context& dev_ctx, const DenseTensor& x, int axis, DenseTensor* out) { + using XPUType = typename XPUTypeTrait::Type; const int rank = x.dims().size(); axis = funcs::CanonicalAxis(axis, rank); @@ -32,11 +33,16 @@ void LogSoftmaxKernel(const Context& dev_ctx, auto x_shape = phi::vectorize(x.dims()); dev_ctx.template Alloc(out); if (axis < 0) axis += rank; - int r = xpu::softmax( - dev_ctx.x_context(), x.data(), out->data(), x_shape, axis); + int r = xpu::softmax(dev_ctx.x_context(), + reinterpret_cast(x.data()), + reinterpret_cast(out->data()), + x_shape, + axis); PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax"); - r = xpu::log( - dev_ctx.x_context(), out->data(), out->data(), out->numel()); + r = xpu::log(dev_ctx.x_context(), + reinterpret_cast(out->data()), + reinterpret_cast(out->data()), + out->numel()); PADDLE_ENFORCE_XDNN_SUCCESS(r, "log"); } } diff --git a/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py index 5ed10d159a..41e277d7a3 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py @@ -23,231 +23,242 @@ import paddle import paddle.fluid as fluid import paddle.fluid.contrib.mixed_precision.amp_nn as amp_nn +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + paddle.enable_static() -class TestUpdateLossScalingOp(XPUOpTest): - - def setUp(self): - self.op_type = "update_loss_scaling" - self.init() - found_inf = np.array([False], dtype=np.bool_) - x = np.random.random((1024, 1024)).astype(self.dtype) - - self.inputs = { - 'X': [('x0', x)], - 'FoundInfinite': found_inf, - 'PrevLossScaling': self.prev_loss_scaling, - 'InGoodSteps': self.num_good_steps, - 'InBadSteps': self.num_bad_steps - } - - self.outputs = { - 'Out': [('out0', x)], - 'LossScaling': self.prev_loss_scaling * self.incr_ratio, - 'OutGoodSteps': self.zero_steps, - 'OutBadSteps': self.zero_steps - } - - def init(self): - self.incr_ratio = 2.0 - self.decr_ratio = 0.8 - self.dtype = np.float32 - self.prev_loss_scaling = np.array([2048]).astype(self.dtype) - self.num_good_steps = np.array([999], dtype=np.int32) - self.num_bad_steps = np.array([1], dtype=np.int32) - self.zero_steps = np.array([0], dtype=np.int32) - self.attrs = { - 'incr_every_n_steps': 1000, - 'decr_every_n_nan_or_inf': 2, - 'incr_ratio': self.incr_ratio, - 'decr_ratio': self.decr_ratio, - } - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, no_check_set=['Out']) - - -class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp): - - def setUp(self): - self.op_type = "update_loss_scaling" - self.init() - found_inf = np.array([True], dtype=np.bool_) - x = np.random.random((1024, 1024)).astype(self.dtype) - i = np.random.randint(0, 1024, 
1) - j = np.random.randint(0, 1024, 1) - x[i[0]][j[0]] = np.inf - - self.inputs = { - 'X': [('x0', x)], - 'FoundInfinite': found_inf, - 'PrevLossScaling': self.prev_loss_scaling, - 'InGoodSteps': self.num_good_steps, - 'InBadSteps': self.num_bad_steps - } - - self.outputs = { - 'Out': [('out0', np.zeros_like(x))], - 'LossScaling': self.prev_loss_scaling * self.decr_ratio, - 'OutGoodSteps': self.zero_steps, - 'OutBadSteps': self.zero_steps - } - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - #self.check_output() - - -class TestUpdateLossScalingLayer(unittest.TestCase): - - def loss_scaling_check(self, scope=fluid.Scope()): - a = fluid.data(name="a", shape=[1024, 1024], dtype='float32') - b = fluid.data(name="b", shape=[512, 128], dtype='float32') - x = [a, b] - found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool') - prev_loss_scaling = fluid.data(name="prev_loss_scaling", +class XPUTestUpdateLossScalingOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = "update_loss_scaling" + self.use_dynamic_create_class = False + + class TestUpdateLossScalingOp(XPUOpTest): + + def setUp(self): + self.op_type = "update_loss_scaling" + self.init() + found_inf = np.array([False], dtype=np.bool_) + x = np.random.random((1024, 1024)).astype(self.dtype) + + self.inputs = { + 'X': [('x0', x)], + 'FoundInfinite': found_inf, + 'PrevLossScaling': self.prev_loss_scaling, + 'InGoodSteps': self.num_good_steps, + 'InBadSteps': self.num_bad_steps + } + + self.outputs = { + 'Out': [('out0', x)], + 'LossScaling': self.prev_loss_scaling * self.incr_ratio, + 'OutGoodSteps': self.zero_steps, + 'OutBadSteps': self.zero_steps + } + + def init(self): + self.incr_ratio = 2.0 + self.decr_ratio = 0.8 + self.dtype = np.float32 + self.prev_loss_scaling = np.array([2048]).astype(self.dtype) + self.num_good_steps = np.array([999], dtype=np.int32) + self.num_bad_steps = np.array([1], dtype=np.int32) + self.zero_steps = np.array([0], dtype=np.int32) + self.attrs = { + 'incr_every_n_steps': 1000, + 'decr_every_n_nan_or_inf': 2, + 'incr_ratio': self.incr_ratio, + 'decr_ratio': self.decr_ratio, + } + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, no_check_set=['Out']) + + class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp): + + def setUp(self): + self.op_type = "update_loss_scaling" + self.init() + found_inf = np.array([True], dtype=np.bool_) + x = np.random.random((1024, 1024)).astype(self.dtype) + i = np.random.randint(0, 1024, 1) + j = np.random.randint(0, 1024, 1) + x[i[0]][j[0]] = np.inf + + self.inputs = { + 'X': [('x0', x)], + 'FoundInfinite': found_inf, + 'PrevLossScaling': self.prev_loss_scaling, + 'InGoodSteps': self.num_good_steps, + 'InBadSteps': self.num_bad_steps + } + + self.outputs = { + 'Out': [('out0', np.zeros_like(x))], + 'LossScaling': self.prev_loss_scaling * self.decr_ratio, + 'OutGoodSteps': self.zero_steps, + 'OutBadSteps': self.zero_steps + } + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + #self.check_output() + + class TestUpdateLossScalingLayer(unittest.TestCase): + + def loss_scaling_check(self, scope=fluid.Scope()): + a = fluid.data(name="a", shape=[1024, 1024], dtype='float32') + b = fluid.data(name="b", shape=[512, 128], dtype='float32') + x = [a, b] + found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool') 
+ prev_loss_scaling = fluid.data(name="prev_loss_scaling", + shape=[1], + dtype='float32') + num_good_steps = fluid.data(name="num_good_steps", + shape=[1], + dtype='int32') + num_bad_steps = fluid.data(name="num_bad_steps", shape=[1], - dtype='float32') - num_good_steps = fluid.data(name="num_good_steps", - shape=[1], - dtype='int32') - num_bad_steps = fluid.data(name="num_bad_steps", - shape=[1], - dtype='int32') - - a_v = np.random.random([1024, 1024]).astype('float32') - b_v = np.random.random([512, 128]).astype('float32') - found_inf_v = np.array([False]).astype('bool') - prev_loss_scaling_v = np.array([2048]).astype('float32') - num_good_steps_v = np.array([999], dtype=np.int32) - num_bad_steps_v = np.array([1], dtype=np.int32) - - incr_every_n_steps = 1000 - decr_every_n_nan_or_inf = 2 - incr_ratio = 2 - decr_ratio = 0.8 - - result = amp_nn.update_loss_scaling(x, - found_inf, - prev_loss_scaling, - num_good_steps, - num_bad_steps, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - name="update_loss_scaling") - - place = fluid.XPUPlace(0) - exe = fluid.Executor(place) - with fluid.scope_guard(scope): - exe.run(fluid.default_startup_program()) - result_v = exe.run(feed={ - 'a': a_v, - 'b': b_v, - 'found_inf': found_inf_v, - 'prev_loss_scaling': prev_loss_scaling_v, - 'num_good_steps': num_good_steps_v, - 'num_bad_steps': num_bad_steps_v - }, - fetch_list=[ - result, x, found_inf, prev_loss_scaling, - num_good_steps, num_bad_steps - ]) - assert np.array_equal(result_v[0], a_v) - assert np.array_equal(result_v[1], b_v) - assert np.array_equal(result_v[0], result_v[2]) - assert np.array_equal(result_v[1], result_v[3]) - assert np.array_equal(result_v[4], found_inf_v) - assert np.array_equal(result_v[5], prev_loss_scaling_v * incr_ratio) - assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v)) - assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v)) - - def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()): - a = fluid.data(name="a", shape=[1024, 1024], dtype='float32') - b = fluid.data(name="b", shape=[512, 128], dtype='float32') - x = [a, b] - found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool') - prev_loss_scaling = fluid.data(name="prev_loss_scaling", + dtype='int32') + + a_v = np.random.random([1024, 1024]).astype('float32') + b_v = np.random.random([512, 128]).astype('float32') + found_inf_v = np.array([False]).astype('bool') + prev_loss_scaling_v = np.array([2048]).astype('float32') + num_good_steps_v = np.array([999], dtype=np.int32) + num_bad_steps_v = np.array([1], dtype=np.int32) + + incr_every_n_steps = 1000 + decr_every_n_nan_or_inf = 2 + incr_ratio = 2 + decr_ratio = 0.8 + + result = amp_nn.update_loss_scaling(x, + found_inf, + prev_loss_scaling, + num_good_steps, + num_bad_steps, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + name="update_loss_scaling") + + place = fluid.XPUPlace(0) + exe = fluid.Executor(place) + with fluid.scope_guard(scope): + exe.run(fluid.default_startup_program()) + result_v = exe.run(feed={ + 'a': a_v, + 'b': b_v, + 'found_inf': found_inf_v, + 'prev_loss_scaling': prev_loss_scaling_v, + 'num_good_steps': num_good_steps_v, + 'num_bad_steps': num_bad_steps_v + }, + fetch_list=[ + result, x, found_inf, prev_loss_scaling, + num_good_steps, num_bad_steps + ]) + assert np.array_equal(result_v[0], a_v) + assert np.array_equal(result_v[1], b_v) + assert np.array_equal(result_v[0], result_v[2]) + assert np.array_equal(result_v[1], result_v[3]) + 
assert np.array_equal(result_v[4], found_inf_v) + assert np.array_equal(result_v[5], prev_loss_scaling_v * incr_ratio) + assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v)) + assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v)) + + def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()): + a = fluid.data(name="a", shape=[1024, 1024], dtype='float32') + b = fluid.data(name="b", shape=[512, 128], dtype='float32') + x = [a, b] + found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool') + prev_loss_scaling = fluid.data(name="prev_loss_scaling", + shape=[1], + dtype='float32') + num_good_steps = fluid.data(name="num_good_steps", + shape=[1], + dtype='int32') + num_bad_steps = fluid.data(name="num_bad_steps", shape=[1], - dtype='float32') - num_good_steps = fluid.data(name="num_good_steps", - shape=[1], - dtype='int32') - num_bad_steps = fluid.data(name="num_bad_steps", - shape=[1], - dtype='int32') - - a_v = np.random.random([1024, 1024]).astype('float32') - b_v = np.random.random([512, 128]).astype('float32') - i = np.random.randint(0, 1024, 1) - j = np.random.randint(0, 1024, 1) - a_v[i[0]][j[0]] = np.inf - found_inf_v = np.array([True]).astype('bool') - prev_loss_scaling_v = np.array([2048]).astype('float32') - num_good_steps_v = np.array([999], dtype=np.int32) - num_bad_steps_v = np.array([1], dtype=np.int32) - - incr_every_n_steps = 1000 - decr_every_n_nan_or_inf = 2 - incr_ratio = 2 - decr_ratio = 0.8 - - result = amp_nn.update_loss_scaling(x, - found_inf, - prev_loss_scaling, - num_good_steps, - num_bad_steps, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - name="update_loss_scaling") - - place = fluid.XPUPlace(0) - exe = fluid.Executor(place) - with fluid.scope_guard(scope): - exe.run(fluid.default_startup_program()) - result_v = exe.run(feed={ - 'a': a_v, - 'b': b_v, - 'found_inf': found_inf_v, - 'prev_loss_scaling': prev_loss_scaling_v, - 'num_good_steps': num_good_steps_v, - 'num_bad_steps': num_bad_steps_v - }, - fetch_list=[ - result, x, found_inf, prev_loss_scaling, - num_good_steps, num_bad_steps - ]) - assert np.array_equal(result_v[0], np.zeros_like(a_v)) - assert np.array_equal(result_v[1], np.zeros_like(b_v)) - assert np.array_equal(result_v[2], np.zeros_like(a_v)) - assert np.array_equal(result_v[3], np.zeros_like(b_v)) - assert np.array_equal(result_v[4], found_inf_v) - assert np.array_equal(result_v[5], prev_loss_scaling_v * decr_ratio) - assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v)) - assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v)) - - def test_loss_scaling(self): - main = fluid.Program() - startup = fluid.Program() - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - self.loss_scaling_check() - - def test_loss_scaling_inf(self): - main = fluid.Program() - startup = fluid.Program() - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - self.loss_scaling_check_inf() - + dtype='int32') + + a_v = np.random.random([1024, 1024]).astype('float32') + b_v = np.random.random([512, 128]).astype('float32') + i = np.random.randint(0, 1024, 1) + j = np.random.randint(0, 1024, 1) + a_v[i[0]][j[0]] = np.inf + found_inf_v = np.array([True]).astype('bool') + prev_loss_scaling_v = np.array([2048]).astype('float32') + num_good_steps_v = np.array([999], dtype=np.int32) + num_bad_steps_v = np.array([1], dtype=np.int32) + + incr_every_n_steps = 1000 + decr_every_n_nan_or_inf = 2 + incr_ratio = 2 + decr_ratio = 0.8 + + 
result = amp_nn.update_loss_scaling(x, + found_inf, + prev_loss_scaling, + num_good_steps, + num_bad_steps, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + name="update_loss_scaling") + + place = fluid.XPUPlace(0) + exe = fluid.Executor(place) + with fluid.scope_guard(scope): + exe.run(fluid.default_startup_program()) + result_v = exe.run(feed={ + 'a': a_v, + 'b': b_v, + 'found_inf': found_inf_v, + 'prev_loss_scaling': prev_loss_scaling_v, + 'num_good_steps': num_good_steps_v, + 'num_bad_steps': num_bad_steps_v + }, + fetch_list=[ + result, x, found_inf, prev_loss_scaling, + num_good_steps, num_bad_steps + ]) + assert np.array_equal(result_v[0], np.zeros_like(a_v)) + assert np.array_equal(result_v[1], np.zeros_like(b_v)) + assert np.array_equal(result_v[2], np.zeros_like(a_v)) + assert np.array_equal(result_v[3], np.zeros_like(b_v)) + assert np.array_equal(result_v[4], found_inf_v) + assert np.array_equal(result_v[5], prev_loss_scaling_v * decr_ratio) + assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v)) + assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v)) + + def test_loss_scaling(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.loss_scaling_check() + + def test_loss_scaling_inf(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.loss_scaling_check_inf() + + +support_types = get_xpu_op_support_types('update_loss_scaling') +for stype in support_types: + create_test_class(globals(), XPUTestUpdateLossScalingOp, stype) if __name__ == '__main__': unittest.main() -- GitLab
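
Note on the pattern used throughout this patch: each kernel stays templated on the framework element type T, maps T to the XPU runtime type via XPUTypeTrait<T>::Type, reinterpret_casts the tensor buffers to that type before the xdnn call, and is then registered a second time for float16 (plat::float16 / vartype::FP16). The sketch below shows that cast-through-a-trait pattern in isolation; it is a minimal, self-contained illustration, not Paddle code, and HostHalf, DeviceHalf, XPUTypeTraitSketch and device_add are hypothetical stand-ins rather than Paddle or XDNN symbols.

    // Minimal sketch of the fp16-enablement pattern: a trait maps the
    // framework type to the device-side type, and pointers are
    // reinterpret_cast to the mapped type before the device call.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct HostHalf  { uint16_t bits; };  // framework-side fp16 (stand-in for platform::float16)
    struct DeviceHalf { uint16_t bits; }; // runtime-side fp16 (stand-in for the XPU half type)

    template <typename T> struct XPUTypeTraitSketch { using Type = T; };
    template <> struct XPUTypeTraitSketch<HostHalf> { using Type = DeviceHalf; };

    // Stand-in for an XDNN call that only understands float / DeviceHalf.
    template <typename XPUType>
    int device_add(const XPUType* x, const XPUType* y, XPUType* out, int n) {
      for (int i = 0; i < n; ++i) out[i] = x[i];  // placeholder arithmetic
      return 0;  // 0 == success, mirroring XDNN's error-code convention
    }

    // Kernel templated on T; works unchanged for float and HostHalf.
    template <typename T>
    int AddKernel(const std::vector<T>& x, const std::vector<T>& y,
                  std::vector<T>* out) {
      using XPUType = typename XPUTypeTraitSketch<T>::Type;
      out->resize(x.size());
      // Same memory layout, different nominal type: reinterpret_cast bridges
      // the framework view (T) and the runtime view (XPUType).
      return device_add(reinterpret_cast<const XPUType*>(x.data()),
                        reinterpret_cast<const XPUType*>(y.data()),
                        reinterpret_cast<XPUType*>(out->data()),
                        static_cast<int>(x.size()));
    }

    int main() {
      std::vector<float> a{1.f, 2.f}, b{3.f, 4.f}, c;
      std::vector<HostHalf> ha(2), hb(2), hc;
      std::printf("float path: %d, fp16 path: %d\n",
                  AddKernel(a, b, &c), AddKernel(ha, hb, &hc));
      return 0;
    }

Registering the same templated kernel once per element type (as the REGISTER_OP_XPU_KERNEL / PD_REGISTER_KERNEL changes above do for float and float16) is what lets the single implementation serve both precisions; the cast is safe because the framework half type and the XPU runtime half type share the same 16-bit layout.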