diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index d5ccf1297922f5dfb08993aa37200db194be9a71..2c7f28b3a522311244f54df589f11c22c40fb8ba 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -9,7 +9,7 @@ SET(XPU_RT_LIB_NAME             "libxpurt.so")
 
 if(NOT DEFINED XPU_BASE_URL)
   SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220425")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220510")
 else()
   SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
@@ -17,7 +17,7 @@ endif()
 # ubuntu and centos: use output by XDNN API team
 if(NOT DEFINED XPU_XDNN_BASE_URL)
   SET(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev")
-  SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220425")
+  SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220510")
 else()
   SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}")
 endif()
diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc
index 14b5662b24aeb9d5a608b3a4da371248be45f6f6..c4ea6a3c6bc669dc2c5154b9a74fa0fe745269db 100644
--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
@@ -18,6 +18,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/platform/transform.h"
 
+#if defined(PADDLE_WITH_XPU)
+#include "paddle/fluid/platform/device/device_wrapper.h"
+#endif
+
 namespace paddle {
 namespace framework {
 
@@ -28,6 +32,49 @@ struct CastDataTypeFunctor {
   }
 };
 
+#if defined(PADDLE_WITH_XPU)
+
+// Casts the in.numel() elements of `in` from InType to OutType via cast_v2.
+// Output storage comes from out->mutable_data on in.place(); then Wait().
+template <typename InType, typename OutType>
+static void XPUCastData(const framework::Tensor& in, framework::Tensor* out,
+                        const platform::XPUDeviceContext* dev_ctx) {
+  using SrcT = typename XPUTypeTrait<InType>::Type;
+  using DstT = typename XPUTypeTrait<OutType>::Type;
+  const auto* src = reinterpret_cast<const SrcT*>(in.data<InType>());
+  auto* dst = reinterpret_cast<DstT*>(out->mutable_data<OutType>(in.place()));
+  int rc = xpu::cast_v2<SrcT, DstT>(dev_ctx->x_context(), src, dst, in.numel());
+  PADDLE_ENFORCE_XDNN_SUCCESS(rc, "cast_v2");
+  dev_ctx->Wait();
+}
+
+// Casts `in` (InType elements) into `out` as dst_type; throws if unsupported.
+template <typename InType>
+static void XPUTransDataType(
+    const framework::Tensor& in, framework::Tensor* out,
+    const paddle::framework::proto::VarType::Type& dst_type,
+    const platform::DeviceContext* ctx) {
+  auto* context = static_cast<const platform::XPUDeviceContext*>(ctx);
+#define XPUCastCallback(cpp_type, proto_type)          \
+  do {                                                 \
+    if (dst_type == proto_type) {                      \
+      XPUCastData<InType, cpp_type>(in, out, context); \
+    }                                                  \
+  } while (0)
+  // dst_type holds a single value, so the supported types are OR-ed together.
+  if (dst_type == proto::VarType::FP32 || dst_type == proto::VarType::FP16 ||
+      dst_type == proto::VarType::BOOL || dst_type == proto::VarType::INT16 ||
+      dst_type == proto::VarType::INT32 || dst_type == proto::VarType::INT64) {
+    _ForEachDataType_(XPUCastCallback);
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Data type (%s) is not supported in XPU when casting data type.",
+        DataTypeToString(dst_type)));
+  }
+}
+
+#endif
+
 template <typename InType>
 struct CastDataType {
   CastDataType(const framework::Tensor& in, framework::Tensor* out,
@@ -88,6 +135,40 @@ void TransDataType(const Tensor& in,
   auto dst_type = type;
   auto ctx = pool.Get(in.place());
 
+#if defined(PADDLE_WITH_XPU)
+  // The XDNN cast path is only valid for tensors that live on an XPU: `ctx`
+  // comes from pool.Get(in.place()) and XPUTransDataType casts it to an
+  // XPUDeviceContext. Tensors on any other place fall through to the generic
+  // switch below, exactly as in non-XPU builds.
+  if (platform::is_xpu_place(in.place())) {
+    switch (src_type) {
+      case proto::VarType::FP16:
+        XPUTransDataType<platform::float16>(in, out, dst_type, ctx);
+        break;
+      case proto::VarType::FP32:
+        XPUTransDataType<float>(in, out, dst_type, ctx);
+        break;
+      case proto::VarType::BOOL:
+        XPUTransDataType<bool>(in, out, dst_type, ctx);
+        break;
+      case proto::VarType::INT16:
+        XPUTransDataType<int16_t>(in, out, dst_type, ctx);
+        break;
+      case proto::VarType::INT32:
+        XPUTransDataType<int>(in, out, dst_type, ctx);
+        break;
+      case proto::VarType::INT64:
+        XPUTransDataType<int64_t>(in, out, dst_type, ctx);
+        break;
+      default:
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Data type (%s) is not supported in XPU when casting data type.",
+            DataTypeToString(src_type)));
+    }
+    return;
+  }
+#endif
+
   switch (src_type) {
     case proto::VarType::FP16:
       framework::VisitDataType(dst_type,
@@ -131,7 +212,6 @@ void TransComplexToReal(const proto::VarType::Type& dst_type,
   auto& pool = platform::DeviceContextPool::Instance();
   auto* ctx = pool.Get(in.place());
   out->Resize(in.dims());
-
   // complex -> real
   switch (src_type) {
     case proto::VarType::COMPLEX64:
diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc
index aa5fdd86745d6932052347f3dc11b14e3d447d20..ead6f94417b6ea0353fb42c08f239eeca38c6196 100644
--- a/paddle/fluid/operators/log_loss_op_xpu.cc
+++ b/paddle/fluid/operators/log_loss_op_xpu.cc
@@ -21,58 +21,67 @@ template <typename DeviceContext, typename T, typename AttrType = T>
 class LogLossXPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* predict = ctx.Input<Tensor>("Predicted");
-    auto* labels = ctx.Input<Tensor>("Labels");
-    auto* loss = ctx.Output<Tensor>("Loss");
-    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
-    loss->mutable_data<T>(ctx.GetPlace());
-    int n = predict->numel();
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    int r =
-        xpu::log_loss_fwd(dev_ctx.x_context(), n, epsilon, predict->data<T>(),
-                          labels->data<T>(), loss->data<T>());
-    PADDLE_ENFORCE_EQ(
-        r, xpu::Error_t::SUCCESS,
-        platform::errors::External(
-            "XPU log_loss kernel return wrong value[%d], please check whether "
-            "Baidu Kunlun Card is properly installed.",
-            r));
+    /*** TODO: re-enable once the new XDNN interface is available.
+        auto* predict = ctx.Input<Tensor>("Predicted");
+        auto* labels = ctx.Input<Tensor>("Labels");
+        auto* loss = ctx.Output<Tensor>("Loss");
+        auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
+        loss->mutable_data<T>(ctx.GetPlace());
+        int n = predict->numel();
+        auto& dev_ctx = ctx.template device_context<DeviceContext>();
+        int r =
+            xpu::log_loss_fwd(dev_ctx.x_context(), n, epsilon,
+    predict->data<T>(),
+                              labels->data<T>(), loss->data<T>());
+        PADDLE_ENFORCE_EQ(
+            r, xpu::Error_t::SUCCESS,
+            platform::errors::External(
+                "XPU log_loss kernel return wrong value[%d], please check
+    whether "
+                "Baidu Kunlun Card is properly installed.",
+                r));
+    ***/
   }
 };
 template <typename DeviceContext, typename T, typename AttrType = T>
 class LogLossGradXPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* predict = ctx.Input<Tensor>("Predicted");
-    auto* labels = ctx.Input<Tensor>("Labels");
-    auto* dloss = ctx.Input<Tensor>(framework::GradVarName("Loss"));
-    auto* dpred = ctx.Output<Tensor>(framework::GradVarName("Predicted"));
-    if (!dpred) {
-      return;
-    }
-    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
-    dpred->mutable_data<T>(ctx.GetPlace());
-    int n = predict->numel();
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    int r = xpu::log_loss_bwd(dev_ctx.x_context(), n, epsilon,
-                              predict->data<T>(), labels->data<T>(),
-                              dloss->data<T>(), dpred->data<T>());
-    PADDLE_ENFORCE_EQ(
-        r, xpu::Error_t::SUCCESS,
-        platform::errors::External(
-            "XPU log_loss kernel return wrong value[%d], please check whether "
-            "Baidu Kunlun Card is properly installed.",
-            r));
+    /*** TODO: re-enable once the new XDNN interface is available.
+
+        auto* predict = ctx.Input<Tensor>("Predicted");
+        auto* labels = ctx.Input<Tensor>("Labels");
+        auto* dloss = ctx.Input<Tensor>(framework::GradVarName("Loss"));
+        auto* dpred = ctx.Output<Tensor>(framework::GradVarName("Predicted"));
+        if (!dpred) {
+          return;
+        }
+        auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
+        dpred->mutable_data<T>(ctx.GetPlace());
+        int n = predict->numel();
+        auto& dev_ctx = ctx.template device_context<DeviceContext>();
+        int r = xpu::log_loss_bwd(dev_ctx.x_context(), n, epsilon,
+                                  predict->data<T>(), labels->data<T>(),
+                                  dloss->data<T>(), dpred->data<T>());
+        PADDLE_ENFORCE_EQ(
+            r, xpu::Error_t::SUCCESS,
+            platform::errors::External(
+                "XPU log_loss kernel return wrong value[%d], please check
+    whether "
+                "Baidu Kunlun Card is properly installed.",
+                r));
+    ***/
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
-namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(
-    log_loss, ops::LogLossXPUKernel<paddle::platform::XPUDeviceContext, float>);
-REGISTER_OP_XPU_KERNEL(
-    log_loss_grad,
-    ops::LogLossGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
+// namespace ops = paddle::operators;
+// REGISTER_OP_XPU_KERNEL(
+//     log_loss, ops::LogLossXPUKernel<paddle::platform::XPUDeviceContext,
+//     float>);
+// REGISTER_OP_XPU_KERNEL(
+//     log_loss_grad,
+//     ops::LogLossGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
 
 #endif
diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc
index 3cc1be4de8a82ff263824ab4852178f735596d45..82e4b90468a38c5b539fda9cb6f911c5080d1297 100644
--- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc
+++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/device/xpu/xpu_header.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
 
 namespace paddle {
 namespace operators {
@@ -42,68 +42,26 @@ class AccuracyXPUKernel : public framework::OpKernel<T> {
     if (num_samples == 0) {
       return;
     }
-    size_t indices_int32_size = num_samples * class_dim * sizeof(int);
-    size_t indices_int64_size = num_samples * class_dim * sizeof(int64_t);
-    size_t label_int32_size = num_samples * sizeof(int);
-    size_t label_int64_size = num_samples * sizeof(int64_t);
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    int* indices_int32_device = NULL;
-    PADDLE_ENFORCE_EQ(
-        xpu_malloc(reinterpret_cast<void**>(&indices_int32_device),
-                   indices_int32_size),
-        XPU_SUCCESS,
-        platform::errors::ResourceExhausted(
-            "\n\nOut of memory error on XPU, Cannot allocate %s memory"
-            " on XPU. \n\nPlease check whether there is any other process "
-            "using XPU.\n",
-            string::HumanReadableSize(indices_int32_size)));
-    int* label_int32_device = NULL;
-    PADDLE_ENFORCE_EQ(
-        xpu_malloc(reinterpret_cast<void**>(&label_int32_device),
-                   label_int32_size),
-        XPU_SUCCESS,
-        platform::errors::ResourceExhausted(
-            "\n\nOut of memory error on XPU, Cannot allocate %s memory"
-            " on XPU. \n\nPlease check whether there is any other process "
-            "using XPU.\n",
-            string::HumanReadableSize(label_int32_size)));
+    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+    int size = num_samples * class_dim;
+    int* indices_int32_ptr = RAII_GUARD.alloc_l3_or_gm<int>(size);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(indices_int32_ptr);
+    int* label_int32_ptr = RAII_GUARD.alloc_l3_or_gm<int>(size);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(label_int32_ptr);
 
-    int* indices_int32_host =
-        reinterpret_cast<int*>(std::malloc(indices_int32_size));
-    int64_t* indices_int64_host =
-        reinterpret_cast<int64_t*>(std::malloc(indices_int64_size));
-    int* label_int32_host =
-        reinterpret_cast<int*>(std::malloc(label_int32_size));
-    int64_t* label_int64_host =
-        reinterpret_cast<int64_t*>(std::malloc(label_int64_size));
-    dev_ctx.Wait();
-    memory::Copy(platform::CPUPlace(), indices_int64_host, ctx.GetPlace(),
-                 indices_data, indices_int64_size);
-    memory::Copy(platform::CPUPlace(), label_int64_host, ctx.GetPlace(),
-                 label_data, label_int64_size);
-    for (size_t i = 0; i < num_samples; ++i) {
-      label_int32_host[i] = label_int64_host[i];
-      for (size_t j = 0; j < class_dim; ++j) {
-        indices_int32_host[i * class_dim + j] =
-            indices_int64_host[i * class_dim + j];
-      }
-    }
-    memory::Copy(ctx.GetPlace(), indices_int32_device, platform::CPUPlace(),
-                 indices_int32_host, indices_int32_size);
-    memory::Copy(ctx.GetPlace(), label_int32_device, platform::CPUPlace(),
-                 label_int32_host, label_int32_size);
-    int r = xpu::accuracy(dev_ctx.x_context(), indices_int32_device,
-                          label_int32_device, num_samples, class_dim,
-                          correct_data, total_data, accuracy_data);
-    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                      platform::errors::Fatal("XPU accuracy kernel error!"));
-    dev_ctx.Wait();
-    xpu_free(indices_int32_device);
-    xpu_free(label_int32_device);
-    std::free(indices_int32_host);
-    std::free(indices_int64_host);
-    std::free(label_int32_host);
-    std::free(label_int64_host);
+    int r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(), indices_data,
+                                           indices_int32_ptr, size);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+    // Labels hold one int64 per sample, so cast num_samples values, not size.
+    r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(), label_data,
+                                       label_int32_ptr, num_samples);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+    // Report a failure of the accuracy kernel under its own name.
+    r = xpu::accuracy(dev_ctx.x_context(), indices_int32_ptr, label_int32_ptr,
+                      num_samples, class_dim, correct_data, total_data,
+                      accuracy_data);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "accuracy");
   }
 };
 
diff --git a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc
index e7cbe4aa8dd4b36983dca5413ccdcb8ceac63a3c..643f70b260206c786ce7c6782ab9abd2f76a6de5 100644
--- a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc
@@ -25,101 +25,111 @@ template <typename DeviceContext, typename T>
 class LambOpXPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    using paddle::framework::LoDTensor;
-    const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
-                      platform::errors::InvalidArgument(
-                          "The Var(%s)'s type should be LoDTensor, "
-                          "but the received is %s",
-                          ctx.InputNames("Param").front(),
-                          framework::ToTypeName(param_var->Type())));
+    /*** TODO: re-enable once the new XDNN interface is available.
+        using paddle::framework::LoDTensor;
+        const auto* param_var = ctx.InputVar("Param");
+        PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
+                          platform::errors::InvalidArgument(
+                              "The Var(%s)'s type should be LoDTensor, "
+                              "but the received is %s",
+                              ctx.InputNames("Param").front(),
+                              framework::ToTypeName(param_var->Type())));
 
-    using paddle::framework::LoDTensor;
+        using paddle::framework::LoDTensor;
 
-    // inputs
-    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
-    T weight_decay = static_cast<T>(ctx.Attr<float>("weight_decay"));
-    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
-    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
-    auto& param = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Param"), "Input",
-                                  "Param", "Lamb");
-    auto* grad_var = ctx.InputVar("Grad");
-    auto& mom1 = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Moment1"), "Input",
-                                 "Moment1", "Lamb");
-    auto& mom2 = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Moment2"), "Input",
-                                 "Moment2", "Lamb");
-    auto& lr = GET_DATA_SAFELY(ctx.Input<LoDTensor>("LearningRate"), "Input",
-                               "LearningRate", "Lamb");
+        // inputs
+        T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
+        T weight_decay = static_cast<T>(ctx.Attr<float>("weight_decay"));
+        T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
+        T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
+        auto& param = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Param"), "Input",
+                                      "Param", "Lamb");
+        auto* grad_var = ctx.InputVar("Grad");
+        auto& mom1 = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Moment1"), "Input",
+                                     "Moment1", "Lamb");
+        auto& mom2 = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Moment2"), "Input",
+                                     "Moment2", "Lamb");
+        auto& lr = GET_DATA_SAFELY(ctx.Input<LoDTensor>("LearningRate"),
+    "Input",
+                                   "LearningRate", "Lamb");
 
-    auto& beta1_pow = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Beta1Pow"), "Input",
-                                      "Beta1Pow", "Lamb");
-    auto& beta2_pow = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Beta2Pow"), "Input",
-                                      "Beta2Pow", "Lamb");
+        auto& beta1_pow = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Beta1Pow"),
+    "Input",
+                                          "Beta1Pow", "Lamb");
+        auto& beta2_pow = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Beta2Pow"),
+    "Input",
+                                          "Beta2Pow", "Lamb");
 
-    auto& param_out = GET_DATA_SAFELY(ctx.Output<LoDTensor>("ParamOut"),
-                                      "Output", "ParamOut", "Lamb");
-    auto& mom1_out = GET_DATA_SAFELY(ctx.Output<LoDTensor>("Moment1Out"),
-                                     "Output", "Moment1Out", "Lamb");
-    auto& mom2_out = GET_DATA_SAFELY(ctx.Output<LoDTensor>("Moment2Out"),
-                                     "Output", "Moment2Out", "Lamb");
-    auto& beta1_pow_out = GET_DATA_SAFELY(ctx.Output<LoDTensor>("Beta1PowOut"),
-                                          "Output", "Beta1PowOut", "Lamb");
-    auto& beta2_pow_out = GET_DATA_SAFELY(ctx.Output<LoDTensor>("Beta2PowOut"),
-                                          "Output", "Beta2PowOut", "Lamb");
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+        auto& param_out = GET_DATA_SAFELY(ctx.Output<LoDTensor>("ParamOut"),
+                                          "Output", "ParamOut", "Lamb");
+        auto& mom1_out = GET_DATA_SAFELY(ctx.Output<LoDTensor>("Moment1Out"),
+                                         "Output", "Moment1Out", "Lamb");
+        auto& mom2_out = GET_DATA_SAFELY(ctx.Output<LoDTensor>("Moment2Out"),
+                                         "Output", "Moment2Out", "Lamb");
+        auto& beta1_pow_out =
+    GET_DATA_SAFELY(ctx.Output<LoDTensor>("Beta1PowOut"),
+                                              "Output", "Beta1PowOut", "Lamb");
+        auto& beta2_pow_out =
+    GET_DATA_SAFELY(ctx.Output<LoDTensor>("Beta2PowOut"),
+                                              "Output", "Beta2PowOut", "Lamb");
+        auto& dev_ctx = ctx.template device_context<DeviceContext>();
 
-    if (grad_var->IsType<framework::LoDTensor>()) {
-      auto& grad = *ctx.Input<LoDTensor>("Grad");
-      int r = xpu::lamb(dev_ctx.x_context(), grad.template data<T>(),
-                        mom1.template data<T>(), mom2.template data<T>(),
-                        param.template data<T>(), beta1_pow.template data<T>(),
-                        beta2_pow.template data<T>(), beta1, beta2, epsilon,
-                        weight_decay, lr.template data<T>(),
-                        mom1_out.template mutable_data<T>(ctx.GetPlace()),
-                        mom2_out.template mutable_data<T>(ctx.GetPlace()),
-                        param_out.template mutable_data<T>(ctx.GetPlace()),
-                        beta1_pow_out.template mutable_data<T>(ctx.GetPlace()),
-                        beta2_pow_out.template mutable_data<T>(ctx.GetPlace()),
-                        param.numel());
+        if (grad_var->IsType<framework::LoDTensor>()) {
+          auto& grad = *ctx.Input<LoDTensor>("Grad");
+          int r = xpu::lamb(dev_ctx.x_context(), grad.template data<T>(),
+                            mom1.template data<T>(), mom2.template data<T>(),
+                            param.template data<T>(), beta1_pow.template
+    data<T>(),
+                            beta2_pow.template data<T>(), beta1, beta2, epsilon,
+                            weight_decay, lr.template data<T>(),
+                            mom1_out.template mutable_data<T>(ctx.GetPlace()),
+                            mom2_out.template mutable_data<T>(ctx.GetPlace()),
+                            param_out.template mutable_data<T>(ctx.GetPlace()),
+                            beta1_pow_out.template
+    mutable_data<T>(ctx.GetPlace()),
+                            beta2_pow_out.template
+    mutable_data<T>(ctx.GetPlace()),
+                            param.numel());
 
-      if (r == xpu::Error_t::INVALID_PARAM) {
-        PADDLE_ENFORCE_EQ(
-            r, xpu::Error_t::SUCCESS,
-            platform::errors::InvalidArgument(
-                "XPU kernel error of LambOp, error message: INVALID_PARAM, "
-                "please check your input & output."));
-      } else if (r == xpu::Error_t::RUNTIME_ERROR) {
-        PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                          platform::errors::Unavailable(
-                              "XPU kernel error of LambOp, error message: "
-                              "RUNTIME_ERROR, please check whether Baidu "
-                              "Kunlun Card is properly installed."));
-      } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) {
-        PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                          platform::errors::ResourceExhausted(
-                              "XPU kernel error of LambOp, error "
-                              "message: NO_ENOUGH_WORKSPACE, XPU "
-                              "has no enough memory."));
-      } else {
-        PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                          platform::errors::ResourceExhausted(
-                              "XPU kernel error of LambOp, error "
-                              "message: OTHER "
-                              "XPU API returns error code: %d.",
-                              r));
-      }
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Variable type not supported by lamb_op. Expect LoDTensor, "
-          "but got %s",
-          framework::ToTypeName(param_var->Type())));
-    }
+          if (r == xpu::Error_t::INVALID_PARAM) {
+            PADDLE_ENFORCE_EQ(
+                r, xpu::Error_t::SUCCESS,
+                platform::errors::InvalidArgument(
+                    "XPU kernel error of LambOp, error message: INVALID_PARAM, "
+                    "please check your input & output."));
+          } else if (r == xpu::Error_t::RUNTIME_ERROR) {
+            PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                              platform::errors::Unavailable(
+                                  "XPU kernel error of LambOp, error message: "
+                                  "RUNTIME_ERROR, please check whether Baidu "
+                                  "Kunlun Card is properly installed."));
+          } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) {
+            PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                              platform::errors::ResourceExhausted(
+                                  "XPU kernel error of LambOp, error "
+                                  "message: NO_ENOUGH_WORKSPACE, XPU "
+                                  "has no enough memory."));
+          } else {
+            PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                              platform::errors::ResourceExhausted(
+                                  "XPU kernel error of LambOp, error "
+                                  "message: OTHER "
+                                  "XPU API returns error code: %d.",
+                                  r));
+          }
+        } else {
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "Variable type not supported by lamb_op. Expect LoDTensor, "
+              "but got %s",
+              framework::ToTypeName(param_var->Type())));
+        }
+    ***/
   }
 };
 }  // namespace operators
 }  // namespace paddle
 
-namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(
-    lamb, ops::LambOpXPUKernel<paddle::platform::XPUDeviceContext, float>);
+// namespace ops = paddle::operators;
+// REGISTER_OP_XPU_KERNEL(
+//     lamb, ops::LambOpXPUKernel<paddle::platform::XPUDeviceContext, float>);
 #endif
diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc
index 85c2d42c841f020e44994546ea3dafb86de0c8f8..873056c7f67fe12aa285d2280072df82e90e8e31 100644
--- a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc
@@ -40,113 +40,122 @@ template <typename DeviceContext, typename T>
 class RmspropOpXPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    using paddle::framework::LoDTensor;
-
-    // check Param & Grad tensor type
-    const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE_EQ(param_var->IsType<LoDTensor>(), true,
-                      platform::errors::InvalidArgument(
-                          "Tensor holds the wrong type，Expected Var(%s)'s "
-                          "type is LoDTensor, "
-                          "but the received is %s",
-                          ctx.InputNames("Param").front(),
-                          framework::ToTypeName(param_var->Type())));
-
-    const auto* grad_var = ctx.InputVar("Grad");
-    PADDLE_ENFORCE_EQ(grad_var->IsType<LoDTensor>(), true,
-                      platform::errors::InvalidArgument(
-                          "Tensor holds the wrong type，Expected Var(%s)'s "
-                          "type is LoDTensor, "
-                          "but the received is %s",
-                          ctx.InputNames("Grad").front(),
-                          framework::ToTypeName(grad_var->Type())));
-
-    // inputs
-    auto& param = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Param"), "Input",
-                                  "Param", "Rmsprop");
-    auto& meanSquare = GET_DATA_SAFELY(ctx.Input<LoDTensor>("MeanSquare"),
-                                       "Input", "MeanSquare", "Rmsprop");
-    auto& grad = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Grad"), "Input", "Grad",
-                                 "Rmsprop");
-    auto& mom = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Moment"), "Input",
-                                "Moment", "Rmsprop");
-
-    auto* learning_rate = ctx.Input<Tensor>("LearningRate");
-    PADDLE_ENFORCE_EQ(learning_rate->dims().size(), 1,
-                      platform::errors::InvalidArgument(
-                          "learining rate should have dimension = 1."
-                          " But received learning rate dim [%s] ",
-                          learning_rate->dims().size()));
-    T lr = static_cast<T>(GetAttrFromTensor(learning_rate));
-
-    // constants
-    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
-    T decay = static_cast<T>(ctx.Attr<float>("decay"));
-    T momentum = static_cast<T>(ctx.Attr<float>("momentum"));
-
-    // outputs
-    auto& param_out = GET_DATA_SAFELY(ctx.Output<LoDTensor>("ParamOut"),
-                                      "Output", "ParamOut", "Rmsprop");
-    auto& mom_out = GET_DATA_SAFELY(ctx.Output<LoDTensor>("MomentOut"),
-                                    "Output", "MomentOut", "Rmsprop");
-    auto& mom_sqrt_out = GET_DATA_SAFELY(ctx.Output<LoDTensor>("MeanSquareOut"),
-                                         "Output", "MeanSquareOut", "Rmsprop");
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    ///// rmsprop优化算法
-    ///
-    /// ms_out[i] = rho * ms[i] + (1 - rho) * (g[i] * g[i]);
-    ///
-    /// mom_out[i] = momentum * mom[i] + lr *
-    /// (g[i] / ((float)sqrt(ms_out[i] + epsilon)));
-    ///
-    /// p_out[i] = p[i] - mom_out[i];
-    /// DLL_EXPORT int rmsprop(Context* ctx, const float* p,
-    /// const float* ms, const float* g, const float* mom,
-    /// float epsilon, float rho, float momentum, float lr,
-    /// float *ms_out, float *mom_out, float *p_out, int n)
-    int r = xpu::rmsprop(dev_ctx.x_context(), param.template data<T>(),
-                         meanSquare.template data<T>(), grad.template data<T>(),
-                         mom.template data<T>(), epsilon, decay, momentum, lr,
-                         mom_sqrt_out.template mutable_data<T>(ctx.GetPlace()),
-                         mom_out.template mutable_data<T>(ctx.GetPlace()),
-                         param_out.template mutable_data<T>(ctx.GetPlace()),
-                         param.numel());
-
-    if (r == xpu::Error_t::INVALID_PARAM) {
-      PADDLE_ENFORCE_EQ(
-          r, xpu::Error_t::SUCCESS,
-          platform::errors::InvalidArgument(
-              "XPU kernel error of RmspropOp, error message: INVALID_PARAM, "
-              "please check your input & output."));
-    } else if (r == xpu::Error_t::RUNTIME_ERROR) {
-      PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                        platform::errors::Unavailable(
-                            "XPU kernel error of RmspropOp, error message: "
-                            "RUNTIME_ERROR, please check whether Baidu "
-                            "Kunlun Card is properly installed."));
-    } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) {
-      PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                        platform::errors::ResourceExhausted(
-                            "XPU kernel error of RmspropOp, error "
-                            "message: NO_ENOUGH_WORKSPACE, XPU "
-                            "has no enough memory."));
-    } else {
-      PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                        platform::errors::ResourceExhausted(
-                            "XPU kernel error of RmspropOp, error "
-                            "message: OTHER "
-                            "XPU API returns error code: %d.",
-                            r));
-    }
+    /*** TODO: re-enable once the new XDNN interface is available.
+        using paddle::framework::LoDTensor;
+
+        // check Param & Grad tensor type
+        const auto* param_var = ctx.InputVar("Param");
+        PADDLE_ENFORCE_EQ(param_var->IsType<LoDTensor>(), true,
+                          platform::errors::InvalidArgument(
+                              "Tensor holds the wrong type，Expected Var(%s)'s "
+                              "type is LoDTensor, "
+                              "but the received is %s",
+                              ctx.InputNames("Param").front(),
+                              framework::ToTypeName(param_var->Type())));
+
+        const auto* grad_var = ctx.InputVar("Grad");
+        PADDLE_ENFORCE_EQ(grad_var->IsType<LoDTensor>(), true,
+                          platform::errors::InvalidArgument(
+                              "Tensor holds the wrong type，Expected Var(%s)'s "
+                              "type is LoDTensor, "
+                              "but the received is %s",
+                              ctx.InputNames("Grad").front(),
+                              framework::ToTypeName(grad_var->Type())));
+
+        // inputs
+        auto& param = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Param"), "Input",
+                                      "Param", "Rmsprop");
+        auto& meanSquare = GET_DATA_SAFELY(ctx.Input<LoDTensor>("MeanSquare"),
+                                           "Input", "MeanSquare", "Rmsprop");
+        auto& grad = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Grad"), "Input",
+    "Grad",
+                                     "Rmsprop");
+        auto& mom = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Moment"), "Input",
+                                    "Moment", "Rmsprop");
+
+        auto* learning_rate = ctx.Input<Tensor>("LearningRate");
+        PADDLE_ENFORCE_EQ(learning_rate->dims().size(), 1,
+                          platform::errors::InvalidArgument(
+                              "learining rate should have dimension = 1."
+                              " But received learning rate dim [%s] ",
+                              learning_rate->dims().size()));
+        T lr = static_cast<T>(GetAttrFromTensor(learning_rate));
+
+        // constants
+        T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
+        T decay = static_cast<T>(ctx.Attr<float>("decay"));
+        T momentum = static_cast<T>(ctx.Attr<float>("momentum"));
+
+        // outputs
+        auto& param_out = GET_DATA_SAFELY(ctx.Output<LoDTensor>("ParamOut"),
+                                          "Output", "ParamOut", "Rmsprop");
+        auto& mom_out = GET_DATA_SAFELY(ctx.Output<LoDTensor>("MomentOut"),
+                                        "Output", "MomentOut", "Rmsprop");
+        auto& mom_sqrt_out =
+    GET_DATA_SAFELY(ctx.Output<LoDTensor>("MeanSquareOut"),
+                                             "Output", "MeanSquareOut",
+    "Rmsprop");
+        auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+        ///// rmsprop optimization algorithm
+        ///
+        /// ms_out[i] = rho * ms[i] + (1 - rho) * (g[i] * g[i]);
+        ///
+        /// mom_out[i] = momentum * mom[i] + lr *
+        /// (g[i] / ((float)sqrt(ms_out[i] + epsilon)));
+        ///
+        /// p_out[i] = p[i] - mom_out[i];
+        /// DLL_EXPORT int rmsprop(Context* ctx, const float* p,
+        /// const float* ms, const float* g, const float* mom,
+        /// float epsilon, float rho, float momentum, float lr,
+        /// float *ms_out, float *mom_out, float *p_out, int n)
+        int r = xpu::rmsprop(dev_ctx.x_context(), param.template data<T>(),
+                             meanSquare.template data<T>(), grad.template
+    data<T>(),
+                             mom.template data<T>(), epsilon, decay, momentum,
+    lr,
+                             mom_sqrt_out.template
+    mutable_data<T>(ctx.GetPlace()),
+                             mom_out.template mutable_data<T>(ctx.GetPlace()),
+                             param_out.template mutable_data<T>(ctx.GetPlace()),
+                             param.numel());
+
+        if (r == xpu::Error_t::INVALID_PARAM) {
+          PADDLE_ENFORCE_EQ(
+              r, xpu::Error_t::SUCCESS,
+              platform::errors::InvalidArgument(
+                  "XPU kernel error of RmspropOp, error message: INVALID_PARAM,
+    "
+                  "please check your input & output."));
+        } else if (r == xpu::Error_t::RUNTIME_ERROR) {
+          PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                            platform::errors::Unavailable(
+                                "XPU kernel error of RmspropOp, error message: "
+                                "RUNTIME_ERROR, please check whether Baidu "
+                                "Kunlun Card is properly installed."));
+        } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) {
+          PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                            platform::errors::ResourceExhausted(
+                                "XPU kernel error of RmspropOp, error "
+                                "message: NO_ENOUGH_WORKSPACE, XPU "
+                                "has no enough memory."));
+        } else {
+          PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                            platform::errors::ResourceExhausted(
+                                "XPU kernel error of RmspropOp, error "
+                                "message: OTHER "
+                                "XPU API returns error code: %d.",
+                                r));
+        }
+    ***/
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
 
-namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(
-    rmsprop,
-    ops::RmspropOpXPUKernel<paddle::platform::XPUDeviceContext, float>);
+// namespace ops = paddle::operators;
+// REGISTER_OP_XPU_KERNEL(
+//     rmsprop,
+//     ops::RmspropOpXPUKernel<paddle::platform::XPUDeviceContext, float>);
 #endif
diff --git a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc
index 9dabca1b66a771ed62431715e7a69d285774297e..e7c03be95cae1e1cfb01ab5ec42252f1e888e55e 100644
--- a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc
@@ -14,11 +14,15 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/operators/optimizers/sgd_op.h"
 #include <string>
+#include "paddle/fluid/platform/device/device_wrapper.h"
+
 namespace paddle {
 namespace operators {
 
 template <typename DeviceContext, typename T>
 class SGDOpXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
@@ -48,40 +52,31 @@ class SGDOpXPUKernel : public framework::OpKernel<T> {
                             "numel = [%s], ParamOut's numel = [%s]",
                             grad->numel(), sz));
 
-      const T *lr = learning_rate->data<T>();
+      const T *lr_t = learning_rate->data<T>();
+      auto &dev_ctx = ctx.template device_context<DeviceContext>();
+      xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+      const float *lr = nullptr;  // handed to xpu::sgd as fp32 below
+      if (std::is_same<T, paddle::platform::float16>::value) {
+        float *lr_float =
+            RAII_GUARD.alloc_l3_or_gm<float>(learning_rate->numel());
+        int r = xpu::cast_v2<XPUType, float>(
+            dev_ctx.x_context(), reinterpret_cast<const XPUType *>(lr_t),
+            lr_float, learning_rate->numel());
+        PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+        lr = lr_float;
+      } else {  // non-fp16 T: use the learning-rate buffer directly
+        lr = reinterpret_cast<const float *>(lr_t);
+      }
+
       const T *param_data = param->data<T>();
       const T *grad_data = grad->data<T>();
       T *out_data = param_out->mutable_data<T>(ctx.GetPlace());
 
-      auto &dev_ctx = ctx.template device_context<DeviceContext>();
-      int r = xpu::sgd(dev_ctx.x_context(), sz, grad_data, param_data, lr,
-                       out_data);
-      if (r == xpu::Error_t::INVALID_PARAM) {
-        PADDLE_ENFORCE_EQ(
-            r, xpu::Error_t::SUCCESS,
-            platform::errors::InvalidArgument(
-                "XPU kernel error of SgdOp, error message: INVALID_PARAM, "
-                "please check your input & output."));
-      } else if (r == xpu::Error_t::RUNTIME_ERROR) {
-        PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                          platform::errors::Unavailable(
-                              "XPU kernel error of SgdOp, error message: "
-                              "RUNTIME_ERROR, please check whether Baidu "
-                              "Kunlun Card is properly installed."));
-      } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) {
-        PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                          platform::errors::ResourceExhausted(
-                              "XPU kernel error of SgdOp, error "
-                              "message: NO_ENOUGH_WORKSPACE, XPU "
-                              "has no enough memory."));
-      }
-    } else {
-      PADDLE_ENFORCE_EQ(false, true,
-                        platform::errors::PermissionDenied(
-                            "Unsupported Variable Type of Param & Grad in "
-                            "SgdOp-XPU. Excepted "
-                            "LodTensor, But received [%s] and [%s]",
-                            paddle::framework::ToTypeName(param_var->Type())));
+      int r = xpu::sgd(dev_ctx.x_context(),
+                       reinterpret_cast<const XPUType *>(grad_data),
+                       reinterpret_cast<const XPUType *>(param_data), lr,
+                       reinterpret_cast<XPUType *>(out_data), sz);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "sgd");
     }
   }
 };
@@ -90,6 +85,8 @@ class SGDOpXPUKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_XPU_KERNEL(
-    sgd, ops::SGDOpXPUKernel<paddle::platform::XPUDeviceContext, float>);
+    sgd, ops::SGDOpXPUKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::SGDOpXPUKernel<paddle::platform::XPUDeviceContext, plat::float16>);
 #endif
diff --git a/paddle/fluid/platform/device/xpu/xpu1_op_list.h b/paddle/fluid/platform/device/xpu/xpu1_op_list.h
index a76bdd4ae967987748abe4aefa144ce3ac83a545..e8c3eee5b538ba326986e78148aa6a18f7bb392e 100644
--- a/paddle/fluid/platform/device/xpu/xpu1_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu1_op_list.h
@@ -145,7 +145,6 @@ XPUOpMap& get_kl1_ops() {
       {"hard_switch", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"iou_similarity",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
-      {"lamb", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"layer_norm_grad",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
@@ -175,9 +174,6 @@ XPUOpMap& get_kl1_ops() {
                                   pOpKernelType(vartype::INT32, XPUPlace()),
                                   pOpKernelType(vartype::INT64, XPUPlace()),
                                   pOpKernelType(vartype::FP32, XPUPlace())})},
-      {"log_loss_grad",
-       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
-      {"log_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"logsumexp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"lookup_table_v2_grad",
@@ -236,7 +232,6 @@ XPUOpMap& get_kl1_ops() {
                                  pOpKernelType(vartype::INT32, XPUPlace()),
                                  pOpKernelType(vartype::BOOL, XPUPlace()),
                                  pOpKernelType(vartype::FP32, XPUPlace())})},
-      {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"roi_align_grad",
diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
index 0dcab845bc9ca1b9d4dc7ae02e8f9b4c63ac4d83..99f8e5ace9c0088cd304bc3735ceb1696984dc3a 100644
--- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -328,6 +328,8 @@ XPUOpMap& get_kl2_ops() {
                               pOpKernelType(vartype::INT64, XPUPlace())})},
       {"scatter", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                                 pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                            pOpKernelType(vartype::FP16, XPUPlace())})},
       {"sigmoid_cross_entropy_with_logits_grad",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"sigmoid_cross_entropy_with_logits",
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py
index 7aaa78856811f260eb27663026f1c7ed4c3301a0..b0bb9a37c16bd70c28b548203202be7015ed6243 100755
--- a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py
@@ -23,41 +23,52 @@ import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
 import paddle
 
+from op_test_xpu import XPUOpTest
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
+
 paddle.enable_static()
 
 
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestXPUAccuracyOp(OpTest):
-    def setUp(self):
-        self.op_type = "accuracy"
-        self.init_dtype()
-        n = 8192
-        infer = np.random.random((n, 1)).astype(self.dtype)
-        indices = np.random.randint(0, 2, (n, 1)).astype('int64')
-        label = np.random.randint(0, 2, (n, 1)).astype('int64')
-        self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
-        num_correct = 0
-        for rowid in range(n):
-            for ele in indices[rowid]:
-                if ele == label[rowid]:
-                    num_correct += 1
-                    break
-        self.outputs = {
-            'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype),
-            'Correct': np.array([num_correct]).astype("int32"),
-            'Total': np.array([n]).astype("int32")
-        }
-        self.attrs = {'use_xpu': True}
-
-    def init_dtype(self):
-        self.dtype = np.float32
-
-    def test_check_output(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_output_with_place(place)
+class XPUTestAccuracyOp(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'accuracy'
+        self.use_dynamic_create_class = False
+
+    class TestXPUAccuracyOp(XPUOpTest):
+        def setUp(self):
+            self.op_type = "accuracy"
+            self.init_dtype()
+            n = 8192
+            infer = np.random.random((n, 1)).astype(self.dtype)
+            indices = np.random.randint(0, 2, (n, 1)).astype('int64')
+            label = np.random.randint(0, 2, (n, 1)).astype('int64')
+            self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
+            num_correct = 0
+            for rowid in range(n):
+                for ele in indices[rowid]:
+                    if ele == label[rowid]:
+                        num_correct += 1
+                        break
+            self.outputs = {
+                'Accuracy':
+                np.array([num_correct / float(n)]).astype(self.dtype),
+                'Correct': np.array([num_correct]).astype("int32"),
+                'Total': np.array([n]).astype("int32")
+            }
+            self.attrs = {'use_xpu': True}
+
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def test_check_output(self):
+            if paddle.is_compiled_with_xpu():
+                place = paddle.XPUPlace(0)
+                self.check_output_with_place(place)
+
 
+support_types = get_xpu_op_support_types('accuracy')
+for stype in support_types:
+    create_test_class(globals(), XPUTestAccuracyOp, stype)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py
index c29150ef921c2dc3c9d94ca767c5f1263c15b00d..67fd9f871207b2fcdc74e57a6223ee9904dcc2ce 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py
@@ -25,30 +25,43 @@ import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid.op import Operator
 
+from op_test_xpu import XPUOpTest
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
 
-class TestSGDOp(OpTest):
-    def setUp(self):
-        self.op_type = "sgd"
-        self.conf()
-        w = np.random.random((self.h, self.w)).astype("float32")
-        g = np.random.random((self.h, self.w)).astype("float32")
-        lr = np.array([0.1]).astype("float32")
 
-        self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr}
-        self.outputs = {'ParamOut': w - lr * g}
+class XPUTestSgdOp(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'sgd'
+        self.use_dynamic_create_class = False
 
-    def conf(self):
-        self.h = 102
-        self.w = 105
+    class TestSGDOp(XPUOpTest):
+        def setUp(self):
+            self.op_type = "sgd"
+            self.dtype = self.in_type
+            self.conf()
+            w = np.random.random((self.h, self.w)).astype(self.dtype)
+            g = np.random.random((self.h, self.w)).astype(self.dtype)
+            lr = np.array([0.1]).astype(self.dtype)
 
-    def test_check_output_with_place(self):
-        self.check_output_with_place(paddle.XPUPlace(0))
+            self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr}
+            self.outputs = {'ParamOut': w - lr * g}
 
+        def conf(self):
+            self.h = 102
+            self.w = 105
 
-class TestSGDOpCase8X(TestSGDOp):
-    def conf(self):
-        self.h = 10
-        self.w = 64
+        def test_check_output_with_place(self):
+            self.check_output_with_place(paddle.XPUPlace(0))
+
+    class TestSGDOpCase8X(TestSGDOp):
+        def conf(self):
+            self.h = 10
+            self.w = 64
+
+
+support_types = get_xpu_op_support_types('sgd')
+for stype in support_types:
+    create_test_class(globals(), XPUTestSgdOp, stype)
 
 
 class TestSGDOpWithLargeInput(unittest.TestCase):