Unverified commit 6e154fc6, authored by Chenxiao Niu, committed by GitHub

[MLU] cherry-pick from develop to release/2.4 (#48313)

* [MLU] fix compute error of dropout op (#45923)

* [MLU] add mergedAdam kernel. (#45965)

* [MLU] add int64 support for mlu one_hot_v2 (#46313)

* [MLU] fix profiler compile failure (#46208)

* [MLU] add barrier_op kernel. (#46417)

* [MLU] fluid: add mluop (#46429)

* [MLU] add huber_loss kernel. (#46455)

* [MLU] add mlu kernel for add_reduce_max_grad (#45651)
Co-authored-by: liupeiyu <liupeiyu@cambricon.com>

* [MLU] add_fluid_mluop_yolo_box (#46573)

* [MLU] fix phi::Tensor compile error of mlu. (#46649)

* [MLU] add fluid MLUOps prior_box (#46585)

* [MLU] fix cmake error (#46772)

* [MLU] fix unittest of sync_bn (#46797)

* [MLU] add masterparam support for mlu adamw. (#46804)

* [MLU] add int64 support for allgather. (#46830)

* [MLU] fix compile error & add mlu blacklist function. (#47439)

* [MLU] fix softmax_with_cross_entropy failing on 370-X8.

* [MLU] fix CNCL hang caused by multiple initializations.

* [MLU] fix code style check.
Co-authored-by: qipengh <huangqipeng@cambricon.com>
Co-authored-by: cifar10 <41565156+cifar10@users.noreply.github.com>
Co-authored-by: Lux et Veritas <1004239791@qq.com>
Co-authored-by: liupeiyu <liupeiyu@cambricon.com>
Co-authored-by: ronnywang <ronny1996@163.com>
Parent 96e974a0
......@@ -15,12 +15,14 @@ set(NEUWARE_LIB_DIR ${NEUWARE_HOME}/lib64)
include_directories(${NEUWARE_INCLUDE_DIR})
set(CNNL_LIB ${NEUWARE_LIB_DIR}/libcnnl.so)
set(MLUOP_LIB ${NEUWARE_LIB_DIR}/libmluops.so)
set(CNRT_LIB ${NEUWARE_LIB_DIR}/libcnrt.so)
set(CNDRV_LIB ${NEUWARE_LIB_DIR}/libcndrv.so)
set(CNPAPI_LIB ${NEUWARE_LIB_DIR}/libcnpapi.so)
generate_dummy_static_lib(LIB_NAME "neuware_lib" GENERATOR "neuware.cmake")
set(NEUWARE_LIB_DEPS ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB} ${CNPAPI_LIB})
set(NEUWARE_LIB_DEPS ${CNNL_LIB} ${MLUOP_LIB} ${CNRT_LIB} ${CNDRV_LIB}
${CNPAPI_LIB})
if(WITH_CNCL)
message(STATUS "Compile with CNCL!")
......
......@@ -146,6 +146,48 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
kernel_signature_(std::move(kernel_signature)),
phi_kernel_(phi_kernel) {}
#ifdef PADDLE_WITH_MLU
static void tokenize(const std::string& ops,
char delim,
std::unordered_set<std::string>* op_set) {
std::string::size_type beg = 0;
for (uint64_t end = 0; (end = ops.find(delim, end)) != std::string::npos;
++end) {
op_set->insert(ops.substr(beg, end - beg));
beg = end + 1;
}
op_set->insert(ops.substr(beg));
}
static bool is_in_mlu_black_list(const std::string& op_name) {
static bool inited = false;
static std::unordered_set<std::string> mlu_black_list;
static std::mutex s_mtx;
if (!inited) {
std::lock_guard<std::mutex> guard(s_mtx);
if (!inited) {
if (std::getenv("MLU_BLACK_LIST") != nullptr) {
std::string ops(std::getenv("MLU_BLACK_LIST"));
tokenize(ops, ',', &mlu_black_list);
}
inited = true;
VLOG(3) << "MLU Black List: ";
for (auto iter = mlu_black_list.begin(); iter != mlu_black_list.end();
++iter) {
VLOG(3) << *iter << " ";
}
}
}
if (mlu_black_list.find(op_name) != mlu_black_list.end()) {
return true;
}
return false;
}
#endif
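For reference, MLU_BLACK_LIST is read once, split on commas, and any op whose type name lands in the set is forced onto the CPU in PrepareImpl below. A minimal standalone sketch of the same splitting logic (hypothetical demo, not part of the patch):

#include <cstdlib>
#include <iostream>
#include <string>
#include <unordered_set>

// Splits "op1,op2,op3" into a set, mirroring tokenize() above.
static std::unordered_set<std::string> ParseBlackList(const std::string& ops) {
  std::unordered_set<std::string> out;
  std::string::size_type beg = 0, end = 0;
  while ((end = ops.find(',', end)) != std::string::npos) {
    out.insert(ops.substr(beg, end - beg));
    beg = ++end;
  }
  out.insert(ops.substr(beg));  // token after the last comma
  return out;
}

int main() {
  // e.g. export MLU_BLACK_LIST="softmax_with_cross_entropy,dropout"
  if (const char* env = std::getenv("MLU_BLACK_LIST")) {
    for (const auto& op : ParseBlackList(env)) std::cout << op << "\n";
  }
  return 0;
}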
template <typename VarType>
PreparedOp PrepareImpl(
const NameVarMap<VarType>& ins,
......@@ -194,6 +236,12 @@ PreparedOp PrepareImpl(
#endif
#ifdef PADDLE_WITH_MLU
if (is_in_mlu_black_list(op.Type())) {
expected_kernel_key.place_ = platform::CPUPlace();
}
#endif
bool has_phi_kernel = false;
const auto* arg_map_fn = phi_op_utils_map.GetArgumentMappingFn(op.Type());
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/barrier_op.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class BarrierOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CNCL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
cnclDataType_t dtype =
platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype()));
int64_t numel = in->numel();
const void* sendbuff = in->data();
void* recvbuff = out->mutable_data<T>(place);
int rid = ctx.Attr<int>("ring_id");
auto cncl_comm = platform::CNCLCommContext::Instance().Get(rid, place);
auto* comm = cncl_comm->comm();
auto comm_stream = cncl_comm->stream();
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
cnclReduceOp_t cncl_red_type = cnclSum;
dev_ctx.Wait();
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(
sendbuff, recvbuff, numel, dtype, cncl_red_type, comm, comm_stream));
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
#else
PADDLE_THROW(platform::errors::Unavailable(
"PaddlePaddle should compile with CNCL."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(barrier, ops::BarrierOpMLUKernel<int>);
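The kernel realizes a barrier by all-reducing a one-element tensor and then syncing the queue: no rank's cnrtQueueSync can return before every rank has entered the all-reduce. A conceptual sketch of that rendezvous, with C++20 threads standing in for ranks (illustration only, nothing CNCL-specific):

#include <iostream>
#include <latch>
#include <thread>
#include <vector>

int main() {
  const int nranks = 4;
  std::latch all_arrived(nranks);  // plays the role of the all-reduce rendezvous
  std::vector<std::thread> ranks;
  for (int r = 0; r < nranks; ++r) {
    ranks.emplace_back([&all_arrived, r] {
      all_arrived.arrive_and_wait();  // no rank proceeds until all have entered
      std::cout << "rank " << r << " passed barrier\n";
    });
  }
  for (auto& t : ranks) t.join();
  return 0;
}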
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
......@@ -27,15 +28,14 @@ template <typename T>
class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto place = ctx.GetPlace();
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
#if defined(PADDLE_WITH_CNCL)
auto x = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
cnclDataType_t dtype =
platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));
auto x = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
int nranks = ctx.Attr<int>("nranks");
int rid = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
PADDLE_ENFORCE_EQ(
nranks,
......@@ -48,19 +48,56 @@ class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(out_dims, place);
uint32_t send_numel = x->numel();
void* send_buff = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
void* recv_buff = reinterpret_cast<void*>(out->data<T>());
void* send_buff;
void* recv_buff;
phi::DenseTensor in_tensor, out_tensor;
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
// cast from int64 to int32, since CNCL does not support int64
in_tensor.mutable_data<int32_t>(x->dims(), place);
out_tensor.mutable_data<int32_t>(out->dims(), place);
MLUCnnlTensorDesc x_int64_desc(*x);
MLUCnnlTensorDesc x_int32_desc(in_tensor);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT64, VT::INT32);
MLUCnnl::Cast(ctx,
cast_type,
x_int64_desc.get(),
GetBasePtr(x),
x_int32_desc.get(),
GetBasePtr(&in_tensor));
send_buff = reinterpret_cast<void*>(in_tensor.data<int32_t>());
recv_buff = reinterpret_cast<void*>(out_tensor.data<int32_t>());
} else {
in_tensor.ShareDataWith(*x);
out_tensor.ShareDataWith(*out);
send_buff = reinterpret_cast<void*>(in_tensor.data<T>());
recv_buff = reinterpret_cast<void*>(out_tensor.data<T>());
}
mluStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
cnclDataType_t dtype = platform::ToCNCLDataType(
framework::TransToProtoVarType(in_tensor.dtype()));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(
send_buff, recv_buff, send_numel, dtype, comm->comm(), stream));
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
// cast the int32 out_tensor back to the int64 out
MLUCnnlTensorDesc out_int64_desc(*out);
MLUCnnlTensorDesc out_int32_desc(out_tensor);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(ctx,
cast_type,
out_int32_desc.get(),
GetBasePtr(&out_tensor),
out_int64_desc.get(),
GetBasePtr(out));
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU."));
......@@ -80,4 +117,5 @@ REGISTER_OP_MLU_KERNEL(c_allgather,
ops::CAllGatherOpMLUKernel<int>,
ops::CAllGatherOpMLUKernel<int8_t>,
ops::CAllGatherOpMLUKernel<int16_t>,
ops::CAllGatherOpMLUKernel<int64_t>,
ops::CAllGatherOpMLUKernel<plat::float16>);
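Because CNCL lacks an int64 type, the kernel narrows to int32, gathers, and widens the result back; values outside the int32 range would be silently corrupted. A standalone sketch of the pattern (hypothetical demo; the cnclAllGather step is replaced by a copy):

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> AllGatherInt64ViaInt32(const std::vector<int64_t>& x) {
  std::vector<int32_t> narrowed(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    // the kernel's implicit assumption: every value fits in 32 bits
    assert(x[i] >= INT32_MIN && x[i] <= INT32_MAX);
    narrowed[i] = static_cast<int32_t>(x[i]);
  }
  std::vector<int32_t> gathered = narrowed;  // cnclAllGather would run here
  return std::vector<int64_t>(gathered.begin(), gathered.end());  // widen back
}

int main() { return AllGatherInt64ViaInt32({1, 2, 3}).size() == 3 ? 0 : 1; }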
......@@ -42,19 +42,23 @@ if(WITH_XPU)
detection_library(iou_similarity_op SRCS iou_similarity_op.cc
iou_similarity_op_xpu.cc)
detection_library(prior_box_op SRCS prior_box_op.cc)
detection_library(yolo_box_op SRCS yolo_box_op.cc)
detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
elseif(WITH_MLU)
detection_library(iou_similarity_op SRCS iou_similarity_op.cc
iou_similarity_op_mlu.cc)
detection_library(prior_box_op SRCS prior_box_op.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc)
detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc)
elseif(WITH_ASCEND_CL)
detection_library(iou_similarity_op SRCS iou_similarity_op.cc
iou_similarity_op_npu.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_npu.cc)
detection_library(yolo_box_op SRCS yolo_box_op.cc)
else()
detection_library(iou_similarity_op SRCS iou_similarity_op.cc
iou_similarity_op.cu)
detection_library(prior_box_op SRCS prior_box_op.cc)
detection_library(yolo_box_op SRCS yolo_box_op.cc)
# detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
endif()
......@@ -73,7 +77,6 @@ detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc)
detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc)
detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
detection_library(yolo_box_op SRCS yolo_box_op.cc)
detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc
box_decoder_and_assign_op.cu)
detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/detection/prior_box_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class PriorBoxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto* image = ctx.Input<phi::DenseTensor>("Image");
auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
auto* variances = ctx.Output<phi::DenseTensor>("Variances");
float step_w = ctx.Attr<float>("step_w");
float step_h = ctx.Attr<float>("step_h");
float offset = ctx.Attr<float>("offset");
bool clip = ctx.Attr<bool>("clip");
bool min_max_aspect_ratios_order =
ctx.Attr<bool>("min_max_aspect_ratios_order");
int im_width = image->dims()[3];
int im_height = image->dims()[2];
int width = input->dims()[3];
int height = input->dims()[2];
auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
bool flip = ctx.Attr<bool>("flip");
std::vector<float> new_aspect_ratios;
ExpandAspectRatios(aspect_ratios, flip, &new_aspect_ratios);
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
phi::DenseTensor ratios;
paddle::framework::TensorFromVector(new_aspect_ratios, dev_ctx, &ratios);
MLUOpTensorDesc new_aspect_ratios_desc(ratios);
auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
phi::DenseTensor min;
paddle::framework::TensorFromVector(min_sizes, dev_ctx, &min);
MLUOpTensorDesc min_sizes_desc(min);
auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
phi::DenseTensor max;
paddle::framework::TensorFromVector(max_sizes, dev_ctx, &max);
MLUOpTensorDesc max_sizes_desc(max);
auto variances_attr = ctx.Attr<std::vector<float>>("variances");
phi::DenseTensor var_tensor;
paddle::framework::TensorFromVector(variances_attr, dev_ctx, &var_tensor);
MLUOpTensorDesc variances_attr_desc(var_tensor);
auto place = ctx.GetPlace();
boxes->mutable_data<T>(place);
variances->mutable_data<T>(place);
MLUOpTensorDesc var_desc(*variances);
MLUOpTensorDesc output_desc(*boxes);
MLUOP::OpPriorBox(ctx,
min_sizes_desc.get(),
GetBasePtr(&min),
new_aspect_ratios_desc.get(),
GetBasePtr(&ratios),
variances_attr_desc.get(),
GetBasePtr(&var_tensor),
max_sizes_desc.get(),
GetBasePtr(&max),
height,
width,
im_height,
im_width,
step_h,
step_w,
offset,
clip,
min_max_aspect_ratios_order,
output_desc.get(),
GetBasePtr(boxes),
var_desc.get(),
GetBasePtr(variances));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(prior_box, ops::PriorBoxMLUKernel<float>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class YoloBoxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* img_size = ctx.Input<phi::DenseTensor>("ImgSize");
auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
auto* scores = ctx.Output<phi::DenseTensor>("Scores");
const std::vector<int> anchors = ctx.Attr<std::vector<int>>("anchors");
auto class_num = ctx.Attr<int>("class_num");
auto conf_thresh = ctx.Attr<float>("conf_thresh");
auto downsample_ratio = ctx.Attr<int>("downsample_ratio");
auto clip_bbox = ctx.Attr<bool>("clip_bbox");
auto scale = ctx.Attr<float>("scale_x_y");
auto iou_aware = ctx.Attr<bool>("iou_aware");
auto iou_aware_factor = ctx.Attr<float>("iou_aware_factor");
int anchor_num = anchors.size() / 2;
int64_t size = anchors.size();
auto dim_x = x->dims();
int n = dim_x[0];
int s = anchor_num;
int h = dim_x[2];
int w = dim_x[3];
// The output of mluOpYoloBox: A 4-D tensor with shape [N, anchor_num, 4,
// H*W], the coordinates of boxes, and a 4-D tensor with shape [N,
// anchor_num, :attr:`class_num`, H*W], the classification scores of boxes.
std::vector<int64_t> boxes_dim_mluops({n, s, 4, h * w});
std::vector<int64_t> scores_dim_mluops({n, s, class_num, h * w});
// In Paddle framework: A 3-D tensor with shape [N, M, 4], the coordinates
// of boxes, and a 3-D tensor with shape [N, M, :attr:`class_num`], the
// classification scores of boxes.
std::vector<int64_t> boxes_out_dim({n, s, h * w, 4});
std::vector<int64_t> scores_out_dim({n, s, h * w, class_num});
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
phi::DenseTensor boxes_tensor_mluops =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, 4, h * w}, dev_ctx);
phi::DenseTensor scores_tensor_mluops =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, class_num, h * w},
dev_ctx);
MLUOpTensorDesc boxes_trans_desc_mluops(
4, boxes_dim_mluops.data(), ToMluOpDataType<T>());
MLUCnnlTensorDesc boxes_trans_desc_cnnl(
4, boxes_dim_mluops.data(), ToCnnlDataType<T>());
MLUOpTensorDesc scores_trans_desc_mluops(
4, scores_dim_mluops.data(), ToMluOpDataType<T>());
MLUCnnlTensorDesc scores_trans_desc_cnnl(
4, scores_dim_mluops.data(), ToCnnlDataType<T>());
boxes->mutable_data<T>(ctx.GetPlace());
scores->mutable_data<T>(ctx.GetPlace());
FillMLUTensorWithHostValue(ctx, static_cast<T>(0), boxes);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0), scores);
MLUOpTensorDesc x_desc(*x, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<T>());
MLUOpTensorDesc img_size_desc(
*img_size, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<int32_t>());
Tensor anchors_temp(framework::TransToPhiDataType(VT::INT32));
anchors_temp.Resize({size});
paddle::framework::TensorFromVector(
anchors, ctx.device_context(), &anchors_temp);
MLUOpTensorDesc anchors_desc(anchors_temp);
MLUCnnlTensorDesc boxes_desc_cnnl(
4, boxes_out_dim.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc scores_desc_cnnl(
4, scores_out_dim.data(), ToCnnlDataType<T>());
MLUOP::OpYoloBox(ctx,
x_desc.get(),
GetBasePtr(x),
img_size_desc.get(),
GetBasePtr(img_size),
anchors_desc.get(),
GetBasePtr(&anchors_temp),
class_num,
conf_thresh,
downsample_ratio,
clip_bbox,
scale,
iou_aware,
iou_aware_factor,
boxes_trans_desc_mluops.get(),
GetBasePtr(&boxes_tensor_mluops),
scores_trans_desc_mluops.get(),
GetBasePtr(&scores_tensor_mluops));
const std::vector<int> perm = {0, 1, 3, 2};
// transpose the boxes from [N, S, 4, H*W] to [N, S, H*W, 4]
MLUCnnl::Transpose(ctx,
perm,
4,
boxes_trans_desc_cnnl.get(),
GetBasePtr(&boxes_tensor_mluops),
boxes_desc_cnnl.get(),
GetBasePtr(boxes));
// transpose the scores from [N, S, class_num, H*W] to [N, S, H*W,
// class_num]
MLUCnnl::Transpose(ctx,
perm,
4,
scores_trans_desc_cnnl.get(),
GetBasePtr(&scores_tensor_mluops),
scores_desc_cnnl.get(),
GetBasePtr(scores));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(yolo_box, ops::YoloBoxMLUKernel<float>);
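mluOpYoloBox emits boxes as [N, S, 4, H*W] while Paddle expects [N, S, H*W, 4], hence the two Transpose calls with perm {0, 1, 3, 2}. The permutation only swaps the last two axes; on a tiny tensor (standalone demo, nothing MLU-specific):

#include <iostream>

int main() {
  const int d2 = 2, d3 = 3;  // last two dims of a [1, 1, 2, 3] tensor
  int src[d2][d3] = {{1, 2, 3}, {4, 5, 6}};
  int dst[d3][d2];
  for (int i = 0; i < d2; ++i)
    for (int j = 0; j < d3; ++j)
      dst[j][i] = src[i][j];  // perm {0,1,3,2}: (n, s, i, j) -> (n, s, j, i)
  for (int j = 0; j < d3; ++j)
    for (int i = 0; i < d2; ++i) std::cout << dst[j][i] << " ";
  std::cout << "\n";  // prints 1 4 2 5 3 6
  return 0;
}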
......@@ -39,8 +39,17 @@ class DropoutMLUKernel : public framework::OpKernel<T> {
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
if (!is_test) {
// exec dropout op for training only.
if (is_test && is_upscale) {
// dropout op for inference: out = input.
framework::TensorCopy(
*x,
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
out);
return;
} else if (!is_test) {
// dropout op for training: out = input * mask / ( 1.0 - dropout_prob ) or
// out = input * mask.
int seed_data = 0;
if (seed_tensor) {
if (platform::is_mlu_place(seed_tensor->place())) {
......@@ -79,50 +88,44 @@ class DropoutMLUKernel : public framework::OpKernel<T> {
const int device_id = ctx.GetPlace().GetDeviceId();
auto mlu_gen_random = GetMLURandomGenerator(ctx, device_id, seed_data);
const float prob = is_upscale ? dropout_prob : 0.0f;
// compute out = input * mask / ( 1.0 - dropout_prob )
MLUCnnl::FusedDropout(ctx,
mlu_gen_random->get(),
x_desc.get(),
GetBasePtr(x),
prob,
dropout_prob,
GetBasePtr(&(mlu_gen_random->get_state())),
mask_desc.get(),
GetBasePtr(mask),
out_desc.get(),
GetBasePtr(out));
} else {
// exec dropout op for inference only.
if (is_upscale) {
framework::TensorCopy(
*x,
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
out);
} else {
auto scale = static_cast<T>(1.0f - dropout_prob);
Tensor scale_tensor(x->dtype());
scale_tensor.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&scale,
scale_desc.get(),
GetBasePtr(&scale_tensor));
auto data_type = ToCnnlDataType<T>();
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_MUL, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
x_desc.get(),
GetBasePtr(x),
scale_desc.get(),
GetBasePtr(&scale_tensor),
out_desc.get(),
GetBasePtr(out),
data_type);
return;
}
}
// In downgrade_in_infer mode, we need to multiply by (1.0f - dropout_prob).
Tensor scale_tensor(x->dtype());
Tensor bias_tensor(x->dtype());
scale_tensor.mutable_data<T>({1}, ctx.GetPlace());
bias_tensor.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
FillMLUTensorWithHostValue(
ctx, static_cast<T>(1.0f - dropout_prob), &scale_tensor);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0f), &bias_tensor);
MLUCnnl::Scale(ctx,
0,
is_test ? x_desc.get() : out_desc.get(),
is_test ? GetBasePtr(x) : GetBasePtr(out),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(out));
}
};
......
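The rewritten kernel now covers three paths: training (FusedDropout with a mask), upscale_in_train inference (a plain copy), and downgrade_in_infer inference (scale by 1 - p). A small sketch of the two inference conventions (standalone demo, not Paddle code):

#include <iostream>

int main() {
  const float p = 0.3f;  // dropout_prob
  const float x = 2.0f;  // an input activation at inference time
  // upscale_in_train: training already divided by (1 - p), so inference
  // is the identity -- hence the plain TensorCopy above.
  std::cout << "upscale_in_train inference:   " << x << "\n";
  // downgrade_in_infer: training kept raw masked values, so inference
  // must scale by (1 - p) -- hence the MLUCnnl::Scale call above.
  std::cout << "downgrade_in_infer inference: " << x * (1.0f - p) << "\n";
  return 0;
}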
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
using Tensor = phi::DenseTensor;
template <typename T>
class HuberLossMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* residual = ctx.Output<Tensor>("Residual");
auto* out = ctx.Output<Tensor>("Out");
auto delta = ctx.Attr<float>("delta");
auto place = ctx.GetPlace();
// compute residual = y - x
cnnlDataType_t data_type = ToCnnlDataType<T>();
residual->mutable_data<T>(x->dims(), place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlOpTensorDesc sub_op_desc(
CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
sub_op_desc.get(),
x_desc.get(),
GetBasePtr(y),
x_desc.get(),
GetBasePtr(x),
x_desc.get(),
GetBasePtr(residual),
data_type);
// compute smoothl1loss
out->mutable_data<T>(x->dims(), place);
cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
CNNL_SMOOTHL1LOSS_REDUCTION_NONE; // defines whether to do reduction
// here
MLUCnnl::SmoothL1LossForward(ctx,
x_desc.get(),
GetBasePtr(x),
x_desc.get(), /* target has same shape as x */
GetBasePtr(y),
static_cast<float>(delta),
smoothl1_algo,
x_desc.get(), /* out has same shape as x */
GetBasePtr(out));
// multiply the result by delta
Tensor scale_tensor, bias_tensor;
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
const int axis = std::max(out->dims().size() - 1, 0);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Scale(ctx,
axis,
out_desc.get(),
GetBasePtr(out),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class HuberLossGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* residual = ctx.Input<Tensor>("Residual");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto delta = ctx.Attr<float>("delta");
auto place = ctx.GetPlace();
Tensor t_grad_rd;
t_grad_rd =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
MLUCnnlTensorDesc t_grad_rd_desc(t_grad_rd);
if (dx || dy) {
Tensor t_zero;
t_zero =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &t_zero);
MLUCnnlTensorDesc residual_desc(*residual);
MLUCnnlTensorDesc dout_desc(*dout);
cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
CNNL_SMOOTHL1LOSS_REDUCTION_NONE; // defines whether to do reduction
// here
MLUCnnl::SmoothL1LossBackward(ctx,
residual_desc.get(),
GetBasePtr(residual),
residual_desc.get(),
GetBasePtr(&t_zero),
dout_desc.get(),
GetBasePtr(dout),
static_cast<float>(delta),
smoothl1_algo,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd));
}
// multiply the gradient by delta
Tensor scale_tensor, bias_tensor;
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
const int axis = std::max(t_grad_rd.dims().size() - 1, 0);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
if (dx) {
dx->mutable_data<T>(place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(-delta), &scale_tensor);
MLUCnnlTensorDesc out_desc(*dx);
MLUCnnl::Scale(ctx,
axis,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(dx));
}
if (dy) {
dy->mutable_data<T>(place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
MLUCnnlTensorDesc out_desc(*dy);
MLUCnnl::Scale(ctx,
axis,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(dy));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(huber_loss,
ops::HuberLossMLUKernel<float>,
ops::HuberLossMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(huber_loss_grad,
ops::HuberLossGradMLUKernel<float>,
ops::HuberLossGradMLUKernel<plat::float16>);
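The forward kernel leans on the relation between Huber loss and smooth-L1 loss: with residual r = y - x and the usual definitions (assuming CNNL's CNNL_SMOOTHL1LOSS_REDUCTION_NONE algorithm matches them),

\operatorname{smoothl1}_{\beta}(r)=\begin{cases} r^{2}/(2\beta), & |r|<\beta \\ |r|-\beta/2, & \text{otherwise} \end{cases}
\qquad
\delta\cdot\operatorname{smoothl1}_{\beta=\delta}(r)=\begin{cases} r^{2}/2, & |r|<\delta \\ \delta|r|-\delta^{2}/2, & \text{otherwise} \end{cases}

which is exactly Paddle's huber_loss with threshold \delta. The trailing MLUCnnl::Scale supplies the final factor of \delta, and the backward kernel applies the same scaling with -\delta for dx and +\delta for dy (since dr/dx = -1 and dr/dy = +1).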
......@@ -256,6 +256,186 @@ MLUCnnlTensorDesc::~MLUCnnlTensorDesc() {
}
}
class MLUOpTensorDescPool {
public:
mluOpTensorDescriptor_t Pop() {
mluOpTensorDescriptor_t raw_desc;
if (q_.try_dequeue(raw_desc)) {
return raw_desc;
} else {
mluOpCreateTensorDescriptor(&raw_desc);
return raw_desc;
}
}
void Recycle(mluOpTensorDescriptor_t desc) {
mluOpResetTensorDescriptor(desc);
q_.enqueue(desc);
}
~MLUOpTensorDescPool() {
auto size = q_.size_approx();
if (size > 0) {
std::vector<mluOpTensorDescriptor_t> vec(size);
q_.try_dequeue_bulk(vec.data(), size);
for (auto desc : vec) {
mluOpDestroyTensorDescriptor(desc);
}
}
}
private:
moodycamel::ConcurrentQueue<mluOpTensorDescriptor_t> q_;
};
static MLUOpTensorDescPool g_mluop_tensor_desc_pool;
MLUOpTensorDesc& MLUOpTensorDesc::operator=(MLUOpTensorDesc&& rhs) {
if (raw_tensor_desc) {
g_mluop_tensor_desc_pool.Recycle(raw_tensor_desc);
}
raw_tensor_desc = rhs.raw_tensor_desc;
rhs.raw_tensor_desc = nullptr;
return *this;
}
MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype) {
raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(raw_tensor_desc,
MLUOP_LAYOUT_ARRAY,
tensor_dtype,
tensor_dim,
dim_sizes));
}
MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype,
const mluOpTensorLayout_t layout) {
raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(
raw_tensor_desc, layout, tensor_dtype, tensor_dim, dim_sizes));
}
MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype,
int position)
: MLUOpTensorDesc(tensor_dim, dim_sizes, tensor_dtype) {
PADDLE_ENFORCE_MLU_SUCCESS(
mluOpSetTensorDescriptorPosition(raw_tensor_desc, position));
}
MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype) {
std::vector<int> dim_sizes_int32(tensor_dim);
std::vector<int64_t>::const_iterator int64_cbegin(dim_sizes);
std::vector<int64_t>::const_iterator int64_cend(dim_sizes + tensor_dim);
std::transform(int64_cbegin,
int64_cend,
dim_sizes_int32.begin(),
&CheckedNarrowing<int64_t, int>);
raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(raw_tensor_desc,
MLUOP_LAYOUT_ARRAY,
tensor_dtype,
tensor_dim,
dim_sizes_int32.data()));
}
MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype,
const mluOpTensorLayout_t layout) {
std::vector<int> dim_sizes_int32(tensor_dim);
std::vector<int64_t>::const_iterator int64_cbegin(dim_sizes);
std::vector<int64_t>::const_iterator int64_cend(dim_sizes + tensor_dim);
std::transform(int64_cbegin,
int64_cend,
dim_sizes_int32.begin(),
&CheckedNarrowing<int64_t, int>);
raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(raw_tensor_desc,
layout,
tensor_dtype,
tensor_dim,
dim_sizes_int32.data()));
}
MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype,
int position) {
std::vector<int> dim_sizes_int32(tensor_dim);
std::vector<int64_t>::const_iterator int64_cbegin(dim_sizes);
std::vector<int64_t>::const_iterator int64_cend(dim_sizes + tensor_dim);
std::transform(int64_cbegin,
int64_cend,
dim_sizes_int32.begin(),
&CheckedNarrowing<int64_t, int>);
raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(raw_tensor_desc,
MLUOP_LAYOUT_ARRAY,
tensor_dtype,
tensor_dim,
dim_sizes_int32.data()));
PADDLE_ENFORCE_MLU_SUCCESS(
mluOpSetTensorDescriptorPosition(raw_tensor_desc, position));
}
MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor,
const mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype) {
auto dims = phi::vectorize<int>(tensor.dims());
int tensor_dim = dims.size();
raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
if (tensor_dim == 0) {
int scalar_dims[1] = {1};
PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(
raw_tensor_desc, layout, tensor_dtype, 1, scalar_dims));
} else {
std::vector<int> tensor_dim_sizes_int(dims.begin(), dims.end());
PADDLE_ENFORCE_MLU_SUCCESS(
mluOpSetTensorDescriptor(raw_tensor_desc,
layout,
tensor_dtype,
tensor_dim,
tensor_dim_sizes_int.data()));
}
}
MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor)
: MLUOpTensorDesc(
tensor, MLUOP_LAYOUT_ARRAY, ToMluOpDataType(tensor.dtype())) {}
MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor,
mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype,
int position)
: MLUOpTensorDesc(tensor, layout, tensor_dtype) {
PADDLE_ENFORCE_MLU_SUCCESS(
mluOpSetTensorDescriptorPosition(raw_tensor_desc, position));
}
MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor,
mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype,
int position,
float scale)
: MLUOpTensorDesc(tensor, layout, tensor_dtype) {
PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptorPositionAndScale(
raw_tensor_desc, position, scale));
}
MLUOpTensorDesc::~MLUOpTensorDesc() {
if (raw_tensor_desc) {
g_mluop_tensor_desc_pool.Recycle(raw_tensor_desc);
}
}
MLUCnnlActivationDesc::MLUCnnlActivationDesc(
const cnnlActivationMode_t act_mode, const float ceof) {
PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateActivationDescriptor(&active_desc_));
......@@ -1563,17 +1743,35 @@ MLURNNDesc::~MLURNNDesc() {
void* indices_out) {
cnnlHandle_t handle = GetHandleFromCTX(ctx);
PADDLE_ENFORCE_MLU_SUCCESS(cnnlTopKTensor(handle,
input_desc,
input,
k,
dim,
largest,
sorted,
values_output_desc,
values_out,
indices_output_desc,
indices_out));
size_t workspace_size;
PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetTopKTensorWorkspaceSize(handle,
input_desc,
k,
dim,
largest,
values_output_desc,
indices_output_desc,
&workspace_size));
auto& dev_ctx = GetDevCtxFromCTX(ctx);
Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
{static_cast<int64_t>(workspace_size)}, dev_ctx);
void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
PADDLE_ENFORCE_MLU_SUCCESS(cnnlTopKTensor_v3(handle,
input_desc,
input,
k,
dim,
largest,
sorted,
false /*lower_index_first*/,
workspace_ptr,
workspace_size,
values_output_desc,
values_out,
indices_output_desc,
indices_out));
}
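The TopK change above (and the SmoothL1Loss additions below) all follow the same CNNL idiom: query the required scratch size, allocate it as a temporary int8 tensor, then pass pointer and size to the _v2/_v3 variant. The generic shape of the idiom, with stand-in functions (hypothetical names, not CNNL):

#include <cstddef>
#include <vector>

// Stand-ins for cnnlGetXxxWorkspaceSize / cnnlXxx_v3:
static size_t GetWorkspaceSize() { return 256; }
static void OpV3(void* ws, size_t n) { (void)ws; (void)n; }

int main() {
  size_t ws_size = GetWorkspaceSize();    // 1. query required scratch
  std::vector<char> workspace(ws_size);   // 2. allocate (AllocateTmpTensor above)
  OpV3(workspace.data(), ws_size);        // 3. call the _v3 variant with it
  return 0;
}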
/* static */ void MLUCnnl::StridedSlice(
......@@ -4527,6 +4725,78 @@ MLURNNDesc::~MLURNNDesc() {
output));
}
/* static */ void MLUCnnl::SmoothL1LossForward(
const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t t_desc,
const void* target,
const float beta,
const cnnlSmoothL1LossAlgorithm_t algorithm,
const cnnlTensorDescriptor_t y_desc,
void* y) {
cnnlHandle_t handle = GetHandleFromCTX(ctx);
size_t workspace_size;
PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSmoothL1LossForwardWorkspaceSize(
handle, x_desc, algorithm, &workspace_size));
auto& dev_ctx = GetDevCtxFromCTX(ctx);
Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
{static_cast<int64_t>(workspace_size)}, dev_ctx);
void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSmoothL1LossForward_v2(handle,
x_desc,
x,
t_desc,
target,
beta,
algorithm,
workspace_ptr,
workspace_size,
y_desc,
y));
}
/* static */ void MLUCnnl::SmoothL1LossBackward(
const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t dy_desc,
const void* dy,
const float beta,
const cnnlSmoothL1LossAlgorithm_t algorithm,
const cnnlTensorDescriptor_t dx_desc,
void* dx) {
cnnlHandle_t handle = GetHandleFromCTX(ctx);
size_t workspace_size;
PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSmoothL1LossBackwardWorkspaceSize(
handle, x_desc, algorithm, &workspace_size));
auto& dev_ctx = GetDevCtxFromCTX(ctx);
Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
{static_cast<int64_t>(workspace_size)}, dev_ctx);
void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSmoothL1LossBackward_v2(handle,
x_desc,
x,
target_desc,
target,
dy_desc,
dy,
beta,
algorithm,
workspace_ptr,
workspace_size,
dx_desc,
dx));
}
/* static */ void MLUCnnl::EmbeddingForward(
const ExecutionContext& ctx,
const int padding_idx,
......@@ -5148,5 +5418,94 @@ MLURNNDesc::~MLURNNDesc() {
diff_x));
}
/* static */ void MLUOP::OpYoloBox(const ExecutionContext& ctx,
const mluOpTensorDescriptor_t x_desc,
const void* x,
const mluOpTensorDescriptor_t img_size_desc,
const void* img_size,
const mluOpTensorDescriptor_t anchors_desc,
const void* anchors,
const int class_num,
const float conf_thresh,
const int downsample_ratio,
const bool clip_bbox,
const float scale,
const bool iou_aware,
const float iou_aware_factor,
const mluOpTensorDescriptor_t boxes_desc,
void* boxes,
const mluOpTensorDescriptor_t scores_desc,
void* scores) {
mluOpHandle_t handle = GetMLUOpHandleFromCTX(ctx);
PADDLE_ENFORCE_MLU_SUCCESS(mluOpYoloBox(handle,
x_desc,
x,
img_size_desc,
img_size,
anchors_desc,
anchors,
class_num,
conf_thresh,
downsample_ratio,
clip_bbox,
scale,
iou_aware,
iou_aware_factor,
boxes_desc,
boxes,
scores_desc,
scores));
}
/* static */ void MLUOP::OpPriorBox(
const ExecutionContext& ctx,
const mluOpTensorDescriptor_t min_sizes_desc,
const void* min_sizes,
const mluOpTensorDescriptor_t aspect_ratios_desc,
const void* aspect_ratios,
const mluOpTensorDescriptor_t variances_desc,
const void* variances,
const mluOpTensorDescriptor_t max_sizes_desc,
const void* max_sizes,
const int height,
const int width,
const int im_height,
const int im_width,
const float step_h,
const float step_w,
const float offset,
const bool clip,
const bool min_max_aspect_ratios_order,
const mluOpTensorDescriptor_t output_desc,
void* output,
const mluOpTensorDescriptor_t var_desc,
void* var) {
mluOpHandle_t handle = GetMLUOpHandleFromCTX(ctx);
PADDLE_ENFORCE_MLU_SUCCESS(mluOpPriorBox(handle,
min_sizes_desc,
min_sizes,
aspect_ratios_desc,
aspect_ratios,
variances_desc,
variances,
max_sizes_desc,
max_sizes,
height,
width,
im_height,
im_width,
step_h,
step_w,
offset,
clip,
min_max_aspect_ratios_order,
output_desc,
output,
var_desc,
var));
}
} // namespace operators
} // namespace paddle
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <cn_api.h>
#include <cnnl.h>
#include <concurrentqueue.h>
#include <mlu_op.h>
#include <string>
#include <vector>
......@@ -138,6 +139,54 @@ inline cnnlDataType_t ToCnnlDataType() {
return ToCnnlDataType(type);
}
inline mluOpDataType_t ToMluOpDataType(
const paddle::experimental::DataType& dtype) {
mluOpDataType_t type = MLUOP_DTYPE_FLOAT;
switch (dtype) {
case DataType::FLOAT16:
type = MLUOP_DTYPE_HALF;
break;
case DataType::FLOAT32:
type = MLUOP_DTYPE_FLOAT;
break;
case DataType::FLOAT64:
type = MLUOP_DTYPE_DOUBLE;
break;
case DataType::INT8:
type = MLUOP_DTYPE_INT8;
break;
case DataType::INT16:
type = MLUOP_DTYPE_INT16;
break;
case DataType::INT32:
type = MLUOP_DTYPE_INT32;
break;
case DataType::INT64:
type = MLUOP_DTYPE_INT64;
break;
case DataType::BOOL:
type = MLUOP_DTYPE_BOOL;
break;
case DataType::UINT8:
type = MLUOP_DTYPE_UINT8;
break;
default:
break;
}
return type;
}
inline mluOpDataType_t ToMluOpDataType(
const paddle::framework::proto::VarType::Type& type) {
return ToMluOpDataType(framework::TransToPhiDataType(type));
}
template <typename T>
inline mluOpDataType_t ToMluOpDataType() {
auto type = framework::ToDataType(std::type_index(typeid(T)));
return ToMluOpDataType(type);
}
// Converts (via narrowing) a type T value to a type U, and checks that the
// value has no value change due to the conversion.
template <typename WideT, typename NarrowT>
......@@ -152,6 +201,10 @@ inline static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) {
return ctx.template device_context<MLUDeviceContext>().cnnl_handle();
}
inline static mluOpHandle_t GetMLUOpHandleFromCTX(const ExecutionContext& ctx) {
return ctx.template device_context<MLUDeviceContext>().mluOp_handle();
}
inline static const MLUDeviceContext& GetDevCtxFromCTX(
const ExecutionContext& ctx) {
return ctx.template device_context<MLUDeviceContext>();
......@@ -281,6 +334,74 @@ class MLUCnnlTensorDesc {
cnnlTensorDescriptor_t raw_tensor_desc = nullptr;
};
class MLUOpTensorDesc {
public:
MLUOpTensorDesc() {}
// SE_DISALLOW_COPY_AND_ASSIGN
MLUOpTensorDesc(const MLUOpTensorDesc& desc) = delete;
MLUOpTensorDesc& operator=(const MLUOpTensorDesc&) = delete;
MLUOpTensorDesc(MLUOpTensorDesc&& rhs)
: raw_tensor_desc(rhs.raw_tensor_desc) {
rhs.raw_tensor_desc = nullptr;
}
MLUOpTensorDesc& operator=(MLUOpTensorDesc&& rhs);
MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype);
MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype,
const mluOpTensorLayout_t layout);
MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype,
int position);
MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype);
MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype,
const mluOpTensorLayout_t layout);
MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype,
int position);
MLUOpTensorDesc(const Tensor& tensor,
const mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype);
explicit MLUOpTensorDesc(const Tensor& tensor);
MLUOpTensorDesc(const Tensor& tensor,
mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype,
int position);
MLUOpTensorDesc(const Tensor& tensor,
mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype,
int position,
float scale);
~MLUOpTensorDesc();
const mluOpTensorDescriptor_t get() const { return raw_tensor_desc; }
private:
mluOpTensorDescriptor_t raw_tensor_desc = nullptr;
};
class MLUCnnlActivationDesc {
public:
MLUCnnlActivationDesc(const MLUCnnlActivationDesc& desc) = delete;
......@@ -1921,6 +2042,28 @@ class MLUCnnl {
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SmoothL1LossForward(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t t_desc,
const void* target,
const float beta,
const cnnlSmoothL1LossAlgorithm_t algorithm,
const cnnlTensorDescriptor_t y_desc,
void* y);
static void SmoothL1LossBackward(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t dy_desc,
const void* dy,
const float beta,
const cnnlSmoothL1LossAlgorithm_t algorithm,
const cnnlTensorDescriptor_t dx_desc,
void* dx);
static void EmbeddingForward(const ExecutionContext& ctx,
const int padding_idx,
const cnnlTensorDescriptor_t weight_desc,
......@@ -2149,6 +2292,50 @@ class MLUCnnl {
void* diff_x);
};
class MLUOP {
public:
static void OpYoloBox(const ExecutionContext& ctx,
const mluOpTensorDescriptor_t x_desc,
const void* x,
const mluOpTensorDescriptor_t img_size_desc,
const void* img_size,
const mluOpTensorDescriptor_t anchors_desc,
const void* anchors,
const int class_num,
const float conf_thresh,
const int downsample_ratio,
const bool clip_bbox,
const float scale,
const bool iou_aware,
const float iou_aware_factor,
const mluOpTensorDescriptor_t boxes_desc,
void* boxes,
const mluOpTensorDescriptor_t scores_desc,
void* scores);
static void OpPriorBox(const ExecutionContext& ctx,
const mluOpTensorDescriptor_t min_sizes_desc,
const void* min_sizes,
const mluOpTensorDescriptor_t aspect_ratios_desc,
const void* aspect_ratios,
const mluOpTensorDescriptor_t variances_desc,
const void* variances,
const mluOpTensorDescriptor_t max_sizes_desc,
const void* max_sizes,
const int height,
const int width,
const int im_height,
const int im_width,
const float step_h,
const float step_w,
const float offset,
const bool clip,
const bool min_max_aspect_ratios_order,
const mluOpTensorDescriptor_t output_desc,
void* output,
const mluOpTensorDescriptor_t var_desc,
void* var);
};
const std::map<const std::string, std::pair<std::vector<int>, std::vector<int>>>
TransPermMap = {
// trans_mode, (forward_perm, backward_perm)
......
......@@ -97,4 +97,6 @@ class OneHotV2MLUKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(one_hot_v2, ops::OneHotV2MLUKernel<int32_t>);
REGISTER_OP_MLU_KERNEL(one_hot_v2,
ops::OneHotV2MLUKernel<int32_t>,
ops::OneHotV2MLUKernel<int64_t>);
......@@ -291,11 +291,38 @@ class AdamWMLUKernel : public AdamMLUKernel<T> {
skip_update = skip_update_vec[0];
}
bool with_decay = ctx.Attr<bool>("with_decay");
const bool multi_precision = ctx.Attr<bool>("multi_precision");
auto* param_out = ctx.Output<LoDTensor>("ParamOut");
auto* master_param_out = ctx.Output<LoDTensor>("MasterParamOut");
const auto* master_param = ctx.Input<LoDTensor>("MasterParam");
VLOG(3) << "Skip update: " << skip_update << ", With decay: " << with_decay;
if (!skip_update && with_decay) {
if (ctx.HasInput("MasterParam")) {
PADDLE_THROW(platform::errors::Unimplemented(
"Master Param is not supported on MLU"));
auto* param = ctx.Input<LoDTensor>("Param");
MLUCnnlTensorDesc param_desc(*param);
if (multi_precision) {
VLOG(3) << "[adamw] multi_precision, cast masterparam to param.";
bool has_master =
ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut");
PADDLE_ENFORCE_EQ(
has_master,
true,
platform::errors::InvalidArgument(
"The Input(MasterParam) and Output(MasterParamOut) "
"should not be null when "
"the attr `multi_precision` is true"));
// cast masterparam (fp32) to param (fp16), then paramout (fp16) to
// masterparamout (fp32)
MLUCnnlTensorDesc master_param_desc(*master_param);
cnnlCastDataType_t cast_type = GetCastDataType(
framework::TransToProtoVarType(master_param->dtype()),
framework::TransToProtoVarType(param->dtype()));
MLUCnnl::Cast(ctx,
cast_type,
master_param_desc.get(),
GetBasePtr(master_param),
param_desc.get(),
const_cast<void*>(GetBasePtr(param)));
} else {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(),
......@@ -305,13 +332,12 @@ class AdamWMLUKernel : public AdamMLUKernel<T> {
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
auto* param = ctx.Input<LoDTensor>("Param");
auto* lr = ctx.Input<LoDTensor>("LearningRate");
float coeff = ctx.Attr<float>("coeff");
// update param with decay coeff: mul(-1 * lr, coeff * param) + param
MLUCnnlTensorDesc lr_desc(*lr);
MLUCnnlTensorDesc param_desc(*param);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
......@@ -330,9 +356,244 @@ class AdamWMLUKernel : public AdamMLUKernel<T> {
}
}
AdamMLUKernel<T>::Compute(ctx);
if (multi_precision) {
VLOG(3) << "[adamw] multi_precision, cast paramout to masterparamout.";
// cast paramout to masterparamout
master_param_out->mutable_data<float>(ctx.GetPlace());
cnnlCastDataType_t cast_type = GetCastDataType(
framework::TransToProtoVarType(param_out->dtype()),
framework::TransToProtoVarType(master_param_out->dtype()));
MLUCnnlTensorDesc param_out_desc(*param_out);
MLUCnnlTensorDesc master_param_out_desc(*master_param_out);
MLUCnnl::Cast(ctx,
cast_type,
param_out_desc.get(),
GetBasePtr(param_out),
master_param_out_desc.get(),
GetBasePtr(master_param_out));
}
}
};
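In the multi_precision path, the fp32 MasterParam is cast into the fp16 Param before the decay/update, and the updated Param is cast back into MasterParamOut afterwards. Schematically, with double/float standing in for fp32/fp16 (demo only; the kernel casts with MLUCnnl::Cast):

#include <iostream>

int main() {
  double master = 0.123456789;                // fp32 MasterParam
  float param = static_cast<float>(master);   // 1. cast master -> low-precision param
  param += 0.01f;                             // 2. optimizer updates the fp16 copy
  master = static_cast<double>(param);        // 3. cast ParamOut back to MasterParamOut
  std::cout << master << "\n";
  return 0;
}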
template <typename T>
class MergedAdamMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// Get inputs and outputs
auto params = ctx.MultiInput<framework::Tensor>("Param");
auto grads = ctx.MultiInput<framework::Tensor>("Grad");
auto lrs = ctx.MultiInput<framework::Tensor>("LearningRate");
auto mom1s = ctx.MultiInput<framework::Tensor>("Moment1");
auto mom2s = ctx.MultiInput<framework::Tensor>("Moment2");
auto beta1_pows = ctx.MultiInput<framework::Tensor>("Beta1Pow");
auto beta2_pows = ctx.MultiInput<framework::Tensor>("Beta2Pow");
auto master_params = ctx.MultiInput<framework::Tensor>("MasterParam");
auto param_outs = ctx.MultiOutput<framework::Tensor>("ParamOut");
auto mom1_outs = ctx.MultiOutput<framework::Tensor>("Moment1Out");
auto mom2_outs = ctx.MultiOutput<framework::Tensor>("Moment2Out");
auto beta1_pow_outs = ctx.MultiOutput<framework::Tensor>("Beta1PowOut");
auto beta2_pow_outs = ctx.MultiOutput<framework::Tensor>("Beta2PowOut");
// Check validation of inputs and outputs
size_t param_num = params.size();
PADDLE_ENFORCE_EQ(param_num,
param_outs.size(),
platform::errors::InvalidArgument(
"The size of Output(ParamOut) must be equal to "
"Input(Param), but got the size of Output(ParamOut) "
"is %d, the size of Input(Param) is %d.",
param_outs.size(),
param_num));
bool skip_update = false;
if (ctx.HasInput("SkipUpdate")) {
auto* skip_update_tensor = ctx.Input<framework::Tensor>("SkipUpdate");
PADDLE_ENFORCE_EQ(skip_update_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(SkipUpdate) size must be 1, but get %d",
skip_update_tensor->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(
*skip_update_tensor, ctx.device_context(), &skip_update_vec);
ctx.device_context().Wait();
skip_update = skip_update_vec[0];
}
// skip_update=true, just copy input to output, and TensorCopy will call
// mutable_data
if (skip_update) {
VLOG(4) << "MergedAdam skip update";
for (size_t i = 0; i < param_num; ++i) {
framework::TensorCopy(
*params[i],
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
param_outs[i]);
framework::TensorCopy(
*mom1s[i],
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
mom1_outs[i]);
framework::TensorCopy(
*mom2s[i],
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
mom2_outs[i]);
framework::TensorCopy(
*beta1_pows[i],
beta1_pows[i]->place(),
ctx.template device_context<platform::MLUDeviceContext>(),
beta1_pow_outs[i]);
framework::TensorCopy(
*beta2_pows[i],
beta2_pows[i]->place(),
ctx.template device_context<platform::MLUDeviceContext>(),
beta2_pow_outs[i]);
}
return;
}
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
// Get beta1, beta2 and epsilon from attribute.
const Tensor* beta1_tensor = nullptr;
const Tensor* beta2_tensor = nullptr;
const Tensor* epsilon_tensor = nullptr;
Tensor beta1_tmp(experimental::DataType::FLOAT32);
Tensor beta2_tmp(experimental::DataType::FLOAT32);
Tensor epsilon_tmp(experimental::DataType::FLOAT32);
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
beta1_tmp.mutable_data<T>({1}, ctx.GetPlace());
beta2_tmp.mutable_data<T>({1}, ctx.GetPlace());
epsilon_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta1_tmp_desc(beta1_tmp);
MLUCnnlTensorDesc beta2_tmp_desc(beta2_tmp);
MLUCnnlTensorDesc epsilon_tmp_desc(epsilon_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta1,
beta1_tmp_desc.get(),
GetBasePtr(&beta1_tmp));
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta2,
beta2_tmp_desc.get(),
GetBasePtr(&beta2_tmp));
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&epsilon,
epsilon_tmp_desc.get(),
GetBasePtr(&epsilon_tmp));
beta1_tensor = &beta1_tmp;
beta2_tensor = &beta2_tmp;
epsilon_tensor = &epsilon_tmp;
// Loop to compute
for (size_t i = 0; i < param_num; ++i) {
VLOG(4) << "[MergedAdam] loop: " << i;
param_outs[i]->ShareDataWith(*params[i]);
mom1_outs[i]->ShareDataWith(*mom1s[i]);
mom2_outs[i]->ShareDataWith(*mom2s[i]);
LoDTensor beta1_pow_tmp;
LoDTensor beta2_pow_tmp;
if (beta1_pows[i]->place() == platform::CPUPlace()) {
T beta1 = *beta1_pows[i]->data<T>();
beta1_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta1_pow_tmp_desc(beta1_pow_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta1,
beta1_pow_tmp_desc.get(),
GetBasePtr(&beta1_pow_tmp));
beta1_pows[i] = &beta1_pow_tmp;
}
if (beta2_pows[i]->place() == platform::CPUPlace()) {
T beta2 = *beta2_pows[i]->data<T>();
beta2_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta2_pow_tmp_desc(beta2_pow_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta2,
beta2_pow_tmp_desc.get(),
GetBasePtr(&beta2_pow_tmp));
beta2_pows[i] = &beta2_pow_tmp;
}
VLOG(3) << "beta1_pow.numel() : " << beta1_pows[i]->numel()
<< "beta2_pow.numel() : " << beta2_pows[i]->numel();
VLOG(3) << "param.numel(): " << params[i]->numel();
PADDLE_ENFORCE_EQ(beta1_pow_outs[i]->numel(),
1,
platform::errors::InvalidArgument(
"beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_outs[i]->numel()));
PADDLE_ENFORCE_EQ(beta2_pow_outs[i]->numel(),
1,
platform::errors::InvalidArgument(
"beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_outs[i]->numel()));
MLUCnnlTensorDesc param_desc(*params[i]);
MLUCnnlTensorDesc mom1_desc(*mom1s[i]);
MLUCnnlTensorDesc mom2_desc(*mom2s[i]);
MLUCnnlTensorDesc grad_desc(*grads[i]);
MLUCnnl::ApplyAdam(ctx,
param_desc.get(),
GetBasePtr(param_outs[i]),
mom1_desc.get(),
GetBasePtr(mom1_outs[i]),
mom2_desc.get(),
GetBasePtr(mom2_outs[i]),
grad_desc.get(),
GetBasePtr(grads[i]),
GetBasePtr(lrs[i]),
GetBasePtr(beta1_tensor),
GetBasePtr(beta2_tensor),
GetBasePtr(beta1_pows[i]),
GetBasePtr(beta2_pows[i]),
GetBasePtr(epsilon_tensor),
/*use_nesterov*/ false);
if (!use_global_beta_pow) {
beta1_pow_outs[i]->mutable_data<T>(ctx.GetPlace());
beta2_pow_outs[i]->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc beta1_desc(*beta1_tensor);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
beta1_desc.get(),
GetBasePtr(beta1_pows[i]),
beta1_desc.get(),
GetBasePtr(beta1_tensor),
beta1_desc.get(),
GetBasePtr(beta1_pow_outs[i]),
ToCnnlDataType<T>());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
beta1_desc.get(),
GetBasePtr(beta2_pows[i]),
beta1_desc.get(),
GetBasePtr(beta2_tensor),
beta1_desc.get(),
GetBasePtr(beta2_pow_outs[i]),
ToCnnlDataType<T>());
}
}
}
};
} // namespace operators
} // namespace paddle
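For reference, the per-tensor loop delegates the standard Adam update to MLUCnnl::ApplyAdam (assuming cnnlApplyAdam follows the usual formulation):

m_t = \beta_1 m_{t-1} + (1-\beta_1) g_t,\qquad
v_t = \beta_2 v_{t-1} + (1-\beta_2) g_t^{2},\qquad
\theta_t = \theta_{t-1} - \alpha\,\frac{m_t/(1-\beta_1^{t})}{\sqrt{v_t/(1-\beta_2^{t})}+\epsilon}

When use_global_beta_pow is false, the trailing OpTensor multiplies advance the bias-correction terms in place: beta1_pow_out = beta1_pow * beta1, and likewise for beta2.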
......@@ -346,3 +607,7 @@ REGISTER_OP_MLU_KERNEL(adam,
REGISTER_OP_MLU_KERNEL(adamw,
ops::AdamWMLUKernel<float>,
ops::AdamWMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(merged_adam,
ops::MergedAdamMLUKernel<float>,
ops::MergedAdamMLUKernel<plat::float16>);
......@@ -141,10 +141,9 @@ class MLUPoolOpKernel : public framework::OpKernel<T> {
handle, pool_mode, out_w, out_h, &extra_input_size);
if (extra_input_size > 0) {
phi::CPUContext cpu_ctx;
framework::Tensor extra_host_tensor =
ctx.AllocateTmpTensor<int8_t, phi::CPUContext>(
{static_cast<int64_t>(extra_input_size)}, cpu_ctx);
framework::Tensor extra_host_tensor;
extra_host_tensor.mutable_data<int8_t>(
{static_cast<int64_t>(extra_input_size)}, platform::CPUPlace());
cnnlInitPoolingExtraInput(handle,
pool_desc.get(),
trans_in_x_desc.get(),
......
......@@ -92,6 +92,112 @@ class ReduceMaxMLUKernel : public framework::OpKernel<T> {
}
};
template <typename T>
class ReduceMaxGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
auto* out = context.Input<Tensor>("Out");
auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
auto reduce_dims = context.Attr<std::vector<int>>("dim");
bool reduce_all = context.Attr<bool>("reduce_all");
int in_dtype = context.Attr<int>("in_dtype");
PADDLE_ENFORCE_EQ(
in_dtype == -1,
true,
platform::errors::InvalidArgument(
"MLU only support in_dtype == -1 in reduce_max_grad op."));
auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
x_grad->mutable_data<T>(context.GetPlace());
auto place = context.GetPlace();
// broadcast
auto x_dims_vec = phi::vectorize(x->dims());
if (reduce_all) {
reduce_dims.clear();
for (size_t d = 0; d < x_dims_vec.size(); ++d) {
reduce_dims.push_back(static_cast<int>(d));
}
}
Tensor tmp_out, tmp_out_grad;
auto tmp_out_dims_vec = x_dims_vec;
for (auto d : reduce_dims) {
if (d < 0) {
d += x_dims_vec.size();
}
tmp_out_dims_vec[d] = 1;
}
tmp_out.ShareDataWith(*out);
tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec));
tmp_out_grad.ShareDataWith(*out_grad);
tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec));
Tensor transformed_out(x->type());
transformed_out.Resize(phi::make_ddim(x_dims_vec));
transformed_out.mutable_data<T>(place);
MLUCnnlTensorDesc tmp_out_desc(tmp_out);
MLUCnnlTensorDesc transformed_out_desc(transformed_out);
MLUCnnl::BroadcastTo(context,
tmp_out_desc.get(),
GetBasePtr(&tmp_out),
transformed_out_desc.get(),
GetBasePtr(&transformed_out));
Tensor transformed_out_grad(x->type());
transformed_out_grad.Resize(phi::make_ddim(x_dims_vec));
transformed_out_grad.mutable_data<T>(place);
MLUCnnlTensorDesc tmp_out_grad_desc(tmp_out_grad);
MLUCnnlTensorDesc transformed_out_grad_desc(transformed_out_grad);
MLUCnnl::BroadcastTo(context,
tmp_out_grad_desc.get(),
GetBasePtr(&tmp_out_grad),
transformed_out_grad_desc.get(),
GetBasePtr(&transformed_out_grad));
// compare
Tensor equal_cond;
equal_cond.mutable_data<bool>(x_grad->dims(), place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc equal_cond_desc(equal_cond);
MLUCnnl::Logic(context,
CNNL_LOGIC_OP_EQ,
x_desc.get(),
GetBasePtr(x),
transformed_out_desc.get(),
GetBasePtr(&transformed_out),
equal_cond_desc.get(),
GetBasePtr(&equal_cond));
// select
Tensor t_zero;
t_zero.mutable_data<T>(x_grad->dims(), place);
FillMLUTensorWithHostValue<T>(context, static_cast<T>(0), &t_zero);
t_zero.Resize(x_grad->dims());
MLUCnnlTensorDesc t_zero_desc(t_zero);
MLUCnnlTensorDesc x_grad_desc(*x_grad);
MLUCnnl::Select(context,
equal_cond_desc.get(),
GetBasePtr(&equal_cond),
transformed_out_grad_desc.get(),
GetBasePtr(&transformed_out_grad),
t_zero_desc.get(),
GetBasePtr(&t_zero),
x_grad_desc.get(),
GetBasePtr(x_grad));
}
};
} // namespace operators
} // namespace paddle
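The grad kernel broadcasts Out and Out@GRAD back to X's shape, builds an equality mask, and routes the gradient only to positions holding the max; ties all receive the gradient, which is exactly what the EQ mask plus Select gives. A standalone sketch of that gradient rule (demo, not Paddle code):

#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> x = {1.f, 4.f, 2.f, 4.f};
  float out = *std::max_element(x.begin(), x.end());  // reduce_max over all dims
  float dout = 1.f;                                   // incoming gradient
  for (float xi : x)                                  // BroadcastTo + Logic(EQ) + Select
    std::cout << (xi == out ? dout : 0.f) << " ";     // -> 0 1 0 1
  std::cout << "\n";
  return 0;
}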
......@@ -102,3 +208,7 @@ REGISTER_OP_MLU_KERNEL(reduce_max,
ops::ReduceMaxMLUKernel<float>,
ops::ReduceMaxMLUKernel<plat::float16>,
ops::ReduceMaxMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(reduce_max_grad,
ops::ReduceMaxGradMLUKernel<float>,
ops::ReduceMaxGradMLUKernel<plat::float16>,
ops::ReduceMaxGradMLUKernel<int>);
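The grad kernel above broadcasts the reduced max and its gradient back to the input shape, builds an equality mask, then selects between the broadcast gradient and zeros, so every position that attained the max (ties included) receives the upstream gradient. A minimal NumPy sketch of that broadcast-compare-select scheme, for illustration only (toy values assumed):

import numpy as np

x = np.array([[1.0, 5.0, 2.0],
              [4.0, 4.0, 0.0]])                 # assumed toy input
out = x.max(axis=1, keepdims=True)              # plays the role of tmp_out
out_grad = np.ones_like(out)                    # upstream gradient

t_out = np.broadcast_to(out, x.shape)           # MLUCnnl::BroadcastTo
t_out_grad = np.broadcast_to(out_grad, x.shape)
mask = x == t_out                               # MLUCnnl::Logic(CNNL_LOGIC_OP_EQ)
x_grad = np.where(mask, t_out_grad, 0.0)        # MLUCnnl::Select against t_zero
# [[0. 1. 0.]
#  [1. 1. 0.]]   <- the tied maxima in the second row both receive gradient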
......@@ -19,6 +19,11 @@ limitations under the License. */
namespace paddle {
namespace operators {
using Tensor = phi::DenseTensor;
using Variable = framework::Variable;
using LoDTensorArray = framework::LoDTensorArray;
using DDim = framework::DDim;
static void ProcessStridedSliceParams(
const std::vector<int>& axes,
const DDim& input_dims,
......
......@@ -28,11 +28,13 @@ MLUContext::MLUContext(const MLUPlace& place, const int priority) {
MLUDeviceGuard guard(place_.device);
stream_.reset(new stream::MLUStream(place_, priority));
InitCNNLContext();
InitMLUOPContext();
}
MLUContext::~MLUContext() {
MLUDeviceGuard guard(place_.device);
DestoryCNNLContext();
DestoryMLUOPContext();
}
MLUDeviceContext::MLUDeviceContext(MLUPlace place) : place_(place) {
......@@ -41,6 +43,7 @@ MLUDeviceContext::MLUDeviceContext(MLUPlace place) : place_(place) {
driver_version_ = GetMLUDriverVersion(place_.device);
runtime_version_ = GetMLURuntimeVersion(place_.device);
cnnl_version_ = GetMLUCnnlVersion(place_.device);
mluOp_version_ = GetMLUOpVersion(place_.device);
LOG_FIRST_N(WARNING, 1)
<< "Please NOTE: device: " << static_cast<int>(place_.device)
......@@ -51,7 +54,9 @@ MLUDeviceContext::MLUDeviceContext(MLUPlace place) : place_(place) {
<< ", Runtime API Version: " << runtime_version_ / 10000 << "."
<< (runtime_version_ / 100) % 100 << "." << runtime_version_ % 100
<< ", Cnnl API Version: " << cnnl_version_ / 10000 << "."
<< (cnnl_version_ / 100) % 100 << "." << cnnl_version_ % 100;
<< (cnnl_version_ / 100) % 100 << "." << cnnl_version_ % 100
<< ", MluOp API Version: " << mluOp_version_ / 10000 << "."
<< (mluOp_version_ / 100) % 100 << "." << mluOp_version_ % 100;
default_ctx_.reset(new MLUContext(place_));
}
......@@ -70,6 +75,10 @@ mluCnnlHandle MLUDeviceContext::cnnl_handle() const {
return context()->CnnlHandle();
}
mluOpHandle MLUDeviceContext::mluOp_handle() const {
return context()->MluOpHandle();
}
mluStream MLUDeviceContext::stream() const { return context()->RawStream(); }
#endif
......
......@@ -53,12 +53,19 @@ class MLUContext {
const mluCnnlHandle& CnnlHandle() const { return cnnl_handle_; }
const mluOpHandle& MluOpHandle() const { return mluOp_handle_; }
private:
void InitCNNLContext() {
PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreate(&cnnl_handle_));
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetQueue(cnnl_handle_, RawStream()));
}
void InitMLUOPContext() {
PADDLE_ENFORCE_MLU_SUCCESS(mluOpCreate(&mluOp_handle_));
PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetQueue(mluOp_handle_, RawStream()));
}
void DestoryCNNLContext() {
if (cnnl_handle_) {
PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroy(cnnl_handle_));
......@@ -66,10 +73,18 @@ class MLUContext {
cnnl_handle_ = nullptr;
}
void DestoryMLUOPContext() {
if (mluOp_handle_) {
PADDLE_ENFORCE_MLU_SUCCESS(mluOpDestroy(mluOp_handle_));
}
mluOp_handle_ = nullptr;
}
MLUPlace place_;
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
std::unique_ptr<stream::MLUStream> stream_;
mluCnnlHandle cnnl_handle_;
mluOpHandle mluOp_handle_;
DISABLE_COPY_AND_ASSIGN(MLUContext);
};
......@@ -89,6 +104,9 @@ class MLUDeviceContext : public DeviceContext {
/*! \brief Return cnnl handle in the device context. */
mluCnnlHandle cnnl_handle() const;
/*! \brief Return mluOp handle in the device context. */
mluOpHandle mluOp_handle() const;
/*! \brief Return mlu stream in the device context. */
mluStream stream() const;
......@@ -135,6 +153,7 @@ class MLUDeviceContext : public DeviceContext {
int driver_version_;
int runtime_version_;
int cnnl_version_;
int mluOp_version_;
MLUPlace place_;
std::shared_ptr<MLUContext> default_ctx_;
......
......@@ -41,6 +41,7 @@ struct MLUStatusType {};
DEFINE_MLU_STATUS_TYPE(cnrtStatus, cnrtSuccess, CNRT);
DEFINE_MLU_STATUS_TYPE(cnnlStatus, CNNL_STATUS_SUCCESS, CNNL);
DEFINE_MLU_STATUS_TYPE(mluOpStatus, MLUOP_STATUS_SUCCESS, MLUOP);
DEFINE_MLU_STATUS_TYPE(cnStatus, CN_SUCCESS, CN);
#ifdef PADDLE_WITH_CNCL
DEFINE_MLU_STATUS_TYPE(cnclStatus, CNCL_RET_SUCCESS, CNCL);
......@@ -68,6 +69,15 @@ inline std::string build_mlu_error_msg(cnnlStatus stat) {
return sout.str();
}
/*************** MLU OP ERROR ***************/
inline bool is_error(mluOpStatus stat) { return stat != MLUOP_STATUS_SUCCESS; }
inline std::string build_mlu_error_msg(mluOpStatus stat) {
std::ostringstream sout;
sout << "MLU OP error(" << stat << "), " << mluOpGetErrorString(stat) << ". ";
return sout.str();
}
/*************** CN API ERROR ***************/
inline bool is_error(cnStatus stat) { return stat != CN_SUCCESS; }
......
......@@ -126,6 +126,13 @@ int GetMLUCnnlVersion(int id) {
return x * 10000 + y * 100 + z;
}
int GetMLUOpVersion(int id) {
CheckDeviceId(id);
int x, y, z;
mluOpGetLibVersion(&x, &y, &z);
return x * 10000 + y * 100 + z;
}
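GetMLUOpVersion packs major/minor/patch as x * 10000 + y * 100 + z, which is exactly what the device-context log line above unpacks with / 10000, / 100 % 100 and % 100. A quick illustrative round trip (version numbers assumed):

v = 1 * 10000 + 2 * 100 + 3                     # mluOp 1.2.3 -> 10203
print(v // 10000, (v // 100) % 100, v % 100)    # 1 2 3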
int GetMLUCurrentDeviceId() {
int device_id;
PADDLE_ENFORCE_MLU_SUCCESS(cnrtGetDevice(&device_id));
......
......@@ -16,10 +16,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include <cn_api.h>
#include <cndrv_id.h>
#include <cnnl.h>
#include <cnpapi.h>
#include <cnpapi_cndrv_id.h>
#include <cnrt.h>
#include <mlu_op.h>
#ifdef PADDLE_WITH_CNCL
#include <cncl.h>
#endif
......@@ -30,11 +31,13 @@ namespace paddle {
using cnStatus = CNresult;
using cnrtStatus = cnrtRet_t;
using cnnlStatus = cnnlStatus_t;
using mluOpStatus = mluOpStatus_t;
#ifdef PADDLE_WITH_CNCL
using cnclStatus = cnclResult_t;
#endif
using mluStream = cnrtQueue_t;
using mluCnnlHandle = cnnlHandle_t;
using mluOpHandle = mluOpHandle_t;
using mluEventHandle = cnrtNotifier_t;
using mluDeviceHandle = CNdev;
......@@ -49,6 +52,9 @@ int GetMLURuntimeVersion(int id);
//! Get the cnnl version of the ith MLU.
int GetMLUCnnlVersion(int id);
//! Get the mluOp version of the ith MLU.
int GetMLUOpVersion(int id);
//! Get the total number of MLU devices in system.
int GetMLUDeviceCount();
......
......@@ -29,7 +29,10 @@
#include "paddle/fluid/platform/profiler/custom_device/custom_tracer.h"
#include "paddle/fluid/platform/profiler/extra_info.h"
#include "paddle/fluid/platform/profiler/host_tracer.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/enforce.h"
#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
#endif
#include "paddle/fluid/platform/profiler/trace_event_collector.h"
#include "paddle/fluid/platform/profiler/utils.h"
......@@ -80,9 +83,11 @@ Profiler::Profiler(const ProfilerOptions& options,
if (trace_switch.test(kProfileGPUOptionBit)) {
tracers_.emplace_back(&CudaTracer::GetInstance(), false);
}
#ifdef PADDLE_WITH_MLU
if (trace_switch.test(kProfileMLUOptionBit)) {
tracers_.emplace_back(&MluTracer::GetInstance(), false);
}
#endif
if (trace_switch.test(kProfileCustomDeviceOptionBit)) {
for (const auto& dev_type : custom_device_types) {
tracers_.emplace_back(&CustomTracer::GetInstance(dev_type), false);
......
......@@ -34,7 +34,10 @@ import unittest
from multiprocessing import Process
import paddle.fluid.layers as layers
from functools import reduce
from test_sync_batch_norm_base_mlu import TestSyncBatchNormRunnerBase, runtime_main
from test_sync_batch_norm_base_mlu import (
TestSyncBatchNormRunnerBase,
runtime_main,
)
from op_test import OpTest, _set_use_system_allocator
from test_sync_batch_norm_op import create_or_get_tensor
......@@ -44,11 +47,11 @@ paddle.enable_static()
class TestSyncBatchNormOpTraining(TestSyncBatchNormRunnerBase):
def __init__(self):
self.global_ring_id = 0
self.dtype = np.float32
self.bn_dtype = np.float32
self.N = 8
self.C = 16
self.H = 32
......@@ -56,29 +59,36 @@ class TestSyncBatchNormOpTraining(TestSyncBatchNormRunnerBase):
self.dshape = [self.N, self.C, self.H, self.W]
self.atol = 1e-3
def get_model(self,
main,
startup,
place,
layout,
seed,
sync_bn=False,
only_forward=False):
def get_model(
self,
main,
startup,
place,
layout,
seed,
sync_bn=False,
only_forward=False,
):
"""Build program."""
use_cudnn = False
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
data = fluid.layers.data(name='input',
shape=self.dshape,
dtype=self.dtype,
append_batch_size=False)
data = fluid.layers.data(
name='input',
shape=self.dshape,
dtype=self.dtype,
append_batch_size=False,
)
conv = fluid.layers.conv2d(
input=data,
num_filters=32,
filter_size=1,
param_attr=fluid.ParamAttr(name='conv2d_weight'),
bias_attr=False,
use_cudnn=use_cudnn)
use_cudnn=use_cudnn,
)
if self.bn_dtype == np.float16:
conv = fluid.layers.cast(conv, 'float16')
bn = fluid.layers.batch_norm(
conv,
param_attr=fluid.ParamAttr(name='bn_scale'),
......@@ -86,9 +96,10 @@ class TestSyncBatchNormOpTraining(TestSyncBatchNormRunnerBase):
moving_mean_name='bn_moving_mean',
moving_variance_name='bn_moving_variance',
data_layout=layout,
is_test=only_forward)
# if self.dtype == np.float16:
# bn = fluid.layers.cast(bn, 'float32')
is_test=only_forward,
)
if self.bn_dtype == np.float16:
bn = fluid.layers.cast(bn, 'float32')
sigmoid = fluid.layers.sigmoid(bn)
out = fluid.layers.reduce_sum(sigmoid)
# if not sync_bn:
......
......@@ -41,10 +41,10 @@ def DataTypeCast(date_type):
class TestCollectiveAPIRunnerBase(object):
def get_model(self, train_prog, startup_prog, rank, indata=None):
raise NotImplementedError(
"get model should be implemented by child class.")
"get model should be implemented by child class."
)
def run_trainer(self, args):
train_prog = fluid.Program()
......@@ -66,12 +66,12 @@ class TestCollectiveAPIRunnerBase(object):
fetch_list = []
for elem in result:
fetch_list.append(elem.name)
out = exe.run(train_prog,
feed={'tindata': indata},
fetch_list=fetch_list)
out = exe.run(
train_prog, feed={'tindata': indata}, fetch_list=fetch_list
)
else:
out = self.get_model(train_prog, startup_prog, rank, indata)
#print(out, sys.stderr)
# print(out, sys.stderr)
sys.stdout.buffer.write(pickle.dumps(out))
......@@ -96,19 +96,20 @@ from contextlib import closing
class TestDistBase(unittest.TestCase):
def setUp(self):
self._port_set = set()
self._trainers = 2
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._find_free_port(),
self._find_free_port(),
)
self._python_interp = sys.executable
def _find_free_port(self):
def __free_port():
with closing(socket.socket(socket.AF_INET,
socket.SOCK_STREAM)) as s:
with closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as s:
s.bind(('', 0))
return s.getsockname()[1]
......@@ -121,13 +122,13 @@ class TestDistBase(unittest.TestCase):
def _run_cluster(self, model_file, envs):
worker_endpoints = self._ps_endpoints.split(",")
w0_ep, w1_ep = worker_endpoints
#print("w0_ep:",w0_ep," w1_ep:",w1_ep)
# print("w0_ep:",w0_ep," w1_ep:",w1_ep)
env0 = {
"FLAGS_selected_mlus": "0",
"PADDLE_TRAINER_ID": "0",
"PADDLE_TRAINERS_NUM": "2",
"PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
"PADDLE_CURRENT_ENDPOINT": w0_ep
"PADDLE_CURRENT_ENDPOINT": w0_ep,
}
env1 = {
......@@ -135,9 +136,9 @@ class TestDistBase(unittest.TestCase):
"PADDLE_TRAINER_ID": "1",
"PADDLE_TRAINERS_NUM": "2",
"PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
"PADDLE_CURRENT_ENDPOINT": w1_ep
"PADDLE_CURRENT_ENDPOINT": w1_ep,
}
#update environment
# update environment
env0.update(envs)
env1.update(envs)
if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
......@@ -148,16 +149,20 @@ class TestDistBase(unittest.TestCase):
tr1_cmd = tr_cmd % (self._python_interp, model_file)
tr0_pipe = open("/tmp/tr0_err_%d.log" % os.getpid(), "w")
tr1_pipe = open("/tmp/tr1_err_%d.log" % os.getpid(), "w")
#print(tr0_cmd)
tr0_proc = subprocess.Popen(tr0_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr0_pipe,
env=env0)
tr1_proc = subprocess.Popen(tr0_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr1_pipe,
env=env1)
# print(tr0_cmd)
tr0_proc = subprocess.Popen(
tr0_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr0_pipe,
env=env0,
)
tr1_proc = subprocess.Popen(
tr1_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr1_pipe,
env=env1,
)
tr0_out, tr0_err = tr0_proc.communicate()
tr1_out, tr1_err = tr1_proc.communicate()
......@@ -170,17 +175,23 @@ class TestDistBase(unittest.TestCase):
sys.stderr.write('trainer 0 stderr file: %s\n' % f.read())
with open("/tmp/tr1_err_%d.log" % os.getpid(), "r") as f:
sys.stderr.write('trainer 1 stderr file: %s\n' % f.read())
return pickle.loads(tr0_out), pickle.loads(
tr1_out), tr0_proc.pid, tr1_proc.pid
def check_with_place(self,
model_file,
col_type,
data_type,
path_id="0",
static_mode="1",
check_error_log=False,
need_envs={}):
return (
pickle.loads(tr0_out),
pickle.loads(tr1_out),
tr0_proc.pid,
tr1_proc.pid,
)
def check_with_place(
self,
model_file,
col_type,
data_type,
path_id="0",
static_mode="1",
check_error_log=False,
need_envs={},
):
required_envs = {
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_eager_delete_tensor_gb": "0.0",
......@@ -194,7 +205,7 @@ class TestDistBase(unittest.TestCase):
"PADDLE_WITH_GLOO": '0',
"BACKEND": "cncl",
"PATH_ID": path_id,
"DATA_TYPE": data_type
"DATA_TYPE": data_type,
}
required_envs.update(need_envs)
if check_error_log:
......@@ -202,7 +213,8 @@ class TestDistBase(unittest.TestCase):
required_envs["GLOG_logtostderr"] = "1"
required_envs["GLOO_LOG_LEVEL"] = "TRACE"
tr0_out, tr1_out, pid0, pid1 = self._run_cluster(
model_file, required_envs)
model_file, required_envs
)
np_data_type = DataTypeCast(data_type)
np.random.seed(pid0)
input1 = np.random.random((10, 1000)).astype(np_data_type)
......@@ -210,21 +222,19 @@ class TestDistBase(unittest.TestCase):
input2 = np.random.random((10, 1000)).astype(np_data_type)
if col_type == "broadcast":
need_result = input2
np.testing.assert_allclose(tr0_out, need_result)
np.testing.assert_allclose(tr1_out, need_result)
np.testing.assert_allclose(tr0_out[0], need_result)
np.testing.assert_allclose(tr1_out[0], need_result)
elif col_type == "allreduce":
need_result = input1 + input2
np.testing.assert_allclose(tr0_out,
need_result,
rtol=1e-05,
atol=1e-05)
np.testing.assert_allclose(tr1_out,
need_result,
rtol=1e-05,
atol=1e-05)
np.testing.assert_allclose(
tr0_out[0], need_result, rtol=1e-05, atol=1e-05
)
np.testing.assert_allclose(
tr1_out[0], need_result, rtol=1e-05, atol=1e-05
)
elif col_type == "reduce":
need_result = input1 + input2
np.testing.assert_allclose(tr0_out, need_result)
np.testing.assert_allclose(tr0_out[0], need_result)
elif col_type == "allgather":
need_result = np.vstack((input1, input2))
tr_out0 = np.vstack((tr0_out[0], tr0_out[1]))
......
......@@ -53,10 +53,10 @@ def DataTypeCast(date_type):
class TestCollectiveRunnerBase(object):
def get_model(self, train_prog, startup_prog, col_type):
raise NotImplementedError(
"get model should be implemented by child class.")
"get model should be implemented by child class."
)
def wait_server_ready(self, endpoints):
while True:
......@@ -64,13 +64,15 @@ class TestCollectiveRunnerBase(object):
not_ready_endpoints = []
for ep in endpoints:
ip_port = ep.split(":")
with closing(socket.socket(socket.AF_INET,
socket.SOCK_STREAM)) as sock:
with closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as sock:
sock.settimeout(2)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
if hasattr(socket, 'SO_REUSEPORT'):
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT,
1)
sock.setsockopt(
socket.SOL_SOCKET, socket.SO_REUSEPORT, 1
)
result = sock.connect_ex((ip_port[0], int(ip_port[1])))
if result != 0:
......@@ -78,44 +80,51 @@ class TestCollectiveRunnerBase(object):
not_ready_endpoints.append(ep)
if not all_ok:
sys.stderr.write("server not ready, wait 3 sec to retry...\n")
sys.stderr.write("not ready endpoints:" +
str(not_ready_endpoints) + "\n")
sys.stderr.write(
"not ready endpoints:" + str(not_ready_endpoints) + "\n"
)
sys.stderr.flush()
time.sleep(3)
else:
break
# endpoints should be ["ip1:port1","ip2:port2"]
#endpoints should be ["ip1:port1","ip2:port2"]
def initCommunicator(self, program, rank, nranks, wait_port,
current_endpoint, endpoints):
def initCommunicator(
self, program, rank, nranks, wait_port, current_endpoint, endpoints
):
other_endpoints = endpoints[:]
other_endpoints.remove(current_endpoint)
if rank == 0 and wait_port:
self.wait_server_ready(other_endpoints)
block = program.global_block()
cncl_id_var = block.create_var(name=nameGen.generate('cncl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW)
block.append_op(type='c_gen_cncl_id',
inputs={},
outputs={'Out': cncl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints
})
block.append_op(type='c_comm_init',
inputs={'X': cncl_id_var},
outputs={},
attrs={
'nranks': nranks,
'rank': rank,
'ring_id': self.global_ring_id
})
cncl_id_var = block.create_var(
name=nameGen.generate('cncl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW,
)
block.append_op(
type='c_gen_cncl_id',
inputs={},
outputs={'Out': cncl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
},
)
block.append_op(
type='c_comm_init',
inputs={'X': cncl_id_var},
outputs={},
attrs={
'nranks': nranks,
'rank': rank,
'ring_id': self.global_ring_id,
},
)
def run_trainer(self, args):
train_prog = fluid.Program()
......@@ -124,8 +133,9 @@ class TestCollectiveRunnerBase(object):
rank = args["trainerid"]
current_endpoint = args["currentendpoint"]
nranks = 2
self.initCommunicator(startup_prog, rank, nranks, True,
current_endpoint, endpoints)
self.initCommunicator(
startup_prog, rank, nranks, True, current_endpoint, endpoints
)
self.rank = rank
result = self.get_model(train_prog, startup_prog, args["col_type"])
device_id = int(os.getenv("FLAGS_selected_mlus", "0"))
......@@ -135,9 +145,9 @@ class TestCollectiveRunnerBase(object):
np.random.seed(os.getpid())
np_data_type = DataTypeCast(args["data_type"])
indata = np.random.random((10, 1000)).astype(np_data_type)
out = exe.run(train_prog,
feed={'tindata': indata},
fetch_list=[result.name])
out = exe.run(
train_prog, feed={'tindata': indata}, fetch_list=[result.name]
)
sys.stdout.buffer.write(pickle.dumps(out))
......@@ -160,19 +170,20 @@ from contextlib import closing
class TestDistBase(unittest.TestCase):
def setUp(self):
self._port_set = set()
self._trainers = 2
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._find_free_port(),
self._find_free_port(),
)
self._python_interp = sys.executable
def _find_free_port(self):
def __free_port():
with closing(socket.socket(socket.AF_INET,
socket.SOCK_STREAM)) as s:
with closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as s:
s.bind(('', 0))
return s.getsockname()[1]
......@@ -191,7 +202,7 @@ class TestDistBase(unittest.TestCase):
"PADDLE_TRAINER_ID": "0",
"PADDLE_TRAINERS_NUM": "2",
"PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
"PADDLE_CURRENT_ENDPOINT": w0_ep
"PADDLE_CURRENT_ENDPOINT": w0_ep,
}
env1 = {
......@@ -199,9 +210,9 @@ class TestDistBase(unittest.TestCase):
"PADDLE_TRAINER_ID": "1",
"PADDLE_TRAINERS_NUM": "2",
"PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
"PADDLE_CURRENT_ENDPOINT": w1_ep
"PADDLE_CURRENT_ENDPOINT": w1_ep,
}
#update environment
# update environment
env0.update(envs)
env1.update(envs)
tr_cmd = "%s %s"
......@@ -210,15 +221,19 @@ class TestDistBase(unittest.TestCase):
tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb")
tr0_proc = subprocess.Popen(tr0_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr0_pipe,
env=env0)
tr0_proc = subprocess.Popen(
tr0_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr0_pipe,
env=env0,
)
tr1_proc = subprocess.Popen(tr0_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr1_pipe,
env=env1)
tr1_proc = subprocess.Popen(
tr1_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr1_pipe,
env=env1,
)
tr0_out, tr0_err = tr0_proc.communicate()
tr1_out, tr1_err = tr1_proc.communicate()
......@@ -227,15 +242,21 @@ class TestDistBase(unittest.TestCase):
# close trainer file
tr0_pipe.close()
tr1_pipe.close()
return pickle.loads(tr0_out), pickle.loads(
tr1_out), tr0_proc.pid, tr1_proc.pid
def check_with_place(self,
model_file,
col_type,
data_type,
check_error_log=False,
need_envs={}):
return (
pickle.loads(tr0_out),
pickle.loads(tr1_out),
tr0_proc.pid,
tr1_proc.pid,
)
def check_with_place(
self,
model_file,
col_type,
data_type,
check_error_log=False,
need_envs={},
):
required_envs = {
"FLAGS_eager_delete_tensor_gb": "0.0",
"PATH": os.getenv("PATH"),
......@@ -251,7 +272,8 @@ class TestDistBase(unittest.TestCase):
required_envs["GLOG_v"] = "3"
required_envs["GLOG_logtostderr"] = "1"
tr0_out, tr1_out, pid0, pid1 = self._run_cluster(
model_file, required_envs)
model_file, required_envs
)
np_data_type = DataTypeCast(data_type)
np.random.seed(pid0)
input1 = np.random.random((10, 1000)).astype(np_data_type)
......@@ -259,63 +281,55 @@ class TestDistBase(unittest.TestCase):
input2 = np.random.random((10, 1000)).astype(np_data_type)
if col_type == "broadcast":
need_result = input2
np.testing.assert_allclose(tr0_out, need_result)
np.testing.assert_allclose(tr1_out, need_result)
np.testing.assert_allclose(tr0_out[0], need_result)
np.testing.assert_allclose(tr1_out[0], need_result)
elif col_type == "allreduce_sum":
need_result = input1 + input2
np.testing.assert_allclose(tr0_out,
need_result,
rtol=1e-05,
atol=1e-05)
np.testing.assert_allclose(tr1_out,
need_result,
rtol=1e-05,
atol=1e-05)
np.testing.assert_allclose(
tr0_out[0], need_result, rtol=1e-05, atol=1e-05
)
np.testing.assert_allclose(
tr1_out[0], need_result, rtol=1e-05, atol=1e-05
)
elif col_type == "allreduce_prod":
need_result = input1 * input2
np.testing.assert_allclose(tr0_out,
need_result,
rtol=1e-05,
atol=1e-05)
np.testing.assert_allclose(tr1_out,
need_result,
rtol=1e-05,
atol=1e-05)
np.testing.assert_allclose(
tr0_out[0], need_result, rtol=1e-05, atol=1e-05
)
np.testing.assert_allclose(
tr1_out[0], need_result, rtol=1e-05, atol=1e-05
)
elif col_type == "allreduce_max":
need_result = np.maximum(input1, input2)
np.testing.assert_allclose(tr0_out,
need_result,
rtol=1e-05,
atol=1e-05)
np.testing.assert_allclose(tr1_out,
need_result,
rtol=1e-05,
atol=1e-05)
np.testing.assert_allclose(
tr0_out[0], need_result, rtol=1e-05, atol=1e-05
)
np.testing.assert_allclose(
tr1_out[0], need_result, rtol=1e-05, atol=1e-05
)
elif col_type == "allreduce_min":
need_result = np.minimum(input1, input2)
np.testing.assert_allclose(tr0_out,
need_result,
rtol=1e-05,
atol=1e-05)
np.testing.assert_allclose(tr1_out,
need_result,
rtol=1e-05,
atol=1e-05)
np.testing.assert_allclose(
tr0_out[0], need_result, rtol=1e-05, atol=1e-05
)
np.testing.assert_allclose(
tr1_out[0], need_result, rtol=1e-05, atol=1e-05
)
elif col_type == "reduce_sum":
need_result = input1 + input2
np.testing.assert_allclose(tr1_out, need_result)
np.testing.assert_allclose(tr1_out[0], need_result)
elif col_type == "reduce_prod":
need_result = input1 * input2
np.testing.assert_allclose(tr1_out, need_result)
np.testing.assert_allclose(tr1_out[0], need_result)
elif col_type == "reduce_max":
need_result = np.maximum(input1, input2)
np.testing.assert_allclose(tr1_out, need_result)
np.testing.assert_allclose(tr1_out[0], need_result)
elif col_type == "reduce_min":
need_result = np.minimum(input1, input2)
np.testing.assert_allclose(tr1_out, need_result)
np.testing.assert_allclose(tr1_out[0], need_result)
elif col_type == "allgather":
need_result = np.vstack((input1, input2))
np.testing.assert_allclose(tr0_out, need_result)
np.testing.assert_allclose(tr1_out, need_result)
np.testing.assert_allclose(tr0_out[0], need_result)
np.testing.assert_allclose(tr1_out[0], need_result)
else:
pass
......@@ -29,26 +29,44 @@ SEED = 2022
class TestDropoutOp(OpTest):
def setUp(self):
self.op_type = "dropout"
self.set_mlu()
self.init_dtype()
self.inputs = {'X': np.random.random((32, 64)).astype(self.dtype)}
self.init_inputs_shape()
self.init_attrs()
self.op_type = 'dropout'
self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)}
self.attrs = {
'dropout_prob': 0.0,
'fix_seed': True,
'is_test': False,
'dropout_implementation': 'upscale_in_train'
}
self.outputs = {
'Out': self.inputs['X'],
'Mask': np.ones((32, 64)).astype('uint8')
'dropout_prob': self.dropout_prob,
'fix_seed': self.fix_seed,
'is_test': self.is_test,
'dropout_implementation': self.dropout_implementation,
}
out = self.inputs['X'] * (1.0 - self.dropout_prob)
if not self.is_test:
mask = None
if self.dropout_prob == 0.0:
mask = np.ones(self.shape).astype('uint8')
elif self.dropout_prob == 1.0:
mask = np.zeros(self.shape).astype('uint8')
self.outputs = {'Out': out, 'Mask': mask}
else:
self.outputs = {'Out': out}
def init_dtype(self):
self.dtype = np.float32
def init_inputs_shape(self):
self.shape = [32, 64]
def init_attrs(self):
self.__class__.no_need_check_grad = False
self.dropout_prob = 0.0
self.fix_seed = True
self.is_test = False
self.dropout_implementation = "upscale_in_train"
def set_mlu(self):
self.__class__.use_mlu = True
self.place = paddle.device.MLUPlace(0)
......@@ -57,84 +75,107 @@ class TestDropoutOp(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
if (
hasattr(self.__class__, "no_need_check_grad")
and self.__class__.no_need_check_grad
):
return
self.check_grad_with_place(self.place, ['X'], 'Out')
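The expected outputs built in setUp lean on the deterministic corner cases of dropout: with 'upscale_in_train' the train output is mask * x / (1 - p) and the inference output is x, while with 'downgrade_in_infer' the train output is mask * x and the inference output is x * (1 - p). For every configuration these tests exercise (p in {0, 1} while training, or is_test=True), the expectation therefore collapses to x * (1 - p). A reference sketch of those semantics (illustrative, not the MLU kernel):

import numpy as np

def dropout_reference(x, mask, p, is_test, mode):
    if mode == 'upscale_in_train':
        if is_test:
            return x
        return np.zeros_like(x) if p == 1.0 else mask * x / (1.0 - p)
    # 'downgrade_in_infer'
    return x * (1.0 - p) if is_test else mask * x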
class TestDropoutOpInput1d(TestDropoutOp):
# change input shape
def setUp(self):
self.op_type = "dropout"
self.set_mlu()
self.init_dtype()
self.inputs = {'X': np.random.random((3, 62)).astype(self.dtype)}
self.attrs = {
'dropout_prob': 0.0,
'fix_seed': True,
'is_test': False,
'dropout_implementation': 'upscale_in_train'
}
self.outputs = {
'Out': self.inputs['X'],
'Mask': np.ones((3, 62)).astype('uint8')
}
class TestDropoutOpInput1d_1(TestDropoutOp):
# the input is 1-D
def setUp(self):
self.op_type = "dropout"
self.set_mlu()
self.init_dtype()
self.inputs = {'X': np.random.random((2000)).astype(self.dtype)}
self.attrs = {
'dropout_prob': 0.0,
'fix_seed': True,
'is_test': False,
'dropout_implementation': 'upscale_in_train'
}
self.outputs = {
'Out': self.inputs['X'],
'Mask': np.ones((2000)).astype('uint8')
}
def init_inputs_shape(self):
self.shape = [2000]
class TestDropoutOp2(TestDropoutOp):
# the dropout_prob is 1.0
def setUp(self):
self.op_type = "dropout"
self.set_mlu()
self.init_dtype()
self.inputs = {'X': np.random.random((32, 64)).astype(self.dtype)}
self.attrs = {
'dropout_prob': 1.0,
'fix_seed': True,
'is_test': False,
'dropout_implementation': 'upscale_in_train'
}
self.outputs = {
'Out': np.zeros((32, 64)).astype('float32'),
'Mask': np.zeros((32, 64)).astype('uint8')
}
def init_inputs_shape(self):
self.shape = [32, 64]
def init_attrs(self):
self.dropout_prob = 1.0
self.fix_seed = True
self.is_test = False
self.dropout_implementation = "upscale_in_train"
class TestDropoutOp3(TestDropoutOp):
# the input dim is 3
def init_inputs_shape(self):
self.shape = [32, 64, 2]
class TestDropoutOp4(TestDropoutOp):
def init_attrs(self):
self.__class__.no_need_check_grad = True
self.dropout_prob = 0.35
self.fix_seed = True
self.is_test = True
self.dropout_implementation = "downgrade_in_infer"
class TestDropoutOp5(TestDropoutOp):
def init_inputs_shape(self):
self.shape = [32, 64, 3]
def init_attrs(self):
self.__class__.no_need_check_grad = True
self.dropout_prob = 0.75
self.fix_seed = True
self.is_test = True
self.dropout_implementation = "downgrade_in_infer"
class TestDropoutOp6(TestDropoutOp):
def init_attrs(self):
self.__class__.no_need_check_grad = True
self.dropout_prob = 0.0
self.fix_seed = True
self.is_test = False
self.dropout_implementation = "downgrade_in_infer"
class TestDropoutOpWithSeed(TestDropoutOp):
# the seed is a Tensor
def setUp(self):
self.op_type = "dropout"
self.set_mlu()
self.init_dtype()
self.inputs = {'X': np.random.random((32, 64, 2)).astype(self.dtype)}
self.dtype = np.float32
self.inputs = {
"X": np.random.random((32, 64)).astype(self.dtype),
"Seed": np.asarray([125], dtype="int32"),
}
self.attrs = {
'dropout_prob': 0.0,
'fix_seed': True,
'is_test': False,
'dropout_implementation': 'upscale_in_train'
'dropout_implementation': 'upscale_in_train',
}
self.outputs = {
'Out': self.inputs['X'],
'Mask': np.ones((32, 64, 2)).astype('uint8')
'Mask': np.ones((32, 64)).astype('uint8'),
}
def set_mlu(self):
self.__class__.use_mlu = True
self.place = paddle.device.MLUPlace(0)
def test_check_output(self):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['X'], 'Out')
class TestDropoutOpFp16(TestDropoutOp):
# float16
def init_dtype(self):
self.dtype = np.float16
def set_mlu(self):
self.__class__.use_mlu = True
self.place = paddle.device.MLUPlace(0)
self.__class__.no_need_check_grad = True
@skip_check_grad_ci(reason="For inference, check_grad is not required.")
class TestDropoutOpInference(OpTest):
......@@ -148,7 +189,7 @@ class TestDropoutOpInference(OpTest):
'dropout_prob': 0.35,
'fix_seed': True,
'is_test': True,
'dropout_implementation': 'upscale_in_train'
'dropout_implementation': 'upscale_in_train',
}
self.outputs = {'Out': self.inputs['X']}
......@@ -165,7 +206,6 @@ class TestDropoutOpInference(OpTest):
@skip_check_grad_ci(reason="For inference, check_grad is not required.")
class TestDropoutOpInference2(TestDropoutOpInference):
def setUp(self):
self.op_type = "dropout"
self.set_mlu()
......@@ -174,45 +214,12 @@ class TestDropoutOpInference2(TestDropoutOpInference):
self.attrs = {
'dropout_prob': 0.75,
'is_test': True,
'dropout_implementation': 'upscale_in_train'
'dropout_implementation': 'upscale_in_train',
}
self.outputs = {'Out': self.inputs['X']}
class TestDropoutOpWithSeed(TestDropoutOp):
# the seed is a Tensor
def setUp(self):
self.op_type = "dropout"
self.set_mlu()
self.init_dtype()
self.inputs = {
"X": np.random.random((32, 64)).astype(self.dtype),
"Seed": np.asarray([125], dtype="int32")
}
self.attrs = {
'dropout_prob': 0.0,
'is_test': False,
'dropout_implementation': 'upscale_in_train'
}
self.outputs = {
'Out': self.inputs['X'],
'Mask': np.ones((32, 64)).astype('uint8')
}
class TestDropoutOpFp16(TestDropoutOp):
# float16
def init_dtype(self):
self.dtype = np.float16
def set_mlu(self):
self.__class__.use_mlu = True
self.place = paddle.device.MLUPlace(0)
self.__class__.no_need_check_grad = True
class TestDropoutAPI(unittest.TestCase):
def setUp(self):
np.random.seed(123)
self.places = [fluid.CPUPlace(), paddle.device.MLUPlace(0)]
......@@ -220,43 +227,44 @@ class TestDropoutAPI(unittest.TestCase):
def check_static_result(self, place):
with fluid.program_guard(fluid.Program(), fluid.Program()):
input = fluid.data(name="input", shape=[40, 40], dtype="float32")
res1 = paddle.nn.functional.dropout(x=input,
p=0.,
training=False,
mode='upscale_in_train')
res2 = paddle.nn.functional.dropout(x=input,
p=0.,
axis=0,
training=True,
mode='upscale_in_train')
res3 = paddle.nn.functional.dropout(x=input,
p=0.,
axis=0,
training=False,
mode='upscale_in_train')
res4 = paddle.nn.functional.dropout(x=input,
p=0.,
axis=[0, 1],
training=True,
mode='upscale_in_train')
res5 = paddle.nn.functional.dropout(x=input,
p=0.,
axis=[0, 1],
training=False,
mode='upscale_in_train')
res6 = paddle.nn.functional.dropout(x=input,
p=1.,
training=True,
mode='upscale_in_train')
res1 = paddle.nn.functional.dropout(
x=input, p=0.0, training=False, mode='upscale_in_train'
)
res2 = paddle.nn.functional.dropout(
x=input, p=0.0, axis=0, training=True, mode='upscale_in_train'
)
res3 = paddle.nn.functional.dropout(
x=input, p=0.0, axis=0, training=False, mode='upscale_in_train'
)
res4 = paddle.nn.functional.dropout(
x=input,
p=0.0,
axis=[0, 1],
training=True,
mode='upscale_in_train',
)
res5 = paddle.nn.functional.dropout(
x=input,
p=0.0,
axis=[0, 1],
training=False,
mode='upscale_in_train',
)
res6 = paddle.nn.functional.dropout(
x=input, p=1.0, training=True, mode='upscale_in_train'
)
res7 = paddle.fluid.layers.dropout(
x=input,
dropout_prob=0.,
dropout_implementation='upscale_in_train')
res8 = paddle.nn.functional.dropout(x=input,
p=0.,
axis=(0, 1),
training=False,
mode='upscale_in_train')
dropout_prob=0.0,
dropout_implementation='upscale_in_train',
)
res8 = paddle.nn.functional.dropout(
x=input,
p=0.0,
axis=(0, 1),
training=False,
mode='upscale_in_train',
)
in_np = np.random.random([40, 40]).astype("float32")
res_np = in_np
......@@ -265,13 +273,17 @@ class TestDropoutAPI(unittest.TestCase):
exe = fluid.Executor(place)
res_list = [res1, res2, res3, res4, res5, res7, res8]
for res in res_list:
fetches = exe.run(fluid.default_main_program(),
feed={"input": in_np},
fetch_list=[res])
fetches = exe.run(
fluid.default_main_program(),
feed={"input": in_np},
fetch_list=[res],
)
np.testing.assert_allclose(fetches[0], res_np)
fetches2 = exe.run(fluid.default_main_program(),
feed={"input": in_np},
fetch_list=[res6])
fetches2 = exe.run(
fluid.default_main_program(),
feed={"input": in_np},
fetch_list=[res6],
)
np.testing.assert_allclose(fetches2[0], res_np2)
def test_static(self):
......
......@@ -28,12 +28,15 @@ def AffineGrid(theta, grid_shape):
n = grid_shape[0]
h = grid_shape[1]
w = grid_shape[2]
h_idx = np.repeat(np.linspace(-1, 1, h)[np.newaxis, :], w,
axis=0).T[:, :, np.newaxis]
w_idx = np.repeat(np.linspace(-1, 1, w)[np.newaxis, :], h,
axis=0)[:, :, np.newaxis]
grid = np.concatenate([w_idx, h_idx, np.ones([h, w, 1])],
axis=2) # h * w * 3
h_idx = np.repeat(np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[
:, :, np.newaxis
]
w_idx = np.repeat(np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[
:, :, np.newaxis
]
grid = np.concatenate(
[w_idx, h_idx, np.ones([h, w, 1])], axis=2
) # h * w * 3
grid = np.repeat(grid[np.newaxis, :], n, axis=0) # n * h * w *3
ret = np.zeros([n, h * w, 2])
......@@ -53,13 +56,17 @@ def getGridPointValue(data, x, y):
out_H = x.shape[1]
out_W = x.shape[2]
#out = np.zeros(data_shape, dtype='float32')
# out = np.zeros(data_shape, dtype='float32')
out = np.zeros([N, C, out_H, out_W], dtype='float32')
for i in range(N):
for j in range(out_H):
for k in range(out_W):
if y[i, j, k] < 0 or y[i, j, k] > in_H - 1 or x[
i, j, k] < 0 or x[i, j, k] > in_W - 1:
if (
y[i, j, k] < 0
or y[i, j, k] > in_H - 1
or x[i, j, k] < 0
or x[i, j, k] > in_W - 1
):
out[i, :, j, k] = 0
else:
out[i, :, j, k] = data[i, :, y[i, j, k], x[i, j, k]]
......@@ -75,27 +82,28 @@ def unnormalizeAndClip(grid_slice, max_val, align_corners, padding_mode):
if align_corners:
grid_slice = 0.5 * ((grid_slice.astype('float32') + 1.0) * max_val)
else:
grid_slice = 0.5 * ((grid_slice.astype('float32') + 1.0) *
(max_val + 1)) - 0.5
grid_slice = (
0.5 * ((grid_slice.astype('float32') + 1.0) * (max_val + 1)) - 0.5
)
if padding_mode == "border":
grid_slice = clip(grid_slice, 0, max_val)
elif padding_mode == "reflection":
double_range = 2 * max_val if align_corners else (max_val + 1) * 2
grid_abs = np.abs(grid_slice) if align_corners else np.abs(grid_slice +
0.5)
grid_abs = (
np.abs(grid_slice) if align_corners else np.abs(grid_slice + 0.5)
)
extra = grid_abs - np.floor(grid_abs / double_range) * double_range
grid_slice = np.minimum(extra, double_range - extra)
grid_slice = grid_slice if align_corners else clip(
grid_slice - 0.5, 0, max_val)
grid_slice = (
grid_slice if align_corners else clip(grid_slice - 0.5, 0, max_val)
)
return grid_slice
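With align_corners=True the normalized coordinate range [-1, 1] maps onto pixel centers [0, max_val]; otherwise it maps onto pixel edges [-0.5, max_val + 0.5] before the padding mode is applied. A quick numeric check of the two mappings (values assumed):

import numpy as np

g = np.array([-1.0, 0.0, 1.0])
max_val = 4                                     # e.g. W - 1 for a width-5 map
print(0.5 * (g + 1.0) * max_val)                # [0. 2. 4.]
print(0.5 * (g + 1.0) * (max_val + 1) - 0.5)    # [-0.5  2.   4.5]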
def GridSampler(data,
grid,
align_corners=True,
mode="bilinear",
padding_mode="zeros"):
def GridSampler(
data, grid, align_corners=True, mode="bilinear", padding_mode="zeros"
):
dims = data.shape
N = dims[0]
in_C = dims[1]
......@@ -119,14 +127,18 @@ def GridSampler(data,
y0 = np.floor(y).astype('int32')
y1 = y0 + 1
wa = np.tile(((x1 - x) * (y1 - y)).reshape((N, 1, out_H, out_W)),
(1, in_C, 1, 1))
wb = np.tile(((x1 - x) * (y - y0)).reshape((N, 1, out_H, out_W)),
(1, in_C, 1, 1))
wc = np.tile(((x - x0) * (y1 - y)).reshape((N, 1, out_H, out_W)),
(1, in_C, 1, 1))
wd = np.tile(((x - x0) * (y - y0)).reshape((N, 1, out_H, out_W)),
(1, in_C, 1, 1))
wa = np.tile(
((x1 - x) * (y1 - y)).reshape((N, 1, out_H, out_W)), (1, in_C, 1, 1)
)
wb = np.tile(
((x1 - x) * (y - y0)).reshape((N, 1, out_H, out_W)), (1, in_C, 1, 1)
)
wc = np.tile(
((x - x0) * (y1 - y)).reshape((N, 1, out_H, out_W)), (1, in_C, 1, 1)
)
wd = np.tile(
((x - x0) * (y - y0)).reshape((N, 1, out_H, out_W)), (1, in_C, 1, 1)
)
va = getGridPointValue(data, x0, y0)
vb = getGridPointValue(data, x0, y1)
......@@ -142,7 +154,6 @@ def GridSampler(data,
class TestGridSamplerOp(OpTest):
def setUp(self):
self.place = paddle.device.MLUPlace(0)
self.__class__.use_mlu = True
......@@ -166,12 +177,12 @@ class TestGridSamplerOp(OpTest):
'use_cudnn': False,
"align_corners": self.align_corners,
"padding_mode": self.padding_mode,
"mode": self.mode
"mode": self.mode,
}
self.outputs = {
'Output':
GridSampler(x, grid, self.align_corners, self.mode,
self.padding_mode)
'Output': GridSampler(
x, grid, self.align_corners, self.mode, self.padding_mode
)
}
def test_check_output(self):
......@@ -186,20 +197,17 @@ class TestGridSamplerOp(OpTest):
self.mode = "bilinear"
# TODO(fwg): Test this case when cnnl supports align_corners = True.
# class Case1(TestGridSamplerOp):
#
# def initTestCase(self):
# self.x_shape = (2, 3, 5, 6)
# self.grid_shape = (2, 8, 9, 2)
# self.theta_shape = (2, 2, 3)
# self.align_corners = True
# self.padding_mode = "zeros"
# self.mode = "bilinear"
class Case1(TestGridSamplerOp):
def initTestCase(self):
self.x_shape = (2, 3, 5, 6)
self.grid_shape = (2, 8, 9, 2)
self.theta_shape = (2, 2, 3)
self.align_corners = True
self.padding_mode = "zeros"
self.mode = "bilinear"
class LargeInputCase(TestGridSamplerOp):
def initTestCase(self):
self.x_shape = (2, 3, 128, 128)
self.grid_shape = (2, 130, 130, 2)
......@@ -209,16 +217,15 @@ class LargeInputCase(TestGridSamplerOp):
self.mode = "bilinear"
# TODO(fwg): Test this case when cnnl supports align_corners = True.
# class Case2(LargeInputCase):
#
# def initTestCase(self):
# self.x_shape = (2, 3, 128, 128)
# self.grid_shape = (2, 130, 130, 2)
# self.theta_shape = (2, 2, 3)
# self.align_corners = True
# self.padding_mode = "zeros"
# self.mode = "bilinear"
class Case2(LargeInputCase):
def initTestCase(self):
self.x_shape = (2, 3, 128, 128)
self.grid_shape = (2, 130, 130, 2)
self.theta_shape = (2, 2, 3)
self.align_corners = True
self.padding_mode = "zeros"
self.mode = "bilinear"
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import sys
sys.path.append("..")
from op_test import OpTest
import paddle
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
paddle.enable_static()
def huber_loss_forward(val, delta):
abs_val = abs(val)
if abs_val <= delta:
return 0.5 * val * val
else:
return delta * (abs_val - 0.5 * delta)
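Worked values for delta = 1.0, one per branch:

print(huber_loss_forward(0.5, 1.0))     # quadratic branch: 0.5 * 0.5 * 0.5   = 0.125
print(huber_loss_forward(-2.0, 1.0))    # linear branch:    1.0 * (2.0 - 0.5) = 1.5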
class TestHuberLossOp(OpTest):
def setUp(self):
self.op_type = 'huber_loss'
self.set_mlu()
self.python_api = paddle.fluid.layers.huber_loss
self.python_out_sig = ["Out"]
self.delta = 1.0
self.init_input()
shape = self.set_shape()
residual = self.inputs['Y'] - self.inputs['X']
loss = np.vectorize(huber_loss_forward)(residual, self.delta).astype(
'float32'
)
self.attrs = {'delta': self.delta}
self.outputs = {'Residual': residual, 'Out': loss.reshape(shape)}
def init_input(self):
shape = self.set_shape()
self.inputs = {
'X': np.random.uniform(0, 1.0, shape).astype('float32'),
'Y': np.random.uniform(0, 1.0, shape).astype('float32'),
}
def set_mlu(self):
self.__class__.use_mlu = True
self.place = paddle.MLUPlace(0)
def set_shape(self):
return (100, 1)
def test_check_output(self):
self.check_output_with_place(self.place, atol=1e-3)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
def test_check_grad_ingore_x(self):
self.check_grad_with_place(
self.place,
['Y'],
'Out',
max_relative_error=0.008,
no_grad_set=set("residual"),
)
def test_check_grad_ingore_y(self):
self.check_grad_with_place(
self.place,
['X'],
'Out',
max_relative_error=0.008,
no_grad_set=set(['residual']),
)
class TestHuberLossOp1(TestHuberLossOp):
def set_shape(self):
return 64
class TestHuberLossOp2(TestHuberLossOp):
def set_shape(self):
return (6, 6)
class TestHuberLossOp3(TestHuberLossOp):
def set_shape(self):
return (6, 6, 1)
class TestHuberLossOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# the input and label must be Variable
xw = np.random.random((6, 6)).astype("float32")
xr = fluid.data(name='xr', shape=[None, 6], dtype="float32")
lw = np.random.random((6, 6)).astype("float32")
lr = fluid.data(name='lr', shape=[None, 6], dtype="float32")
delta = 1.0
self.assertRaises(TypeError, fluid.layers.huber_loss, xr, lw, delta)
self.assertRaises(TypeError, fluid.layers.huber_loss, xw, lr, delta)
# the dtype of input and label must be float32 or float64
xw2 = fluid.data(name='xw2', shape=[None, 6], dtype="int32")
lw2 = fluid.data(name='lw2', shape=[None, 6], dtype="int32")
self.assertRaises(
TypeError, fluid.layers.huber_loss, xw2, lr, delta
)
self.assertRaises(
TypeError, fluid.layers.huber_loss, xr, lw2, delta
)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append('..')
import unittest
import paddle
import numpy as np
from paddle import _C_ops, _legacy_C_ops
from paddle.fluid.framework import in_dygraph_mode
def run_adam_op(
params,
grads,
lrs,
moment1s,
moment2s,
beta1_pows,
beta2_pows,
master_params,
epsilon,
beta1,
beta2,
place,
multi_precision=False,
use_merged=False,
):
assert len(params) == len(grads)
assert len(params) == len(lrs)
assert len(params) == len(moment1s)
assert len(params) == len(moment2s)
assert len(params) == len(beta1_pows)
assert len(params) == len(beta2_pows)
assert len(params) == len(master_params)
paddle.disable_static()
# paddle.set_device(place)
param_vars = [paddle.fluid.dygraph.to_variable(p) for p in params]
grad_vars = [paddle.fluid.dygraph.to_variable(g) for g in grads]
lr_vars = [paddle.fluid.dygraph.to_variable(l) for l in lrs]
moment1_vars = [paddle.fluid.dygraph.to_variable(m) for m in moment1s]
moment2_vars = [paddle.fluid.dygraph.to_variable(m) for m in moment2s]
beta1_pow_vars = [paddle.fluid.dygraph.to_variable(b) for b in beta1_pows]
beta2_pow_vars = [paddle.fluid.dygraph.to_variable(b) for b in beta2_pows]
master_param_vars = [
paddle.fluid.dygraph.to_variable(m_p) for m_p in master_params
]
if not use_merged:
for i in range(len(param_vars)):
_, _, _, _, _, _ = _legacy_C_ops.adam(
param_vars[i],
grad_vars[i],
lr_vars[i],
moment1_vars[i],
moment2_vars[i],
beta1_pow_vars[i],
beta2_pow_vars[i],
master_param_vars[i],
param_vars[i],
moment1_vars[i],
moment2_vars[i],
beta1_pow_vars[i],
beta2_pow_vars[i],
master_param_vars[i],
'epsilon',
epsilon,
'beta1',
beta1,
'beta2',
beta2,
'multi_precision',
multi_precision,
)
else:
if in_dygraph_mode():
_, _, _, _, _, _ = _C_ops.merged_adam_(
param_vars,
grad_vars,
lr_vars,
moment1_vars,
moment2_vars,
beta1_pow_vars,
beta2_pow_vars,
master_param_vars,
beta1,
beta2,
epsilon,
multi_precision,
False,
)
else:
_, _, _, _, _, _ = _legacy_C_ops.merged_adam(
param_vars,
grad_vars,
lr_vars,
moment1_vars,
moment2_vars,
beta1_pow_vars,
beta2_pow_vars,
master_param_vars,
param_vars,
moment1_vars,
moment2_vars,
beta1_pow_vars,
beta2_pow_vars,
master_param_vars,
'epsilon',
epsilon,
'beta1',
beta1,
'beta2',
beta2,
'multi_precision',
multi_precision,
)
outputs = {
'ParamOut': param_vars,
'Moment1Out': moment1_vars,
'Moment2Out': moment2_vars,
'Beta1PowOut': beta1_pow_vars,
'Beta2PowOut': beta2_pow_vars,
'MasterParamOut': master_param_vars,
}
return outputs
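Both the per-tensor loop and the merged path are expected to perform the same per-parameter Adam step, which is what the comparison in check_with_place relies on. A NumPy sketch of that step under the standard Adam formulation (moment1/moment2 are the first and second moments, the beta pows the running powers; a reference, not Paddle's kernel):

import numpy as np

def adam_step(p, g, lr, m1, m2, b1p, b2p, beta1, beta2, eps):
    m1 = beta1 * m1 + (1 - beta1) * g
    m2 = beta2 * m2 + (1 - beta2) * g * g
    lr_t = lr * np.sqrt(1 - b2p) / (1 - b1p)    # bias-corrected learning rate
    p = p - lr_t * m1 / (np.sqrt(m2) + eps)
    return p, m1, m2, b1p * beta1, b2p * beta2  # pows advance one step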
class TestMergedAdam(unittest.TestCase):
def setUp(self):
paddle.disable_static()
self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]]
self.seed = 10
self.place = paddle.device.MLUPlace(0)
self.__class__.use_mlu = True
def gen_rand_data(self, shapes, dtype):
return [np.random.random(s).astype(dtype) for s in shapes]
def prepare_data(self, shapes, multi_precision, seed, place):
np.random.seed(seed)
mp_dtype = np.float32
# dtype = np.float16 if multi_precision and place == 'mlu' else np.float32
dtype = np.float32
params = self.gen_rand_data(shapes, dtype)
grads = self.gen_rand_data(shapes, dtype)
lrs = self.gen_rand_data([[1], [1], [1], [1]], mp_dtype)
moment1s = self.gen_rand_data(shapes, mp_dtype)
moment2s = self.gen_rand_data(shapes, mp_dtype)
beta1_pows = self.gen_rand_data([[1], [1], [1], [1]], mp_dtype)
beta2_pows = self.gen_rand_data([[1], [1], [1], [1]], mp_dtype)
master_params = [p.astype(mp_dtype) for p in params]
return (
params,
grads,
lrs,
moment1s,
moment2s,
beta1_pows,
beta2_pows,
master_params,
)
def check_with_place(self, place, multi_precision):
(
params,
grads,
lrs,
moment1s,
moment2s,
beta1_pows,
beta2_pows,
master_params,
) = self.prepare_data(self.shapes, multi_precision, self.seed, place)
def run_op(use_merged):
return run_adam_op(
params=params,
grads=grads,
lrs=lrs,
moment1s=moment1s,
moment2s=moment2s,
beta1_pows=beta1_pows,
beta2_pows=beta2_pows,
master_params=master_params,
epsilon=0.9,
beta1=0.9,
beta2=0.99,
place=place,
multi_precision=multi_precision,
use_merged=use_merged,
)
outs1 = run_op(True)
outs2 = run_op(False)
self.assertEqual(len(outs1), len(outs2))
for key in outs1.keys():
value1 = outs1[key]
value2 = outs2[key]
for i in range(len(value1)):
if place == 'mlu':
np.testing.assert_array_equal(value1[i], value2[i])
else:
np.testing.assert_allclose(
value1[i], value2[i], rtol=1e-05, atol=1e-07
)
def test_main(self):
for multi_precision in [False, True]:
self.check_with_place(self.place, multi_precision)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import sys
sys.path.append('..')
import numpy as np
from op_test import OpTest
import paddle.fluid as fluid
import paddle
import math
paddle.enable_static()
class TestMLUPriorBox(OpTest):
def setUp(self):
self.op_type = "prior_box"
self.set_mlu()
self.init_dtype()
self.set_data()
def test_check_output(self):
self.check_output_with_place(self.place)
def set_mlu(self):
self.__class__.use_mlu = True
self.place = paddle.MLUPlace(0)
def init_dtype(self):
self.dtype = np.float32
def set_data(self):
self.init_test_params()
self.init_test_input()
self.init_test_output()
self.inputs = {'Input': self.input, 'Image': self.image}
self.attrs = {
'min_sizes': self.min_sizes,
'aspect_ratios': self.aspect_ratios,
'variances': self.variances,
'flip': self.flip,
'clip': self.clip,
'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order,
'step_w': self.step_w,
'step_h': self.step_h,
'offset': self.offset,
}
if len(self.max_sizes) > 0:
self.attrs['max_sizes'] = self.max_sizes
self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}
def set_max_sizes(self):
max_sizes = [5, 10]
self.max_sizes = np.array(max_sizes).astype('float32').tolist()
def set_min_max_aspect_ratios_order(self):
self.min_max_aspect_ratios_order = True
def init_test_params(self):
self.layer_w = 32
self.layer_h = 32
self.image_w = 40
self.image_h = 40
self.step_w = float(self.image_w) / float(self.layer_w)
self.step_h = float(self.image_h) / float(self.layer_h)
self.input_channels = 2
self.image_channels = 3
self.batch_size = 10
self.min_sizes = [2, 4]
self.min_sizes = np.array(self.min_sizes).astype('float32').tolist()
self.set_max_sizes()
self.aspect_ratios = [2.0, 3.0]
self.flip = True
self.set_min_max_aspect_ratios_order()
self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
self.aspect_ratios = np.array(
self.aspect_ratios, dtype=np.float64
).flatten()
self.variances = [0.1, 0.1, 0.2, 0.2]
self.variances = np.array(self.variances, dtype=np.float64).flatten()
self.clip = True
self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes)
if len(self.max_sizes) > 0:
self.num_priors += len(self.max_sizes)
self.offset = 0.5
def init_test_input(self):
self.image = np.random.random(
(self.batch_size, self.image_channels, self.image_w, self.image_h)
).astype('float32')
self.input = np.random.random(
(self.batch_size, self.input_channels, self.layer_w, self.layer_h)
).astype('float32')
def init_test_output(self):
out_dim = (self.layer_h, self.layer_w, self.num_priors, 4)
out_boxes = np.zeros(out_dim).astype('float32')
out_var = np.zeros(out_dim).astype('float32')
idx = 0
for h in range(self.layer_h):
for w in range(self.layer_w):
c_x = (w + self.offset) * self.step_w
c_y = (h + self.offset) * self.step_h
idx = 0
for s in range(len(self.min_sizes)):
min_size = self.min_sizes[s]
if not self.min_max_aspect_ratios_order:
# rest of priors
for r in range(len(self.real_aspect_ratios)):
ar = self.real_aspect_ratios[r]
c_w = min_size * math.sqrt(ar) / 2
c_h = (min_size / math.sqrt(ar)) / 2
out_boxes[h, w, idx, :] = [
(c_x - c_w) / self.image_w,
(c_y - c_h) / self.image_h,
(c_x + c_w) / self.image_w,
(c_y + c_h) / self.image_h,
]
idx += 1
if len(self.max_sizes) > 0:
max_size = self.max_sizes[s]
# second prior: aspect_ratio = 1, size = sqrt(min_size * max_size)
c_w = c_h = math.sqrt(min_size * max_size) / 2
out_boxes[h, w, idx, :] = [
(c_x - c_w) / self.image_w,
(c_y - c_h) / self.image_h,
(c_x + c_w) / self.image_w,
(c_y + c_h) / self.image_h,
]
idx += 1
else:
c_w = c_h = min_size / 2.0
out_boxes[h, w, idx, :] = [
(c_x - c_w) / self.image_w,
(c_y - c_h) / self.image_h,
(c_x + c_w) / self.image_w,
(c_y + c_h) / self.image_h,
]
idx += 1
if len(self.max_sizes) > 0:
max_size = self.max_sizes[s]
# second prior: aspect_ratio = 1, size = sqrt(min_size * max_size)
c_w = c_h = math.sqrt(min_size * max_size) / 2
out_boxes[h, w, idx, :] = [
(c_x - c_w) / self.image_w,
(c_y - c_h) / self.image_h,
(c_x + c_w) / self.image_w,
(c_y + c_h) / self.image_h,
]
idx += 1
# rest of priors
for r in range(len(self.real_aspect_ratios)):
ar = self.real_aspect_ratios[r]
if abs(ar - 1.0) < 1e-6:
continue
c_w = min_size * math.sqrt(ar) / 2
c_h = (min_size / math.sqrt(ar)) / 2
out_boxes[h, w, idx, :] = [
(c_x - c_w) / self.image_w,
(c_y - c_h) / self.image_h,
(c_x + c_w) / self.image_w,
(c_y + c_h) / self.image_h,
]
idx += 1
# clip the prior's coordinates so that they are within [0, 1]
if self.clip:
out_boxes = np.clip(out_boxes, 0.0, 1.0)
# set the variance.
out_var = np.tile(
self.variances, (self.layer_h, self.layer_w, self.num_priors, 1)
)
self.out_boxes = out_boxes.astype('float32')
self.out_var = out_var.astype('float32')
class TestMLUPriorBoxWithoutMaxSize(TestMLUPriorBox):
def set_max_sizes(self):
self.max_sizes = []
class TestMLUPriorBoxWithoutSpecifiedOutOrder(TestMLUPriorBox):
def set_min_max_aspect_ratios_order(self):
self.min_max_aspect_ratios_order = False
if __name__ == '__main__':
unittest.main()
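As a worked example of the geometry in init_test_output: for the default config (40x40 image, 32x32 layer, offset 0.5, min_size 2, min_max_aspect_ratios_order=True), the first prior of cell (0, 0) comes out as below; the values follow directly from the code above.

step = 40 / 32                               # step_w == step_h == 1.25
c = (0 + 0.5) * step                         # centre of cell (0, 0): 0.625
half = 2 / 2.0                               # first prior: min_size / 2
box = [(c - half) / 40, (c - half) / 40, (c + half) / 40, (c + half) / 40]
print([min(max(v, 0.0), 1.0) for v in box])  # clip=True -> [0.0, 0.0, 0.040625, 0.040625]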
......@@ -26,7 +26,6 @@ paddle.enable_static()
class TestMLUReduceSumOp(OpTest):
def setUp(self):
self.init_op_type()
self.initTestCase()
......@@ -34,16 +33,16 @@ class TestMLUReduceSumOp(OpTest):
self.attrs = {
'dim': self.axis,
'keep_dim': self.keep_dim,
'reduce_all': self.reduce_all
'reduce_all': self.reduce_all,
}
self.inputs = {'X': np.random.random(self.shape).astype("float32")}
if self.attrs['reduce_all']:
self.outputs = {'Out': self.inputs['X'].sum()}
else:
self.outputs = {
'Out':
self.inputs['X'].sum(axis=self.axis,
keepdims=self.attrs['keep_dim'])
'Out': self.inputs['X'].sum(
axis=self.axis, keepdims=self.attrs['keep_dim']
)
}
def set_mlu(self):
......@@ -64,100 +63,92 @@ class TestMLUReduceSumOp(OpTest):
def initTestCase(self):
self.shape = (5, 6, 10)
self.axis = (0, )
self.axis = (0,)
class TestSumOp5D(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (1, 2, 5, 6, 10)
self.axis = (0, )
self.axis = (0,)
class TestSumOp6D(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (1, 1, 2, 5, 6, 10)
self.axis = (0, )
self.axis = (0,)
class TestSumOp8D(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (1, 3, 1, 2, 1, 4, 3, 10)
self.axis = (0, 3)
class Test1DReduce(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = 120
self.axis = (0, )
self.axis = (0,)
class Test2DReduce0(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (20, 10)
self.axis = (0, )
self.axis = (0,)
class Test2DReduce1(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (20, 10)
self.axis = (1, )
self.axis = (1,)
class Test3DReduce0(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (5, 6, 7)
self.axis = (1, )
self.axis = (1,)
class Test3DReduce1(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (5, 6, 7)
self.axis = (2, )
self.axis = (2,)
class Test3DReduce2(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (5, 6, 7)
self.axis = (-2, )
self.axis = (-2,)
class Test3DReduce3(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (5, 6, 7)
self.axis = (1, 2)
class TestKeepDimReduce(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (5, 6, 10)
self.axis = (1, )
self.axis = (1,)
self.keep_dim = True
class TestKeepDim8DReduce(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (2, 5, 3, 2, 2, 3, 4, 2)
self.axis = (3, 4, 5)
self.keep_dim = True
def test_check_grad(self):
self.check_grad_with_place(
self.place, ['X'], 'Out', max_relative_error=0.03
)
class TestReduceAll(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (5, 6, 2, 10)
self.axis = (0, )
self.axis = (0,)
self.reduce_all = True
......
......@@ -31,7 +31,6 @@ paddle.enable_static()
# Situation 1: starts(list, no tensor), ends(list, no tensor)
# 1.1 without attr(decrease)
class TestSliceOp(OpTest):
def setUp(self):
self.op_type = "slice"
self.set_mlu()
......@@ -42,7 +41,7 @@ class TestSliceOp(OpTest):
'axes': self.axes,
'starts': self.starts,
'ends': self.ends,
'infer_flags': self.infer_flags
'infer_flags': self.infer_flags,
}
def config(self):
......@@ -57,9 +56,9 @@ class TestSliceOp(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
def set_mlu(self):
self.__class__.use_mlu = True
......@@ -67,7 +66,6 @@ class TestSliceOp(OpTest):
class TestCase1(TestSliceOp):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [-3, 0, 2]
......@@ -78,7 +76,6 @@ class TestCase1(TestSliceOp):
class TestCase2(TestSliceOp):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [-3, 0, 2]
......@@ -90,7 +87,6 @@ class TestCase2(TestSliceOp):
# 1.2 with attr(decrease)
class TestSliceOp_decs_dim(OpTest):
def setUp(self):
self.op_type = "slice"
self.set_mlu()
......@@ -118,9 +114,9 @@ class TestSliceOp_decs_dim(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
def set_mlu(self):
self.__class__.use_mlu = True
......@@ -128,7 +124,6 @@ class TestSliceOp_decs_dim(OpTest):
class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [1, 0, 2]
......@@ -140,7 +135,6 @@ class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [-1, 0, 2]
......@@ -152,7 +146,6 @@ class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
def config(self):
self.input = np.random.random([3, 4, 5, 7]).astype("float32")
self.starts = [0, 1, 2, 3]
......@@ -164,7 +157,6 @@ class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [-1]
......@@ -176,7 +168,6 @@ class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [0, 1, 2, 3]
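# Hedged sketch of what attr(decrease) adds in the "1.2" variants above: every
# axis named in decrease_axis is sliced down to length 1 and then squeezed out
# of the output shape (values are illustrative):
import numpy as np

inp = np.random.random([3, 4, 5, 6]).astype('float32')
# axes=[0, 1, 2], starts=[1, 0, 2], ends=[2, 3, 4], decrease_axis=[0]
out = inp[1, 0:3, 2:4, :]  # shape (3, 2, 6): the size-1 dim 0 is dropped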
......@@ -190,7 +181,6 @@ class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim):
# Situation 2: starts(list, have tensor), ends(list, no tensor)
# without attr(decrease)
class TestSliceOp_starts_ListTensor(OpTest):
def setUp(self):
self.op_type = "slice"
self.set_mlu()
......@@ -198,8 +188,9 @@ class TestSliceOp_starts_ListTensor(OpTest):
starts_tensor = []
for index, ele in enumerate(self.starts):
starts_tensor.append(("x" + str(index), np.ones(
(1)).astype('int64') * ele))
starts_tensor.append(
("x" + str(index), np.ones((1)).astype('int64') * ele)
)
self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor}
self.outputs = {'Out': self.out}
......@@ -207,7 +198,7 @@ class TestSliceOp_starts_ListTensor(OpTest):
'axes': self.axes,
'starts': self.starts_infer,
'ends': self.ends,
'infer_flags': self.infer_flags
'infer_flags': self.infer_flags,
}
def config(self):
......@@ -224,9 +215,9 @@ class TestSliceOp_starts_ListTensor(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
def set_mlu(self):
self.__class__.use_mlu = True
......@@ -236,7 +227,6 @@ class TestSliceOp_starts_ListTensor(OpTest):
# Situation 2: starts(list, have tensor), ends(list, no tensor)
# with attr(decrease)
class TestSliceOp_decs_dim_starts_ListTensor(OpTest):
def setUp(self):
self.op_type = "slice"
self.set_mlu()
......@@ -244,8 +234,9 @@ class TestSliceOp_decs_dim_starts_ListTensor(OpTest):
starts_tensor = []
for index, ele in enumerate(self.starts):
starts_tensor.append(("x" + str(index), np.ones(
(1)).astype('int32') * ele))
starts_tensor.append(
("x" + str(index), np.ones((1)).astype('int32') * ele)
)
self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor}
......@@ -273,9 +264,9 @@ class TestSliceOp_decs_dim_starts_ListTensor(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
def set_mlu(self):
self.__class__.use_mlu = True
......@@ -283,8 +274,8 @@ class TestSliceOp_decs_dim_starts_ListTensor(OpTest):
class TestSliceOp_decs_dim_5_starts_ListTensor(
TestSliceOp_decs_dim_starts_ListTensor):
TestSliceOp_decs_dim_starts_ListTensor
):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [-1]
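# Situations 1-2 above pass `starts` as an attr list or a list of 1-element
# tensors; Situations 3-6 below switch to a single tensor. A sketch of the
# three input forms these tests feed to the op (names are illustrative):
import numpy as np

as_attr = {'starts': [1, 0, 2]}                       # compile-time list
as_tensor_list = {
    'StartsTensorList': [("x0", np.ones((1), dtype='int64') * 1)]
}                                                     # one tensor per element
as_one_tensor = {'StartsTensor': np.array([1, 0, 2], dtype='int32')}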
......@@ -300,7 +291,6 @@ class TestSliceOp_decs_dim_5_starts_ListTensor(
# Situation 3: starts(tensor), ends(list, no tensor)
# with attr(decrease)
class TestSliceOp_decs_dim_starts_OneTensor(OpTest):
def setUp(self):
self.op_type = "slice"
self.__class__.use_mlu = True
......@@ -308,7 +298,7 @@ class TestSliceOp_decs_dim_starts_OneTensor(OpTest):
self.config()
self.inputs = {
'Input': self.input,
"StartsTensor": np.array(self.starts, dtype="int32")
"StartsTensor": np.array(self.starts, dtype="int32"),
}
self.outputs = {'Out': self.out}
self.attrs = {
......@@ -332,15 +322,14 @@ class TestSliceOp_decs_dim_starts_OneTensor(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
# Situation 4: starts(tensor), ends(tensor)
# without attr(decrease)
class TestSliceOp_starts_OneTensor_ends_OneTensor(OpTest):
def setUp(self):
self.op_type = "slice"
self.__class__.use_mlu = True
......@@ -350,14 +339,14 @@ class TestSliceOp_starts_OneTensor_ends_OneTensor(OpTest):
self.inputs = {
'Input': self.input,
"StartsTensor": np.array(self.starts, dtype="int64"),
"EndsTensor": np.array(self.ends, dtype="int32")
"EndsTensor": np.array(self.ends, dtype="int32"),
}
self.outputs = {'Out': self.out}
self.attrs = {
'axes': self.axes,
#'starts': self.starts,
#'ends': self.ends_infer,
'infer_flags': self.infer_flags
'infer_flags': self.infer_flags,
}
def config(self):
......@@ -372,15 +361,14 @@ class TestSliceOp_starts_OneTensor_ends_OneTensor(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
# Situation 5: starts(tensor), ends(tensor)
# with attr(decrease)
class TestSliceOp_decs_dim_starts_and_ends_OneTensor(OpTest):
def setUp(self):
self.op_type = "slice"
self.__class__.use_mlu = True
......@@ -389,7 +377,7 @@ class TestSliceOp_decs_dim_starts_and_ends_OneTensor(OpTest):
self.inputs = {
'Input': self.input,
"StartsTensor": np.array(self.starts, dtype="int32"),
"EndsTensor": np.array(self.ends, dtype="int32")
"EndsTensor": np.array(self.ends, dtype="int32"),
}
self.outputs = {'Out': self.out}
self.attrs = {
......@@ -413,15 +401,14 @@ class TestSliceOp_decs_dim_starts_and_ends_OneTensor(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
# Situation 6: starts(tensor), ends(list, have tensor)
# without attr(decrease)
class TestSliceOp_starts_OneTensor_ends_ListTensor(OpTest):
def setUp(self):
self.op_type = "slice"
self.__class__.use_mlu = True
......@@ -430,20 +417,21 @@ class TestSliceOp_starts_OneTensor_ends_ListTensor(OpTest):
ends_tensor = []
for index, ele in enumerate(self.ends):
ends_tensor.append(("y" + str(index), np.ones(
(1)).astype('int32') * ele))
ends_tensor.append(
("y" + str(index), np.ones((1)).astype('int32') * ele)
)
self.inputs = {
'Input': self.input,
"StartsTensor": np.array(self.starts, dtype="int32"),
'EndsTensorList': ends_tensor
'EndsTensorList': ends_tensor,
}
self.outputs = {'Out': self.out}
self.attrs = {
'axes': self.axes,
#'starts': self.starts,
'ends': self.ends_infer,
'infer_flags': self.infer_flags
'infer_flags': self.infer_flags,
}
def config(self):
......@@ -460,14 +448,13 @@ class TestSliceOp_starts_OneTensor_ends_ListTensor(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
# Test float16
class TestFP16(OpTest):
def setUp(self):
self.op_type = "slice"
self.__class__.use_mlu = True
......@@ -479,7 +466,7 @@ class TestFP16(OpTest):
'axes': self.axes,
'starts': self.starts,
'ends': self.ends,
'infer_flags': self.infer_flags
'infer_flags': self.infer_flags,
}
def config(self):
......@@ -495,13 +482,12 @@ class TestFP16(OpTest):
self.check_output_with_place(self.place, atol=1e-5)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
class TestFP16_2(OpTest):
def setUp(self):
self.op_type = "slice"
self.__class__.use_mlu = True
......@@ -513,7 +499,7 @@ class TestFP16_2(OpTest):
'axes': self.axes,
'starts': self.starts,
'ends': self.ends,
'infer_flags': self.infer_flags
'infer_flags': self.infer_flags,
}
def config(self):
......@@ -529,24 +515,28 @@ class TestFP16_2(OpTest):
self.check_output_with_place(self.place, atol=1e-5)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006,
numeric_grad_delta=0.5)
self.check_grad_with_place(
self.place,
['Input'],
'Out',
max_relative_error=0.006,
numeric_grad_delta=0.5,
)
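# (A note on the looser settings above: with fp16 inputs the finite-difference
# gradient is easily dominated by rounding error, so the larger
# numeric_grad_delta likely widens the step to keep the numeric reference
# usable while max_relative_error stays at 0.006.)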
class TestSliceApiWithTensor(unittest.TestCase):
def test_starts_ends_is_tensor(self):
with paddle.fluid.dygraph.guard():
a = paddle.rand(shape=[4, 5, 6], dtype='float32')
axes = [0, 1, 2]
starts = [-3, 0, 2]
ends = [3, 2, 4]
a_1 = paddle.slice(a,
axes=axes,
starts=paddle.to_tensor(starts, dtype='int32'),
ends=paddle.to_tensor(ends, dtype='int32'))
a_1 = paddle.slice(
a,
axes=axes,
starts=paddle.to_tensor(starts, dtype='int32'),
ends=paddle.to_tensor(ends, dtype='int32'),
)
a_2 = paddle.slice(a, axes=axes, starts=starts, ends=ends)
np.testing.assert_allclose(a_1.numpy(), a_2.numpy())
......@@ -569,24 +559,22 @@ class TestSliceApiWithTensor(unittest.TestCase):
class TestImperativeVarBaseGetItem(unittest.TestCase):
def test_getitem_with_long(self):
with fluid.dygraph.guard():
data = np.random.random((2, 80, 16128)).astype('float32')
var = fluid.dygraph.to_variable(data)
sliced = var[:, 10:, :var.shape[1]] # var.shape[1] is 80L here
sliced = var[:, 10:, : var.shape[1]] # var.shape[1] is 80L here
self.assertEqual(sliced.shape, [2, 70, 80])
sliced = var[:, var.shape[0]:, var.shape[0]:var.shape[1]]
sliced = var[:, var.shape[0] :, var.shape[0] : var.shape[1]]
self.assertEqual(sliced.shape, [2, 78, 78])
def test_getitem_with_float(self):
def test_float_in_slice_item():
with fluid.dygraph.guard():
data = np.random.random((2, 80, 16128)).astype('float32')
var = fluid.dygraph.to_variable(data)
sliced = var[:, 1.1:, :var.shape[1]]
sliced = var[:, 1.1:, : var.shape[1]]
self.assertRaises(Exception, test_float_in_slice_item)
......@@ -600,15 +588,6 @@ class TestImperativeVarBaseGetItem(unittest.TestCase):
class TestInferShape(unittest.TestCase):
def test(self):
x = paddle.ones(shape=[3, 4, 5])
x.desc.set_shape([3, -1, 5])
self.assertEqual(x.shape, (3, -1, 5))
out0 = paddle.slice(x, axes=[1], starts=[0], ends=[3])
self.assertEqual(out0.shape, (3, 3, 5))
def test_axis_less_than_zero(self):
# Using paddle.disable_static will make other unittests fail.
......@@ -616,13 +595,18 @@ class TestInferShape(unittest.TestCase):
x_arr = np.arange(0, 24, dtype=np.float32).reshape([2, 3, 4])
x = paddle.to_tensor(x_arr)
pp_slice = paddle.slice(x, [
100,
], [0], [1])
pp_slice = paddle.slice(
x,
[
100,
],
[0],
[1],
)
np_slice = x_arr[:, :, 0:1]
np.testing.assert_allclose(pp_slice, np_slice)
pp_slice = paddle.slice(x, (-100, ), [0], [1])
pp_slice = paddle.slice(x, (-100,), [0], [1])
np_slice = x_arr[0:1]
np.testing.assert_allclose(pp_slice, np_slice)
......@@ -630,9 +614,11 @@ class TestInferShape(unittest.TestCase):
x = paddle.to_tensor(np.reshape(x_arr, (0, 0, 0)))
starts = paddle.to_tensor(
np.reshape(np.array([], dtype=np.int32), (0, )))
np.reshape(np.array([], dtype=np.int32), (0,))
)
ends = paddle.to_tensor(
np.reshape(np.array([], dtype=np.int32), (0, )))
np.reshape(np.array([], dtype=np.int32), (0,))
)
with self.assertRaises(ValueError):
paddle.slice(x, [-1000000], starts, ends)
......
......@@ -30,7 +30,6 @@ SEED = 2021
class TestSoftmaxWithCrossEntropyOp(OpTest):
def set_mlu(self):
self.__class__.use_mlu = True
......@@ -53,8 +52,10 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
self.initParams()
logits = getattr(
self, "logits",
np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype))
self,
"logits",
np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype),
)
softmax = np.apply_along_axis(stable_softmax, self.axis, logits)
if self.soft_label:
......@@ -65,8 +66,9 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
self.shape[self.axis] = 1
labels = np.random.randint(0, axis_dim, self.shape, dtype="int64")
loss = cross_entropy(softmax, labels, self.soft_label, self.axis,
self.ignore_index)
loss = cross_entropy(
softmax, labels, self.soft_label, self.axis, self.ignore_index
)
one_hot_label = np.eye(axis_dim)[labels.reshape(-1)]
......@@ -74,7 +76,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
self.outputs = {
"Backprop": (softmax - one_hot_label).astype(self.dtype),
"Softmax": softmax.astype(self.dtype),
"Loss": loss.astype(self.dtype)
"Loss": loss.astype(self.dtype),
}
self.attrs = {
"numeric_stable_mode": self.numeric_stable_mode,
......@@ -92,14 +94,16 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
if self.dtype == np.float16:
return
        # fp32 has low precision here; both CPU and MLU need a relaxed max_relative_error when using fp32
self.check_grad_with_place(self.place, ['Logits'],
'Loss',
numeric_grad_delta=0.001,
max_relative_error=0.5)
self.check_grad_with_place(
self.place,
['Logits'],
'Loss',
numeric_grad_delta=0.001,
max_relative_error=0.5,
)
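# Hedged sketch of the reference forward pass compared against above
# (stable_softmax and cross_entropy are presumably imported from the shared
# test utilities, which this diff collapses; this mirrors the standard
# max-shifted softmax):
import numpy as np

def stable_softmax_sketch(x):
    shifted = x - x.max()  # subtract the max so exp() cannot overflow
    e = np.exp(shifted)
    return e / e.sum()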
class TestPowNet(unittest.TestCase):
def _test(self, run_mlu=True):
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
......@@ -114,9 +118,9 @@ class TestPowNet(unittest.TestCase):
with paddle.static.program_guard(main_prog, startup_prog):
a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
label = paddle.static.data(name="label",
shape=[32, 1],
dtype='int64')
label = paddle.static.data(
name="label", shape=[32, 1], dtype='int64'
)
sum = paddle.add(a, b)
z = paddle.pow(sum, 2.0)
......@@ -140,16 +144,17 @@ class TestPowNet(unittest.TestCase):
print("Start run on {}".format(place))
for epoch in range(100):
pred_res, loss_res = exe.run(main_prog,
feed={
"a": a_np,
"b": b_np,
"label": label_np
},
fetch_list=[prediction, loss])
pred_res, loss_res = exe.run(
main_prog,
feed={"a": a_np, "b": b_np, "label": label_np},
fetch_list=[prediction, loss],
)
if epoch % 10 == 0:
print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
epoch, pred_res[0], loss_res))
print(
"Epoch {} | Prediction[0]: {}, Loss: {}".format(
epoch, pred_res[0], loss_res
)
)
return pred_res, loss_res
......@@ -157,7 +162,7 @@ class TestPowNet(unittest.TestCase):
cpu_pred, cpu_loss = self._test(False)
mlu_pred, mlu_loss = self._test(True)
np.testing.assert_allclose(mlu_pred, cpu_pred, rtol=1e-5)
np.testing.assert_allclose(mlu_pred, cpu_pred, rtol=2e-5)
np.testing.assert_allclose(mlu_loss, cpu_loss)
......
......@@ -44,17 +44,19 @@ SEED = 10
class TestSyncBatchNormRunnerBase(object):
def get_model(self,
main,
startup,
place,
layout,
seed,
sync_bn=False,
only_forward=False):
def get_model(
self,
main,
startup,
place,
layout,
seed,
sync_bn=False,
only_forward=False,
):
raise NotImplementedError(
"get model should be implemented by child class.")
"get model should be implemented by child class."
)
def wait_server_ready(self, endpoints):
assert not isinstance(endpoints, string_types)
......@@ -63,13 +65,15 @@ class TestSyncBatchNormRunnerBase(object):
not_ready_endpoints = []
for ep in endpoints:
ip_port = ep.split(":")
with closing(socket.socket(socket.AF_INET,
socket.SOCK_STREAM)) as sock:
with closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as sock:
sock.settimeout(2)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
if hasattr(socket, 'SO_REUSEPORT'):
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT,
1)
sock.setsockopt(
socket.SOL_SOCKET, socket.SO_REUSEPORT, 1
)
result = sock.connect_ex((ip_port[0], int(ip_port[1])))
if result != 0:
......@@ -77,39 +81,47 @@ class TestSyncBatchNormRunnerBase(object):
not_ready_endpoints.append(ep)
if not all_ok:
sys.stderr.write("server not ready, wait 3 sec to retry...\n")
sys.stderr.write("not ready endpoints:" +
str(not_ready_endpoints) + "\n")
sys.stderr.write(
"not ready endpoints:" + str(not_ready_endpoints) + "\n"
)
sys.stderr.flush()
time.sleep(3)
else:
break
def initCommunicator(self, program, rank, nranks, wait_port,
current_endpoint, endpoints):
def initCommunicator(
self, program, rank, nranks, wait_port, current_endpoint, endpoints
):
other_endpoints = endpoints[:]
other_endpoints.remove(current_endpoint)
if rank == 0 and wait_port:
self.wait_server_ready(other_endpoints)
block = program.global_block()
cncl_id_var = block.create_var(name=nameGen.generate('cncl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW)
block.append_op(type='c_gen_cncl_id',
inputs={},
outputs={'Out': cncl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints
})
block.append_op(type='c_comm_init',
inputs={'X': cncl_id_var},
outputs={},
attrs={
'nranks': nranks,
'rank': rank,
'ring_id': self.global_ring_id
})
cncl_id_var = block.create_var(
name=nameGen.generate('cncl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW,
)
block.append_op(
type='c_gen_cncl_id',
inputs={},
outputs={'Out': cncl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
},
)
block.append_op(
type='c_comm_init',
inputs={'X': cncl_id_var},
outputs={},
attrs={
'nranks': nranks,
'rank': rank,
'ring_id': self.global_ring_id,
},
)
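        # (The two appended ops form the CNCL bootstrap: c_gen_cncl_id first
        # exchanges a unique communicator id between this endpoint and
        # other_endpoints, then c_comm_init builds the communicator for
        # global_ring_id with this rank's position among nranks.)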
def run_trainer(self, args):
device_id = int(os.getenv("FLAGS_selected_mlus", "0"))
......@@ -127,8 +139,8 @@ class TestSyncBatchNormRunnerBase(object):
self._compare(args, place, layout, True)
# Test FP16 - @TODO
self.dtype = np.float16
self.atol = 1e-2
self.bn_dtype = np.float16
self.atol = 3e-3
# Test training
for place in places:
......@@ -142,24 +154,30 @@ class TestSyncBatchNormRunnerBase(object):
sys.stdout.buffer.write(
pickle.dumps(
'training, inference, fp32, fp16, NCHW, NHWC all passed'))
'training, inference, fp32, fp16, NCHW, NHWC all passed'
)
)
def _compare(self, args, place, layout, only_forward):
scope = core.Scope()
np.random.seed(SEED)
data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2
data = np.random.random(size=self.dshape).astype(self.dtype) * 4.0 - 2
sys.stderr.write("data: " + str(data) + "\n")
data = create_or_get_tensor(scope, "input",
OpTest.np_dtype_to_fluid_dtype(data), place)
data = create_or_get_tensor(
scope, "input", OpTest.np_dtype_to_fluid_dtype(data), place
)
bn_fetches = self._cal_single_card(args, data, place, layout,
only_forward)
bn_fetches = self._cal_single_card(
args, data, place, layout, only_forward
)
fetch_names, sync_bn_fetches = self._cal_multiple_cards(
args, data, place, layout, only_forward)
args, data, place, layout, only_forward
)
sys.stderr.write("len(sync_bn_fetches): " + str(len(sync_bn_fetches)) +
"\n")
sys.stderr.write(
"len(sync_bn_fetches): " + str(len(sync_bn_fetches)) + "\n"
)
for i in six.moves.xrange(0, len(sync_bn_fetches)):
sys.stderr.write("i: " + str(i) + "\n")
sys.stderr.write("fetch_names[i]): " + fetch_names[i] + "\n")
......@@ -167,13 +185,14 @@ class TestSyncBatchNormRunnerBase(object):
bn_val = bn_fetches[i]
sync_bn_val = sync_bn_fetches[i]
if sync_bn_val.shape != bn_val.shape:
sync_bn_val = sync_bn_val[:bn_val.shape[0]]
sync_bn_val = sync_bn_val[: bn_val.shape[0]]
# i = 0
if fetch_names[i] == 'reduce_sum_0.tmp_0':
# sys.stderr.write("skip reduce_sum_0.tmp_0 (Out of reduce_sum op)" + "\n")
sys.stderr.write("reduce_sum_0.tmp_0 (Out of reduce_sum op)" +
"\n")
sys.stderr.write(
"reduce_sum_0.tmp_0 (Out of reduce_sum op)" + "\n"
)
sys.stderr.write("bn_val: " + str(bn_val) + "\n")
sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n")
......@@ -201,7 +220,8 @@ class TestSyncBatchNormRunnerBase(object):
if fetch_names[i] == 'batch_norm_0.tmp_2':
# sys.stderr.write("skip batch_norm_0.tmp_2 (ReserveSpace of batch_norm)" + "\n")
sys.stderr.write(
"batch_norm_0.tmp_2 (ReserveSpace of batch_norm)" + "\n")
"batch_norm_0.tmp_2 (ReserveSpace of batch_norm)" + "\n"
)
sys.stderr.write("bn_val: " + str(bn_val) + "\n")
sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n")
......@@ -234,8 +254,9 @@ class TestSyncBatchNormRunnerBase(object):
# i = 8
if fetch_names[i] == 'batch_norm_0.tmp_1':
sys.stderr.write("skip batch_norm_0.tmp_1 (SavedVariance)" +
"\n")
sys.stderr.write(
"skip batch_norm_0.tmp_1 (SavedVariance)" + "\n"
)
sys.stderr.write("bn_val: " + str(bn_val) + "\n")
sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n")
......@@ -281,10 +302,16 @@ class TestSyncBatchNormRunnerBase(object):
if fetch_names[i] == 'conv2d_0.tmp_0@GRAD':
atol = 1e-2
assert np.allclose(
bn_val, sync_bn_val, atol=atol), "Output (" + fetch_names[
i] + ") has diff. \n" + "\nBN " + str(
bn_val) + "\n" + "Sync BN " + str(sync_bn_val)
assert np.allclose(bn_val, sync_bn_val, atol=atol), (
"Output ("
+ fetch_names[i]
+ ") has diff. \n"
+ "\nBN "
+ str(bn_val)
+ "\n"
+ "Sync BN "
+ str(sync_bn_val)
)
def _cal_single_card(self, args, data, place, layout, only_forward):
# Single-MLU, N = 32 per MLU
......@@ -294,23 +321,31 @@ class TestSyncBatchNormRunnerBase(object):
startup_prog.global_seed(SEED)
paddle.seed(SEED)
outs = self.get_model(train_prog, startup_prog, place, layout, SEED,
False, only_forward)
outs = self.get_model(
train_prog, startup_prog, place, layout, SEED, False, only_forward
)
exe = fluid.Executor(place)
exe.run(startup_prog)
fetch_names = [v.name for v in outs] + [
'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
'bn_moving_mean',
'bn_moving_variance',
'bn_scale',
'bn_bias',
]
if not only_forward:
others = [
'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
'bn_bias@GRAD', 'batch_norm_0.tmp_3@GRAD', 'conv2d_0.tmp_0@GRAD'
'batch_norm_0.tmp_0',
'batch_norm_0.tmp_1',
'bn_scale@GRAD',
'bn_bias@GRAD',
'batch_norm_0.tmp_3@GRAD',
'conv2d_0.tmp_0@GRAD',
]
fetch_names += others
bn_fetches = exe.run(program=train_prog,
feed={'input': data},
fetch_list=fetch_names)
bn_fetches = exe.run(
program=train_prog, feed={'input': data}, fetch_list=fetch_names
)
return bn_fetches
......@@ -331,8 +366,9 @@ class TestSyncBatchNormRunnerBase(object):
current_endpoint = args["currentendpoint"]
nranks = 2
self.initCommunicator(startup_prog, rank, nranks, True,
current_endpoint, endpoints)
self.initCommunicator(
startup_prog, rank, nranks, True, current_endpoint, endpoints
)
# sys.stderr.write("after init, startup_prog: " +
# startup_prog.to_string(True) + "\n")
train_prog.global_seed(SEED)
......@@ -342,8 +378,9 @@ class TestSyncBatchNormRunnerBase(object):
paddle.seed(SEED)
self.rank = rank
outs = self.get_model(train_prog, startup_prog, place, layout, SEED,
True, only_forward)
outs = self.get_model(
train_prog, startup_prog, place, layout, SEED, True, only_forward
)
# sys.stderr.write("after get_model, train_prog: " +
# train_prog.to_string(True) + "\n")
# sys.stderr.write("after get_model, startup_prog: " +
......@@ -366,17 +403,24 @@ class TestSyncBatchNormRunnerBase(object):
exe = fluid.Executor(place)
exe.run(startup_prog)
fetch_names = [v.name for v in outs] + [
'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
'bn_moving_mean',
'bn_moving_variance',
'bn_scale',
'bn_bias',
]
if not only_forward:
others = [
'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
'bn_bias@GRAD', 'batch_norm_0.tmp_3@GRAD', 'conv2d_0.tmp_0@GRAD'
'batch_norm_0.tmp_0',
'batch_norm_0.tmp_1',
'bn_scale@GRAD',
'bn_bias@GRAD',
'batch_norm_0.tmp_3@GRAD',
'conv2d_0.tmp_0@GRAD',
]
fetch_names += others
sync_bn_fetches = exe.run(program=train_prog,
feed={'input': data},
fetch_list=fetch_names)
sync_bn_fetches = exe.run(
program=train_prog, feed={'input': data}, fetch_list=fetch_names
)
return fetch_names, sync_bn_fetches
......@@ -399,19 +443,20 @@ from contextlib import closing
class TestDistBase(unittest.TestCase):
def setUp(self):
self._port_set = set()
self._trainers = 2
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._find_free_port(),
self._find_free_port(),
)
self._python_interp = sys.executable
def _find_free_port(self):
def __free_port():
with closing(socket.socket(socket.AF_INET,
socket.SOCK_STREAM)) as s:
with closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as s:
s.bind(('', 0))
return s.getsockname()[1]
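                # Binding to port 0 asks the OS for an arbitrary free
                # ephemeral port; getsockname() reports which one was granted.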
......@@ -440,7 +485,7 @@ class TestDistBase(unittest.TestCase):
"PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
"PADDLE_CURRENT_ENDPOINT": w1_ep,
}
#update environment
# update environment
env0.update(envs)
env1.update(envs)
......@@ -451,15 +496,19 @@ class TestDistBase(unittest.TestCase):
tr1_pipe = open("/tmp/tr1_err_%d.log" % os.getpid(), "w")
print("tr0_cmd: {}, env: {}\n".format(tr0_cmd, env0))
print("tr1_cmd: {}, env: {}\n".format(tr1_cmd, env1))
tr0_proc = subprocess.Popen(tr0_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr0_pipe,
env=env0)
tr1_proc = subprocess.Popen(tr0_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr1_pipe,
env=env1)
tr0_proc = subprocess.Popen(
tr0_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr0_pipe,
env=env0,
)
tr1_proc = subprocess.Popen(
            tr1_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr1_pipe,
env=env1,
)
tr0_out, tr0_err = tr0_proc.communicate()
tr1_out, tr1_err = tr1_proc.communicate()
......@@ -473,14 +522,16 @@ class TestDistBase(unittest.TestCase):
sys.stderr.write('trainer 0 stderr file: %s\n' % f.read())
with open("/tmp/tr1_err_%d.log" % os.getpid(), "r") as f:
sys.stderr.write('trainer 1 stderr file: %s\n' % f.read())
return pickle.loads(tr0_out), pickle.loads(
tr1_out), tr0_proc.pid, tr1_proc.pid
def check_with_place(self,
model_file,
col_type,
check_error_log=False,
need_envs={}):
return (
pickle.loads(tr0_out),
pickle.loads(tr1_out),
tr0_proc.pid,
tr1_proc.pid,
)
def check_with_place(
self, model_file, col_type, check_error_log=False, need_envs={}
):
required_envs = {
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_eager_delete_tensor_gb": "0.0",
......@@ -491,7 +542,7 @@ class TestDistBase(unittest.TestCase):
"FLAGS_call_stack_level": "2",
"GLOG_v": "3",
"PADDLE_WITH_GLOO": '0',
"BACKEND": "cncl"
"BACKEND": "cncl",
}
required_envs.update(need_envs)
if check_error_log:
......@@ -499,8 +550,11 @@ class TestDistBase(unittest.TestCase):
required_envs["GLOG_logtostderr"] = "1"
required_envs["GLOO_LOG_LEVEL"] = "TRACE"
tr0_out, tr1_out, pid0, pid1 = self._run_cluster(
model_file, required_envs)
model_file, required_envs
)
self.assertEqual(
tr0_out, 'training, inference, fp32, fp16, NCHW, NHWC all passed')
tr0_out, 'training, inference, fp32, fp16, NCHW, NHWC all passed'
)
self.assertEqual(
tr1_out, 'training, inference, fp32, fp16, NCHW, NHWC all passed')
tr1_out, 'training, inference, fp32, fp16, NCHW, NHWC all passed'
)
......@@ -29,14 +29,17 @@ paddle.enable_static()
class TestSyncBatchNormOp(TestDistBase):
def _setup_config(self):
pass
def test_identity(self, col_type="identity"):
self.check_with_place("sync_batch_norm_op_mlu.py",
col_type,
check_error_log=True)
envs = {"CNCL_MEM_POOL_MULTI_CLIQUE_ENABLE": "1"}
self.check_with_place(
"sync_batch_norm_op_mlu.py",
col_type,
check_error_log=True,
need_envs=envs,
)
if __name__ == '__main__':
......
This diff is collapsed.