Unverified commit e75c01f9, authored by Wang Xin, committed by GitHub

clean up WITH_MLU (#52546)

Parent 075d6b14
@@ -53,7 +53,6 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF)
option(WITH_XPU_XFT "Compile PaddlePaddle with BAIDU XPU-XFT" OFF)
option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
@@ -81,9 +80,6 @@ endif()
if(WITH_GPU AND WITH_ROCM)
message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time")
endif()
if(WITH_GPU AND WITH_MLU)
message(FATAL_ERROR "Error when compile GPU and MLU at the same time")
endif()
if(WITH_GPU AND NOT APPLE)
enable_language(CUDA)
@@ -430,14 +426,6 @@ if(NOT WITH_XPU AND WITH_XPU_BKCL)
CACHE STRING "Disable BKCL when compiling without XPU" FORCE)
endif()
if(NOT WITH_MLU AND WITH_CNCL)
message(
WARNING "Disable CNCL when compiling without MLU. Force WITH_MLU=OFF.")
set(WITH_MLU
OFF
CACHE STRING "Disable CNCL when compiling without MLU" FORCE)
endif()
if(WITH_NCCL)
add_definitions("-DPADDLE_WITH_NCCL")
include(nccl)
@@ -469,10 +457,6 @@ if(WITH_GPU)
endif()
endif()
if(WITH_MLU)
include(neuware)
endif()
if(WITH_ROCM)
include(hip)
include(miopen) # set miopen libraries, must before configure
...
@@ -116,11 +116,6 @@ if(WITH_IPU)
add_definitions(-DPADDLE_WITH_IPU)
endif()
if(WITH_MLU)
message(STATUS "Compile with MLU!")
add_definitions(-DPADDLE_WITH_MLU)
endif()
if(WITH_GPU)
add_definitions(-DPADDLE_WITH_CUDA)
add_definitions(-DEIGEN_USE_GPU)
...
if(NOT WITH_MLU)
return()
endif()
if(NOT ENV{NEUWARE_HOME})
set(NEUWARE_HOME "/usr/local/neuware")
else()
set(NEUWARE_HOME $ENV{NEUWARE_HOME})
endif()
message(STATUS "NEUWARE_HOME: " ${NEUWARE_HOME})
set(NEUWARE_INCLUDE_DIR ${NEUWARE_HOME}/include)
set(NEUWARE_LIB_DIR ${NEUWARE_HOME}/lib64)
include_directories(${NEUWARE_INCLUDE_DIR})
set(CNNL_LIB ${NEUWARE_LIB_DIR}/libcnnl.so)
set(MLUOP_LIB ${NEUWARE_LIB_DIR}/libmluops.so)
set(CNRT_LIB ${NEUWARE_LIB_DIR}/libcnrt.so)
set(CNDRV_LIB ${NEUWARE_LIB_DIR}/libcndrv.so)
set(CNPAPI_LIB ${NEUWARE_LIB_DIR}/libcnpapi.so)
generate_dummy_static_lib(LIB_NAME "neuware_lib" GENERATOR "neuware.cmake")
set(NEUWARE_LIB_DEPS ${CNNL_LIB} ${MLUOP_LIB} ${CNRT_LIB} ${CNDRV_LIB}
${CNPAPI_LIB})
if(WITH_CNCL)
message(STATUS "Compile with CNCL!")
add_definitions(-DPADDLE_WITH_CNCL)
set(CNCL_LIB ${NEUWARE_LIB_DIR}/libcncl.so)
list(APPEND NEUWARE_LIB_DEPS ${CNCL_LIB})
endif()
target_link_libraries(neuware_lib ${NEUWARE_LIB_DEPS})
@@ -74,9 +74,6 @@ function(op_library TARGET)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function layer
common_infer_shape_functions)
if(WITH_MLU)
set(op_common_deps ${op_common_deps} mlu_baseop)
endif()
# Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build.
set(options UNITY)
@@ -169,12 +166,6 @@ function(op_library TARGET)
list(APPEND xpu_kp_cc_srcs ${TARGET}.kps)
endif()
endif()
if(WITH_MLU)
string(REPLACE "_op" "_op_mlu" MLU_FILE "${TARGET}")
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MLU_FILE}.cc)
list(APPEND mlu_cc_srcs ${MLU_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$")
@@ -201,8 +192,6 @@ function(op_library TARGET)
list(APPEND xpu_kp_cc_srcs ${src})
elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$")
list(APPEND xpu_kp_cc_srcs ${src})
elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
list(APPEND mlu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
elseif((WITH_ROCM OR WITH_GPU) AND ${src} MATCHES ".*\\.kps$")
@@ -519,18 +508,6 @@ function(op_library TARGET)
endforeach()
endif()
# pybind USE_OP_DEVICE_KERNEL for MLU
if(WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0)
foreach(mlu_src ${mlu_cc_srcs})
set(op_name "")
find_register(${mlu_src} "REGISTER_OP_MLU_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, MLU);\n")
set(pybind_flag 1)
endif()
endforeach()
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if(WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
...
@@ -356,11 +356,6 @@ if(WITH_XPU)
list(APPEND third_party_deps extern_xpu)
endif()
if(WITH_MLU)
include(external/concurrentqueue) # download, build, install concurrentqueue
list(APPEND third_party_deps extern_concurrentqueue)
endif()
if(WITH_PSLIB)
include(external/pslib) # download, build, install pslib
list(APPEND third_party_deps extern_pslib)
...
@@ -99,11 +99,6 @@ struct DLDeviceVisitor
"platform::NPUPinnedPlace is not supported"));
}
inline ::DLDevice operator()(const platform::MLUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::MLUPlace is not supported"));
}
inline ::DLDevice operator()(const platform::CustomPlace &place) const {
PADDLE_THROW(platform::errors::Unimplemented(
"platform::CustomPlace is not supported"));
...
@@ -516,17 +516,6 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
PADDLE_THROW(
platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
#endif
} else if (platform::is_mlu_place(place_)) {
#ifdef PADDLE_WITH_MLU
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new MLUUnsafeFastGarbageCollector(place_, max_memory_size));
} else {
gc.reset(new MLUDefaultStreamGarbageCollector(place_, max_memory_size));
}
#else
PADDLE_THROW(
platform::errors::Unimplemented("No MLU gc found in CPU/MLU paddle"));
#endif
} else if (platform::is_custom_place(place_)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
...
@@ -125,56 +125,6 @@ void CUDAPinnedGarbageCollector::ClearCallback(
}
#endif
#ifdef PADDLE_WITH_MLU
MLUDefaultStreamGarbageCollector::MLUDefaultStreamGarbageCollector(
const platform::MLUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void MLUDefaultStreamGarbageCollector::Wait() const {
static_cast<platform::MLUDeviceContext *>(this->dev_ctx_)
->WaitStreamCallback();
}
void MLUDefaultStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
static_cast<platform::MLUDeviceContext *>(this->dev_ctx_)
->AddStreamCallback(callback);
}
MLUUnsafeFastGarbageCollector::MLUUnsafeFastGarbageCollector(
const platform::MLUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void MLUUnsafeFastGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback();
}
MLUStreamGarbageCollector::MLUStreamGarbageCollector(
const platform::MLUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {
platform::MLUDeviceGuard guard(place.device);
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueCreate(&stream_));
callback_manager_.reset(
new platform::StreamCallbackManager<mluStream>(stream_));
}
MLUStreamGarbageCollector::~MLUStreamGarbageCollector() {
auto place = this->dev_ctx_->GetPlace();
platform::MLUDeviceGuard guard(place.device);
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_));
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueDestroy(stream_));
}
mluStream MLUStreamGarbageCollector::stream() const { return stream_; }
void MLUStreamGarbageCollector::Wait() const { callback_manager_->Wait(); }
void MLUStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback_manager_->AddCallback(callback);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
CustomDefaultStreamGarbageCollector::CustomDefaultStreamGarbageCollector(
const platform::CustomPlace &place, size_t max_memory_size)
...
@@ -22,9 +22,6 @@
#include "gflags/gflags.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#endif
#include "paddle/fluid/platform/stream_callback_manager.h" #include "paddle/fluid/platform/stream_callback_manager.h"
namespace paddle { namespace paddle {
...@@ -139,46 +136,6 @@ class CUDAPinnedGarbageCollector : public GarbageCollector { ...@@ -139,46 +136,6 @@ class CUDAPinnedGarbageCollector : public GarbageCollector {
}; };
#endif #endif
#ifdef PADDLE_WITH_MLU
class MLUDefaultStreamGarbageCollector : public GarbageCollector {
public:
MLUDefaultStreamGarbageCollector(const platform::MLUPlace &place,
size_t max_memory_size);
void Wait() const override;
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
class MLUUnsafeFastGarbageCollector : public GarbageCollector {
public:
MLUUnsafeFastGarbageCollector(const platform::MLUPlace &place,
size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
class MLUStreamGarbageCollector : public GarbageCollector {
public:
MLUStreamGarbageCollector(const platform::MLUPlace &place,
size_t max_memory_size);
~MLUStreamGarbageCollector();
void Wait() const override;
mluStream stream() const;
protected:
void ClearCallback(const std::function<void()> &callback) override;
private:
mluStream stream_;
std::unique_ptr<platform::StreamCallbackManager<mluStream>> callback_manager_;
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class CustomDefaultStreamGarbageCollector : public GarbageCollector {
public:
...
@@ -376,9 +376,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
#define REGISTER_OP_NPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, NPU, ::paddle::platform::NPUPlace, __VA_ARGS__)
#define REGISTER_OP_MLU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, MLU, ::paddle::platform::MLUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
customized_name, \
customized_type_value, \
@@ -421,12 +418,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_MLU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, MLU, ::paddle::platform::MLUPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_IPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, IPU, ::paddle::platform::IPUPlace, DEFAULT_TYPE, \
...
@@ -57,10 +57,6 @@ class DenseTensor;
#include "paddle/fluid/platform/mkldnn_op_list.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#endif
@@ -770,16 +766,6 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
auto dev_id = place.device;
platform::SetXPUDeviceId(dev_id);
#endif
} else if (platform::is_mlu_place(place)) {
#ifndef PADDLE_WITH_MLU
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with MLU support.",
place));
#else
auto dev_id = place.device;
platform::SetMLUDeviceId(dev_id);
#endif
} else if (platform::is_custom_place(place)) {
#ifndef PADDLE_WITH_CUSTOM_DEVICE
@@ -2301,16 +2287,6 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
}
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() &&
platform::is_mlu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing MLU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (kernel_iter == kernels.end() &&
platform::is_custom_place(expected_kernel_key.place_)) {
...
@@ -522,19 +522,6 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use CUDA device since it's not compiled with CUDA,"
"Please recompile or reinstall Paddle with GPU support."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new MLUUnsafeFastGarbageCollector(place, max_memory_size));
} else {
gc.reset(new MLUStreamGarbageCollector(place, max_memory_size));
}
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use MLU device since it's not compiled with MLU,"
"Please recompile or reinstall Paddle with MLU support."));
#endif
} else if (platform::is_xpu_place(place)) {
#if defined(PADDLE_WITH_XPU)
...
@@ -112,15 +112,6 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key,
phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
}
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_key.backend() == phi::Backend::MLU) {
VLOG(3) << "phi missing MLU kernel: " << op.Type()
<< ", expected_kernel_key:" << kernel_key
<< ", fallback to CPU one!";
return phi::KernelKey(
phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
}
#endif
#ifdef PADDLE_WITH_IPU
if (kernel_key.backend() == phi::Backend::IPU) {
VLOG(3) << "phi missing IPU kernel: " << op.Type()
...
@@ -267,59 +267,6 @@ void TensorCopyImpl(const TENSOR& src,
"Copying from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_mlu_place = src_place;
auto dst_cpu_place = dst_place;
auto stream =
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
memory::Copy(dst_cpu_place, dst_ptr, src_mlu_place, src_ptr, size, stream);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_mlu_place(dst_place)) {
auto src_cpu_place = src_place;
auto dst_mlu_place = dst_place;
auto stream =
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
memory::Copy(dst_mlu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
}
else if (platform::is_mlu_place(src_place) && // NOLINT
platform::is_mlu_place(dst_place)) {
auto src_mlu_place = src_place;
auto dst_mlu_place = dst_place;
auto stream =
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
memory::Copy(dst_mlu_place, dst_ptr, src_mlu_place, src_ptr, size, stream);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copying from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_IPU
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_ipu_place(dst_place)) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_ipu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copying from %s to %s is not supported.", src_place, dst_place));
}
#endif
}
template <typename TENSOR>
@@ -480,29 +427,6 @@ void TensorCopySync(const phi::DenseTensor& src,
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_mlu_place(dst_place)) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_mlu_place(src_place) && // NOLINT
platform::is_mlu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_IPU
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
@@ -604,31 +528,6 @@ void TensorToStream(std::ostream& os,
#else
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
#endif
} else if (platform::is_mlu_place(tensor.place())) {
#ifdef PADDLE_WITH_MLU
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& mlu_dev_ctx =
static_cast<const platform::MLUDeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu,
buf.get(),
tensor.place(),
reinterpret_cast<const void*>(data),
size_to_write,
mlu_dev_ctx.stream());
mlu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU"));
#endif
} else if (platform::is_custom_place(tensor.place())) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
@@ -720,8 +619,7 @@ void TensorFromStream(std::istream& is,
platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
    defined(PADDLE_WITH_CUSTOM_DEVICE)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(phi::make_ddim(shape));
framework::VisitDataType(
@@ -741,12 +639,6 @@ void TensorFromStream(std::istream& is,
} else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
} else if (platform::is_mlu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU"));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
}
#endif
} else {
@@ -803,8 +695,7 @@ void TensorFromStream(std::istream& is,
platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
    defined(PADDLE_WITH_CUSTOM_DEVICE)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(phi::make_ddim(dims));
framework::VisitDataType(
@@ -824,9 +715,6 @@ void TensorFromStream(std::istream& is,
} else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
} else if (platform::is_mlu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU"));
} else if (platform::is_npu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
...
@@ -26,9 +26,6 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#endif
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
...@@ -142,11 +139,6 @@ void TensorFromArray(const T* src, ...@@ -142,11 +139,6 @@ void TensorFromArray(const T* src,
reinterpret_cast<const phi::GPUContext&>(ctx).stream()); reinterpret_cast<const phi::GPUContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(dst_place)) { // NOLINT
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT
memory::Copy(
@@ -193,11 +185,6 @@ void TensorFromVector(const std::vector<T>& src,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(dst_place)) { // NOLINT
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT
memory::Copy(
@@ -332,17 +319,6 @@ void TensorToVector(const phi::DenseTensor& src,
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy(
dst_place,
dst_ptr,
src.place(),
src_ptr,
size,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
@@ -385,11 +361,6 @@ inline void TensorToVector(const phi::DenseTensor& src,
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
...
@@ -177,10 +177,6 @@ if(WITH_GLOO)
endif()
endif()
if(WITH_MLU)
set(MLU_DEPS mlu_baseop)
endif()
if(NOT WITH_ASCEND_CL)
cc_library(
gradient_accumulator
...
@@ -159,15 +159,6 @@ AmpOperators::AmpOperators()
OpSupportedInfos("XPU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_xpu_bf16.begin(),
unsupported_ops_xpu_bf16.end());
#elif defined(PADDLE_WITH_MLU)
auto unsupported_ops_mlu_fp16 = std::get<2>(
OpSupportedInfos("MLU", paddle::framework::proto::VarType::FP16));
unsupported_fp16_ops_->insert(unsupported_ops_mlu_fp16.begin(),
unsupported_ops_mlu_fp16.end());
auto unsupported_ops_mlu_bf16 = std::get<2>(
OpSupportedInfos("MLU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_mlu_bf16.begin(),
unsupported_ops_mlu_bf16.end());
#endif
VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " "
<< unsupported_fp16_ops_->size() << " "
...
@@ -34,9 +34,6 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "xpu/refactor/math.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/backends/device_manager.h"
#endif
@@ -288,41 +285,6 @@ void TensorAdd(const VarType& src, VarType* dst) {
}
#endif
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(place)) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::DeviceContext* ctx = pool.Get(place);
auto dev_ctx = dynamic_cast<platform::MLUDeviceContext*>(ctx);
if (data_type == framework::DataTypeTrait<float>::DataType()) {
dst_tensor->mutable_data<float>(place);
} else if (data_type ==
framework::DataTypeTrait<platform::float16>::DataType()) {
dst_tensor->mutable_data<platform::float16>(place);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
framework::DataTypeToString(data_type),
place));
}
static const float alpha = 1.f;
static const float beta = 1.f;
operators::MLUCnnlTensorDesc src_tensor_desc(src_tensor);
operators::MLUCnnlTensorDesc dst_tensor_desc(*dst_tensor);
PADDLE_ENFORCE_MLU_SUCCESS(
cnnlAssignAdd(dev_ctx->cnnl_handle(),
static_cast<const void*>(&alpha),
src_tensor_desc.get(),
operators::GetBasePtr(&src_tensor),
nullptr,
0,
static_cast<const void*>(&beta),
dst_tensor_desc.get(),
operators::GetBasePtr(dst_tensor)));
return;
}
#endif
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
...
@@ -150,48 +150,6 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
kernel_signature_(std::move(kernel_signature)),
phi_kernel_(phi_kernel) {}
#ifdef PADDLE_WITH_MLU
static void tokenize(const std::string& ops,
char delim,
std::unordered_set<std::string>* op_set) {
std::string::size_type beg = 0;
for (uint64_t end = 0; (end = ops.find(delim, end)) != std::string::npos;
++end) {
op_set->insert(ops.substr(beg, end - beg));
beg = end + 1;
}
op_set->insert(ops.substr(beg));
}
static bool is_in_mlu_black_list(const std::string& op_name) {
static bool inited = false;
static std::unordered_set<std::string> mlu_black_list;
static std::mutex s_mtx;
if (!inited) {
std::lock_guard<std::mutex> guard(s_mtx);
if (!inited) {
if (std::getenv("MLU_BLACK_LIST") != nullptr) {
std::string ops(std::getenv("MLU_BLACK_LIST"));
tokenize(ops, ',', &mlu_black_list);
}
inited = true;
VLOG(3) << "MLU Black List: ";
for (auto iter = mlu_black_list.begin(); iter != mlu_black_list.end();
++iter) {
VLOG(3) << *iter << " ";
}
}
}
if (mlu_black_list.find(op_name) != mlu_black_list.end()) {
return true;
}
return false;
}
#endif
template <typename VarType>
PreparedOp PrepareImpl(
const NameVarMap<VarType>& ins,
@@ -258,12 +216,6 @@ PreparedOp PrepareImpl(
op.Type(), expected_kernel_key.dtype());
#endif
#ifdef PADDLE_WITH_MLU
if (is_in_mlu_black_list(op.Type())) {
expected_kernel_key.set_backend(phi::Backend::CPU);
}
#endif
bool has_phi_kernel = false;
const auto* arg_map_fn = phi_op_utils_map.GetArgumentMappingFn(op.Type());
@@ -468,16 +420,6 @@ PreparedOp PrepareImpl(
kernel_iter = kernels.find(fluid_kernel_type);
}
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() &&
paddle::platform::is_mlu_place(fluid_kernel_type.place_)) {
VLOG(3) << "missing MLU kernel: " << op.Type()
<< ", expected_kernel_key:" << fluid_kernel_type
<< ", fallbacking to CPU one!";
fluid_kernel_type.place_ = platform::CPUPlace();
kernel_iter = kernels.find(fluid_kernel_type);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (kernel_iter == kernels.end() &&
paddle::platform::is_custom_place(fluid_kernel_type.place_)) {
...
@@ -147,15 +147,6 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use IPU device since it's not compiled with IPU,"
"Please recompile or reinstall Paddle with IPU support."));
#endif
} else if (platform::is_mlu_place(place)) {
#if defined(PADDLE_WITH_MLU)
gc.reset(new framework::MLUDefaultStreamGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use MLU device since it's not compiled with MLU,"
"Please recompile or reinstall Paddle with MLU support."));
#endif
} else if (platform::is_custom_place(place)) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
@@ -300,13 +291,6 @@ void Tracer::TraceOpImpl(const std::string& type,
} else if (platform::is_npu_place(place)) {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU if use NPUPlace."));
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
platform::SetMLUDeviceId(place.device);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU if use MLUPlace."));
#endif
} else if (platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::SetDevice(place);
...
@@ -58,10 +58,6 @@
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/memory/allocation/custom_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
@@ -194,11 +190,6 @@ class AllocatorFacadePrivate {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_MLU
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
@@ -254,11 +245,6 @@ class AllocatorFacadePrivate {
InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_MLU
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
@@ -290,11 +276,6 @@ class AllocatorFacadePrivate {
InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_MLU
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
}
#endif
break;
}
@@ -801,12 +782,6 @@ class AllocatorFacadePrivate {
}
#endif
#ifdef PADDLE_WITH_MLU
void InitNaiveBestFitMLUAllocator(platform::MLUPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
@@ -851,13 +826,6 @@ class AllocatorFacadePrivate {
system_allocators_[p] = CreateCUDAAllocator(p);
}
#endif
#ifdef PADDLE_WITH_MLU
int device_count = platform::GetMLUDeviceCount();
for (int i = 0; i < device_count; ++i) {
platform::MLUPlace p(i);
system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
@@ -894,12 +862,6 @@ class AllocatorFacadePrivate {
places.emplace_back(platform::IPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_MLU
int device_count = platform::GetMLUDeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) {
places.emplace_back(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
...
@@ -56,9 +56,6 @@ BuddyAllocator::BuddyAllocator(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
init_allocate_size_func_ = &platform::GpuInitAllocSize;
re_allocate_size_func_ = &platform::GpuReallocSize;
#elif defined(PADDLE_WITH_MLU)
init_allocate_size_func_ = &platform::MLUInitAllocSize;
re_allocate_size_func_ = &platform::MLUReallocSize;
#endif
}
#endif
@@ -253,9 +250,6 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
allocate_bytes = DeviceAllocateSize(
&platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes);
#elif defined(PADDLE_WITH_MLU)
allocate_bytes = DeviceAllocateSize(
&platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes);
#endif
#endif
...
@@ -25,9 +25,6 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_double(fraction_of_gpu_memory_to_use);
@@ -395,202 +392,6 @@ TEST(BuddyAllocator, Release) {
}
#endif
#ifdef PADDLE_WITH_MLU
TEST(BuddyAllocator, MluFraction) {
// In a 16 GB machine, the pool size will be about 160 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
// Less than pool size
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
buddy_allocator.Release();
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator,
600 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator,
1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
TEST(BuddyAllocator, InitRealloc) {
FLAGS_initial_gpu_memory_in_mb = 100;
FLAGS_reallocate_gpu_memory_in_mb = 50;
EXPECT_EQ(platform::MLUMaxChunkSize(), static_cast<size_t>(100 << 20));
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
// Less then initial size and reallocate size
TestBuddyAllocator(&buddy_allocator, 10 << 20);
// Between initial size and reallocate size and not exceed pool
TestBuddyAllocator(&buddy_allocator, 80 << 20);
TestBuddyAllocator(&buddy_allocator, 99 << 20);
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator,
101 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator,
1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
FLAGS_initial_gpu_memory_in_mb = 5;
FLAGS_reallocate_gpu_memory_in_mb = 10;
EXPECT_EQ(platform::MLUMaxChunkSize(), static_cast<size_t>(10 << 20));
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
// Less than initial size and reallocate size
TestBuddyAllocator(&buddy_allocator, 1 << 20);
// Between initial size and reallocate size and exceed pool
TestBuddyAllocator(&buddy_allocator, 6 << 20);
TestBuddyAllocator(&buddy_allocator, 8 << 20);
TestBuddyAllocator(&buddy_allocator, 9 << 20);
// Greater than max trunk size
TestBuddyAllocator(&buddy_allocator,
11 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator,
1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
TEST(BuddyAllocator, FractionRefillPool) {
FLAGS_fraction_of_gpu_memory_to_use = 0.6;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
size_t max_chunk_size = platform::MLUMaxChunkSize();
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
max_chunk_size);
// Less than pool size
int* p0 = TestBuddyAllocator(&buddy_allocator,
max_chunk_size - 1000,
/* use_system_allocator = */ false,
/* free_ptr = */ false);
// Max chunk size should be same during allocation
EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
size_t alloc =
platform::MLUAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
// Exceed pool trigger refilling size of fraction of avaiable mlu, and should
// be able to alloc 60% of the remaining MLU
int* p1 = TestBuddyAllocator(&buddy_allocator,
alloc,
/* use_system_allocator = */ false,
/* free_ptr = */ false);
// Max chunk size should be same during allocation
EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
alloc =
platform::MLUAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
// Exceed pool trigger refilling size of fraction of avaiable mlu, and should
// be able to alloc 60% of the remaining MLU
TestBuddyAllocator(&buddy_allocator,
alloc,
/* use_system_allocator = */ false);
// Max chunk size should be same during allocation
EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
buddy_allocator.Free(p0);
buddy_allocator.Free(p1);
}
TEST(BuddyAllocator, AllocFromAvailable) {
FLAGS_fraction_of_gpu_memory_to_use = 0.7;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
size_t total = 0, available = 0;
platform::SetMLUDeviceId(0);
platform::MLUMemoryUsage(&available, &total);
// Take half of available MLU
void* p;
cnrtStatus result = cnrtMalloc(&p, available >> 1);
EXPECT_TRUE(result == cnrtSuccess);
// BuddyAllocator should be able to alloc the remaining MLU
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1 << 30));
if (p) {
EXPECT_TRUE(cnrtFree(p) == cnrtSuccess);
}
}
TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) {
FLAGS_fraction_of_gpu_memory_to_use = 1.0;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
void* p = nullptr;
EXPECT_TRUE(cnrtMalloc(&p, static_cast<size_t>(1) << 30) == cnrtSuccess);
// BuddyAllocator should be able to alloc the remaining MLU
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
if (p) {
EXPECT_TRUE(cnrtFree(p) == cnrtSuccess);
}
}
TEST(BuddyAllocator, Release) {
// In a 8 GB machine, the pool size will be about 800 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.1;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
// Less than pool size
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 50 << 20);
buddy_allocator.Release();
}
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
@@ -420,140 +420,6 @@ uint64_t Release<platform::CUDAPinnedPlace>(
#endif
}
// For MLU
#ifdef PADDLE_WITH_MLU
class MLUBuddyAllocatorList {
private:
MLUBuddyAllocatorList() : devices_(platform::GetMLUSelectedDevices()) {
auto mlu_num = devices_.size();
allocators_.resize(mlu_num);
init_flags_.reserve(mlu_num);
for (size_t i = 0; i < mlu_num; ++i) {
init_flags_.emplace_back(new std::once_flag());
}
}
static MLUBuddyAllocatorList *CreateNewInstance() {
return new MLUBuddyAllocatorList();
}
public:
static MLUBuddyAllocatorList *Instance() {
static auto *instance = CreateNewInstance();
return instance;
}
BuddyAllocator *Get(int mlu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id));
PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetMLUDeviceId(devices_[pos]);
allocators_[pos].reset(
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::MLUAllocator(devices_[pos])),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "(mlu reuse gpu GFlags) "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
<< "or 'FLAGS_initial_gpu_memory_in_mb' "
<< "or 'FLAGS_reallocate_gpu_memory_in_mb' "
<< "to change the memory size for MLU usage.\n"
<< "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
<< FLAGS_fraction_of_gpu_memory_to_use
<< ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
<< FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
});
return allocators_[pos].get();
}
private:
std::vector<int> devices_;
std::vector<std::unique_ptr<std::once_flag>> init_flags_;
std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
};
BuddyAllocator *GetMLUBuddyAllocator(int mlu_id) {
return MLUBuddyAllocatorList::Instance()->Get(mlu_id);
}
#endif
template <>
size_t Used<platform::MLUPlace>(const platform::MLUPlace &place) {
#ifdef PADDLE_WITH_MLU
return GetMLUBuddyAllocator(place.device)->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'MLUPlace' is not supported in CPU only device."));
#endif
}
template <>
void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
#ifdef PADDLE_WITH_MLU
auto *buddy_allocator = GetMLUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
platform::MLUDeviceGuard(place.device);
size_t avail = 0, total = 0;
platform::MLUMemoryUsage(&avail, &total);
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize "
"%s, MLUMinChunkSize %s, MLU memory used: %s.",
string::HumanReadableSize(size),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::MLUPlace>(place))));
} else {
if (FLAGS_init_allocated_mem) {
cnrtMemset(ptr, 0xEF, size);
}
}
return ptr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'MLUPlace' is not supported in CPU only device."));
#endif
}
template <>
void Free<platform::MLUPlace>(const platform::MLUPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_MLU
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetMLUBuddyAllocator(place.device)->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'MLUPlace' is not supported in CPU only device."));
#endif
}
template <>
uint64_t Release<platform::MLUPlace>(const platform::MLUPlace &place) {
#ifdef PADDLE_WITH_MLU
return GetMLUBuddyAllocator(place.device)->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'MLUPlace' is not supported in CPU only device."));
#endif
}
// For CustomDevice
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class BuddyAllocatorList {
...
@@ -61,21 +61,6 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) {
}
#endif
#ifdef PADDLE_WITH_MLU
TEST(NaiveBestFitAllocatorTest, MluAlloc) {
NaiveBestFitAllocator alloc{platform::MLUPlace(0)};
{
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
}
sleep(10);
alloc.Release(platform::MLUPlace(0));
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
alloc.Release(platform::MLUPlace(0));
}
#endif
} // namespace allocation
} // namespace memory
} // namespace paddle
@@ -31,9 +31,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
@@ -287,78 +284,6 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
#endif
#ifdef PADDLE_WITH_MLU
void* MLUAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
void* p;
auto result = platform::RecordedMLUMalloc(&p, size, mlu_id_);
if (result == cnrtSuccess) {
*index = 0;
mlu_alloc_size_ += size;
return p;
} else {
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedMLUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, mlu_id_);
size_t allocated = total - avail;
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum MLU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on MLU %d. "
"Cannot allocate %s memory on MLU %d, %s memory has been allocated and "
"available memory is only %s.\n\n"
"Please check whether there is any other process using MLU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another MLU.\n"
"2. If no, please try one of the following suggestions:\n"
" 1) Decrease the batch size of your model.\n"
" 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
mlu_id_,
string::HumanReadableSize(size),
mlu_id_,
string::HumanReadableSize(allocated),
string::HumanReadableSize(avail),
mlu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
}
}
void MLUAllocator::Free(void* p, size_t size, size_t index) {
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(mlu_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size,
mlu_alloc_size_));
mlu_alloc_size_ -= size;
platform::RecordedMLUFree(p, size, mlu_id_);
}
bool MLUAllocator::UseGpu() const { return true; }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void* CustomAllocator::Alloc(size_t* index, size_t size) {
  if (size <= 0) return nullptr;
...
...@@ -68,21 +68,6 @@ class CUDAPinnedAllocator : public SystemAllocator {
};
#endif
#ifdef PADDLE_WITH_MLU
class MLUAllocator : public SystemAllocator {
public:
explicit MLUAllocator(int mlu_id) : mlu_id_(mlu_id) {}
virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t mlu_alloc_size_ = 0;
int mlu_id_;
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class CustomAllocator : public SystemAllocator {
 public:
...
...@@ -82,23 +82,3 @@ TEST(GPUAllocator, AllocFailure) {
}
}
#endif
#ifdef PADDLE_WITH_MLU
TEST(MLUAllocator, Alloc) {
paddle::memory::detail::MLUAllocator a(0);
TestAllocator(&a, 2048);
TestAllocator(&a, 0);
}
TEST(MLUAllocator, AllocFailure) {
paddle::memory::detail::MLUAllocator allocator(0);
size_t index;
size_t alloc_size = (static_cast<size_t>(1) << 40); // Very large number
try {
allocator.Alloc(&index, alloc_size);
ASSERT_TRUE(false);
} catch (paddle::memory::allocation::BadAlloc&) {
PADDLE_ENFORCE_MLU_SUCCESS(cnrtGetLastError());
}
}
#endif
...@@ -23,10 +23,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
namespace paddle {
namespace memory {
...@@ -736,226 +732,6 @@ void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place,
}
#endif
#ifdef PADDLE_WITH_MLU
template <>
void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::MLUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetMLUDeviceId(src_place.device);
if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2HAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event(
"MLUMemcpyD2HSync:MLU->CPU", platform::TracerEventType::UserDefined, 1);
platform::MLUMemcpyD2HSync(dst, src, num);
}
}
template <>
void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetMLUDeviceId(dst_place.device);
if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyH2DAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(dst_place))->Wait();
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event(
"MLUMemcpyH2DSync:CPU->MLU", platform::TracerEventType::UserDefined, 1);
platform::MLUMemcpyH2DSync(dst, src, num);
}
}
template <>
void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
void* dst,
platform::MLUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
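  // Same-device copies use the D2D memcpy path (asynchronous when a stream is
  // passed in); copies between two different MLU devices go through the
  // peer-to-peer path below.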
if (dst_place == src_place) {
platform::SetMLUDeviceId(dst_place.device);
if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyD2DAsync(same_mlu):MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2DAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event("MLUMemcpyD2DSync(same_mlu):MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2DSync(dst, src, num);
}
} else {
if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyPeerAsync(dst,
dst_place.device,
src,
src_place.device,
num,
reinterpret_cast<mluStream>(stream));
} else {
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyPeerSync(
dst, dst_place.device, src, src_place.device, num);
}
}
}
// NOTE: only for CPUPlace and MLUPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst, place_src;
return Copy(place_dst, dst, place_src, src, num);
} else if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::MLU) {
platform::MLUPlace place_dst(dst_place.GetDeviceId());
platform::CPUPlace place_src;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::MLU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::MLUPlace place_src(src_place.GetDeviceId());
platform::CPUPlace place_dst;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::MLU &&
dst_place.GetType() == phi::AllocationType::MLU) {
platform::MLUPlace place_src(src_place.GetDeviceId());
platform::MLUPlace place_dst(dst_place.GetDeviceId());
return Copy(place_dst, dst, place_src, src, num, stream);
#ifdef PADDLE_WITH_CUSTOM_DEVICE
} else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT
dst_place.GetType() == phi::AllocationType::CUSTOM) {
platform::CPUPlace place_src;
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CustomPlace place_src(src_place);
platform::CPUPlace place_dst;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == phi::AllocationType::CUSTOM) {
platform::CustomPlace place_src(src_place);
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
#endif
}
}
// NOTE: only for (CPUPlace and MLUPlace) -> (MLUPlace)
template <>
void Copy<phi::MLUPlace, phi::Place>(phi::MLUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()),
dst,
src_place,
src,
num,
stream);
}
// NOTE: only for (MLUPlace) -> (CPUPlace and MLUPlace)
template <>
void Copy<phi::Place, phi::MLUPlace>(phi::Place dst_place,
void* dst,
phi::MLUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place,
dst,
phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src,
num,
stream);
}
// NOTE: only for (MLUPlace) -> (CPUPlace) with mluStream.
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (CPUPlace) -> (MLUPlace) with mluStream.
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
void* dst,
phi::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
#endif // PADDLE_WITH_MLU
// NOTE: Only for CPUPlace, XPUPlace and PinnedPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
...
...@@ -16,9 +16,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#endif
namespace paddle {
namespace memory {
...
...@@ -55,10 +55,6 @@ if (WITH_LITE)
  add_subdirectory(lite)
endif()
if (WITH_MLU)
add_subdirectory(mlu)
endif()
if(WITH_CINN)
  add_subdirectory(cinn)
endif()
...@@ -135,10 +131,6 @@ if (WITH_ASCEND_CL)
  op_library(sync_batch_norm_op)
endif()
if (WITH_MLU)
op_library(sync_batch_norm_op)
endif()
op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute)
op_library(recurrent_op DEPS ${OP_HEADER_DEPS})
...
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
const auto xs = ctx.MultiInput<phi::DenseTensor>("X");
const auto* scale = ctx.Input<phi::DenseTensor>("Scale");
auto outs = ctx.MultiOutput<phi::DenseTensor>("Out");
auto* found_inf = ctx.Output<phi::DenseTensor>("FoundInfinite");
found_inf->mutable_data<bool>(dev_ctx.GetPlace());
MLUCnnlTensorDesc scale_desc(*scale);
MLUCnnlTensorDesc found_inf_desc(
*found_inf, CNNL_LAYOUT_ARRAY, ToCnnlDataType<bool>());
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(ctx.GetPlace());
// check is_finite or is_nan
phi::DenseTensor is_finite(found_inf->type());
if (i != 0) {
is_finite.Resize(phi::make_ddim({1}));
is_finite.mutable_data<bool>(ctx.GetPlace());
} else {
is_finite.ShareDataWith(*found_inf);
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::IsNanInf(
ctx, x_desc.get(), GetBasePtr(x), GetBasePtr(&is_finite));
      // accumulate the per-input nan/inf flag into found_inf with a logical
      // OR after checking every input
if (i != 0) {
MLUCnnlTensorDesc is_finite_desc(
is_finite, CNNL_LAYOUT_ARRAY, ToCnnlDataType<bool>());
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_OR,
found_inf_desc.get(),
GetBasePtr(found_inf),
is_finite_desc.get(),
GetBasePtr(&is_finite),
found_inf_desc.get(),
GetBasePtr(found_inf));
}
      // The expected logic is:
      //   out = in,         if found_inf == true
      //   out = in / scale, if found_inf == false
      // But when found_inf is true the data of Out should not be used anyway,
      // so on MLU we always compute out as in / scale.
phi::DenseTensor float_x;
phi::DenseTensor float_out;
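      // For float16 inputs the division is carried out in float32 for
      // precision: cast x up to fp32, divide by scale, then cast the quotient
      // back to fp16.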
if (std::is_same<T, paddle::platform::float16>::value) {
float_x.Resize(x->dims());
float_out.Resize(out->dims());
float_x.mutable_data<MPDType>(ctx.GetPlace());
float_out.mutable_data<MPDType>(ctx.GetPlace());
MLUCnnlTensorDesc float_x_desc(float_x);
MLUCnnlTensorDesc float_out_desc(float_out);
auto cast_fp16_type =
GetCastDataType(DataType::FLOAT16, DataType::FLOAT32);
MLUCnnl::Cast(ctx,
cast_fp16_type,
x_desc.get(),
GetBasePtr(x),
float_x_desc.get(),
GetBasePtr(&float_x));
MLUCnnl::Div(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
float_x_desc.get(),
GetBasePtr(&float_x),
scale_desc.get(),
GetBasePtr(scale),
float_out_desc.get(),
GetBasePtr(&float_out));
auto cast_fp32_type =
GetCastDataType(DataType::FLOAT32, DataType::FLOAT16);
MLUCnnl::Cast(ctx,
cast_fp32_type,
float_out_desc.get(),
GetBasePtr(&float_out),
out_desc.get(),
GetBasePtr(out));
} else {
MLUCnnl::Div(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
x_desc.get(),
GetBasePtr(x),
scale_desc.get(),
GetBasePtr(scale),
out_desc.get(),
GetBasePtr(out));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(check_finite_and_unscale,
ops::CheckFiniteAndUnscaleMLUKernel<float>,
ops::CheckFiniteAndUnscaleMLUKernel<plat::float16>);
...@@ -21,9 +21,7 @@ limitations under the License. */
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h" #include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h"
#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" #include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h"
#include "paddle/fluid/prim/utils/static/desc_tensor.h" #include "paddle/fluid/prim/utils/static/desc_tensor.h"
...@@ -119,21 +117,6 @@ class CastOp : public framework::OperatorWithKernel { ...@@ -119,21 +117,6 @@ class CastOp : public framework::OperatorWithKernel {
} }
// NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_MKLDNN // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_MKLDNN
#ifdef PADDLE_WITH_MLU
auto src_type = static_cast<VT::Type>(ctx.Attr<int>("in_dtype"));
auto dst_type = static_cast<VT::Type>(ctx.Attr<int>("out_dtype"));
if (src_type == dst_type || MLUSupportsCast(src_type, dst_type)) {
return phi::KernelKey(framework::TransToProtoVarType(tensor->dtype()),
tensor_place);
} else {
VLOG(3) << "MLU not support cast type: "
<< framework::DataTypeToString(src_type)
<< " to type: " << framework::DataTypeToString(dst_type)
<< ", fallbacking to CPU one!";
return phi::KernelKey(framework::TransToProtoVarType(tensor->dtype()),
platform::CPUPlace());
}
#endif
  return phi::KernelKey(framework::TransToProtoVarType(tensor->dtype()),
                        tensor_place);
}
...
...@@ -23,9 +23,6 @@
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/framework/convert_utils.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/infermeta/multiary.h"
...@@ -57,17 +54,8 @@ struct FillConstantVisitor { ...@@ -57,17 +54,8 @@ struct FillConstantVisitor {
void apply(typename std::enable_if<!(std::is_same<T, int8_t>::value || void apply(typename std::enable_if<!(std::is_same<T, int8_t>::value ||
std::is_same<T, int16_t>::value)>::type std::is_same<T, int16_t>::value)>::type
* = nullptr) const { * = nullptr) const {
#if defined(PADDLE_WITH_MLU)
if (platform::is_mlu_place(context_.GetPlace())) {
FillMLUTensorWithHostValue<T>(context_, static_cast<T>(value_), tensor_);
} else {
phi::funcs::SetConstant<DeviceContext, T> set_constant;
set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
}
#else
    phi::funcs::SetConstant<DeviceContext, T> set_constant;
    set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
#endif
}
const DeviceContext &dev_ctx_;
...@@ -509,14 +497,6 @@ REGISTER_OPERATOR(coalesce_tensor,
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#if defined(PADDLE_WITH_MLU)
REGISTER_OP_MLU_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<phi::CPUContext, plat::float16>,
ops::CoalesceTensorOpKernel<phi::CPUContext, int>,
ops::CoalesceTensorOpKernel<phi::CPUContext, float>);
#endif
REGISTER_OP_VERSION(coalesce_tensor)
    .AddCheckpoint(
        R"ROC(
...
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/barrier_op.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class BarrierOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CNCL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
cnclDataType_t dtype =
platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype()));
int64_t numel = in->numel();
const void* sendbuff = in->data();
void* recvbuff = out->mutable_data<T>(place);
int rid = ctx.Attr<int>("ring_id");
auto cncl_comm = platform::CNCLCommContext::Instance().Get(rid, place);
auto* comm = cncl_comm->comm();
auto comm_stream = cncl_comm->stream();
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
cnclReduceOp_t cncl_red_type = cnclSum;
dev_ctx.Wait();
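    // The barrier is emulated with a cnclAllReduce: every rank must reach
    // this call before the reduction can complete, and the queue sync below
    // blocks until it does.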
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(
sendbuff, recvbuff, numel, dtype, cncl_red_type, comm, comm_stream));
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
#else
PADDLE_THROW(platform::errors::Unavailable(
"PaddlePaddle should compile with CNCL."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(barrier, ops::BarrierOpMLUKernel<int>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
#include "paddle/fluid/framework/convert_utils.h"
namespace paddle {
namespace operators {
template <typename T>
class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto place = ctx.GetPlace();
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
#if defined(PADDLE_WITH_CNCL)
auto x = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
int nranks = ctx.Attr<int>("nranks");
int rid = ctx.Attr<int>("ring_id");
auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
PADDLE_ENFORCE_EQ(
nranks,
comm->nranks(),
platform::errors::InvalidArgument(
"nranks: %s should equal to %s", nranks, comm->nranks()));
framework::DDim out_dims = x->dims();
out_dims[0] *= nranks;
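    // Every rank contributes a tensor shaped like X, so the gathered output
    // is nranks times larger along dim 0.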
out->mutable_data<T>(out_dims, place);
uint32_t send_numel = x->numel();
void* send_buff;
void* recv_buff;
phi::DenseTensor in_tensor, out_tensor;
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
      // cast from int64 to int32 since cncl does not support int64
in_tensor.mutable_data<int32_t>(x->dims(), place);
out_tensor.mutable_data<int32_t>(out->dims(), place);
MLUCnnlTensorDesc x_int64_desc(*x);
MLUCnnlTensorDesc x_int32_desc(in_tensor);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT64, VT::INT32);
MLUCnnl::Cast(ctx,
cast_type,
x_int64_desc.get(),
GetBasePtr(x),
x_int32_desc.get(),
GetBasePtr(&in_tensor));
send_buff = reinterpret_cast<void*>(in_tensor.data<int32_t>());
recv_buff = reinterpret_cast<void*>(out_tensor.data<int32_t>());
} else {
in_tensor.ShareDataWith(*x);
out_tensor.ShareDataWith(*out);
send_buff = reinterpret_cast<void*>(in_tensor.data<T>());
recv_buff = reinterpret_cast<void*>(out_tensor.data<T>());
}
mluStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
cnclDataType_t dtype = platform::ToCNCLDataType(
framework::TransToProtoVarType(in_tensor.dtype()));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(
send_buff, recv_buff, send_numel, dtype, comm->comm(), stream));
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
      // cast the int32 out_tensor back into the int64 output
MLUCnnlTensorDesc out_int64_desc(*out);
MLUCnnlTensorDesc out_int32_desc(out_tensor);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(ctx,
cast_type,
out_int32_desc.get(),
GetBasePtr(&out_tensor),
out_int64_desc.get(),
GetBasePtr(out));
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allgather,
ops::CAllGatherOpMLUKernel<float>,
ops::CAllGatherOpMLUKernel<uint8_t>,
ops::CAllGatherOpMLUKernel<int>,
ops::CAllGatherOpMLUKernel<int8_t>,
ops::CAllGatherOpMLUKernel<int16_t>,
ops::CAllGatherOpMLUKernel<int64_t>,
ops::CAllGatherOpMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allreduce_max,
ops::CAllReduceOpMLUKernel<ops::kRedMax, float>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, int>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allreduce_min,
ops::CAllReduceOpMLUKernel<ops::kRedMin, float>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, int>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allreduce_prod,
ops::CAllReduceOpMLUKernel<ops::kRedProd, float>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, int>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allreduce_sum,
ops::CAllReduceOpMLUKernel<ops::kRedSum, float>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class CBroadcastOPMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CNCL)
auto x = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
int numel = x->numel();
cnclDataType_t dtype =
platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));
int rid = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
mluStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int root = ctx.Attr<int>("root");
if (root == comm->rank()) {
PADDLE_ENFORCE_MLU_SUCCESS(
cnclBcast(reinterpret_cast<void*>(const_cast<T*>(x->data<T>())),
numel,
dtype,
root,
comm->comm(),
stream));
VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent "
<< x->numel();
if (out != x) {
framework::TensorCopy(
*static_cast<const phi::DenseTensor*>(x),
place,
*platform::DeviceContextPool::Instance().Get(place),
static_cast<phi::DenseTensor*>(out));
}
} else {
PADDLE_ENFORCE_MLU_SUCCESS(cnclBcast(out->mutable_data<T>(place),
numel,
dtype,
root,
comm->comm(),
stream));
VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received "
<< phi::product(out->dims());
}
out->Resize(x->dims());
out->set_lod(x->lod());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_broadcast,
ops::CBroadcastOPMLUKernel<float>,
ops::CBroadcastOPMLUKernel<plat::float16>,
ops::CBroadcastOPMLUKernel<int>,
ops::CBroadcastOPMLUKernel<int16_t>,
ops::CBroadcastOPMLUKernel<int8_t>,
ops::CBroadcastOPMLUKernel<uint8_t>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_reduce_max,
ops::CReduceOpMLUKernel<ops::kRedMax, float>,
ops::CReduceOpMLUKernel<ops::kRedMax, plat::float16>,
ops::CReduceOpMLUKernel<ops::kRedMax, int>,
ops::CReduceOpMLUKernel<ops::kRedMax, int16_t>,
ops::CReduceOpMLUKernel<ops::kRedMax, int8_t>,
ops::CReduceOpMLUKernel<ops::kRedMax, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_reduce_min,
ops::CReduceOpMLUKernel<ops::kRedMin, float>,
ops::CReduceOpMLUKernel<ops::kRedMin, plat::float16>,
ops::CReduceOpMLUKernel<ops::kRedMin, int>,
ops::CReduceOpMLUKernel<ops::kRedMin, int16_t>,
ops::CReduceOpMLUKernel<ops::kRedMin, int8_t>,
ops::CReduceOpMLUKernel<ops::kRedMin, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_reduce_prod,
ops::CReduceOpMLUKernel<ops::kRedProd, float>,
ops::CReduceOpMLUKernel<ops::kRedProd, plat::float16>,
ops::CReduceOpMLUKernel<ops::kRedProd, int>,
ops::CReduceOpMLUKernel<ops::kRedProd, int16_t>,
ops::CReduceOpMLUKernel<ops::kRedProd, int8_t>,
ops::CReduceOpMLUKernel<ops::kRedProd, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_reduce_sum,
ops::CReduceOpMLUKernel<ops::kRedSum, float>,
ops::CReduceOpMLUKernel<ops::kRedSum, plat::float16>,
ops::CReduceOpMLUKernel<ops::kRedSum, int>,
ops::CReduceOpMLUKernel<ops::kRedSum, int16_t>,
ops::CReduceOpMLUKernel<ops::kRedSum, int8_t>,
ops::CReduceOpMLUKernel<ops::kRedSum, uint8_t>)
...@@ -33,22 +33,7 @@ Call calculation stream synchronization.
namespace ops = paddle::operators;
namespace plat = paddle::platform;
using MLU = plat::MLUPlace;
REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream,
                             ops::CSyncCalcStreamOp,
                             ops::CSyncCalcStreamOpMaker);
REGISTER_OP_NPU_KERNEL(c_sync_calc_stream,
ops::CSyncCalcStreamKernel<float, MLU>,
ops::CSyncCalcStreamKernel<double, MLU>,
ops::CSyncCalcStreamKernel<int, MLU>,
ops::CSyncCalcStreamKernel<int64_t, MLU>,
ops::CSyncCalcStreamKernel<plat::float16, MLU>);
REGISTER_OP_MLU_KERNEL(c_sync_calc_stream,
ops::CSyncCalcStreamKernel<float, MLU>,
ops::CSyncCalcStreamKernel<double, MLU>,
ops::CSyncCalcStreamKernel<int, MLU>,
ops::CSyncCalcStreamKernel<int64_t, MLU>,
ops::CSyncCalcStreamKernel<plat::float16, MLU>);
...@@ -56,6 +56,3 @@ REGISTER_OP_WITHOUT_GRADIENT(c_sync_comm_stream,
REGISTER_OP_NPU_KERNEL(c_sync_comm_stream,
                       ops::CSyncCommStreamKernel<float, plat::NPUPlace>);
REGISTER_OP_MLU_KERNEL(c_sync_comm_stream,
ops::CSyncCommStreamKernel<float, plat::MLUPlace>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(mp_allreduce_sum,
ops::CAllReduceOpMLUKernel<ops::kRedSum, float>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class EqualMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_EQ,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class NotEqualMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_NE,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class LessThanMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_LT,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class LessEqualMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_LE,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class GreaterThanMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_GT,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class GreaterEqualMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_GE,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(
equal,
ops::EqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::EqualMLUKernel<plat::MLUDeviceContext, float>,
ops::EqualMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::EqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::EqualMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::EqualMLUKernel<plat::MLUDeviceContext, int>,
ops::EqualMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
not_equal,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, float>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, int>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
less_than,
ops::LessThanMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, float>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, int>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
less_equal,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, float>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, int>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
greater_than,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, float>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, int>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
greater_equal,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, float>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, int>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, bool>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T, cnnlLogicOp_t log_method>
class LogicalMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
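    // logical_not is unary and has no "Y" input, so alias y to x; this lets
    // the same binary MLUCnnl::Logic call serve all four logical ops.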
if (log_method == CNNL_LOGIC_OP_NOT) {
y = x;
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Logic(ctx,
log_method,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(logical_not,
ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_NOT>,
ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_NOT>,
ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_NOT>,
ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_NOT>,
ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_NOT>);
REGISTER_OP_MLU_KERNEL(logical_and,
ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_AND>,
ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_AND>,
ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_AND>,
ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_AND>,
ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_AND>);
REGISTER_OP_MLU_KERNEL(logical_or,
ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_OR>,
ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_OR>,
ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_OR>,
ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_OR>,
ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_OR>);
REGISTER_OP_MLU_KERNEL(logical_xor,
ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_XOR>,
ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_XOR>,
ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_XOR>,
ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_XOR>,
ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_XOR>);
...@@ -38,11 +38,6 @@ if(WITH_XPU)
  detection_library(prior_box_op SRCS prior_box_op.cc)
  detection_library(yolo_box_op SRCS yolo_box_op.cc)
  detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
elseif(WITH_MLU)
detection_library(iou_similarity_op SRCS iou_similarity_op.cc
iou_similarity_op_mlu.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc)
detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc)
else()
  detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                    iou_similarity_op.cu)
...
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/iou_similarity_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
struct IouFunction {
public:
explicit IouFunction(const framework::ExecutionContext& ctx) : ctx(ctx) {
place = ctx.GetPlace();
}
void Transpose(const phi::DenseTensor* x,
phi::DenseTensor* y,
const std::vector<int>& axis) {
// y should be init first
TransposeFromMLUTensor<T>(ctx, axis, x, y, false /*need_reshape_or_alloc*/);
}
void Add(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
    // z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnlOpTensorDesc add_op_desc(
CNNL_OP_TENSOR_ADD, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
add_op_desc.get(),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z),
ToCnnlDataType<T>());
}
void Sub(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
    // z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnlOpTensorDesc sub_op_desc(
CNNL_OP_TENSOR_SUB, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
sub_op_desc.get(),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z),
ToCnnlDataType<T>());
}
void Mul(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z),
ToCnnlDataType<T>());
}
void DivNoNan(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
cnnlComputationPreference_t prefer = CNNL_COMPUTATION_FAST;
MLUCnnl::DivNoNan(ctx,
prefer,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z));
}
void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) {
// y should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
float alpha = 1.0;
float beta = scalar;
MLUCnnl::Transform(ctx,
&alpha,
&beta,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y));
}
void Maximum(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnl::Maximum(ctx,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z));
}
void Minimum(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnl::Minimum(ctx,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z));
}
private:
platform::Place place;
const framework::ExecutionContext& ctx;
};
template <typename T>
class IouSimilarityMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
bool normalized = ctx.Attr<bool>("box_normalized");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto _type = x->dtype();
auto place = ctx.GetPlace();
IouFunction<T> F(ctx);
auto N = x->dims()[0];
auto M = y->dims()[0];
out->mutable_data<T>({N, M}, place);
phi::DenseTensor xt(_type);
phi::DenseTensor yt(_type);
xt.mutable_data<T>({4, N}, place);
yt.mutable_data<T>({4, M}, place);
std::vector<int> vec_trans = {1, 0};
F.Transpose(x, &xt, vec_trans);
F.Transpose(y, &yt, vec_trans);
phi::DenseTensor xmin1 = xt.Slice(0, 1);
phi::DenseTensor ymin1 = xt.Slice(1, 2);
phi::DenseTensor xmax1 = xt.Slice(2, 3);
phi::DenseTensor ymax1 = xt.Slice(3, 4);
phi::DenseTensor xmin2 = yt.Slice(0, 1);
phi::DenseTensor ymin2 = yt.Slice(1, 2);
phi::DenseTensor xmax2 = yt.Slice(2, 3);
phi::DenseTensor ymax2 = yt.Slice(3, 4);
xmin1.Resize({N, 1});
ymin1.Resize({N, 1});
xmax1.Resize({N, 1});
ymax1.Resize({N, 1});
xmin2.Resize({1, M});
ymin2.Resize({1, M});
xmax2.Resize({1, M});
ymax2.Resize({1, M});
phi::DenseTensor w1(_type);
phi::DenseTensor h1(_type);
phi::DenseTensor w2(_type);
phi::DenseTensor h2(_type);
phi::DenseTensor area1(_type);
phi::DenseTensor area2(_type);
w1.mutable_data<T>({N, 1}, place);
h1.mutable_data<T>({N, 1}, place);
w2.mutable_data<T>({1, M}, place);
h2.mutable_data<T>({1, M}, place);
area1.mutable_data<T>({N, 1}, place);
area2.mutable_data<T>({1, M}, place);
F.Sub(&xmax1, &xmin1, &w1);
F.Sub(&ymax1, &ymin1, &h1);
F.Sub(&xmax2, &xmin2, &w2);
F.Sub(&ymax2, &ymin2, &h2);
if (!normalized) {
F.Adds(&w1, 1.0f, &w1);
F.Adds(&h1, 1.0f, &h1);
F.Adds(&w2, 1.0f, &w2);
F.Adds(&h2, 1.0f, &h2);
}
F.Mul(&w1, &h1, &area1);
F.Mul(&w2, &h2, &area2);
phi::DenseTensor inter_xmax(_type);
phi::DenseTensor inter_ymax(_type);
phi::DenseTensor inter_xmin(_type);
phi::DenseTensor inter_ymin(_type);
inter_xmax.mutable_data<T>({N, M}, place);
inter_ymax.mutable_data<T>({N, M}, place);
inter_xmin.mutable_data<T>({N, M}, place);
inter_ymin.mutable_data<T>({N, M}, place);
F.Minimum(&xmax1, &xmax2, &inter_xmax);
F.Minimum(&ymax1, &ymax2, &inter_ymax);
F.Maximum(&xmin1, &xmin2, &inter_xmin);
F.Maximum(&ymin1, &ymin2, &inter_ymin);
phi::DenseTensor inter_w(_type);
phi::DenseTensor inter_h(_type);
inter_w.mutable_data<T>({N, M}, place);
inter_h.mutable_data<T>({N, M}, place);
F.Sub(&inter_xmax, &inter_xmin, &inter_w);
F.Sub(&inter_ymax, &inter_ymin, &inter_h);
if (!normalized) {
F.Adds(&inter_w, 1.0f, &inter_w);
F.Adds(&inter_h, 1.0f, &inter_h);
}
phi::DenseTensor zeros(_type);
zeros.mutable_data<T>({1}, place);
FillMLUTensorWithHostValue<T>(ctx, static_cast<T>(0), &zeros);
F.Maximum(&inter_w, &zeros, &inter_w);
F.Maximum(&inter_h, &zeros, &inter_h);
F.Mul(&inter_w, &inter_h, out);
phi::DenseTensor union_area(_type);
union_area.mutable_data<T>({N, M}, place);
F.Add(&area1, &area2, &union_area);
F.Sub(&union_area, out, &union_area);
F.DivNoNan(out, &union_area, out);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(iou_similarity,
ops::IouSimilarityMLUKernel<float>,
ops::IouSimilarityMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/detection/prior_box_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class PriorBoxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto* image = ctx.Input<phi::DenseTensor>("Image");
auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
auto* variances = ctx.Output<phi::DenseTensor>("Variances");
float step_w = ctx.Attr<float>("step_w");
float step_h = ctx.Attr<float>("step_h");
float offset = ctx.Attr<float>("offset");
bool clip = ctx.Attr<bool>("clip");
bool min_max_aspect_ratios_order =
ctx.Attr<bool>("min_max_aspect_ratios_order");
int im_width = image->dims()[3];
int im_height = image->dims()[2];
int width = input->dims()[3];
int height = input->dims()[2];
auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
bool flip = ctx.Attr<bool>("flip");
std::vector<float> new_aspect_ratios;
ExpandAspectRatios(aspect_ratios, flip, &new_aspect_ratios);
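// PriorBox generates one default (prior) box per min_size / max_size /
// aspect-ratio combination for every cell of the [height, width] feature map,
// mapped back onto the [im_height, im_width] image; the variance vector is
// attached to every generated box.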
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
phi::DenseTensor ratios;
paddle::framework::TensorFromVector(new_aspect_ratios, dev_ctx, &ratios);
MLUOpTensorDesc new_aspect_ratios_desc(ratios);
auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
phi::DenseTensor min;
paddle::framework::TensorFromVector(min_sizes, dev_ctx, &min);
MLUOpTensorDesc min_sizes_desc(min);
auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
phi::DenseTensor max;
paddle::framework::TensorFromVector(max_sizes, dev_ctx, &max);
MLUOpTensorDesc max_sizes_desc(max);
auto variances_attr = ctx.Attr<std::vector<float>>("variances");
phi::DenseTensor var_tensor;
paddle::framework::TensorFromVector(variances_attr, dev_ctx, &var_tensor);
MLUOpTensorDesc variances_attr_desc(var_tensor);
auto place = ctx.GetPlace();
boxes->mutable_data<T>(place);
variances->mutable_data<T>(place);
MLUOpTensorDesc var_desc(*variances);
MLUOpTensorDesc output_desc(*boxes);
MLUOP::OpPriorBox(ctx,
min_sizes_desc.get(),
GetBasePtr(&min),
new_aspect_ratios_desc.get(),
GetBasePtr(&ratios),
variances_attr_desc.get(),
GetBasePtr(&var_tensor),
max_sizes_desc.get(),
GetBasePtr(&max),
height,
width,
im_height,
im_width,
step_h,
step_w,
offset,
clip,
min_max_aspect_ratios_order,
output_desc.get(),
GetBasePtr(boxes),
var_desc.get(),
GetBasePtr(variances));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(prior_box, ops::PriorBoxMLUKernel<float>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class YoloBoxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* img_size = ctx.Input<phi::DenseTensor>("ImgSize");
auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
auto* scores = ctx.Output<phi::DenseTensor>("Scores");
const std::vector<int> anchors = ctx.Attr<std::vector<int>>("anchors");
auto class_num = ctx.Attr<int>("class_num");
auto conf_thresh = ctx.Attr<float>("conf_thresh");
auto downsample_ratio = ctx.Attr<int>("downsample_ratio");
auto clip_bbox = ctx.Attr<bool>("clip_bbox");
auto scale = ctx.Attr<float>("scale_x_y");
auto iou_aware = ctx.Attr<bool>("iou_aware");
auto iou_aware_factor = ctx.Attr<float>("iou_aware_factor");
int anchor_num = anchors.size() / 2;
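// anchors stores (width, height) pairs, hence anchor_num = anchors.size() / 2.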
int64_t size = anchors.size();
auto dim_x = x->dims();
int n = dim_x[0];
int s = anchor_num;
int h = dim_x[2];
int w = dim_x[3];
// The outputs of mluOpYoloBox: a 4-D tensor with shape [N, anchor_num, 4, H*W]
// holding the box coordinates, and a 4-D tensor with shape
// [N, anchor_num, class_num, H*W] holding the classification scores.
std::vector<int64_t> boxes_dim_mluops({n, s, 4, h * w});
std::vector<int64_t> scores_dim_mluops({n, s, class_num, h * w});
// The Paddle op expects a 3-D tensor with shape [N, M, 4] for the box
// coordinates and a 3-D tensor with shape [N, M, class_num] for the
// classification scores.
std::vector<int64_t> boxes_out_dim({n, s, h * w, 4});
std::vector<int64_t> scores_out_dim({n, s, h * w, class_num});
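// The MLU op writes [N, S, 4, H*W] / [N, S, class_num, H*W]; transposing the
// last two dims yields [N, S, H*W, 4] / [N, S, H*W, class_num], which matches
// Paddle's [N, M, ...] layout once S and H*W are flattened into M.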
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
phi::DenseTensor boxes_tensor_mluops =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, 4, h * w}, dev_ctx);
phi::DenseTensor scores_tensor_mluops =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, class_num, h * w},
dev_ctx);
MLUOpTensorDesc boxes_trans_desc_mluops(
4, boxes_dim_mluops.data(), ToMluOpDataType<T>());
MLUCnnlTensorDesc boxes_trans_desc_cnnl(
4, boxes_dim_mluops.data(), ToCnnlDataType<T>());
MLUOpTensorDesc scores_trans_desc_mluops(
4, scores_dim_mluops.data(), ToMluOpDataType<T>());
MLUCnnlTensorDesc scores_trans_desc_cnnl(
4, scores_dim_mluops.data(), ToCnnlDataType<T>());
boxes->mutable_data<T>(ctx.GetPlace());
scores->mutable_data<T>(ctx.GetPlace());
FillMLUTensorWithHostValue(ctx, static_cast<T>(0), boxes);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0), scores);
MLUOpTensorDesc x_desc(*x, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<T>());
MLUOpTensorDesc img_size_desc(
*img_size, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<int32_t>());
phi::DenseTensor anchors_temp(framework::TransToPhiDataType(VT::INT32));
anchors_temp.Resize({size});
paddle::framework::TensorFromVector(
anchors, ctx.device_context(), &anchors_temp);
MLUOpTensorDesc anchors_desc(anchors_temp);
MLUCnnlTensorDesc boxes_desc_cnnl(
4, boxes_out_dim.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc scores_desc_cnnl(
4, scores_out_dim.data(), ToCnnlDataType<T>());
MLUOP::OpYoloBox(ctx,
x_desc.get(),
GetBasePtr(x),
img_size_desc.get(),
GetBasePtr(img_size),
anchors_desc.get(),
GetBasePtr(&anchors_temp),
class_num,
conf_thresh,
downsample_ratio,
clip_bbox,
scale,
iou_aware,
iou_aware_factor,
boxes_trans_desc_mluops.get(),
GetBasePtr(&boxes_tensor_mluops),
scores_trans_desc_mluops.get(),
GetBasePtr(&scores_tensor_mluops));
const std::vector<int> perm = {0, 1, 3, 2};
// transpose the boxes from [N, S, 4, H*W] to [N, S, H*W, 4]
MLUCnnl::Transpose(ctx,
perm,
4,
boxes_trans_desc_cnnl.get(),
GetBasePtr(&boxes_tensor_mluops),
boxes_desc_cnnl.get(),
GetBasePtr(boxes));
// transpose the scores from [N, S, class_num, H*W] to [N, S, H*W,
// class_num]
MLUCnnl::Transpose(ctx,
perm,
4,
scores_trans_desc_cnnl.get(),
GetBasePtr(&scores_tensor_mluops),
scores_desc_cnnl.get(),
GetBasePtr(scores));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(yolo_box, ops::YoloBoxMLUKernel<float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseAddMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_ADD);
}
};
template <typename T>
class ElementwiseAddGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
MLUCnnlTensorDesc dout_desc(*dout);
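// For out = x + y the gradient w.r.t. both inputs is dout itself; when an
// input was broadcast, dout is reduce-summed over the broadcast axes,
// otherwise it is copied through unchanged.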
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
GetReduceAxesAndDstDims(
axis, dout->dims(), dx->dims(), &reduce_axes, &dst_dims_vec);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(
dst_dims_vec.size(), dst_dims_vec.data(), ToCnnlDataType<T>());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(dout),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
} else {
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
}
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
GetReduceAxesAndDstDims(
axis, dout->dims(), dy->dims(), &reduce_axes, &dst_dims_vec);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(
dst_dims_vec.size(), dst_dims_vec.data(), ToCnnlDataType<T>());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(dout),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
} else {
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dy);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_add,
ops::ElementwiseAddMLUKernel<float>,
ops::ElementwiseAddMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_add_grad,
ops::ElementwiseAddGradMLUKernel<float>,
ops::ElementwiseAddGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseDivMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUBinaryOp<DIV, T>(ctx);
}
};
template <typename T>
class ElementwiseDivGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
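// For out = x / y: dx = dout / y and dy = -out * dout / y. Both branches reuse
// dout_div_y = dout / y computed once below, then reduce-sum over broadcast
// axes whenever dx or dy is smaller than dout.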
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
: axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
// compute dout/y == 1/y * dout
phi::DenseTensor dout_div_y(dout->dtype());
dout_div_y.Resize(dout->dims());
dout_div_y.mutable_data<T>(ctx.GetPlace());
MLUBinary<DIV>(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
dout_desc.get(),
GetBasePtr(dout),
y_desc.get(),
GetBasePtr(y),
dout_desc.get(),
GetBasePtr(&dout_div_y));
if (dx) {
// compute dx = dout/y = 1/y * dout
if (dx->dims() != dout->dims()) {
dx->mutable_data<T>(ctx.GetPlace());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dout_div_y.dims(), dx->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dout_div_y),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
} else {
dx->ShareDataWith(dout_div_y);
}
}
if (dy) {
// compute dy = -out * (dout/y) = -out/y * dout
phi::DenseTensor neg_out(out->type());
neg_out.mutable_data<T>(out->dims(), ctx.GetPlace());
MLUCnnlTensorDesc out_desc(*out);
MLUUnary<NEG>(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
out_desc.get(),
GetBasePtr(out),
out_desc.get(),
GetBasePtr(&neg_out));
phi::DenseTensor dy_temp(y->dtype());
dy_temp.Resize(dout->dims());
dy_temp.mutable_data<T>(ctx.GetPlace());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(&neg_out),
dout_desc.get(),
GetBasePtr(&dout_div_y),
dout_desc.get(),
GetBasePtr(&dy_temp),
ToCnnlDataType<T>());
if (dy->dims() != dout->dims()) {
dy->mutable_data<T>(ctx.GetPlace());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dy_temp),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
} else {
dy->ShareDataWith(dy_temp);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_div,
ops::ElementwiseDivMLUKernel<int>,
ops::ElementwiseDivMLUKernel<float>,
ops::ElementwiseDivMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_div_grad,
ops::ElementwiseDivGradMLUKernel<int>,
ops::ElementwiseDivGradMLUKernel<float>,
ops::ElementwiseDivGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseMaxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUBinaryOp<MAXIMUM, T>(ctx);
}
};
template <typename T>
class ElementwiseMaxGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUMinMaxGradHelper<MAXIMUM_GRAD, T>(ctx);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(elementwise_max,
ops::ElementwiseMaxMLUKernel<int>,
ops::ElementwiseMaxMLUKernel<float>,
ops::ElementwiseMaxMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
elementwise_max_grad,
ops::ElementwiseMaxGradMLUKernel<int>,
ops::ElementwiseMaxGradMLUKernel<float>,
ops::ElementwiseMaxGradMLUKernel<paddle::platform::float16>);
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseMinMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUBinaryOp<MINIMUM, T>(ctx);
}
};
template <typename T>
class ElementwiseMinGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUMinMaxGradHelper<MINIMUM_GRAD, T>(ctx);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_min,
ops::ElementwiseMinMLUKernel<int>,
ops::ElementwiseMinMLUKernel<float>,
ops::ElementwiseMinMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_min_grad,
ops::ElementwiseMinGradMLUKernel<int>,
ops::ElementwiseMinGradMLUKernel<float>,
ops::ElementwiseMinGradMLUKernel<plat::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_MLU
#include <vector>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
inline void GetReduceAxes(const int axis,
const framework::DDim& src_ddims,
const framework::DDim& target_ddims,
std::vector<int>* axes) {
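// Collect the axes of src_ddims that were broadcast relative to target_ddims,
// with target_ddims aligned to src_ddims starting at `axis`.
// Hypothetical example: src = [2, 3, 4], target = [3, 1], axis = 1
// -> axes = {0, 2} (the leading dim plus the dim broadcast from 1 to 4).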
int64_t src_dim_size = src_ddims.size();
int64_t target_dim_size = target_ddims.size();
for (int64_t i = 0; i < src_dim_size; ++i) {
if (i < axis || i >= target_dim_size + axis) {
axes->push_back(i);
continue;
}
if (src_ddims[i] > target_ddims[i - axis]) {
axes->push_back(i);
}
}
}
inline void GetReduceAxesAndDstDims(const int axis,
const framework::DDim& src_ddims,
const framework::DDim& target_ddims,
std::vector<int>* reduce_axes,
std::vector<int>* dst_dims_vec) {
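// Same idea as GetReduceAxes, but also returns the destination dims with the
// reduced axes removed. Hypothetical example: src = [2, 3, 4], target = [3, 1],
// axis = 1 -> reduce_axes = {0, 2}, dst_dims_vec = {3}.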
int64_t src_dim_size = src_ddims.size();
int64_t target_dim_size = target_ddims.size();
int src_axis = (target_dim_size < src_dim_size ? axis : 0);
for (int ax = 0; ax < src_dim_size; ++ax) {
if ((ax < src_axis || ax >= src_axis + target_dim_size) ||
(src_ddims[ax] > 1 && target_ddims[ax - src_axis] == 1)) {
reduce_axes->push_back(ax);
} else {
dst_dims_vec->push_back(src_ddims[ax]);
}
}
if (dst_dims_vec->size() == 0) {
// target_var is scalar
dst_dims_vec->push_back(1);
}
}
template <typename T>
void MLUOpTensorKernel(const framework::ExecutionContext& ctx,
const cnnlOpTensorDesc_t op_tensor_op) {
PADDLE_ENFORCE_EQ(
platform::is_mlu_place(ctx.GetPlace()),
true,
platform::errors::Unavailable("This kernel only runs on MLU."));
PADDLE_ENFORCE_EQ((op_tensor_op == CNNL_OP_TENSOR_ADD) ||
(op_tensor_op == CNNL_OP_TENSOR_SUB) ||
(op_tensor_op == CNNL_OP_TENSOR_MUL),
true,
platform::errors::Unavailable(
"This kernel of MLU only support ADD, SUB, MUL."));
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
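// A negative axis counts from the trailing dimensions; map it to the offset at
// which y's dims align with x's before building the broadcast shapes.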
axis =
(axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*out);
MLUCnnlOpTensorDesc op_tensor_desc(
op_tensor_op, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(out),
ToCnnlDataType<T>());
}
// ------------------ BinaryOp -----------------
enum BINARY_FUNCTOR {
DIV,
DIVNONAN,
MAXIMUM,
MINIMUM,
POW,
};
template <BINARY_FUNCTOR func>
void MLUBinary(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t out_desc,
void* out);
template <>
inline void MLUBinary<DIV>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t out_desc,
void* out) {
MLUCnnl::Div(ctx, prefer, x_desc, x, y_desc, y, out_desc, out);
}
template <>
inline void MLUBinary<MAXIMUM>(
const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer, // unused, kept for a uniform interface
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t out_desc,
void* out) {
MLUCnnl::Maximum(ctx, x_desc, x, y_desc, y, out_desc, out);
}
template <>
inline void MLUBinary<MINIMUM>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t in1_desc,
const void* in1,
const cnnlTensorDescriptor_t in2_desc,
const void* in2,
const cnnlTensorDescriptor_t out_desc,
void* out) {
MLUCnnl::Minimum(ctx, in1_desc, in1, in2_desc, in2, out_desc, out);
}
template <>
inline void MLUBinary<POW>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t out_desc,
void* out) {
MLUCnnl::Pow(ctx, prefer, x_desc, x, y_desc, y, out_desc, out);
}
template <BINARY_FUNCTOR Functor, typename T>
void MLUBinaryOp(const framework::ExecutionContext& ctx) {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis =
(axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION;
MLUBinary<Functor>(ctx,
prefer_type,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(out));
}
// ------------------ UnaryOp -----------------
enum UNARY_FUNCTOR {
NEG,
RECIPROCAL,
};
template <UNARY_FUNCTOR func>
void MLUUnary(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
template <>
inline void MLUUnary<NEG>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output) {
MLUCnnl::Neg(ctx, input_desc, input, output_desc, output);
}
template <>
inline void MLUUnary<RECIPROCAL>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output) {
MLUCnnl::Reciprocal(ctx, input_desc, input, output_desc, output);
}
template <UNARY_FUNCTOR Functor, typename Tin, typename Tout = Tin>
void MLUUnaryOp(const framework::ExecutionContext& ctx) {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<Tout>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(x, CNNL_LAYOUT_ARRAY, ToCnnlDataType<Tin>());
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<Tout>());
cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION;
MLUUnary<Functor>(ctx,
prefer_type,
x_desc.get(),
GetBasePtr(x),
out_desc.get(),
GetBasePtr(out));
}
// ------------------ MLUElementwiseGradOp -----------------
enum MINMAX_GRAD_FUNCTOR {
MAXIMUM_GRAD,
MINIMUM_GRAD,
};
template <MINMAX_GRAD_FUNCTOR Functor, typename Tin, typename Tout = Tin>
void MLUMinMaxGradHelper(const framework::ExecutionContext& ctx) {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis =
(axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
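// Gradient scheme: for z = max(x, y), dz/dx is 1 where x >= y and 0 elsewhere
// (x <= y for min). A 0/1 mask is built with a logic op, then dx = dout * mask
// and dy = dout - dx; broadcast axes are reduce-summed afterwards when dx or
// dy has fewer elements than dout.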
// mask = Logic(x, y), using GE for the max grad and LE for the min grad
cnnlLogicOp_t logic =
Functor == MAXIMUM_GRAD ? CNNL_LOGIC_OP_GE : CNNL_LOGIC_OP_LE;
phi::DenseTensor mask(x->dtype());
mask.Resize(phi::make_ddim(out_dims_array));
mask.mutable_data<Tin>(ctx.GetPlace());
cnnlDataType_t data_type = ToCnnlDataType<Tin>();
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), data_type);
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), data_type);
MLUCnnlTensorDesc mask_desc(max_dim, out_dims_array.data(), data_type);
MLUCnnl::Logic(ctx,
logic,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
mask_desc.get(),
GetBasePtr(&mask));
// dx = Mul(dz, mask)
phi::DenseTensor dx_temp(x->dtype());
dx_temp.Resize(dout->dims());
dx_temp.mutable_data<Tout>(ctx.GetPlace());
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
dout_desc.get(),
GetBasePtr(&mask),
dout_desc.get(),
GetBasePtr(&dx_temp),
data_type);
// dy = Sub(dz, dx)
phi::DenseTensor dy_temp(y->dtype());
dy_temp.Resize(dout->dims());
dy_temp.mutable_data<Tout>(ctx.GetPlace());
MLUCnnlOpTensorDesc sub_op_desc(
CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
sub_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
dout_desc.get(),
GetBasePtr(&dx_temp),
dout_desc.get(),
GetBasePtr(&dy_temp),
data_type);
if (dx) {
if (dx->dims() != dout->dims()) {
dx->mutable_data<Tout>(ctx.GetPlace());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dx_temp.dims(), dx->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
data_type,
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dx_temp),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
} else {
dx->ShareDataWith(dx_temp);
}
}
if (dy) {
if (dy->dims() != dout->dims()) {
dy->mutable_data<Tout>(ctx.GetPlace());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
data_type,
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dy_temp),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
} else {
dy->ShareDataWith(dy_temp);
}
}
}
} // namespace operators
} // namespace paddle
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
using MLUDeviceContext = platform::MLUDeviceContext;
template <typename T>
class ElementwiseMulMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_MUL);
}
};
template <typename T>
class ElementwiseMulGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
: axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
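// For out = x * y the gradients are dx = dout * y and dy = dout * x. When an
// input was broadcast, the product is first formed at dout's shape and then
// reduce-summed over the broadcast axes.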
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() == dout->dims()) {
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
y_desc.get(),
GetBasePtr(y),
x_desc.get(),
GetBasePtr(dx),
ToCnnlDataType<T>());
} else {
phi::DenseTensor dx_temp(x->dtype());
dx_temp.Resize(dout->dims());
dx_temp.mutable_data<T>(ctx.GetPlace());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
y_desc.get(),
GetBasePtr(y),
dout_desc.get(),
GetBasePtr(&dx_temp),
ToCnnlDataType<T>());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dx_temp.dims(), dx->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dx_temp),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
}
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() == dout->dims()) {
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(dy),
ToCnnlDataType<T>());
} else {
phi::DenseTensor dy_temp(y->dtype());
dy_temp.Resize(dout->dims());
dy_temp.mutable_data<T>(ctx.GetPlace());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
dout_desc.get(),
GetBasePtr(&dy_temp),
ToCnnlDataType<T>());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dy_temp),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(elementwise_mul,
ops::ElementwiseMulMLUKernel<float>,
ops::ElementwiseMulMLUKernel<paddle::platform::float16>,
ops::ElementwiseMulMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(
elementwise_mul_grad,
ops::ElementwiseMulGradMLUKernel<float>,
ops::ElementwiseMulGradMLUKernel<paddle::platform::float16>,
ops::ElementwiseMulGradMLUKernel<int>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwisePowMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUBinaryOp<POW, T>(ctx);
}
};
template <typename T>
class ElementwisePowGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
auto place = ctx.GetPlace();
auto x_dims = x->dims();
auto y_dims = y->dims();
axis =
(axis < 0 ? std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
cnnlDataType_t data_type = ToCnnlDataType<T>();
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), data_type);
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), data_type);
MLUCnnlTensorDesc out_desc(max_dim, out_dims_array.data(), data_type);
auto dout_dims = dout->dims();
if (dx) {
// dx = dout * y * pow(x, y - 1);
phi::DenseTensor one_dx(y->type());
one_dx.mutable_data<T>(phi::make_ddim(y_dims_array), place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(1), &one_dx);
phi::DenseTensor sub_dx(y->type());
sub_dx.mutable_data<T>(phi::make_ddim(y_dims_array), place);
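// sub_dx = y - 1: a tensor of ones is subtracted from y, giving the exponent
// used by pow(x, y - 1) below.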
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
y_desc.get(),
GetBasePtr(y),
y_desc.get(),
GetBasePtr(&one_dx),
y_desc.get(),
GetBasePtr(&sub_dx),
data_type);
phi::DenseTensor tmp_dx(x->type());
tmp_dx.mutable_data<T>(phi::make_ddim(out_dims_array), place);
MLUCnnl::Pow(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(&sub_dx),
out_desc.get(),
GetBasePtr(&tmp_dx));
MLUCnnl::MulAx(ctx,
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(&tmp_dx));
MLUCnnl::MulAx(ctx,
out_desc.get(),
GetBasePtr(dout),
out_desc.get(),
GetBasePtr(&tmp_dx));
if (x_dims != dout_dims) {
dx->mutable_data<T>(place);
std::vector<int> reduce_axes;
GetReduceAxes(axis, dout_dims, x_dims, &reduce_axes);
if (!reduce_axes.empty()) {
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
data_type,
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
out_desc.get(),
GetBasePtr(&tmp_dx),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
}
} else {
dx->ShareDataWith(tmp_dx);
}
}
if (dy) {
// dy = dout * log(x) * pow(x, y)
phi::DenseTensor tmp_dy(y->type());
tmp_dy.mutable_data<T>(phi::make_ddim(out_dims_array), place);
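// tmp_dy starts as pow(x, y) and is then scaled by ln(x) and by dout through
// the MulAx calls below, producing dy before any broadcast reduction.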
MLUCnnl::Pow(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(&tmp_dy));
phi::DenseTensor log_x(x->type());
log_x.mutable_data<T>(x->dims(), place);
MLUCnnl::Log(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
CNNL_LOG_E,
x_desc.get(),
GetBasePtr(x),
x_desc.get(),
GetBasePtr(&log_x));
MLUCnnl::MulAx(ctx,
x_desc.get(),
GetBasePtr(&log_x),
out_desc.get(),
GetBasePtr(&tmp_dy));
MLUCnnl::MulAx(ctx,
out_desc.get(),
GetBasePtr(dout),
out_desc.get(),
GetBasePtr(&tmp_dy));
if (y_dims != dout_dims) {
dy->mutable_data<T>(place);
std::vector<int> reduce_axes;
GetReduceAxes(axis, dout_dims, y_dims, &reduce_axes);
if (!reduce_axes.empty()) {
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
data_type,
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
out_desc.get(),
GetBasePtr(&tmp_dy),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
}
} else {
dy->ShareDataWith(tmp_dy);
}
}
if (!dx && !dy) {
PADDLE_THROW(platform::errors::Unavailable(
"Not support all outputs to be empty."));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_pow,
ops::ElementwisePowMLUKernel<plat::float16>,
ops::ElementwisePowMLUKernel<float>);
REGISTER_OP_MLU_KERNEL(elementwise_pow_grad,
ops::ElementwisePowGradMLUKernel<plat::float16>,
ops::ElementwisePowGradMLUKernel<float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseSubMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_SUB);
}
};
template <typename T>
class ElementwiseSubGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
MLUCnnlTensorDesc dout_desc(*dout);
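// For out = x - y: dx = dout and dy = -dout; a broadcast input first has dout
// reduce-summed over the broadcast axes, then copied (for dx) or negated
// (for dy).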
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
GetReduceAxesAndDstDims(
axis, dout->dims(), dx->dims(), &reduce_axes, &dst_dims_vec);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(
dst_dims_vec.size(), dst_dims_vec.data(), ToCnnlDataType<T>());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(dout),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
} else {
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
}
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor* tmp_dout = const_cast<phi::DenseTensor*>(dout);
if (dy->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
GetReduceAxesAndDstDims(
axis, dout->dims(), dy->dims(), &reduce_axes, &dst_dims_vec);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(
dst_dims_vec.size(), dst_dims_vec.data(), ToCnnlDataType<T>());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(dout),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
tmp_dout = dy;
}
// negate: dy = -tmp_dout, where tmp_dout is dout or the already-reduced dout
// when the shapes differ
MLUCnnlTensorDesc tmp_dout_desc(*tmp_dout);
MLUCnnlTensorDesc dy_desc(*dy);
MLUUnary<NEG>(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
tmp_dout_desc.get(),
GetBasePtr(tmp_dout),
dy_desc.get(),
GetBasePtr(dy));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_sub,
ops::ElementwiseSubMLUKernel<int>,
ops::ElementwiseSubMLUKernel<float>,
ops::ElementwiseSubMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_sub_grad,
ops::ElementwiseSubGradMLUKernel<int>,
ops::ElementwiseSubGradMLUKernel<float>,
ops::ElementwiseSubGradMLUKernel<plat::float16>);
...@@ -43,13 +43,6 @@ inline std::vector<int> get_expand_shape( ...@@ -43,13 +43,6 @@ inline std::vector<int> get_expand_shape(
*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); *shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>(); shape_data = cpu_shape_tensor.data<int>();
} }
#endif
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(shape_tensor->place())) {
paddle::framework::TensorCopySync(
*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>();
}
#endif #endif
auto vec_shape = auto vec_shape =
std::vector<int>(shape_data, shape_data + shape_tensor->numel()); std::vector<int>(shape_data, shape_data + shape_tensor->numel());
...@@ -74,13 +67,6 @@ inline std::vector<int> get_expand_shape( ...@@ -74,13 +67,6 @@ inline std::vector<int> get_expand_shape(
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
vec_epxand_shape.push_back(*temp.data<int32_t>()); vec_epxand_shape.push_back(*temp.data<int32_t>());
} }
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(tensor->place())) { // NOLINT
phi::DenseTensor temp;
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
vec_epxand_shape.push_back(*temp.data<int32_t>());
}
#endif #endif
else { // NOLINT else { // NOLINT
vec_epxand_shape.push_back(*tensor->data<int32_t>()); vec_epxand_shape.push_back(*tensor->data<int32_t>());
......
...@@ -6,11 +6,7 @@ if(WITH_XPU) ...@@ -6,11 +6,7 @@ if(WITH_XPU)
endif() endif()
# please add new math_library in alphabetical order # please add new math_library in alphabetical order
if(WITH_MLU) math_library(concat_and_split DEPS concat_and_split_functor)
math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop)
else()
math_library(concat_and_split DEPS concat_and_split_functor)
endif()
math_library(context_project DEPS im2col math_function) math_library(context_project DEPS im2col math_function)
math_library(cos_sim_functor) math_library(cos_sim_functor)
math_library(depthwise_conv) math_library(depthwise_conv)
......
...@@ -17,9 +17,6 @@ limitations under the License. */ ...@@ -17,9 +17,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h" #include "paddle/phi/common/float16.h"
...@@ -181,100 +178,6 @@ class SplitFunctor<platform::XPUDeviceContext, T> { ...@@ -181,100 +178,6 @@ class SplitFunctor<platform::XPUDeviceContext, T> {
}; };
#endif #endif
#ifdef PADDLE_WITH_MLU
template <typename T>
class ConcatFunctor<platform::MLUDeviceContext, T> {
public:
void operator()(const platform::MLUDeviceContext& context,
const std::vector<phi::DenseTensor>& input,
int axis,
phi::DenseTensor* output) {
int dev_id = context.GetPlace().GetDeviceId();
platform::MLUDeviceGuard guard(dev_id);
auto ins_size = input.size();
const int axis_t = axis;
const int ins_size_t = ins_size;
// build a CNNL tensor descriptor and a raw data pointer for every input
std::vector<const void*> inputs;
std::vector<MLUCnnlTensorDesc> input_descs;
std::vector<cnnlTensorDescriptor_t> desc_vector;
for (size_t i = 0; i < ins_size; i++) {
input_descs.emplace_back(MLUCnnlTensorDesc(
input[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(input[i].dtype())));
desc_vector.push_back(input_descs.back().get());
inputs.push_back(input[i].data());
}
// init out tensors
MLUCnnlTensorDesc output_desc(
*output, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output->dtype()));
// launch the CNNL concat
MLUCnnl::Concat(context,
ins_size_t,
axis_t,
desc_vector.data(),
inputs.data(),
output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class SplitFunctor<platform::MLUDeviceContext, T> {
public:
void operator()(const platform::MLUDeviceContext& context,
const phi::DenseTensor& input,
const std::vector<const phi::DenseTensor*>& ref_inputs,
const int axis,
std::vector<phi::DenseTensor*>* outputs) {
if (input.numel() == 0) {
return;
}
int dev_id = context.GetPlace().GetDeviceId();
platform::MLUDeviceGuard guard(dev_id);
auto in_dims = input.dims();
auto out_size = outputs->size();
std::vector<framework::DDim> outs_dims(out_size, in_dims);
for (size_t i = 0; i < out_size; ++i) {
outs_dims[i][axis] = ref_inputs[i]->dims()[axis];
}
// init out tensors
std::vector<void*> vct_tensor;
std::vector<MLUCnnlTensorDesc> output_descs;
std::vector<cnnlTensorDescriptor_t> desc_vector;
for (size_t i = 0; i < out_size; i++) {
(*outputs)[i]->Resize(outs_dims[i]);
output_descs.emplace_back(
MLUCnnlTensorDesc(*(*outputs)[i],
CNNL_LAYOUT_ARRAY,
ToCnnlDataType((*outputs)[i]->dtype())));
desc_vector.push_back(output_descs.back().get());
vct_tensor.push_back(GetBasePtr((*outputs)[i]));
}
// init in tensors
MLUCnnlTensorDesc input_desc(
input, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input.dtype()));
// launch the CNNL split
MLUCnnl::Split(context,
out_size,
axis,
input_desc.get(),
input.data(),
desc_vector.data(),
vct_tensor.data());
}
};
#endif
#define DEFINE_FUNCTOR(type) \ #define DEFINE_FUNCTOR(type) \
template class ConcatFunctor<phi::CPUContext, type>; \ template class ConcatFunctor<phi::CPUContext, type>; \
template class SplitFunctor<phi::CPUContext, type>; template class SplitFunctor<phi::CPUContext, type>;
...@@ -289,20 +192,6 @@ FOR_ALL_TYPES(DEFINE_FUNCTOR); ...@@ -289,20 +192,6 @@ FOR_ALL_TYPES(DEFINE_FUNCTOR);
DEFINE_XPU_FUNCTOR(float) DEFINE_XPU_FUNCTOR(float)
DEFINE_XPU_FUNCTOR(platform::float16) DEFINE_XPU_FUNCTOR(platform::float16)
#endif #endif
#ifdef PADDLE_WITH_MLU
#define DEFINE_MLU_FUNCTOR(type) \
template class ConcatFunctor<platform::MLUDeviceContext, type>; \
template class SplitFunctor<platform::MLUDeviceContext, type>;
DEFINE_MLU_FUNCTOR(float)
DEFINE_MLU_FUNCTOR(platform::float16)
DEFINE_MLU_FUNCTOR(int64_t)
DEFINE_MLU_FUNCTOR(bool)
DEFINE_MLU_FUNCTOR(int)
DEFINE_MLU_FUNCTOR(int8_t)
DEFINE_MLU_FUNCTOR(int16_t)
DEFINE_MLU_FUNCTOR(uint8_t)
#endif
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class AccuracyMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* indices = ctx.Input<phi::DenseTensor>("Indices");
auto* label = ctx.Input<phi::DenseTensor>("Label");
auto* accuracy = ctx.Output<phi::DenseTensor>("Accuracy");
auto* correct = ctx.Output<phi::DenseTensor>("Correct");
auto* total = ctx.Output<phi::DenseTensor>("Total");
int num_samples = indices->dims()[0];
if (num_samples == 0) {
return;
}
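// Indices holds the top-k predictions of each sample; a sample is correct if
// any of its top-k entries equals its label:
//   correct  = sum_i( max_k( indices[i][k] == label[i] ) )
//   accuracy = correct / num_samples
// implemented below with a logic op, a max-reduce over k and a sum-reduce
// over the samples.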
// cast `indices` or `label` if their type is not INT32
phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32));
phi::DenseTensor label_int32(framework::TransToPhiDataType(VT::INT32));
auto indices_type = framework::TransToProtoVarType(indices->type());
if (indices_type != VT::INT32) {
PADDLE_ENFORCE_EQ(MLUSupportsCast(indices_type, VT::INT32),
true,
platform::errors::Unimplemented(
"In accuracy mlu kernel, cast indices from [%s] to "
"[%s] is not supported.",
framework::DataTypeToString(indices_type),
framework::DataTypeToString(VT::INT32)));
indices_int32.Resize(indices->dims());
indices_int32.mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc org_indices_desc(*indices);
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
cnnlCastDataType_t cast_type = GetCastDataType(indices_type, VT::INT32);
MLUCnnl::Cast(ctx,
cast_type,
org_indices_desc.get(),
GetBasePtr(indices),
indices_int32_desc.get(),
GetBasePtr(&indices_int32));
} else {
indices_int32.ShareDataWith(*indices);
}
auto label_type = framework::TransToProtoVarType(label->type());
if (label_type != VT::INT32) {
PADDLE_ENFORCE_EQ(
MLUSupportsCast(label_type, VT::INT32),
true,
platform::errors::Unimplemented(
"In accuracy mlu kernel, cast label from [%s] to [%s] "
"is not supported.",
framework::DataTypeToString(label_type),
framework::DataTypeToString(VT::INT32)));
label_int32.Resize(label->dims());
label_int32.mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc org_label_desc(*label);
MLUCnnlTensorDesc label_int32_desc(label_int32);
cnnlCastDataType_t cast_type = GetCastDataType(label_type, VT::INT32);
MLUCnnl::Cast(ctx,
cast_type,
org_label_desc.get(),
GetBasePtr(label),
label_int32_desc.get(),
GetBasePtr(&label_int32));
} else {
label_int32.ShareDataWith(*label);
}
// equal
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
MLUCnnlTensorDesc label_int32_desc(label_int32);
phi::DenseTensor equal_tensor(framework::TransToPhiDataType(VT::BOOL));
equal_tensor.Resize(indices->dims());
equal_tensor.mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc equal_tensor_desc(equal_tensor);
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_EQ,
indices_int32_desc.get(),
GetBasePtr(&indices_int32),
label_int32_desc.get(),
GetBasePtr(&label_int32),
equal_tensor_desc.get(),
GetBasePtr(&equal_tensor));
// cast equal
phi::DenseTensor equal_fp32(framework::TransToPhiDataType(VT::FP32));
equal_fp32.Resize(indices->dims());
equal_fp32.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc equal_fp32_desc(equal_fp32);
cnnlCastDataType_t equal_cast_type = GetCastDataType(VT::BOOL, VT::FP32);
MLUCnnl::Cast(ctx,
equal_cast_type,
equal_tensor_desc.get(),
GetBasePtr(&equal_tensor),
equal_fp32_desc.get(),
GetBasePtr(&equal_fp32));
// [correct]
// reduce_max
phi::DenseTensor correct_max(framework::TransToPhiDataType(VT::FP32));
correct_max.Resize(phi::make_ddim({num_samples}));
correct_max.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc correct_max_desc(correct_max);
MLUCnnlReduceDesc reduce_max_desc({1},
CNNL_REDUCE_MAX,
ToCnnlDataType<float>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduce_max_desc.get(),
nullptr,
equal_fp32_desc.get(),
GetBasePtr(&equal_fp32),
0 /*indices_size*/,
nullptr,
nullptr,
correct_max_desc.get(),
GetBasePtr(&correct_max));
// reduce_sum
phi::DenseTensor correct_sum(framework::TransToPhiDataType(VT::FP32));
correct_sum.Resize(correct->dims());
correct_sum.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc correct_sum_desc(correct_sum);
MLUCnnlReduceDesc reduce_sum_desc({0},
CNNL_REDUCE_ADD,
ToCnnlDataType<float>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduce_sum_desc.get(),
nullptr,
correct_max_desc.get(),
GetBasePtr(&correct_max),
0 /*indices_size*/,
nullptr,
nullptr,
correct_sum_desc.get(),
GetBasePtr(&correct_sum));
// cast to int
correct->mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc correct_desc(*correct);
cnnlCastDataType_t correct_cast_type = GetCastDataType(VT::FP32, VT::INT32);
MLUCnnl::Cast(ctx,
correct_cast_type,
correct_sum_desc.get(),
GetBasePtr(&correct_sum),
correct_desc.get(),
GetBasePtr(correct));
// [total]
total->mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc total_desc(*total);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&num_samples,
total_desc.get(),
GetBasePtr(total));
// use `total` of type `float32` for calculating accuracy
phi::DenseTensor total_fp32(framework::TransToPhiDataType(VT::FP32));
total_fp32.Resize(total->dims());
total_fp32.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc total_fp32_desc(total_fp32);
float num_samples_fp32 = static_cast<float>(num_samples);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&num_samples_fp32,
total_fp32_desc.get(),
GetBasePtr(&total_fp32));
// [accuracy]
accuracy->mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc accuracy_desc(*accuracy);
MLUCnnl::Div(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
correct_sum_desc.get(),
GetBasePtr(&correct_sum),
total_fp32_desc.get(),
GetBasePtr(&total_fp32),
accuracy_desc.get(),
GetBasePtr(accuracy));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(accuracy,
ops::AccuracyMLUKernel<float>,
ops::AccuracyMLUKernel<paddle::platform::float16>,
ops::AccuracyMLUKernel<int16_t>,
ops::AccuracyMLUKernel<int64_t>,
ops::AccuracyMLUKernel<uint8_t>,
ops::AccuracyMLUKernel<int>);
if(WITH_MLU)
cc_library(
mlu_baseop
SRCS mlu_baseop.cc
DEPS neuware_lib device_context)
cc_test(
activation_op_mlu_test
SRCS activation_op_mlu_test.cc
DEPS op_registry activation_op scope device_context executor)
endif()
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <sys/time.h>  // gettimeofday()/timeval used in the timing loops below
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace fw = paddle::framework;
namespace plat = paddle::platform;
USE_OP_ITSELF(relu);
USE_OP_DEVICE_KERNEL(relu, MLU);
// relu
template <typename T>
inline T relu(T x) {
return x > 0 ? x : static_cast<T>(0);
}
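// Reference gradient used by CompareGrad below: dX = dOut where Out > 0 and 0
// otherwise; the x argument is unused by the formula.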
template <typename T>
inline T relu_grad_dx(T x, T out, T dout) {
return out > 0 ? dout : 0;
}
template <typename T>
void Compare(fw::Scope* scope,
const plat::DeviceContext& ctx,
std::string op_type) {
// init
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<phi::DenseTensor>();
const int num = 10;
std::vector<T> init_x;
for (int64_t i = 0; i < num * num; ++i) {
init_x.push_back(static_cast<T>(i - 50));
}
paddle::framework::TensorFromVector(init_x, ctx, tensor_x);
tensor_x->Resize({num, num});
auto place = ctx.GetPlace();
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<phi::DenseTensor>();
fw::AttributeMap attrs;
auto op = fw::OpRegistry::CreateOp(
op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);
ctx.Wait();
// eval time
struct timeval start, end;
gettimeofday(&start, NULL);
for (int i = 0; i < 100; i++) {
op->Run(*scope, place);
}
ctx.Wait();
gettimeofday(&end, NULL);
int micros =
(((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec);
printf("used time: %d\n", micros / 100);
// eval value
std::vector<T> out_vec;
paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_FLOAT_EQ(out_vec[i], relu<T>(init_x[i]));
}
}
template <typename T>
void CompareGrad(fw::Scope* scope,
const plat::DeviceContext& ctx,
std::string op_type) {
auto dout = scope->Var("DOut");
auto tensor_dout = dout->GetMutable<phi::DenseTensor>();
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<phi::DenseTensor>();
const int num = 10;
std::vector<T> init_dout;
for (int64_t i = 0; i < num * num; ++i) {
init_dout.push_back(static_cast<T>(1.0));
}
std::vector<T> init_out;
for (int64_t i = 0; i < num * num; ++i) {
init_out.push_back(static_cast<T>(i - 50));
}
paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout);
tensor_dout->Resize({num, num});
paddle::framework::TensorFromVector(init_out, ctx, tensor_out);
tensor_out->Resize({num, num});
auto dx = scope->Var("DX");
auto tensor_dx = dx->GetMutable<phi::DenseTensor>();
// run
auto place = ctx.GetPlace();
fw::AttributeMap attrs;
auto op = fw::OpRegistry::CreateOp(op_type,
{{"Out@GRAD", {"DOut"}}, {"Out", {"Out"}}},
{{"X@GRAD", {"DX"}}},
attrs);
op->Run(*scope, place);
ctx.Wait();
// eval time
struct timeval start, end;
gettimeofday(&start, NULL);
for (int i = 0; i < 100; i++) {
op->Run(*scope, place);
}
ctx.Wait();
gettimeofday(&end, NULL);
int micros =
(((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec);
printf("used time: %d\n", micros / 100);
// eval value
std::vector<T> dx_vec;
paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec);
ctx.Wait();
for (uint32_t i = 0; i < dx_vec.size(); i++) {
EXPECT_FLOAT_EQ(dx_vec[i],
relu_grad_dx<T>(dx_vec[i], init_out[i], init_dout[i]));
}
}
TEST(relu, MLU_fp32) {
fw::Scope scope;
auto* ctx = plat::DeviceContextPool::Instance().Get(plat::MLUPlace(0));
Compare<float>(&scope, *ctx, "relu");
}
TEST(relu_grad, MLU_fp32) {
fw::Scope scope;
auto* ctx = plat::DeviceContextPool::Instance().Get(plat::MLUPlace(0));
CompareGrad<float>(&scope, *ctx, "relu_grad");
}
(Source diff not shown: the file is too large to display; view the blob instead.)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cn_api.h>
#include <cnnl.h>
#include <concurrentqueue.h>
#include <mlu_op.h>
#include <string>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/platform/device/mlu/enforce.h"
namespace paddle {
namespace operators {
using DataLayout = phi::DataLayout;
using ExecutionContext = framework::ExecutionContext;
using DeviceContextPool = platform::DeviceContextPool;
using MLUDeviceContext = platform::MLUDeviceContext;
const std::map<std::string, cnnlReduceOp_t> MLUReduceOpMap = {
{"reduce_all", CNNL_REDUCE_AND},
{"reduce_any", CNNL_REDUCE_OR},
{"reduce_max", CNNL_REDUCE_MAX},
{"reduce_mean", CNNL_REDUCE_AVG},
{"reduce_min", CNNL_REDUCE_MIN},
{"reduce_sum", CNNL_REDUCE_ADD},
{"reduce_prod", CNNL_REDUCE_MUL},
};
const std::map<std::string, cnnlInterpMode_t> MLUInterpModeMap = {
{"bilinear", CNNL_INTERP_BILINEAR},
{"nearest", CNNL_INTERP_NEAREST},
{"linear", CNNL_INTERP_LINEAR},
{"trilinear", CNNL_INTERP_TRILINEAR},
{"bicubic", CNNL_INTERP_BICUBIC}};
const std::map<std::string, cnnlInterpBackwardMode_t> MLUInterpBackwardModeMap =
{{"bilinear", CNNL_INTERP_BACKWARD_BILINEAR},
{"nearest", CNNL_INTERP_BACKWARD_NEAREST},
{"linear", CNNL_INTERP_BACKWARD_LINEAR},
{"trilinear", CNNL_INTERP_BACKWARD_TRILINEAR},
{"bicubic", CNNL_INTERP_BACKWARD_BICUBIC}};
inline cnnlReduceOp_t GetMLUCnnlReduceOp(const std::string reduce_name) {
auto iter = MLUReduceOpMap.find(reduce_name);
if (iter != MLUReduceOpMap.end()) {
return iter->second;
}
PADDLE_THROW(platform::errors::InvalidArgument(
"Not support reduce op type of MLU Device: %s", reduce_name));
}
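// Example: GetMLUCnnlReduceOp("reduce_sum") returns CNNL_REDUCE_ADD; an
// unrecognized name throws InvalidArgument. The interp-mode helpers below
// follow the same lookup-or-throw pattern.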
inline cnnlInterpMode_t GetMLUCnnlInterpMode(const std::string interp_mode) {
auto iter = MLUInterpModeMap.find(interp_mode);
if (iter != MLUInterpModeMap.end()) {
return iter->second;
}
PADDLE_THROW(platform::errors::InvalidArgument(
"Not support interp mode of MLU Device: %s", interp_mode));
}
inline cnnlInterpBackwardMode_t GetMLUCnnlInterpBackwardMode(
const std::string interp_mode) {
auto iter = MLUInterpBackwardModeMap.find(interp_mode);
if (iter != MLUInterpBackwardModeMap.end()) {
return iter->second;
}
PADDLE_THROW(platform::errors::InvalidArgument(
"Not support interp mode of MLU Device: %s", interp_mode));
}
inline const void* GetBasePtr(const phi::DenseTensor* t) { return t->data(); }
inline void* GetBasePtr(phi::DenseTensor* t) { return t->data(); }
inline cnnlDataType_t ToCnnlDataType(const phi::DataType& dtype) {
cnnlDataType_t type = CNNL_DTYPE_FLOAT;
switch (dtype) {
case DataType::FLOAT16:
type = CNNL_DTYPE_HALF;
break;
case DataType::FLOAT32:
type = CNNL_DTYPE_FLOAT;
break;
case DataType::FLOAT64:
type = CNNL_DTYPE_DOUBLE;
break;
case DataType::INT8:
type = CNNL_DTYPE_INT8;
break;
case DataType::INT16:
type = CNNL_DTYPE_INT16;
break;
case DataType::INT32:
type = CNNL_DTYPE_INT32;
break;
case DataType::INT64:
type = CNNL_DTYPE_INT64;
break;
case DataType::BOOL:
type = CNNL_DTYPE_BOOL;
break;
case DataType::UINT8:
type = CNNL_DTYPE_UINT8;
break;
default:
break;
}
return type;
}
inline cnnlDataType_t ToCnnlDataType(
const paddle::framework::proto::VarType::Type& type) {
return ToCnnlDataType(framework::TransToPhiDataType(type));
}
template <typename T>
inline cnnlDataType_t ToCnnlDataType() {
auto type = framework::ToDataType(std::type_index(typeid(T)));
return ToCnnlDataType(type);
}
inline mluOpDataType_t ToMluOpDataType(const phi::DataType& dtype) {
mluOpDataType_t type = MLUOP_DTYPE_FLOAT;
switch (dtype) {
case DataType::FLOAT16:
type = MLUOP_DTYPE_HALF;
break;
case DataType::FLOAT32:
type = MLUOP_DTYPE_FLOAT;
break;
case DataType::FLOAT64:
type = MLUOP_DTYPE_DOUBLE;
break;
case DataType::INT8:
type = MLUOP_DTYPE_INT8;
break;
case DataType::INT16:
type = MLUOP_DTYPE_INT16;
break;
case DataType::INT32:
type = MLUOP_DTYPE_INT32;
break;
case DataType::INT64:
type = MLUOP_DTYPE_INT64;
break;
case DataType::BOOL:
type = MLUOP_DTYPE_BOOL;
break;
case DataType::UINT8:
type = MLUOP_DTYPE_UINT8;
break;
default:
break;
}
return type;
}
inline mluOpDataType_t ToMluOpDataType(
const paddle::framework::proto::VarType::Type& type) {
return ToMluOpDataType(framework::TransToPhiDataType(type));
}
template <typename T>
inline mluOpDataType_t ToMluOpDataType() {
auto type = framework::ToDataType(std::type_index(typeid(T)));
return ToMluOpDataType(type);
}
// Converts (via narrowing) a type T value to a type U, and checks that the
// value has no value change due to the conversion.
template <typename WideT, typename NarrowT>
NarrowT CheckedNarrowing(const WideT& wide) {
NarrowT narrow = wide;
CHECK_EQ(narrow, wide)
<< "checked narrowing failed; values not equal post-conversion";
return narrow;
}
inline static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) {
return ctx.template device_context<MLUDeviceContext>().cnnl_handle();
}
inline static mluOpHandle_t GetMLUOpHandleFromCTX(const ExecutionContext& ctx) {
return ctx.template device_context<MLUDeviceContext>().mluOp_handle();
}
inline static const MLUDeviceContext& GetDevCtxFromCTX(
const ExecutionContext& ctx) {
return ctx.template device_context<MLUDeviceContext>();
}
using VT = framework::proto::VarType;
const std::map<std::pair<VT::Type, VT::Type>, cnnlCastDataType_t>
MLU_SUPPORTED_CAST_TYPE = {
{{VT::FP32, /*cast to*/ VT::FP16}, CNNL_CAST_FLOAT_TO_HALF},
{{VT::FP32, /*cast to*/ VT::INT32}, CNNL_CAST_FLOAT_TO_INT32},
{{VT::FP32, /*cast to*/ VT::INT16}, CNNL_CAST_FLOAT_TO_INT16},
{{VT::FP32, /*cast to*/ VT::INT8}, CNNL_CAST_FLOAT_TO_INT8},
{{VT::FP32, /*cast to*/ VT::UINT8}, CNNL_CAST_FLOAT_TO_UINT8},
{{VT::FP32, /*cast to*/ VT::BOOL}, CNNL_CAST_FLOAT_TO_BOOL},
{{VT::FP16, /*cast to*/ VT::FP32}, CNNL_CAST_HALF_TO_FLOAT},
{{VT::FP16, /*cast to*/ VT::INT32}, CNNL_CAST_HALF_TO_INT32},
{{VT::FP16, /*cast to*/ VT::INT16}, CNNL_CAST_HALF_TO_INT16},
{{VT::FP16, /*cast to*/ VT::INT8}, CNNL_CAST_HALF_TO_INT8},
{{VT::FP16, /*cast to*/ VT::UINT8}, CNNL_CAST_HALF_TO_UINT8},
{{VT::FP16, /*cast to*/ VT::BOOL}, CNNL_CAST_HALF_TO_BOOL},
{{VT::INT32, /*cast to*/ VT::FP32}, CNNL_CAST_INT32_TO_FLOAT},
{{VT::INT32, /*cast to*/ VT::FP16}, CNNL_CAST_INT32_TO_HALF},
{{VT::INT32, /*cast to*/ VT::INT8}, CNNL_CAST_INT32_TO_INT8},
{{VT::INT32, /*cast to*/ VT::INT16}, CNNL_CAST_INT32_TO_INT16},
{{VT::INT16, /*cast to*/ VT::FP32}, CNNL_CAST_INT16_TO_FLOAT},
{{VT::INT16, /*cast to*/ VT::FP16}, CNNL_CAST_INT16_TO_HALF},
{{VT::INT16, /*cast to*/ VT::INT32}, CNNL_CAST_INT16_TO_INT32},
{{VT::INT8, /*cast to*/ VT::FP32}, CNNL_CAST_INT8_TO_FLOAT},
{{VT::INT8, /*cast to*/ VT::FP16}, CNNL_CAST_INT8_TO_HALF},
{{VT::INT8, /*cast to*/ VT::INT32}, CNNL_CAST_INT8_TO_INT32},
{{VT::UINT8, /*cast to*/ VT::FP32}, CNNL_CAST_UINT8_TO_FLOAT},
{{VT::UINT8, /*cast to*/ VT::FP16}, CNNL_CAST_UINT8_TO_HALF},
{{VT::BOOL, /*cast to*/ VT::FP32}, CNNL_CAST_BOOL_TO_FLOAT},
{{VT::BOOL, /*cast to*/ VT::FP16}, CNNL_CAST_BOOL_TO_HALF},
{{VT::BOOL, /*cast to*/ VT::INT32}, CNNL_CAST_BOOL_TO_INT32},
{{VT::UINT8, /*cast to*/ VT::INT32}, CNNL_CAST_UINT8_TO_INT32},
{{VT::INT32, /*cast to*/ VT::INT64}, CNNL_CAST_INT32_TO_INT64},
{{VT::INT64, /*cast to*/ VT::INT32}, CNNL_CAST_INT64_TO_INT32},
{{VT::INT32, /*cast to*/ VT::BOOL}, CNNL_CAST_INT32_TO_BOOL},
{{VT::UINT8, /*cast to*/ VT::INT64}, CNNL_CAST_UINT8_TO_INT64},
{{VT::INT8, /*cast to*/ VT::INT16}, CNNL_CAST_INT8_TO_INT16},
{{VT::FP32, /*cast to*/ VT::FP64}, CNNL_CAST_FLOAT_TO_DOUBLE},
{{VT::FP64, /*cast to*/ VT::FP32}, CNNL_CAST_DOUBLE_TO_FLOAT},
{{VT::INT64, /*cast to*/ VT::FP32}, CNNL_CAST_INT64_TO_FLOAT},
{{VT::INT64, /*cast to*/ VT::FP16}, CNNL_CAST_INT64_TO_HALF},
{{VT::FP32, /*cast to*/ VT::INT64}, CNNL_CAST_FLOAT_TO_INT64},
{{VT::FP16, /*cast to*/ VT::INT64}, CNNL_CAST_HALF_TO_INT64},
};
cnnlCastDataType_t GetCastDataType(const VT::Type& src_type,
const VT::Type& dst_type);
cnnlCastDataType_t GetCastDataType(const DataType& src_type,
const DataType& dst_type);
bool MLUSupportsCast(const VT::Type& src_type, const VT::Type& dst_type);
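// Typical cast flow (as in the accuracy kernel above): check
// MLUSupportsCast(src, dst) first, obtain the descriptor via
// GetCastDataType(src, dst), then launch MLUCnnl::Cast with the source and
// destination tensor descriptors. The supported pairs are enumerated in
// MLU_SUPPORTED_CAST_TYPE above.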
cnnlDeviceType_t GetCnnlDev(int dev_ordinal);
using CnnlTensorDesc = cnnlTensorDescriptor_t;
class MLUCnnlTensorDesc {
public:
MLUCnnlTensorDesc() {}
// SE_DISALLOW_COPY_AND_ASSIGN
MLUCnnlTensorDesc(const MLUCnnlTensorDesc& desc) = delete;
MLUCnnlTensorDesc& operator=(const MLUCnnlTensorDesc&) = delete;
MLUCnnlTensorDesc(MLUCnnlTensorDesc&& rhs)
: raw_tensor_desc(rhs.raw_tensor_desc) {
rhs.raw_tensor_desc = nullptr;
}
MLUCnnlTensorDesc& operator=(MLUCnnlTensorDesc&& rhs);
MLUCnnlTensorDesc(const int tensor_dim,
const int dim_sizes[],
const cnnlDataType_t tensor_dtype);
MLUCnnlTensorDesc(const int tensor_dim,
const int dim_sizes[],
const cnnlDataType_t tensor_dtype,
const cnnlTensorLayout_t layout);
MLUCnnlTensorDesc(const int tensor_dim,
const int dim_sizes[],
const cnnlDataType_t tensor_dtype,
int position);
MLUCnnlTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const cnnlDataType_t tensor_dtype);
MLUCnnlTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const cnnlDataType_t tensor_dtype,
const cnnlTensorLayout_t layout);
MLUCnnlTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const cnnlDataType_t tensor_dtype,
int position);
MLUCnnlTensorDesc(const phi::DenseTensor& tensor,
const cnnlTensorLayout_t layout,
const cnnlDataType_t tensor_dtype);
explicit MLUCnnlTensorDesc(const phi::DenseTensor& tensor);
MLUCnnlTensorDesc(const phi::DenseTensor& tensor,
cnnlTensorLayout_t layout,
const cnnlDataType_t tensor_dtype,
int position);
MLUCnnlTensorDesc(const phi::DenseTensor& tensor,
cnnlTensorLayout_t layout,
const cnnlDataType_t tensor_dtype,
int position,
float scale);
~MLUCnnlTensorDesc();
const cnnlTensorDescriptor_t get() const { return raw_tensor_desc; }
private:
cnnlTensorDescriptor_t raw_tensor_desc = nullptr;
};
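// Usage sketch (names are illustrative): descriptors are built on the stack
// around a phi::DenseTensor and passed to MLUCnnl calls via get(), e.g.
//   MLUCnnlTensorDesc in_desc(*x);
//   MLUCnnlTensorDesc out_desc(*out);
//   MLUCnnl::Abs(ctx, in_desc.get(), GetBasePtr(x), out_desc.get(),
//                GetBasePtr(out));
// The class is move-only: copying is deleted and a move nulls out the source
// so the raw cnnlTensorDescriptor_t has a single owner.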
class MLUOpTensorDesc {
public:
MLUOpTensorDesc() {}
// SE_DISALLOW_COPY_AND_ASSIGN
MLUOpTensorDesc(const MLUOpTensorDesc& desc) = delete;
MLUOpTensorDesc& operator=(const MLUOpTensorDesc&) = delete;
MLUOpTensorDesc(MLUOpTensorDesc&& rhs)
: raw_tensor_desc(rhs.raw_tensor_desc) {
rhs.raw_tensor_desc = nullptr;
}
MLUOpTensorDesc& operator=(MLUOpTensorDesc&& rhs);
MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype);
MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype,
const mluOpTensorLayout_t layout);
MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype,
int position);
MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype);
MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype,
const mluOpTensorLayout_t layout);
MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype,
int position);
MLUOpTensorDesc(const phi::DenseTensor& tensor,
const mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype);
explicit MLUOpTensorDesc(const phi::DenseTensor& tensor);
MLUOpTensorDesc(const phi::DenseTensor& tensor,
mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype,
int position);
MLUOpTensorDesc(const phi::DenseTensor& tensor,
mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype,
int position,
float scale);
~MLUOpTensorDesc();
const mluOpTensorDescriptor_t get() const { return raw_tensor_desc; }
private:
mluOpTensorDescriptor_t raw_tensor_desc = nullptr;
};
class MLUCnnlActivationDesc {
public:
MLUCnnlActivationDesc(const MLUCnnlActivationDesc& desc) = delete;
MLUCnnlActivationDesc& operator=(const MLUCnnlActivationDesc& desc) = delete;
MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode, const float coef);
MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode,
const float coef,
const float sliced_dim,
const float selu_alpha,
const float selu_lambda);
const cnnlActivationDescriptor_t get() const;
~MLUCnnlActivationDesc();
private:
cnnlActivationDescriptor_t active_desc_ = nullptr;
};
class MLUCnnlPoolingDesc {
public:
MLUCnnlPoolingDesc(const MLUCnnlPoolingDesc& desc) = delete;
MLUCnnlPoolingDesc& operator=(const MLUCnnlPoolingDesc& desc) = delete;
MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode,
const cnnlNanPropagation_t maxpooling_nan_opt,
int window_rows,
int window_cols,
int64_t pad_up,
int64_t pad_down,
int64_t pad_left,
int64_t pad_right,
int row_stride,
int col_stride,
int row_dilation,
int col_dilation,
bool ceil_mode);
MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode,
const cnnlNanPropagation_t maxpooling_nan_opt,
const int tensor_rank,
const std::vector<int>& window,
const std::vector<int>& padding,
const std::vector<int>& stride);
const cnnlPoolingDescriptor_t get() const;
~MLUCnnlPoolingDesc();
private:
cnnlPoolingDescriptor_t pooling_desc_ = nullptr;
};
class MLUCnnlRandomGeneratorDesc {
public:
MLUCnnlRandomGeneratorDesc(const ExecutionContext& ctx, const int seed);
const cnnlRandGenerator_t get() const;
phi::DenseTensor& get_state();
~MLUCnnlRandomGeneratorDesc();
private:
phi::DenseTensor mlu_state;
cnnlRandGenerator_t mlu_generator = nullptr;
};
const std::shared_ptr<MLUCnnlRandomGeneratorDesc>& GetMLURandomGenerator(
const ExecutionContext& ctx, const int64_t device_id, const int seed);
class MLUCnnlReduceDesc {
public:
MLUCnnlReduceDesc(const MLUCnnlReduceDesc& desc) = delete;
MLUCnnlReduceDesc& operator=(const MLUCnnlReduceDesc& desc) = delete;
MLUCnnlReduceDesc(const std::vector<int>& axis_vec,
const cnnlReduceOp_t reduce_op,
const cnnlDataType_t data_type,
const cnnlNanPropagation_t nan_propagation,
const cnnlReduceIndices_t reduce_indices,
const cnnlIndicesType_t indices_type);
const cnnlReduceDescriptor_t get() const;
~MLUCnnlReduceDesc();
private:
cnnlReduceDescriptor_t reduction_desc_ = nullptr;
};
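// Usage sketch mirroring the accuracy kernel above: describe the axes and
// reduce op, then call MLUCnnl::Reduce with need_workspace=true, e.g.
//   MLUCnnlReduceDesc sum_desc({0}, CNNL_REDUCE_ADD, ToCnnlDataType<float>(),
//                              CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES,
//                              CNNL_32BIT_INDICES);
//   MLUCnnl::Reduce(ctx, true /*need_workspace*/, sum_desc.get(), nullptr,
//                   in_desc.get(), GetBasePtr(&in), 0 /*indices_size*/,
//                   nullptr, nullptr, out_desc.get(), GetBasePtr(&out));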
class MLUCnnlOpTensorDesc {
public:
MLUCnnlOpTensorDesc(const MLUCnnlOpTensorDesc& desc) = delete;
void operator=(const MLUCnnlOpTensorDesc&) = delete;
MLUCnnlOpTensorDesc(cnnlOpTensorDesc_t op_tensor_op,
cnnlDataType_t op_tensor_comp_type,
cnnlNanPropagation_t op_tensor_nan_opt);
const cnnlOpTensorDescriptor_t get() const;
~MLUCnnlOpTensorDesc();
private:
cnnlOpTensorDescriptor_t op_tensor_desc_ = nullptr;
};
class MLUCnnlNMSDesc {
public:
MLUCnnlNMSDesc(const MLUCnnlNMSDesc& desc) = delete;
MLUCnnlNMSDesc& operator=(const MLUCnnlNMSDesc& desc) = delete;
MLUCnnlNMSDesc(const cnnlNmsOutputMode_t mode,
const float iou_threshold,
const int max_output_size,
const float confidence_threshold,
const int input_layout);
const cnnlNmsDescriptor_t get() const;
~MLUCnnlNMSDesc();
private:
cnnlNmsDescriptor_t nms_desc_ = nullptr;
};
class MLUCnnlConvolutionDesc {
public:
MLUCnnlConvolutionDesc(const int dims,
const int pad[],
const int stride[],
const int dilation[],
const int group_count,
const cnnlDataType_t tensor_dtype);
MLUCnnlConvolutionDesc(const int dims,
const int64_t pad[],
const int64_t stride[],
const int64_t dilation[],
const int group_count,
const cnnlDataType_t tensor_dtype);
MLUCnnlConvolutionDesc(const MLUCnnlConvolutionDesc& desc) = delete;
MLUCnnlConvolutionDesc& operator=(const MLUCnnlConvolutionDesc& desc) =
delete;
const cnnlConvolutionDescriptor_t get() const;
~MLUCnnlConvolutionDesc();
private:
cnnlConvolutionDescriptor_t conv_desc_ = nullptr;
};
class MLUCnnlBatchSpaceDesc {
public:
MLUCnnlBatchSpaceDesc(uint32_t block_shape[],
uint32_t paddings[],
const uint32_t block_shape_size,
const uint32_t paddings_size);
void getBatch2spaceNdextraInputSize(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc);
void getSpace2batchNdextraInputSize(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc);
void initSpace2batchNdExtraInput(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
void* extra_host_input);
void initBatch2spaceNdExtraInput(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
void* extra_host_input);
const cnnlSpaceBatchNdDescriptor_t get() const;
size_t getExtraInputSize() const;
~MLUCnnlBatchSpaceDesc();
private:
cnnlSpaceBatchNdDescriptor_t op_desc_ = nullptr;
size_t extra_input_size_;
};
class MLUCnnlTrigonDesc {
public:
explicit MLUCnnlTrigonDesc(
const cnnlTrigonFunctionMode_t trigon_function_mode);
const cnnlTrigonDescriptor_t get() const;
~MLUCnnlTrigonDesc();
private:
cnnlTrigonDescriptor_t trigon_desc_ = nullptr;
};
class MLUCnnlDCNDesc {
public:
MLUCnnlDCNDesc(int dimNb,
const int* pad,
const int* stride,
const int* dilation,
int deformable_group,
int conv_group,
int im2col_step);
const cnnlDCNDescriptor_t get() const;
~MLUCnnlDCNDesc();
private:
cnnlDCNDescriptor_t dcn_desc_ = nullptr;
};
class MLUCnnlGridSampleDesc {
public:
MLUCnnlGridSampleDesc(const std::string& interp_mode_str,
const std::string& padding_mode_str,
bool align_corners);
const cnnlGridSampleDescriptor_t get() const;
~MLUCnnlGridSampleDesc();
private:
cnnlGridSampleDescriptor_t grid_sample_desc_ = nullptr;
};
class MLUSeqDataDesc {
public:
MLUSeqDataDesc(const MLUSeqDataDesc& desc) = delete;
MLUSeqDataDesc& operator=(const MLUSeqDataDesc& desc) = delete;
MLUSeqDataDesc(cnnlSeqDataLayout_t layout,
cnnlDataType_t dtype,
int dimNb,
const int dimSize[],
int seqLengthArraySize,
const int seqLengthArray[],
void* paddingFill);
const cnnlSeqDataDescriptor_t get() const;
~MLUSeqDataDesc();
private:
cnnlSeqDataDescriptor_t seq_data_desc_ = nullptr;
};
class MLURNNDesc {
public:
MLURNNDesc(const MLURNNDesc& desc) = delete;
MLURNNDesc& operator=(const MLURNNDesc& desc) = delete;
MLURNNDesc(const int hidden_size,
const int num_layers,
const cnnlRNNInputMode_t input_mode,
const cnnlDirectionMode_t direction,
const cnnlRNNMode_t rnn_mode);
MLURNNDesc(cnnlRNNMode_t cell_mode,
cnnlRNNBiasMode_t bias_mode,
cnnlDirectionMode_t direction,
cnnlRNNInputMode_t input_mode,
cnnlDataType_t data_type,
cnnlDataType_t math_prec,
int input_size,
int hidden_size,
int proj_size,
int layer_num,
void* dropout_desc,
cnnlRNNPaddingMode_t padding_mode);
void SetRNNProjectionLayers(const int rec_proj_size,
const int out_proj_size) {
PADDLE_ENFORCE_MLU_SUCCESS(
cnnlSetRNNProjectionLayers(rnn_desc_, rec_proj_size, out_proj_size));
}
void SetPeepholeMode(const cnnlRNNPeepholeMode_t peephole_mode) {
PADDLE_ENFORCE_MLU_SUCCESS(
cnnlSetRNNPeepholeMode(rnn_desc_, peephole_mode));
}
void SetRNNBiasMode(const cnnlRNNBiasMode_t bias_mode) {
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetRNNBiasMode(rnn_desc_, bias_mode));
}
void SetRNNMaskMode(const cnnlRNNMaskMode_t mask_mode) {
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetRNNMaskMode(rnn_desc_, mask_mode));
}
void SetRNNClip(const cnnlRNNClipMode_t clip_mode,
const cnnlNanPropagation_t clip_nan_opt,
const double left_clip,
const double right_clip) {
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetRNNClip(
rnn_desc_, clip_mode, clip_nan_opt, left_clip, right_clip));
}
void SetRNNPaddingMode(const cnnlRNNPaddingMode_t padding_mode) {
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetRNNPaddingMode(rnn_desc_, padding_mode));
}
const cnnlRNNDescriptor_t get() const;
~MLURNNDesc();
private:
cnnlRNNDescriptor_t rnn_desc_ = nullptr;
};
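// MLUCnnl groups static, stateless wrappers around individual CNNL kernels.
// Each wrapper takes the ExecutionContext (or MLUDeviceContext) to reach the
// cnnl handle, the tensor descriptors of its operands, and raw device
// pointers obtained via GetBasePtr().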
class MLUCnnl {
public:
static void Active(const ExecutionContext& ctx,
cnnlActivationDescriptor_t active_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void ActiveGrad(const ExecutionContext& ctx,
cnnlActivationDescriptor_t active_desc,
const void* alpha,
const void* beta,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
static void Concat(const ExecutionContext& ctx,
const int pack_num,
const int axis,
const cnnlTensorDescriptor_t inputs_desc[],
const void* const inputs[],
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Concat(const MLUDeviceContext& dev_ctx,
const int pack_num,
const int axis,
const cnnlTensorDescriptor_t inputs_desc[],
const void* const inputs[],
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Cast(const ExecutionContext& ctx,
cnnlCastDataType_t cast_type,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Clip(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const void* min,
const void* max,
void* y);
static void HardtanhBackward(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y,
const float max_val,
const float min_val,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
static void Div(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t in0_desc,
const void* in0,
const cnnlTensorDescriptor_t in1_desc,
const void* in1,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Fill(const ExecutionContext& ctx,
const cnnlPointerMode_t pointer_mode,
const void* value_ptr,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void LRN(const ExecutionContext& ctx,
const int local_size,
const double alpha,
const double beta,
const double k,
const cnnlTensorDescriptor_t input_quant_desc,
const void* input_quant,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void QuantifyOffline(const ExecutionContext& context,
cnnlQuantizeMode_t mode,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void QuantifyOnline(const ExecutionContext& context,
const int bitwidth,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const bool compute_scale,
void* position,
void* scale,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SGD(const ExecutionContext& context,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const void* lr,
const cnnlTensorDescriptor_t var_desc,
void* var);
static void ApplyAdaGrad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const cnnlTensorDescriptor_t accum_desc,
void* accum,
const cnnlTensorDescriptor_t var_desc,
void* var,
const void* lr,
const bool update_slots);
static void ApplyRMSProp(const ExecutionContext& context,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const void* lr,
const void* rho,
const void* momentum,
const void* epsilon,
const cnnlTensorDescriptor_t var_desc,
void* var,
const cnnlTensorDescriptor_t ms_desc,
void* ms,
const cnnlTensorDescriptor_t mom_desc,
void* mom);
static void ApplyCenterRMSProp(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const void* lr,
const void* rho,
const void* momentum,
const void* epsilon,
const cnnlTensorDescriptor_t var_desc,
void* var,
const cnnlTensorDescriptor_t mg_desc,
void* mg,
const cnnlTensorDescriptor_t ms_desc,
void* ms,
const cnnlTensorDescriptor_t mom_desc,
void* mom);
static void ApplyAdam(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t var_desc,
void* var,
const cnnlTensorDescriptor_t m_desc,
void* m,
const cnnlTensorDescriptor_t v_desc,
void* v,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const void* lr,
const void* beta1,
const void* beta2,
const void* beta1_power,
const void* beta2_power,
const void* epsilon,
const bool use_nesterov);
static void ApplyAdaMax(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const cnnlTensorDescriptor_t var_desc,
void* var,
const cnnlTensorDescriptor_t m_desc,
void* m,
const cnnlTensorDescriptor_t v_desc,
void* v,
const void* diff,
const void* lr,
const void* beta1,
const void* beta2,
const void* beta1_power,
const void* epsilon);
static void ApplyMomentum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const bool use_nesterov,
const void* lr,
const void* momentum,
void* var,
void* accum);
static void ApplyKerasMomentum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const bool use_nesterov,
const void* lr,
const void* momentum,
void* var,
void* accum);
static void ApplyAdadelta(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const void* diff,
const void* lr,
const void* rho,
const void* epsilon,
void* var,
void* accum,
void* accum_update);
static void SparseSoftmaxXentWithLogits(
const ExecutionContext& ctx,
cnnlSoftmaxMode_t mode,
const cnnlTensorDescriptor_t x_desc,
const void* input,
const cnnlTensorDescriptor_t label_desc,
const void* label,
const cnnlTensorDescriptor_t y_desc,
void* output,
const cnnlTensorDescriptor_t diff_y_desc,
void* back_out);
static void RandomUniform(const ExecutionContext& ctx,
const int num,
const cnnlDataType_t data_type,
const cnnlRandGenerator_t mlu_generator,
void* mlu_state,
void* output);
static void FusedDropout(const ExecutionContext& ctx,
const cnnlRandGenerator_t generator,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const float p,
void* state,
const cnnlTensorDescriptor_t mask_desc,
const void* mask,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Cumsum(const ExecutionContext& ctx,
const int axis,
const bool exclusive,
const bool reverse,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BroadcastTo(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void GatherFunctor(const ExecutionContext& ctx,
const int axis,
const int batch_dims,
const cnnlTensorDescriptor_t params_desc,
const void* params,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void ScatterRefFunctor(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t params_desc,
const void* params,
const cnnlTensorDescriptor_t updates_desc,
const void* updates,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const cnnlScatterRefMode_t mode);
static void ScatterFunctor(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t params_desc,
void* params,
const cnnlTensorDescriptor_t updates_desc,
const void* updates,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const int dim,
const cnnlScatterMode_t mode = CNNL_SCATTER);
static void Range(const ExecutionContext& ctx,
const void* start,
const void* end,
const void* step,
const cnnlDataType_t output_dtype,
void* output);
static void Round(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void TopK(const ExecutionContext& ctx,
const int k,
const int dim,
const bool largest,
const bool sorted,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t values_output_desc,
void* values_out,
const cnnlTensorDescriptor_t indices_output_desc,
void* indices_out);
static void StridedSlice(const ExecutionContext& ctx,
const int begin[],
const int end[],
const int strides[],
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Split(const ExecutionContext& ctx,
int split_num,
int axis,
const cnnlTensorDescriptor_t input_desc,
const void* input_ptr,
const cnnlTensorDescriptor_t output_descs[],
void* output_ptrs[]);
static void Split(const MLUDeviceContext& dev_ctx,
int split_num,
int axis,
const cnnlTensorDescriptor_t input_desc,
const void* input_ptr,
const cnnlTensorDescriptor_t output_descs[],
void* output_ptrs[]);
static void Scale(const ExecutionContext& ctx,
const int axis,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t alpha_desc,
const void* alpha,
const cnnlTensorDescriptor_t beta_desc,
const void* beta,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void AddN(const ExecutionContext& ctx,
uint32_t input_num,
const cnnlTensorDescriptor_t inputs_desc[],
const void* inputs[],
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Log(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
cnnlLogBase_t log_base,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void StridedSliceGrad(const ExecutionContext& ctx,
const int begin[],
const int end[],
const int strides[],
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Logic(const ExecutionContext& ctx,
const cnnlLogicOp_t log_method,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Select(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t condition_desc,
const void* condition_ptr,
const cnnlTensorDescriptor_t then_desc,
const void* then_ptr,
const cnnlTensorDescriptor_t else_desc,
const void* else_ptr,
const cnnlTensorDescriptor_t output_desc,
void* output_ptr);
static void AssignAdd(const ExecutionContext& ctx,
const void* alpha,
const void* beta,
const cnnlTensorDescriptor_t update_desc,
const void* update,
const cnnlTensorDescriptor_t param_desc,
void* param);
static void AssignSub(const ExecutionContext& ctx,
const void* alpha,
const void* beta,
const cnnlTensorDescriptor_t update_desc,
const void* update,
const cnnlTensorDescriptor_t param_desc,
void* param);
static void Assign(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t update_desc,
const void* update,
const cnnlTensorDescriptor_t param_desc,
void* param);
static void GatherNd(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t params_desc,
const void* params,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BatchToSpace(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output,
const cnnlSpaceBatchParam_t param);
static void BatchToSpaceNd(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
cnnlSpaceBatchNdDescriptor_t param,
void* extra_device_input,
size_t extra_input_size,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void PoolingForward(const ExecutionContext& ctx,
cnnlPoolingMode_t pool_mode,
int64_t output_h,
int64_t output_w,
cnnlPoolingDescriptor_t pooling_desc,
const void* alpha,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const void* beta,
const void* extra_input_ptr,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void AdaptivePoolingForward(const ExecutionContext& ctx,
cnnlPoolingMode_t pool_mode,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output,
const cnnlTensorDescriptor_t index_desc,
void* index);
static void Pool3D(const ExecutionContext& ctx,
cnnlPoolingMode_t pool_mode,
const std::vector<int64_t>& output_shape,
cnnlPoolingDescriptor_t pooling_desc,
const void* alpha,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const void* beta,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Pad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const void* paddings,
const void* padding_value,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Matmul(const ExecutionContext& ctx,
const bool transpose_a,
const bool transpose_b,
const cnnlTensorDescriptor_t in0_desc,
const void* in0,
const cnnlTensorDescriptor_t in1_desc,
const void* in1,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BatchMatmul(const ExecutionContext& ctx,
const bool transpose_a,
const bool transpose_b,
const cnnlTensorDescriptor_t in0_desc,
const void* in0,
const cnnlTensorDescriptor_t in1_desc,
const void* in1,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void MulAx(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t alpha_desc,
const void* alpha,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void OpTensor(const ExecutionContext& ctx,
const cnnlOpTensorDescriptor_t op_tensor_desc,
const cnnlTensorDescriptor_t a_desc,
const void* a,
const cnnlTensorDescriptor_t b_desc,
const void* b,
const cnnlTensorDescriptor_t output_desc,
void* output,
const cnnlDataType_t dtype,
const float alpha1_float = 1.f,
const float alpha2_float = 1.f,
const float beta_float = 0.f);
static void BiasAddGrad(const ExecutionContext& ctx,
const int axis,
const cnnlTensorDescriptor_t out_backprop_desc,
const void* out_backprop,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void OneHot(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t desc_indices,
const void* indices,
const int depth,
const void* on_value,
const void* off_value,
const int axis,
cnnlDataType_t output_data_type,
void* output);
static void NonMaxSuppression(const ExecutionContext& ctx,
const cnnlNmsDescriptor_t nms_desc,
const cnnlTensorDescriptor_t boxes_desc,
const void* boxes,
const cnnlTensorDescriptor_t confidence_desc,
const void* confidence,
const cnnlTensorDescriptor_t output_desc,
void* output,
void* output_size);
static void SoftmaxCrossEntropyWithLogits(
const ExecutionContext& ctx,
cnnlSoftmaxMode_t mode,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* logits_in,
const cnnlTensorDescriptor_t label_desc,
const void* labels_in,
const cnnlTensorDescriptor_t loss_out_desc,
void* loss_out,
const cnnlTensorDescriptor_t back_out_desc,
void* back_out);
static void SoftmaxForward(const ExecutionContext& ctx,
cnnlSoftmaxAlgorithm_t algorithm,
cnnlSoftmaxMode_t mode,
const void* alpha,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const void* beta,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SoftmaxBackward(const ExecutionContext& ctx,
cnnlSoftmaxAlgorithm_t algorithm,
cnnlSoftmaxMode_t mode,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
static void Softplus(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t features_desc,
const void* features,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SoftplusGrad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t gradients_desc,
const void* gradients,
const cnnlTensorDescriptor_t features_desc,
const void* features,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void RsqrtGrad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t data_desc,
const void* y,
const void* diff_y,
void* output);
static void SqrtGrad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t data_desc,
const void* y,
const void* diff_y,
void* output);
static void ConvolutionForward(const ExecutionContext& ctx,
cnnlConvolutionDescriptor_t conv_desc_,
const void* alpha,
const void* beta,
const cnnlTensorDescriptor_t bias_desc,
const void* bias_ptr,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t filter_desc,
const void* filter,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void FusedConvBNQuantify(const ExecutionContext& ctx,
cnnlConvolutionDescriptor_t conv_desc,
const void* epsilon_ptr,
const int fused_ops_number,
const cnnlDataType_t tensor_dtype,
const int input_position,
const float input_scale,
const int filter_position,
const float filter_scale,
const cnnlTensorDescriptor_t scale_desc,
const void* scale_ptr,
const cnnlTensorDescriptor_t offset_desc,
const void* offset_ptr,
const cnnlTensorDescriptor_t mean_desc,
const void* mean_ptr,
const cnnlTensorDescriptor_t variance_desc,
const void* variance_ptr,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t filter_desc,
const void* filter,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Tile(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void UnsortedSegmentSum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t data_desc,
const void* data,
const cnnlTensorDescriptor_t ids_desc,
const int* segment_ids,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Reduce(const ExecutionContext& ctx,
const bool need_workspace,
const cnnlReduceDescriptor_t reduction_desc,
const void* alpha,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const size_t indices_size,
void* indices,
const void* beta,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void FloorDiv(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void FloorMod(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Maximum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Minimum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Pow(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void PowR(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void DivNoNan(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SquaredDifference(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void L2Loss(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
void* output);
static void Abs(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Neg(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Floor(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Ceil(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void IsNan(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Square(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Sqrt(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Rsqrt(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Cos(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Sin(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void TrigonForward(const ExecutionContext& ctx,
const cnnlTrigonDescriptor_t trigon_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Exp(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Sign(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void IndexSelect(const ExecutionContext& ctx,
const int dim,
cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t index_desc,
const void* index,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void IsFinite(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void IsNanInf(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
void* output);
static void Erf(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Log1p(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void LogicalNot(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void DynamicStitch(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t* indices_desc,
const int** indices,
const cnnlTensorDescriptor_t* data_desc,
const void** data,
const int size,
int* indices_dims,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void CropAndResize(const ExecutionContext& ctx,
const std::string method_name,
const float extrapolation_value,
const cnnlTensorDescriptor_t image_desc,
const void* image,
const cnnlTensorDescriptor_t boxes_desc,
const void* boxes,
const cnnlTensorDescriptor_t box_index_desc,
const void* box_index,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void CropAndResizeBackwardImage(
const ExecutionContext& ctx,
const std::string method_name,
const cnnlTensorDescriptor_t image_desc,
const void* image,
const cnnlTensorDescriptor_t boxes_desc,
const void* boxes,
const cnnlTensorDescriptor_t box_idx_desc,
const void* box_idx,
const cnnlTensorDescriptor_t grads_image_desc,
void* grads_image);
static void CropAndResizeBackwardBoxes(
const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t image_desc,
const void* image,
const cnnlTensorDescriptor_t boxes_desc,
const void* boxes,
const cnnlTensorDescriptor_t box_idx_desc,
const void* box_idx,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void PoolingBackward(const ExecutionContext& ctx,
const cnnlPoolingDescriptor_t pooling_desc,
const void* alpha,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const void* beta,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
static void AdaptivePoolingBackward(const ExecutionContext& ctx,
const cnnlPoolingMode_t pool_mode,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t index_desc,
const void* index,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
static void PoolingIndex(const ExecutionContext& ctx,
const cnnlPoolingDescriptor_t pooling_desc,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
void* y);
static void SpaceToBatch(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output,
const int64_t block_shape[]);
static void SpaceToBatchNd(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
cnnlSpaceBatchNdDescriptor_t param,
void* extra_device_input,
size_t extra_input_size,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Interp(const ExecutionContext& ctx,
const cnnlInterpMode_t mode,
const bool align_corners,
const bool half_pixel_centers,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void InterpBackward(const ExecutionContext& ctx,
const cnnlInterpBackwardMode_t mode,
const bool align_corners,
const bool half_pixel_centers,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void QuantizeParam(const ExecutionContext& ctx,
const cnnlQuantizeMode_t mode,
const int bitwidth,
const cnnlTensorDescriptor_t input_desc,
const void* input,
void* position,
void* scale,
void* offset);
static void QuantizeMatMul(const ExecutionContext& ctx,
const bool transpose_a,
const bool transpose_b,
const cnnlTensorDescriptor_t a_desc,
const void* a,
const void* a_position,
const void* a_scale,
const void* a_offset,
const cnnlTensorDescriptor_t b_desc,
const void* b,
const void* b_position,
const void* b_scale,
const void* b_offset,
const cnnlDataType_t quant_type,
const cnnlDataType_t data_type,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void QuantizeBatchMatMul(const ExecutionContext& ctx,
const bool adj_x,
const bool adj_y,
const cnnlTensorDescriptor_t a_desc,
const void* a,
const void* a_position,
const void* a_scale,
const void* a_offset,
const cnnlTensorDescriptor_t b_desc,
const void* b,
const void* b_position,
const void* b_scale,
const void* b_offset,
const cnnlDataType_t quant_type,
const cnnlDataType_t data_type,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void QuantizeBatchMatMulBCast(const ExecutionContext& ctx,
const bool adj_x,
const bool adj_y,
const cnnlTensorDescriptor_t a_desc,
const void* a,
const void* a_position,
const void* a_scale,
const void* a_offset,
const cnnlTensorDescriptor_t b_desc,
const void* b,
const void* b_position,
const void* b_scale,
const void* b_offset,
const cnnlDataType_t quant_type,
const cnnlDataType_t data_type,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void FusedBatchNorm(const ExecutionContext& ctx,
const bool is_training,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t scale_desc,
const void* scale,
const void* offset,
const void* estimated_mean,
const void* estimated_variance,
float epsilon,
float momentum,
const cnnlTensorDescriptor_t output_desc,
void* output,
void* batch_mean,
void* batch_var,
void* saved_mean,
void* saved_var);
static void FusedBatchNormGrad(const ExecutionContext& ctx,
const bool is_training,
const cnnlTensorDescriptor_t y_backprop_desc,
const void* y_backprop,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t scale_desc,
const void* scale,
const void* saved_mean,
const void* saved_var,
float epsilon,
const cnnlTensorDescriptor_t x_backprop_desc,
void* x_backprop,
void* scale_backprop,
void* offset_backprop);
static void LayerNormForward(const ExecutionContext& ctx,
int axis,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t weight_bias_desc,
const void* weight,
const void* bias,
float eps,
const cnnlTensorDescriptor_t y_desc,
void* y,
const cnnlTensorDescriptor_t mean_rstd_desc,
void* saved_mean,
void* saved_rstd);
static void LayerNormBackward(const ExecutionContext& ctx,
int axis,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t diff_z_desc,
const void* diff_z,
const cnnlTensorDescriptor_t weight_bias_desc,
const void* weight,
const cnnlTensorDescriptor_t mean_rstd_desc,
const void* saved_mean,
const void* saved_rstd,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x,
void* diff_weight,
void* diff_bias);
static void Transpose(const ExecutionContext& ctx,
const std::vector<int> perm,
const int input_dim,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void TrilTriu(const ExecutionContext& ctx,
const int diagonal_k,
const bool tri_up_mode,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void MatrixBandPart(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t data_desc,
const void* input,
const int num_lower,
const int num_upper,
void* output);
static void NumTrue(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t num_true_desc,
void* num_true);
static void Where(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t num_true_desc,
const void* num_true,
const bool as_tuple,
const cnnlTensorDescriptor_t y_desc,
void* y);
static void Conv2D(const ExecutionContext& ctx,
const cnnlConvolutionDescriptor_t conv_desc,
const cnnlDataType_t tensor_dtype,
const cnnlDataType_t dt_onchip,
const void* input_position,
const void* input_scale,
const void* input_offset,
const void* filter_position,
const void* filter_scale,
const void* filter_offset,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t filter_desc,
const void* filter,
const cnnlTensorDescriptor_t bias_desc,
const void* bias,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void ConvBackpropInput(const ExecutionContext& ctx,
const cnnlConvolutionDescriptor_t conv_desc,
const cnnlTensorDescriptor_t filter_desc,
const void* filter,
const cnnlTensorDescriptor_t out_backprop_desc,
const void* out_backprop,
const cnnlTensorDescriptor_t in_backprop_desc,
void* in_backprop);
static void QuantizeConvBackpropInput(
const ExecutionContext& ctx,
const cnnlConvolutionDescriptor_t conv_desc,
const cnnlDataType_t tensor_dtype,
const cnnlDataType_t dt_onchip,
const void* filter_position,
const void* filter_scale,
const void* filter_offset,
const void* out_backprop_position,
const void* out_backprop_scale,
const void* out_backprop_offset,
const cnnlTensorDescriptor_t input_desc,
const void* filter,
const cnnlTensorDescriptor_t out_backprop_desc,
const void* out_backprop,
const cnnlTensorDescriptor_t in_backprop_desc,
void* in_backprop);
static void ConvBackpropFilter(
const ExecutionContext& ctx,
const cnnlConvolutionDescriptor_t conv_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t out_backprop_desc,
const void* out_backprop,
const cnnlTensorDescriptor_t filter_backprop_desc,
void* filter_backprop);
static void QuantizeConvBackpropFilter(
const ExecutionContext& ctx,
const cnnlConvolutionDescriptor_t conv_desc,
const cnnlDataType_t tensor_dtype,
const cnnlDataType_t dt_onchip,
const void* input_position,
const void* input_scale,
const void* input_offset,
const void* out_backprop_position,
const void* out_backprop_scale,
const void* out_backprop_offset,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t out_backprop_desc,
const void* out_backprop,
const cnnlTensorDescriptor_t filter_backprop_desc,
void* filter_backprop);
static void DCNForward(const ExecutionContext& ctx,
const cnnlDCNDescriptor_t dcn_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t offset_desc,
const void* offset,
const cnnlTensorDescriptor_t mask_desc,
const void* mask,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t bias_desc,
const void* bias,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void DCNBackwardData(const ExecutionContext& ctx,
const cnnlDCNDescriptor_t dcn_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t offset_desc,
const void* offset,
const cnnlTensorDescriptor_t mask_desc,
const void* mask,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t grad_output_desc,
const void* grad_output,
const cnnlTensorDescriptor_t grad_input_desc,
void* grad_input,
const cnnlTensorDescriptor_t grad_offset_desc,
void* grad_offset,
const cnnlTensorDescriptor_t grad_mask_desc,
void* grad_mask);
static void DCNBackwardWeight(const ExecutionContext& ctx,
const cnnlDCNDescriptor_t dcn_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t offset_desc,
const void* offset,
const cnnlTensorDescriptor_t mask_desc,
const void* mask,
const cnnlTensorDescriptor_t grad_output_desc,
const void* grad_output,
const cnnlTensorDescriptor_t grad_weight_desc,
void* grad_weight,
const cnnlTensorDescriptor_t grad_bias_desc,
void* grad_bias);
static void InTopK(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t predictions_desc,
const void* predictions,
const cnnlTensorDescriptor_t targets_desc,
const void* targets,
const cnnlTensorDescriptor_t k_desc,
const void* k,
const int k_int,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void ScatterNd(const ExecutionContext& ctx,
cnnlScatterNdMode_t mode,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const cnnlTensorDescriptor_t updates_desc,
const void* updates,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BitWise(const ExecutionContext& ctx,
const cnnlBitComputeOp_t optype,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void QR(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t a_desc,
const void* a,
const cnnlTensorDescriptor_t q_desc,
void* q,
const cnnlTensorDescriptor_t r_desc,
void* r,
const bool some);
static void Reciprocal(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BceLoss(const ExecutionContext& ctx,
const cnnlBceLossReduction_t reduction,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BceLossBackward(const ExecutionContext& ctx,
const cnnlBceLossReduction_t reduction,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SmoothL1LossForward(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t t_desc,
const void* target,
const float beta,
const cnnlSmoothL1LossAlgorithm_t algorithm,
const cnnlTensorDescriptor_t y_desc,
void* y);
static void SmoothL1LossBackward(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t dy_desc,
const void* dy,
const float beta,
const cnnlSmoothL1LossAlgorithm_t algorithm,
const cnnlTensorDescriptor_t dx_desc,
void* dx);
static void EmbeddingForward(const ExecutionContext& ctx,
const int padding_idx,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t indices_desc,
const int* indices,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void RNNForward(const ExecutionContext& ctx,
const cnnlRNNDescriptor_t rnn_desc,
const int dev_seq_lengths[],
const void* weight_param_ptr,
size_t weightspace_size,
const cnnlSeqDataDescriptor_t x_desc,
const void* x,
const cnnlSeqDataDescriptor_t y_desc,
void* y,
const cnnlTensorDescriptor_t h_desc,
const void* hx,
void* hy,
const cnnlTensorDescriptor_t c_desc,
const void* cx,
void* cy,
void* reservespace_ptr);
static void RNNBackward(const ExecutionContext& ctx,
const cnnlRNNDescriptor_t rnn_desc,
cnnlWgradMode_t add_grad,
const int dev_seq_lengths[],
const void* weight_param_ptr,
void* dweight_param_ptr,
size_t weightspace_size,
const cnnlSeqDataDescriptor_t x_desc,
const void* x,
void* dx,
const cnnlSeqDataDescriptor_t y_desc,
const void* y,
const void* dy,
const cnnlTensorDescriptor_t hx_desc,
const void* hx,
const void* dhy,
void* dhx,
const cnnlTensorDescriptor_t cx_desc,
const void* cx,
const void* dcy,
void* dcx,
void* reservespace_ptr,
size_t reservespace_size);
static void Mask(const ExecutionContext& ctx,
cnnlMaskedOp_t masked_mode,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t masked_desc,
const void* masked,
const cnnlTensorDescriptor_t value_desc,
const void* value,
const cnnlTensorDescriptor_t output_desc,
void* output,
uint32_t* number);
static void Transform(const ExecutionContext& ctx,
const void* alpha,
const void* beta,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void EmbeddingBackward(const ExecutionContext& ctx,
int padding_idx,
bool scale_grad_by_freq,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const cnnlTensorDescriptor_t diff_desc,
const void* diff,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BceWithLogits(const ExecutionContext& ctx,
cnnlBceWithLogitsReduction_t reduction,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t pos_weight_desc,
const void* pos_weight,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BceWithLogitsBackward(
const ExecutionContext& ctx,
cnnlBceWithLogitsReduction_t reduction,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t pos_weight_desc,
const void* pos_weight,
const cnnlTensorDescriptor_t diff_input_desc,
void* diff_input);
static void RoiAlign(const ExecutionContext& ctx,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
const float spatial_scale,
const bool aligned,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t boxes_desc,
const void* boxes,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void RoiAlignBackward(const ExecutionContext& ctx,
const int sampling_ratio,
const float spatial_scale,
const bool aligned,
const cnnlTensorDescriptor_t grads_desc,
const void* grads,
const cnnlTensorDescriptor_t boxes_desc,
const void* boxes,
const cnnlTensorDescriptor_t grads_image_desc,
void* grads_image);
static void GridSample(const ExecutionContext& ctx,
const cnnlGridSampleDescriptor_t grid_sample_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t grid_desc,
const void* grid,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SyncBatchNormStats(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const float eps,
const cnnlTensorDescriptor_t mean_desc,
void* mean,
const cnnlTensorDescriptor_t invstd_desc,
void* invstd);
static void SyncBatchNormGatherStatsWithCounts(
const ExecutionContext& ctx,
float momentum,
float eps,
const cnnlTensorDescriptor_t mean_all_desc,
const void* mean_all,
const cnnlTensorDescriptor_t invstd_all_desc,
const void* invstd_all,
const cnnlTensorDescriptor_t moving_mean_desc,
void* moving_mean,
const cnnlTensorDescriptor_t moving_var_desc,
void* moving_var,
const cnnlTensorDescriptor_t count_all_desc,
const void* count_all,
const cnnlTensorDescriptor_t mean_desc,
void* mean,
const cnnlTensorDescriptor_t invstd_desc,
void* invstd);
static void SyncBatchNormElemt(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t mean_desc,
const void* mean,
const cnnlTensorDescriptor_t invstd_desc,
const void* invstd,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t bias_desc,
const void* bias,
const cnnlTensorDescriptor_t y_desc,
void* y);
static void SyncBatchnormBackwardReduce(
const ExecutionContext& ctx,
const cnnlTensorDescriptor_t desc_dz,
const void* dz,
const cnnlTensorDescriptor_t desc_x,
const void* x,
const cnnlTensorDescriptor_t desc_mean,
const void* mean,
const cnnlTensorDescriptor_t desc_invstd,
const void* invstd,
const cnnlTensorDescriptor_t desc_dweight,
void* dweight,
const cnnlTensorDescriptor_t desc_dbias,
void* dbias,
const cnnlTensorDescriptor_t desc_sum_dy,
void* sum_dy,
const cnnlTensorDescriptor_t desc_sum_dy_xmu,
void* sum_dy_xmu,
const bool needs_input_grad0,
const bool needs_input_grad1,
const bool needs_input_grad2);
static void SyncBatchNormBackwardElemt(
const ExecutionContext& ctx,
const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t mean_desc,
const void* mean,
const cnnlTensorDescriptor_t invstd_desc,
const void* invstd,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t sum_dy_desc,
const void* sum_dy,
const cnnlTensorDescriptor_t sum_dy_xmu_desc,
const void* sum_dy_xmu,
const cnnlTensorDescriptor_t count_desc,
const void* count,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
};
class MLUOP {
public:
static void OpYoloBox(const ExecutionContext& ctx,
const mluOpTensorDescriptor_t x_desc,
const void* x,
const mluOpTensorDescriptor_t img_size_desc,
const void* img_size,
const mluOpTensorDescriptor_t anchors_desc,
const void* anchors,
const int class_num,
const float conf_thresh,
const int downsample_ratio,
const bool clip_bbox,
const float scale,
const bool iou_aware,
const float iou_aware_factor,
const mluOpTensorDescriptor_t boxes_desc,
void* boxes,
const mluOpTensorDescriptor_t scores_desc,
void* scores);
static void OpPriorBox(const ExecutionContext& ctx,
const mluOpTensorDescriptor_t min_sizes_desc,
const void* min_sizes,
const mluOpTensorDescriptor_t aspect_ratios_desc,
const void* aspect_ratios,
const mluOpTensorDescriptor_t variances_desc,
const void* variances,
const mluOpTensorDescriptor_t max_sizes_desc,
const void* max_sizes,
const int height,
const int width,
const int im_height,
const int im_width,
const float step_h,
const float step_w,
const float offset,
const bool clip,
const bool min_max_aspect_ratios_order,
const mluOpTensorDescriptor_t output_desc,
void* output,
const mluOpTensorDescriptor_t var_desc,
void* var);
};
const std::map<const std::string, std::pair<std::vector<int>, std::vector<int>>>
TransPermMap = {
// trans_mode, (forward_perm, backward_perm)
{"3D_NCHW2NHWC", {{0, 2, 1}, {0, 2, 1}}},
{"4D_NCHW2NHWC", {{0, 2, 3, 1}, {0, 3, 1, 2}}},
{"5D_NCHWD2NDHWC", {{0, 4, 2, 3, 1}, {0, 4, 2, 3, 1}}},
{"5D_NHWDC2NDHWC", {{0, 3, 1, 2, 4}, {0, 2, 3, 4, 1}}}};
inline void SetMLUTransposePerm(const framework::DDim& dims,
const DataLayout& data_layout,
std::vector<int>* forward_perm,
std::vector<int>* backward_perm,
std::vector<int>* out_shape) {
const int dim_size = dims.size();
PADDLE_ENFORCE_EQ((dim_size >= 3) && (dim_size <= 5),
true,
platform::errors::InvalidArgument(
"MLUTransposePerm func only support (dim_size >= 3) && "
"(dim_size <= 5), but now dim_size is %d.",
dim_size));
PADDLE_ENFORCE_EQ(
(data_layout == DataLayout::kNCHW) || (data_layout == DataLayout::kNHWC),
true,
platform::errors::InvalidArgument(
"MLUTransposePerm func only support DataLayout: kNCHW or kNHWC, but "
"now data_layout is %s.",
data_layout));
// case 1: Paddle's NCHW layout must be permuted to MLU's NHWC when dims == 3 or 4
// case 2: Paddle's NHWDC and NCHWD layouts must be permuted to MLU's NDHWC when dims == 5
std::string map_key = "";
if (data_layout == DataLayout::kNCHW) {
switch (dim_size) {
case 3:
map_key = "3D_NCHW2NHWC";
break;
case 4:
map_key = "4D_NCHW2NHWC";
break;
case 5:
map_key = "5D_NCHWD2NDHWC";
break;
}
} else if (data_layout == DataLayout::kNHWC && dim_size == 5) {
map_key = "5D_NHWDC2NDHWC";
}
assert(map_key != "");
forward_perm->assign(TransPermMap.at(map_key).first.begin(),
TransPermMap.at(map_key).first.end());
backward_perm->assign(TransPermMap.at(map_key).second.begin(),
TransPermMap.at(map_key).second.end());
auto in_dims = phi::vectorize(dims);
for (size_t i = 0; i < in_dims.size(); i++) {
out_shape->push_back(in_dims[forward_perm->at(i)]);
}
}
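// Example (hypothetical shapes, for illustration only): for a 4-D NCHW input
// with dims = [2, 3, 4, 5], map_key is "4D_NCHW2NHWC", so forward_perm is
// {0, 2, 3, 1}, backward_perm is {0, 3, 1, 2}, and out_shape becomes
// [2, 4, 5, 3] (NHWC); applying backward_perm to that shape restores NCHW.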
template <typename T>
inline void TransposeFromMLUTensor(const ExecutionContext& ctx,
const std::vector<int> perm,
const phi::DenseTensor* transformed_input,
phi::DenseTensor* transformed_output,
bool need_reshape_or_alloc) {
const int dim_size = perm.size();
if (need_reshape_or_alloc) {
std::vector<int> output_shape;
auto input_dims = transformed_input->dims();
for (int i = 0; i < dim_size; ++i) {
output_shape.push_back(input_dims[perm[i]]);
}
transformed_output->mutable_data<T>(
framework::DDim(output_shape.data(), dim_size), ctx.GetPlace());
}
MLUCnnlTensorDesc trans_in_desc(
*transformed_input, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc trans_out_desc(
*transformed_output, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::Transpose(ctx,
perm,
dim_size,
trans_in_desc.get(),
GetBasePtr(transformed_input),
trans_out_desc.get(),
GetBasePtr(transformed_output));
}
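// Typical call site (a sketch only; the tensor names are illustrative):
//   std::vector<int> perm = {0, 2, 3, 1};  // NCHW -> NHWC
//   TransposeFromMLUTensor<T>(ctx, perm, &input_tensor, &trans_input,
//                             true /*need_reshape_or_alloc*/);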
template <typename T>
inline void FillMLUTensorWithHostValue(const ExecutionContext& ctx,
T value,
phi::DenseTensor* out) {
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Fill(
ctx, CNNL_POINTER_MODE_HOST, &value, out_desc.get(), GetBasePtr(out));
}
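// Example usage (as in the reduce_max_grad kernel below, where t_zero is a
// temporary tensor to be zero-filled):
//   FillMLUTensorWithHostValue<T>(ctx, static_cast<T>(0), &t_zero);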
} // namespace operators
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class AdamMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<phi::DenseTensor>(),
true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be phi::DenseTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
auto* param = ctx.Input<phi::DenseTensor>("Param");
auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE_EQ(grad_var->IsType<phi::DenseTensor>(),
true,
platform::errors::InvalidArgument(
"The Grad(%s)'s type should be phi::DenseTensor, "
"but the received is %s",
ctx.InputNames("Grad").front(),
framework::ToTypeName(grad_var->Type())));
auto* grad = ctx.Input<phi::DenseTensor>("Grad");
auto* mom1 = ctx.Input<phi::DenseTensor>("Moment1");
auto* mom2 = ctx.Input<phi::DenseTensor>("Moment2");
auto* lr = ctx.Input<phi::DenseTensor>("LearningRate");
auto* beta1_pow = ctx.Input<phi::DenseTensor>("Beta1Pow");
auto* beta2_pow = ctx.Input<phi::DenseTensor>("Beta2Pow");
auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
auto* mom1_out = ctx.Output<phi::DenseTensor>("Moment1Out");
auto* mom2_out = ctx.Output<phi::DenseTensor>("Moment2Out");
auto* beta1_pow_out = ctx.Output<phi::DenseTensor>("Beta1PowOut");
auto* beta2_pow_out = ctx.Output<phi::DenseTensor>("Beta2PowOut");
bool skip_update = false;
if (ctx.HasInput("SkipUpdate")) {
auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
PADDLE_ENFORCE_EQ(skip_update_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(SkipUpdate) size must be 1, but get %d",
skip_update_tensor->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(
*skip_update_tensor, ctx.device_context(), &skip_update_vec);
ctx.device_context().Wait();
skip_update = skip_update_vec[0];
}
// skip_update=true, just copy input to output, and TensorCopy will call
// mutable_data
if (skip_update) {
VLOG(4) << "Adam skip update";
framework::TensorCopy(
*param,
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
param_out);
framework::TensorCopy(
*mom1,
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
mom1_out);
framework::TensorCopy(
*mom2,
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
mom2_out);
framework::TensorCopy(
*beta1_pow,
beta1_pow->place(),
ctx.template device_context<platform::MLUDeviceContext>(),
beta1_pow_out);
framework::TensorCopy(
*beta2_pow,
beta2_pow->place(),
ctx.template device_context<platform::MLUDeviceContext>(),
beta2_pow_out);
return;
}
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
param_out->ShareDataWith(*param);
mom1_out->ShareDataWith(*mom1);
mom2_out->ShareDataWith(*mom2);
phi::DenseTensor beta1_pow_tmp;
phi::DenseTensor beta2_pow_tmp;
if (beta1_pow->place() == platform::CPUPlace()) {
T beta1 = *beta1_pow->data<T>();
beta1_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta1_pow_tmp_desc(beta1_pow_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta1,
beta1_pow_tmp_desc.get(),
GetBasePtr(&beta1_pow_tmp));
beta1_pow = &beta1_pow_tmp;
}
if (beta2_pow->place() == platform::CPUPlace()) {
T beta2 = *beta2_pow->data<T>();
beta2_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta2_pow_tmp_desc(beta2_pow_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta2,
beta2_pow_tmp_desc.get(),
GetBasePtr(&beta2_pow_tmp));
beta2_pow = &beta2_pow_tmp;
}
VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel()
<< "beta2_pow.numel() : " << beta2_pow->numel();
VLOG(3) << "param.numel(): " << param->numel();
PADDLE_ENFORCE_EQ(beta1_pow_out->numel(),
1,
platform::errors::InvalidArgument(
"beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_out->numel()));
PADDLE_ENFORCE_EQ(beta2_pow_out->numel(),
1,
platform::errors::InvalidArgument(
"beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_out->numel()));
const phi::DenseTensor* beta1_tensor = nullptr;
const phi::DenseTensor* beta2_tensor = nullptr;
const phi::DenseTensor* epsilon_tensor = nullptr;
phi::DenseTensor beta1_tmp(phi::DataType::FLOAT32);
phi::DenseTensor beta2_tmp(phi::DataType::FLOAT32);
phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32);
if (ctx.HasInput("Beta1Tensor")) {
beta1_tensor = ctx.Input<phi::DenseTensor>("Beta1Tensor");
PADDLE_ENFORCE_EQ(beta1_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(Beta1Tensor) size must be 1, but get %d",
beta1_tensor->numel()));
} else {
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
beta1_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta1_tmp_desc(beta1_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta1,
beta1_tmp_desc.get(),
GetBasePtr(&beta1_tmp));
beta1_tensor = &beta1_tmp;
}
if (ctx.HasInput("Beta2Tensor")) {
beta2_tensor = ctx.Input<phi::DenseTensor>("Beta2Tensor");
PADDLE_ENFORCE_EQ(beta2_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(Beta2Tensor) size must be 1, but get %d",
beta2_tensor->numel()));
} else {
T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
beta2_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta2_tmp_desc(beta2_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta2,
beta2_tmp_desc.get(),
GetBasePtr(&beta2_tmp));
beta2_tensor = &beta2_tmp;
}
if (ctx.HasInput("EpsilonTensor")) {
epsilon_tensor = ctx.Input<phi::DenseTensor>("EpsilonTensor");
PADDLE_ENFORCE_EQ(epsilon_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(EpsilonTensor) size must be 1, but get %d",
epsilon_tensor->numel()));
} else {
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
epsilon_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc epsilon_tmp_desc(epsilon_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&epsilon,
epsilon_tmp_desc.get(),
GetBasePtr(&epsilon_tmp));
epsilon_tensor = &epsilon_tmp;
}
MLUCnnlTensorDesc param_desc(*param);
MLUCnnlTensorDesc mom1_desc(*mom1);
MLUCnnlTensorDesc mom2_desc(*mom2);
MLUCnnlTensorDesc grad_desc(*grad);
MLUCnnl::ApplyAdam(ctx,
param_desc.get(),
GetBasePtr(param_out),
mom1_desc.get(),
GetBasePtr(mom1_out),
mom2_desc.get(),
GetBasePtr(mom2_out),
grad_desc.get(),
GetBasePtr(grad),
GetBasePtr(lr),
GetBasePtr(beta1_tensor),
GetBasePtr(beta2_tensor),
GetBasePtr(beta1_pow),
GetBasePtr(beta2_pow),
GetBasePtr(epsilon_tensor),
/*use_nesterov*/ false);
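// Unless beta pow is managed globally (use_global_beta_pow), advance the
// accumulators on device: beta1_pow_out = beta1_pow * beta1 and
// beta2_pow_out = beta2_pow * beta2.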
if (!use_global_beta_pow) {
beta1_pow_out->mutable_data<T>(ctx.GetPlace());
beta2_pow_out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc beta1_desc(*beta1_tensor);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
beta1_desc.get(),
GetBasePtr(beta1_pow),
beta1_desc.get(),
GetBasePtr(beta1_tensor),
beta1_desc.get(),
GetBasePtr(beta1_pow_out),
ToCnnlDataType<T>());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
beta1_desc.get(),
GetBasePtr(beta2_pow),
beta1_desc.get(),
GetBasePtr(beta2_tensor),
beta1_desc.get(),
GetBasePtr(beta2_pow_out),
ToCnnlDataType<T>());
}
}
};
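// AdamW extends Adam with decoupled weight decay. When with_decay is true and
// the update is not skipped, the non-multi_precision path first scales the
// parameter as param <- param - lr * coeff * param (the OpTensor(MUL) call
// below) and then runs AdamMLUKernel::Compute; the multi_precision path casts
// MasterParam to Param before the update and casts ParamOut back to
// MasterParamOut afterwards.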
template <typename T>
class AdamWMLUKernel : public AdamMLUKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
VLOG(3) << "MLU AdamW Kernel";
bool skip_update = false;
if (ctx.HasInput("SkipUpdate")) {
VLOG(3) << "Has SkipUpdate";
auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
PADDLE_ENFORCE_EQ(skip_update_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(SkipUpdate) size must be 1, but get %d",
skip_update_tensor->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(
*skip_update_tensor, ctx.device_context(), &skip_update_vec);
ctx.device_context().Wait();
skip_update = skip_update_vec[0];
}
bool with_decay = ctx.Attr<bool>("with_decay");
const bool multi_precision = ctx.Attr<bool>("multi_precision");
auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
auto* master_param_out = ctx.Output<phi::DenseTensor>("MasterParamOut");
const auto* master_param = ctx.Input<phi::DenseTensor>("MasterParam");
VLOG(3) << "Skip update: " << skip_update << ", With decay: " << with_decay;
if (!skip_update && with_decay) {
auto* param = ctx.Input<phi::DenseTensor>("Param");
MLUCnnlTensorDesc param_desc(*param);
if (multi_precision) {
VLOG(3) << "[adamw] multi_precision, cast masterparam to param.";
bool has_master =
ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut");
PADDLE_ENFORCE_EQ(
has_master,
true,
platform::errors::InvalidArgument(
"The Input(MasterParam) and Output(MasterParamOut) "
"should not be null when "
"the attr `multi_precision` is true"));
// cast masterparam (fp32) to param (fp16), then paramout (fp16) to
// masterparamout (fp32)
MLUCnnlTensorDesc master_param_desc(*master_param);
cnnlCastDataType_t cast_type = GetCastDataType(
framework::TransToProtoVarType(master_param->dtype()),
framework::TransToProtoVarType(param->dtype()));
MLUCnnl::Cast(ctx,
cast_type,
master_param_desc.get(),
GetBasePtr(master_param),
param_desc.get(),
const_cast<void*>(GetBasePtr(param)));
} else {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<phi::DenseTensor>(),
true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be phi::DenseTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
auto* lr = ctx.Input<phi::DenseTensor>("LearningRate");
float coeff = ctx.Attr<float>("coeff");
// update param with decay coeff: mul(-1 * lr, coeff * param) + param
MLUCnnlTensorDesc lr_desc(*lr);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
lr_desc.get(),
GetBasePtr(lr),
param_desc.get(),
GetBasePtr(param),
param_desc.get(),
const_cast<void*>(GetBasePtr(param)),
ToCnnlDataType<T>(),
/*alpha1*/ -1.f,
/*alpha2*/ coeff,
/*beta*/ 1.f);
}
}
AdamMLUKernel<T>::Compute(ctx);
if (multi_precision) {
VLOG(3) << "[adamw] multi_precision, cast paramout to masterparamout.";
// cast paramout to masterparamout
master_param_out->mutable_data<float>(ctx.GetPlace());
cnnlCastDataType_t cast_type = GetCastDataType(
framework::TransToProtoVarType(param_out->dtype()),
framework::TransToProtoVarType(master_param_out->dtype()));
MLUCnnlTensorDesc param_out_desc(*param_out);
MLUCnnlTensorDesc master_param_out_desc(*master_param_out);
MLUCnnl::Cast(ctx,
cast_type,
param_out_desc.get(),
GetBasePtr(param_out),
master_param_out_desc.get(),
GetBasePtr(master_param_out));
}
}
};
template <typename T>
class MergedAdamMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// Get inputs and outputs
auto params = ctx.MultiInput<phi::DenseTensor>("Param");
auto grads = ctx.MultiInput<phi::DenseTensor>("Grad");
auto lrs = ctx.MultiInput<phi::DenseTensor>("LearningRate");
auto mom1s = ctx.MultiInput<phi::DenseTensor>("Moment1");
auto mom2s = ctx.MultiInput<phi::DenseTensor>("Moment2");
auto beta1_pows = ctx.MultiInput<phi::DenseTensor>("Beta1Pow");
auto beta2_pows = ctx.MultiInput<phi::DenseTensor>("Beta2Pow");
auto master_params = ctx.MultiInput<phi::DenseTensor>("MasterParam");
auto param_outs = ctx.MultiOutput<phi::DenseTensor>("ParamOut");
auto mom1_outs = ctx.MultiOutput<phi::DenseTensor>("Moment1Out");
auto mom2_outs = ctx.MultiOutput<phi::DenseTensor>("Moment2Out");
auto beta1_pow_outs = ctx.MultiOutput<phi::DenseTensor>("Beta1PowOut");
auto beta2_pow_outs = ctx.MultiOutput<phi::DenseTensor>("Beta2PowOut");
// Check validation of inputs and outputs
size_t param_num = params.size();
PADDLE_ENFORCE_EQ(param_num,
param_outs.size(),
platform::errors::InvalidArgument(
"The size of Output(ParamOut) must be equal to "
"Input(Param), but got the size of Output(ParamOut) "
"is %d, the size of Input(Param) is %d.",
param_outs.size(),
param_num));
bool skip_update = false;
if (ctx.HasInput("SkipUpdate")) {
auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
PADDLE_ENFORCE_EQ(skip_update_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(SkipUpdate) size must be 1, but get %d",
skip_update_tensor->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(
*skip_update_tensor, ctx.device_context(), &skip_update_vec);
ctx.device_context().Wait();
skip_update = skip_update_vec[0];
}
// skip_update=true, just copy input to output, and TensorCopy will call
// mutable_data
if (skip_update) {
VLOG(4) << "MergedAdam skip update";
for (size_t i = 0; i < param_num; ++i) {
framework::TensorCopy(
*params[i],
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
param_outs[i]);
framework::TensorCopy(
*mom1s[i],
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
mom1_outs[i]);
framework::TensorCopy(
*mom2s[i],
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
mom2_outs[i]);
framework::TensorCopy(
*beta1_pows[i],
beta1_pows[i]->place(),
ctx.template device_context<platform::MLUDeviceContext>(),
beta1_pow_outs[i]);
framework::TensorCopy(
*beta2_pows[i],
beta2_pows[i]->place(),
ctx.template device_context<platform::MLUDeviceContext>(),
beta2_pow_outs[i]);
}
return;
}
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
// Get beta1, beta2 and epsilon from attribute.
const phi::DenseTensor* beta1_tensor = nullptr;
const phi::DenseTensor* beta2_tensor = nullptr;
const phi::DenseTensor* epsilon_tensor = nullptr;
phi::DenseTensor beta1_tmp(phi::DataType::FLOAT32);
phi::DenseTensor beta2_tmp(phi::DataType::FLOAT32);
phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32);
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
beta1_tmp.mutable_data<T>({1}, ctx.GetPlace());
beta2_tmp.mutable_data<T>({1}, ctx.GetPlace());
epsilon_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta1_tmp_desc(beta1_tmp);
MLUCnnlTensorDesc beta2_tmp_desc(beta2_tmp);
MLUCnnlTensorDesc epsilon_tmp_desc(epsilon_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta1,
beta1_tmp_desc.get(),
GetBasePtr(&beta1_tmp));
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta2,
beta2_tmp_desc.get(),
GetBasePtr(&beta2_tmp));
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&epsilon,
epsilon_tmp_desc.get(),
GetBasePtr(&epsilon_tmp));
beta1_tensor = &beta1_tmp;
beta2_tensor = &beta2_tmp;
epsilon_tensor = &epsilon_tmp;
// Loop over all parameters and apply the Adam update to each one.
for (size_t i = 0; i < param_num; ++i) {
VLOG(4) << "[MergedAdam] loop: " << i;
param_outs[i]->ShareDataWith(*params[i]);
mom1_outs[i]->ShareDataWith(*mom1s[i]);
mom2_outs[i]->ShareDataWith(*mom2s[i]);
phi::DenseTensor beta1_pow_tmp;
phi::DenseTensor beta2_pow_tmp;
if (beta1_pows[i]->place() == platform::CPUPlace()) {
T beta1 = *beta1_pows[i]->data<T>();
beta1_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta1_pow_tmp_desc(beta1_pow_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta1,
beta1_pow_tmp_desc.get(),
GetBasePtr(&beta1_pow_tmp));
beta1_pows[i] = &beta1_pow_tmp;
}
if (beta2_pows[i]->place() == platform::CPUPlace()) {
T beta2 = *beta2_pows[i]->data<T>();
beta2_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta2_pow_tmp_desc(beta2_pow_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta2,
beta2_pow_tmp_desc.get(),
GetBasePtr(&beta2_pow_tmp));
beta2_pows[i] = &beta2_pow_tmp;
}
VLOG(3) << "beta1_pow.numel() : " << beta1_pows[i]->numel()
<< "beta2_pow.numel() : " << beta2_pows[i]->numel();
VLOG(3) << "param.numel(): " << params[i]->numel();
PADDLE_ENFORCE_EQ(beta1_pow_outs[i]->numel(),
1,
platform::errors::InvalidArgument(
"beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_outs[i]->numel()));
PADDLE_ENFORCE_EQ(beta2_pow_outs[i]->numel(),
1,
platform::errors::InvalidArgument(
"beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_outs[i]->numel()));
MLUCnnlTensorDesc param_desc(*params[i]);
MLUCnnlTensorDesc mom1_desc(*mom1s[i]);
MLUCnnlTensorDesc mom2_desc(*mom2s[i]);
MLUCnnlTensorDesc grad_desc(*grads[i]);
MLUCnnl::ApplyAdam(ctx,
param_desc.get(),
GetBasePtr(param_outs[i]),
mom1_desc.get(),
GetBasePtr(mom1_outs[i]),
mom2_desc.get(),
GetBasePtr(mom2_outs[i]),
grad_desc.get(),
GetBasePtr(grads[i]),
GetBasePtr(lrs[i]),
GetBasePtr(beta1_tensor),
GetBasePtr(beta2_tensor),
GetBasePtr(beta1_pows[i]),
GetBasePtr(beta2_pows[i]),
GetBasePtr(epsilon_tensor),
/*use_nesterov*/ false);
if (!use_global_beta_pow) {
beta1_pow_outs[i]->mutable_data<T>(ctx.GetPlace());
beta2_pow_outs[i]->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc beta1_desc(*beta1_tensor);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
beta1_desc.get(),
GetBasePtr(beta1_pows[i]),
beta1_desc.get(),
GetBasePtr(beta1_tensor),
beta1_desc.get(),
GetBasePtr(beta1_pow_outs[i]),
ToCnnlDataType<T>());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
beta1_desc.get(),
GetBasePtr(beta2_pows[i]),
beta1_desc.get(),
GetBasePtr(beta2_tensor),
beta1_desc.get(),
GetBasePtr(beta2_pow_outs[i]),
ToCnnlDataType<T>());
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(adam,
ops::AdamMLUKernel<float>,
ops::AdamMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(adamw,
ops::AdamWMLUKernel<float>,
ops::AdamWMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(merged_adam,
ops::MergedAdamMLUKernel<float>,
ops::MergedAdamMLUKernel<plat::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
namespace paddle {
namespace operators {
template <typename T>
class MLUMergedMomentumOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto params = ctx.MultiInput<phi::DenseTensor>("Param");
auto params_out = ctx.MultiOutput<phi::DenseTensor>("ParamOut");
size_t n = params.size();
PADDLE_ENFORCE_EQ(n,
params_out.size(),
platform::errors::InvalidArgument(
"The size of Output(ParamOut) must be equal to "
"Input(Param), but got the size of Output(ParamOut) "
"is %d, the size of Input(Param) is %d.",
params_out.size(),
n));
for (size_t i = 0; i < n; ++i) {
PADDLE_ENFORCE_EQ(params[i],
params_out[i],
platform::errors::InvalidArgument(
"The size of Input(Param) and Output(ParamOut) "
"must be the same Tensors."));
}
auto grads = ctx.MultiInput<phi::DenseTensor>("Grad");
PADDLE_ENFORCE_EQ(
n,
grads.size(),
platform::errors::InvalidArgument(
"The size of Input(Grad) must be equal to Input(Param), but got "
"the size of Input(Grad) is %d, the size of Input(Param) is %d.",
grads.size(),
n));
auto velocitys = ctx.MultiInput<phi::DenseTensor>("Velocity");
PADDLE_ENFORCE_EQ(n,
velocitys.size(),
platform::errors::InvalidArgument(
"The size of Input(Velocity) must be equal to "
"Input(Param), but got the size of Input(Velocity) "
"is %d, the size of Input(Param) is %d.",
velocitys.size(),
n));
auto velocitys_out = ctx.MultiOutput<phi::DenseTensor>("VelocityOut");
PADDLE_ENFORCE_EQ(
n,
velocitys_out.size(),
platform::errors::InvalidArgument(
"The size of Output(VelocityOut) must be "
"equal to Input(Param), but got the size of Output(VelocityOut) is "
"%d, the size of Input(Param) is %d.",
velocitys_out.size(),
n));
for (size_t i = 0; i < n; ++i) {
PADDLE_ENFORCE_EQ(velocitys[i],
velocitys_out[i],
platform::errors::InvalidArgument(
"Input(Velocity) and Output(VelocityOut) must be "
"the same Tensors."));
}
auto mu = static_cast<T>(ctx.Attr<float>("mu"));
auto lrs = ctx.MultiInput<phi::DenseTensor>("LearningRate");
if (lrs.size() != 1) {
PADDLE_ENFORCE_EQ(
n,
lrs.size(),
platform::errors::InvalidArgument(
"If the size of Input(LearningRate) is not 1, the size of "
"Input(LearningRate) must be "
"equal to Input(Param), but got the size of Input(LearningRate) "
"is %d, the size of Input(Param) is %d.",
lrs.size(),
n));
}
auto use_nesterov = ctx.Attr<bool>("use_nesterov");
auto regularization_methods =
ctx.Attr<std::vector<std::string>>("regularization_method");
auto regularization_coeffs =
ctx.Attr<std::vector<float>>("regularization_coeff");
if (regularization_methods.size() != 0) {
PADDLE_ENFORCE_EQ(
n,
regularization_methods.size(),
platform::errors::InvalidArgument(
"The size of Attr(regularization_method) must be equal "
"to Input(Param), but got the size of "
"Attr(regularization_method) is %d, the size of Input(Param) is "
"%d.",
regularization_methods.size(),
n));
PADDLE_ENFORCE_EQ(
n,
regularization_coeffs.size(),
platform::errors::InvalidArgument(
"The size of Attr(regularization_coeff) must be equal "
"to Input(Param), but got the size of Attr(regularization_coeff) "
"is %d, the size of Input(Param) is %d.",
regularization_coeffs.size(),
n));
}
VLOG(5) << "use_nesterov: " << use_nesterov
<< ", regularization_methods.size(): "
<< regularization_methods.size()
<< ", regularization_coeffs.size(): "
<< regularization_coeffs.size();
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
phi::DenseTensor mu_tensor =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc mu_tensor_desc(mu_tensor);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&mu,
mu_tensor_desc.get(),
GetBasePtr(&mu_tensor));
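// For each parameter: optionally fold L2 decay into the gradient (the
// OpTensor(ADD) call below presumably computes regularized_grad =
// regularization_coeff * param + grad), then call MLUCnnl::ApplyMomentum.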
for (size_t idx = 0; idx < n; ++idx) {
phi::RegularizationType regularization_flag =
regularization_methods.size() > 0 &&
regularization_methods[idx] == "l2_decay"
? phi::RegularizationType::kL2DECAY
: phi::RegularizationType::kNONE;
T regularization_coeff = static_cast<T>(0.0);
if (regularization_coeffs.size() != 0) {
regularization_coeff = static_cast<T>(regularization_coeffs[idx]);
}
auto learning_rate = lrs.size() > 1 ? lrs[idx] : lrs[0];
auto param_out = params_out[idx];
auto velocity_out = velocitys_out[idx];
auto grad = grads[idx];
phi::DenseTensor regularized_grad;
MLUCnnlTensorDesc param_desc(*param_out);
if (regularization_flag == phi::RegularizationType::kL2DECAY) {
regularized_grad = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
param_out->dims(), dev_ctx);
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_ADD, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
param_desc.get(),
GetBasePtr(param_out),
param_desc.get(),
GetBasePtr(grad),
param_desc.get(),
GetBasePtr(&regularized_grad),
ToCnnlDataType<T>(),
regularization_coeff);
} else {
regularized_grad = *grad;
}
MLUCnnl::ApplyMomentum(ctx,
param_desc.get(),
GetBasePtr(&regularized_grad),
use_nesterov,
GetBasePtr(learning_rate),
GetBasePtr(&mu_tensor),
GetBasePtr(param_out),
GetBasePtr(velocity_out));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(merged_momentum,
ops::MLUMergedMomentumOpKernel<float>,
ops::MLUMergedMomentumOpKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/optimizers/momentum_op.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
namespace paddle {
namespace operators {
template <typename T>
class MLUMomentumOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
std::string regularization_method =
ctx.Attr<std::string>("regularization_method");
auto regularization_coeff = ctx.Attr<float>("regularization_coeff");
phi::RegularizationType regularization_flag{
phi::RegularizationType::kNONE}; // disable regularization
if (regularization_method == "l2_decay") {
regularization_flag = phi::RegularizationType::kL2DECAY;
}
T mu = static_cast<T>(ctx.Attr<float>("mu"));
bool use_nesterov = ctx.Attr<bool>("use_nesterov");
auto learning_rate = ctx.Input<phi::DenseTensor>("LearningRate");
auto param = ctx.Input<phi::DenseTensor>("Param");
auto velocity = ctx.Input<phi::DenseTensor>("Velocity");
auto param_out = ctx.Output<phi::DenseTensor>("ParamOut");
auto velocity_out = ctx.Output<phi::DenseTensor>("VelocityOut");
param_out->mutable_data<T>(ctx.GetPlace());
velocity_out->mutable_data<T>(ctx.GetPlace());
auto* grad_var = ctx.InputVar("Grad");
if (grad_var->IsType<phi::DenseTensor>()) {
auto grad = ctx.Input<phi::DenseTensor>("Grad");
phi::DenseTensor mu_tensor =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc mu_tensor_desc(mu_tensor);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&mu,
mu_tensor_desc.get(),
GetBasePtr(&mu_tensor));
phi::DenseTensor regularized_grad;
MLUCnnlTensorDesc param_desc(*param);
if (regularization_flag == phi::RegularizationType::kL2DECAY) {
regularized_grad =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(param->dims(), dev_ctx);
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_ADD, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
param_desc.get(),
GetBasePtr(param),
param_desc.get(),
GetBasePtr(grad),
param_desc.get(),
GetBasePtr(&regularized_grad),
ToCnnlDataType<T>(),
regularization_coeff);
} else {
regularized_grad = *grad;
}
framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out);
framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out);
MLUCnnl::ApplyMomentum(ctx,
param_desc.get(),
GetBasePtr(&regularized_grad),
use_nesterov,
GetBasePtr(learning_rate),
GetBasePtr(&mu_tensor),
GetBasePtr(param_out),
GetBasePtr(velocity_out));
} else if (grad_var->IsType<phi::SelectedRows>()) {
PADDLE_ENFORCE_EQ(
false,
true,
platform::errors::PermissionDenied("Unsupport SparseMomentum"));
} else {
PADDLE_ENFORCE_EQ(false,
true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad "
"in MomentumOp. Excepted LodTensor "
"or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(momentum,
ops::MLUMomentumOpKernel<float>,
ops::MLUMomentumOpKernel<plat::float16>);
...@@ -63,21 +63,6 @@ BufferedReader::BufferedReader(
}
#endif
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(place_)) {
int dev_idx = place_.device;
compute_stream_ =
((platform::MLUDeviceContext *)(platform::DeviceContextPool::Instance()
.Get(place_)))
->stream();
events_.resize(buffer_size);
for (auto &event : events_) {
event = platform::MluEventResourcePool::Instance().New(dev_idx);
}
stream_ = platform::MluStreamResourcePool::Instance().New(dev_idx);
}
#endif
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(place_)) {
int dev_idx = place_.device;
...@@ -260,57 +245,6 @@ void BufferedReader::ReadAsync(size_t i) {
}
#endif
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(place_)) {
TensorVec &mlu = mlu_buffer_[i];
if (mlu.empty()) {
mlu.resize(cpu.size());
} else {
PADDLE_ENFORCE_EQ(
mlu.size(),
cpu.size(),
platform::errors::InvalidArgument(
"Input tensor number on MLU and CPU devices are not matched. "
"The number on MLU is %d, on CPU is %d",
mlu.size(),
cpu.size()));
}
std::vector<void *> mlu_ptrs;
mlu_ptrs.reserve(cpu.size());
for (size_t i = 0; i < cpu.size(); ++i) {
mlu[i].Resize(cpu[i].dims());
mlu[i].set_layout(cpu[i].layout());
mlu_ptrs.emplace_back(mlu[i].mutable_data(place_, cpu[i].type()));
}
platform::SetMLUDeviceId(place_.device);
PADDLE_ENFORCE_MLU_SUCCESS(
cnPlaceNotifier(events_[i].get(), compute_stream_));
PADDLE_ENFORCE_MLU_SUCCESS(cnWaitNotifier(events_[i].get()));
platform::RecordEvent record_event("BufferedReader:MemoryCopy",
platform::TracerEventType::UserDefined,
1);
for (size_t i = 0; i < cpu.size(); ++i) {
auto cpu_place = cpu[i].place();
auto cpu_ptr = cpu[i].data();
auto mlu_ptr = mlu_ptrs[i];
auto size = cpu[i].numel() * phi::SizeOf(cpu[i].dtype());
if ((platform::is_mlu_place(cpu_place))) {
memory::Copy(
place_, mlu_ptr, cpu_place, cpu_ptr, size, stream_.get());
} else {
memory::Copy(
place_, mlu_ptr, cpu_place, cpu_ptr, size, stream_.get());
platform::MLUStreamSync(stream_.get());
}
mlu[i].set_lod(cpu[i].lod());
}
platform::MLUStreamSync(stream_.get());
}
#endif
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(place_)) {
TensorVec &xpu = xpu_buffer_[i];
...
...@@ -26,10 +26,6 @@
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h"
...@@ -92,12 +88,6 @@ class BufferedReader : public framework::DecoratedReader {
std::vector<std::shared_ptr<platform::CudaEventObject>> events_;
#endif
#ifdef PADDLE_WITH_MLU
mluStream compute_stream_;
std::shared_ptr<platform::MluStreamObject> stream_;
std::vector<std::shared_ptr<platform::MluEventObject>> events_;
#endif
#ifdef PADDLE_WITH_XPU
xpuStream compute_stream_;
std::shared_ptr<platform::XpuStreamObject> stream_;
...
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ReduceMaxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<phi::DenseTensor>("X");
auto* output = context.Output<phi::DenseTensor>("Out");
int out_dtype = context.Attr<int>("out_dtype");
bool reduce_all = context.Attr<bool>("reduce_all");
auto dims = context.Attr<std::vector<int>>("dim");
auto input_dims = input->dims();
const auto& input_dim_size = input->dims().size();
std::vector<int> reduce_dims;
if (reduce_all) {
for (int i = 0; i < input_dims.size(); i++) {
reduce_dims.push_back(static_cast<int>(i));
}
} else {
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) {
reduce_dims.push_back(dims[i] + input_dim_size);
} else {
reduce_dims.push_back(dims[i]);
}
}
}
auto place = context.GetPlace();
phi::DenseTensor cast_out(input->type());
cast_out.Resize(output->dims());
cast_out.mutable_data<T>(place);
auto cast_out_dtype = framework::TransToProtoVarType(input->dtype());
if (out_dtype != -1) {
cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
}
if (framework::TransToProtoVarType(input->type()) != cast_out_dtype) {
if (cast_out_dtype == framework::proto::VarType::FP32) {
output->mutable_data<float>(place);
} else if (cast_out_dtype == framework::proto::VarType::FP16) {
output->mutable_data<paddle::platform::float16>(place);
} else if (cast_out_dtype == framework::proto::VarType::INT32) {
output->mutable_data<int32_t>(place);
}
} else {
output->ShareDataWith(cast_out);
}
MLUCnnlTensorDesc input_desc(
*input, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input->dtype()));
MLUCnnlTensorDesc output_desc(
*output, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output->dtype()));
MLUCnnlReduceDesc reduction_desc(reduce_dims,
CNNL_REDUCE_MAX,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(context,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
input_desc.get(),
GetBasePtr(input),
0 /*indices_size*/,
nullptr,
nullptr,
output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class ReduceMaxGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<phi::DenseTensor>("X");
auto* out = context.Input<phi::DenseTensor>("Out");
auto* out_grad =
context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto reduce_dims = context.Attr<std::vector<int>>("dim");
bool reduce_all = context.Attr<bool>("reduce_all");
int in_dtype = context.Attr<int>("in_dtype");
PADDLE_ENFORCE_EQ(
in_dtype == -1,
true,
platform::errors::InvalidArgument(
"MLU only support in_dtype == -1 in reduce_max_grad op."));
auto* x_grad =
context.Output<phi::DenseTensor>(framework::GradVarName("X"));
x_grad->mutable_data<T>(context.GetPlace());
auto place = context.GetPlace();
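// Gradient of reduce_max: broadcast Out and Out@GRAD back to the shape of X,
// then pass the gradient through only where the input equals the maximum,
// i.e. x_grad = (x == broadcast(out)) ? broadcast(out_grad) : 0.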
// broadcast
auto x_dims_vec = phi::vectorize(x->dims());
if (reduce_all) {
reduce_dims.clear();
for (size_t d = 0; d < x_dims_vec.size(); ++d) {
reduce_dims.push_back(static_cast<int>(d));
}
}
phi::DenseTensor tmp_out, tmp_out_grad;
auto tmp_out_dims_vec = x_dims_vec;
for (auto d : reduce_dims) {
if (d < 0) {
d += x_dims_vec.size();
}
tmp_out_dims_vec[d] = 1;
}
tmp_out.ShareDataWith(*out);
tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec));
tmp_out_grad.ShareDataWith(*out_grad);
tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec));
phi::DenseTensor transformed_out(x->type());
transformed_out.Resize(phi::make_ddim(x_dims_vec));
transformed_out.mutable_data<T>(place);
MLUCnnlTensorDesc tmp_out_desc(tmp_out);
MLUCnnlTensorDesc transformed_out_desc(transformed_out);
MLUCnnl::BroadcastTo(context,
tmp_out_desc.get(),
GetBasePtr(&tmp_out),
transformed_out_desc.get(),
GetBasePtr(&transformed_out));
phi::DenseTensor transformed_out_grad(x->type());
transformed_out_grad.Resize(phi::make_ddim(x_dims_vec));
transformed_out_grad.mutable_data<T>(place);
MLUCnnlTensorDesc tmp_out_grad_desc(tmp_out_grad);
MLUCnnlTensorDesc transformed_out_grad_desc(transformed_out_grad);
MLUCnnl::BroadcastTo(context,
tmp_out_grad_desc.get(),
GetBasePtr(&tmp_out_grad),
transformed_out_grad_desc.get(),
GetBasePtr(&transformed_out_grad));
// compare
phi::DenseTensor equal_cond;
equal_cond.mutable_data<bool>(x_grad->dims(), place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc equal_cond_desc(equal_cond);
MLUCnnl::Logic(context,
CNNL_LOGIC_OP_EQ,
x_desc.get(),
GetBasePtr(x),
transformed_out_desc.get(),
GetBasePtr(&transformed_out),
equal_cond_desc.get(),
GetBasePtr(&equal_cond));
// select
phi::DenseTensor t_zero;
t_zero.mutable_data<T>(x_grad->dims(), place);
FillMLUTensorWithHostValue<T>(context, static_cast<T>(0), &t_zero);
t_zero.Resize(x_grad->dims());
MLUCnnlTensorDesc t_zero_desc(t_zero);
MLUCnnlTensorDesc x_grad_desc(*x_grad);
MLUCnnl::Select(context,
equal_cond_desc.get(),
GetBasePtr(&equal_cond),
transformed_out_grad_desc.get(),
GetBasePtr(&transformed_out_grad),
t_zero_desc.get(),
GetBasePtr(&t_zero),
x_grad_desc.get(),
GetBasePtr(x_grad));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(reduce_max,
ops::ReduceMaxMLUKernel<float>,
ops::ReduceMaxMLUKernel<plat::float16>,
ops::ReduceMaxMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(reduce_max_grad,
ops::ReduceMaxGradMLUKernel<float>,
ops::ReduceMaxGradMLUKernel<plat::float16>,
ops::ReduceMaxGradMLUKernel<int>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ReduceMeanMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
MLUReduceOp<T>(context, "reduce_mean");
}
};
template <typename T>
class ReduceMeanGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<phi::DenseTensor>("X");
auto* output_grad =
context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* input_grad =
context.Output<phi::DenseTensor>(framework::GradVarName("X"));
input_grad->mutable_data<T>(context.GetPlace());
bool reduce_all = context.Attr<bool>("reduce_all");
auto reduce_dims = context.Attr<std::vector<int>>("dim");
auto input_dims = phi::vectorize(input->dims());
int reduce_numel = 1;
if (reduce_all) {
reduce_dims.clear();
for (size_t d = 0; d < input_dims.size(); ++d) {
reduce_dims.push_back(static_cast<int>(d));
}
}
for (auto& d : reduce_dims) {
if (d < 0) {
d = d + input_dims.size();
}
reduce_numel *= input_dims[d];
}
phi::DenseTensor tmp_output_grad(output_grad->dtype());
auto tmp_output_dims = input_dims;
for (auto d : reduce_dims) {
tmp_output_dims[d] = 1;
}
tmp_output_grad.ShareDataWith(*output_grad);
tmp_output_grad.Resize(phi::make_ddim(tmp_output_dims));
MLUCnnlTensorDesc output_grad_desc(tmp_output_grad,
CNNL_LAYOUT_ARRAY,
ToCnnlDataType(tmp_output_grad.dtype()));
MLUCnnlTensorDesc input_grad_desc(
*input_grad, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input_grad->dtype()));
auto value = static_cast<T>(1.0 / static_cast<float>(reduce_numel));
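// Gradient of reduce_mean: every input element receives dL/dy / N, where N is
// the number of reduced elements. X@GRAD is first filled with 1/N and then
// multiplied by the output gradient, which keeps the reduced dims as size 1 so
// that the element-wise multiply broadcasts it over the full input shape.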
MLUCnnl::Fill(context,
CNNL_POINTER_MODE_HOST,
&value,
input_grad_desc.get(),
GetBasePtr(input_grad));
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(context,
op_tensor_desc.get(),
output_grad_desc.get(),
GetBasePtr(&tmp_output_grad),
input_grad_desc.get(),
GetBasePtr(input_grad),
input_grad_desc.get(),
GetBasePtr(input_grad),
ToCnnlDataType<T>());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(reduce_mean,
ops::ReduceMeanMLUKernel<float>,
ops::ReduceMeanMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(reduce_mean_grad,
ops::ReduceMeanGradMLUKernel<float>,
ops::ReduceMeanGradMLUKernel<plat::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ReduceMinMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<phi::DenseTensor>("X");
auto* output = context.Output<phi::DenseTensor>("Out");
int out_dtype = context.Attr<int>("out_dtype");
bool reduce_all = context.Attr<bool>("reduce_all");
auto dims = context.Attr<std::vector<int>>("dim");
auto input_dims = input->dims();
const auto& input_dim_size = input->dims().size();
std::vector<int> reduce_dims;
if (reduce_all) {
for (int i = 0; i < input_dims.size(); i++) {
reduce_dims.push_back(static_cast<int>(i));
}
} else {
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) {
reduce_dims.push_back(dims[i] + input_dim_size);
} else {
reduce_dims.push_back(dims[i]);
}
}
}
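// Negative axes follow Python-style indexing and are normalized by adding the
// input rank, e.g. dim = -1 reduces over the last dimension.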
auto place = context.GetPlace();
phi::DenseTensor cast_out(input->type());
cast_out.Resize(output->dims());
cast_out.mutable_data<T>(place);
auto cast_out_dtype = framework::TransToProtoVarType(input->dtype());
if (out_dtype != -1) {
cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
}
if (framework::TransToProtoVarType(input->type()) != cast_out_dtype) {
if (cast_out_dtype == framework::proto::VarType::FP32) {
output->mutable_data<float>(place);
} else if (cast_out_dtype == framework::proto::VarType::FP16) {
output->mutable_data<paddle::platform::float16>(place);
} else if (cast_out_dtype == framework::proto::VarType::INT32) {
output->mutable_data<int32_t>(place);
}
} else {
output->ShareDataWith(cast_out);
}
MLUCnnlTensorDesc input_desc(
*input, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input->dtype()));
MLUCnnlTensorDesc output_desc(
*output, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output->dtype()));
MLUCnnlReduceDesc reduction_desc(reduce_dims,
CNNL_REDUCE_MIN,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(context,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
input_desc.get(),
GetBasePtr(input),
0 /*indices_size*/,
nullptr,
nullptr,
output_desc.get(),
GetBasePtr(output));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(reduce_min,
ops::ReduceMinMLUKernel<float>,
ops::ReduceMinMLUKernel<plat::float16>,
ops::ReduceMinMLUKernel<int>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_MLU
#include <string>
#include <vector>
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
namespace paddle {
namespace operators {
template <typename T>
void MLUReduceOp(const framework::ExecutionContext& context,
std::string reduce_name) {
PADDLE_ENFORCE_EQ(
platform::is_mlu_place(context.GetPlace()),
true,
platform::errors::Unavailable("This kernel only runs on MLU."));
auto* input = context.Input<phi::DenseTensor>("X");
auto* output = context.Output<phi::DenseTensor>("Out");
output->mutable_data<T>(context.GetPlace());
bool reduce_all = context.Attr<bool>("reduce_all");
auto dims = context.Attr<std::vector<int>>("dim");
auto input_dims = phi::vectorize(input->dims());
const auto& input_dim_size = input->dims().size();
std::vector<int> reduce_dims;
if (reduce_all) {
for (size_t i = 0; i < input_dims.size(); i++) {
reduce_dims.push_back(static_cast<int>(i));
}
} else {
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) {
reduce_dims.push_back(dims[i] + input_dim_size);
} else {
reduce_dims.push_back(dims[i]);
}
}
}
MLUCnnlTensorDesc input_desc(
*input, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input->dtype()));
MLUCnnlTensorDesc output_desc(
*output, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output->dtype()));
cnnlReduceOp_t reduce_op = GetMLUCnnlReduceOp(reduce_name);
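// GetMLUCnnlReduceOp maps the op name ("reduce_mean", "reduce_prod",
// "reduce_sum", ...) to the corresponding CNNL reduction mode; apart from that
// mode, the call sequence is identical for these ops, which is why their
// forward kernels simply call this helper with the op name.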
MLUCnnlReduceDesc reduction_desc(reduce_dims,
reduce_op,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(context,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
input_desc.get(),
GetBasePtr(input),
0 /*indices_size*/,
nullptr,
nullptr,
output_desc.get(),
GetBasePtr(output));
}
} // namespace operators
} // namespace paddle
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ReduceMeanMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
MLUReduceOp<T>(context, "reduce_prod");
}
};
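// NOTE: the class name ReduceMeanMLUKernel is reused in this file, but the
// kernel dispatches the "reduce_prod" reduction and is registered below for
// the reduce_prod op.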
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(reduce_prod,
ops::ReduceMeanMLUKernel<float>,
ops::ReduceMeanMLUKernel<plat::float16>,
ops::ReduceMeanMLUKernel<int>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ReduceSumMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
MLUReduceOp<T>(context, "reduce_sum");
}
};
template <typename T>
class ReduceSumGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<phi::DenseTensor>("X");
auto* out_grad =
context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* in_grad =
context.Output<phi::DenseTensor>(framework::GradVarName("X"));
in_grad->mutable_data<T>(context.GetPlace());
bool reduce_all = context.Attr<bool>("reduce_all");
auto reduce_dims = context.Attr<std::vector<int>>("dim");
auto in_dims = phi::vectorize(in->dims());
if (reduce_all) {
reduce_dims.clear();
for (size_t d = 0; d < in_dims.size(); ++d) {
reduce_dims.push_back(static_cast<int>(d));
}
}
for (auto& d : reduce_dims) {
if (d < 0) {
d = d + in_dims.size();
}
}
phi::DenseTensor tmp_out(out_grad->dtype());
auto tmp_output_dims = in_dims;
for (auto d : reduce_dims) {
tmp_output_dims[d] = 1;
}
tmp_out.ShareDataWith(*out_grad);
tmp_out.Resize(phi::make_ddim(tmp_output_dims));
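// Gradient of reduce_sum: d(sum)/dx is 1 for every element, so X@GRAD is just
// Out@GRAD broadcast back to the input shape, with the reduced dims restored
// as size 1 so that BroadcastTo can expand them.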
MLUCnnlTensorDesc out_desc(tmp_out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc in_grad_desc(
*in_grad, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::BroadcastTo(context,
out_desc.get(),
GetBasePtr(&tmp_out),
in_grad_desc.get(),
GetBasePtr(in_grad));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(reduce_sum,
ops::ReduceSumMLUKernel<float>,
ops::ReduceSumMLUKernel<int>,
ops::ReduceSumMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(reduce_sum_grad,
ops::ReduceSumGradMLUKernel<float>,
ops::ReduceSumGradMLUKernel<plat::float16>);
...@@ -41,18 +41,6 @@ class SoftmaxWithCrossEntropyOpMaker ...@@ -41,18 +41,6 @@ class SoftmaxWithCrossEntropyOpMaker
"The outputs value of softmax activation by given the input batch, " "The outputs value of softmax activation by given the input batch, "
"which will be used in backward calculation.") "which will be used in backward calculation.")
.AsIntermediate(); .AsIntermediate();
#if defined(PADDLE_WITH_MLU)
AddOutput(
"Backprop",
"(Tensor, default: Tensor<float>), A tensor in same shape with "
"Input(Logits). "
"The intermediate value used for backward calculation. The calculation "
"is :"
"exp(logits -max_logits) / sum(exp(logits - max_logits)) - labels, "
"where labels is ont-hot."
"Currently, the tensor is generated and used in npu/mlu kernel. ")
.AsIntermediate();
#endif
AddOutput("Loss", AddOutput("Loss",
"(Tensor, default: Tensor<float>), A tensor in same shape with " "(Tensor, default: Tensor<float>), A tensor in same shape with "
"Input(Logits) " "Input(Logits) "
...@@ -135,12 +123,6 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { ...@@ -135,12 +123,6 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
true, true,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Output(Softmax) should be not null.")); "Output(Softmax) should be not null."));
#if defined(PADDLE_WITH_MLU)
PADDLE_ENFORCE_EQ(ctx->HasOutput("Backprop"),
true,
platform::errors::InvalidArgument(
"Output(Backprop) should be not null."));
#endif
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
ctx->HasOutput("Loss"), ctx->HasOutput("Loss"),
true, true,
...@@ -235,12 +217,6 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { ...@@ -235,12 +217,6 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
true, true,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Input(Softmax) should be not null.")); "Input(Softmax) should be not null."));
#if defined(PADDLE_WITH_MLU)
PADDLE_ENFORCE_EQ(ctx->HasInput("Backprop"),
true,
platform::errors::InvalidArgument(
"Input(Backprop) should be not null."));
#endif
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
ctx->HasInput("Label"), ctx->HasInput("Label"),
true, true,
...@@ -324,9 +300,6 @@ class SoftmaxGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -324,9 +300,6 @@ class SoftmaxGradMaker : public framework::SingleGradOpMaker<T> {
grad_op->SetType("softmax_with_cross_entropy_grad"); grad_op->SetType("softmax_with_cross_entropy_grad");
grad_op->SetInput("Label", this->Input("Label")); grad_op->SetInput("Label", this->Input("Label"));
grad_op->SetInput("Softmax", this->Output("Softmax")); grad_op->SetInput("Softmax", this->Output("Softmax"));
#if defined(PADDLE_WITH_MLU)
grad_op->SetInput("Backprop", this->Output("Backprop"));
#endif
grad_op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss")); grad_op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss"));
grad_op->SetOutput(framework::GradVarName("Logits"), grad_op->SetOutput(framework::GradVarName("Logits"),
this->InputGrad("Logits")); this->InputGrad("Logits"));
...@@ -356,26 +329,8 @@ REGISTER_OPERATOR(softmax_with_cross_entropy_grad, ...@@ -356,26 +329,8 @@ REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyGradInplaceInferer); ops::SoftmaxWithCrossEntropyGradInplaceInferer);
REGISTER_OP_VERSION(softmax_with_cross_entropy) REGISTER_OP_VERSION(softmax_with_cross_entropy)
#if defined(PADDLE_WITH_MLU)
.AddCheckpoint(
R"ROC(
Add a new attribute [use_softmax] )ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"use_softmax", "A flag to indicate whether to do softmax", true))
.AddCheckpoint(
R"ROC(
Add a new dispensable/intermediate output [backprop] )ROC",
paddle::framework::compatible::OpVersionDesc().NewOutput(
"Backprop",
"The intermediate value used for backward calculation. The "
"calculation is :"
"exp(logits -max_logits) / sum(exp(logits - max_logits)) - labels, "
"where labels is ont-hot."
"Currently, the tensor is generated and used in npu/mlu kernel. "));
#else
.AddCheckpoint( .AddCheckpoint(
R"ROC( R"ROC(
Add a new attribute [use_softmax] )ROC", Add a new attribute [use_softmax] )ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr( paddle::framework::compatible::OpVersionDesc().NewAttr(
"use_softmax", "A flag to indicate whether to do softmax", true)); "use_softmax", "A flag to indicate whether to do softmax", true));
#endif
...@@ -92,11 +92,6 @@ inline T GetValue(const phi::DenseTensor* x) { ...@@ -92,11 +92,6 @@ inline T GetValue(const phi::DenseTensor* x) {
if (!platform::is_cpu_place(x->place())) { if (!platform::is_cpu_place(x->place())) {
phi::DenseTensor cpu_x; phi::DenseTensor cpu_x;
framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x); framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x);
#if defined(PADDLE_WITH_MLU)
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx = pool.Get(x->place());
dev_ctx->Wait();
#endif
value = cpu_x.data<T>()[0]; value = cpu_x.data<T>()[0];
} else { } else {
value = x->data<T>()[0]; value = x->data<T>()[0];
......
...@@ -78,11 +78,7 @@ if(WITH_ASCEND_CL) ...@@ -78,11 +78,7 @@ if(WITH_ASCEND_CL)
set(NPU_CTX_DEPS npu_stream npu_info) set(NPU_CTX_DEPS npu_stream npu_info)
endif() endif()
if(WITH_MLU) if(WITH_ASCEND_CL)
set(MLU_CTX_DEPS mlu_device_context)
endif()
if(WITH_ASCEND_CL OR WITH_MLU)
cc_library( cc_library(
stream_callback_manager stream_callback_manager
SRCS stream_callback_manager.cc SRCS stream_callback_manager.cc
...@@ -175,10 +171,6 @@ if(WITH_XPU) ...@@ -175,10 +171,6 @@ if(WITH_XPU)
target_link_libraries(device_context xpu_resource_pool) target_link_libraries(device_context xpu_resource_pool)
endif() endif()
if(WITH_MLU)
target_link_libraries(device_context mlu_resource_pool)
endif()
if(WITH_CUSTOM_DEVICE) if(WITH_CUSTOM_DEVICE)
target_link_libraries(device_context custom_device_resource_pool) target_link_libraries(device_context custom_device_resource_pool)
endif() endif()
......
...@@ -15,11 +15,6 @@ if(WITH_IPU) ...@@ -15,11 +15,6 @@ if(WITH_IPU)
add_subdirectory(ipu) add_subdirectory(ipu)
endif() endif()
# MLU
if(WITH_MLU)
add_subdirectory(mlu)
endif()
if(WITH_CUSTOM_DEVICE) if(WITH_CUSTOM_DEVICE)
add_subdirectory(custom) add_subdirectory(custom)
endif() endif()
...@@ -25,11 +25,6 @@ limitations under the License. */ ...@@ -25,11 +25,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/enforce.h"
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_info.h" #include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif #endif
......
...@@ -33,11 +33,6 @@ limitations under the License. */ ...@@ -33,11 +33,6 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#endif #endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#include "paddle/fluid/platform/device/mlu/device_context_allocator.h"
#endif
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -224,18 +219,6 @@ void EmplaceDeviceContexts( ...@@ -224,18 +219,6 @@ void EmplaceDeviceContexts(
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported. Please re-compile with WITH_GPU " "CUDAPlace is not supported. Please re-compile with WITH_GPU "
"option.")); "option."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
EmplaceDeviceContext<MLUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
platform::errors::Unimplemented("MLUPlace is not supported. Please "
"re-compile with WITH_MLU option."));
#endif #endif
} else if (platform::is_ipu_place(place)) { } else if (platform::is_ipu_place(place)) {
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
......
...@@ -135,10 +135,6 @@ class IPUDeviceContext ...@@ -135,10 +135,6 @@ class IPUDeviceContext
}; };
#endif #endif
#ifdef PADDLE_WITH_MLU
class MLUDeviceContext;
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
namespace xpu = baidu::xpu::api; namespace xpu = baidu::xpu::api;
using XPUDeviceContext = phi::XPUContext; using XPUDeviceContext = phi::XPUContext;
...@@ -173,11 +169,6 @@ struct DefaultDeviceContextType<phi::IPUPlace> { ...@@ -173,11 +169,6 @@ struct DefaultDeviceContextType<phi::IPUPlace> {
}; };
#endif #endif
#ifdef PADDLE_WITH_MLU
template <>
struct DefaultDeviceContextType<phi::MLUPlace>;
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <> template <>
struct DefaultDeviceContextType<phi::GPUPinnedPlace> { struct DefaultDeviceContextType<phi::GPUPinnedPlace> {
......
...@@ -36,10 +36,6 @@ limitations under the License. */ ...@@ -36,10 +36,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef WITH_WIN_DUMP_DBG #ifdef WITH_WIN_DUMP_DBG
#include <stdio.h> #include <stdio.h>
#include <time.h> #include <time.h>
...@@ -195,14 +191,6 @@ void InitDevices() { ...@@ -195,14 +191,6 @@ void InitDevices() {
LOG(WARNING) LOG(WARNING)
<< "Compiled with PADDLE_WITH_IPU, but no IPU found in runtime."; << "Compiled with PADDLE_WITH_IPU, but no IPU found in runtime.";
} }
#endif
#ifdef PADDLE_WITH_MLU
try {
// use user specified MLUs in single-node multi-process mode.
devices = platform::GetMLUSelectedDevices();
} catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_MLU, but no MLU found in runtime.";
}
#endif #endif
InitDevices(devices); InitDevices(devices);
}); });
...@@ -228,10 +216,6 @@ void InitDevices(const std::vector<int> devices) { ...@@ -228,10 +216,6 @@ void InitDevices(const std::vector<int> devices) {
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
places.emplace_back(platform::IPUPlace(devices[i])); places.emplace_back(platform::IPUPlace(devices[i]));
#endif #endif
#ifdef PADDLE_WITH_MLU
places.emplace_back(platform::MLUPlace(devices[i]));
#endif
} }
places.emplace_back(platform::CPUPlace()); places.emplace_back(platform::CPUPlace());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
...@@ -15,16 +15,13 @@ limitations under the License. */ ...@@ -15,16 +15,13 @@ limitations under the License. */
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#endif
TEST(InitDevices, CPU) { TEST(InitDevices, CPU) {
using paddle::framework::InitDevices; using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool; using paddle::platform::DeviceContextPool;
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU) && \ #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU) && \
!defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MLU) !defined(PADDLE_WITH_HIP)
InitDevices(); InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance(); DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.Size(), 1U); ASSERT_EQ(pool.Size(), 1U);
...@@ -55,18 +52,6 @@ TEST(InitDevices, XPU) { ...@@ -55,18 +52,6 @@ TEST(InitDevices, XPU) {
#endif #endif
} }
TEST(InitDevices, MLU) {
using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool;
#ifdef PADDLE_WITH_MLU
int count = paddle::platform::GetMLUDeviceCount();
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.Size(), 1U + static_cast<unsigned>(count));
#endif
}
#ifndef _WIN32 #ifndef _WIN32
TEST(SignalHandle, SignalHandle) { TEST(SignalHandle, SignalHandle) {
std::string msg = "Signal raises"; std::string msg = "Signal raises";
......
...@@ -32,7 +32,6 @@ using NPUPlace = phi::NPUPlace; ...@@ -32,7 +32,6 @@ using NPUPlace = phi::NPUPlace;
using NPUPinnedPlace = phi::NPUPinnedPlace; using NPUPinnedPlace = phi::NPUPinnedPlace;
using XPUPlace = phi::XPUPlace; using XPUPlace = phi::XPUPlace;
using IPUPlace = phi::IPUPlace; using IPUPlace = phi::IPUPlace;
using MLUPlace = phi::MLUPlace;
using CustomPlace = phi::CustomPlace; using CustomPlace = phi::CustomPlace;
using PlaceList = std::vector<Place>; using PlaceList = std::vector<Place>;
...@@ -110,15 +109,6 @@ typename Visitor::result_type VisitPlace(const Place &place, ...@@ -110,15 +109,6 @@ typename Visitor::result_type VisitPlace(const Place &place,
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with IPU. Cannot visit ipu device")); "Paddle is not compiled with IPU. Cannot visit ipu device"));
return typename Visitor::result_type(); return typename Visitor::result_type();
#endif
}
case phi::AllocationType::MLU: {
#ifdef PADDLE_WITH_MLU
platform::MLUPlace p(place.GetDeviceId());
return visitor(p);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with MLU. Cannot visit mlu device"));
#endif #endif
} }
case phi::AllocationType::CUSTOM: { case phi::AllocationType::CUSTOM: {
......
...@@ -19,7 +19,6 @@ TEST(Place, Equality) { ...@@ -19,7 +19,6 @@ TEST(Place, Equality) {
paddle::platform::CPUPlace cpu; paddle::platform::CPUPlace cpu;
paddle::platform::CUDAPlace g0(0), g1(1), gg0(0); paddle::platform::CUDAPlace g0(0), g1(1), gg0(0);
paddle::platform::XPUPlace x0(0), x1(1), xx0(0); paddle::platform::XPUPlace x0(0), x1(1), xx0(0);
paddle::platform::MLUPlace m0(0), m1(1), mm0(0);
EXPECT_EQ(cpu, cpu); EXPECT_EQ(cpu, cpu);
EXPECT_EQ(g0, g0); EXPECT_EQ(g0, g0);
...@@ -28,13 +27,9 @@ TEST(Place, Equality) { ...@@ -28,13 +27,9 @@ TEST(Place, Equality) {
EXPECT_EQ(x0, x0); EXPECT_EQ(x0, x0);
EXPECT_EQ(x1, x1); EXPECT_EQ(x1, x1);
EXPECT_EQ(x0, xx0); EXPECT_EQ(x0, xx0);
EXPECT_EQ(m0, m0);
EXPECT_EQ(m1, m1);
EXPECT_EQ(m0, mm0);
EXPECT_NE(g0, g1); EXPECT_NE(g0, g1);
EXPECT_NE(x0, x1); EXPECT_NE(x0, x1);
EXPECT_NE(m0, m1);
EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0)); EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0));
EXPECT_TRUE(paddle::platform::places_are_same_class(x0, xx0)); EXPECT_TRUE(paddle::platform::places_are_same_class(x0, xx0));
...@@ -49,11 +44,6 @@ TEST(Place, Print) { ...@@ -49,11 +44,6 @@ TEST(Place, Print) {
ss << paddle::platform::XPUPlace(1); ss << paddle::platform::XPUPlace(1);
EXPECT_EQ("Place(xpu:1)", ss.str()); EXPECT_EQ("Place(xpu:1)", ss.str());
} }
{
std::stringstream ss;
ss << paddle::platform::MLUPlace(1);
EXPECT_EQ("Place(mlu:1)", ss.str());
}
{ {
std::stringstream ss; std::stringstream ss;
ss << paddle::platform::CUDAPlace(1); ss << paddle::platform::CUDAPlace(1);
......
...@@ -6,7 +6,6 @@ cc_library( ...@@ -6,7 +6,6 @@ cc_library(
cuda_tracer cuda_tracer
SRCS cuda_tracer.cc cupti_data_process.cc SRCS cuda_tracer.cc cupti_data_process.cc
DEPS workqueue_utils enforce glog) DEPS workqueue_utils enforce glog)
add_subdirectory(mlu)
add_subdirectory(custom_device) add_subdirectory(custom_device)
cc_library( cc_library(
event_node event_node
...@@ -33,12 +32,7 @@ cc_library( ...@@ -33,12 +32,7 @@ cc_library(
cc_library( cc_library(
new_profiler new_profiler
SRCS profiler.cc SRCS profiler.cc
DEPS host_tracer DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind
cuda_tracer
profiler_utils
cpu_utilization
event_bind
mlu_tracer
custom_tracer) custom_tracer)
cc_test( cc_test(
test_event_node test_event_node
......
...@@ -790,11 +790,7 @@ void ChromeTracingLogger::RefineDisplayName( ...@@ -790,11 +790,7 @@ void ChromeTracingLogger::RefineDisplayName(
(*it).second * 2 + 1); (*it).second * 2 + 1);
} }
#ifdef PADDLE_WITH_MLU
static std::string device_type("MLU");
#else
static std::string device_type("GPU"); static std::string device_type("GPU");
#endif
for (auto it = deviceid_streamid_set_.begin(); for (auto it = deviceid_streamid_set_.begin();
it != deviceid_streamid_set_.end(); it != deviceid_streamid_set_.end();
......
if(WITH_MLU)
set(MLU_INFO mlu_info)
endif()
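# MLU_INFO stays empty when WITH_MLU is OFF, so mlu_tracer is still built but
# does not link against mlu_info in that case.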
cc_library(
mlu_tracer
SRCS mlu_tracer.cc cnpapi_data_process.cc
DEPS workqueue_utils enforce glog ${MLU_INFO})
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h"
#include <cstdio>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h"
#ifdef PADDLE_WITH_MLU
namespace paddle {
namespace platform {
namespace {
inline uint64_t GetTimeGap() {
static uint64_t time_gap = []() -> uint64_t {
uint64_t cpu_time = PosixInNsec();
uint64_t mlu_time = cnpapiGetTimestamp();
return (cpu_time - mlu_time);
}();
return time_gap;
}
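// The CPU and MLU timestamp sources are not aligned, so the offset between
// them is sampled once and added to every device-side timestamp to place
// kernel/memcpy/memset events on the host timeline.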
void AddKernelRecord(const cnpapiActivityKernel* kernel,
uint64_t start_ns,
TraceEventCollector* collector) {
static uint64_t time_gap = GetTimeGap();
if (kernel->start + time_gap < start_ns) {
return;
}
DeviceTraceEvent event;
event.name = demangle(kernel->name);
event.type = TracerEventType::Kernel;
event.start_ns = kernel->start + time_gap;
event.end_ns = kernel->end + time_gap;
event.device_id = kernel->device_id;
event.context_id = kernel->context_id;
event.stream_id = kernel->queue_id;
event.correlation_id = kernel->correlation_id;
event.kernel_info.block_x = kernel->dimx;
event.kernel_info.block_y = kernel->dimy;
event.kernel_info.block_z = kernel->dimz;
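// The generic kernel event has no dedicated field for the MLU kernel type, so
// it is recorded in the grid_x slot and the remaining grid fields are zeroed.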
event.kernel_info.grid_x = kernel->kernel_type;
event.kernel_info.grid_y = 0;
event.kernel_info.grid_z = 0;
event.kernel_info.queued = kernel->queued;
event.kernel_info.submitted = kernel->submitted;
event.kernel_info.completed = kernel->received;
collector->AddDeviceEvent(std::move(event));
}
const char* MemcpyKind(cnpapiActivityMemcpyType kind) {
switch (kind) {
case CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOD:
return "MEMCPY_HtoD";
case CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOH:
return "MEMCPY_DtoH";
case CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOD:
return "MEMCPY_DtoD";
case CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOH:
return "MEMCPY_HtoH";
case CNPAPI_ACTIVITY_MEMCPY_TYPE_PTOP:
return "MEMCPY_PtoP";
default:
break;
}
return "MEMCPY";
}
void AddMemcpyRecord(const cnpapiActivityMemcpy* memcpy,
uint64_t start_ns,
TraceEventCollector* collector) {
static uint64_t time_gap = GetTimeGap();
if (memcpy->start + time_gap < start_ns) {
return;
}
DeviceTraceEvent event;
event.name = MemcpyKind(memcpy->copy_type);
event.type = TracerEventType::Memcpy;
event.start_ns = memcpy->start + time_gap;
event.end_ns = memcpy->end + time_gap;
event.device_id = memcpy->device_id;
event.context_id = memcpy->context_id;
event.stream_id = memcpy->queue_id;
event.correlation_id = memcpy->correlation_id;
event.memcpy_info.num_bytes = memcpy->bytes;
snprintf(event.memcpy_info.copy_kind,
phi::kMemKindMaxLen,
"%s",
MemcpyKind(memcpy->copy_type));
collector->AddDeviceEvent(std::move(event));
}
void AddMemcpy2Record(const cnpapiActivityMemcpyPtoP* memcpy2,
uint64_t start_ns,
TraceEventCollector* collector) {
static uint64_t time_gap = GetTimeGap();
if (memcpy2->start + time_gap < start_ns) {
return;
}
DeviceTraceEvent event;
event.name = MemcpyKind(memcpy2->copy_type);
event.type = TracerEventType::Memcpy;
event.start_ns = memcpy2->start + time_gap;
event.end_ns = memcpy2->end + time_gap;
event.device_id = memcpy2->device_id;
event.context_id = memcpy2->context_id;
event.stream_id = memcpy2->queue_id;
event.correlation_id = memcpy2->correlation_id;
event.memcpy_info.num_bytes = memcpy2->bytes;
snprintf(event.memcpy_info.copy_kind,
phi::kMemKindMaxLen,
"%s",
MemcpyKind(memcpy2->copy_type));
collector->AddDeviceEvent(std::move(event));
}
void AddMemsetRecord(const cnpapiActivityMemset* memset,
uint64_t start_ns,
TraceEventCollector* collector) {
static uint64_t time_gap = GetTimeGap();
if (memset->start + time_gap < start_ns) {
return;
}
DeviceTraceEvent event;
event.name = "MEMSET";
event.type = TracerEventType::Memset;
event.start_ns = memset->start + time_gap;
event.end_ns = memset->end + time_gap;
event.device_id = memset->device_id;
event.context_id = memset->context_id;
event.stream_id = memset->queue_id;
event.correlation_id = memset->correlation_id;
event.memset_info.num_bytes = memset->bytes;
event.memset_info.value = memset->value;
collector->AddDeviceEvent(std::move(event));
}
class CnpapiRuntimeCbidStr {
public:
static const CnpapiRuntimeCbidStr& GetInstance() {
static CnpapiRuntimeCbidStr inst;
return inst;
}
std::string RuntimeKind(cnpapi_CallbackId cbid) const {
auto iter = cbid_str_.find(cbid);
if (iter == cbid_str_.end()) {
return "MLU Runtime API " + std::to_string(cbid);
}
return iter->second;
}
private:
CnpapiRuntimeCbidStr();
std::unordered_map<cnpapi_CallbackId, std::string> cbid_str_;
};
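// The constructor below registers a readable name for each traced CNDRV
// callback id; RuntimeKind() falls back to "MLU Runtime API <cbid>" for ids
// that were not registered.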
CnpapiRuntimeCbidStr::CnpapiRuntimeCbidStr() {
#define REGISTER_RUNTIME_CBID_STR(cbid) \
cbid_str_[CNPAPI_CNDRV_TRACE_CBID_##cbid] = #cbid
REGISTER_RUNTIME_CBID_STR(cnMalloc);
REGISTER_RUNTIME_CBID_STR(cnMallocHost);
REGISTER_RUNTIME_CBID_STR(cnFree);
REGISTER_RUNTIME_CBID_STR(cnFreeHost);
REGISTER_RUNTIME_CBID_STR(cnMemcpy);
REGISTER_RUNTIME_CBID_STR(cnMemcpyPeer);
REGISTER_RUNTIME_CBID_STR(cnMemcpyHtoD);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoH);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD);
REGISTER_RUNTIME_CBID_STR(cnMemcpyAsync);
REGISTER_RUNTIME_CBID_STR(cnMemcpyHtoDAsync);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoHAsync);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoDAsync);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD2D);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD3D);
REGISTER_RUNTIME_CBID_STR(cnMemcpy2D);
REGISTER_RUNTIME_CBID_STR(cnMemcpy3D);
REGISTER_RUNTIME_CBID_STR(cnMemsetD8);
REGISTER_RUNTIME_CBID_STR(cnMemsetD16);
REGISTER_RUNTIME_CBID_STR(cnMemsetD32);
REGISTER_RUNTIME_CBID_STR(cnMemsetD8Async);
REGISTER_RUNTIME_CBID_STR(cnMemsetD16Async);
REGISTER_RUNTIME_CBID_STR(cnMemsetD32Async);
REGISTER_RUNTIME_CBID_STR(cnInvokeKernel);
REGISTER_RUNTIME_CBID_STR(cnCreateQueue);
REGISTER_RUNTIME_CBID_STR(cnDestroyQueue);
REGISTER_RUNTIME_CBID_STR(cnQueueSync);
REGISTER_RUNTIME_CBID_STR(cnQueueWaitNotifier);
REGISTER_RUNTIME_CBID_STR(cnWaitNotifier);
REGISTER_RUNTIME_CBID_STR(cnCreateNotifier);
REGISTER_RUNTIME_CBID_STR(cnDestroyNotifier);
REGISTER_RUNTIME_CBID_STR(cnPlaceNotifier);
REGISTER_RUNTIME_CBID_STR(cnCtxCreate);
REGISTER_RUNTIME_CBID_STR(cnCtxDestroy);
REGISTER_RUNTIME_CBID_STR(cnCtxGetCurrent);
REGISTER_RUNTIME_CBID_STR(cnCtxSetCurrent);
REGISTER_RUNTIME_CBID_STR(cnCtxGetDevice);
REGISTER_RUNTIME_CBID_STR(cnCtxSync);
REGISTER_RUNTIME_CBID_STR(cnInvokeHostFunc);
#undef REGISTER_RUNTIME_CBID_STR
}
void AddApiRecord(const cnpapiActivityAPI* api,
uint64_t start_ns,
TraceEventCollector* collector) {
static uint64_t time_gap = GetTimeGap();
if (api->start + time_gap < start_ns) {
return;
}
RuntimeTraceEvent event;
event.name = CnpapiRuntimeCbidStr::GetInstance().RuntimeKind(api->cbid);
event.start_ns = api->start + time_gap;
event.end_ns = api->end + time_gap;
event.process_id = api->process_id;
event.thread_id = api->thread_id;
event.correlation_id = api->correlation_id;
event.callback_id = api->cbid;
event.type = TracerEventType::MluRuntime;
collector->AddRuntimeEvent(std::move(event));
}
} // namespace
namespace details {
void ProcessCnpapiActivityRecord(const cnpapiActivity* record,
uint64_t start_ns,
TraceEventCollector* collector) {
switch (record->type) {
case CNPAPI_ACTIVITY_TYPE_KERNEL:
AddKernelRecord(reinterpret_cast<const cnpapiActivityKernel*>(record),
start_ns,
collector);
break;
case CNPAPI_ACTIVITY_TYPE_MEMCPY:
AddMemcpyRecord(reinterpret_cast<const cnpapiActivityMemcpy*>(record),
start_ns,
collector);
break;
case CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP:
AddMemcpy2Record(
reinterpret_cast<const cnpapiActivityMemcpyPtoP*>(record),
start_ns,
collector);
break;
case CNPAPI_ACTIVITY_TYPE_MEMSET:
AddMemsetRecord(reinterpret_cast<const cnpapiActivityMemset*>(record),
start_ns,
collector);
break;
case CNPAPI_ACTIVITY_TYPE_CNDRV_API:
AddApiRecord(reinterpret_cast<const cnpapiActivityAPI*>(record),
start_ns,
collector);
break;
default:
break;
}
}
} // namespace details
} // namespace platform
} // namespace paddle
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unordered_map>
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/profiler/trace_event_collector.h"
namespace paddle {
namespace platform {
namespace details {
#ifdef PADDLE_WITH_MLU
void ProcessCnpapiActivityRecord(const cnpapiActivity* record,
uint64_t start_ns,
TraceEventCollector* collector);
#endif
} // namespace details
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
#include <string>
#include <unordered_map>
#include "glog/logging.h"
#include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h"
#define CNPAPI_CALL(call) \
do { \
cnpapiResult _status = call; \
if (_status != CNPAPI_SUCCESS) { \
const char* errstr; \
cnpapiGetResultString(_status, &errstr); \
LOG(ERROR) << "Function " << #call << " failed with error " << errstr; \
} \
} while (0)
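// CNPAPI_CALL only logs failures instead of throwing, so profiling errors are
// reported without aborting the program.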
namespace paddle {
namespace platform {
namespace {
void BufferRequestedCallback(uint64_t** buffer,
size_t* size,
size_t* max_num_records) {
constexpr size_t kBufferSize = 1 << 23; // 8 MB
constexpr size_t kBufferAlignSize = 8;
*buffer = reinterpret_cast<uint64_t*>(
paddle::framework::AlignedMalloc(kBufferSize, kBufferAlignSize));
*size = kBufferSize;
*max_num_records = 0;
}
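// cnpapi fills the 8 MB aligned buffer handed out above and returns it through
// BufferCompletedCallback, which forwards the completed records to the tracer
// and then frees the buffer.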
void BufferCompletedCallback(uint64_t* buffer, size_t size, size_t valid_size) {
if (buffer == nullptr || valid_size == 0) {
return;
}
auto mlu_tracer = &MluTracer::GetInstance();
mlu_tracer->ProcessCnpapiActivity(buffer, valid_size);
paddle::framework::AlignedFree(buffer);
}
} // namespace
MluTracer::MluTracer() {
#ifdef PADDLE_WITH_MLU
CNPAPI_CALL(cnpapiInit());
CNPAPI_CALL(cnpapiActivityRegisterCallbacks(BufferRequestedCallback,
BufferCompletedCallback));
#endif
}
void MluTracer::PrepareTracing() {
PADDLE_ENFORCE_EQ(
state_ == TracerState::UNINITED || state_ == TracerState::STOPED,
true,
platform::errors::PreconditionNotMet("MluTracer must be UNINITED"));
EnableCnpapiActivity();
state_ = TracerState::READY;
}
void MluTracer::StartTracing() {
PADDLE_ENFORCE_EQ(state_ == TracerState::READY,
true,
platform::errors::PreconditionNotMet(
"MluTracer must be READY"));
tracing_start_ns_ = PosixInNsec();
state_ = TracerState::STARTED;
}
void MluTracer::StopTracing() {
PADDLE_ENFORCE_EQ(
state_,
TracerState::STARTED,
platform::errors::PreconditionNotMet("MluTracer must be STARTED"));
DisableCnpapiActivity();
state_ = TracerState::STOPED;
}
void MluTracer::CollectTraceData(TraceEventCollector* collector) {
PADDLE_ENFORCE_EQ(
state_,
TracerState::STOPED,
platform::errors::PreconditionNotMet("MluTracer must be STOPED"));
for (auto he : collector_.HostEvents()) {
collector->AddHostEvent(std::move(he));
}
for (auto rte : collector_.RuntimeEvents()) {
collector->AddRuntimeEvent(std::move(rte));
}
for (auto de : collector_.DeviceEvents()) {
collector->AddDeviceEvent(std::move(de));
}
for (auto tn : collector_.ThreadNames()) {
collector->AddThreadName(tn.first, tn.second);
}
collector_.ClearAll();
}
void MluTracer::ProcessCnpapiActivity(uint64_t* buffer, size_t valid_size) {
#ifdef PADDLE_WITH_MLU
cnpapiActivity* record = nullptr;
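// Drain all records from the completed buffer; the loop stops when cnpapi
// reports that no more records are available (or that memory is insufficient),
// and any other failure is logged via CNPAPI_CALL.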
while (true) {
cnpapiResult status =
cnpapiActivityGetNextRecord(buffer, valid_size, &record);
if (status == CNPAPI_SUCCESS) {
details::ProcessCnpapiActivityRecord(
record, tracing_start_ns_, &collector_);
} else if (status == CNPAPI_ERROR_INSUFFICIENT_MEMORY ||
status == CNPAPI_ERROR_MAX_LIMIT_REACHED) {
break;
} else {
CNPAPI_CALL(status);
}
}
#endif
}
void MluTracer::EnableCnpapiActivity() {
#ifdef PADDLE_WITH_MLU
CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_KERNEL));
CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMCPY));
CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP));
CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMSET));
CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_CNDRV_API));
VLOG(3) << "enable cnpapi activity";
#endif
}
void MluTracer::DisableCnpapiActivity() {
#ifdef PADDLE_WITH_MLU
CNPAPI_CALL(cnpapiActivityFlushAll());
CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_KERNEL));
CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMCPY));
CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP));
CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMSET));
CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_CNDRV_API));
VLOG(3) << "disable cnpapi activity";
#endif
}
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <vector>
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/profiler/tracer_base.h"
namespace paddle {
namespace platform {
class MluTracer : public TracerBase {
public:
static MluTracer& GetInstance() {
static MluTracer instance;
return instance;
}
void PrepareTracing() override;
void StartTracing() override;
void StopTracing() override;
void CollectTraceData(TraceEventCollector* collector) override;
void ProcessCnpapiActivity(uint64_t* buffer, size_t valid_size);
private:
MluTracer();
DISABLE_COPY_AND_ASSIGN(MluTracer);
void EnableCnpapiActivity();
void DisableCnpapiActivity();
uint64_t tracing_start_ns_ = UINT64_MAX;
TraceEventCollector collector_;
};
} // namespace platform
} // namespace paddle
...@@ -29,10 +29,6 @@ ...@@ -29,10 +29,6 @@
#include "paddle/fluid/platform/profiler/custom_device/custom_tracer.h" #include "paddle/fluid/platform/profiler/custom_device/custom_tracer.h"
#include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/extra_info.h"
#include "paddle/fluid/platform/profiler/host_tracer.h" #include "paddle/fluid/platform/profiler/host_tracer.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/enforce.h"
#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
#endif
#include "paddle/fluid/platform/profiler/trace_event_collector.h" #include "paddle/fluid/platform/profiler/trace_event_collector.h"
#include "paddle/fluid/platform/profiler/utils.h" #include "paddle/fluid/platform/profiler/utils.h"
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
...@@ -49,9 +45,6 @@ void SynchronizeDevice() { ...@@ -49,9 +45,6 @@ void SynchronizeDevice() {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
#endif #endif
#ifdef PADDLE_WITH_MLU
PADDLE_ENFORCE_MLU_SUCCESS(cnrtSyncDevice());
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes(); auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : dev_types) { for (const auto& dev_type : dev_types) {
...@@ -86,9 +79,6 @@ bool Profiler::IsCuptiSupported() { ...@@ -86,9 +79,6 @@ bool Profiler::IsCuptiSupported() {
bool Profiler::IsCnpapiSupported() { bool Profiler::IsCnpapiSupported() {
bool supported = false; bool supported = false;
#ifdef PADDLE_WITH_MLU
supported = true;
#endif
return supported; return supported;
} }
...@@ -104,11 +94,6 @@ Profiler::Profiler(const ProfilerOptions& options, ...@@ -104,11 +94,6 @@ Profiler::Profiler(const ProfilerOptions& options,
if (trace_switch.test(kProfileGPUOptionBit)) { if (trace_switch.test(kProfileGPUOptionBit)) {
tracers_.emplace_back(&CudaTracer::GetInstance(), false); tracers_.emplace_back(&CudaTracer::GetInstance(), false);
} }
#ifdef PADDLE_WITH_MLU
if (trace_switch.test(kProfileMLUOptionBit)) {
tracers_.emplace_back(&MluTracer::GetInstance(), false);
}
#endif
if (trace_switch.test(kProfileCustomDeviceOptionBit)) { if (trace_switch.test(kProfileCustomDeviceOptionBit)) {
for (const auto& dev_type : custom_device_types) { for (const auto& dev_type : custom_device_types) {
tracers_.emplace_back(&CustomTracer::GetInstance(dev_type), false); tracers_.emplace_back(&CustomTracer::GetInstance(dev_type), false);
......
...@@ -34,10 +34,6 @@ limitations under the License. */ ...@@ -34,10 +34,6 @@ limitations under the License. */
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
#endif #endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/enforce.h"
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/backends/device_manager.h" #include "paddle/phi/backends/device_manager.h"
#endif #endif
...@@ -112,13 +108,6 @@ void SynchronizeAllDevice() { ...@@ -112,13 +108,6 @@ void SynchronizeAllDevice() {
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
} }
#endif #endif
#ifdef PADDLE_WITH_MLU
int count = GetMLUDeviceCount();
for (int i = 0; i < count; i++) {
SetMLUDeviceId(i);
PADDLE_ENFORCE_MLU_SUCCESS(cnrtSyncDevice());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes(); auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto &dev_type : dev_types) { for (const auto &dev_type : dev_types) {
......
...@@ -32,10 +32,6 @@ static void StreamCallbackFunc(gpuStream_t stream, ...@@ -32,10 +32,6 @@ static void StreamCallbackFunc(gpuStream_t stream,
StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void *user_data) StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void *user_data)
#endif #endif
#endif #endif
#if PADDLE_WITH_MLU
static void StreamCallbackFunc(void *user_data)
#endif
{ {
std::unique_ptr<std::function<void()>> func( std::unique_ptr<std::function<void()>> func(
reinterpret_cast<std::function<void()> *>(user_data)); reinterpret_cast<std::function<void()> *>(user_data));
...@@ -71,20 +67,12 @@ void StreamCallbackManager<Stream>::AddCallback( ...@@ -71,20 +67,12 @@ void StreamCallbackManager<Stream>::AddCallback(
cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0));
#endif #endif
#endif #endif
#if PADDLE_WITH_MLU
VLOG(3) << "MLULaunchCallback at stream: " << stream_;
cnrtInvokeHostFunc(stream_, StreamCallbackFunc, func);
#endif
} }
template <typename Stream> template <typename Stream>
void StreamCallbackManager<Stream>::Wait() const { void StreamCallbackManager<Stream>::Wait() const {
#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA)
platform::GpuStreamSync(stream_); platform::GpuStreamSync(stream_);
#endif
#ifdef PADDLE_WITH_MLU
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_));
#endif #endif
{ {
std::lock_guard<std::mutex> lock(mtx_); std::lock_guard<std::mutex> lock(mtx_);
...@@ -100,10 +88,5 @@ template struct StreamCallbackManager<gpuStream_t>; ...@@ -100,10 +88,5 @@ template struct StreamCallbackManager<gpuStream_t>;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
template struct StreamCallbackManager<hipStream_t>; template struct StreamCallbackManager<hipStream_t>;
#endif #endif
#ifdef PADDLE_WITH_MLU
template struct StreamCallbackManager<mluStream>;
#endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -151,8 +151,6 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { ...@@ -151,8 +151,6 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) {
return place_obj.cast<platform::IPUPlace>(); return place_obj.cast<platform::IPUPlace>();
} else if (py::isinstance<platform::Place>(place_obj)) { } else if (py::isinstance<platform::Place>(place_obj)) {
return place_obj.cast<platform::Place>(); return place_obj.cast<platform::Place>();
} else if (py::isinstance<platform::MLUPlace>(place_obj)) {
return place_obj.cast<platform::MLUPlace>();
} else if (py::isinstance<platform::CustomPlace>(place_obj)) { } else if (py::isinstance<platform::CustomPlace>(place_obj)) {
return place_obj.cast<platform::CustomPlace>(); return place_obj.cast<platform::CustomPlace>();
} else { } else {
...@@ -207,8 +205,6 @@ static void InitVarBaseAndTensor(imperative::VarBase *self, ...@@ -207,8 +205,6 @@ static void InitVarBaseAndTensor(imperative::VarBase *self,
SetTensorFromPyArray<platform::NPUPlace>(tensor, array, place, zero_copy); SetTensorFromPyArray<platform::NPUPlace>(tensor, array, place, zero_copy);
} else if (platform::is_ipu_place(place)) { } else if (platform::is_ipu_place(place)) {
SetTensorFromPyArray<platform::IPUPlace>(tensor, array, place, zero_copy); SetTensorFromPyArray<platform::IPUPlace>(tensor, array, place, zero_copy);
} else if (platform::is_mlu_place(place)) {
SetTensorFromPyArray<platform::MLUPlace>(tensor, array, place, zero_copy);
} else if (platform::is_custom_place(place)) { } else if (platform::is_custom_place(place)) {
SetTensorFromPyArray<platform::CustomPlace>( SetTensorFromPyArray<platform::CustomPlace>(
tensor, array, place, zero_copy); tensor, array, place, zero_copy);
...@@ -727,14 +723,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -727,14 +723,6 @@ void BindImperative(py::module *m_ptr) {
py::arg("zero_copy") = false, py::arg("zero_copy") = false,
py::arg("name") = "", py::arg("name") = "",
py::arg("stop_gradient") = -1) py::arg("stop_gradient") = -1)
.def("__init__",
&InitVarBaseFromNumpyWithArg<platform::MLUPlace>,
py::arg("value"),
py::arg("place"),
py::arg("persistable") = false,
py::arg("zero_copy") = false,
py::arg("name") = "",
py::arg("stop_gradient") = -1)
.def("__init__", .def("__init__",
&InitVarBaseFromNumpyWithArg<platform::CustomPlace>, &InitVarBaseFromNumpyWithArg<platform::CustomPlace>,
py::arg("value"), py::arg("value"),
...@@ -773,11 +761,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -773,11 +761,6 @@ void BindImperative(py::module *m_ptr) {
py::arg("tensor"), py::arg("tensor"),
py::arg("place"), py::arg("place"),
py::arg("name") = "") py::arg("name") = "")
.def("__init__",
&InitVarBaseFromTensorWithArg<platform::MLUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("name") = "")
.def("__init__", .def("__init__",
&InitVarBaseFromTensorWithArg<platform::CustomPlace>, &InitVarBaseFromTensorWithArg<platform::CustomPlace>,
py::arg("tensor"), py::arg("tensor"),
...@@ -1878,18 +1861,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -1878,18 +1861,6 @@ void BindImperative(py::module *m_ptr) {
return new_var; return new_var;
}, },
py::return_value_policy::copy) py::return_value_policy::copy)
.def(
"_copy_to",
[](const std::shared_ptr<imperative::VarBase> &self,
const platform::MLUPlace &place,
bool blocking) {
auto new_var = self->NewVarBase(place, blocking);
if (!blocking) {
IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
}
return new_var;
},
py::return_value_policy::copy)
.def( .def(
"_copy_to", "_copy_to",
[](const std::shared_ptr<imperative::VarBase> &self, [](const std::shared_ptr<imperative::VarBase> &self,
...@@ -2217,11 +2188,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -2217,11 +2188,6 @@ void BindImperative(py::module *m_ptr) {
self.SetExpectedPlace(*p); self.SetExpectedPlace(*p);
VLOG(4) << "Tracer(" << &self << ")" VLOG(4) << "Tracer(" << &self << ")"
<< " set expected place " << *p; << " set expected place " << *p;
} else if (py::isinstance<platform::MLUPlace>(obj)) {
auto p = obj.cast<platform::MLUPlace *>();
self.SetExpectedPlace(*p);
VLOG(4) << "Tracer(" << &self << ")"
<< " set expected place " << *p;
} else if (py::isinstance<platform::CustomPlace>(obj)) { } else if (py::isinstance<platform::CustomPlace>(obj)) {
auto p = obj.cast<platform::CustomPlace *>(); auto p = obj.cast<platform::CustomPlace *>();
self.SetExpectedPlace(*p); self.SetExpectedPlace(*p);
...@@ -2412,28 +2378,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -2412,28 +2378,6 @@ void BindImperative(py::module *m_ptr) {
inplace_map); inplace_map);
} }
}) })
.def("trace",
[](imperative::Tracer &self,
const std::string &type,
const PyNameVarBaseMap &ins,
const PyNameVarBaseMap &outs,
framework::AttributeMap attrs,
const platform::MLUPlace &place,
bool trace_backward,
const std::map<std::string, std::string> &inplace_map = {}) {
auto ins_map = ConvertToNameVarBaseMap(ins);
auto outs_map = ConvertToNameVarBaseMap(outs);
{
py::gil_scoped_release release;
self.TraceOp<imperative::VarBase>(type,
std::move(ins_map),
std::move(outs_map),
std::move(attrs),
place,
trace_backward,
inplace_map);
}
})
.def("trace", .def("trace",
[](imperative::Tracer &self, [](imperative::Tracer &self,
const std::string &type, const std::string &type,
...@@ -2505,7 +2449,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -2505,7 +2449,6 @@ void BindImperative(py::module *m_ptr) {
m.def("varbase_copy", &VarBaseCopy<platform::CUDAPinnedPlace>); m.def("varbase_copy", &VarBaseCopy<platform::CUDAPinnedPlace>);
m.def("varbase_copy", &VarBaseCopy<platform::NPUPlace>); m.def("varbase_copy", &VarBaseCopy<platform::NPUPlace>);
m.def("varbase_copy", &VarBaseCopy<platform::CustomPlace>); m.def("varbase_copy", &VarBaseCopy<platform::CustomPlace>);
m.def("varbase_copy", &VarBaseCopy<platform::MLUPlace>);
m.def( m.def(
"dygraph_partial_grad", "dygraph_partial_grad",
...@@ -2616,19 +2559,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -2616,19 +2559,6 @@ void BindImperative(py::module *m_ptr) {
py::arg("ring_id")); py::arg("ring_id"));
#endif #endif
#if defined(PADDLE_WITH_CNCL)
py::class_<imperative::CNCLParallelContext,
imperative::ParallelContext,
std::shared_ptr<imperative::CNCLParallelContext>>(
m, "CNCLParallelContext")
.def(py::init<const imperative::ParallelStrategy &,
const platform::MLUPlace &>())
.def("init", [](imperative::CNCLParallelContext &self) { self.Init(); })
.def("init_with_ring_id",
&imperative::CNCLParallelContext::InitWithRingID,
py::arg("ring_id"));
#endif
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) defined(PADDLE_WITH_XPU_BKCL)
py::class_<imperative::HeterParallelContext, py::class_<imperative::HeterParallelContext,
......
...@@ -152,10 +152,6 @@ limitations under the License. */ ...@@ -152,10 +152,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h" #include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif #endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO #ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h" #include "paddle/fluid/pybind/crypto.h"
#endif #endif
......
...@@ -152,10 +152,6 @@ limitations under the License. */ ...@@ -152,10 +152,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h" #include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif #endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO #ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h" #include "paddle/fluid/pybind/crypto.h"
#endif #endif
...@@ -194,7 +190,6 @@ PyTypeObject *g_cpuplace_pytype = nullptr; ...@@ -194,7 +190,6 @@ PyTypeObject *g_cpuplace_pytype = nullptr;
PyTypeObject *g_xpuplace_pytype = nullptr; PyTypeObject *g_xpuplace_pytype = nullptr;
PyTypeObject *g_npuplace_pytype = nullptr; PyTypeObject *g_npuplace_pytype = nullptr;
PyTypeObject *g_cudapinnedplace_pytype = nullptr; PyTypeObject *g_cudapinnedplace_pytype = nullptr;
PyTypeObject *g_mluplace_pytype = nullptr;
PyTypeObject *g_ipuplace_pytype = nullptr; PyTypeObject *g_ipuplace_pytype = nullptr;
template <typename PlaceType> template <typename PlaceType>
...@@ -371,7 +366,6 @@ void BindPlace(pybind11::module &m) { // NOLINT ...@@ -371,7 +366,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>) .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>) .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::NPUPlace>) .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::MLUPlace>)
.def("_equals", .def("_equals",
&IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>) &IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
.def("_get_device_id", .def("_get_device_id",
...@@ -614,82 +608,8 @@ void BindPlace(pybind11::module &m) { // NOLINT ...@@ -614,82 +608,8 @@ void BindPlace(pybind11::module &m) { // NOLINT
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::IPUPlace>) .def("_equals", &IsSamePlace<platform::IPUPlace, platform::IPUPlace>)
.def("_equals", .def("_equals",
&IsSamePlace<platform::IPUPlace, platform::CUDAPinnedPlace>) &IsSamePlace<platform::IPUPlace, platform::CUDAPinnedPlace>)
#ifdef PADDLE_WITH_IPU
.def("get_device_id",
[](const platform::IPUPlace &self) { return self.GetDeviceId(); })
#endif
.def("__str__", string::to_string<const platform::IPUPlace &>); .def("__str__", string::to_string<const platform::IPUPlace &>);
// MLUPlace
py::class_<platform::MLUPlace> mluplace(m, "MLUPlace", R"DOC(
MLUPlace is a descriptor of a device.
It represents a MLU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
# required: mlu
mlu_place = paddle.MLUPlace(0)
)DOC");
g_mluplace_pytype = reinterpret_cast<PyTypeObject *>(mluplace.ptr());
mluplace
.def("__init__",
[](platform::MLUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_MLU
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid MLUPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetMLUDeviceCount())) {
if (platform::GetMLUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use MLU because there is no MLU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid MLUPlace(%d), must inside [0, %d), because MLU "
"number on your machine is %d",
dev_id,
platform::GetMLUDeviceCount(),
platform::GetMLUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::MLUPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use MLU because you have installed CPU/GPU/... "
"version "
"PaddlePaddle.\n"
"If you want to use MLU, please try to install MLU version "
"PaddlePaddle by: pip install paddlepaddle-mlu\n"
"If you only have CPU, please change MLUPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::MLUPlace>)
#ifdef PADDLE_WITH_MLU
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::MLUPlace>)
.def("_equals",
&IsSamePlace<platform::MLUPlace, platform::CUDAPinnedPlace>)
.def("get_device_id",
[](const platform::MLUPlace &self) { return self.GetDeviceId(); })
#endif
.def("__str__", string::to_string<const platform::MLUPlace &>);
py::class_<platform::Place> platformplace(m, "Place"); py::class_<platform::Place> platformplace(m, "Place");
g_place_pytype = reinterpret_cast<PyTypeObject *>(platformplace.ptr()); g_place_pytype = reinterpret_cast<PyTypeObject *>(platformplace.ptr());
platformplace.def(py::init<>()) platformplace.def(py::init<>())
...@@ -701,7 +621,6 @@ void BindPlace(pybind11::module &m) { // NOLINT ...@@ -701,7 +621,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
.def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>) .def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::IPUPlace>) .def("_equals", &IsSamePlace<platform::Place, platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>) .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::MLUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CustomPlace>) .def("_equals", &IsSamePlace<platform::Place, platform::CustomPlace>)
.def("is_gpu_place", .def("is_gpu_place",
[](platform::Place &self) { return platform::is_gpu_place(self); }) [](platform::Place &self) { return platform::is_gpu_place(self); })
...@@ -758,10 +677,6 @@ void BindPlace(pybind11::module &m) { // NOLINT ...@@ -758,10 +677,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
[](platform::Place &self, const platform::IPUPlace &ipu_place) { [](platform::Place &self, const platform::IPUPlace &ipu_place) {
self = ipu_place; self = ipu_place;
}) })
.def("set_place",
[](platform::Place &self, const platform::MLUPlace &mlu_place) {
self = mlu_place;
})
.def("set_place", .def("set_place",
[](platform::Place &self, const platform::CustomPlace &plug_place) { [](platform::Place &self, const platform::CustomPlace &plug_place) {
self = plug_place; self = plug_place;
......
...@@ -152,10 +152,6 @@ limitations under the License. */ ...@@ -152,10 +152,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h" #include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif #endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO #ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h" #include "paddle/fluid/pybind/crypto.h"
#endif #endif
...@@ -252,10 +248,6 @@ void BindTensor(pybind11::module &m) { // NOLINT ...@@ -252,10 +248,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
[](phi::DenseTensor &self, paddle::platform::NPUPlace &place) { [](phi::DenseTensor &self, paddle::platform::NPUPlace &place) {
self.mutable_data<float>(place); self.mutable_data<float>(place);
}) })
.def("_alloc_float",
[](phi::DenseTensor &self, paddle::platform::MLUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_double", .def("_alloc_double",
[](phi::DenseTensor &self, paddle::platform::CPUPlace &place) { [](phi::DenseTensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<double>(place); self.mutable_data<double>(place);
...@@ -276,10 +268,6 @@ void BindTensor(pybind11::module &m) { // NOLINT ...@@ -276,10 +268,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
[](phi::DenseTensor &self, paddle::platform::CUDAPlace &place) { [](phi::DenseTensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<int>(place); self.mutable_data<int>(place);
}) })
.def("_alloc_int",
[](phi::DenseTensor &self, paddle::platform::MLUPlace &place) {
self.mutable_data<int>(place);
})
.def( .def(
"_alloc_int", "_alloc_int",
[](phi::DenseTensor &self, paddle::platform::CUDAPinnedPlace &place) { [](phi::DenseTensor &self, paddle::platform::CUDAPinnedPlace &place) {
...@@ -325,13 +313,6 @@ void BindTensor(pybind11::module &m) { // NOLINT ...@@ -325,13 +313,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
return reinterpret_cast<uintptr_t>( return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type))); self.mutable_data(place, framework::TransToPhiDataType(type)));
}) })
.def("_mutable_data",
[](phi::DenseTensor &self,
paddle::platform::MLUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_clear", &phi::DenseTensor::clear) .def("_clear", &phi::DenseTensor::clear)
.def("_mutable_data", .def("_mutable_data",
[](phi::DenseTensor &self, [](phi::DenseTensor &self,
...@@ -370,11 +351,6 @@ void BindTensor(pybind11::module &m) { // NOLINT ...@@ -370,11 +351,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
py::arg("tensor"), py::arg("tensor"),
py::arg("place"), py::arg("place"),
py::arg("batch_size") = -1) py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::MLUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from", .def("_copy_from",
&TensorCopyFrom<paddle::platform::IPUPlace>, &TensorCopyFrom<paddle::platform::IPUPlace>,
py::arg("tensor"), py::arg("tensor"),
...@@ -415,11 +391,6 @@ void BindTensor(pybind11::module &m) { // NOLINT ...@@ -415,11 +391,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
py::arg("array"), py::arg("array"),
py::arg("place"), py::arg("place"),
py::arg("zero_copy") = false) py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::MLUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set", .def("set",
SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>, SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
py::arg("array"), py::arg("array"),
......
...@@ -292,13 +292,6 @@ T TensorGetElement(const phi::DenseTensor &self, size_t offset) { ...@@ -292,13 +292,6 @@ T TensorGetElement(const phi::DenseTensor &self, size_t offset) {
auto p = self.place(); auto p = self.place();
paddle::memory::Copy( paddle::memory::Copy(
platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr);
#endif
} else if (platform::is_mlu_place(self.place())) {
#ifdef PADDLE_WITH_MLU
const T *a = self.data<T>();
auto p = self.place();
paddle::memory::Copy(
platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr);
#endif #endif
} else if (platform::is_custom_place(self.place())) { } else if (platform::is_custom_place(self.place())) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE) #if defined(PADDLE_WITH_CUSTOM_DEVICE)
...@@ -336,13 +329,6 @@ void TensorSetElement(phi::DenseTensor *self, size_t offset, T elem) { ...@@ -336,13 +329,6 @@ void TensorSetElement(phi::DenseTensor *self, size_t offset, T elem) {
T *a = self->mutable_data<T>(p); T *a = self->mutable_data<T>(p);
paddle::memory::Copy( paddle::memory::Copy(
p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr);
#endif
} else if (platform::is_mlu_place(self->place())) {
#ifdef PADDLE_WITH_MLU
auto p = self->place();
T *a = self->mutable_data<T>(p);
paddle::memory::Copy(
p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr);
#endif #endif
} else if (platform::is_custom_place(self->place())) { } else if (platform::is_custom_place(self->place())) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE) #if defined(PADDLE_WITH_CUSTOM_DEVICE)
...@@ -413,21 +399,6 @@ void SetTensorFromPyArrayT( ...@@ -413,21 +399,6 @@ void SetTensorFromPyArrayT(
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use IPUPlace in CPU/GPU/XPU/NPU version, " "Cannot use IPUPlace in CPU/GPU/XPU/NPU version, "
"Please recompile or reinstall Paddle with IPU support.")); "Please recompile or reinstall Paddle with IPU support."));
#endif
} else if (paddle::platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
platform::Place tmp_place = place;
platform::MLUDeviceGuard guard(tmp_place.device);
auto dst = self->mutable_data<T>(place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto dev_ctx = static_cast<platform::MLUDeviceContext *>(pool.Get(place));
paddle::platform::MLUMemcpyH2DAsync(
dst, array.data(), array.nbytes(), dev_ctx->stream());
dev_ctx->Wait();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use MLUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with MLU support."));
#endif #endif
} else if (paddle::platform::is_custom_place(place)) { } else if (paddle::platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
...@@ -779,10 +750,6 @@ inline phi::DenseTensor *_getTensor(const phi::DenseTensor &self, ...@@ -779,10 +750,6 @@ inline phi::DenseTensor *_getTensor(const phi::DenseTensor &self,
} else if (platform::is_xpu_place(place)) { } else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
output->mutable_data(place, self.dtype()); output->mutable_data(place, self.dtype());
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
output->mutable_data(place, self.dtype());
#endif #endif
} else { } else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...@@ -1064,39 +1031,6 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor, ...@@ -1064,39 +1031,6 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor,
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use CUDAPlace in CPU only version, " "Cannot use CUDAPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support.")); "Please recompile or reinstall Paddle with CUDA support."));
#endif
} else if (is_mlu_tensor) {
#ifdef PADDLE_WITH_MLU
py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
PADDLE_ENFORCE_EQ(py_arr.writeable(),
true,
platform::errors::InvalidArgument(
"PyArray is not writable, in which case memory leak "
"or double free would occur"));
PADDLE_ENFORCE_EQ(
py_arr.owndata(),
true,
platform::errors::InvalidArgument(
"PyArray does not own data, in which case memory leak "
"or double free would occur"));
size_t copy_bytes = sizeof_dtype * numel;
auto p = tensor.place();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &ctx = *pool.Get(tensor.place());
paddle::memory::Copy(
platform::CPUPlace(),
py_arr.mutable_data(),
p,
tensor_buf_ptr,
copy_bytes,
reinterpret_cast<const platform::MLUDeviceContext &>(ctx).stream());
ctx.Wait();
return py_arr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use MLUPlace in CPU/GPU/XPU/NPU version, "
"Please recompile or reinstall Paddle with MLU support."));
#endif #endif
} else if (is_custom_device_tensor) { } else if (is_custom_device_tensor) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
......
...@@ -21,9 +21,6 @@ limitations under the License. */ ...@@ -21,9 +21,6 @@ limitations under the License. */
#include "paddle/phi/core/errors.h" #include "paddle/phi/core/errors.h"
#include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_info.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/phi/backends/mlu/mlu_info.h"
#endif
namespace phi { namespace phi {
...@@ -42,11 +39,9 @@ inline size_t Alignment(size_t size, ...@@ -42,11 +39,9 @@ inline size_t Alignment(size_t size,
alignment = phi::backends::gpu::GpuMinChunkSize(); alignment = phi::backends::gpu::GpuMinChunkSize();
#elif defined(PADDLE_WITH_XPU) #elif defined(PADDLE_WITH_XPU)
alignment = alignment; alignment = alignment;
#elif defined(PADDLE_WITH_MLU)
alignment = phi::backends::mlu::MLUMinChunkSize();
#else #else
PADDLE_THROW(phi::errors::PreconditionNotMet( PADDLE_THROW(phi::errors::PreconditionNotMet(
"Fluid is not compiled with CUDA/XPU/NPU/MLU.")); "Fluid is not compiled with CUDA/XPU/NPU."));
#endif #endif
} }
} }
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_MLU
namespace phi {
namespace backends {
namespace mlu {
//! Get the minimum chunk size for MLU buddy allocator.
inline size_t MLUMinChunkSize() {
// Allow to allocate the minimum chunk size is 256 bytes.
return 1 << 8;
}
} // namespace mlu
} // namespace backends
} // namespace phi
#endif
...@@ -193,16 +193,6 @@ class IPUPlace : public Place { ...@@ -193,16 +193,6 @@ class IPUPlace : public Place {
: Place(AllocationType::IPU, place.GetDeviceId()) {} : Place(AllocationType::IPU, place.GetDeviceId()) {}
}; };
class MLUPlace : public Place {
public:
MLUPlace() : Place(AllocationType::MLU, 0) {}
explicit MLUPlace(int device_id) : Place(AllocationType::MLU, device_id) {}
MLUPlace(const MLUPlace&) = default;
MLUPlace(const Place& place) // NOLINT
: Place(AllocationType::MLU, place.GetDeviceId()) {}
};
class CustomPlace : public Place { class CustomPlace : public Place {
public: public:
CustomPlace() : Place(AllocationType::CUSTOM, 0, "") {} CustomPlace() : Place(AllocationType::CUSTOM, 0, "") {}
......
...@@ -62,15 +62,6 @@ typename Visitor::result_type VisitPlace(const phi::Place& place, ...@@ -62,15 +62,6 @@ typename Visitor::result_type VisitPlace(const phi::Place& place,
PADDLE_THROW(phi::errors::Unavailable( PADDLE_THROW(phi::errors::Unavailable(
("Paddle is not compiled with IPU. Cannot visit ipu device"))); ("Paddle is not compiled with IPU. Cannot visit ipu device")));
return typename Visitor::result_type(); return typename Visitor::result_type();
#endif
}
case phi::AllocationType::MLU: {
#ifdef PADDLE_WITH_MLU
phi::MLUPlace p(place.GetDeviceId());
return visitor(p);
#else
PADDLE_THROW(phi::errors::Unavailable(
("Paddle is not compiled with MLU. Cannot visit mlu device")));
#endif #endif
} }
case phi::AllocationType::CUSTOM: { case phi::AllocationType::CUSTOM: {
......
...@@ -1980,11 +1980,7 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> { ...@@ -1980,11 +1980,7 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
} }
static constexpr ActBwdOpFwdDeps FwdDeps() { static constexpr ActBwdOpFwdDeps FwdDeps() {
#ifdef PADDLE_WITH_MLU
return ActBwdOpFwdDeps::kDepX;
#else
return ActBwdOpFwdDeps::kDepOut; return ActBwdOpFwdDeps::kDepOut;
#endif
} }
}; };
......
...@@ -203,13 +203,6 @@ void set_constant_with_place<phi::CPUPlace>(const phi::DeviceContext& context, ...@@ -203,13 +203,6 @@ void set_constant_with_place<phi::CPUPlace>(const phi::DeviceContext& context,
phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value)); phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value));
} }
template <>
void set_constant_with_place<phi::MLUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
PADDLE_THROW(phi::errors::Unimplemented("MLUPlace is not supported"));
}
template <> template <>
void set_constant_with_place<phi::GPUPinnedPlace>( void set_constant_with_place<phi::GPUPinnedPlace>(
const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) { const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) {
......
...@@ -56,8 +56,7 @@ inline void CopyWithContext(const Context& ctx, ...@@ -56,8 +56,7 @@ inline void CopyWithContext(const Context& ctx,
const Place& src_place, const Place& src_place,
const void* src, const void* src,
size_t num) { size_t num) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
defined(PADDLE_WITH_MLU)
memory_utils::Copy(dst_place, dst, src_place, src, num, ctx.stream()); memory_utils::Copy(dst_place, dst, src_place, src, num, ctx.stream());
#else #else
PADDLE_THROW( PADDLE_THROW(
......
...@@ -72,7 +72,6 @@ from .core import ( ...@@ -72,7 +72,6 @@ from .core import (
CUDAPlace, CUDAPlace,
CUDAPinnedPlace, CUDAPinnedPlace,
IPUPlace, IPUPlace,
MLUPlace,
CustomPlace, CustomPlace,
) )
from .lod_tensor import create_lod_tensor, create_random_int_lodtensor from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
...@@ -127,7 +126,6 @@ __all__ = ( ...@@ -127,7 +126,6 @@ __all__ = (
'CUDAPlace', 'CUDAPlace',
'CUDAPinnedPlace', 'CUDAPinnedPlace',
'IPUPlace', 'IPUPlace',
'MLUPlace',
'Tensor', 'Tensor',
'ParamAttr', 'ParamAttr',
'WeightNormParamAttr', 'WeightNormParamAttr',
......
...@@ -25,7 +25,6 @@ from ..fluid.core import IPUPlace # noqa: F401 ...@@ -25,7 +25,6 @@ from ..fluid.core import IPUPlace # noqa: F401
from ..fluid.core import CUDAPlace # noqa: F401 from ..fluid.core import CUDAPlace # noqa: F401
from ..fluid.core import CUDAPinnedPlace # noqa: F401 from ..fluid.core import CUDAPinnedPlace # noqa: F401
from ..fluid.core import NPUPlace # noqa: F401 from ..fluid.core import NPUPlace # noqa: F401
from ..fluid.core import MLUPlace # noqa: F401
from ..fluid.core import CustomPlace # noqa: F401 from ..fluid.core import CustomPlace # noqa: F401
from ..fluid import core # noqa: F401 from ..fluid import core # noqa: F401
......
...@@ -170,9 +170,6 @@ if(${len} GREATER_EQUAL 1) ...@@ -170,9 +170,6 @@ if(${len} GREATER_EQUAL 1)
if(WITH_XPU) if(WITH_XPU)
target_link_libraries(${test_name} xpulib) target_link_libraries(${test_name} xpulib)
endif() endif()
if(WITH_MLU)
target_link_libraries(${test_name} neuware_lib)
endif()
if(NOT if(NOT
("${test_name}" STREQUAL "c_broadcast_op_npu_test" ("${test_name}" STREQUAL "c_broadcast_op_npu_test"
OR "${test_name}" STREQUAL "c_allreduce_sum_op_npu_test" OR "${test_name}" STREQUAL "c_allreduce_sum_op_npu_test"
......
...@@ -28,12 +28,6 @@ else() ...@@ -28,12 +28,6 @@ else()
SRCS bkcl_context_test.cc SRCS bkcl_context_test.cc
DEPS bkcl_context) DEPS bkcl_context)
endif() endif()
if(WITH_CNCL)
cc_test(
cncl_context_test
SRCS cncl_context_test.cc
DEPS cncl_context)
endif()
endif() endif()
cc_test( cc_test(
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/imperative/cncl_context.h"
#include <thread> // NOLINT
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
namespace framework = paddle::framework;
// Node1: FLAGS_selected_mlus=0 PADDLE_TRAINER_ID=0 ./cncl_context_test
// Node2: FLAGS_selected_mlus=1 PADDLE_TRAINER_ID=1 ./cncl_context_test
int nrings = 1;
imperative::ParallelStrategy GetStrategy(int local_rank) {
std::vector<std::string> eps = {"127.0.0.1:9866", "localhost:9867"};
imperative::ParallelStrategy strategy;
strategy.trainer_endpoints_ = eps;
strategy.current_endpoint_ = eps[local_rank];
strategy.nranks_ = 2;
strategy.local_rank_ = local_rank;
strategy.nrings_ = nrings;
return strategy;
}
#if defined(PADDLE_WITH_CNCL)
void Broadcast(int local_rank, int device_id) {
int data_size = 4;
float test_data = 7;
const auto& place = platform::MLUPlace(device_id);
platform::MLUDeviceContext ctx(place);
imperative::CNCLParallelContext cpc(GetStrategy(local_rank), place);
// init
cpc.Init();
framework::Variable* src_dev_var(new framework::Variable());
auto* src_dev_tensor = src_dev_var->GetMutable<phi::DenseTensor>();
src_dev_tensor->mutable_data<float>(phi::make_ddim({data_size}), place);
// fill data for rank 0 only
std::vector<float> src_vec;
if (local_rank == 0) {
for (int i = 0; i < data_size; ++i) {
src_vec.push_back(test_data);
}
framework::TensorFromVector(src_vec, ctx, src_dev_tensor);
}
ctx.Wait();
// call broadcast
cpc.Broadcast(src_dev_var, 0);
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
// check result
std::vector<float> dst_vec;
framework::TensorToVector(*src_dev_tensor, ctx, &dst_vec);
ctx.Wait();
for (int i = 0; i < data_size; ++i) {
EXPECT_EQ(dst_vec[i], test_data);
}
}
TEST(Broadcast, Run) {
if (platform::GetMLUDeviceCount() >= 2) {
int local_rank = atoi(getenv("PADDLE_TRAINER_ID"));
int device_id = atoi(getenv("FLAGS_selected_mlus"));
Broadcast(local_rank, device_id);
}
}
void AllReduceByStream(int local_rank, int device_id) {
int data_size = 32;
const auto& place = platform::MLUPlace(device_id);
platform::MLUDeviceContext ctx(place);
imperative::CNCLParallelContext cpc(GetStrategy(local_rank), place);
// init
cpc.Init();
// input data
framework::Variable* src_dev_var(new framework::Variable());
auto* src_dev_tensor = src_dev_var->GetMutable<phi::DenseTensor>();
src_dev_tensor->mutable_data<float>(phi::make_ddim({data_size}), place);
// fill input data
std::vector<float> src_vec;
for (int i = 0; i < data_size; ++i) {
src_vec.push_back(1.0 + local_rank);
}
framework::TensorFromVector(src_vec, ctx, src_dev_tensor);
ctx.Wait();
// output data
framework::Variable* dst_dev_var(new framework::Variable());
auto* dst_dev_tensor = dst_dev_var->GetMutable<phi::DenseTensor>();
dst_dev_tensor->mutable_data<float>(phi::make_ddim({data_size}), place);
// call allreduce
cpc.AllReduceByStream(*src_dev_var, dst_dev_var, 0, false);
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
// check result
std::vector<float> dst_vec;
framework::TensorToVector(*dst_dev_tensor, ctx, &dst_vec);
ctx.Wait();
EXPECT_EQ(dst_vec.size(), src_vec.size());
for (int i = 0; i < data_size; ++i) {
EXPECT_EQ(dst_vec[i], 3.0);
}
}
TEST(AllReduceByStream, Run) {
if (platform::GetMLUDeviceCount() >= 2) {
int local_rank = atoi(getenv("PADDLE_TRAINER_ID"));
int device_id = atoi(getenv("FLAGS_selected_mlus"));
AllReduceByStream(local_rank, device_id);
}
}
#endif
...@@ -76,8 +76,7 @@ void GroupConcatSplit(Place place, size_t size) { ...@@ -76,8 +76,7 @@ void GroupConcatSplit(Place place, size_t size) {
value.push_back(static_cast<T>(1.0 * j)); value.push_back(static_cast<T>(1.0 * j));
} }
if (std::is_same<Place, platform::CUDAPlace>::value || if (std::is_same<Place, platform::CUDAPlace>::value) {
std::is_same<Place, platform::MLUPlace>::value) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_CNCL) defined(PADDLE_WITH_CNCL)
paddle::memory::Copy( paddle::memory::Copy(
...@@ -185,20 +184,5 @@ TEST(TestGroup, TestXPUConcatSplit) { ...@@ -185,20 +184,5 @@ TEST(TestGroup, TestXPUConcatSplit) {
GroupConcatSplit<float>(xpu_place, size); GroupConcatSplit<float>(xpu_place, size);
} }
#endif #endif
#if defined(PADDLE_WITH_CNCL)
TEST(TestGroup, TestMLUConcatSplit) {
platform::MLUPlace mlu_place(0);
platform::CPUPlace cpu_place;
int size = 3;
GroupConcatSplit<float>(cpu_place, size);
GroupConcatSplit<float>(mlu_place, size);
size = 15;
GroupConcatSplit<float>(cpu_place, size);
GroupConcatSplit<float>(mlu_place, size);
}
#endif
} // namespace imperative } // namespace imperative
} // namespace paddle } // namespace paddle