Unverified commit e75c01f9, authored by Wang Xin, committed by GitHub

clean up WITH_MLU (#52546)

Parent 075d6b14
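For context: WITH_MLU followed PaddlePaddle's usual per-device pattern of a CMake option, a guard against conflicting backends, and a PADDLE_WITH_MLU compile definition that gates the C++ #ifdef blocks seen in the hunks below. A condensed sketch of that CMake layer, assembled from the lines this commit deletes (nothing new is introduced here, the snippets are only collected in one place):

# Build flag for the CAMBRICON MLU backend (removed by this commit).
option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF)

# MLU and GPU builds were mutually exclusive.
if(WITH_GPU AND WITH_MLU)
  message(FATAL_ERROR "Error when compile GPU and MLU at the same time")
endif()

# Enabling the flag defined PADDLE_WITH_MLU, which gated all C++ MLU code paths.
if(WITH_MLU)
  message(STATUS "Compile with MLU!")
  add_definitions(-DPADDLE_WITH_MLU)
endif()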
......@@ -53,7 +53,6 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF)
option(WITH_XPU_XFT "Compile PaddlePaddle with BAIDU XPU-XFT" OFF)
option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
......@@ -81,9 +80,6 @@ endif()
if(WITH_GPU AND WITH_ROCM)
message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time")
endif()
if(WITH_GPU AND WITH_MLU)
message(FATAL_ERROR "Error when compile GPU and MLU at the same time")
endif()
if(WITH_GPU AND NOT APPLE)
enable_language(CUDA)
......@@ -430,14 +426,6 @@ if(NOT WITH_XPU AND WITH_XPU_BKCL)
CACHE STRING "Disable BKCL when compiling without XPU" FORCE)
endif()
if(NOT WITH_MLU AND WITH_CNCL)
message(
WARNING "Disable CNCL when compiling without MLU. Force WITH_MLU=OFF.")
set(WITH_MLU
OFF
CACHE STRING "Disable CNCL when compiling without MLU" FORCE)
endif()
if(WITH_NCCL)
add_definitions("-DPADDLE_WITH_NCCL")
include(nccl)
......@@ -469,10 +457,6 @@ if(WITH_GPU)
endif()
endif()
if(WITH_MLU)
include(neuware)
endif()
if(WITH_ROCM)
include(hip)
include(miopen) # set miopen libraries, must before configure
......
......@@ -116,11 +116,6 @@ if(WITH_IPU)
add_definitions(-DPADDLE_WITH_IPU)
endif()
if(WITH_MLU)
message(STATUS "Compile with MLU!")
add_definitions(-DPADDLE_WITH_MLU)
endif()
if(WITH_GPU)
add_definitions(-DPADDLE_WITH_CUDA)
add_definitions(-DEIGEN_USE_GPU)
......
if(NOT WITH_MLU)
return()
endif()
if(NOT ENV{NEUWARE_HOME})
set(NEUWARE_HOME "/usr/local/neuware")
else()
set(NEUWARE_HOME $ENV{NEUWARE_HOME})
endif()
message(STATUS "NEUWARE_HOME: " ${NEUWARE_HOME})
set(NEUWARE_INCLUDE_DIR ${NEUWARE_HOME}/include)
set(NEUWARE_LIB_DIR ${NEUWARE_HOME}/lib64)
include_directories(${NEUWARE_INCLUDE_DIR})
set(CNNL_LIB ${NEUWARE_LIB_DIR}/libcnnl.so)
set(MLUOP_LIB ${NEUWARE_LIB_DIR}/libmluops.so)
set(CNRT_LIB ${NEUWARE_LIB_DIR}/libcnrt.so)
set(CNDRV_LIB ${NEUWARE_LIB_DIR}/libcndrv.so)
set(CNPAPI_LIB ${NEUWARE_LIB_DIR}/libcnpapi.so)
generate_dummy_static_lib(LIB_NAME "neuware_lib" GENERATOR "neuware.cmake")
set(NEUWARE_LIB_DEPS ${CNNL_LIB} ${MLUOP_LIB} ${CNRT_LIB} ${CNDRV_LIB}
${CNPAPI_LIB})
if(WITH_CNCL)
message(STATUS "Compile with CNCL!")
add_definitions(-DPADDLE_WITH_CNCL)
set(CNCL_LIB ${NEUWARE_LIB_DIR}/libcncl.so)
list(APPEND NEUWARE_LIB_DEPS ${CNCL_LIB})
endif()
target_link_libraries(neuware_lib ${NEUWARE_LIB_DEPS})
......@@ -74,9 +74,6 @@ function(op_library TARGET)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function layer
common_infer_shape_functions)
if(WITH_MLU)
set(op_common_deps ${op_common_deps} mlu_baseop)
endif()
# Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build.
set(options UNITY)
......@@ -169,12 +166,6 @@ function(op_library TARGET)
list(APPEND xpu_kp_cc_srcs ${TARGET}.kps)
endif()
endif()
if(WITH_MLU)
string(REPLACE "_op" "_op_mlu" MLU_FILE "${TARGET}")
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MLU_FILE}.cc)
list(APPEND mlu_cc_srcs ${MLU_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$")
......@@ -201,8 +192,6 @@ function(op_library TARGET)
list(APPEND xpu_kp_cc_srcs ${src})
elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$")
list(APPEND xpu_kp_cc_srcs ${src})
elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
list(APPEND mlu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
elseif((WITH_ROCM OR WITH_GPU) AND ${src} MATCHES ".*\\.kps$")
......@@ -519,18 +508,6 @@ function(op_library TARGET)
endforeach()
endif()
# pybind USE_OP_DEVICE_KERNEL for MLU
if(WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0)
foreach(mlu_src ${mlu_cc_srcs})
set(op_name "")
find_register(${mlu_src} "REGISTER_OP_MLU_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, MLU);\n")
set(pybind_flag 1)
endif()
endforeach()
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if(WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
......
......@@ -356,11 +356,6 @@ if(WITH_XPU)
list(APPEND third_party_deps extern_xpu)
endif()
if(WITH_MLU)
include(external/concurrentqueue) # download, build, install concurrentqueue
list(APPEND third_party_deps extern_concurrentqueue)
endif()
if(WITH_PSLIB)
include(external/pslib) # download, build, install pslib
list(APPEND third_party_deps extern_pslib)
......
......@@ -99,11 +99,6 @@ struct DLDeviceVisitor
"platform::NPUPinnedPlace is not supported"));
}
inline ::DLDevice operator()(const platform::MLUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::MLUPlace is not supported"));
}
inline ::DLDevice operator()(const platform::CustomPlace &place) const {
PADDLE_THROW(platform::errors::Unimplemented(
"platform::CustomPlace is not supported"));
......
......@@ -516,17 +516,6 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
PADDLE_THROW(
platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
#endif
} else if (platform::is_mlu_place(place_)) {
#ifdef PADDLE_WITH_MLU
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new MLUUnsafeFastGarbageCollector(place_, max_memory_size));
} else {
gc.reset(new MLUDefaultStreamGarbageCollector(place_, max_memory_size));
}
#else
PADDLE_THROW(
platform::errors::Unimplemented("No MLU gc found in CPU/MLU paddle"));
#endif
} else if (platform::is_custom_place(place_)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
......
......@@ -125,56 +125,6 @@ void CUDAPinnedGarbageCollector::ClearCallback(
}
#endif
#ifdef PADDLE_WITH_MLU
MLUDefaultStreamGarbageCollector::MLUDefaultStreamGarbageCollector(
const platform::MLUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void MLUDefaultStreamGarbageCollector::Wait() const {
static_cast<platform::MLUDeviceContext *>(this->dev_ctx_)
->WaitStreamCallback();
}
void MLUDefaultStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
static_cast<platform::MLUDeviceContext *>(this->dev_ctx_)
->AddStreamCallback(callback);
}
MLUUnsafeFastGarbageCollector::MLUUnsafeFastGarbageCollector(
const platform::MLUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void MLUUnsafeFastGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback();
}
MLUStreamGarbageCollector::MLUStreamGarbageCollector(
const platform::MLUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {
platform::MLUDeviceGuard guard(place.device);
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueCreate(&stream_));
callback_manager_.reset(
new platform::StreamCallbackManager<mluStream>(stream_));
}
MLUStreamGarbageCollector::~MLUStreamGarbageCollector() {
auto place = this->dev_ctx_->GetPlace();
platform::MLUDeviceGuard guard(place.device);
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_));
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueDestroy(stream_));
}
mluStream MLUStreamGarbageCollector::stream() const { return stream_; }
void MLUStreamGarbageCollector::Wait() const { callback_manager_->Wait(); }
void MLUStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback_manager_->AddCallback(callback);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
CustomDefaultStreamGarbageCollector::CustomDefaultStreamGarbageCollector(
const platform::CustomPlace &place, size_t max_memory_size)
......
......@@ -22,9 +22,6 @@
#include "gflags/gflags.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#endif
#include "paddle/fluid/platform/stream_callback_manager.h"
namespace paddle {
......@@ -139,46 +136,6 @@ class CUDAPinnedGarbageCollector : public GarbageCollector {
};
#endif
#ifdef PADDLE_WITH_MLU
class MLUDefaultStreamGarbageCollector : public GarbageCollector {
public:
MLUDefaultStreamGarbageCollector(const platform::MLUPlace &place,
size_t max_memory_size);
void Wait() const override;
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
class MLUUnsafeFastGarbageCollector : public GarbageCollector {
public:
MLUUnsafeFastGarbageCollector(const platform::MLUPlace &place,
size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
class MLUStreamGarbageCollector : public GarbageCollector {
public:
MLUStreamGarbageCollector(const platform::MLUPlace &place,
size_t max_memory_size);
~MLUStreamGarbageCollector();
void Wait() const override;
mluStream stream() const;
protected:
void ClearCallback(const std::function<void()> &callback) override;
private:
mluStream stream_;
std::unique_ptr<platform::StreamCallbackManager<mluStream>> callback_manager_;
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class CustomDefaultStreamGarbageCollector : public GarbageCollector {
public:
......
......@@ -376,9 +376,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
#define REGISTER_OP_NPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, NPU, ::paddle::platform::NPUPlace, __VA_ARGS__)
#define REGISTER_OP_MLU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, MLU, ::paddle::platform::MLUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
customized_name, \
customized_type_value, \
......@@ -421,12 +418,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_MLU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, MLU, ::paddle::platform::MLUPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_IPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, IPU, ::paddle::platform::IPUPlace, DEFAULT_TYPE, \
......
......@@ -57,10 +57,6 @@ class DenseTensor;
#include "paddle/fluid/platform/mkldnn_op_list.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#endif
......@@ -770,16 +766,6 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
auto dev_id = place.device;
platform::SetXPUDeviceId(dev_id);
#endif
} else if (platform::is_mlu_place(place)) {
#ifndef PADDLE_WITH_MLU
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with MLU support.",
place));
#else
auto dev_id = place.device;
platform::SetMLUDeviceId(dev_id);
#endif
} else if (platform::is_custom_place(place)) {
#ifndef PADDLE_WITH_CUSTOM_DEVICE
......@@ -2301,16 +2287,6 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
}
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() &&
platform::is_mlu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing MLU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (kernel_iter == kernels.end() &&
platform::is_custom_place(expected_kernel_key.place_)) {
......
......@@ -522,19 +522,6 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use CUDA device since it's not compiled with CUDA,"
"Please recompile or reinstall Paddle with GPU support."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new MLUUnsafeFastGarbageCollector(place, max_memory_size));
} else {
gc.reset(new MLUStreamGarbageCollector(place, max_memory_size));
}
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use MLU device since it's not compiled with MLU,"
"Please recompile or reinstall Paddle with MLU support."));
#endif
} else if (platform::is_xpu_place(place)) {
#if defined(PADDLE_WITH_XPU)
......
......@@ -112,15 +112,6 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key,
phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
}
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_key.backend() == phi::Backend::MLU) {
VLOG(3) << "phi missing MLU kernel: " << op.Type()
<< ", expected_kernel_key:" << kernel_key
<< ", fallback to CPU one!";
return phi::KernelKey(
phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
}
#endif
#ifdef PADDLE_WITH_IPU
if (kernel_key.backend() == phi::Backend::IPU) {
VLOG(3) << "phi missing IPU kernel: " << op.Type()
......
......@@ -267,59 +267,6 @@ void TensorCopyImpl(const TENSOR& src,
"Copying from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_mlu_place = src_place;
auto dst_cpu_place = dst_place;
auto stream =
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
memory::Copy(dst_cpu_place, dst_ptr, src_mlu_place, src_ptr, size, stream);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_mlu_place(dst_place)) {
auto src_cpu_place = src_place;
auto dst_mlu_place = dst_place;
auto stream =
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
memory::Copy(dst_mlu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
}
else if (platform::is_mlu_place(src_place) && // NOLINT
platform::is_mlu_place(dst_place)) {
auto src_mlu_place = src_place;
auto dst_mlu_place = dst_place;
auto stream =
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
memory::Copy(dst_mlu_place, dst_ptr, src_mlu_place, src_ptr, size, stream);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copying from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_IPU
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_ipu_place(dst_place)) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_ipu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copying from %s to %s is not supported.", src_place, dst_place));
}
#endif
}
template <typename TENSOR>
......@@ -480,29 +427,6 @@ void TensorCopySync(const phi::DenseTensor& src,
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_mlu_place(dst_place)) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_mlu_place(src_place) && // NOLINT
platform::is_mlu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_IPU
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
......@@ -604,31 +528,6 @@ void TensorToStream(std::ostream& os,
#else
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
#endif
} else if (platform::is_mlu_place(tensor.place())) {
#ifdef PADDLE_WITH_MLU
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& mlu_dev_ctx =
static_cast<const platform::MLUDeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu,
buf.get(),
tensor.place(),
reinterpret_cast<const void*>(data),
size_to_write,
mlu_dev_ctx.stream());
mlu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU"));
#endif
} else if (platform::is_custom_place(tensor.place())) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
......@@ -720,8 +619,7 @@ void TensorFromStream(std::istream& is,
platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_CUSTOM_DEVICE)
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(phi::make_ddim(shape));
framework::VisitDataType(
......@@ -741,12 +639,6 @@ void TensorFromStream(std::istream& is,
} else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
} else if (platform::is_mlu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU"));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
}
#endif
} else {
......@@ -803,8 +695,7 @@ void TensorFromStream(std::istream& is,
platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_CUSTOM_DEVICE)
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(phi::make_ddim(dims));
framework::VisitDataType(
......@@ -824,9 +715,6 @@ void TensorFromStream(std::istream& is,
} else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
} else if (platform::is_mlu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU"));
} else if (platform::is_npu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
......
......@@ -26,9 +26,6 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#endif
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/core/dense_tensor.h"
......@@ -142,11 +139,6 @@ void TensorFromArray(const T* src,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(dst_place)) { // NOLINT
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT
memory::Copy(
......@@ -193,11 +185,6 @@ void TensorFromVector(const std::vector<T>& src,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(dst_place)) { // NOLINT
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT
memory::Copy(
......@@ -332,17 +319,6 @@ void TensorToVector(const phi::DenseTensor& src,
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy(
dst_place,
dst_ptr,
src.place(),
src_ptr,
size,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
......@@ -385,11 +361,6 @@ inline void TensorToVector(const phi::DenseTensor& src,
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
......
......@@ -177,10 +177,6 @@ if(WITH_GLOO)
endif()
endif()
if(WITH_MLU)
set(MLU_DEPS mlu_baseop)
endif()
if(NOT WITH_ASCEND_CL)
cc_library(
gradient_accumulator
......
......@@ -159,15 +159,6 @@ AmpOperators::AmpOperators()
OpSupportedInfos("XPU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_xpu_bf16.begin(),
unsupported_ops_xpu_bf16.end());
#elif defined(PADDLE_WITH_MLU)
auto unsupported_ops_mlu_fp16 = std::get<2>(
OpSupportedInfos("MLU", paddle::framework::proto::VarType::FP16));
unsupported_fp16_ops_->insert(unsupported_ops_mlu_fp16.begin(),
unsupported_ops_mlu_fp16.end());
auto unsupported_ops_mlu_bf16 = std::get<2>(
OpSupportedInfos("MLU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_mlu_bf16.begin(),
unsupported_ops_mlu_bf16.end());
#endif
VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " "
<< unsupported_fp16_ops_->size() << " "
......
......@@ -34,9 +34,6 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "xpu/refactor/math.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/backends/device_manager.h"
#endif
......@@ -288,41 +285,6 @@ void TensorAdd(const VarType& src, VarType* dst) {
}
#endif
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(place)) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::DeviceContext* ctx = pool.Get(place);
auto dev_ctx = dynamic_cast<platform::MLUDeviceContext*>(ctx);
if (data_type == framework::DataTypeTrait<float>::DataType()) {
dst_tensor->mutable_data<float>(place);
} else if (data_type ==
framework::DataTypeTrait<platform::float16>::DataType()) {
dst_tensor->mutable_data<platform::float16>(place);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
framework::DataTypeToString(data_type),
place));
}
static const float alpha = 1.f;
static const float beta = 1.f;
operators::MLUCnnlTensorDesc src_tensor_desc(src_tensor);
operators::MLUCnnlTensorDesc dst_tensor_desc(*dst_tensor);
PADDLE_ENFORCE_MLU_SUCCESS(
cnnlAssignAdd(dev_ctx->cnnl_handle(),
static_cast<const void*>(&alpha),
src_tensor_desc.get(),
operators::GetBasePtr(&src_tensor),
nullptr,
0,
static_cast<const void*>(&beta),
dst_tensor_desc.get(),
operators::GetBasePtr(dst_tensor)));
return;
}
#endif
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
......
......@@ -150,48 +150,6 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
kernel_signature_(std::move(kernel_signature)),
phi_kernel_(phi_kernel) {}
#ifdef PADDLE_WITH_MLU
static void tokenize(const std::string& ops,
char delim,
std::unordered_set<std::string>* op_set) {
std::string::size_type beg = 0;
for (uint64_t end = 0; (end = ops.find(delim, end)) != std::string::npos;
++end) {
op_set->insert(ops.substr(beg, end - beg));
beg = end + 1;
}
op_set->insert(ops.substr(beg));
}
static bool is_in_mlu_black_list(const std::string& op_name) {
static bool inited = false;
static std::unordered_set<std::string> mlu_black_list;
static std::mutex s_mtx;
if (!inited) {
std::lock_guard<std::mutex> guard(s_mtx);
if (!inited) {
if (std::getenv("MLU_BLACK_LIST") != nullptr) {
std::string ops(std::getenv("MLU_BLACK_LIST"));
tokenize(ops, ',', &mlu_black_list);
}
inited = true;
VLOG(3) << "MLU Black List: ";
for (auto iter = mlu_black_list.begin(); iter != mlu_black_list.end();
++iter) {
VLOG(3) << *iter << " ";
}
}
}
if (mlu_black_list.find(op_name) != mlu_black_list.end()) {
return true;
}
return false;
}
#endif
template <typename VarType>
PreparedOp PrepareImpl(
const NameVarMap<VarType>& ins,
......@@ -258,12 +216,6 @@ PreparedOp PrepareImpl(
op.Type(), expected_kernel_key.dtype());
#endif
#ifdef PADDLE_WITH_MLU
if (is_in_mlu_black_list(op.Type())) {
expected_kernel_key.set_backend(phi::Backend::CPU);
}
#endif
bool has_phi_kernel = false;
const auto* arg_map_fn = phi_op_utils_map.GetArgumentMappingFn(op.Type());
......@@ -468,16 +420,6 @@ PreparedOp PrepareImpl(
kernel_iter = kernels.find(fluid_kernel_type);
}
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() &&
paddle::platform::is_mlu_place(fluid_kernel_type.place_)) {
VLOG(3) << "missing MLU kernel: " << op.Type()
<< ", expected_kernel_key:" << fluid_kernel_type
<< ", fallbacking to CPU one!";
fluid_kernel_type.place_ = platform::CPUPlace();
kernel_iter = kernels.find(fluid_kernel_type);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (kernel_iter == kernels.end() &&
paddle::platform::is_custom_place(fluid_kernel_type.place_)) {
......
......@@ -147,15 +147,6 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use IPU device since it's not compiled with IPU,"
"Please recompile or reinstall Paddle with IPU support."));
#endif
} else if (platform::is_mlu_place(place)) {
#if defined(PADDLE_WITH_MLU)
gc.reset(new framework::MLUDefaultStreamGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use MLU device since it's not compiled with MLU,"
"Please recompile or reinstall Paddle with MLU support."));
#endif
} else if (platform::is_custom_place(place)) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
......@@ -300,13 +291,6 @@ void Tracer::TraceOpImpl(const std::string& type,
} else if (platform::is_npu_place(place)) {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU if use NPUPlace."));
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
platform::SetMLUDeviceId(place.device);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU if use MLUPlace."));
#endif
} else if (platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::SetDevice(place);
......
......@@ -58,10 +58,6 @@
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/memory/allocation/custom_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
......@@ -194,11 +190,6 @@ class AllocatorFacadePrivate {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_MLU
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
......@@ -254,11 +245,6 @@ class AllocatorFacadePrivate {
InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_MLU
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
......@@ -290,11 +276,6 @@ class AllocatorFacadePrivate {
InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_MLU
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
}
#endif
break;
}
......@@ -801,12 +782,6 @@ class AllocatorFacadePrivate {
}
#endif
#ifdef PADDLE_WITH_MLU
void InitNaiveBestFitMLUAllocator(platform::MLUPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
......@@ -851,13 +826,6 @@ class AllocatorFacadePrivate {
system_allocators_[p] = CreateCUDAAllocator(p);
}
#endif
#ifdef PADDLE_WITH_MLU
int device_count = platform::GetMLUDeviceCount();
for (int i = 0; i < device_count; ++i) {
platform::MLUPlace p(i);
system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
......@@ -894,12 +862,6 @@ class AllocatorFacadePrivate {
places.emplace_back(platform::IPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_MLU
int device_count = platform::GetMLUDeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) {
places.emplace_back(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
......
......@@ -56,9 +56,6 @@ BuddyAllocator::BuddyAllocator(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
init_allocate_size_func_ = &platform::GpuInitAllocSize;
re_allocate_size_func_ = &platform::GpuReallocSize;
#elif defined(PADDLE_WITH_MLU)
init_allocate_size_func_ = &platform::MLUInitAllocSize;
re_allocate_size_func_ = &platform::MLUReallocSize;
#endif
}
#endif
......@@ -253,9 +250,6 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
allocate_bytes = DeviceAllocateSize(
&platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes);
#elif defined(PADDLE_WITH_MLU)
allocate_bytes = DeviceAllocateSize(
&platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes);
#endif
#endif
......
......@@ -25,9 +25,6 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_double(fraction_of_gpu_memory_to_use);
......@@ -395,202 +392,6 @@ TEST(BuddyAllocator, Release) {
}
#endif
#ifdef PADDLE_WITH_MLU
TEST(BuddyAllocator, MluFraction) {
// In a 16 GB machine, the pool size will be about 160 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
// Less than pool size
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
buddy_allocator.Release();
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator,
600 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator,
1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
TEST(BuddyAllocator, InitRealloc) {
FLAGS_initial_gpu_memory_in_mb = 100;
FLAGS_reallocate_gpu_memory_in_mb = 50;
EXPECT_EQ(platform::MLUMaxChunkSize(), static_cast<size_t>(100 << 20));
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
// Less than initial size and reallocate size
TestBuddyAllocator(&buddy_allocator, 10 << 20);
// Between initial size and reallocate size and not exceed pool
TestBuddyAllocator(&buddy_allocator, 80 << 20);
TestBuddyAllocator(&buddy_allocator, 99 << 20);
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator,
101 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator,
1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
FLAGS_initial_gpu_memory_in_mb = 5;
FLAGS_reallocate_gpu_memory_in_mb = 10;
EXPECT_EQ(platform::MLUMaxChunkSize(), static_cast<size_t>(10 << 20));
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
// Less than initial size and reallocate size
TestBuddyAllocator(&buddy_allocator, 1 << 20);
// Between initial size and reallocate size and exceed pool
TestBuddyAllocator(&buddy_allocator, 6 << 20);
TestBuddyAllocator(&buddy_allocator, 8 << 20);
TestBuddyAllocator(&buddy_allocator, 9 << 20);
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator,
11 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator,
1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
TEST(BuddyAllocator, FractionRefillPool) {
FLAGS_fraction_of_gpu_memory_to_use = 0.6;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
size_t max_chunk_size = platform::MLUMaxChunkSize();
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
max_chunk_size);
// Less than pool size
int* p0 = TestBuddyAllocator(&buddy_allocator,
max_chunk_size - 1000,
/* use_system_allocator = */ false,
/* free_ptr = */ false);
// Max chunk size should be same during allocation
EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
size_t alloc =
platform::MLUAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
// Exceeding the pool triggers refilling by a fraction of the available MLU
// memory, and should be able to alloc 60% of the remaining MLU
int* p1 = TestBuddyAllocator(&buddy_allocator,
alloc,
/* use_system_allocator = */ false,
/* free_ptr = */ false);
// Max chunk size should be same during allocation
EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
alloc =
platform::MLUAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
// Exceeding the pool triggers refilling by a fraction of the available MLU
// memory, and should be able to alloc 60% of the remaining MLU
TestBuddyAllocator(&buddy_allocator,
alloc,
/* use_system_allocator = */ false);
// Max chunk size should be same during allocation
EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
buddy_allocator.Free(p0);
buddy_allocator.Free(p1);
}
TEST(BuddyAllocator, AllocFromAvailable) {
FLAGS_fraction_of_gpu_memory_to_use = 0.7;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
size_t total = 0, available = 0;
platform::SetMLUDeviceId(0);
platform::MLUMemoryUsage(&available, &total);
// Take half of available MLU
void* p;
cnrtStatus result = cnrtMalloc(&p, available >> 1);
EXPECT_TRUE(result == cnrtSuccess);
// BuddyAllocator should be able to alloc the remaining MLU
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1 << 30));
if (p) {
EXPECT_TRUE(cnrtFree(p) == cnrtSuccess);
}
}
TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) {
FLAGS_fraction_of_gpu_memory_to_use = 1.0;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
void* p = nullptr;
EXPECT_TRUE(cnrtMalloc(&p, static_cast<size_t>(1) << 30) == cnrtSuccess);
// BuddyAllocator should be able to alloc the remaining MLU
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
if (p) {
EXPECT_TRUE(cnrtFree(p) == cnrtSuccess);
}
}
TEST(BuddyAllocator, Release) {
// In an 8 GB machine, the pool size will be about 800 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.1;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
// Less than pool size
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 50 << 20);
buddy_allocator.Release();
}
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -420,140 +420,6 @@ uint64_t Release<platform::CUDAPinnedPlace>(
#endif
}
// For MLU
#ifdef PADDLE_WITH_MLU
class MLUBuddyAllocatorList {
private:
MLUBuddyAllocatorList() : devices_(platform::GetMLUSelectedDevices()) {
auto mlu_num = devices_.size();
allocators_.resize(mlu_num);
init_flags_.reserve(mlu_num);
for (size_t i = 0; i < mlu_num; ++i) {
init_flags_.emplace_back(new std::once_flag());
}
}
static MLUBuddyAllocatorList *CreateNewInstance() {
return new MLUBuddyAllocatorList();
}
public:
static MLUBuddyAllocatorList *Instance() {
static auto *instance = CreateNewInstance();
return instance;
}
BuddyAllocator *Get(int mlu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id));
PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetMLUDeviceId(devices_[pos]);
allocators_[pos].reset(
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::MLUAllocator(devices_[pos])),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "(mlu reuse gpu GFlags) "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
<< "or 'FLAGS_initial_gpu_memory_in_mb' "
<< "or 'FLAGS_reallocate_gpu_memory_in_mb' "
<< "to change the memory size for MLU usage.\n"
<< "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
<< FLAGS_fraction_of_gpu_memory_to_use
<< ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
<< FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
});
return allocators_[pos].get();
}
private:
std::vector<int> devices_;
std::vector<std::unique_ptr<std::once_flag>> init_flags_;
std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
};
BuddyAllocator *GetMLUBuddyAllocator(int mlu_id) {
return MLUBuddyAllocatorList::Instance()->Get(mlu_id);
}
#endif
template <>
size_t Used<platform::MLUPlace>(const platform::MLUPlace &place) {
#ifdef PADDLE_WITH_MLU
return GetMLUBuddyAllocator(place.device)->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'MLUPlace' is not supported in CPU only device."));
#endif
}
template <>
void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
#ifdef PADDLE_WITH_MLU
auto *buddy_allocator = GetMLUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
platform::MLUDeviceGuard(place.device);
size_t avail = 0, total = 0;
platform::MLUMemoryUsage(&avail, &total);
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize "
"%s, MLUMinChunkSize %s, MLU memory used: %s.",
string::HumanReadableSize(size),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::MLUPlace>(place))));
} else {
if (FLAGS_init_allocated_mem) {
cnrtMemset(ptr, 0xEF, size);
}
}
return ptr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'MLUPlace' is not supported in CPU only device."));
#endif
}
template <>
void Free<platform::MLUPlace>(const platform::MLUPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_MLU
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetMLUBuddyAllocator(place.device)->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'MLUPlace' is not supported in CPU only device."));
#endif
}
template <>
uint64_t Release<platform::MLUPlace>(const platform::MLUPlace &place) {
#ifdef PADDLE_WITH_MLU
return GetMLUBuddyAllocator(place.device)->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'MLUPlace' is not supported in CPU only device."));
#endif
}
// For CustomDevice
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class BuddyAllocatorList {
......
......@@ -61,21 +61,6 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) {
}
#endif
#ifdef PADDLE_WITH_MLU
TEST(NaiveBestFitAllocatorTest, MluAlloc) {
NaiveBestFitAllocator alloc{platform::MLUPlace(0)};
{
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
}
sleep(10);
alloc.Release(platform::MLUPlace(0));
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
alloc.Release(platform::MLUPlace(0));
}
#endif
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -31,9 +31,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
......@@ -287,78 +284,6 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
#endif
#ifdef PADDLE_WITH_MLU
void* MLUAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
void* p;
auto result = platform::RecordedMLUMalloc(&p, size, mlu_id_);
if (result == cnrtSuccess) {
*index = 0;
mlu_alloc_size_ += size;
return p;
} else {
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedMLUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, mlu_id_);
size_t allocated = total - avail;
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum MLU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on MLU %d. "
"Cannot allocate %s memory on MLU %d, %s memory has been allocated and "
"available memory is only %s.\n\n"
"Please check whether there is any other process using MLU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another MLU.\n"
"2. If no, please try one of the following suggestions:\n"
" 1) Decrease the batch size of your model.\n"
" 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
mlu_id_,
string::HumanReadableSize(size),
mlu_id_,
string::HumanReadableSize(allocated),
string::HumanReadableSize(avail),
mlu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
}
}
void MLUAllocator::Free(void* p, size_t size, size_t index) {
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(mlu_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size,
mlu_alloc_size_));
mlu_alloc_size_ -= size;
platform::RecordedMLUFree(p, size, mlu_id_);
}
bool MLUAllocator::UseGpu() const { return true; }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void* CustomAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
......
......@@ -68,21 +68,6 @@ class CUDAPinnedAllocator : public SystemAllocator {
};
#endif
#ifdef PADDLE_WITH_MLU
class MLUAllocator : public SystemAllocator {
public:
explicit MLUAllocator(int mlu_id) : mlu_id_(mlu_id) {}
virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t mlu_alloc_size_ = 0;
int mlu_id_;
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class CustomAllocator : public SystemAllocator {
public:
......
......@@ -82,23 +82,3 @@ TEST(GPUAllocator, AllocFailure) {
}
}
#endif
#ifdef PADDLE_WITH_MLU
TEST(MLUAllocator, Alloc) {
paddle::memory::detail::MLUAllocator a(0);
TestAllocator(&a, 2048);
TestAllocator(&a, 0);
}
TEST(MLUAllocator, AllocFailure) {
paddle::memory::detail::MLUAllocator allocator(0);
size_t index;
size_t alloc_size = (static_cast<size_t>(1) << 40); // Very large number
try {
allocator.Alloc(&index, alloc_size);
ASSERT_TRUE(false);
} catch (paddle::memory::allocation::BadAlloc&) {
PADDLE_ENFORCE_MLU_SUCCESS(cnrtGetLastError());
}
}
#endif
......@@ -23,10 +23,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
namespace paddle {
namespace memory {
......@@ -736,226 +732,6 @@ void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place,
}
#endif
#ifdef PADDLE_WITH_MLU
template <>
void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::MLUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetMLUDeviceId(src_place.device);
if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2HAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event(
"MLUMemcpyD2HSync:MLU->CPU", platform::TracerEventType::UserDefined, 1);
platform::MLUMemcpyD2HSync(dst, src, num);
}
}
template <>
void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetMLUDeviceId(dst_place.device);
if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyH2DAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(dst_place))->Wait();
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event(
"MLUMemcpyH2DSync:CPU->MLU", platform::TracerEventType::UserDefined, 1);
platform::MLUMemcpyH2DSync(dst, src, num);
}
}
template <>
void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
void* dst,
platform::MLUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
if (dst_place == src_place) {
platform::SetMLUDeviceId(dst_place.device);
if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyD2DAsync(same_mlu):MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2DAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event("MLUMemcpyD2DSync(same_mlu):MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2DSync(dst, src, num);
}
} else {
if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyPeerAsync(dst,
dst_place.device,
src,
src_place.device,
num,
reinterpret_cast<mluStream>(stream));
} else {
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyPeerSync(
dst, dst_place.device, src, src_place.device, num);
}
}
}
// NOTE: only for CPUPlace and MLUPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst, place_src;
return Copy(place_dst, dst, place_src, src, num);
} else if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::MLU) {
platform::MLUPlace place_dst(dst_place.GetDeviceId());
platform::CPUPlace place_src;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::MLU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::MLUPlace place_src(src_place.GetDeviceId());
platform::CPUPlace place_dst;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::MLU &&
dst_place.GetType() == phi::AllocationType::MLU) {
platform::MLUPlace place_src(src_place.GetDeviceId());
platform::MLUPlace place_dst(dst_place.GetDeviceId());
return Copy(place_dst, dst, place_src, src, num, stream);
#ifdef PADDLE_WITH_CUSTOM_DEVICE
} else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT
dst_place.GetType() == phi::AllocationType::CUSTOM) {
platform::CPUPlace place_src;
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CustomPlace place_src(src_place);
platform::CPUPlace place_dst;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == phi::AllocationType::CUSTOM) {
platform::CustomPlace place_src(src_place);
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
#endif
}
}
// NOTE: only for (CPUPlace and MLUPlace) -> (MLUPlace)
template <>
void Copy<phi::MLUPlace, phi::Place>(phi::MLUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()),
dst,
src_place,
src,
num,
stream);
}
// NOTE: only for (MLUPlace) -> (CPUPlace and MLUPlace)
template <>
void Copy<phi::Place, phi::MLUPlace>(phi::Place dst_place,
void* dst,
phi::MLUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place,
dst,
phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src,
num,
stream);
}
// NOTE: only for (MLUPlace) -> (CPUPlace) with mluStream.
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (CPUPlace) -> (MLUPlace) with mluStream.
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
void* dst,
phi::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
#endif // PADDLE_WITH_MLU
// NOTE: Only for CPUPlace, XPUPlace and PinnedPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
......
......@@ -16,9 +16,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#endif
namespace paddle {
namespace memory {
......
......@@ -55,10 +55,6 @@ if (WITH_LITE)
add_subdirectory(lite)
endif()
if (WITH_MLU)
add_subdirectory(mlu)
endif()
if(WITH_CINN)
add_subdirectory(cinn)
endif()
......@@ -135,10 +131,6 @@ if (WITH_ASCEND_CL)
op_library(sync_batch_norm_op)
endif()
if (WITH_MLU)
op_library(sync_batch_norm_op)
endif()
op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute)
op_library(recurrent_op DEPS ${OP_HEADER_DEPS})
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
const auto xs = ctx.MultiInput<phi::DenseTensor>("X");
const auto* scale = ctx.Input<phi::DenseTensor>("Scale");
auto outs = ctx.MultiOutput<phi::DenseTensor>("Out");
auto* found_inf = ctx.Output<phi::DenseTensor>("FoundInfinite");
found_inf->mutable_data<bool>(dev_ctx.GetPlace());
MLUCnnlTensorDesc scale_desc(*scale);
MLUCnnlTensorDesc found_inf_desc(
*found_inf, CNNL_LAYOUT_ARRAY, ToCnnlDataType<bool>());
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(ctx.GetPlace());
// check is_finite or is_nan
phi::DenseTensor is_finite(found_inf->type());
if (i != 0) {
is_finite.Resize(phi::make_ddim({1}));
is_finite.mutable_data<bool>(ctx.GetPlace());
} else {
is_finite.ShareDataWith(*found_inf);
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::IsNanInf(
ctx, x_desc.get(), GetBasePtr(x), GetBasePtr(&is_finite));
// save is_finite by logical_and op after checking every input
if (i != 0) {
MLUCnnlTensorDesc is_finite_desc(
is_finite, CNNL_LAYOUT_ARRAY, ToCnnlDataType<bool>());
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_OR,
found_inf_desc.get(),
GetBasePtr(found_inf),
is_finite_desc.get(),
GetBasePtr(&is_finite),
found_inf_desc.get(),
GetBasePtr(found_inf));
}
// The normal logic is :
// out = in, if found_inf = true
// out = in/scale, if found_inf = false
// But when found_inf is true, the data of Out should not be used.
// So, on MLU, we always compute out with in/scale.
phi::DenseTensor float_x;
phi::DenseTensor float_out;
if (std::is_same<T, paddle::platform::float16>::value) {
float_x.Resize(x->dims());
float_out.Resize(out->dims());
float_x.mutable_data<MPDType>(ctx.GetPlace());
float_out.mutable_data<MPDType>(ctx.GetPlace());
MLUCnnlTensorDesc float_x_desc(float_x);
MLUCnnlTensorDesc float_out_desc(float_out);
auto cast_fp16_type =
GetCastDataType(DataType::FLOAT16, DataType::FLOAT32);
MLUCnnl::Cast(ctx,
cast_fp16_type,
x_desc.get(),
GetBasePtr(x),
float_x_desc.get(),
GetBasePtr(&float_x));
MLUCnnl::Div(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
float_x_desc.get(),
GetBasePtr(&float_x),
scale_desc.get(),
GetBasePtr(scale),
float_out_desc.get(),
GetBasePtr(&float_out));
auto cast_fp32_type =
GetCastDataType(DataType::FLOAT32, DataType::FLOAT16);
MLUCnnl::Cast(ctx,
cast_fp32_type,
float_out_desc.get(),
GetBasePtr(&float_out),
out_desc.get(),
GetBasePtr(out));
} else {
MLUCnnl::Div(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
x_desc.get(),
GetBasePtr(x),
scale_desc.get(),
GetBasePtr(scale),
out_desc.get(),
GetBasePtr(out));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(check_finite_and_unscale,
ops::CheckFiniteAndUnscaleMLUKernel<float>,
ops::CheckFiniteAndUnscaleMLUKernel<plat::float16>);
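For reference, the unscale logic deleted above (always compute in/scale, and flag found_inf when any input holds a NaN/Inf) can be sketched without any CNNL calls; this is a plain standalone C++ illustration, not the Paddle API, and the function and variable names are made up for the sketch:
#include <cmath>
#include <vector>
// Sketch: mark found_inf if any input holds a NaN/Inf, and always write x/scale
// to out (when found_inf is true the outputs are not meant to be consumed).
void CheckFiniteAndUnscaleSketch(const std::vector<std::vector<float>>& xs,
                                 float scale,
                                 std::vector<std::vector<float>>* outs,
                                 bool* found_inf) {
  *found_inf = false;
  outs->resize(xs.size());
  for (size_t i = 0; i < xs.size(); ++i) {
    (*outs)[i].resize(xs[i].size());
    for (size_t j = 0; j < xs[i].size(); ++j) {
      if (!std::isfinite(xs[i][j])) *found_inf = true;
      (*outs)[i][j] = xs[i][j] / scale;
    }
  }
}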
......@@ -21,9 +21,7 @@ limitations under the License. */
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h"
#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h"
#include "paddle/fluid/prim/utils/static/desc_tensor.h"
......@@ -119,21 +117,6 @@ class CastOp : public framework::OperatorWithKernel {
}
// NOTE(jiahongyu): The code above was originally enclosed by PADDLE_WITH_MKLDNN
#ifdef PADDLE_WITH_MLU
auto src_type = static_cast<VT::Type>(ctx.Attr<int>("in_dtype"));
auto dst_type = static_cast<VT::Type>(ctx.Attr<int>("out_dtype"));
if (src_type == dst_type || MLUSupportsCast(src_type, dst_type)) {
return phi::KernelKey(framework::TransToProtoVarType(tensor->dtype()),
tensor_place);
} else {
VLOG(3) << "MLU not support cast type: "
<< framework::DataTypeToString(src_type)
<< " to type: " << framework::DataTypeToString(dst_type)
<< ", fallbacking to CPU one!";
return phi::KernelKey(framework::TransToProtoVarType(tensor->dtype()),
platform::CPUPlace());
}
#endif
return phi::KernelKey(framework::TransToProtoVarType(tensor->dtype()),
tensor_place);
}
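The deleted branch amounts to a small placement decision: stay on the MLU only when the src->dst cast pair is supported there, otherwise fall back to the CPU kernel. A minimal sketch under that assumption (supports_cast is a stand-in predicate, not a real Paddle API):
#include <functional>
enum class PlaceKind { kMLU, kCPU };
// Sketch of the fallback rule removed above: keep the cast on the device only
// when the src->dst pair is supported there; otherwise run the CPU kernel.
PlaceKind ChooseCastPlace(int src_type, int dst_type,
                          const std::function<bool(int, int)>& supports_cast) {
  if (src_type == dst_type || supports_cast(src_type, dst_type)) {
    return PlaceKind::kMLU;
  }
  return PlaceKind::kCPU;
}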
......
......@@ -23,9 +23,6 @@
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/framework/convert_utils.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/infermeta/multiary.h"
......@@ -57,17 +54,8 @@ struct FillConstantVisitor {
void apply(typename std::enable_if<!(std::is_same<T, int8_t>::value ||
std::is_same<T, int16_t>::value)>::type
* = nullptr) const {
#if defined(PADDLE_WITH_MLU)
if (platform::is_mlu_place(context_.GetPlace())) {
FillMLUTensorWithHostValue<T>(context_, static_cast<T>(value_), tensor_);
} else {
phi::funcs::SetConstant<DeviceContext, T> set_constant;
set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
}
#else
phi::funcs::SetConstant<DeviceContext, T> set_constant;
set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
#endif
}
const DeviceContext &dev_ctx_;
......@@ -509,14 +497,6 @@ REGISTER_OPERATOR(coalesce_tensor,
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#if defined(PADDLE_WITH_MLU)
REGISTER_OP_MLU_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<phi::CPUContext, plat::float16>,
ops::CoalesceTensorOpKernel<phi::CPUContext, int>,
ops::CoalesceTensorOpKernel<phi::CPUContext, float>);
#endif
REGISTER_OP_VERSION(coalesce_tensor)
.AddCheckpoint(
R"ROC(
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/barrier_op.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class BarrierOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CNCL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
cnclDataType_t dtype =
platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype()));
int64_t numel = in->numel();
const void* sendbuff = in->data();
void* recvbuff = out->mutable_data<T>(place);
int rid = ctx.Attr<int>("ring_id");
auto cncl_comm = platform::CNCLCommContext::Instance().Get(rid, place);
auto* comm = cncl_comm->comm();
auto comm_stream = cncl_comm->stream();
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
cnclReduceOp_t cncl_red_type = cnclSum;
dev_ctx.Wait();
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(
sendbuff, recvbuff, numel, dtype, cncl_red_type, comm, comm_stream));
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
#else
PADDLE_THROW(platform::errors::Unavailable(
"PaddlePaddle should compile with CNCL."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(barrier, ops::BarrierOpMLUKernel<int>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
#include "paddle/fluid/framework/convert_utils.h"
namespace paddle {
namespace operators {
template <typename T>
class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto place = ctx.GetPlace();
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
#if defined(PADDLE_WITH_CNCL)
auto x = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
int nranks = ctx.Attr<int>("nranks");
int rid = ctx.Attr<int>("ring_id");
auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
PADDLE_ENFORCE_EQ(
nranks,
comm->nranks(),
platform::errors::InvalidArgument(
"nranks: %s should equal to %s", nranks, comm->nranks()));
framework::DDim out_dims = x->dims();
out_dims[0] *= nranks;
out->mutable_data<T>(out_dims, place);
uint32_t send_numel = x->numel();
void* send_buff;
void* recv_buff;
phi::DenseTensor in_tensor, out_tensor;
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
// cast from int64 to int32 since cncl does not support int64
in_tensor.mutable_data<int32_t>(x->dims(), place);
out_tensor.mutable_data<int32_t>(out->dims(), place);
MLUCnnlTensorDesc x_int64_desc(*x);
MLUCnnlTensorDesc x_int32_desc(in_tensor);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT64, VT::INT32);
MLUCnnl::Cast(ctx,
cast_type,
x_int64_desc.get(),
GetBasePtr(x),
x_int32_desc.get(),
GetBasePtr(&in_tensor));
send_buff = reinterpret_cast<void*>(in_tensor.data<int32_t>());
recv_buff = reinterpret_cast<void*>(out_tensor.data<int32_t>());
} else {
in_tensor.ShareDataWith(*x);
out_tensor.ShareDataWith(*out);
send_buff = reinterpret_cast<void*>(in_tensor.data<T>());
recv_buff = reinterpret_cast<void*>(out_tensor.data<T>());
}
mluStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
cnclDataType_t dtype = platform::ToCNCLDataType(
framework::TransToProtoVarType(in_tensor.dtype()));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(
send_buff, recv_buff, send_numel, dtype, comm->comm(), stream));
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
// cast the int32 out_tensor back to int64 and write it into out
MLUCnnlTensorDesc out_int64_desc(*out);
MLUCnnlTensorDesc out_int32_desc(out_tensor);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(ctx,
cast_type,
out_int32_desc.get(),
GetBasePtr(&out_tensor),
out_int64_desc.get(),
GetBasePtr(out));
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allgather,
ops::CAllGatherOpMLUKernel<float>,
ops::CAllGatherOpMLUKernel<uint8_t>,
ops::CAllGatherOpMLUKernel<int>,
ops::CAllGatherOpMLUKernel<int8_t>,
ops::CAllGatherOpMLUKernel<int16_t>,
ops::CAllGatherOpMLUKernel<int64_t>,
ops::CAllGatherOpMLUKernel<plat::float16>);
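Two details of the removed c_allgather kernel are easy to miss: the output's leading dimension is the input's multiplied by nranks, and int64 data is round-tripped through int32 because CNCL has no int64 type. A standalone sketch of that bookkeeping, with the collective mocked by concatenation (names are illustrative and values are assumed to fit in 32 bits):
#include <cstdint>
#include <vector>
// Mocked all-gather: every rank contributes a same-size buffer and the result
// is the concatenation over ranks, so the leading dimension grows by nranks.
std::vector<int32_t> AllGatherMock(
    const std::vector<std::vector<int32_t>>& per_rank) {
  std::vector<int32_t> out;
  for (const auto& r : per_rank) out.insert(out.end(), r.begin(), r.end());
  return out;
}
// int64 workaround used by the deleted kernel: narrow to int32 before the
// collective and widen back afterwards.
std::vector<int64_t> AllGatherInt64(
    const std::vector<std::vector<int64_t>>& per_rank) {
  std::vector<std::vector<int32_t>> narrowed(per_rank.size());
  for (size_t i = 0; i < per_rank.size(); ++i)
    narrowed[i].assign(per_rank[i].begin(), per_rank[i].end());
  std::vector<int32_t> gathered = AllGatherMock(narrowed);
  return std::vector<int64_t>(gathered.begin(), gathered.end());
}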
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allreduce_max,
ops::CAllReduceOpMLUKernel<ops::kRedMax, float>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, int>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allreduce_min,
ops::CAllReduceOpMLUKernel<ops::kRedMin, float>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, int>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allreduce_prod,
ops::CAllReduceOpMLUKernel<ops::kRedProd, float>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, int>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allreduce_sum,
ops::CAllReduceOpMLUKernel<ops::kRedSum, float>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class CBroadcastOPMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CNCL)
auto x = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
int numel = x->numel();
cnclDataType_t dtype =
platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));
int rid = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
mluStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int root = ctx.Attr<int>("root");
if (root == comm->rank()) {
PADDLE_ENFORCE_MLU_SUCCESS(
cnclBcast(reinterpret_cast<void*>(const_cast<T*>(x->data<T>())),
numel,
dtype,
root,
comm->comm(),
stream));
VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent "
<< x->numel();
if (out != x) {
framework::TensorCopy(
*static_cast<const phi::DenseTensor*>(x),
place,
*platform::DeviceContextPool::Instance().Get(place),
static_cast<phi::DenseTensor*>(out));
}
} else {
PADDLE_ENFORCE_MLU_SUCCESS(cnclBcast(out->mutable_data<T>(place),
numel,
dtype,
root,
comm->comm(),
stream));
VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received "
<< phi::product(out->dims());
}
out->Resize(x->dims());
out->set_lod(x->lod());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_broadcast,
ops::CBroadcastOPMLUKernel<float>,
ops::CBroadcastOPMLUKernel<plat::float16>,
ops::CBroadcastOPMLUKernel<int>,
ops::CBroadcastOPMLUKernel<int16_t>,
ops::CBroadcastOPMLUKernel<int8_t>,
ops::CBroadcastOPMLUKernel<uint8_t>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_reduce_max,
ops::CReduceOpMLUKernel<ops::kRedMax, float>,
ops::CReduceOpMLUKernel<ops::kRedMax, plat::float16>,
ops::CReduceOpMLUKernel<ops::kRedMax, int>,
ops::CReduceOpMLUKernel<ops::kRedMax, int16_t>,
ops::CReduceOpMLUKernel<ops::kRedMax, int8_t>,
ops::CReduceOpMLUKernel<ops::kRedMax, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_reduce_min,
ops::CReduceOpMLUKernel<ops::kRedMin, float>,
ops::CReduceOpMLUKernel<ops::kRedMin, plat::float16>,
ops::CReduceOpMLUKernel<ops::kRedMin, int>,
ops::CReduceOpMLUKernel<ops::kRedMin, int16_t>,
ops::CReduceOpMLUKernel<ops::kRedMin, int8_t>,
ops::CReduceOpMLUKernel<ops::kRedMin, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_reduce_prod,
ops::CReduceOpMLUKernel<ops::kRedProd, float>,
ops::CReduceOpMLUKernel<ops::kRedProd, plat::float16>,
ops::CReduceOpMLUKernel<ops::kRedProd, int>,
ops::CReduceOpMLUKernel<ops::kRedProd, int16_t>,
ops::CReduceOpMLUKernel<ops::kRedProd, int8_t>,
ops::CReduceOpMLUKernel<ops::kRedProd, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_reduce_sum,
ops::CReduceOpMLUKernel<ops::kRedSum, float>,
ops::CReduceOpMLUKernel<ops::kRedSum, plat::float16>,
ops::CReduceOpMLUKernel<ops::kRedSum, int>,
ops::CReduceOpMLUKernel<ops::kRedSum, int16_t>,
ops::CReduceOpMLUKernel<ops::kRedSum, int8_t>,
ops::CReduceOpMLUKernel<ops::kRedSum, uint8_t>)
......@@ -33,22 +33,7 @@ Call calculation stream synchronization.
namespace ops = paddle::operators;
namespace plat = paddle::platform;
using MLU = plat::MLUPlace;
REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream,
ops::CSyncCalcStreamOp,
ops::CSyncCalcStreamOpMaker);
REGISTER_OP_NPU_KERNEL(c_sync_calc_stream,
ops::CSyncCalcStreamKernel<float, MLU>,
ops::CSyncCalcStreamKernel<double, MLU>,
ops::CSyncCalcStreamKernel<int, MLU>,
ops::CSyncCalcStreamKernel<int64_t, MLU>,
ops::CSyncCalcStreamKernel<plat::float16, MLU>);
REGISTER_OP_MLU_KERNEL(c_sync_calc_stream,
ops::CSyncCalcStreamKernel<float, MLU>,
ops::CSyncCalcStreamKernel<double, MLU>,
ops::CSyncCalcStreamKernel<int, MLU>,
ops::CSyncCalcStreamKernel<int64_t, MLU>,
ops::CSyncCalcStreamKernel<plat::float16, MLU>);
......@@ -56,6 +56,3 @@ REGISTER_OP_WITHOUT_GRADIENT(c_sync_comm_stream,
REGISTER_OP_NPU_KERNEL(c_sync_comm_stream,
ops::CSyncCommStreamKernel<float, plat::NPUPlace>);
REGISTER_OP_MLU_KERNEL(c_sync_comm_stream,
ops::CSyncCommStreamKernel<float, plat::MLUPlace>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(mp_allreduce_sum,
ops::CAllReduceOpMLUKernel<ops::kRedSum, float>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class EqualMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_EQ,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class NotEqualMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_NE,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class LessThanMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_LT,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class LessEqualMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_LE,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class GreaterThanMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_GT,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class GreaterEqualMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_GE,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(
equal,
ops::EqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::EqualMLUKernel<plat::MLUDeviceContext, float>,
ops::EqualMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::EqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::EqualMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::EqualMLUKernel<plat::MLUDeviceContext, int>,
ops::EqualMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
not_equal,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, float>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, int>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
less_than,
ops::LessThanMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, float>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, int>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
less_equal,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, float>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, int>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
greater_than,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, float>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, int>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
greater_equal,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, float>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, int>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, bool>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T, cnnlLogicOp_t log_method>
class LogicalMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
if (log_method == CNNL_LOGIC_OP_NOT) {
y = x;
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Logic(ctx,
log_method,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(logical_not,
ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_NOT>,
ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_NOT>,
ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_NOT>,
ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_NOT>,
ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_NOT>);
REGISTER_OP_MLU_KERNEL(logical_and,
ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_AND>,
ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_AND>,
ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_AND>,
ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_AND>,
ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_AND>);
REGISTER_OP_MLU_KERNEL(logical_or,
ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_OR>,
ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_OR>,
ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_OR>,
ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_OR>,
ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_OR>);
REGISTER_OP_MLU_KERNEL(logical_xor,
ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_XOR>,
ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_XOR>,
ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_XOR>,
ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_XOR>,
ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_XOR>);
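All four logical ops above share one binary Logic entry point; logical_not reuses it by aliasing the second operand to the first (the `y = x;` line). A minimal standalone sketch of that dispatch idea (enum and function names are illustrative, not CNNL symbols):
#include <vector>
enum class LogicOp { kNot, kAnd, kOr, kXor };
// Sketch: one binary evaluator serves all ops; for NOT the second operand is
// simply aliased to the first, mirroring the deleted `y = x`.
std::vector<bool> LogicSketch(LogicOp op, const std::vector<bool>& x,
                              const std::vector<bool>& y_in) {
  const std::vector<bool>& y = (op == LogicOp::kNot) ? x : y_in;
  std::vector<bool> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    switch (op) {
      case LogicOp::kNot: out[i] = !x[i]; break;
      case LogicOp::kAnd: out[i] = x[i] && y[i]; break;
      case LogicOp::kOr:  out[i] = x[i] || y[i]; break;
      case LogicOp::kXor: out[i] = x[i] != y[i]; break;
    }
  }
  return out;
}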
......@@ -38,11 +38,6 @@ if(WITH_XPU)
detection_library(prior_box_op SRCS prior_box_op.cc)
detection_library(yolo_box_op SRCS yolo_box_op.cc)
detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
elseif(WITH_MLU)
detection_library(iou_similarity_op SRCS iou_similarity_op.cc
iou_similarity_op_mlu.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc)
detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc)
else()
detection_library(iou_similarity_op SRCS iou_similarity_op.cc
iou_similarity_op.cu)
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/iou_similarity_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
struct IouFunction {
public:
explicit IouFunction(const framework::ExecutionContext& ctx) : ctx(ctx) {
place = ctx.GetPlace();
}
void Transpose(const phi::DenseTensor* x,
phi::DenseTensor* y,
const std::vector<int>& axis) {
// y should be init first
TransposeFromMLUTensor<T>(ctx, axis, x, y, false /*need_reshape_or_alloc*/);
}
void Add(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be initialized first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnlOpTensorDesc add_op_desc(
CNNL_OP_TENSOR_ADD, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
add_op_desc.get(),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z),
ToCnnlDataType<T>());
}
void Sub(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be initialized first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnlOpTensorDesc sub_op_desc(
CNNL_OP_TENSOR_SUB, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
sub_op_desc.get(),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z),
ToCnnlDataType<T>());
}
void Mul(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z),
ToCnnlDataType<T>());
}
void DivNoNan(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
cnnlComputationPreference_t prefer = CNNL_COMPUTATION_FAST;
MLUCnnl::DivNoNan(ctx,
prefer,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z));
}
void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) {
// y should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
float alpha = 1.0;
float beta = scalar;
MLUCnnl::Transform(ctx,
&alpha,
&beta,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y));
}
void Maximum(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnl::Maximum(ctx,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z));
}
void Minimum(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnl::Minimum(ctx,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z));
}
private:
platform::Place place;
const framework::ExecutionContext& ctx;
};
template <typename T>
class IouSimilarityMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
bool normalized = ctx.Attr<bool>("box_normalized");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto _type = x->dtype();
auto place = ctx.GetPlace();
IouFunction<T> F(ctx);
auto N = x->dims()[0];
auto M = y->dims()[0];
out->mutable_data<T>({N, M}, place);
phi::DenseTensor xt(_type);
phi::DenseTensor yt(_type);
xt.mutable_data<T>({4, N}, place);
yt.mutable_data<T>({4, M}, place);
std::vector<int> vec_trans = {1, 0};
F.Transpose(x, &xt, vec_trans);
F.Transpose(y, &yt, vec_trans);
phi::DenseTensor xmin1 = xt.Slice(0, 1);
phi::DenseTensor ymin1 = xt.Slice(1, 2);
phi::DenseTensor xmax1 = xt.Slice(2, 3);
phi::DenseTensor ymax1 = xt.Slice(3, 4);
phi::DenseTensor xmin2 = yt.Slice(0, 1);
phi::DenseTensor ymin2 = yt.Slice(1, 2);
phi::DenseTensor xmax2 = yt.Slice(2, 3);
phi::DenseTensor ymax2 = yt.Slice(3, 4);
xmin1.Resize({N, 1});
ymin1.Resize({N, 1});
xmax1.Resize({N, 1});
ymax1.Resize({N, 1});
xmin2.Resize({1, M});
ymin2.Resize({1, M});
xmax2.Resize({1, M});
ymax2.Resize({1, M});
phi::DenseTensor w1(_type);
phi::DenseTensor h1(_type);
phi::DenseTensor w2(_type);
phi::DenseTensor h2(_type);
phi::DenseTensor area1(_type);
phi::DenseTensor area2(_type);
w1.mutable_data<T>({N, 1}, place);
h1.mutable_data<T>({N, 1}, place);
w2.mutable_data<T>({1, M}, place);
h2.mutable_data<T>({1, M}, place);
area1.mutable_data<T>({N, 1}, place);
area2.mutable_data<T>({1, M}, place);
F.Sub(&xmax1, &xmin1, &w1);
F.Sub(&ymax1, &ymin1, &h1);
F.Sub(&xmax2, &xmin2, &w2);
F.Sub(&ymax2, &ymin2, &h2);
if (!normalized) {
F.Adds(&w1, 1.0f, &w1);
F.Adds(&h1, 1.0f, &h1);
F.Adds(&w2, 1.0f, &w2);
F.Adds(&h2, 1.0f, &h2);
}
F.Mul(&w1, &h1, &area1);
F.Mul(&w2, &h2, &area2);
phi::DenseTensor inter_xmax(_type);
phi::DenseTensor inter_ymax(_type);
phi::DenseTensor inter_xmin(_type);
phi::DenseTensor inter_ymin(_type);
inter_xmax.mutable_data<T>({N, M}, place);
inter_ymax.mutable_data<T>({N, M}, place);
inter_xmin.mutable_data<T>({N, M}, place);
inter_ymin.mutable_data<T>({N, M}, place);
F.Minimum(&xmax1, &xmax2, &inter_xmax);
F.Minimum(&ymax1, &ymax2, &inter_ymax);
F.Maximum(&xmin1, &xmin2, &inter_xmin);
F.Maximum(&ymin1, &ymin2, &inter_ymin);
phi::DenseTensor inter_w(_type);
phi::DenseTensor inter_h(_type);
inter_w.mutable_data<T>({N, M}, place);
inter_h.mutable_data<T>({N, M}, place);
F.Sub(&inter_xmax, &inter_xmin, &inter_w);
F.Sub(&inter_ymax, &inter_ymin, &inter_h);
if (!normalized) {
F.Adds(&inter_w, 1.0f, &inter_w);
F.Adds(&inter_h, 1.0f, &inter_h);
}
phi::DenseTensor zeros(_type);
zeros.mutable_data<T>({1}, place);
FillMLUTensorWithHostValue<T>(ctx, static_cast<T>(0), &zeros);
F.Maximum(&inter_w, &zeros, &inter_w);
F.Maximum(&inter_h, &zeros, &inter_h);
F.Mul(&inter_w, &inter_h, out);
phi::DenseTensor union_area(_type);
union_area.mutable_data<T>({N, M}, place);
F.Add(&area1, &area2, &union_area);
F.Sub(&union_area, out, &union_area);
F.DivNoNan(out, &union_area, out);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(iou_similarity,
ops::IouSimilarityMLUKernel<float>,
ops::IouSimilarityMLUKernel<plat::float16>);
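The kernel above assembles pairwise IoU from transpose/sub/mul/min/max/div-no-nan primitives; the math is easier to read in a direct standalone form (boxes as [xmin, ymin, xmax, ymax]; the +1 offset mirrors the non-normalized branch; this is a sketch, not the Paddle implementation):
#include <algorithm>
#include <array>
#include <vector>
// Pairwise IoU between N and M boxes, mirroring the deleted MLU kernel:
// areas, clamped intersection, union = a1 + a2 - inter, and a 0/0 -> 0 rule
// that plays the role of DivNoNan.
std::vector<std::vector<float>> IouSimilaritySketch(
    const std::vector<std::array<float, 4>>& x,
    const std::vector<std::array<float, 4>>& y,
    bool normalized) {
  const float off = normalized ? 0.f : 1.f;
  std::vector<std::vector<float>> out(x.size(),
                                      std::vector<float>(y.size(), 0.f));
  for (size_t i = 0; i < x.size(); ++i) {
    const float a1 = (x[i][2] - x[i][0] + off) * (x[i][3] - x[i][1] + off);
    for (size_t j = 0; j < y.size(); ++j) {
      const float a2 = (y[j][2] - y[j][0] + off) * (y[j][3] - y[j][1] + off);
      const float iw = std::max(
          0.f, std::min(x[i][2], y[j][2]) - std::max(x[i][0], y[j][0]) + off);
      const float ih = std::max(
          0.f, std::min(x[i][3], y[j][3]) - std::max(x[i][1], y[j][1]) + off);
      const float inter = iw * ih;
      const float uni = a1 + a2 - inter;
      out[i][j] = (uni == 0.f) ? 0.f : inter / uni;
    }
  }
  return out;
}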
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/detection/prior_box_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class PriorBoxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto* image = ctx.Input<phi::DenseTensor>("Image");
auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
auto* variances = ctx.Output<phi::DenseTensor>("Variances");
float step_w = ctx.Attr<float>("step_w");
float step_h = ctx.Attr<float>("step_h");
float offset = ctx.Attr<float>("offset");
bool clip = ctx.Attr<bool>("clip");
bool min_max_aspect_ratios_order =
ctx.Attr<bool>("min_max_aspect_ratios_order");
int im_width = image->dims()[3];
int im_height = image->dims()[2];
int width = input->dims()[3];
int height = input->dims()[2];
auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
bool flip = ctx.Attr<bool>("flip");
std::vector<float> new_aspect_ratios;
ExpandAspectRatios(aspect_ratios, flip, &new_aspect_ratios);
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
phi::DenseTensor ratios;
paddle::framework::TensorFromVector(new_aspect_ratios, dev_ctx, &ratios);
MLUOpTensorDesc new_aspect_ratios_desc(ratios);
auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
phi::DenseTensor min;
paddle::framework::TensorFromVector(min_sizes, dev_ctx, &min);
MLUOpTensorDesc min_sizes_desc(min);
auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
phi::DenseTensor max;
paddle::framework::TensorFromVector(max_sizes, dev_ctx, &max);
MLUOpTensorDesc max_sizes_desc(max);
auto variances_attr = ctx.Attr<std::vector<float>>("variances");
phi::DenseTensor var_tensor;
paddle::framework::TensorFromVector(variances_attr, dev_ctx, &var_tensor);
MLUOpTensorDesc variances_attr_desc(var_tensor);
auto place = ctx.GetPlace();
boxes->mutable_data<T>(place);
variances->mutable_data<T>(place);
MLUOpTensorDesc var_desc(*variances);
MLUOpTensorDesc output_desc(*boxes);
MLUOP::OpPriorBox(ctx,
min_sizes_desc.get(),
GetBasePtr(&min),
new_aspect_ratios_desc.get(),
GetBasePtr(&ratios),
variances_attr_desc.get(),
GetBasePtr(&var_tensor),
max_sizes_desc.get(),
GetBasePtr(&max),
height,
width,
im_height,
im_width,
step_h,
step_w,
offset,
clip,
min_max_aspect_ratios_order,
output_desc.get(),
GetBasePtr(boxes),
var_desc.get(),
GetBasePtr(variances));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(prior_box, ops::PriorBoxMLUKernel<float>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class YoloBoxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* img_size = ctx.Input<phi::DenseTensor>("ImgSize");
auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
auto* scores = ctx.Output<phi::DenseTensor>("Scores");
const std::vector<int> anchors = ctx.Attr<std::vector<int>>("anchors");
auto class_num = ctx.Attr<int>("class_num");
auto conf_thresh = ctx.Attr<float>("conf_thresh");
auto downsample_ratio = ctx.Attr<int>("downsample_ratio");
auto clip_bbox = ctx.Attr<bool>("clip_bbox");
auto scale = ctx.Attr<float>("scale_x_y");
auto iou_aware = ctx.Attr<bool>("iou_aware");
auto iou_aware_factor = ctx.Attr<float>("iou_aware_factor");
int anchor_num = anchors.size() / 2;
int64_t size = anchors.size();
auto dim_x = x->dims();
int n = dim_x[0];
int s = anchor_num;
int h = dim_x[2];
int w = dim_x[3];
// The output of mluOpYoloBox: A 4-D tensor with shape [N, anchor_num, 4,
// H*W], the coordinates of boxes, and a 4-D tensor with shape [N,
// anchor_num, :attr:`class_num`, H*W], the classification scores of boxes.
std::vector<int64_t> boxes_dim_mluops({n, s, 4, h * w});
std::vector<int64_t> scores_dim_mluops({n, s, class_num, h * w});
// In Paddle framework: A 3-D tensor with shape [N, M, 4], the coordinates
// of boxes, and a 3-D tensor with shape [N, M, :attr:`class_num`], the
// classification scores of boxes.
std::vector<int64_t> boxes_out_dim({n, s, h * w, 4});
std::vector<int64_t> scores_out_dim({n, s, h * w, class_num});
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
phi::DenseTensor boxes_tensor_mluops =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, 4, h * w}, dev_ctx);
phi::DenseTensor scores_tensor_mluops =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, class_num, h * w},
dev_ctx);
MLUOpTensorDesc boxes_trans_desc_mluops(
4, boxes_dim_mluops.data(), ToMluOpDataType<T>());
MLUCnnlTensorDesc boxes_trans_desc_cnnl(
4, boxes_dim_mluops.data(), ToCnnlDataType<T>());
MLUOpTensorDesc scores_trans_desc_mluops(
4, scores_dim_mluops.data(), ToMluOpDataType<T>());
MLUCnnlTensorDesc scores_trans_desc_cnnl(
4, scores_dim_mluops.data(), ToCnnlDataType<T>());
boxes->mutable_data<T>(ctx.GetPlace());
scores->mutable_data<T>(ctx.GetPlace());
FillMLUTensorWithHostValue(ctx, static_cast<T>(0), boxes);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0), scores);
MLUOpTensorDesc x_desc(*x, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<T>());
MLUOpTensorDesc img_size_desc(
*img_size, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<int32_t>());
phi::DenseTensor anchors_temp(framework::TransToPhiDataType(VT::INT32));
anchors_temp.Resize({size});
paddle::framework::TensorFromVector(
anchors, ctx.device_context(), &anchors_temp);
MLUOpTensorDesc anchors_desc(anchors_temp);
MLUCnnlTensorDesc boxes_desc_cnnl(
4, boxes_out_dim.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc scores_desc_cnnl(
4, scores_out_dim.data(), ToCnnlDataType<T>());
MLUOP::OpYoloBox(ctx,
x_desc.get(),
GetBasePtr(x),
img_size_desc.get(),
GetBasePtr(img_size),
anchors_desc.get(),
GetBasePtr(&anchors_temp),
class_num,
conf_thresh,
downsample_ratio,
clip_bbox,
scale,
iou_aware,
iou_aware_factor,
boxes_trans_desc_mluops.get(),
GetBasePtr(&boxes_tensor_mluops),
scores_trans_desc_mluops.get(),
GetBasePtr(&scores_tensor_mluops));
const std::vector<int> perm = {0, 1, 3, 2};
// transpose the boxes from [N, S, 4, H*W] to [N, S, H*W, 4]
MLUCnnl::Transpose(ctx,
perm,
4,
boxes_trans_desc_cnnl.get(),
GetBasePtr(&boxes_tensor_mluops),
boxes_desc_cnnl.get(),
GetBasePtr(boxes));
// transpose the scores from [N, S, class_num, H*W] to [N, S, H*W,
// class_num]
MLUCnnl::Transpose(ctx,
perm,
4,
scores_trans_desc_cnnl.get(),
GetBasePtr(&scores_tensor_mluops),
scores_desc_cnnl.get(),
GetBasePtr(scores));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(yolo_box, ops::YoloBoxMLUKernel<float>);
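The final transposes above adapt the mluOpYoloBox layout [N, S, C, H*W] to Paddle's expected [N, S, H*W, C]; a standalone sketch of that last-two-axes swap on a flat buffer (an illustrative helper, not a Paddle or CNNL API):
#include <vector>
// Sketch of the post-processing step of the deleted yolo_box kernel: swap the
// last two axes (perm = {0, 1, 3, 2}) so [N, S, C, HW] becomes [N, S, HW, C].
std::vector<float> SwapLastTwoAxes(const std::vector<float>& in,
                                   int n, int s, int c, int hw) {
  std::vector<float> out(in.size());
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < s; ++j)
      for (int k = 0; k < c; ++k)
        for (int l = 0; l < hw; ++l)
          out[((i * s + j) * hw + l) * c + k] =
              in[((i * s + j) * c + k) * hw + l];
  return out;
}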
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseAddMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_ADD);
}
};
template <typename T>
class ElementwiseAddGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
MLUCnnlTensorDesc dout_desc(*dout);
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
GetReduceAxesAndDstDims(
axis, dout->dims(), dx->dims(), &reduce_axes, &dst_dims_vec);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(
dst_dims_vec.size(), dst_dims_vec.data(), ToCnnlDataType<T>());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(dout),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
} else {
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
}
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
GetReduceAxesAndDstDims(
axis, dout->dims(), dy->dims(), &reduce_axes, &dst_dims_vec);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(
dst_dims_vec.size(), dst_dims_vec.data(), ToCnnlDataType<T>());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(dout),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
} else {
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dy);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_add,
ops::ElementwiseAddMLUKernel<float>,
ops::ElementwiseAddMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_add_grad,
ops::ElementwiseAddGradMLUKernel<float>,
ops::ElementwiseAddGradMLUKernel<plat::float16>);
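The grad path above sums dout over the axes that were broadcast in the forward pass; that is what GetReduceAxesAndDstDims feeds to MLUCnnl::Reduce. A standalone sketch of which axes get reduced, assuming dx's dims align with dout's dims starting at `axis` (a simplification of the real helper, names are illustrative):
#include <vector>
// Sketch: given dout dims and the (smaller) dx dims aligned at `axis`, return
// the axes of dout that must be sum-reduced to produce a gradient of dx's
// shape: leading axes before `axis`, trailing axes past the end of dx, and
// any aligned axis where dx has extent 1 but dout does not.
std::vector<int> BroadcastGradReduceAxes(const std::vector<int>& dout_dims,
                                         const std::vector<int>& dx_dims,
                                         int axis) {
  std::vector<int> reduce_axes;
  const int start = axis;
  const int end = axis + static_cast<int>(dx_dims.size());
  for (int i = 0; i < static_cast<int>(dout_dims.size()); ++i) {
    if (i < start || i >= end ||
        (dx_dims[i - start] == 1 && dout_dims[i] != 1)) {
      reduce_axes.push_back(i);
    }
  }
  return reduce_axes;
}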
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseDivMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUBinaryOp<DIV, T>(ctx);
}
};
template <typename T>
class ElementwiseDivGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
: axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
// compute dout/y == 1/y * dout
phi::DenseTensor dout_div_y(dout->dtype());
dout_div_y.Resize(dout->dims());
dout_div_y.mutable_data<T>(ctx.GetPlace());
MLUBinary<DIV>(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
dout_desc.get(),
GetBasePtr(dout),
y_desc.get(),
GetBasePtr(y),
dout_desc.get(),
GetBasePtr(&dout_div_y));
if (dx) {
// compute dx = dout/y = 1/y * dout
if (dx->dims() != dout->dims()) {
dx->mutable_data<T>(ctx.GetPlace());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dout_div_y.dims(), dx->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dout_div_y),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
} else {
dx->ShareDataWith(dout_div_y);
}
}
if (dy) {
// compute dy = -out * (dout/y) = -out/y * dout
phi::DenseTensor neg_out(out->type());
neg_out.mutable_data<T>(out->dims(), ctx.GetPlace());
MLUCnnlTensorDesc out_desc(*out);
MLUUnary<NEG>(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
out_desc.get(),
GetBasePtr(out),
out_desc.get(),
GetBasePtr(&neg_out));
phi::DenseTensor dy_temp(y->dtype());
dy_temp.Resize(dout->dims());
dy_temp.mutable_data<T>(ctx.GetPlace());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(&neg_out),
dout_desc.get(),
GetBasePtr(&dout_div_y),
dout_desc.get(),
GetBasePtr(&dy_temp),
ToCnnlDataType<T>());
if (dy->dims() != dout->dims()) {
dy->mutable_data<T>(ctx.GetPlace());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dy_temp),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
} else {
dy->ShareDataWith(dy_temp);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_div,
ops::ElementwiseDivMLUKernel<int>,
ops::ElementwiseDivMLUKernel<float>,
ops::ElementwiseDivMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_div_grad,
ops::ElementwiseDivGradMLUKernel<int>,
ops::ElementwiseDivGradMLUKernel<float>,
ops::ElementwiseDivGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseMaxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUBinaryOp<MAXIMUM, T>(ctx);
}
};
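// The gradient routing for max/min is shared with elementwise_min and lives in
// MLUMinMaxGradHelper (see elementwise_mlu.h).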
template <typename T>
class ElementwiseMaxGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUMinMaxGradHelper<MAXIMUM_GRAD, T>(ctx);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(elementwise_max,
ops::ElementwiseMaxMLUKernel<int>,
ops::ElementwiseMaxMLUKernel<float>,
ops::ElementwiseMaxMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
elementwise_max_grad,
ops::ElementwiseMaxGradMLUKernel<int>,
ops::ElementwiseMaxGradMLUKernel<float>,
ops::ElementwiseMaxGradMLUKernel<paddle::platform::float16>);
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseMinMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUBinaryOp<MINIMUM, T>(ctx);
}
};
template <typename T>
class ElementwiseMinGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUMinMaxGradHelper<MINIMUM_GRAD, T>(ctx);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_min,
ops::ElementwiseMinMLUKernel<int>,
ops::ElementwiseMinMLUKernel<float>,
ops::ElementwiseMinMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_min_grad,
ops::ElementwiseMinGradMLUKernel<int>,
ops::ElementwiseMinGradMLUKernel<float>,
ops::ElementwiseMinGradMLUKernel<plat::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_MLU
#include <vector>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
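// Collect the axes along which `target_ddims` was broadcast up to `src_ddims`;
// a gradient computed at the broadcast shape must be reduce-summed over these
// axes before being written back to the smaller tensor.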
inline void GetReduceAxes(const int axis,
const framework::DDim& src_ddims,
const framework::DDim& target_ddims,
std::vector<int>* axes) {
int64_t src_dim_size = src_ddims.size();
int64_t target_dim_size = target_ddims.size();
for (int64_t i = 0; i < src_dim_size; ++i) {
if (i < axis || i >= target_dim_size + axis) {
axes->push_back(i);
continue;
}
if (src_ddims[i] > target_ddims[i - axis]) {
axes->push_back(i);
}
}
}
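// Like GetReduceAxes, but also returns the destination dims so the reduced
// result can be described directly.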
inline void GetReduceAxesAndDstDims(const int axis,
const framework::DDim& src_ddims,
const framework::DDim& target_ddims,
std::vector<int>* reduce_axes,
std::vector<int>* dst_dims_vec) {
int64_t src_dim_size = src_ddims.size();
int64_t target_dim_size = target_ddims.size();
int src_axis = (target_dim_size < src_dim_size ? axis : 0);
for (int ax = 0; ax < src_dim_size; ++ax) {
if ((ax < src_axis || ax >= src_axis + target_dim_size) ||
(src_ddims[ax] > 1 && target_ddims[ax - src_axis] == 1)) {
reduce_axes->push_back(ax);
} else {
dst_dims_vec->push_back(src_ddims[ax]);
}
}
if (dst_dims_vec->size() == 0) {
// target_var is scalar
dst_dims_vec->push_back(1);
}
}
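// Forward helper for the broadcastable ADD / SUB / MUL ops: aligns the two
// input shapes, builds CNNL descriptors and issues a single MLUCnnl::OpTensor call.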
template <typename T>
void MLUOpTensorKernel(const framework::ExecutionContext& ctx,
const cnnlOpTensorDesc_t op_tensor_op) {
PADDLE_ENFORCE_EQ(
platform::is_mlu_place(ctx.GetPlace()),
true,
platform::errors::Unavailable("This kernel only runs on MLU."));
PADDLE_ENFORCE_EQ((op_tensor_op == CNNL_OP_TENSOR_ADD) ||
(op_tensor_op == CNNL_OP_TENSOR_SUB) ||
(op_tensor_op == CNNL_OP_TENSOR_MUL),
true,
platform::errors::Unavailable(
"This kernel of MLU only support ADD, SUB, MUL."));
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis =
(axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*out);
MLUCnnlOpTensorDesc op_tensor_desc(
op_tensor_op, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(out),
ToCnnlDataType<T>());
}
// ------------------ BinaryOp -----------------
enum BINARY_FUNCTOR {
DIV,
DIVNONAN,
MAXIMUM,
MINIMUM,
POW,
};
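// Dispatch layer: each specialization below maps a BINARY_FUNCTOR value to the
// matching MLUCnnl wrapper (Div, Maximum, Minimum, Pow).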
template <BINARY_FUNCTOR func>
void MLUBinary(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t out_desc,
void* out);
template <>
inline void MLUBinary<DIV>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t out_desc,
void* out) {
MLUCnnl::Div(ctx, prefer, x_desc, x, y_desc, y, out_desc, out);
}
template <>
inline void MLUBinary<MAXIMUM>(
const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,  // unused; kept only to match the binary-op signature
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t out_desc,
void* out) {
MLUCnnl::Maximum(ctx, x_desc, x, y_desc, y, out_desc, out);
}
template <>
inline void MLUBinary<MINIMUM>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t in1_desc,
const void* in1,
const cnnlTensorDescriptor_t in2_desc,
const void* in2,
const cnnlTensorDescriptor_t out_desc,
void* out) {
MLUCnnl::Minimum(ctx, in1_desc, in1, in2_desc, in2, out_desc, out);
}
template <>
inline void MLUBinary<POW>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t out_desc,
void* out) {
MLUCnnl::Pow(ctx, prefer, x_desc, x, y_desc, y, out_desc, out);
}
template <BINARY_FUNCTOR Functor, typename T>
void MLUBinaryOp(const framework::ExecutionContext& ctx) {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis =
(axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION;
MLUBinary<Functor>(ctx,
prefer_type,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(out));
}
// ------------------ UnaryOp -----------------
enum UNARY_FUNCTOR {
NEG,
RECIPROCAL,
};
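// Unary counterpart: NEG -> MLUCnnl::Neg, RECIPROCAL -> MLUCnnl::Reciprocal.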
template <UNARY_FUNCTOR func>
void MLUUnary(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
template <>
inline void MLUUnary<NEG>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output) {
MLUCnnl::Neg(ctx, input_desc, input, output_desc, output);
}
template <>
inline void MLUUnary<RECIPROCAL>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output) {
MLUCnnl::Reciprocal(ctx, input_desc, input, output_desc, output);
}
template <UNARY_FUNCTOR Functor, typename Tin, typename Tout = Tin>
void MLUUnaryOp(const framework::ExecutionContext& ctx) {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<Tout>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType<Tin>());
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<Tout>());
cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION;
MLUUnary<Functor>(ctx,
prefer_type,
x_desc.get(),
GetBasePtr(x),
out_desc.get(),
GetBasePtr(out));
}
// ------------------ MLUElementwiseGradOp -----------------
enum MINMAX_GRAD_FUNCTOR {
MAXIMUM_GRAD,
MINIMUM_GRAD,
};
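// Gradient for z = max(x, y) (or min): build a boolean-style mask with
// Logic(x, y) (GE for MAXIMUM_GRAD, LE for MINIMUM_GRAD), then
//   dx = dout * mask
//   dy = dout - dx
// and reduce-sum over broadcast axes where an input is smaller than dout.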
template <MINMAX_GRAD_FUNCTOR Functor, typename Tin, typename Tout = Tin>
void MLUMinMaxGradHelper(const framework::ExecutionContext& ctx) {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis =
(axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
// mask = Logic(x, y): GE for MAXIMUM_GRAD, LE for MINIMUM_GRAD
cnnlLogicOp_t logic =
Functor == MAXIMUM_GRAD ? CNNL_LOGIC_OP_GE : CNNL_LOGIC_OP_LE;
phi::DenseTensor mask(x->dtype());
mask.Resize(phi::make_ddim(out_dims_array));
mask.mutable_data<Tin>(ctx.GetPlace());
cnnlDataType_t data_type = ToCnnlDataType<Tin>();
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), data_type);
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), data_type);
MLUCnnlTensorDesc mask_desc(max_dim, out_dims_array.data(), data_type);
MLUCnnl::Logic(ctx,
logic,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
mask_desc.get(),
GetBasePtr(&mask));
// dx = Mul(dz, mask)
phi::DenseTensor dx_temp(x->dtype());
dx_temp.Resize(dout->dims());
dx_temp.mutable_data<Tout>(ctx.GetPlace());
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
dout_desc.get(),
GetBasePtr(&mask),
dout_desc.get(),
GetBasePtr(&dx_temp),
data_type);
// dy = Sub(dz, dx)
phi::DenseTensor dy_temp(y->dtype());
dy_temp.Resize(dout->dims());
dy_temp.mutable_data<Tout>(ctx.GetPlace());
MLUCnnlOpTensorDesc sub_op_desc(
CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
sub_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
dout_desc.get(),
GetBasePtr(&dx_temp),
dout_desc.get(),
GetBasePtr(&dy_temp),
data_type);
if (dx) {
if (dx->dims() != dout->dims()) {
dx->mutable_data<Tout>(ctx.GetPlace());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dx_temp.dims(), dx->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
data_type,
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dx_temp),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
} else {
dx->ShareDataWith(dx_temp);
}
}
if (dy) {
if (dy->dims() != dout->dims()) {
dy->mutable_data<Tout>(ctx.GetPlace());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
data_type,
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dy_temp),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
} else {
dy->ShareDataWith(dy_temp);
}
}
}
} // namespace operators
} // namespace paddle
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
using MLUDeviceContext = platform::MLUDeviceContext;
template <typename T>
class ElementwiseMulMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_MUL);
}
};
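// Gradient of z = x * y:
//   dx = dout * y,  dy = dout * x
// reduce-summed over broadcast axes when the input is smaller than dout.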
template <typename T>
class ElementwiseMulGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
: axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() == dout->dims()) {
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
y_desc.get(),
GetBasePtr(y),
x_desc.get(),
GetBasePtr(dx),
ToCnnlDataType<T>());
} else {
phi::DenseTensor dx_temp(x->dtype());
dx_temp.Resize(dout->dims());
dx_temp.mutable_data<T>(ctx.GetPlace());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
y_desc.get(),
GetBasePtr(y),
dout_desc.get(),
GetBasePtr(&dx_temp),
ToCnnlDataType<T>());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dx_temp.dims(), dx->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dx_temp),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
}
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() == dout->dims()) {
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(dy),
ToCnnlDataType<T>());
} else {
phi::DenseTensor dy_temp(y->dtype());
dy_temp.Resize(dout->dims());
dy_temp.mutable_data<T>(ctx.GetPlace());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
dout_desc.get(),
GetBasePtr(&dy_temp),
ToCnnlDataType<T>());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dy_temp),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(elementwise_mul,
ops::ElementwiseMulMLUKernel<float>,
ops::ElementwiseMulMLUKernel<paddle::platform::float16>,
ops::ElementwiseMulMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(
elementwise_mul_grad,
ops::ElementwiseMulGradMLUKernel<float>,
ops::ElementwiseMulGradMLUKernel<paddle::platform::float16>,
ops::ElementwiseMulGradMLUKernel<int>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwisePowMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUBinaryOp<POW, T>(ctx);
}
};
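// Gradient of z = x^y:
//   dx = dout * y * x^(y - 1)
//   dy = dout * ln(x) * x^y
// both reduce-summed over broadcast axes when needed.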
template <typename T>
class ElementwisePowGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
auto place = ctx.GetPlace();
auto x_dims = x->dims();
auto y_dims = y->dims();
axis =
(axis < 0 ? std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
cnnlDataType_t data_type = ToCnnlDataType<T>();
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), data_type);
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), data_type);
MLUCnnlTensorDesc out_desc(max_dim, out_dims_array.data(), data_type);
auto dout_dims = dout->dims();
if (dx) {
// dx = dout * y * pow(x, y - 1);
phi::DenseTensor one_dx(y->type());
one_dx.mutable_data<T>(phi::make_ddim(y_dims_array), place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(1), &one_dx);
phi::DenseTensor sub_dx(y->type());
sub_dx.mutable_data<T>(phi::make_ddim(y_dims_array), place);
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
y_desc.get(),
GetBasePtr(y),
y_desc.get(),
GetBasePtr(&one_dx),
y_desc.get(),
GetBasePtr(&sub_dx),
data_type);
phi::DenseTensor tmp_dx(x->type());
tmp_dx.mutable_data<T>(phi::make_ddim(out_dims_array), place);
MLUCnnl::Pow(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(&sub_dx),
out_desc.get(),
GetBasePtr(&tmp_dx));
MLUCnnl::MulAx(ctx,
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(&tmp_dx));
MLUCnnl::MulAx(ctx,
out_desc.get(),
GetBasePtr(dout),
out_desc.get(),
GetBasePtr(&tmp_dx));
if (x_dims != dout_dims) {
dx->mutable_data<T>(place);
std::vector<int> reduce_axes;
GetReduceAxes(axis, dout_dims, x_dims, &reduce_axes);
if (!reduce_axes.empty()) {
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
data_type,
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
out_desc.get(),
GetBasePtr(&tmp_dx),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
}
} else {
dx->ShareDataWith(tmp_dx);
}
}
if (dy) {
// dy = dout * log(x) * pow(x, y)
phi::DenseTensor tmp_dy(y->type());
tmp_dy.mutable_data<T>(phi::make_ddim(out_dims_array), place);
MLUCnnl::Pow(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(&tmp_dy));
phi::DenseTensor log_x(x->type());
log_x.mutable_data<T>(x->dims(), place);
MLUCnnl::Log(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
CNNL_LOG_E,
x_desc.get(),
GetBasePtr(x),
x_desc.get(),
GetBasePtr(&log_x));
MLUCnnl::MulAx(ctx,
x_desc.get(),
GetBasePtr(&log_x),
out_desc.get(),
GetBasePtr(&tmp_dy));
MLUCnnl::MulAx(ctx,
out_desc.get(),
GetBasePtr(dout),
out_desc.get(),
GetBasePtr(&tmp_dy));
if (y_dims != dout_dims) {
dy->mutable_data<T>(place);
std::vector<int> reduce_axes;
GetReduceAxes(axis, dout_dims, y_dims, &reduce_axes);
if (!reduce_axes.empty()) {
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
data_type,
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
out_desc.get(),
GetBasePtr(&tmp_dy),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
}
} else {
dy->ShareDataWith(tmp_dy);
}
}
if (!dx && !dy) {
PADDLE_THROW(platform::errors::Unavailable(
"Not support all outputs to be empty."));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_pow,
ops::ElementwisePowMLUKernel<plat::float16>,
ops::ElementwisePowMLUKernel<float>);
REGISTER_OP_MLU_KERNEL(elementwise_pow_grad,
ops::ElementwisePowGradMLUKernel<plat::float16>,
ops::ElementwisePowGradMLUKernel<float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseSubMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_SUB);
}
};
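// Gradient of z = x - y:
//   dx = dout,  dy = -dout
// each reduce-summed over the axes that were broadcast.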
template <typename T>
class ElementwiseSubGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
MLUCnnlTensorDesc dout_desc(*dout);
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
GetReduceAxesAndDstDims(
axis, dout->dims(), dx->dims(), &reduce_axes, &dst_dims_vec);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(
dst_dims_vec.size(), dst_dims_vec.data(), ToCnnlDataType<T>());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(dout),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
} else {
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
}
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor* tmp_dout = const_cast<phi::DenseTensor*>(dout);
if (dy->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
GetReduceAxesAndDstDims(
axis, dout->dims(), dy->dims(), &reduce_axes, &dst_dims_vec);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(
dst_dims_vec.size(), dst_dims_vec.data(), ToCnnlDataType<T>());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(dout),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
tmp_dout = dy;
}
// call neg op, dy = -dout
MLUCnnlTensorDesc tmp_dout_desc(*tmp_dout);
MLUCnnlTensorDesc dy_desc(*dy);
MLUUnary<NEG>(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
tmp_dout_desc.get(),
GetBasePtr(tmp_dout),
dy_desc.get(),
GetBasePtr(dy));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_sub,
ops::ElementwiseSubMLUKernel<int>,
ops::ElementwiseSubMLUKernel<float>,
ops::ElementwiseSubMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_sub_grad,
ops::ElementwiseSubGradMLUKernel<int>,
ops::ElementwiseSubGradMLUKernel<float>,
ops::ElementwiseSubGradMLUKernel<plat::float16>);
......@@ -43,13 +43,6 @@ inline std::vector<int> get_expand_shape(
*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>();
}
#endif
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(shape_tensor->place())) {
paddle::framework::TensorCopySync(
*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>();
}
#endif
auto vec_shape =
std::vector<int>(shape_data, shape_data + shape_tensor->numel());
......@@ -74,13 +67,6 @@ inline std::vector<int> get_expand_shape(
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
vec_epxand_shape.push_back(*temp.data<int32_t>());
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(tensor->place())) { // NOLINT
phi::DenseTensor temp;
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
vec_epxand_shape.push_back(*temp.data<int32_t>());
}
#endif
else { // NOLINT
vec_epxand_shape.push_back(*tensor->data<int32_t>());
......
......@@ -6,11 +6,7 @@ if(WITH_XPU)
endif()
# please add new math_library in alphabetical order
if(WITH_MLU)
math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop)
else()
math_library(concat_and_split DEPS concat_and_split_functor)
endif()
math_library(concat_and_split DEPS concat_and_split_functor)
math_library(context_project DEPS im2col math_function)
math_library(cos_sim_functor)
math_library(depthwise_conv)
......
......@@ -17,9 +17,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
......@@ -181,100 +178,6 @@ class SplitFunctor<platform::XPUDeviceContext, T> {
};
#endif
#ifdef PADDLE_WITH_MLU
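// MLU specializations of ConcatFunctor / SplitFunctor, implemented with the
// MLUCnnl::Concat and MLUCnnl::Split wrappers.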
template <typename T>
class ConcatFunctor<platform::MLUDeviceContext, T> {
public:
void operator()(const platform::MLUDeviceContext& context,
const std::vector<phi::DenseTensor>& input,
int axis,
phi::DenseTensor* output) {
int dev_id = context.GetPlace().GetDeviceId();
platform::MLUDeviceGuard guard(dev_id);
auto ins_size = input.size();
const int axis_t = axis;
const int ins_size_t = ins_size;
// build a CNNL tensor descriptor and a raw data pointer for every input
std::vector<const void*> inputs;
std::vector<MLUCnnlTensorDesc> input_descs;
std::vector<cnnlTensorDescriptor_t> desc_vector;
for (size_t i = 0; i < ins_size; i++) {
input_descs.emplace_back(MLUCnnlTensorDesc(
input[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(input[i].dtype())));
desc_vector.push_back(input_descs.back().get());
inputs.push_back(input[i].data());
}
// init out tensors
MLUCnnlTensorDesc output_desc(
*output, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output->dtype()));
// concatenate all inputs along `axis_t` with a single MLUCnnl::Concat call
MLUCnnl::Concat(context,
ins_size_t,
axis_t,
desc_vector.data(),
inputs.data(),
output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class SplitFunctor<platform::MLUDeviceContext, T> {
public:
void operator()(const platform::MLUDeviceContext& context,
const phi::DenseTensor& input,
const std::vector<const phi::DenseTensor*>& ref_inputs,
const int axis,
std::vector<phi::DenseTensor*>* outputs) {
if (input.numel() == 0) {
return;
}
int dev_id = context.GetPlace().GetDeviceId();
platform::MLUDeviceGuard guard(dev_id);
auto in_dims = input.dims();
auto out_size = outputs->size();
std::vector<framework::DDim> outs_dims(out_size, in_dims);
for (size_t i = 0; i < out_size; ++i) {
outs_dims[i][axis] = ref_inputs[i]->dims()[axis];
}
// init out tensors
std::vector<void*> vct_tensor;
std::vector<MLUCnnlTensorDesc> output_descs;
std::vector<cnnlTensorDescriptor_t> desc_vector;
for (size_t i = 0; i < out_size; i++) {
(*outputs)[i]->Resize(outs_dims[i]);
output_descs.emplace_back(
MLUCnnlTensorDesc(*(*outputs)[i],
CNNL_LAYOUT_ARRAY,
ToCnnlDataType((*outputs)[i]->dtype())));
desc_vector.push_back(output_descs.back().get());
vct_tensor.push_back(GetBasePtr((*outputs)[i]));
}
// init in tensors
MLUCnnlTensorDesc input_desc(
input, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input.dtype()));
// split `input` along `axis` into the pre-sized output tensors
MLUCnnl::Split(context,
out_size,
axis,
input_desc.get(),
input.data(),
desc_vector.data(),
vct_tensor.data());
}
};
#endif
#define DEFINE_FUNCTOR(type) \
template class ConcatFunctor<phi::CPUContext, type>; \
template class SplitFunctor<phi::CPUContext, type>;
......@@ -289,20 +192,6 @@ FOR_ALL_TYPES(DEFINE_FUNCTOR);
DEFINE_XPU_FUNCTOR(float)
DEFINE_XPU_FUNCTOR(platform::float16)
#endif
#ifdef PADDLE_WITH_MLU
#define DEFINE_MLU_FUNCTOR(type) \
template class ConcatFunctor<platform::MLUDeviceContext, type>; \
template class SplitFunctor<platform::MLUDeviceContext, type>;
DEFINE_MLU_FUNCTOR(float)
DEFINE_MLU_FUNCTOR(platform::float16)
DEFINE_MLU_FUNCTOR(int64_t)
DEFINE_MLU_FUNCTOR(bool)
DEFINE_MLU_FUNCTOR(int)
DEFINE_MLU_FUNCTOR(int8_t)
DEFINE_MLU_FUNCTOR(int16_t)
DEFINE_MLU_FUNCTOR(uint8_t)
#endif
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
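// Top-k accuracy: a sample counts as correct when any of its predicted
// indices equals the label; correct = sum_i max_k(indices[i][k] == label[i]),
// total = num_samples, accuracy = correct / total.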
template <typename T>
class AccuracyMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* indices = ctx.Input<phi::DenseTensor>("Indices");
auto* label = ctx.Input<phi::DenseTensor>("Label");
auto* accuracy = ctx.Output<phi::DenseTensor>("Accuracy");
auto* correct = ctx.Output<phi::DenseTensor>("Correct");
auto* total = ctx.Output<phi::DenseTensor>("Total");
int num_samples = indices->dims()[0];
if (num_samples == 0) {
return;
}
// cast `indices` or `label` if their type is not INT32
phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32));
phi::DenseTensor label_int32(framework::TransToPhiDataType(VT::INT32));
auto indices_type = framework::TransToProtoVarType(indices->type());
if (indices_type != VT::INT32) {
PADDLE_ENFORCE_EQ(MLUSupportsCast(indices_type, VT::INT32),
true,
platform::errors::Unimplemented(
"In accuracy mlu kernel, cast indices from [%s] to "
"[%s] is not supported.",
framework::DataTypeToString(indices_type),
framework::DataTypeToString(VT::INT32)));
indices_int32.Resize(indices->dims());
indices_int32.mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc org_indices_desc(*indices);
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
cnnlCastDataType_t cast_type = GetCastDataType(indices_type, VT::INT32);
MLUCnnl::Cast(ctx,
cast_type,
org_indices_desc.get(),
GetBasePtr(indices),
indices_int32_desc.get(),
GetBasePtr(&indices_int32));
} else {
indices_int32.ShareDataWith(*indices);
}
auto label_type = framework::TransToProtoVarType(label->type());
if (label_type != VT::INT32) {
PADDLE_ENFORCE_EQ(
MLUSupportsCast(label_type, VT::INT32),
true,
platform::errors::Unimplemented(
"In accuracy mlu kernel, cast label from [%s] to [%s] "
"is not supported.",
framework::DataTypeToString(label_type),
framework::DataTypeToString(VT::INT32)));
label_int32.Resize(label->dims());
label_int32.mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc org_label_desc(*label);
MLUCnnlTensorDesc label_int32_desc(label_int32);
cnnlCastDataType_t cast_type = GetCastDataType(label_type, VT::INT32);
MLUCnnl::Cast(ctx,
cast_type,
org_label_desc.get(),
GetBasePtr(label),
label_int32_desc.get(),
GetBasePtr(&label_int32));
} else {
label_int32.ShareDataWith(*label);
}
// equal
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
MLUCnnlTensorDesc label_int32_desc(label_int32);
phi::DenseTensor equal_tensor(framework::TransToPhiDataType(VT::BOOL));
equal_tensor.Resize(indices->dims());
equal_tensor.mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc equal_tensor_desc(equal_tensor);
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_EQ,
indices_int32_desc.get(),
GetBasePtr(&indices_int32),
label_int32_desc.get(),
GetBasePtr(&label_int32),
equal_tensor_desc.get(),
GetBasePtr(&equal_tensor));
// cast equal
phi::DenseTensor equal_fp32(framework::TransToPhiDataType(VT::FP32));
equal_fp32.Resize(indices->dims());
equal_fp32.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc equal_fp32_desc(equal_fp32);
cnnlCastDataType_t equal_cast_type = GetCastDataType(VT::BOOL, VT::FP32);
MLUCnnl::Cast(ctx,
equal_cast_type,
equal_tensor_desc.get(),
GetBasePtr(&equal_tensor),
equal_fp32_desc.get(),
GetBasePtr(&equal_fp32));
// [correct]
// reduce_max
phi::DenseTensor correct_max(framework::TransToPhiDataType(VT::FP32));
correct_max.Resize(phi::make_ddim({num_samples}));
correct_max.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc correct_max_desc(correct_max);
MLUCnnlReduceDesc reduce_max_desc({1},
CNNL_REDUCE_MAX,
ToCnnlDataType<float>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduce_max_desc.get(),
nullptr,
equal_fp32_desc.get(),
GetBasePtr(&equal_fp32),
0 /*indices_size*/,
nullptr,
nullptr,
correct_max_desc.get(),
GetBasePtr(&correct_max));
// reduce_sum
phi::DenseTensor correct_sum(framework::TransToPhiDataType(VT::FP32));
correct_sum.Resize(correct->dims());
correct_sum.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc correct_sum_desc(correct_sum);
MLUCnnlReduceDesc reduce_sum_desc({0},
CNNL_REDUCE_ADD,
ToCnnlDataType<float>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduce_sum_desc.get(),
nullptr,
correct_max_desc.get(),
GetBasePtr(&correct_max),
0 /*indices_size*/,
nullptr,
nullptr,
correct_sum_desc.get(),
GetBasePtr(&correct_sum));
// cast to int
correct->mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc correct_desc(*correct);
cnnlCastDataType_t correct_cast_type = GetCastDataType(VT::FP32, VT::INT32);
MLUCnnl::Cast(ctx,
correct_cast_type,
correct_sum_desc.get(),
GetBasePtr(&correct_sum),
correct_desc.get(),
GetBasePtr(correct));
// [total]
total->mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc total_desc(*total);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&num_samples,
total_desc.get(),
GetBasePtr(total));
// use `total` of type `float32` for calculating accuracy
phi::DenseTensor total_fp32(framework::TransToPhiDataType(VT::FP32));
total_fp32.Resize(total->dims());
total_fp32.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc total_fp32_desc(total_fp32);
float num_samples_fp32 = static_cast<float>(num_samples);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&num_samples_fp32,
total_fp32_desc.get(),
GetBasePtr(&total_fp32));
// [accuracy]
accuracy->mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc accuracy_desc(*accuracy);
MLUCnnl::Div(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
correct_sum_desc.get(),
GetBasePtr(&correct_sum),
total_fp32_desc.get(),
GetBasePtr(&total_fp32),
accuracy_desc.get(),
GetBasePtr(accuracy));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(accuracy,
ops::AccuracyMLUKernel<float>,
ops::AccuracyMLUKernel<paddle::platform::float16>,
ops::AccuracyMLUKernel<int16_t>,
ops::AccuracyMLUKernel<int64_t>,
ops::AccuracyMLUKernel<uint8_t>,
ops::AccuracyMLUKernel<int>);
if(WITH_MLU)
cc_library(
mlu_baseop
SRCS mlu_baseop.cc
DEPS neuware_lib device_context)
cc_test(
activation_op_mlu_test
SRCS activation_op_mlu_test.cc
DEPS op_registry activation_op scope device_context executor)
endif()
if(WITH_MLU)
set(MLU_INFO mlu_info)
endif()
cc_library(
mlu_tracer
SRCS mlu_tracer.cc cnpapi_data_process.cc
DEPS workqueue_utils enforce glog ${MLU_INFO})