remove mlu(#53007)

4d5a3ad6 · 张春乔 · GitHub · 240e13a2 · 4d5a3ad6 · 4d5a3ad6
37 changed file
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -62,7 +62,6 @@ function(op_library TARGET)
  set(hip_cc_srcs)
  set(xpu_cc_srcs)
  set(xpu_kp_cc_srcs)
-  set(mlu_cc_srcs)
  set(cudnn_cu_cc_srcs)
  set(miopen_cu_cc_srcs)
  set(cudnn_cu_srcs)
@@ -307,9 +306,8 @@ function(op_library TARGET)
    # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
    if(WITH_UNITY_BUILD AND op_library_UNITY)
      # Combine the cc source files.
-      compose_unity_target_sources(
+      compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs}
-        ${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs}
+                                   ${mkldnn_cc_srcs} ${xpu_cc_srcs})
-        ${mlu_cc_srcs})
      if(TARGET ${UNITY_TARGET})
        # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
        target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources})
@@ -325,7 +323,7 @@ function(op_library TARGET)
    else()
      cc_library(
        ${TARGET}
-        SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${mlu_cc_srcs}
+        SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs}
        DEPS ${op_library_DEPS} ${op_common_deps})
    endif()
  endif()
@@ -337,7 +335,6 @@ function(op_library TARGET)
  list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
  list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
  list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
-  list(LENGTH mlu_cc_srcs mlu_cc_srcs_len)
  # Define operators that don't need pybind here.
  foreach(
@@ -562,7 +559,6 @@ function(register_operators)
    "*_op.cc")
  string(REPLACE "_mkldnn" "" OPS "${OPS}")
  string(REPLACE "_xpu" "" OPS "${OPS}")
-  string(REPLACE "_mlu" "" OPS "${OPS}")
  string(REPLACE ".cc" "" OPS "${OPS}")
  list(REMOVE_DUPLICATES OPS)
  list(LENGTH register_operators_DEPS register_operators_DEPS_len)

--- a/paddle/fluid/eager/amp_auto_cast.h
+++ b/paddle/fluid/eager/amp_auto_cast.h
@@ -27,7 +27,6 @@ static inline bool NeedCast(const paddle::Tensor& tensor,
  if (paddle::platform::is_gpu_place(place) ||
      paddle::platform::is_cuda_pinned_place(place) ||
      paddle::platform::is_xpu_place(place) ||
-      paddle::platform::is_mlu_place(place) ||
      paddle::platform::is_npu_place(place) ||
      paddle::platform::is_npu_pinned_place(place) ||
      paddle::platform::is_custom_place(place)) {

--- a/paddle/fluid/eager/amp_utils.h
+++ b/paddle/fluid/eager/amp_utils.h
@@ -96,7 +96,6 @@ inline phi::DataType GetDtypeWithPlace(
      is_right_place = (paddle::platform::is_gpu_place(place) ||
                        paddle::platform::is_cuda_pinned_place(place) ||
                        paddle::platform::is_xpu_place(place) ||
-                        paddle::platform::is_mlu_place(place) ||
                        paddle::platform::is_npu_place(place) ||
                        paddle::platform::is_npu_pinned_place(place) ||
                        paddle::platform::is_custom_place(place));

--- a/paddle/fluid/eager/eager_amp_auto_cast.h
+++ b/paddle/fluid/eager/eager_amp_auto_cast.h
@@ -27,7 +27,6 @@ static inline bool NeedCast(const paddle::Tensor& tensor,
  if (paddle::platform::is_gpu_place(place) ||
      paddle::platform::is_cuda_pinned_place(place) ||
      paddle::platform::is_xpu_place(place) ||
-      paddle::platform::is_mlu_place(place) ||
      paddle::platform::is_npu_place(place) ||
      paddle::platform::is_npu_pinned_place(place) ||
      paddle::platform::is_custom_place(place) ||

--- a/paddle/fluid/framework/library_type.h
+++ b/paddle/fluid/framework/library_type.h
@@ -72,8 +72,6 @@ inline LibraryType StringToLibraryType(const char* ctype) {
    return LibraryType::kPlain;
  } else if (s == std::string("CUDA")) {
    return LibraryType::kPlain;
-  } else if (s == std::string("MLU")) {
-    return LibraryType::kPlain;
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Unknown LibraryType string (%s), only support library type string "

--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -276,7 +276,6 @@ void TensorCopyImpl(const TENSOR& src,
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  const platform::DeviceContext* dev_ctx;
  if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) ||
-      platform::is_mlu_place(dst_place) ||
      platform::is_custom_place(dst_place)) {
    dev_ctx = pool.Get(dst_place);
  } else {
@@ -615,7 +614,6 @@ void TensorFromStream(std::istream& is,
    size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
    if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
        platform::is_xpu_place(dev_ctx.GetPlace()) ||
-        platform::is_mlu_place(dev_ctx.GetPlace()) ||
        platform::is_npu_place(dev_ctx.GetPlace()) ||
        platform::is_custom_place(dev_ctx.GetPlace())) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
@@ -691,7 +689,6 @@ void TensorFromStream(std::istream& is,
    size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
    if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
        platform::is_xpu_place(dev_ctx.GetPlace()) ||
-        platform::is_mlu_place(dev_ctx.GetPlace()) ||
        platform::is_npu_place(dev_ctx.GetPlace()) ||
        platform::is_custom_place(dev_ctx.GetPlace())) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \

--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -166,5 +166,4 @@ cc_library(
       var_type_traits
       layer
       math_function
-       phi_tensor
+       phi_tensor)
-       ${MLU_DEPS})
--- a/paddle/fluid/imperative/amp_auto_cast.cc
+++ b/paddle/fluid/imperative/amp_auto_cast.cc
@@ -52,7 +52,6 @@ OpSupportedInfos(const std::string& place,
      {"CPU", &platform::is_cpu_place},
      {"XPU", &platform::is_xpu_place},
      {"NPU", &platform::is_npu_place},
-      {"MLU", &platform::is_mlu_place},
  };
  PADDLE_ENFORCE_NE(is_target_place.count(query_place),
                    0,
@@ -245,7 +244,6 @@ inline bool NeedCast(const std::shared_ptr<VarType>& var) {
  if (paddle::platform::is_gpu_place(place) ||
      paddle::platform::is_cuda_pinned_place(place) ||
      paddle::platform::is_xpu_place(place) ||
-      paddle::platform::is_mlu_place(place) ||
      paddle::platform::is_custom_place(place) ||
      paddle::platform::is_npu_place(place) ||
      paddle::platform::is_npu_pinned_place(place)) {

--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -66,9 +66,6 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
 #ifdef PADDLE_WITH_XPU_BKCL
 // TODO(liuyuhui) support xpu about div nranks in the future
 #endif
-  } else if (platform::is_mlu_place(tensor->place())) {
-    // TODO(zhangna)
-    VLOG(4) << "divnrank for mlu not support yet";
  }
 }

--- a/paddle/fluid/operators/collective/c_comm_init_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_op.cc
@@ -61,8 +61,7 @@ class CCommInitOp : public framework::OperatorBase {
 #endif
    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(place) || platform::is_xpu_place(place) ||
+        platform::is_gpu_place(place) || platform::is_xpu_place(place),
-            platform::is_mlu_place(place),
        true,
        platform::errors::PreconditionNotMet(
            "CCommInitOp can run on gpu or xpu or mlu place only."));

--- a/paddle/fluid/operators/generator/get_expected_kernel_func.cc
+++ b/paddle/fluid/operators/generator/get_expected_kernel_func.cc
@@ -76,7 +76,6 @@ phi::KernelKey GetReduceExpectedKernelType(
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx.GetPlace()) ||
            platform::is_npu_place(ctx.GetPlace()) ||
-            platform::is_mlu_place(ctx.GetPlace()) ||
            platform::is_xpu_place(ctx.GetPlace()) ||
            platform::is_custom_place(ctx.GetPlace()),
        true,

--- a/paddle/fluid/operators/interpolate_op.h
+++ b/paddle/fluid/operators/interpolate_op.h
@@ -41,8 +41,7 @@ inline std::vector<int> get_new_shape(
                          "The shape of dimension tensor should be [1],"
                          "but received d%.",
                          tensor->dims()));
-    if (platform::is_gpu_place(tensor->place()) ||
+    if (platform::is_gpu_place(tensor->place())) {
-        platform::is_mlu_place(tensor->place())) {
      phi::DenseTensor temp;
      paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
      vec_new_shape.push_back(static_cast<int32_t>(*temp.data<int32_t>()));
@@ -60,8 +59,7 @@ inline std::vector<T> get_new_data_from_tensor(
  std::vector<T> vec_new_data;
  auto* new_data = new_data_tensor->data<T>();
  phi::DenseTensor cpu_starts_tensor;
-  if (platform::is_gpu_place(new_data_tensor->place()) ||
+  if (platform::is_gpu_place(new_data_tensor->place())) {
-      platform::is_mlu_place(new_data_tensor->place())) {
    paddle::framework::TensorCopySync(
        *new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor);
    new_data = cpu_starts_tensor.data<T>();

--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -99,7 +99,6 @@ BufferedReader::BufferedReader(
  cpu_buffer_.resize(buffer_size);
  cuda_buffer_.resize(buffer_size);
  npu_buffer_.resize(buffer_size);
-  mlu_buffer_.resize(buffer_size);
  xpu_buffer_.resize(buffer_size);
  custom_device_buffer_.resize(buffer_size);
  ReadTillBufferFullAsync();
@@ -387,8 +386,6 @@ void BufferedReader::ReadNextImpl(paddle::framework::LoDTensorArray *out) {
    *out = std::move(cuda_buffer_[i]);
  } else if (platform::is_npu_place(place_)) {
    *out = std::move(npu_buffer_[i]);
-  } else if (platform::is_mlu_place(place_)) {
-    *out = std::move(mlu_buffer_[i]);
  } else if (platform::is_xpu_place(place_)) {
    *out = std::move(xpu_buffer_[i]);
  } else if (platform::is_custom_place(place_)) {

--- a/paddle/fluid/operators/reader/buffered_reader.h
+++ b/paddle/fluid/operators/reader/buffered_reader.h
@@ -78,7 +78,6 @@ class BufferedReader : public framework::DecoratedReader {
  std::vector<TensorVec> cpu_buffer_;
  std::vector<TensorVec> cuda_buffer_;
  std::vector<TensorVec> npu_buffer_;
-  std::vector<TensorVec> mlu_buffer_;
  std::vector<TensorVec> xpu_buffer_;
  std::vector<TensorVec> custom_device_buffer_;
  size_t prev_pos_{-1UL};

--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -625,7 +625,6 @@ class ReduceBaseOp : public framework::OperatorWithKernel {
      PADDLE_ENFORCE_EQ(
          platform::is_gpu_place(ctx.GetPlace()) ||
              platform::is_npu_place(ctx.GetPlace()) ||
-              platform::is_mlu_place(ctx.GetPlace()) ||
              platform::is_xpu_place(ctx.GetPlace()) ||
              platform::is_custom_place(ctx.GetPlace()),
          true,

--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -45,7 +45,6 @@ class SoftmaxOp : public framework::OperatorWithKernel {
          platform::is_gpu_place(ctx.GetPlace()) ||
              platform::is_npu_place(ctx.GetPlace()) ||
              platform::is_xpu_place(ctx.GetPlace()) ||
-              platform::is_mlu_place(ctx.GetPlace()) ||
              platform::is_custom_place(ctx.GetPlace()),
          true,
          platform::errors::InvalidArgument(
@@ -131,7 +130,6 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
      if (!(platform::is_gpu_place(ctx.GetPlace()) ||
            platform::is_npu_place(ctx.GetPlace()) ||
            platform::is_xpu_place(ctx.GetPlace()) ||
-            platform::is_mlu_place(ctx.GetPlace()) ||
            platform::is_custom_place(ctx.GetPlace())))
        PADDLE_THROW(platform::errors::InvalidArgument(
            "float16 can only be used on GPU/NPU/XPU/MLU and custom place"));

--- a/paddle/fluid/operators/tile_op_functor.h
+++ b/paddle/fluid/operators/tile_op_functor.h
@@ -30,7 +30,6 @@ inline std::vector<int> get_repeat_times(
    phi::DenseTensor cpu_repeat_tensor;
    if (platform::is_gpu_place(repeat_tensor->place()) ||
        platform::is_xpu_place(repeat_tensor->place()) ||
-        platform::is_mlu_place(repeat_tensor->place()) ||
        platform::is_npu_place(repeat_tensor->place())) {
      paddle::framework::TensorCopySync(
          *repeat_tensor, platform::CPUPlace(), &cpu_repeat_tensor);
@@ -50,7 +49,6 @@ inline std::vector<int> get_repeat_times(
      auto tensor = list_repeat_times_tensor[i];
      if (platform::is_gpu_place(tensor->place()) ||
          platform::is_xpu_place(tensor->place()) ||
-          platform::is_mlu_place(tensor->place()) ||
          platform::is_npu_place(tensor->place())) {
        phi::DenseTensor temp;
        paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -129,7 +129,6 @@ cc_library(
       dlpack
       cudnn_workspace_helper
       ${XPU_CTX_DEPS}
-       ${MLU_CTX_DEPS}
       phi_backends
       phi_device_context
       generator

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -47,8 +47,6 @@ DeviceType Place2DeviceType(const platform::Place& place) {
    return platform::DeviceType::IPU;
  } else if (platform::is_npu_place(place)) {
    return platform::DeviceType::NPU;
-  } else if (platform::is_mlu_place(place)) {
-    return platform::DeviceType::MLU;
  } else if (platform::is_custom_place(place)) {
    return platform::DeviceType::CUSTOM_DEVICE;
  } else {

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -109,7 +109,6 @@ constexpr DeviceType kCUDA = DeviceType::CUDA;
 constexpr DeviceType kXPU = DeviceType::XPU;
 constexpr DeviceType kNPU = DeviceType::NPU;
 constexpr DeviceType kIPU = DeviceType::IPU;
-constexpr DeviceType kMLU = DeviceType::MLU;
 constexpr DeviceType kCUSTOM_DEVICE = DeviceType::CUSTOM_DEVICE;
 using DeviceContext = phi::DeviceContext;

--- a/paddle/fluid/platform/monitor.cc
+++ b/paddle/fluid/platform/monitor.cc
@@ -46,21 +46,3 @@ DEFINE_INT_STATUS(STAT_npu4_mem_size)
 DEFINE_INT_STATUS(STAT_npu5_mem_size)
 DEFINE_INT_STATUS(STAT_npu6_mem_size)
 DEFINE_INT_STATUS(STAT_npu7_mem_size)
-// For MLU
-DEFINE_INT_STATUS(STAT_mlu0_mem_size)
-DEFINE_INT_STATUS(STAT_mlu1_mem_size)
-DEFINE_INT_STATUS(STAT_mlu2_mem_size)
-DEFINE_INT_STATUS(STAT_mlu3_mem_size)
-DEFINE_INT_STATUS(STAT_mlu4_mem_size)
-DEFINE_INT_STATUS(STAT_mlu5_mem_size)
-DEFINE_INT_STATUS(STAT_mlu6_mem_size)
-DEFINE_INT_STATUS(STAT_mlu7_mem_size)
-DEFINE_INT_STATUS(STAT_mlu8_mem_size)
-DEFINE_INT_STATUS(STAT_mlu9_mem_size)
-DEFINE_INT_STATUS(STAT_mlu10_mem_size)
-DEFINE_INT_STATUS(STAT_mlu11_mem_size)
-DEFINE_INT_STATUS(STAT_mlu12_mem_size)
-DEFINE_INT_STATUS(STAT_mlu13_mem_size)
-DEFINE_INT_STATUS(STAT_mlu14_mem_size)
-DEFINE_INT_STATUS(STAT_mlu15_mem_size)
--- a/paddle/fluid/platform/monitor.h
+++ b/paddle/fluid/platform/monitor.h
@@ -200,21 +200,3 @@ class StatRegistry {
  USE_INT_STAT(STAT_npu5_mem_size); \
  USE_INT_STAT(STAT_npu6_mem_size); \
  USE_INT_STAT(STAT_npu7_mem_size)
-#define USE_MLU_MEM_STAT             \
-  USE_INT_STAT(STAT_mlu0_mem_size);  \
-  USE_INT_STAT(STAT_mlu1_mem_size);  \
-  USE_INT_STAT(STAT_mlu2_mem_size);  \
-  USE_INT_STAT(STAT_mlu3_mem_size);  \
-  USE_INT_STAT(STAT_mlu4_mem_size);  \
-  USE_INT_STAT(STAT_mlu5_mem_size);  \
-  USE_INT_STAT(STAT_mlu6_mem_size);  \
-  USE_INT_STAT(STAT_mlu7_mem_size);  \
-  USE_INT_STAT(STAT_mlu8_mem_size);  \
-  USE_INT_STAT(STAT_mlu9_mem_size);  \
-  USE_INT_STAT(STAT_mlu10_mem_size); \
-  USE_INT_STAT(STAT_mlu11_mem_size); \
-  USE_INT_STAT(STAT_mlu12_mem_size); \
-  USE_INT_STAT(STAT_mlu13_mem_size); \
-  USE_INT_STAT(STAT_mlu14_mem_size); \
-  USE_INT_STAT(STAT_mlu15_mem_size)
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -33,10 +33,6 @@ bool is_xpu_place(const Place &p) {
  return p.GetType() == phi::AllocationType::XPU;
 }
-bool is_mlu_place(const Place &p) {
-  return p.GetType() == phi::AllocationType::MLU;
-}
 bool is_npu_place(const Place &p) {
  return p.GetType() == phi::AllocationType::NPU;
 }
@@ -77,8 +73,6 @@ bool is_same_place(const Place &p1, const Place &p2) {
      return true;
    } else if (is_xpu_place(p1)) {
      return p1 == p2;
-    } else if (is_mlu_place(p1)) {
-      return p1 == p2;
    } else if (is_npu_place(p1)) {
      return p1 == p2;
    } else if (is_ipu_place(p1)) {

--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
@@ -48,7 +48,6 @@ class PlaceHelper {
 bool is_gpu_place(const Place &);
 bool is_xpu_place(const Place &);
 bool is_npu_place(const Place &);
-bool is_mlu_place(const Place &);
 bool is_ipu_place(const Place &);
 bool is_cpu_place(const Place &);
 bool is_cuda_pinned_place(const Place &);

--- a/paddle/fluid/platform/profiler/chrometracing_logger.cc
+++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc
@@ -257,7 +257,6 @@ void ChromeTracingLogger::LogHostTraceEventNode(
    case TracerEventType::UserDefined:
    case TracerEventType::OperatorInner:
    case TracerEventType::Communication:
-    case TracerEventType::MluRuntime:
    case TracerEventType::NumTypes:
    default:
      output_file_stream_ << string_format(

--- a/paddle/fluid/platform/profiler/profiler.h
+++ b/paddle/fluid/platform/profiler/profiler.h
@@ -34,7 +34,6 @@ namespace platform {
 static constexpr uint32_t kProfileCPUOptionBit = 0;
 static constexpr uint32_t kProfileGPUOptionBit = 1;
-static constexpr uint32_t kProfileMLUOptionBit = 2;
 static constexpr uint32_t kProfileCustomDeviceOptionBit = 3;
 void SynchronizeDevice();

--- a/paddle/fluid/platform/profiler/utils.cc
+++ b/paddle/fluid/platform/profiler/utils.cc
@@ -165,8 +165,7 @@ const char* StringTracerEventType(TracerEventType type) {
                                         "Optimization",
                                         "Communication",
                                         "PythonOp",
-                                         "PythonUserDefined",
+                                         "PythonUserDefined"};
-                                         "MluRuntime"};
  return categary_name_[static_cast<int>(type)];
 }

--- a/paddle/fluid/pybind/place.cc
+++ b/paddle/fluid/pybind/place.cc
@@ -656,8 +656,6 @@ void BindPlace(pybind11::module &m) {  // NOLINT
           [](platform::Place &self) {
             return platform::is_cuda_pinned_place(self);
           })
-      .def("is_mlu_place",
-           [](platform::Place &self) { return platform::is_mlu_place(self); })
      .def(
          "is_custom_place",
          [](platform::Place &self) { return platform::is_custom_place(self); })
@@ -665,7 +663,6 @@ void BindPlace(pybind11::module &m) {  // NOLINT
      .def("xpu_device_id", [](platform::Place &self) { return self.device; })
      .def("npu_device_id", [](platform::Place &self) { return self.device; })
      .def("ipu_device_id", [](platform::Place &self) { return self.device; })
-      .def("mlu_device_id", [](platform::Place &self) { return self.device; })
      .def("custom_device_id",
           [](platform::Place &self) { return self.device; })
      .def("set_place",

--- a/paddle/fluid/pybind/tensor.cc
+++ b/paddle/fluid/pybind/tensor.cc
@@ -401,7 +401,7 @@ void BindTensor(pybind11::module &m) {  // NOLINT
        Args:
          lod (numpy.ndarray): The data to set.
-          place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace|MLUPlace): The place where the
+          place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace): The place where the
          Tensor is to be set.
          zero_copy (bool, optional): Whether to share memory with the input numpy array.
          This parameter only works with CPUPlace. Default: False.

--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -961,7 +961,6 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor,
  bool is_gpu_tensor = platform::is_gpu_place(tensor.place());
  bool is_xpu_tensor = platform::is_xpu_place(tensor.place());
  bool is_npu_tensor = platform::is_npu_place(tensor.place());
-  bool is_mlu_tensor = platform::is_mlu_place(tensor.place());
  bool is_custom_device_tensor = platform::is_custom_place(tensor.place());
  const auto &tensor_dims = tensor.dims();
  auto tensor_dtype = framework::TransToProtoVarType(tensor.dtype());
@@ -982,7 +981,7 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor,
  std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(
      framework::TransToProtoVarType(tensor.dtype()));
-  if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor && !is_mlu_tensor &&
+  if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor &&
      !is_custom_device_tensor) {
    if (!need_deep_copy) {
      auto base = py::cast(std::move(tensor));

--- a/paddle/phi/common/backend.h
+++ b/paddle/phi/common/backend.h
@@ -95,9 +95,6 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
    case Backend::NPU:
      os << "NPU";
      break;
-    case Backend::MLU:
-      os << "MLU";
-      break;
    case Backend::ONEDNN:
      os << "ONEDNN";
      break;
@@ -143,8 +140,6 @@ inline Backend StringToBackend(const char* backend_cstr) {
    return Backend::XPU;
  } else if (s == std::string("NPU")) {
    return Backend::NPU;
-  } else if (s == std::string("MLU")) {
-    return Backend::MLU;
  } else if (s == std::string("OneDNN")) {
    return Backend::ONEDNN;
  } else if (s == std::string("GPUDNN")) {
@@ -181,8 +176,6 @@ inline std::string BackendToString(const Backend& backend) {
      return "XPU";
    case Backend::NPU:
      return "NPU";
-    case Backend::MLU:
-      return "MLU";
    case Backend::ONEDNN:
      return "ONEDNN";
    case Backend::GPUDNN:

--- a/paddle/phi/common/place.cc
+++ b/paddle/phi/common/place.cc
@@ -41,8 +41,6 @@ const char *AllocationTypeStr(AllocationType type) {
      return "npu_pinned";
    case AllocationType::IPU:
      return "ipu";
-    case AllocationType::MLU:
-      return "mlu";
    default:
      PD_THROW("Invalid phi device type.");
      return {};

--- a/paddle/phi/core/compat/convert_utils.cc
+++ b/paddle/phi/core/compat/convert_utils.cc
@@ -41,8 +41,6 @@ Backend TransToPhiBackend(const phi::Place& place) {
      return Backend::NPU;
    case AllocationType::IPU:
      return Backend::IPU;
-    case AllocationType::MLU:
-      return Backend::MLU;
    case AllocationType::CUSTOM:
      return static_cast<Backend>(
          static_cast<size_t>(Backend::NUM_BACKENDS) +

--- a/python/paddle/distributed/spawn.py
+++ b/python/paddle/distributed/spawn.py
@@ -108,8 +108,6 @@ def _get_default_nprocs():
        return core.get_cuda_device_count()
    elif 'xpu' in device:
        return core.get_xpu_device_count()
-    elif 'mlu' in device:
-        return core.get_custom_device_count('mlu')
    elif 'cpu' in device:
        return multiprocessing.cpu_count()
    else:

--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -2734,9 +2734,7 @@ def cross_entropy(
            valid_label = (
                paddle.cast(label != ignore_index, dtype=label.dtype) * label
            )
-        if core.is_compiled_with_custom_device(
+        if core.is_compiled_with_custom_device("npu"):
-            "npu"
-        ) or core.is_compiled_with_custom_device("mlu"):
            if not soft_label:
                _, out = _legacy_C_ops.softmax_with_cross_entropy(
                    input,

--- a/python/paddle/profiler/profiler.py
+++ b/python/paddle/profiler/profiler.py
@@ -499,8 +499,6 @@ class Profiler:
            profileoption.trace_switch |= 1
        if ProfilerTarget.GPU in self.targets:
            profileoption.trace_switch |= 1 << 1
-        if ProfilerTarget.MLU in self.targets:
-            profileoption.trace_switch |= 1 << 2
        if ProfilerTarget.CUSTOM_DEVICE in self.targets:
            profileoption.trace_switch |= 1 << 3
            if not custom_device_types:

--- a/tools/get_pr_ut.py
+++ b/tools/get_pr_ut.py
@@ -343,7 +343,6 @@ class PRChecker:
                elif (
                    ('/xpu/' in filename.lower())
                    or ('/npu/' in filename.lower())
-                    or ('/mlu/' in filename.lower())
                    or ('/ipu/' in filename.lower())
                ):
                    filterFiles.append(filename)