Unverified · Commit 4b28f4ff authored by Kim Yann, committed by GitHub

rem is_compiled_with_mlu (#52378)

* rem is_compiled_with_mlu

* fix some mlu_place and mlu_device_count

* make lint happy
Parent a725c9a5
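
This commit drops Paddle's dedicated MLU entry points (`is_compiled_with_mlu`, `MLUPlace`, `mlu_places`, `get_mlu_device_count`, the `--mlus` launch flags) and leaves Cambricon MLU to the custom-device plugin path that the remaining code falls back to (`get_all_custom_device_type()`, `get_custom_device_count('mlu')`). Below is a minimal, non-authoritative sketch of how a script could probe for MLU after this change, assuming an MLU plugin registers itself under the custom device type `'mlu'`:

```python
# Sketch only: assumes a Cambricon MLU plugin is installed and registers the
# custom device type 'mlu'; without it the check simply falls back to CPU.
import paddle


def mlu_available() -> bool:
    # get_all_custom_device_type() is the API the updated code relies on
    # instead of the removed is_compiled_with_mlu().
    return 'mlu' in (paddle.device.get_all_custom_device_type() or [])


if mlu_available():
    # custom devices keep the usual 'type:index' string form
    paddle.device.set_device('mlu:0')
else:
    paddle.device.set_device('cpu')

print(paddle.device.get_device())
```
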
......@@ -11,7 +11,6 @@ exclude =
# Exclude files that will be removed in the future, see more at
# https://github.com/PaddlePaddle/Paddle/pull/46782#issuecomment-1273033731
./python/paddle/fluid/tests/unittests/npu/**,
./python/paddle/fluid/tests/unittests/mlu/**
ignore =
# Whitespace before ‘,’, ‘;’, or ‘:’, it is not compatible with black
E203,
......
......@@ -5,8 +5,7 @@ exclude: |
paddle/fluid/framework/fleet/heter_ps/cudf/.+|
paddle/fluid/distributed/ps/thirdparty/round_robin.h|
python/paddle/utils/gast/.+|
python/paddle/fluid/tests/unittests/npu/.+|
python/paddle/fluid/tests/unittests/mlu/.+
python/paddle/fluid/tests/unittests/npu/.+
)$
repos:
# Common hooks
......
......@@ -175,10 +175,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h"
#endif
......@@ -335,14 +331,6 @@ bool IsCompiledWithCINN() {
#endif
}
bool IsCompiledWithMLU() {
#ifndef PADDLE_WITH_MLU
return false;
#else
return true;
#endif
}
bool IsCompiledWithHETERPS() {
#ifndef PADDLE_WITH_HETERPS
return false;
......@@ -1612,18 +1600,6 @@ All parameter, weight, gradient are variables in Paddle.
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
return context;
#endif
})
.def_static(
"create",
[](paddle::platform::MLUPlace &place)
-> paddle::platform::DeviceContext * {
#ifndef PADDLE_WITH_MLU
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use MLUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with MLU support."));
#else
return new paddle::platform::MLUDeviceContext(place);
#endif
})
.def_static(
......@@ -1828,13 +1804,6 @@ All parameter, weight, gradient are variables in Paddle.
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self,
const Scope &scope,
const platform::MLUPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self,
const Scope &scope,
......@@ -2041,7 +2010,6 @@ All parameter, weight, gradient are variables in Paddle.
m.def("is_compiled_with_mpi", IsCompiledWithMPI);
m.def("is_compiled_with_mpi_aware", IsCompiledWithMPIAWARE);
m.def("is_compiled_with_cinn", IsCompiledWithCINN);
m.def("is_compiled_with_mlu", IsCompiledWithMLU);
m.def("_is_compiled_with_heterps", IsCompiledWithHETERPS);
m.def("supports_bfloat16", SupportsBfloat16);
m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance);
......@@ -2407,10 +2375,6 @@ All parameter, weight, gradient are variables in Paddle.
m.def("get_ipu_device_count", platform::GetIPUDeviceCount);
#endif
#ifdef PADDLE_WITH_MLU
m.def("get_mlu_device_count", platform::GetMLUDeviceCount);
#endif
py::enum_<platform::TracerOption>(m, "TracerOption", py::arithmetic())
.value("kDefault", platform::TracerOption::kDefault)
.value("kOpDetail", platform::TracerOption::kOpDetail)
......
......@@ -15,7 +15,6 @@ extend_skip_glob = [
"python/paddle/fluid/tra**",
"python/paddle/utils/gast/**",
"python/paddle/fluid/tests/unittests/npu/**",
"python/paddle/fluid/tests/unittests/mlu/**",
]
[tool.ruff]
......@@ -25,7 +24,6 @@ exclude = [
"./python/paddle/fluid/tra**",
"./python/paddle/utils/gast/**",
"./python/paddle/fluid/tests/unittests/npu/**",
"./python/paddle/fluid/tests/unittests/mlu/**",
]
target-version = "py37"
select = [
......
......@@ -4,8 +4,6 @@ set(PY_FILES paddle/__init__.py ${UTILS_PY_FILES} ${FLUID_PY_FILES})
if(WITH_GPU)
set(PACKAGE_NAME "paddlepaddle-gpu")
elseif(WITH_MLU)
set(PACKAGE_NAME "paddlepaddle-mlu")
elseif(WITH_ROCM)
set(PACKAGE_NAME "paddlepaddle-rocm")
elseif(WITH_ASCEND_CL)
......
......@@ -336,7 +336,6 @@ from .framework import IPUPlace # noqa: F401
from .framework import CUDAPlace # noqa: F401
from .framework import NPUPlace # noqa: F401
from .framework import CUDAPinnedPlace # noqa: F401
from .framework import MLUPlace # noqa: F401
from .framework import CustomPlace # noqa: F401
from .autograd import grad # noqa: F401
......@@ -366,7 +365,6 @@ from .device import get_device # noqa: F401
from .device import is_compiled_with_xpu # noqa: F401
from .device import is_compiled_with_npu # noqa: F401
from .device import is_compiled_with_ipu # noqa: F401
from .device import is_compiled_with_mlu # noqa: F401
from .device import is_compiled_with_cinn # noqa: F401
from .device import is_compiled_with_cuda # noqa: F401
from .device import is_compiled_with_rocm # noqa: F401
......
......@@ -339,12 +339,11 @@ def amp_guard(
)
# check device_type:
# NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16, mlu for float16, npu for float16.
# NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16, npu for float16.
# Maybe we will support cpu for bfloat16.
if enable and not (
tracer._expected_place.is_gpu_place()
or tracer._expected_place.is_xpu_place()
or tracer._expected_place.is_mlu_place()
or tracer._expected_place.is_npu_place()
or tracer._expected_place.is_custom_place()
):
......@@ -361,10 +360,6 @@ def amp_guard(
if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'):
warnings.warn('XPUPlace only support float16 amp.')
enable = False
# For mlu:
if tracer._expected_place.is_mlu_place() and (dtype == 'bfloat16'):
warnings.warn('MLUPlace only support float16 amp.')
enable = False
# For custom device:
if tracer._expected_place.is_custom_place() and (dtype == 'bfloat16'):
warnings.warn('CustomPlace only support float16 amp.')
......
......@@ -105,7 +105,6 @@ class AmpScaler:
if enable and not (
tracer._expected_place.is_gpu_place()
or tracer._expected_place.is_xpu_place()
or tracer._expected_place.is_mlu_place()
or tracer._expected_place.is_npu_place()
or tracer._expected_place.is_custom_place()
):
......
......@@ -31,14 +31,12 @@ __all__ = [ # noqa
'get_device',
'XPUPlace',
'IPUPlace',
'MLUPlace',
'is_compiled_with_xpu',
'is_compiled_with_ipu',
'is_compiled_with_cinn',
'is_compiled_with_cuda',
'is_compiled_with_rocm',
'is_compiled_with_npu',
'is_compiled_with_mlu',
'is_compiled_with_custom_device',
'get_all_device_type',
'get_all_custom_device_type',
......@@ -154,41 +152,6 @@ def XPUPlace(dev_id):
return core.XPUPlace(dev_id)
def is_compiled_with_mlu():
"""
Whether paddle was built with WITH_MLU=ON to support Cambricon MLU
Returns (bool): whether paddle was built with WITH_MLU=ON
Examples:
.. code-block:: python
# required: mlu
import paddle
support_mlu = paddle.device.is_compiled_with_mlu()
"""
return core.is_compiled_with_mlu()
def MLUPlace(dev_id):
"""
Return a Cambricon MLU Place
Parameters:
dev_id(int): MLU device id
Examples:
.. code-block:: python
# required: mlu
import paddle
place = paddle.device.MLUPlace(0)
"""
return core.MLUPlace(dev_id)
def get_cudnn_version():
"""
This funciton return the version of cudnn. the retuen value is int which represents the
......@@ -263,20 +226,10 @@ def _convert_to_place(device):
"since PaddlePaddle is not compiled with IPU"
)
place = core.IPUPlace()
elif lower_device == 'mlu':
if not core.is_compiled_with_mlu():
raise ValueError(
"The device should not be 'mlu', "
"since PaddlePaddle is not compiled with MLU"
)
selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",")
device_id = int(selected_mlus[0])
place = core.MLUPlace(device_id)
else:
avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
avaliable_npu_device = re.match(r'npu:\d+', lower_device)
avaliable_mlu_device = re.match(r'mlu:\d+', lower_device)
if avaliable_gpu_device:
if not core.is_compiled_with_cuda():
raise ValueError(
......@@ -317,21 +270,10 @@ def _convert_to_place(device):
device_id = device_info_list[1]
device_id = int(device_id)
place = core.NPUPlace(device_id)
if avaliable_mlu_device:
if not core.is_compiled_with_mlu():
raise ValueError(
"The device should not be {}, since PaddlePaddle is "
"not compiled with mlu".format(avaliable_mlu_device)
)
device_info_list = device.split(':', 1)
device_id = device_info_list[1]
device_id = int(device_id)
place = core.MLUPlace(device_id)
if (
not avaliable_gpu_device
and not avaliable_xpu_device
and not avaliable_npu_device
and not avaliable_mlu_device
):
device_info_list = device.split(':', 1)
device_type = device_info_list[0]
......@@ -344,7 +286,7 @@ def _convert_to_place(device):
"The device must be a string which is like 'cpu', {}".format(
', '.join(
f"'{x}', '{x}:x'"
for x in ['gpu', 'xpu', 'npu', 'mlu']
for x in ['gpu', 'xpu', 'npu']
+ core.get_all_custom_device_type()
)
)
......@@ -354,14 +296,14 @@ def _convert_to_place(device):
def set_device(device):
"""
Paddle supports running calculations on various types of devices, including CPU, GPU, XPU, NPU, MLU and IPU.
Paddle supports running calculations on various types of devices, including CPU, GPU, XPU, NPU and IPU.
They are represented by string identifiers. This function can specify the global device
which the OP will run.
Parameters:
device(str): This parameter determines the specific running device.
It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``mlu``, ``gpu:x``, ``xpu:x``, ``npu:x``, ``mlu:x`` and ``ipu``,
where ``x`` is the index of the GPUs, XPUs, NPUs or MLUs.
It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x``, ``npu:x`` and ``ipu``,
where ``x`` is the index of the GPUs, XPUs or NPUs.
Examples:
......@@ -382,7 +324,7 @@ def set_device(device):
def get_device():
"""
This funciton can get the current global device of the program is running.
It's a string which is like 'cpu', 'gpu:x', 'xpu:x', 'mlu:x' and 'npu:x'. if the global device is not
It's a string which is like 'cpu', 'gpu:x', 'xpu:x' and 'npu:x'. if the global device is not
set, it will return a string which is 'gpu:x' when cuda is avaliable or it
will return a string which is 'cpu' when cuda is not avaliable.
......@@ -410,9 +352,7 @@ def get_device():
elif isinstance(place, core.IPUPlace):
num_devices = core.get_ipu_device_count()
device = f"ipus:{{0-{num_devices - 1}}}"
elif isinstance(place, core.MLUPlace):
device_id = place.get_device_id()
device = 'mlu:' + str(device_id)
device = f"ipus:{{0-{num_devices - 1}}}"
elif isinstance(place, core.CustomPlace):
device_id = place.get_device_id()
device_type = place.get_device_type()
......@@ -529,7 +469,7 @@ class Event:
Parameters:
device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runn on. If device is None, the device is the current device. Default: None.
It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec,
where ``x`` is the index of the GPUs, XPUs, NPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
enable_timing (bool, optional): indicates if the event should measure time, default is False
blocking (bool, optional): if True, ``wait`` will be blocking, default is False
interprocess (bool): if True, the event can be shared between processes, default is False
......@@ -674,7 +614,7 @@ class Stream:
Parameters:
device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runn on. If device is None, the device is the current device. Default: None.
It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec,
where ``x`` is the index of the GPUs, XPUs, NPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
priority(int, optional): priority of the CUDA stream. Can be either
1 (high priority) or 2 (low priority). By default, streams have
priority 2.
......@@ -996,7 +936,7 @@ def synchronize(device=None):
Parameters:
device(str|paddle.CUDAPlace(n)|paddle.XPUPlace(n)|paddle.CustomPlace(n)): The device which want to wait for. If device is None, the device is the current device. Default: None.
It can be ``gpu``, ``gpu:x``, ``xpu``, ``xpu:x``, ``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec,
where ``x`` is the index of the GPUs, XPUs, NPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n).
where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n).
Examples:
.. code-block:: python
# required: custom_device
......
......@@ -293,11 +293,6 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout):
core.HCCLParallelContext(strategy, place).init_with_ring_id(
ring_id
)
elif core.is_compiled_with_mlu():
place = core.MLUPlace(genv.device_id)
core.CNCLParallelContext(strategy, place).init_with_ring_id(
ring_id
)
elif core.is_compiled_with_xpu():
place = core.XPUPlace(genv.device_id)
core.BKCLParallelContext(strategy, place).init_with_ring_id(
......
......@@ -165,16 +165,6 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
)
base_group.add_argument("--selected_npus", dest="npus")
if framework.core.is_compiled_with_mlu():
base_group.add_argument(
"--mlus",
type=str,
default=None,
help="It's for mlu training. For example: "
"--mlus=\"0,1,2,3\" will launch four training processes each bound to one mlu.",
)
base_group.add_argument("--selected_mlus", dest="mlus")
base_group.add_argument(
"training_script",
type=str,
......@@ -507,8 +497,6 @@ def infer_backend(args):
args.backend = 'unknown'
elif framework.core.is_compiled_with_xpu():
args.backend = 'bkcl'
elif framework.core.is_compiled_with_mlu():
args.backend = 'cncl'
else:
args.backend = 'gloo'
......@@ -561,8 +549,6 @@ def which_distributed_mode(args):
accelerators = framework.core.get_npu_device_count()
elif framework.core.is_compiled_with_xpu():
accelerators = framework.core.get_xpu_device_count()
elif framework.core.is_compiled_with_mlu():
accelerators = framework.core.get_mlu_device_count()
else:
accelerators = 0
......@@ -589,11 +575,10 @@ def which_distributed_mode(args):
if (
not framework.core.is_compiled_with_cuda()
and not framework.core.is_compiled_with_xpu()
and not framework.core.is_compiled_with_mlu()
):
if args.servers:
logger.warning(
"Not found distinct arguments and not compiled with cuda or xpu or npu or mlu. "
"Not found distinct arguments and not compiled with cuda or xpu or npu. "
"But found args.servers not empty, default use ps mode"
)
return DistributeMode.PS
......@@ -601,7 +586,7 @@ def which_distributed_mode(args):
return DistributeMode.COLLECTIVE
else:
logger.warning(
"Not found distinct arguments and compiled with cuda or xpu or npu or mlu. "
"Not found distinct arguments and compiled with cuda or xpu or npu. "
"Default use collective mode"
)
return DistributeMode.COLLECTIVE
......@@ -638,10 +623,6 @@ def launch():
- ``--selected_xpus``: xpus aliases, recommend to use ``--xpus``.
- ``--mlus``: It's for mlu training. e.g., ``--mlus=0,1,2,3`` will launch four training processes each bound to one mlu.
- ``--selected_mlus``: mlus aliases, recommend to use ``--mlus``.
- ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py``
- ``training_script_args``: The args of training_script. e.g., ``--lr=0.1``
......
......@@ -57,7 +57,6 @@ class DeviceMode:
XPU = 2
ASCEND_NPU = 3
UNKNOWN = 3
MLU = 4
class Cluster:
......@@ -303,7 +302,6 @@ def get_cluster(
if (
device_mode == DeviceMode.GPU
or device_mode == DeviceMode.ASCEND_NPU
or device_mode == DeviceMode.MLU
):
if isinstance(devices_per_proc[i], (list, tuple)):
trainer.accelerators.extend(devices_per_proc[i])
......@@ -554,10 +552,6 @@ def start_local_trainers(
proc_env["FLAGS_selected_npus"] = "%s" % ",".join(
[str(g) for g in t.accelerators]
)
elif len(t.accelerators) > 0 and pod.device_mode == DeviceMode.MLU:
proc_env["FLAGS_selected_mlus"] = "%s" % ",".join(
[str(g) for g in t.accelerators]
)
if len(t.accelerators) > 0:
proc_env["FLAGS_selected_accelerators"] = "%s" % ",".join(
......@@ -800,42 +794,6 @@ def get_npus(npus):
return res_npus
def get_mlus(mlus):
if mlus is None:
mlus_num = framework.core.get_mlu_device_count()
res_mlus = [str(x) for x in range(0, mlus_num)]
else:
mlu_visible_devices = os.getenv("MLU_VISIBLE_DEVICES")
if mlu_visible_devices is None or mlu_visible_devices == "":
res_mlus = [x.strip() for x in mlus.split(',')]
else:
# change mlus into relative values
# e.g. MLU_VISIBLE_DEVICES=4,5,6,7; args.mlus=4,5,6,7;
# therefore mlus=0,1,2,3
mlu_visible_devices_list = mlu_visible_devices.split(',')
for x in mlus.split(','):
assert x in mlu_visible_devices_list, (
"Can't find "
"your mlus {} in MLU_VISIBLE_DEVICES[{}].".format(
x,
mlu_visible_devices,
)
)
res_mlus = [
mlu_visible_devices_list.index(x.strip())
for x in mlus.split(',')
]
logger.info(
"Change selected_mlus into reletive values. --ips:{} "
"will change into relative_ips:{} according to your "
"MLU_VISIBLE_DEVICES:{}".format(
mlus, res_mlus, mlu_visible_devices_list
)
)
return res_mlus
def get_device_mode(backend):
if backend == 'heter':
if (
......@@ -869,10 +827,6 @@ def get_device_mode(backend):
print("launch train in XPU mode")
return DeviceMode.XPU
if backend == 'cncl' and framework.core.get_mlu_device_count() > 0:
print("launch train in MLU mode")
return DeviceMode.MLU
if backend == 'gloo':
print("launch train in CPU mode")
return DeviceMode.CPU
......@@ -925,19 +879,6 @@ def get_device_proc_info(args):
devices_per_proc = [xpus[i : i + n] for i in range(0, len(xpus), n)]
else:
devices_per_proc = xpus
elif device_mode == DeviceMode.MLU:
mlus = get_mlus(args.mlus)
if args.nproc_per_node is not None:
assert (
len(mlus) % int(args.nproc_per_node)
) == 0, "mlus' number:{} mod args.nproc_per_node:{} must == 0".format(
len(mlus), args.nproc_per_node
)
n = int(len(mlus) / int(args.nproc_per_node))
devices_per_proc = [mlus[i : i + n] for i in range(0, len(mlus), n)]
else:
devices_per_proc = mlus
elif device_mode == DeviceMode.CPU:
if hasattr(args, "paddle_cpuonly") and args.nproc_per_node is None:
# NOTE (xiongkun03) set it to cpu core number
......@@ -2144,12 +2085,6 @@ def check_backend(backend):
"your paddle is not compiled with npu but you assign 'hccl' as backend."
)
if backend == 'cncl' and not framework.core.is_compiled_with_mlu():
raise ValueError(
"paddle.distributed initialize error, "
"your paddle is not compiled with mlu but you assign 'cncl' as backend."
)
def block_windows_and_macos(backend):
if backend != 'gloo':
......@@ -2174,7 +2109,4 @@ def get_backend_by_compile_flag():
if framework.core.is_compiled_with_npu():
return 'hccl'
if framework.core.is_compiled_with_mlu():
return 'cncl'
return 'gloo'
......@@ -25,7 +25,6 @@ class DeviceType:
GPU = 'gpu'
XPU = 'xpu'
NPU = 'npu'
MLU = 'mlu'
IPU = 'ipu'
CUSTOM_DEVICE = 'custom_device'
......@@ -73,8 +72,6 @@ class Device:
return 'FLAGS_selected_npus'
if self._dtype == DeviceType.XPU:
return 'FLAGS_selected_xpus'
if self._dtype == DeviceType.MLU:
return 'FLAGS_selected_mlus'
if self._dtype == DeviceType.IPU:
return 'FLAGS_selected_ipus'
if self._dtype == DeviceType.CUSTOM_DEVICE:
......@@ -117,9 +114,6 @@ class Device:
elif 'ASCEND_VISIBLE_DEVICES' in os.environ:
dev._dtype = DeviceType.NPU
visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
elif 'MLU_VISIBLE_DEVICES' in os.environ:
dev._dtype = DeviceType.MLU
visible_devices = os.getenv("MLU_VISIBLE_DEVICES")
if visible_devices is not None and visible_devices != 'all':
dev._labels = visible_devices.split(',')
......@@ -162,10 +156,6 @@ class Device:
dev._dtype = DeviceType.NPU
num = core.get_npu_device_count()
visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
elif core.is_compiled_with_mlu():
dev._dtype = DeviceType.MLU
num = core.get_mlu_device_count()
visible_devices = os.getenv("MLU_VISIBLE_DEVICES")
elif core.is_compiled_with_ipu():
dev._dtype = DeviceType.IPU
num = core.get_ipu_device_count()
......
......@@ -724,9 +724,6 @@ class ParallelEnv:
elif core.is_compiled_with_npu():
selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
self._device_id = int(selected_npus[0])
elif core.is_compiled_with_mlu():
selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",")
self._device_id = int(selected_mlus[0])
self._trainer_endpoints = os.getenv(
"PADDLE_TRAINER_ENDPOINTS", ""
......@@ -897,7 +894,6 @@ def _is_cpuonly(backend):
core.is_compiled_with_cuda()
or core.is_compiled_with_xpu()
or core.is_compiled_with_npu()
or core.is_compiled_with_mlu()
)
) or backend == 'xccl':
......@@ -999,7 +995,6 @@ def init_parallel_env():
or core.is_compiled_with_cuda()
or core.is_compiled_with_xpu()
or core.is_compiled_with_npu()
or core.is_compiled_with_mlu()
or backend == "xccl"
):
raise NotImplementedError(
......@@ -1021,9 +1016,6 @@ def init_parallel_env():
elif not is_cpu_only and core.is_compiled_with_npu():
_check_var_exists('FLAGS_selected_npus')
backend = "hccl" if backend == "auto" else backend
elif not is_cpu_only and core.is_compiled_with_mlu():
_check_var_exists('FLAGS_selected_mlus')
backend = "cncl" if backend == "auto" else backend
_check_var_exists("PADDLE_TRAINER_ID")
_check_var_exists("PADDLE_CURRENT_ENDPOINT")
......@@ -1048,8 +1040,6 @@ def init_parallel_env():
place = core.XPUPlace(parallel_env.device_id)
elif core.is_compiled_with_npu():
place = core.NPUPlace(parallel_env.device_id)
elif core.is_compiled_with_mlu():
place = core.MLUPlace(parallel_env.device_id)
_set_expected_place(place)
......@@ -1167,11 +1157,6 @@ def init_parallel_env():
parallel_helper._set_parallel_ctx(
core.HCCLParallelContext(strategy, place)
)
elif core.is_compiled_with_mlu():
parallel_helper._set_parallel_ctx(
core.CNCLParallelContext(strategy, place)
)
if backend != "heter":
other_endpoints = strategy.trainer_endpoints[:]
other_endpoints.remove(strategy.current_endpoint)
......
......@@ -76,7 +76,6 @@ def _options_valid_check(options):
'ips',
'gpus',
'xpus',
'mlus',
'print_config',
'backend',
]
......@@ -110,7 +109,7 @@ def _get_default_nprocs():
elif 'xpu' in device:
return core.get_xpu_device_count()
elif 'mlu' in device:
return core.get_mlu_device_count()
return core.get_custom_device_count('mlu')
elif 'cpu' in device:
return multiprocessing.cpu_count()
else:
......@@ -267,7 +266,7 @@ def _get_subprocess_env_list(nprocs, options):
env_devices = os.getenv("MLU_VISIBLE_DEVICES", None)
if env_devices is None or env_devices == "":
env_devices_list = [
str(x) for x in range(core.get_mlu_device_count())
str(x) for x in range(core.get_custom_device_count('mlu'))
]
else:
env_devices_list = env_devices.split(',')
......
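
In the `spawn.py` hunks above, counting MLU devices now routes through the generic custom-device counter rather than the removed `core.get_mlu_device_count()`. A rough sketch of the equivalent logic, under the assumption that `paddle.fluid.core` exposes `get_custom_device_count` as used in this diff and that `MLU_VISIBLE_DEVICES` still narrows visibility:

```python
# Sketch of the post-removal device-count fallback; returns 0 when no MLU
# plugin is registered. Names mirror the hunks above, not a public API promise.
import os

from paddle.fluid import core  # core.get_custom_device_count is what the diff uses


def default_mlu_nprocs() -> int:
    visible = os.getenv("MLU_VISIBLE_DEVICES")
    if visible:  # honour an explicit visibility list, as _get_subprocess_env_list does
        return len([x for x in visible.split(',') if x.strip()])
    return core.get_custom_device_count('mlu')


if __name__ == "__main__":
    print(default_mlu_nprocs())
```
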
......@@ -306,11 +306,7 @@ def monkey_patch_varbase():
if _grad_scalar:
# When using amp with Fleet DistributedStrategy, we do loss scaling implicitly.
self = _grad_scalar.scale(self)
if (
paddle.is_compiled_with_xpu()
or paddle.is_compiled_with_npu()
or paddle.is_compiled_with_mlu()
):
if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu():
# TODO(liuyuhui): Currently only for xpu. Will be removed in the future.
scaled_loss = scale_loss(self)
if framework.global_var._in_eager_mode_:
......
......@@ -1585,7 +1585,6 @@ class Executor:
program = pruned_program
def _can_use_interpreter_core(program, place):
compiled = isinstance(
program, compiler.CompiledProgram
) or isinstance(program._graph, compiler.CompiledProgram)
......
......@@ -51,7 +51,6 @@ __all__ = [
'cuda_places',
'cpu_places',
'xpu_places',
'mlu_places',
'cuda_pinned_places',
'_non_static_mode',
'in_dygraph_mode',
......@@ -649,18 +648,6 @@ def _current_expected_place():
"You are using XPU version Paddle, but your XPU device is not set properly. CPU device will be used by default."
)
_global_expected_place_ = core.CPUPlace()
elif core.is_compiled_with_mlu():
try:
device_count = core.get_mlu_device_count()
except Exception as e:
device_count = 0
if device_count > 0:
_global_expected_place_ = core.MLUPlace(_mlu_ids()[0])
else:
warnings.warn(
"You are using MLU version Paddle, but your MLU device is not set properly. CPU device will be used by default."
)
_global_expected_place_ = core.CPUPlace()
elif core.is_compiled_with_custom_device("npu"):
# TODO(duanyanhui): Optimize DeviceManager and Return all expected places when device registered in DeviceManager is greater than 1.
try:
......@@ -746,15 +733,6 @@ def _custom_device_ids(device_type):
return device_ids
def _mlu_ids():
mlus_env = os.getenv("FLAGS_selected_mlus")
if mlus_env:
device_ids = [int(s) for s in mlus_env.split(",")]
else:
device_ids = range(core.get_mlu_device_count())
return device_ids
def is_compiled_with_xpu():
"""
Whether this whl package can be used to run the model on XPU.
......@@ -1050,48 +1028,6 @@ def cuda_pinned_places(device_count=None):
return [core.CUDAPinnedPlace()] * device_count
def mlu_places(device_ids=None):
"""
This function creates a list of :code:`paddle.device.MLUPlace` objects.
If :code:`device_ids` is None, environment variable of
:code:`FLAGS_selected_mlus` would be checked first. For example, if
:code:`FLAGS_selected_mlus=0,1,2`, the returned list would
be [paddle.device.MLUPlace(0), paddle.device.MLUPlace(1), paddle.device.MLUPlace(2)].
If :code:`FLAGS_selected_mlus` is not set, all visible
mlu places would be returned.
If :code:`device_ids` is not None, it should be the device
ids of MLUs. For example, if :code:`device_ids=[0,1,2]`,
the returned list would be
[paddle.device.MLUPlace(0), paddle.device.MLUPlace(1), paddle.device.MLUPlace(2)].
Note:
For multi-card tasks, please use `FLAGS_selected_mlus` environment variable to set the visible MLU device.
Parameters:
device_ids (list or tuple of int, optional): list of MLU device ids.
Returns:
list of paddle.device.MLUPlace: Created MLU place list.
Examples:
.. code-block:: python
# required: mlu
import paddle
import paddle.static as static
paddle.enable_static()
mlu_places = static.mlu_places()
"""
assert core.is_compiled_with_mlu(), "Not compiled with MLU"
if device_ids is None:
device_ids = _mlu_ids()
elif not isinstance(device_ids, (list, tuple)):
device_ids = [device_ids]
return [core.MLUPlace(dev_id) for dev_id in device_ids]
class NameScope:
def __init__(self, name="", parent=None):
self._children = dict()
......@@ -2645,10 +2581,6 @@ class Variable(metaclass=VariableMetaClass):
p = core.Place()
p.set_place(t._place())
place = core.NPUPlace(p.npu_device_id())
elif p.is_mlu_place():
p = core.Place()
p.set_place(t._place())
place = core.MLUPlace(p.mlu_device_id())
else:
p = core.Place()
p.set_place(t._place())
......@@ -7574,9 +7506,9 @@ def device_guard(device=None):
device, index = device.split(':')
if device == 'cpu':
raise ValueError("Should not set device id for cpu.")
if device not in ['cpu', 'gpu', 'npu', 'xpu', 'mlu', '', None]:
if device not in ['cpu', 'gpu', 'npu', 'xpu', '', None]:
raise ValueError(
"The Attr(device) should be 'cpu' 'npu' 'xpu' 'mlu' or 'gpu', and it can also be empty string or None "
"The Attr(device) should be 'cpu' 'npu' 'xpu' or 'gpu', and it can also be empty string or None "
"when there is no need to specify device. But received %s" % device
)
if index:
......@@ -7707,7 +7639,6 @@ def _get_paddle_place(place):
core.CUDAPlace,
core.NPUPlace,
core.IPUPlace,
core.MLUPlace,
core.CustomPlace,
),
):
......@@ -7782,21 +7713,8 @@ def _get_paddle_place(place):
device_id = int(device_id)
return core.IPUPlace(device_id)
# MLU
avaliable_mlu_place = re.match(r'mlu:\d+', place)
if avaliable_mlu_place:
if not core.is_compiled_with_mlu():
raise ValueError(
"The device should not be {}, since PaddlePaddle is "
"not compiled with MLU".format(avaliable_mlu_place.group())
)
place_info_list = place.split(':', 1)
device_id = place_info_list[1]
device_id = int(device_id)
return core.MLUPlace(device_id)
raise ValueError(
"Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, IPUPlace, MLUPlace and NPUPlace, but received {}.".format(
"Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, IPUPlace and NPUPlace, but received {}.".format(
place
)
)
......
......@@ -379,9 +379,6 @@ class OpTest(unittest.TestCase):
def is_npu_op_test():
return hasattr(cls, "use_npu") and cls.use_npu
def is_mlu_op_test():
return hasattr(cls, "use_mlu") and cls.use_mlu
def is_custom_device_op_test():
return hasattr(cls, "use_custom_device") and cls.use_custom_device
......@@ -415,7 +412,6 @@ class OpTest(unittest.TestCase):
and not is_mkldnn_op_test()
and not is_rocm_op_test()
and not is_npu_op_test()
and not is_mlu_op_test()
and not is_custom_device_op_test()
and not cls.check_prim
):
......@@ -1972,7 +1968,6 @@ class OpTest(unittest.TestCase):
if (
not paddle.is_compiled_with_xpu()
and not paddle.is_compiled_with_npu()
and not paddle.is_compiled_with_mlu()
and not isinstance(place, core.CustomPlace)
):
self.check_inplace_output_with_place(
......
......@@ -687,9 +687,6 @@ class TestParallelDyGraphRunnerBase:
elif fluid.core.is_compiled_with_npu():
device_id = int(os.getenv("FLAGS_selected_npus", "0"))
place = fluid.NPUPlace(device_id)
elif fluid.core.is_compiled_with_mlu():
device_id = int(os.getenv("FLAGS_selected_mlus", "0"))
place = fluid.MLUPlace(device_id)
else:
assert "Only support CUDAPlace or XPUPlace or CPU(Gloo) for now."
......@@ -892,7 +889,6 @@ def runtime_main(test_class):
parser.add_argument('--use_xpu', action='store_true')
parser.add_argument('--use_dgc', action='store_true')
parser.add_argument('--use_npu', action='store_true')
parser.add_argument('--use_mlu', action='store_true')
parser.add_argument('--accumulate_gradient', action='store_true')
parser.add_argument('--find_unused_parameters', action='store_true')
parser.add_argument('--use_reduce', action='store_true')
......@@ -950,30 +946,20 @@ class TestDistBase(unittest.TestCase):
self.__use_xpu = False
self._use_dgc = False
self.__use_npu = False
self._use_mlu = False
elif self._enforce_place == "GPU":
self.__use_cuda = True
self.__use_xpu = False
self.__use_npu = False
self._use_mlu = False
elif self._enforce_place == "XPU":
self.__use_cuda = False
self.__use_xpu = True
self._use_dgc = False
self.__use_npu = False
self._use_mlu = False
elif self._enforce_place == "NPU":
self.__use_cuda = False
self.__use_xpu = False
self._use_dgc = False
self.__use_npu = True
self._use_mlu = False
elif self._enforce_place == "MLU":
self.__use_cuda = False
self.__use_xpu = False
self._use_dgc = False
self.__use_npu = False
self._use_mlu = True
else:
if fluid.core.is_compiled_with_cuda():
self.__use_cuda = True
......@@ -1473,18 +1459,6 @@ class TestDistBase(unittest.TestCase):
"GLOG_v": "2",
}
)
elif self._use_mlu:
tr_cmd += " --use_mlu"
env.update(
{
"FLAGS_selected_mlus": f"{trainer_id}",
"PADDLE_TRAINERS_NUM": f"{trainer_num}",
"PADDLE_TRAINER_ID": f"{trainer_id}",
"PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
"PADDLE_CURRENT_ENDPOINT": ep,
"GLOG_v": "4",
}
)
else:
env.update({'CPU_NUM': '1'})
......
......@@ -60,7 +60,6 @@ from ..fluid.framework import program_guard # noqa: F401
from ..fluid.framework import cpu_places # noqa: F401
from ..fluid.framework import cuda_places # noqa: F401
from ..fluid.framework import xpu_places # noqa: F401
from ..fluid.framework import mlu_places # noqa: F401
from ..fluid.framework import npu_places # noqa: F401
from ..fluid.framework import Variable # noqa: F401
from ..fluid.framework import Operator # noqa: F401
......@@ -120,7 +119,6 @@ __all__ = [ # noqa
'cuda_places',
'xpu_places',
'npu_places',
'mlu_places',
'Variable',
'create_global_var',
'accuracy',
......
......@@ -186,10 +186,6 @@ elif core.is_compiled_with_npu():
_, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'NPU', core.VarDesc.VarType.FP16
)
elif core.is_compiled_with_mlu():
_, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'MLU', core.VarDesc.VarType.FP16
)
else:
_, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'GPU', core.VarDesc.VarType.FP16
......
......@@ -1540,10 +1540,6 @@ def load(program, model_path, executor=None, var_list=None):
p = paddle.fluid.core.Place()
p.set_place(t._place())
place = paddle.fluid.NPUPlace(p.npu_device_id())
elif p.is_mlu_place():
p = paddle.fluid.core.Place()
p.set_place(t._place())
place = paddle.fluid.MLUPlace(p.mlu_device_id())
else:
p = paddle.fluid.core.Place()
p.set_place(t._place())
......@@ -1684,10 +1680,6 @@ def set_program_state(program, state_dict):
p = paddle.fluid.core.Place()
p.set_place(ten_place)
py_place = paddle.fluid.NPUPlace(p.npu_device_id())
elif ten_place.is_mlu_place():
p = paddle.fluid.core.Place()
p.set_place(ten_place)
py_place = paddle.fluid.MLUPlace(p.mlu_device_id())
ten.set(new_para_np, py_place)
......
......@@ -35,9 +35,6 @@ def download_file():
if paddle.is_compiled_with_npu():
url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_npu')
if paddle.is_compiled_with_mlu():
url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_mlu')
f = requests.get(url)
data = f.text
status_code = f.status_code
......