Unverified commit 6d396ace, authored by 张春乔, committed by GitHub

rm npu (#53566)

Parent: 7dcf5e53
......@@ -89,11 +89,6 @@ struct DLDeviceVisitor
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
}
inline ::DLDevice operator()(const platform::NPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::NPUPlace is not supported"));
}
inline ::DLDevice operator()(const platform::NPUPinnedPlace &place) const {
PADDLE_THROW(platform::errors::Unimplemented(
"platform::NPUPinnedPlace is not supported"));
......
......@@ -55,8 +55,6 @@ static phi::Backend ConvertPlaceToBackend(const phi::Place& place) {
return phi::Backend::GPU;
case phi::AllocationType::XPU:
return phi::Backend::XPU;
case phi::AllocationType::NPU:
return phi::Backend::NPU;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Cannot convert place(%d).", static_cast<int>(place.GetType())));
......
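With the `AllocationType::NPU` case deleted, the conversion switch covers only CPU, GPU, and XPU before falling through to the error. A minimal sketch of the resulting function, assuming the CPU case that sits above this hunk's visible context (not a verbatim copy of the file, and it relies on Paddle's own headers for `PADDLE_THROW`):

```cpp
// Sketch of ConvertPlaceToBackend after this hunk is applied.
static phi::Backend ConvertPlaceToBackend(const phi::Place& place) {
  switch (place.GetType()) {
    case phi::AllocationType::CPU:   // assumed from context outside the diff
      return phi::Backend::CPU;
    case phi::AllocationType::GPU:
      return phi::Backend::GPU;
    case phi::AllocationType::XPU:
      return phi::Backend::XPU;
    default:  // NPU places now land here instead of mapping to a backend
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Cannot convert place(%d).", static_cast<int>(place.GetType())));
  }
}
```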
......@@ -374,9 +374,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
#define REGISTER_OP_NPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, NPU, ::paddle::platform::NPUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
customized_name, \
customized_type_value, \
......@@ -413,12 +410,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_NPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, NPU, ::paddle::platform::NPUPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_IPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, IPU, ::paddle::platform::IPUPlace, DEFAULT_TYPE, \
......
......@@ -1327,8 +1327,6 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo(
device_name = "CPU";
} else if (member_->use_device_ == p::kCUDA) {
device_name = "CUDA";
} else if (member_->use_device_ == p::kNPU) {
device_name = "NPU";
} else if (member_->use_device_ == p::kXPU) {
device_name = "XPU";
} else {
......
......@@ -138,8 +138,6 @@ phi::Backend ConvertBackend(paddle_infer::PlaceType backend) {
case paddle_infer::PlaceType::kGPU:
// NOTE: phi also support phi::Backend::GPUDNN.
return phi::Backend::GPU;
case paddle_infer::PlaceType::kNPU:
return phi::Backend::NPU;
case paddle_infer::PlaceType::kXPU:
return phi::Backend::XPU;
case paddle_infer::PlaceType::kCPU:
......
......@@ -82,8 +82,6 @@ bool NativePaddlePredictor::Init(
place_ = paddle::platform::CUDAPlace(config_.device);
} else if (config_.use_xpu) {
place_ = paddle::platform::XPUPlace(config_.device);
} else if (config_.use_npu) {
place_ = paddle::platform::NPUPlace(config_.device);
} else {
place_ = paddle::platform::CPUPlace();
}
......
......@@ -124,9 +124,6 @@ T *Tensor::mutable_data(PlaceType place) {
case static_cast<int>(PlaceType::kXPU): {
return tensor->mutable_data<T>(paddle::platform::XPUPlace(device_));
}
case static_cast<int>(PlaceType::kNPU): {
return tensor->mutable_data<T>(paddle::platform::NPUPlace(device_));
}
case static_cast<int>(PlaceType::kCUSTOM): {
return tensor->mutable_data<T>(
paddle::platform::CustomPlace(device_type_, device_));
......
......@@ -67,7 +67,7 @@ enum DataType {
// TODO(Inference): support more data types if needed.
};
enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU, kIPU, kCUSTOM };
enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kIPU, kCUSTOM };
enum class DataLayout { kUNK = -1, kAny, kNHWC, kNCHW };
......
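Worth noting: the enumerators after `kUNK = -1` are implicitly numbered, so deleting `kNPU` renumbers the tail of the enum. The `static_assert`s below are my illustration, not part of the patch:

```cpp
// After this hunk, kIPU drops from 4 to 3 and kCUSTOM from 5 to 4; any raw
// int that crossed a serialization or ABI boundary under the old numbering
// now names a different place.
enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kIPU, kCUSTOM };
static_assert(static_cast<int>(PlaceType::kIPU) == 3, "was 4 before this commit");
static_assert(static_cast<int>(PlaceType::kCUSTOM) == 4, "was 5 before this commit");
```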
......@@ -53,6 +53,3 @@ namespace plat = paddle::platform;
REGISTER_OP_WITHOUT_GRADIENT(c_sync_comm_stream,
ops::CSyncCommStreamOp,
ops::CSyncCommStreamOpMaker);
REGISTER_OP_NPU_KERNEL(c_sync_comm_stream,
ops::CSyncCommStreamKernel<float, plat::NPUPlace>);
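This deletion is the call-site counterpart of removing `REGISTER_OP_NPU_KERNEL` above: with the macro gone, every registration of this shape stops compiling. For contrast, a hypothetical registration against the surviving XPU macro (this exact op/kernel pairing is my example, not something in the tree):

```cpp
// Hypothetical illustration only -- registrations under the macros that
// remain (CPU/CUDA/XPU/IPU) are untouched by this commit.
REGISTER_OP_XPU_KERNEL(c_sync_comm_stream,
                       ops::CSyncCommStreamKernel<float, plat::XPUPlace>);
```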
......@@ -95,9 +95,6 @@ class FillConstantOp : public framework::OperatorWithKernel {
case 3:
kt.set_backend(phi::Backend::XPU);
break;
case 4:
kt.set_backend(phi::Backend::NPU);
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Could NOT determine the place of variable, place_type = %d .",
......@@ -161,8 +158,7 @@ class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
"0: CPUPlace. "
"1: CUDAPlace. "
"2: CUDAPinnedPlace. "
"3: XPUPlace. "
"4: NPUPlace. ")
"3: XPUPlace. ")
.SetDefault(-1);
AddOutput("Out",
"(Tensor) Tensor of specified shape will be filled "
......
......@@ -86,16 +86,15 @@ class MemcpyD2HOpProtoMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out",
"(phi::DenseTensor) The type of output "
"is the same as input X.");
AddAttr<int>(
"dst_place_type",
"Determine the dst place of tensor copy. "
"By Now it ONLY support XPU/NPUPlace/CUDAPlace <-> CUDAPinnedPlace/CPU"
"Other place type is Unimplemented and will cause ERROR."
"0: dst is on CPUPlace. "
"1: dst is on CUDAPinnedPlace. ");
AddAttr<int>("dst_place_type",
"Determine the dst place of tensor copy. "
"By Now it ONLY support XPU/CUDAPlace <-> CUDAPinnedPlace/CPU"
"Other place type is Unimplemented and will cause ERROR."
"0: dst is on CPUPlace. "
"1: dst is on CUDAPinnedPlace. ");
AddComment(R"DOC(
MemcpyD2H Operator.
By now, it ONLY supports the memcopy between NPUPlace/CUDAPlace <-> CUDAPinnedPlace/CPU.
By now, it ONLY supports the memcopy between CUDAPlace <-> CUDAPinnedPlace/CPU.
You would have to update it if you want other more capacities.
Out = X, when type in [phi::DenseTensor]
raise error if the type is not listed above.
......
......@@ -91,13 +91,12 @@ class MemcpyH2DOpProtoMaker : public framework::OpProtoAndCheckerMaker {
"Determine the dst place of tensor copy. "
"By Now it support:"
"0. CUDAPinnedPlace/CPU <->CUDAPlace"
"1. NPUPinnedPlace/CPU <-> NPUPlace"
"2. CPU <->XPUPlace"
"3. CPU <->IPUPlace"
"1. CPU <->XPUPlace"
"2. CPU <->IPUPlace"
"Other place type is Unimplemented and will cause ERROR.");
AddComment(R"DOC(
MemcpyD2H Operator.
By now, it ONLY supports the memcopy between CUDAPinnedPlace/CPU <-> NPUPlace/CUDAPlace.
By now, it ONLY supports the memcopy between CUDAPinnedPlace/CPU <-> CUDAPlace.
You would have to update it if you want other more capacities.
Out = X, when type in [phi::DenseTensor]
raise error if the type is not listed above.
......
......@@ -105,20 +105,17 @@ class MemcpyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
"is the same as input X.");
AddAttr<int>("dst_place_type",
"Determine the dst place of tensor copy. "
"By Now it ONLY support CUDAPlace <-> CUDAPinnedPlace or "
"NPUPlace <-> CPUPlace. "
"By Now it ONLY support CUDAPlace <-> CUDAPinnedPlace."
"Other place type is Unimplemented and will cause ERROR."
"0: dst is on CPUPlace. "
"1: dst is on CUDAPlace. "
"2: dst is on CUDAPinnedPlace. "
"3: dst is on XPUPlace. "
"4: dst is on NPUPlace. "
"5: dst is on NPUPinnerPlace. "
"6: dst is on CustomDevicePlace");
"4: dst is on NPUPinnerPlace. "
"5: dst is on CustomDevicePlace");
AddComment(R"DOC(
Memcpy Operator.
By now, it ONLY supports the memcopy between CUDAPinnedPlace <-> CUDAPlace or
NPUPlace <-> CPUPlace, and used as an internal op by Recompute-Offload.
By now, it ONLY supports the memcopy between CUDAPinnedPlace <-> CUDAPlace, and used as an internal op by Recompute-Offload.
You would have to update it if you want other more capacities.
Out = X, when type in [phi::DenseTensor]
......
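The same renumbering shows up here as data: `dst_place_type` codes 5 and 6 (NPUPinnerPlace, CustomDevicePlace) become 4 and 5, with code 4 (NPUPlace) deleted. A hedged sketch of the caller-side consequence; the attribute-map construction is my example, not from the patch:

```cpp
// Illustrative only: a program serialized with the old integer codes must be
// re-exported or patched, since the same int now names a different place.
paddle::framework::AttributeMap attrs;
attrs["dst_place_type"] = 5;  // CustomDevicePlace; this was 6 before the commit
```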
......@@ -106,7 +106,6 @@ DeviceType Place2DeviceType(const platform::Place& place);
constexpr DeviceType kCPU = DeviceType::CPU;
constexpr DeviceType kCUDA = DeviceType::CUDA;
constexpr DeviceType kXPU = DeviceType::XPU;
constexpr DeviceType kNPU = DeviceType::NPU;
constexpr DeviceType kIPU = DeviceType::IPU;
constexpr DeviceType kCUSTOM_DEVICE = DeviceType::CUSTOM_DEVICE;
......
......@@ -26,7 +26,6 @@
using ::paddle::platform::kCPU;
using ::paddle::platform::kCUDA;
using ::paddle::platform::kCUSTOM_DEVICE;
using ::paddle::platform::kNPU;
using ::paddle::platform::kXPU;
USE_EVENT(kCPU)
......
......@@ -102,8 +102,6 @@ Place PlaceHelper::CreatePlace(const std::string &dev_type, size_t dev_id) {
return platform::CPUPlace();
} else if (dev_type == "gpu") {
return platform::CUDAPlace(dev_id);
} else if (dev_type == "npu") {
return platform::NPUPlace(dev_id);
} else if (dev_type == "xpu") {
return platform::XPUPlace(dev_id);
} else {
......
......@@ -28,7 +28,6 @@ using Place = phi::Place;
using CPUPlace = phi::CPUPlace;
using CUDAPlace = phi::GPUPlace;
using CUDAPinnedPlace = phi::GPUPinnedPlace;
using NPUPlace = phi::NPUPlace;
using NPUPinnedPlace = phi::NPUPinnedPlace;
using XPUPlace = phi::XPUPlace;
using IPUPlace = phi::IPUPlace;
......@@ -88,11 +87,6 @@ typename Visitor::result_type VisitPlace(const Place &place,
return typename Visitor::result_type();
#endif
}
case phi::AllocationType::NPU: {
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with NPU. Cannot visit npu_pinned"));
return typename Visitor::result_type();
}
case phi::AllocationType::NPUPINNED: {
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with NPU. Cannot visit npu_pinned"));
......
......@@ -159,7 +159,7 @@ void InitTensorWithNumpyValue(TensorObject* self,
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Place should be one of "
"CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/CustomPlace"));
"CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/CustomPlace"));
}
}
......
......@@ -108,7 +108,7 @@ void InitTensorWithNumpyValue(const py::object& array,
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Place should be one of "
"CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/CustomPlace"));
"CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/CustomPlace"));
}
}
......
......@@ -52,7 +52,6 @@ extern PyTypeObject* g_place_pytype;
extern PyTypeObject* g_cudaplace_pytype;
extern PyTypeObject* g_cpuplace_pytype;
extern PyTypeObject* g_xpuplace_pytype;
extern PyTypeObject* g_npuplace_pytype;
extern PyTypeObject* g_cudapinnedplace_pytype;
extern PyTypeObject* g_customplace_pytype;
extern PyTypeObject* g_framework_tensor_pytype;
......@@ -529,9 +528,6 @@ platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos) {
} else if (PyObject_IsInstance(
obj, reinterpret_cast<PyObject*>(g_xpuplace_pytype))) {
place = ::pybind11::handle(obj).cast<platform::XPUPlace>();
} else if (PyObject_IsInstance(
obj, reinterpret_cast<PyObject*>(g_npuplace_pytype))) {
place = ::pybind11::handle(obj).cast<platform::NPUPlace>();
} else if (PyObject_IsInstance(
obj, reinterpret_cast<PyObject*>(g_cudapinnedplace_pytype))) {
place = ::pybind11::handle(obj).cast<platform::CUDAPinnedPlace>();
......@@ -542,7 +538,7 @@ platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos) {
PADDLE_THROW(platform::errors::InvalidArgument(
"argument (position %d) must be "
"one "
"of(Place,CUDAPlace,CPUPlace,XPUPlace,NPUPlace,CUDAPinnedPlace,"
"of(Place,CUDAPlace,CPUPlace,XPUPlace,CUDAPinnedPlace,"
"CustomPlace), "
"but got %s",
arg_pos + 1,
......
......@@ -144,8 +144,6 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) {
return place_obj.cast<platform::XPUPlace>();
} else if (py::isinstance<platform::CUDAPinnedPlace>(place_obj)) {
return place_obj.cast<platform::CUDAPinnedPlace>();
} else if (py::isinstance<platform::NPUPlace>(place_obj)) {
return place_obj.cast<platform::NPUPlace>();
} else if (py::isinstance<platform::IPUPlace>(place_obj)) {
return place_obj.cast<platform::IPUPlace>();
} else if (py::isinstance<platform::Place>(place_obj)) {
......@@ -155,7 +153,7 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) {
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Place should be one of "
"Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/IPUPlace/"
"Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/IPUPlace/"
"CustomPlace"));
}
}
......@@ -208,7 +206,7 @@ static void InitVarBaseAndTensor(imperative::VarBase *self,
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Place should be one of "
"CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/IPUPlace/"));
"CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/IPUPlace/"));
}
self->SetDataType(framework::TransToProtoVarType(tensor->dtype()));
}
......@@ -711,14 +709,6 @@ void BindImperative(py::module *m_ptr) {
py::arg("zero_copy") = false,
py::arg("name") = "",
py::arg("stop_gradient") = -1)
.def("__init__",
&InitVarBaseFromNumpyWithArg<platform::NPUPlace>,
py::arg("value"),
py::arg("place"),
py::arg("persistable") = false,
py::arg("zero_copy") = false,
py::arg("name") = "",
py::arg("stop_gradient") = -1)
.def("__init__",
&InitVarBaseFromNumpyWithArg<platform::CustomPlace>,
py::arg("value"),
......@@ -752,11 +742,6 @@ void BindImperative(py::module *m_ptr) {
py::arg("tensor"),
py::arg("place"),
py::arg("name") = "")
.def("__init__",
&InitVarBaseFromTensorWithArg<platform::NPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("name") = "")
.def("__init__",
&InitVarBaseFromTensorWithArg<platform::CustomPlace>,
py::arg("tensor"),
......@@ -1877,18 +1862,6 @@ void BindImperative(py::module *m_ptr) {
return new_var;
},
py::return_value_policy::copy)
.def(
"_copy_to",
[](const std::shared_ptr<imperative::VarBase> &self,
const platform::NPUPlace &place,
bool blocking) {
auto new_var = self->NewVarBase(place, blocking);
if (!blocking) {
IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
}
return new_var;
},
py::return_value_policy::copy)
.def(
"_copy_to",
[](const std::shared_ptr<imperative::VarBase> &self,
......@@ -2219,11 +2192,6 @@ void BindImperative(py::module *m_ptr) {
self.SetExpectedPlace(*p);
VLOG(4) << "Tracer(" << &self << ")"
<< " set expected place " << *p;
} else if (py::isinstance<platform::NPUPlace>(obj)) {
auto p = obj.cast<platform::NPUPlace *>();
self.SetExpectedPlace(*p);
VLOG(4) << "Tracer(" << &self << ")"
<< " set expected place " << *p;
} else if (py::isinstance<platform::IPUPlace>(obj)) {
auto p = obj.cast<platform::IPUPlace *>();
self.SetExpectedPlace(*p);
......@@ -2242,7 +2210,7 @@ void BindImperative(py::module *m_ptr) {
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Incompatible Place Type: supports XPUPlace, CUDAPlace, "
"CPUPlace, NPUPlace, IPUPlace"
"CPUPlace, IPUPlace"
"and CUDAPinnedPlace, "
"but got Unknown Type!"));
}
......@@ -2375,28 +2343,6 @@ void BindImperative(py::module *m_ptr) {
inplace_map);
}
})
.def("trace",
[](imperative::Tracer &self,
const std::string &type,
const PyNameVarBaseMap &ins,
const PyNameVarBaseMap &outs,
framework::AttributeMap attrs,
const platform::NPUPlace &place,
bool trace_backward,
const std::map<std::string, std::string> &inplace_map = {}) {
auto ins_map = ConvertToNameVarBaseMap(ins);
auto outs_map = ConvertToNameVarBaseMap(outs);
{
py::gil_scoped_release release;
self.TraceOp<imperative::VarBase>(type,
std::move(ins_map),
std::move(outs_map),
std::move(attrs),
place,
trace_backward,
inplace_map);
}
})
.def("trace",
[](imperative::Tracer &self,
const std::string &type,
......@@ -2488,7 +2434,6 @@ void BindImperative(py::module *m_ptr) {
m.def("varbase_copy", &VarBaseCopy<platform::CUDAPlace>);
m.def("varbase_copy", &VarBaseCopy<platform::XPUPlace>);
m.def("varbase_copy", &VarBaseCopy<platform::CUDAPinnedPlace>);
m.def("varbase_copy", &VarBaseCopy<platform::NPUPlace>);
m.def("varbase_copy", &VarBaseCopy<platform::CustomPlace>);
m.def(
......
......@@ -637,7 +637,6 @@ void BindPaddlePlace(py::module *m) {
.value("CPU", PaddlePlace::kCPU)
.value("GPU", PaddlePlace::kGPU)
.value("XPU", PaddlePlace::kXPU)
.value("NPU", PaddlePlace::kNPU)
.value("CUSTOM", PaddlePlace::kCUSTOM);
}
......
......@@ -189,7 +189,6 @@ PyTypeObject *g_customplace_pytype = nullptr;
PyTypeObject *g_cudaplace_pytype = nullptr;
PyTypeObject *g_cpuplace_pytype = nullptr;
PyTypeObject *g_xpuplace_pytype = nullptr;
PyTypeObject *g_npuplace_pytype = nullptr;
PyTypeObject *g_cudapinnedplace_pytype = nullptr;
PyTypeObject *g_ipuplace_pytype = nullptr;
......@@ -366,7 +365,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::NPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
.def("_get_device_id",
......@@ -495,7 +493,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
.def("_type", &PlaceIndex<platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
.def("_equals",
......@@ -548,8 +545,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
&IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::XPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::NPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
.def("_equals",
......@@ -557,30 +552,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
.def("__repr__", string::to_string<const platform::CUDAPinnedPlace &>)
.def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
// NPUPlace
py::class_<platform::NPUPlace> npuplace(m, "NPUPlace", R"DOC(
NPUPlace is a descriptor of a device.
It represents a NPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
# required: npu
import paddle
place = paddle.NPUPlace(0)
)DOC");
g_npuplace_pytype = reinterpret_cast<PyTypeObject *>(npuplace.ptr());
npuplace.def("__init__", [](platform::NPUPlace &self, int dev_id) {})
.def("_type", &PlaceIndex<platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::NPUPlace>)
.def("_equals",
&IsSamePlace<platform::NPUPlace, platform::CUDAPinnedPlace>)
.def("get_device_id",
[](const platform::NPUPlace &self) { return self.GetDeviceId(); })
.def("__str__", string::to_string<const platform::NPUPlace &>);
// IPUPlace
py::class_<platform::IPUPlace> ipuplace(m, "IPUPlace", R"DOC(
IPUPlace is a descriptor of a device.
......@@ -625,7 +596,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::IPUPlace>)
.def("_equals",
&IsSamePlace<platform::IPUPlace, platform::CUDAPinnedPlace>)
......@@ -639,7 +609,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CustomPlace>)
......@@ -685,10 +654,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
const platform::CUDAPinnedPlace &cuda_pinned_place) {
self = cuda_pinned_place;
})
.def("set_place",
[](platform::Place &self, const platform::NPUPlace &npu_place) {
self = npu_place;
})
.def("set_place",
[](platform::Place &self, const platform::IPUPlace &ipu_place) {
self = ipu_place;
......
......@@ -245,10 +245,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
[](phi::DenseTensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](phi::DenseTensor &self, paddle::platform::NPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_double",
[](phi::DenseTensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<double>(place);
......@@ -315,13 +311,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_clear", &phi::DenseTensor::clear)
.def("_mutable_data",
[](phi::DenseTensor &self,
paddle::platform::NPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CPUPlace>,
py::arg("tensor"),
......@@ -342,11 +331,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::NPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CUDAPinnedPlace>,
py::arg("tensor"),
......@@ -382,11 +366,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::NPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::IPUPlace>,
py::arg("array"),
......@@ -402,7 +381,7 @@ void BindTensor(pybind11::module &m) { // NOLINT
Args:
lod (numpy.ndarray): The data to set.
place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace): The place where the
place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace): The place where the
Tensor is to be set.
zero_copy (bool, optional): Whether to share memory with the input numpy array.
This parameter only works with CPUPlace. Default: False.
......
......@@ -629,7 +629,7 @@ class PADDLE_API Tensor final {
* unified to Tensor, but Tensor itself is heterogeneous.
*
* Tensor can generally be represented by void* and size_t, place.
* This is suitable for most scenarios including CPU, GPU, HIP, NPU, etc.,
* This is suitable for most scenarios including CPU, GPU, HIP, etc.,
* but there are a few cases where this definition cannot be described,
* such as the Tensor representation in third-party lib such as Metal,
* OpenCL, etc., as well as some special Tensor implementations, including
......
......@@ -696,8 +696,6 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.place.GetDeviceId());
} else if (r.place.GetType() == phi::AllocationType::GPUPINNED) {
event->set_place(proto::MemEvent::CUDAPinnedPlace);
} else if (r.place.GetType() == phi::AllocationType::NPU) {
event->set_place(proto::MemEvent::NPUPlace);
} else {
PADDLE_THROW(
errors::Unimplemented("The current place is not supported."));
......
......@@ -91,9 +91,6 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
case Backend::XPU:
os << "XPU";
break;
case Backend::NPU:
os << "NPU";
break;
case Backend::ONEDNN:
os << "ONEDNN";
break;
......@@ -137,8 +134,6 @@ inline Backend StringToBackend(const char* backend_cstr) {
return Backend::GPU;
} else if (s == std::string("XPU")) {
return Backend::XPU;
} else if (s == std::string("NPU")) {
return Backend::NPU;
} else if (s == std::string("OneDNN")) {
return Backend::ONEDNN;
} else if (s == std::string("GPUDNN")) {
......@@ -173,8 +168,6 @@ inline std::string BackendToString(const Backend& backend) {
return "GPU";
case Backend::XPU:
return "XPU";
case Backend::NPU:
return "NPU";
case Backend::ONEDNN:
return "ONEDNN";
case Backend::GPUDNN:
......
......@@ -35,8 +35,6 @@ const char *AllocationTypeStr(AllocationType type) {
return "gpu_pinned";
case AllocationType::XPU:
return "xpu";
case AllocationType::NPU:
return "npu";
case AllocationType::NPUPINNED:
return "npu_pinned";
case AllocationType::IPU:
......@@ -76,8 +74,6 @@ Place GetPinnedPlace(const Place &place) {
case AllocationType::GPU:
return phi::GPUPinnedPlace();
break;
case AllocationType::NPU:
return phi::NPUPinnedPlace();
default:
return place;
}
......
......@@ -163,16 +163,6 @@ class XPUPlace : public Place {
: Place(AllocationType::XPU, place.GetDeviceId()) {}
};
class NPUPlace : public Place {
public:
NPUPlace() : Place(AllocationType::NPU, 0) {}
explicit NPUPlace(int device_id) : Place(AllocationType::NPU, device_id) {}
NPUPlace(const NPUPlace&) = default;
NPUPlace(const Place& place) // NOLINT
: Place(AllocationType::NPU, place.GetDeviceId()) {}
};
class NPUPinnedPlace : public Place {
public:
NPUPinnedPlace() : Place(AllocationType::NPUPINNED) {}
......@@ -220,7 +210,6 @@ namespace experimental {
using AllocationType = phi::AllocationType;
using GPUPinnedPlace = phi::GPUPinnedPlace;
using XPUPlace = phi::XPUPlace;
using NPUPlace = phi::NPUPlace;
} // namespace experimental
using AllocationType = phi::AllocationType;
......
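Only the device place goes here: `NPUPinnedPlace` (declared just below the deleted class) and `AllocationType::NPUPINNED` survive this commit. A sketch of the compile-time effect, assuming only this patch is applied and Paddle's headers are available:

```cpp
// After this hunk (illustrative):
// phi::NPUPlace p(0);        // no longer compiles: NPUPlace is gone from phi
phi::NPUPinnedPlace pinned;   // still compiles; the pinned variant remains
```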
......@@ -37,8 +37,6 @@ Backend TransToPhiBackend(const phi::Place& place) {
return Backend::GPU;
case AllocationType::XPU:
return Backend::XPU;
case AllocationType::NPU:
return Backend::NPU;
case AllocationType::IPU:
return Backend::IPU;
case AllocationType::CUSTOM:
......
......@@ -161,13 +161,6 @@ void set_constant_with_place<phi::XPUPlace>(const phi::DeviceContext& context,
#endif
}
template <>
void set_constant_with_place<phi::NPUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
PADDLE_THROW(phi::errors::Unimplemented("NPUPlace is not supported"));
}
template <>
void set_constant_with_place<phi::NPUPinnedPlace>(
const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) {
......
......@@ -348,7 +348,7 @@ def amp_guard(
or tracer._expected_place.is_custom_place()
):
warnings.warn(
'amp_guard can only be enabled on CUDAPlace, XPUPlace, NPUPlace, and CustomPlace, current place is %s, so it makes no effect.'
'amp_guard can only be enabled on CUDAPlace, XPUPlace, and CustomPlace, current place is %s, so it makes no effect.'
% tracer._expected_place
)
enable = False
......
......@@ -24,7 +24,6 @@ from ..fluid.core import CPUPlace # noqa: F401
from ..fluid.core import IPUPlace # noqa: F401
from ..fluid.core import CUDAPlace # noqa: F401
from ..fluid.core import CUDAPinnedPlace # noqa: F401
from ..fluid.core import NPUPlace # noqa: F401
from ..fluid.core import CustomPlace # noqa: F401
from ..fluid import core # noqa: F401
......
......@@ -2214,7 +2214,7 @@ def _memcpy(input, place=None, output=None):
"""
The OP copies the :attr:`input` to the :attr:`output`.
NOTE: currently, only support CUDAPlace <-> CUDAPinnedPlace or NPUPlace <-> CPUPlace.
NOTE: currently, only support CUDAPlace <-> CUDAPinnedPlace.
Parameters:
input (Tensor): A tensor. Its data type supports float16, float32, float64, int32, int64, and bool.
......
......@@ -194,14 +194,6 @@ class Timeline:
% (k, mevent.device_id),
pid,
)
elif mevent.place == profiler_pb2.MemEvent.NPUPlace:
if (k, mevent.device_id, "NPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id, "NPU")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:npu:%d" % (k, mevent.device_id),
pid,
)
if (k, 0, "CPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "CPU")] = pid
......@@ -259,7 +251,6 @@ class Timeline:
profiler_pb2.MemEvent.CPUPlace: "CPU",
profiler_pb2.MemEvent.CUDAPlace: "GPU",
profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace",
profiler_pb2.MemEvent.NPUPlace: "NPU",
}
for k, profile_pb in self._profile_dict.items():
mem_list = []
......