昇腾和寒武纪相关代码退场 npu相关代码退场3 (#53699)

* rm npu
* rm use_npu
* rm npuid
* rm use_npu
* rm npuid
* delete npupinned
* roll back sth.
* roll back sth.
* delete npupinned
* roll back sth.
* roll back sth.
* rm npu
* rollback something
* rollback npu identity
* rollback npu identity

昇腾和寒武纪相关代码退场 npu相关代码退场3 (#53699)
* rm npu
* rm use_npu
* rm npuid
* rm use_npu
* rm npuid
* delete npupinned
* roll back sth.
* roll back sth.
* delete npupinned
* roll back sth.
* roll back sth.
* rm npu
* rollback something
* rollback npu identity
* rollback npu identity
5b054d2f · 张春乔 · GitHub · 0689e2a5 · 5b054d2f · 5b054d2f
12 changed files
--- a/paddle/fluid/framework/library_type.h
+++ b/paddle/fluid/framework/library_type.h
@@ -68,8 +68,6 @@ inline LibraryType StringToLibraryType(const char* ctype) {
    return LibraryType::kPlain;
  } else if (s == std::string("IPU")) {
    return LibraryType::kPlain;
-  } else if (s == std::string("NPU")) {
-    return LibraryType::kPlain;
  } else if (s == std::string("CUDA")) {
    return LibraryType::kPlain;
  } else {

--- a/paddle/fluid/operators/beam_search_decode_op.h
+++ b/paddle/fluid/operators/beam_search_decode_op.h
@@ -34,13 +34,10 @@ struct BeamSearchDecodeFunctor {
        id_tensor_(id_tensor),
        score_tensor_(score_tensor) {
    tensor_on_gpu_ = false;
-    tensor_on_npu_ = false;
    // First make a copy of GPU data on CPU
    if (platform::is_gpu_place(step_ids_origin_[0].place())) {
      if (platform::is_gpu_place(step_ids_origin_[0].place())) {
        tensor_on_gpu_ = true;
-      } else {
-        tensor_on_npu_ = true;
      }
      platform::DeviceContextPool& pool =
          platform::DeviceContextPool::Instance();
@@ -63,8 +60,6 @@ struct BeamSearchDecodeFunctor {
    if (platform::is_gpu_place(step_scores_origin_[0].place())) {
      if (platform::is_gpu_place(step_scores_origin_[0].place())) {
        tensor_on_gpu_ = true;
-      } else {
-        tensor_on_npu_ = true;
      }
      platform::DeviceContextPool& pool =
          platform::DeviceContextPool::Instance();
@@ -96,7 +91,7 @@ struct BeamSearchDecodeFunctor {
    } else {
      BeamSearchDecoder<T> beam_search_decoder(beam_size_, end_id_);
      // Check if the tensor is on GPU. If so, use the CPU copy instead
-      if (tensor_on_gpu_ || tensor_on_npu_) {
+      if (tensor_on_gpu_) {
        beam_search_decoder.Backtrace(
            step_ids_, step_scores_, id_tensor_, score_tensor_);
      } else {
@@ -107,7 +102,6 @@ struct BeamSearchDecodeFunctor {
  }
  bool tensor_on_gpu_;
-  bool tensor_on_npu_;
  size_t beam_size_;
  int end_id_;
  // TODO(Superjomn) Here might result serious performance issue in the

--- a/paddle/fluid/operators/memcpy_op.cc
+++ b/paddle/fluid/operators/memcpy_op.cc
@@ -111,7 +111,6 @@ class MemcpyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
                 "1: dst is on CUDAPlace. "
                 "2: dst is on CUDAPinnedPlace. "
                 "3: dst is on XPUPlace. "
-                 "4: dst is on NPUPinnerPlace. "
                 "5: dst is on CustomDevicePlace");
    AddComment(R"DOC(
    Memcpy Operator.

--- a/paddle/fluid/operators/memcpy_op.h
+++ b/paddle/fluid/operators/memcpy_op.h
@@ -39,8 +39,6 @@ class MemcpyFunctor {
    CUDA = 1,
    CUDA_PINNED = 2,
    XPU = 3,
-    NPU = 4,
-    NPU_PINNED = 5,
    CUSTOM_DEVICE = 6,
  };

--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -98,7 +98,6 @@ BufferedReader::BufferedReader(
  cpu_buffer_.resize(buffer_size);
  cuda_buffer_.resize(buffer_size);
-  npu_buffer_.resize(buffer_size);
  xpu_buffer_.resize(buffer_size);
  custom_device_buffer_.resize(buffer_size);
  ReadTillBufferFullAsync();

--- a/paddle/fluid/operators/reader/buffered_reader.h
+++ b/paddle/fluid/operators/reader/buffered_reader.h
@@ -77,7 +77,6 @@ class BufferedReader : public framework::DecoratedReader {
  // buffers and prevent alloc every time.
  std::vector<TensorVec> cpu_buffer_;
  std::vector<TensorVec> cuda_buffer_;
-  std::vector<TensorVec> npu_buffer_;
  std::vector<TensorVec> xpu_buffer_;
  std::vector<TensorVec> custom_device_buffer_;
  size_t prev_pos_{-1UL};

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -123,7 +123,6 @@ cc_library(
       framework_proto
       ${IPU_CTX_DEPS}
       ${GPU_CTX_DEPS}
-       ${NPU_CTX_DEPS}
       ${MKLDNN_CTX_DEPS}
       ${dgc_deps}
       dlpack

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -93,7 +93,6 @@ namespace platform {
 enum DeviceType {
  CPU = 0,
  CUDA = 1,
-  NPU = 2,
  XPU = 3,
  IPU = 4,
  CUSTOM_DEVICE = 6,

--- a/paddle/phi/common/backend.h
+++ b/paddle/phi/common/backend.h
@@ -52,7 +52,6 @@ enum class Backend : uint8_t {
  // various acceleration devices' backends
  XPU,  // XPU currently does not exist at the same time as CUDA
-  NPU,  // NPU currently does not exist at the same time as CUDA
  IPU,
  // paddle kernel primitives backend

--- a/paddle/phi/common/place.h
+++ b/paddle/phi/common/place.h
@@ -31,7 +31,6 @@ enum class AllocationType : int8_t {
  GPU = 2,
  GPUPINNED = 3,
  XPU = 4,
-  NPU = 5,
  IPU = 7,
  CUSTOM = 9,
 };

--- a/python/paddle/distributed/auto_parallel/cluster.py
+++ b/python/paddle/distributed/auto_parallel/cluster.py
@@ -450,9 +450,8 @@ class Cluster:
        """Generate cluster by default config."""
        gpu_models = ["V100", "A100", "H100", "A2", "A10", "A16", "A30", "A40"]
        xpu_models = ["XPU"]
-        npu_models = ["NPU"]
        dcu_models = ["DCU"]
-        all_gpu_models = gpu_models + xpu_models + npu_models + dcu_models
+        all_gpu_models = gpu_models + xpu_models + dcu_models
        self._num_devices_per_machine = device_count
        def _convert_to_type(gpu_model):
@@ -461,8 +460,6 @@ class Cluster:
                type = "GPU"
            elif gpu_model in xpu_models:
                type = "XPU"
-            elif gpu_model in npu_models:
-                type = "NPU"
            elif gpu_model in dcu_models:
                type = "DCU"
            else:

--- a/python/paddle/distributed/auto_parallel/cluster_v2.py
+++ b/python/paddle/distributed/auto_parallel/cluster_v2.py
@@ -25,7 +25,6 @@ class DeviceType(IntEnum):
    CPU = 1
    GPU = 2
    XPU = 3
-    NPU = 4
    DCU = 5
    NIC = 6