Unverified · Commit 0d45ac73 · Authored by: 张春乔 · Committed by: GitHub

Retire Ascend (昇腾) and Cambricon (寒武纪) related code: NPU-related code removal, part 2 (#53568)

Parent 00ded2ea
......@@ -111,8 +111,7 @@ bool MessageBus::Send(int64_t dst_rank,
#else
PADDLE_THROW(platform::errors::Unavailable(
"Fleet executor does not support sending message between different "
"ranks when Paddle is compiled with npu or "
"isn't compiled with distributed for now."));
"ranks when Paddle isn't compiled with distributed for now."));
#endif
return true;
}
......@@ -202,10 +201,9 @@ void MessageBus::ListenPort() {
}
LOG(INFO) << "Message bus's listen port thread starts successful.";
#else
LOG(WARNING)
<< "Fleet executor's ListenPort() is a fake function when Paddle is "
"compiled with npu or Paddle isn't compiled "
"with distributed for now.";
LOG(WARNING) << "Fleet executor's ListenPort() is a fake function when "
"Paddle isn't compiled "
"with distributed for now.";
#endif
}
......
......@@ -89,11 +89,6 @@ struct DLDeviceVisitor
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
}
inline ::DLDevice operator()(const platform::NPUPinnedPlace &place) const {
PADDLE_THROW(platform::errors::Unimplemented(
"platform::NPUPinnedPlace is not supported"));
}
inline ::DLDevice operator()(const platform::CustomPlace &place) const {
PADDLE_THROW(platform::errors::Unimplemented(
"platform::CustomPlace is not supported"));
......
......@@ -50,10 +50,6 @@ static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) {
execution_strategy.num_threads_ = 1;
break;
}
case platform::DeviceType::NPU: {
execution_strategy.num_threads_ = 1;
break;
}
case platform::DeviceType::CUSTOM_DEVICE: {
execution_strategy.num_threads_ = 1;
break;
......
......@@ -196,7 +196,7 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
? OpFuncType::kGpuSync
: OpFuncType::kGpuAsync;
} else {
// Memcpy in npu and custom devices is asynchronous
// Memcpy in custom devices is asynchronous
new_op_func_node.type_ = OpFuncType::kGpuAsync;
}
......@@ -225,7 +225,7 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
}
}
// NOTE(winter-wang): in npu and custom device, D2H kernel is asynchronous.
// NOTE(winter-wang): in custom device, D2H kernel is asynchronous.
// need to explicit synchronization.
if ((platform::is_custom_place(place)) && op_type == kMemcpyD2H) {
dev_ctx->Wait();
......
......@@ -150,7 +150,7 @@ DeviceContext* StreamAnalyzer::ParseDeviceContext(
DeviceContext* dev_ctx = nullptr;
// only gpu/npu need update. xpu not need, because xpu memcpy op kernel is
// only gpu needs update. xpu not need, because xpu memcpy op kernel is
// synchronous.
if (platform::is_gpu_place(place_) || platform::is_custom_place(place_)) {
VLOG(6) << "Parse DeviceContext for " << op_type
......
......@@ -1331,8 +1331,8 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo(
device_name = "XPU";
} else {
PADDLE_THROW(
platform::errors::Unavailable("Only CPU/CUDA/NPU/XPU is supportted. "
"please use CPU/CUDA/NPU/XPU backend."));
platform::errors::Unavailable("Only CPU/CUDA/XPU is supportted. "
"please use CPU/CUDA/XPU backend."));
}
VLOG(1) << string::Sprintf(
......
......@@ -52,12 +52,12 @@ OpSupportedInfos(const std::string& place,
{"CPU", &platform::is_cpu_place},
{"XPU", &platform::is_xpu_place},
};
PADDLE_ENFORCE_NE(is_target_place.count(query_place),
0,
platform::errors::InvalidArgument(
"The argument `place` should be 'GPU', 'CPU', 'XPU', "
"'NPU', but got '%s'.",
place));
PADDLE_ENFORCE_NE(
is_target_place.count(query_place),
0,
platform::errors::InvalidArgument(
"The argument `place` should be 'GPU', 'CPU', 'XPU', but got '%s'.",
place));
std::unordered_set<std::string> all_ops;
const auto& op_info = framework::OpInfoMap::Instance().map();
......@@ -147,7 +147,7 @@ AmpOperators::AmpOperators()
OpSupportedInfos("GPU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(),
unsupported_ops_gpu_bf16.end());
// NOTE: GPU/NPU/XPU is compiled seperatly.
// NOTE: GPU/XPU is compiled seperatly.
#elif defined(PADDLE_WITH_XPU)
auto unsupported_ops_xpu_fp16 = std::get<2>(
OpSupportedInfos("XPU", paddle::framework::proto::VarType::FP16));
......
......@@ -364,10 +364,6 @@ struct Argument {
IpuEnableModelRuntimeExecutor,
bool);
// npu related
DECL_ARGUMENT_FIELD(use_npu, UseNpu, bool);
DECL_ARGUMENT_FIELD(npu_device_id, NPUDeviceId, int);
// mixed precision related
DECL_ARGUMENT_FIELD(model_precision, ModelPrecision, int);
DECL_ARGUMENT_FIELD(mixed_black_list,
......
......@@ -56,8 +56,6 @@ PassStrategy *AnalysisConfig::pass_builder() const {
pass_builder_.reset(new GpuPassStrategy);
} else if (use_xpu_) {
pass_builder_.reset(new XpuPassStrategy);
} else if (use_npu_) {
pass_builder_.reset(new NpuPassStrategy);
} else if (use_ipu_) {
LOG(INFO) << "Create IPU IR passes";
pass_builder_.reset(new IpuPassStrategy);
......@@ -506,8 +504,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(use_opencl_);
// NPU related.
CP_MEMBER(use_npu_);
CP_MEMBER(npu_device_id_);
CP_MEMBER(nnadapter_config_);
// profile related.
......@@ -574,9 +570,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
} else if (use_custom_device_) {
pass_builder_.reset(new CustomDevicePassStrategy(
*static_cast<CustomDevicePassStrategy *>(other.pass_builder())));
} else if (use_npu_) {
pass_builder_.reset(new NpuPassStrategy(
*static_cast<NpuPassStrategy *>(other.pass_builder())));
} else {
pass_builder_.reset(new CpuPassStrategy(
*static_cast<CpuPassStrategy *>(other.pass_builder())));
......@@ -827,7 +820,6 @@ void AnalysisConfig::Update() {
// Transfer pass_builder and copy the existing compatible passes.
if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu())) ||
((use_xpu() ^ pass_builder_->use_xpu())) ||
((use_npu() ^ pass_builder_->use_npu())) ||
((use_ipu() ^ pass_builder_->use_ipu())) ||
((use_custom_device() ^ pass_builder_->use_custom_device()))) {
if (use_gpu()) {
......@@ -841,13 +833,6 @@ void AnalysisConfig::Update() {
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
pass_builder_.reset(new XpuPassStrategy);
} else if (use_npu()) {
PADDLE_ENFORCE_EQ(
use_gpu(),
false,
platform::errors::InvalidArgument(
"Only one choice can be made between GPU and NPU."));
pass_builder_.reset(new NpuPassStrategy);
} else if (use_custom_device()) {
PADDLE_ENFORCE_EQ(
use_gpu(),
......@@ -875,14 +860,6 @@ void AnalysisConfig::Update() {
"Only one choice can be made between CPU and XPU."));
pass_builder_.reset(new XpuPassStrategy(
*static_cast<XpuPassStrategy *>(pass_builder_.get())));
} else if (use_npu()) {
PADDLE_ENFORCE_EQ(
use_gpu(),
false,
platform::errors::InvalidArgument(
"Only one choice can be made between GPU and NPU."));
pass_builder_.reset(new NpuPassStrategy(
*static_cast<NpuPassStrategy *>(pass_builder_.get())));
} else if (use_custom_device()) {
PADDLE_ENFORCE_EQ(
use_gpu(),
......@@ -1114,9 +1091,6 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << op_type;
}
ss << use_npu_;
ss << npu_device_id_;
ss << thread_local_stream_;
ss << use_ipu_;
......
......@@ -148,8 +148,8 @@ phi::Backend ConvertBackend(paddle_infer::PlaceType backend) {
return phi::Backend::CUSTOM;
default:
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Paddle Inference not support backend, we now only support GPU, XPU, "
"NPU and CPU."));
"Paddle Inference not support backend, we now only support GPU, XPU "
"and CPU."));
return phi::Backend::CPU;
}
}
......@@ -1432,9 +1432,6 @@ void AnalysisPredictor::PrepareArgument() {
argument_->SetIpuCustomPatterns(config_.ipu_custom_patterns_);
#endif
argument_->SetUseNpu(config_.use_npu_);
argument_->SetNPUDeviceId(config_.npu_device_id());
if (config_.use_mkldnn_) {
LOG(INFO) << "MKLDNN is enabled";
argument_->SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
......
......@@ -130,7 +130,7 @@ T *Tensor::mutable_data(PlaceType place) {
}
default:
PADDLE_THROW(paddle::platform::errors::Unavailable(
"Only CPU / CUDA / XPU / NPU places is supported. The place `%d` is "
"Only CPU / CUDA / XPU places is supported. The place `%d` is "
"not supported.",
static_cast<int>(place)));
break;
......@@ -261,7 +261,7 @@ void Tensor::CopyFromCpu(const T *data) {
dev_ctx->stream());
#else
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"The analysis predictor supports CPU, GPU, NPU and XPU now."));
"The analysis predictor supports CPU, GPU and XPU now."));
#endif
}
}
......@@ -468,7 +468,7 @@ void Tensor::CopyToCpuImpl(T *data,
dev_ctx->GetStream()->Synchronize();
#else
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"The analysis predictor supports CPU, GPU, NPU and XPU now."));
"The analysis predictor supports CPU, GPU and XPU now."));
#endif
}
}
......
......@@ -414,12 +414,6 @@ struct PD_INFER_DECL AnalysisConfig {
/// \return bool Whether the XPU is turned on.
///
bool use_xpu() const { return use_xpu_; }
///
/// \brief A boolean state telling whether the NPU is turned on.
///
/// \return bool Whether the NPU is turned on.
///
bool use_npu() const { return use_npu_; }
/// \brief A boolean state telling whether the IPU is turned on.
///
/// \return bool Whether the IPU is turned on.
......@@ -461,12 +455,6 @@ struct PD_INFER_DECL AnalysisConfig {
/// \return int The XPU device id.
///
int xpu_device_id() const { return xpu_device_id_; }
///
/// \brief Get the NPU device id.
///
/// \return int The NPU device id.
///
int npu_device_id() const { return npu_device_id_; }
/// \brief Get the number of IPU device .
///
/// \return int The number of IPU device.
......@@ -1083,10 +1071,6 @@ struct PD_INFER_DECL AnalysisConfig {
bool use_external_stream_{false};
void* exec_stream_{nullptr};
// NPU related
bool use_npu_{false};
int npu_device_id_{0};
// CustomDevice related
bool use_custom_device_{false};
int custom_device_id_{0};
......
......@@ -360,7 +360,6 @@ struct PD_INFER_DECL NativeConfig : public PaddlePredictor::Config {
/// GPU related fields.
bool use_xpu{false};
bool use_gpu{false};
bool use_npu{false};
int device{0};
float fraction_of_gpu_memory{
-1.f}; ///< Change to a float in (0,1] if needed.
......
......@@ -162,10 +162,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \return A bool variable implying whether we are in xpu mode.
bool use_xpu() const { return use_xpu_; }
/// \brief Check if we are using npu.
/// \return A bool variable implying whether we are in npu mode.
bool use_npu() const { return use_npu_; }
/// \brief Check if we are using ipu.
/// \return A bool variable implying whether we are in ipu mode.
bool use_ipu() const { return use_ipu_; }
......@@ -181,7 +177,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \cond Protected
bool use_xpu_{false};
bool use_gpu_{false};
bool use_npu_{false};
bool use_ipu_{false};
bool use_mkldnn_{false};
bool use_custom_device_{false};
......@@ -293,21 +288,6 @@ class PD_INFER_DECL XpuPassStrategy final : public PassStrategy {
XpuPassStrategy();
};
/// \class NpuPassStrategy
/// \brief The NPU passes controller, it is used in AnalysisPredictor with NPU
/// mode.
class PD_INFER_DECL NpuPassStrategy final : public PassStrategy {
public:
NpuPassStrategy() : PassStrategy({}) { use_npu_ = true; }
/// \brief Construct by copying another NpuPassStrategy object.
/// \param[in] other The NpuPassStrategy object we want to copy.
explicit NpuPassStrategy(const NpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {
use_npu_ = true;
}
};
/// \class CustomDevicePassStrategy
/// \brief The CustomDevice passes controller, it is used in AnalysisPredictor
/// with CustomDevice
......
......@@ -176,11 +176,6 @@ PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) {
return config->use_xpu();
}
PD_Bool PD_ConfigUseNpu(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
return config->use_npu();
}
int32_t PD_ConfigGpuDeviceId(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
return config->gpu_device_id();
......@@ -189,10 +184,6 @@ int32_t PD_ConfigXpuDeviceId(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
return config->xpu_device_id();
}
int32_t PD_ConfigNpuDeviceId(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
return config->npu_device_id();
}
void PD_ConfigEnableCustomDevice(__pd_keep PD_Config* pd_config,
char* device_type,
......
......@@ -222,14 +222,6 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu(
PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseXpu(
__pd_keep PD_Config* pd_config);
///
/// \brief A boolean state telling whether the NPU is turned on.
///
/// \param[in] pd_onfig config
/// \return Whether the NPU is turned on.
///
PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseNpu(
__pd_keep PD_Config* pd_config);
///
/// \brief Get the GPU device id.
///
/// \param[in] pd_onfig config
......@@ -246,14 +238,6 @@ PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGpuDeviceId(
PADDLE_CAPI_EXPORT extern int32_t PD_ConfigXpuDeviceId(
__pd_keep PD_Config* pd_config);
///
/// \brief Get the NPU device id.
///
/// \param[in] pd_onfig config
/// \return The NPU device id.
///
PADDLE_CAPI_EXPORT extern int32_t PD_ConfigNpuDeviceId(
__pd_keep PD_Config* pd_config);
///
/// \brief Turn on custome device.
///
/// \param[in] pd_config config
......
......@@ -230,15 +230,6 @@ func (config *Config) UseXpu() bool {
return cvtPDBoolToGo(C.PD_ConfigUseXpu(config.c))
}
///
/// \brief A boolean state telling whether the NPU is turned on.
///
/// \return bool Whether the NPU is turned on.
///
func (config *Config) UseNpu() bool {
return cvtPDBoolToGo(C.PD_ConfigUseNpu(config.c))
}
///
/// \brief Get the GPU device id.
///
......@@ -257,15 +248,6 @@ func (config *Config) XpuDeviceId() int32 {
return int32(C.PD_ConfigXpuDeviceId(config.c))
}
///
/// \brief Get the NPU device id.
///
/// \return int32 The NPU device id.
///
func (config *Config) NpuDeviceId() int32 {
return int32(C.PD_ConfigNpuDeviceId(config.c))
}
///
/// \brief Get the initial size in MB of the GPU memory pool.
///
......
......@@ -190,13 +190,3 @@ class StatRegistry {
USE_INT_STAT(STAT_gpu13_mem_size); \
USE_INT_STAT(STAT_gpu14_mem_size); \
USE_INT_STAT(STAT_gpu15_mem_size)
#define USE_NPU_MEM_STAT \
USE_INT_STAT(STAT_npu0_mem_size); \
USE_INT_STAT(STAT_npu1_mem_size); \
USE_INT_STAT(STAT_npu2_mem_size); \
USE_INT_STAT(STAT_npu3_mem_size); \
USE_INT_STAT(STAT_npu4_mem_size); \
USE_INT_STAT(STAT_npu5_mem_size); \
USE_INT_STAT(STAT_npu6_mem_size); \
USE_INT_STAT(STAT_npu7_mem_size)
......@@ -28,7 +28,6 @@ using Place = phi::Place;
using CPUPlace = phi::CPUPlace;
using CUDAPlace = phi::GPUPlace;
using CUDAPinnedPlace = phi::GPUPinnedPlace;
using NPUPinnedPlace = phi::NPUPinnedPlace;
using XPUPlace = phi::XPUPlace;
using IPUPlace = phi::IPUPlace;
using CustomPlace = phi::CustomPlace;
......@@ -87,11 +86,6 @@ typename Visitor::result_type VisitPlace(const Place &place,
return typename Visitor::result_type();
#endif
}
case phi::AllocationType::NPUPINNED: {
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with NPU. Cannot visit npu_pinned"));
return typename Visitor::result_type();
}
case phi::AllocationType::IPU: {
#ifdef PADDLE_WITH_IPU
platform::IPUPlace p(place.GetDeviceId());
......
......@@ -673,7 +673,6 @@ void BindNativeConfig(py::module *m) {
.def(py::init<>())
.def_readwrite("use_gpu", &NativeConfig::use_gpu)
.def_readwrite("use_xpu", &NativeConfig::use_xpu)
.def_readwrite("use_npu", &NativeConfig::use_npu)
.def_readwrite("device", &NativeConfig::device)
.def_readwrite("fraction_of_gpu_memory",
&NativeConfig::fraction_of_gpu_memory)
......@@ -805,10 +804,8 @@ void BindAnalysisConfig(py::module *m) {
.def("enable_ort_optimization", &AnalysisConfig::EnableORTOptimization)
.def("use_gpu", &AnalysisConfig::use_gpu)
.def("use_xpu", &AnalysisConfig::use_xpu)
.def("use_npu", &AnalysisConfig::use_npu)
.def("gpu_device_id", &AnalysisConfig::gpu_device_id)
.def("xpu_device_id", &AnalysisConfig::xpu_device_id)
.def("npu_device_id", &AnalysisConfig::npu_device_id)
.def("memory_pool_init_size_mb",
&AnalysisConfig::memory_pool_init_size_mb)
.def("fraction_of_gpu_memory_for_pool",
......
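
With the `use_npu`/`npu_device_id` bindings above removed, the Python-facing inference `Config` is driven only by the CPU/GPU/XPU/IPU/custom-device options. A minimal, hedged usage sketch (the model directory and pool size are placeholders, and a GPU build of Paddle is assumed; this is not code from the commit):

```python
# Hypothetical usage sketch; "./my_model_dir" and the pool size are placeholders.
from paddle.inference import Config, create_predictor

config = Config("./my_model_dir")   # assumed example model directory
config.enable_use_gpu(256, 0)       # GPU memory pool size in MB, device id 0
# config.enable_xpu() would target XPU instead; config.use_npu() and
# config.npu_device_id() no longer exist after this change.
predictor = create_predictor(config)
print(config.use_gpu(), config.gpu_device_id())
```
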
......@@ -629,7 +629,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
[](platform::Place &self) { return platform::is_custom_place(self); })
.def("gpu_device_id", [](platform::Place &self) { return self.device; })
.def("xpu_device_id", [](platform::Place &self) { return self.device; })
.def("npu_device_id", [](platform::Place &self) { return self.device; })
.def("ipu_device_id", [](platform::Place &self) { return self.device; })
.def("custom_device_id",
[](platform::Place &self) { return self.device; })
......
......@@ -110,23 +110,6 @@ size_t CUDAPinnedMaxChunkSize() {
return CUDAPinnedMaxAllocSize() / 256;
}
size_t NPUPinnedMaxAllocSize() {
// For distributed systems, it requires configuring and limiting
// the fraction of memory to use.
return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
}
size_t NPUPinnedMinChunkSize() {
// Allow to allocate the minimum chunk size is 64 KB.
return 1 << 16;
}
size_t NPUPinnedMaxChunkSize() {
// Allow to allocate the maximum chunk size is roughly 1/256 of NPU_PINNED
// memory.
return NPUPinnedMaxAllocSize() / 256;
}
#ifdef PADDLE_WITH_XBYAK
static Xbyak::util::Cpu cpu;
bool MayIUse(const cpu_isa_t cpu_isa) {
......
......@@ -75,15 +75,6 @@ size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();
//! Get the maximum allocation size for a machine.
size_t NPUPinnedMaxAllocSize();
//! Get the minimum chunk size for buddy allocator.
size_t NPUPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t NPUPinnedMaxChunkSize();
typedef enum {
isa_any,
sse42,
......
......@@ -35,8 +35,6 @@ const char *AllocationTypeStr(AllocationType type) {
return "gpu_pinned";
case AllocationType::XPU:
return "xpu";
case AllocationType::NPUPINNED:
return "npu_pinned";
case AllocationType::IPU:
return "ipu";
default:
......@@ -55,7 +53,6 @@ std::string Place::DebugString() const {
os << AllocationTypeStr(alloc_type_);
}
if (alloc_type_ == AllocationType::GPUPINNED ||
alloc_type_ == AllocationType::NPUPINNED ||
alloc_type_ == AllocationType::CPU) {
os << ")";
} else {
......
......@@ -32,7 +32,6 @@ enum class AllocationType : int8_t {
GPUPINNED = 3,
XPU = 4,
NPU = 5,
NPUPINNED = 6,
IPU = 7,
CUSTOM = 9,
};
......@@ -163,15 +162,6 @@ class XPUPlace : public Place {
: Place(AllocationType::XPU, place.GetDeviceId()) {}
};
class NPUPinnedPlace : public Place {
public:
NPUPinnedPlace() : Place(AllocationType::NPUPINNED) {}
NPUPinnedPlace(const NPUPinnedPlace&) = default;
NPUPinnedPlace(const Place& place UNUSED) // NOLINT
: Place(AllocationType::NPUPINNED) {}
};
class IPUPlace : public Place {
public:
IPUPlace() : Place(AllocationType::IPU, 0) {}
......
......@@ -161,12 +161,6 @@ void set_constant_with_place<phi::XPUPlace>(const phi::DeviceContext& context,
#endif
}
template <>
void set_constant_with_place<phi::NPUPinnedPlace>(
const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) {
PADDLE_THROW(phi::errors::Unimplemented("NPUPinnedPlace is not supported"));
}
template <>
void set_constant_with_place<phi::IPUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
......
......@@ -15,7 +15,6 @@
import paddle
from paddle import _legacy_C_ops
from paddle.distributed import collective
from paddle.fluid import core
from paddle.fluid.data_feeder import check_dtype, check_variable_and_dtype
from paddle.framework import LayerHelper, _create_tensor, in_dygraph_mode
from paddle.nn import Layer
......@@ -551,11 +550,7 @@ def _parallel_linear(
)
# NOTE: npu linear function use matmul_v2 but linear use matmul
linear_function = (
_linear
if core.is_compiled_with_custom_device('npu')
else paddle.nn.functional.linear
)
linear_function = paddle.nn.functional.linear
linear_out = linear_function(
x,
linear.weight,
......
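
With the NPU special case gone, `_parallel_linear` always calls `paddle.nn.functional.linear`. A small self-contained sketch of that call, with arbitrarily chosen shapes for illustration:

```python
import paddle
import paddle.nn.functional as F

x = paddle.randn([2, 8])          # batch of 2, 8 input features
weight = paddle.randn([8, 4])     # [in_features, out_features]
bias = paddle.zeros([4])
y = F.linear(x, weight, bias)     # the op the parallel linear path now always uses
print(y.shape)                    # [2, 4]
```
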
......@@ -595,9 +595,6 @@ class ShardingOptimizer(MetaOptimizerBase):
# amp inf_var & clip global_norm_var
rings = [self.mp_ring_id, self.pp_ring_id]
# FIXME(wangxi): some problem with NPU found_finite, need sync with DP
if core.is_compiled_with_custom_device('npu'):
rings += [self.dp_ring_id]
FP16Utils.sync_amp_check_nan_inf(main_block, rings)
gradientclip_helper = GradientClipHelper(None)
......@@ -719,10 +716,7 @@ class ShardingOptimizer(MetaOptimizerBase):
self._recreate_not_persist_param_as_var()
self._dump_program_for_debug()
# GPU need to wait server ready, GPU and NPU is Layered connection
if not core.is_compiled_with_custom_device('npu'):
self._wait()
self._wait()
return optimize_ops, params_grads
def _init_pair_comm(self, pair, ring_id):
......
......@@ -1988,14 +1988,9 @@ class Executor:
for var in program.global_block().vars.values():
if var.is_data:
data_vars.append(var)
if core.is_compiled_with_custom_device('npu'):
dataset = paddle.fluid.DatasetFactory().create_dataset(
'InMemoryDataset'
)
else:
dataset = paddle.fluid.DatasetFactory().create_dataset(
'FileInstantDataset'
)
dataset = paddle.fluid.DatasetFactory().create_dataset(
'FileInstantDataset'
)
dataset.set_batch_size(1)
dataset.set_thread(1)
dataset.set_filelist(['None'])
......@@ -2165,14 +2160,9 @@ class Executor:
for var in program.global_block().vars.values():
if var.is_data:
data_vars.append(var)
if core.is_compiled_with_custom_device('npu'):
dataset = paddle.fluid.DatasetFactory().create_dataset(
'InMemoryDataset'
)
else:
dataset = paddle.fluid.DatasetFactory().create_dataset(
'FileInstantDataset'
)
dataset = paddle.fluid.DatasetFactory().create_dataset(
'FileInstantDataset'
)
dataset.set_batch_size(1)
dataset.set_thread(1)
dataset.set_filelist(['None'])
......
......@@ -597,21 +597,6 @@ def _current_expected_place():
"You are using XPU version Paddle, but your XPU device is not set properly. CPU device will be used by default."
)
_global_expected_place_ = core.CPUPlace()
elif core.is_compiled_with_custom_device("npu"):
# TODO(duanyanhui): Optimize DeviceManager and Return all expected places when device registered in DeviceManager is greater than 1.
try:
device_count = core.get_custom_device_count("npu")
except Exception as e:
device_count = 0
if device_count > 0:
_global_expected_place_ = core.CustomPlace(
"npu", _custom_device_ids("npu")[0]
)
else:
warnings.warn(
"You are using NPU version Paddle, but your NPU device is not set properly. CPU device will be used by default."
)
_global_expected_place_ = core.CPUPlace()
else:
_global_expected_place_ = core.CPUPlace()
......@@ -7454,9 +7439,9 @@ def device_guard(device=None):
device, index = device.split(':')
if device == 'cpu':
raise ValueError("Should not set device id for cpu.")
if device not in ['cpu', 'gpu', 'xpu', 'npu', '', None]:
if device not in ['cpu', 'gpu', 'xpu', '', None]:
raise ValueError(
"The Attr(device) should be 'cpu' 'npu' or 'gpu', and it can also be empty string or None "
"The Attr(device) should be 'cpu' or 'gpu', and it can also be empty string or None "
"when there is no need to specify device. But received %s" % device
)
if index:
......
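
After this change, `paddle.static.device_guard` only accepts 'cpu', 'gpu', 'xpu' (optionally with an index), an empty string, or None. A short usage sketch under that assumption:

```python
import paddle

paddle.enable_static()
main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    data = paddle.full(shape=[2, 2], fill_value=1.0)
    with paddle.static.device_guard("cpu"):      # pin this op to CPU
        sum_cpu = paddle.sum(data)
    with paddle.static.device_guard("gpu:0"):    # pin this op to GPU 0
        sum_gpu = paddle.sum(data)
    # paddle.static.device_guard("npu:0") now raises ValueError.
```
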
......@@ -4554,9 +4554,7 @@ class PipelineOptimizer:
def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0):
self._device = 'cpu'
if core.is_compiled_with_custom_device('npu'):
self._device = "npu"
elif core.is_compiled_with_cuda():
if core.is_compiled_with_cuda():
self._device = "gpu"
if in_dygraph_mode():
raise Exception("In dygraph, don't support PipelineOptimizer.")
......@@ -4945,8 +4943,8 @@ class PipelineOptimizer:
else None
)
if device:
assert device[0:3] == 'gpu' or device[0:3] == 'npu', (
"Now, only gpu and npu devices are "
assert device[0:3] == 'gpu', (
"Now, only gpu devices are "
"supported in pipeline parallemism."
)
return device
......@@ -5148,8 +5146,8 @@ class PipelineOptimizer:
continue
dev_type = device.split(':')[0]
assert dev_type == "gpu" or dev_type == 'npu', (
"Now only gpu and npu devices are supported "
assert dev_type == "gpu", (
"Now only gpu devices are supported "
"for pipeline parallelism."
)
......@@ -6388,8 +6386,6 @@ class PipelineOptimizer:
dev_index = int(dev.split(":")[1])
if core.is_compiled_with_cuda():
place_list.append(core.CUDAPlace(dev_index % 1))
elif paddle.is_compiled_with_custom_device('npu'):
place_list.append(paddle.CustomPlace('npu', dev_index % 1))
# Step6: Split startup program
new_startup_program = self._split_startup_program(
......@@ -6412,8 +6408,6 @@ class PipelineOptimizer:
if core.is_compiled_with_cuda():
place_id = int(os.getenv("FLAGS_selected_gpus", "0"))
elif core.is_compiled_with_custom_device('npu'):
place_id = int(os.getenv("FLAGS_selected_npus", "0"))
# A pass to move the recv op to the beginning of
# the forward/backward phase
self._mv_head_recv(program_list[self.local_rank])
......
......@@ -16,7 +16,6 @@ from paddle import _C_ops, _legacy_C_ops, get_flags, in_dynamic_mode
from paddle.device import (
get_all_custom_device_type,
is_compiled_with_cuda,
is_compiled_with_custom_device,
is_compiled_with_rocm,
)
from paddle.fluid.framework import _global_flags, in_dygraph_mode
......@@ -465,13 +464,6 @@ def conv1d(
l_type = 'depthwise_conv2d'
use_cudnn = False
# NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
if is_compiled_with_custom_device('npu'):
if num_channels == groups and num_channels == num_filters:
l_type = 'depthwise_conv2d'
else:
l_type = 'conv2d'
squeeze_aixs = -3 if channel_last else -2
x = unsqueeze(x, axis=[squeeze_aixs])
......@@ -755,13 +747,6 @@ def conv2d(
use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]
# NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
if is_compiled_with_custom_device('npu'):
if num_channels == groups and num_channels == num_filters:
l_type = 'depthwise_conv2d'
else:
l_type = 'conv2d'
if (
is_compiled_with_cuda()
and get_flags("FLAGS_conv2d_disable_cudnn")[
......
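
The removed branch only affected which kernel name (`depthwise_conv2d` vs `conv2d`) was picked on NPU; the depthwise case itself is unchanged. A minimal illustration with arbitrary shapes:

```python
import paddle
import paddle.nn.functional as F

x = paddle.randn([1, 8, 32, 32])  # NCHW input with 8 channels
w = paddle.randn([8, 1, 3, 3])    # [out_channels, in_channels/groups, kH, kW]
# in_channels == out_channels == groups, i.e. a depthwise convolution
y = F.conv2d(x, w, stride=1, padding=1, groups=8)
print(y.shape)                    # [1, 8, 32, 32]
```
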
......@@ -16,7 +16,7 @@ import math
# TODO: define loss functions of neural network
import paddle
from paddle import _C_ops, _legacy_C_ops, fluid, in_dynamic_mode
from paddle import _C_ops, fluid, in_dynamic_mode
from paddle.framework import core
from paddle.static.nn.control_flow import Assert
from paddle.utils import deprecated
......@@ -269,51 +269,15 @@ def fluid_softmax_with_cross_entropy(
if input_dims - 1 == label_dims:
label = paddle.unsqueeze(label, axis=axis)
if in_dygraph_mode():
if core.is_compiled_with_custom_device("npu"):
if not soft_label:
valid_label = (
paddle.cast(label != ignore_index, dtype=label.dtype)
* label
)
softmax, loss = _legacy_C_ops.softmax_with_cross_entropy(
logits,
valid_label,
'soft_label',
soft_label,
'ignore_index',
ignore_index,
'numeric_stable_mode',
numeric_stable_mode,
'axis',
axis,
'use_softmax',
True,
)
else:
softmax, loss = _legacy_C_ops.softmax_with_cross_entropy(
logits,
label,
'soft_label',
soft_label,
'ignore_index',
ignore_index,
'numeric_stable_mode',
numeric_stable_mode,
'axis',
axis,
'use_softmax',
True,
)
else:
softmax, loss = _C_ops.cross_entropy_with_softmax(
logits,
label,
soft_label,
True,
numeric_stable_mode,
ignore_index,
axis,
)
softmax, loss = _C_ops.cross_entropy_with_softmax(
logits,
label,
soft_label,
True,
numeric_stable_mode,
ignore_index,
axis,
)
if not return_softmax:
return loss
else:
......@@ -2734,41 +2698,9 @@ def cross_entropy(
valid_label = (
paddle.cast(label != ignore_index, dtype=label.dtype) * label
)
if core.is_compiled_with_custom_device("npu"):
if not soft_label:
_, out = _legacy_C_ops.softmax_with_cross_entropy(
input,
valid_label,
'soft_label',
soft_label,
'ignore_index',
ignore_index,
'numeric_stable_mode',
True,
'axis',
axis,
'use_softmax',
use_softmax,
)
else:
_, out = _legacy_C_ops.softmax_with_cross_entropy(
input,
label,
'soft_label',
soft_label,
'ignore_index',
ignore_index,
'numeric_stable_mode',
True,
'axis',
axis,
'use_softmax',
use_softmax,
)
else:
_, out = _C_ops.cross_entropy_with_softmax(
input, label, soft_label, use_softmax, True, ignore_index, axis
)
_, out = _C_ops.cross_entropy_with_softmax(
input, label, soft_label, use_softmax, True, ignore_index, axis
)
if weight is not None:
......
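
With the NPU-specific `_legacy_C_ops.softmax_with_cross_entropy` branches removed, both functions above always dispatch to the unified `cross_entropy_with_softmax` kernel in dygraph. A hedged end-user sketch of the equivalent call:

```python
import paddle
import paddle.nn.functional as F

logits = paddle.randn([4, 10])                            # 4 samples, 10 classes
labels = paddle.randint(0, 10, shape=[4], dtype='int64')  # hard labels
loss = F.cross_entropy(logits, labels, reduction='none')  # per-sample losses
print(loss.shape)
```
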
......@@ -220,24 +220,7 @@ class OptimizerWithMixedPrecision:
"""
train_program = loss.block.program
self._train_program = train_program
# NOTE(zhiqiu): _float_status is only used for NPU.
if core.is_compiled_with_custom_device('npu'):
float_status = paddle.static.data(
name="float_status", shape=[8], dtype='float32'
)
self._train_program.global_block().append_op(
type="alloc_float_status",
outputs={"FloatStatus": float_status},
)
self._train_program.global_block().append_op(
type="clear_float_status",
inputs={"FloatStatus": float_status},
outputs={"FloatStatusOut": float_status},
)
self._float_status = float_status
else:
self._float_status = None
self._float_status = None
with program_guard(self._train_program, startup_program):
self._init_amp_var()
......@@ -476,27 +459,17 @@ class OptimizerWithMixedPrecision:
if self._is_distributed:
# if distributed, split check_finite_and_unscale to overlap
# unscale with communication
if core.is_compiled_with_custom_device('npu'):
with self._train_program._optimized_guard(grads):
for p, g in params_grads:
with self._train_program._optimized_guard([p, g]):
_, found_inf = check_finite_and_unscale(
grads,
[
g,
],
self._loss_scaling,
name="find_infinite_scale",
float_status=self._float_status,
)
found_infs.append(found_inf)
else:
for p, g in params_grads:
with self._train_program._optimized_guard([p, g]):
_, found_inf = check_finite_and_unscale(
[
g,
],
self._loss_scaling,
name="find_infinite_scale",
float_status=self._float_status,
)
found_infs.append(found_inf)
elif self._use_pure_fp16:
if fp32_grads:
with self._train_program._optimized_guard(fp32_grads):
......
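
Since `_float_status` is now always None, static-graph AMP no longer inserts `alloc_float_status`/`clear_float_status` ops before training. A minimal decorate-and-minimize sketch using the public `paddle.static.amp` API (the network and optimizer choices are illustrative, not taken from this commit):

```python
import paddle

paddle.enable_static()
main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[None, 16], dtype='float32')
    out = paddle.static.nn.fc(x, size=1)
    loss = paddle.mean(out)
    opt = paddle.optimizer.SGD(learning_rate=1e-3)
    # Wraps the optimizer in OptimizerWithMixedPrecision, shown in the hunk above.
    opt = paddle.static.amp.decorate(opt, use_dynamic_loss_scaling=True)
    opt.minimize(loss)
```
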
......@@ -97,8 +97,6 @@ def _get_sys_unsupported_list(dtype):
device = None
if core.is_compiled_with_xpu():
device = 'XPU'
elif core.is_compiled_with_custom_device('npu'):
device = 'NPU'
else:
device = 'GPU'
_, _, sys_unsupported_list = core.op_supported_infos(device, var_type)
......
......@@ -948,13 +948,6 @@ def conv2d(
):
l_type = 'depthwise_conv2d'
# NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
if core.is_compiled_with_custom_device('npu'):
if num_channels == groups and num_channels == num_filters:
l_type = 'depthwise_conv2d'
else:
l_type = 'conv2d'
helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype()
......
......@@ -212,12 +212,6 @@ class Timeline:
self._chrome_trace.emit_pid(
"memory usage on %s:cudapinnedplace:%d" % (k, 0), pid
)
if (k, 0, "NPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "NPU")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:npu:%d" % (k, 0), pid
)
def _allocate_events(self):
for k, profile_pb in self._profile_dict.items():
......