Unverified commit 0d45ac73, authored by 张春乔, committed by GitHub

Retire Ascend (昇腾) and Cambricon (寒武纪) related code: NPU code removal, part 2 (#53568)

Parent 00ded2ea
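After this change, Ascend NPU support is expected to come from Paddle's pluggable custom-device mechanism rather than the built-in use_npu switches removed below; the deleted branches themselves already probe core.is_compiled_with_custom_device('npu'). A minimal sketch of running on a plugged-in device, assuming an Ascend custom-device plugin is installed (the plugin itself is not part of this PR):

import paddle

# Custom device types registered by installed plugins; an Ascend plugin
# would typically report 'npu' here (assumption).
print(paddle.device.get_all_custom_device_type())

# Select the plugged-in device; this resolves to a CustomPlace rather
# than the removed built-in NPU place.
paddle.set_device('npu:0')
x = paddle.ones([2, 3])
print(x.place)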
@@ -111,8 +111,7 @@ bool MessageBus::Send(int64_t dst_rank,
 #else
   PADDLE_THROW(platform::errors::Unavailable(
       "Fleet executor does not support sending message between different "
-      "ranks when Paddle is compiled with npu or "
-      "isn't compiled with distributed for now."));
+      "ranks when Paddle isn't compiled with distributed for now."));
 #endif
   return true;
 }
@@ -202,10 +201,9 @@ void MessageBus::ListenPort() {
   }
   LOG(INFO) << "Message bus's listen port thread starts successful.";
 #else
-  LOG(WARNING)
-      << "Fleet executor's ListenPort() is a fake function when Paddle is "
-         "compiled with npu or Paddle isn't compiled "
-         "with distributed for now.";
+  LOG(WARNING) << "Fleet executor's ListenPort() is a fake function when "
                  "Paddle isn't compiled "
                  "with distributed for now.";
 #endif
 }
...
@@ -89,11 +89,6 @@ struct DLDeviceVisitor
         platform::errors::Unimplemented("platform::XPUPlace is not supported"));
   }

-  inline ::DLDevice operator()(const platform::NPUPinnedPlace &place) const {
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "platform::NPUPinnedPlace is not supported"));
-  }
-
   inline ::DLDevice operator()(const platform::CustomPlace &place) const {
     PADDLE_THROW(platform::errors::Unimplemented(
         "platform::CustomPlace is not supported"));
...
@@ -50,10 +50,6 @@ static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) {
       execution_strategy.num_threads_ = 1;
       break;
     }
-    case platform::DeviceType::NPU: {
-      execution_strategy.num_threads_ = 1;
-      break;
-    }
     case platform::DeviceType::CUSTOM_DEVICE: {
       execution_strategy.num_threads_ = 1;
       break;
...
@@ -196,7 +196,7 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
                              ? OpFuncType::kGpuSync
                              : OpFuncType::kGpuAsync;
   } else {
-    // Memcpy in npu and custom devices is asynchronous
+    // Memcpy in custom devices is asynchronous
     new_op_func_node.type_ = OpFuncType::kGpuAsync;
   }
@@ -225,7 +225,7 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
     }
   }
-  // NOTE(winter-wang): in npu and custom device, D2H kernel is asynchronous.
+  // NOTE(winter-wang): in custom device, D2H kernel is asynchronous.
   // need to explicit synchronization.
   if ((platform::is_custom_place(place)) && op_type == kMemcpyD2H) {
     dev_ctx->Wait();
...
@@ -150,7 +150,7 @@ DeviceContext* StreamAnalyzer::ParseDeviceContext(
   DeviceContext* dev_ctx = nullptr;
-  // only gpu/npu need update. xpu not need, because xpu memcpy op kernel is
+  // only gpu needs update. xpu not need, because xpu memcpy op kernel is
   // synchronous.
   if (platform::is_gpu_place(place_) || platform::is_custom_place(place_)) {
     VLOG(6) << "Parse DeviceContext for " << op_type
...
@@ -1331,8 +1331,8 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo(
     device_name = "XPU";
   } else {
     PADDLE_THROW(
-        platform::errors::Unavailable("Only CPU/CUDA/NPU/XPU is supportted. "
-                                      "please use CPU/CUDA/NPU/XPU backend."));
+        platform::errors::Unavailable("Only CPU/CUDA/XPU is supportted. "
+                                      "please use CPU/CUDA/XPU backend."));
   }

   VLOG(1) << string::Sprintf(
...
@@ -52,12 +52,12 @@ OpSupportedInfos(const std::string& place,
       {"CPU", &platform::is_cpu_place},
       {"XPU", &platform::is_xpu_place},
   };
-  PADDLE_ENFORCE_NE(is_target_place.count(query_place),
-                    0,
-                    platform::errors::InvalidArgument(
-                        "The argument `place` should be 'GPU', 'CPU', 'XPU', "
-                        "'NPU', but got '%s'.",
-                        place));
+  PADDLE_ENFORCE_NE(
+      is_target_place.count(query_place),
+      0,
+      platform::errors::InvalidArgument(
+          "The argument `place` should be 'GPU', 'CPU', 'XPU', but got '%s'.",
+          place));
   std::unordered_set<std::string> all_ops;
   const auto& op_info = framework::OpInfoMap::Instance().map();
@@ -147,7 +147,7 @@ AmpOperators::AmpOperators()
       OpSupportedInfos("GPU", paddle::framework::proto::VarType::BF16));
   unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(),
                                 unsupported_ops_gpu_bf16.end());
-// NOTE: GPU/NPU/XPU is compiled seperatly.
+// NOTE: GPU/XPU is compiled seperatly.
 #elif defined(PADDLE_WITH_XPU)
   auto unsupported_ops_xpu_fp16 = std::get<2>(
       OpSupportedInfos("XPU", paddle::framework::proto::VarType::FP16));
...
@@ -364,10 +364,6 @@ struct Argument {
                       IpuEnableModelRuntimeExecutor,
                       bool);

-  // npu related
-  DECL_ARGUMENT_FIELD(use_npu, UseNpu, bool);
-  DECL_ARGUMENT_FIELD(npu_device_id, NPUDeviceId, int);
-
   // mixed precision related
   DECL_ARGUMENT_FIELD(model_precision, ModelPrecision, int);
   DECL_ARGUMENT_FIELD(mixed_black_list,
...
@@ -56,8 +56,6 @@ PassStrategy *AnalysisConfig::pass_builder() const {
     pass_builder_.reset(new GpuPassStrategy);
   } else if (use_xpu_) {
     pass_builder_.reset(new XpuPassStrategy);
-  } else if (use_npu_) {
-    pass_builder_.reset(new NpuPassStrategy);
   } else if (use_ipu_) {
     LOG(INFO) << "Create IPU IR passes";
     pass_builder_.reset(new IpuPassStrategy);
@@ -506,8 +504,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(use_opencl_);

   // NPU related.
-  CP_MEMBER(use_npu_);
-  CP_MEMBER(npu_device_id_);
   CP_MEMBER(nnadapter_config_);

   // profile related.
@@ -574,9 +570,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   } else if (use_custom_device_) {
     pass_builder_.reset(new CustomDevicePassStrategy(
         *static_cast<CustomDevicePassStrategy *>(other.pass_builder())));
-  } else if (use_npu_) {
-    pass_builder_.reset(new NpuPassStrategy(
-        *static_cast<NpuPassStrategy *>(other.pass_builder())));
   } else {
     pass_builder_.reset(new CpuPassStrategy(
         *static_cast<CpuPassStrategy *>(other.pass_builder())));
@@ -827,7 +820,6 @@ void AnalysisConfig::Update() {
   // Transfer pass_builder and copy the existing compatible passes.
   if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu())) ||
       ((use_xpu() ^ pass_builder_->use_xpu())) ||
-      ((use_npu() ^ pass_builder_->use_npu())) ||
       ((use_ipu() ^ pass_builder_->use_ipu())) ||
       ((use_custom_device() ^ pass_builder_->use_custom_device()))) {
     if (use_gpu()) {
@@ -841,13 +833,6 @@ void AnalysisConfig::Update() {
           platform::errors::InvalidArgument(
               "Only one choice can be made between CPU and XPU."));
       pass_builder_.reset(new XpuPassStrategy);
-    } else if (use_npu()) {
-      PADDLE_ENFORCE_EQ(
-          use_gpu(),
-          false,
-          platform::errors::InvalidArgument(
-              "Only one choice can be made between GPU and NPU."));
-      pass_builder_.reset(new NpuPassStrategy);
     } else if (use_custom_device()) {
       PADDLE_ENFORCE_EQ(
           use_gpu(),
@@ -875,14 +860,6 @@ void AnalysisConfig::Update() {
               "Only one choice can be made between CPU and XPU."));
       pass_builder_.reset(new XpuPassStrategy(
           *static_cast<XpuPassStrategy *>(pass_builder_.get())));
-    } else if (use_npu()) {
-      PADDLE_ENFORCE_EQ(
-          use_gpu(),
-          false,
-          platform::errors::InvalidArgument(
-              "Only one choice can be made between GPU and NPU."));
-      pass_builder_.reset(new NpuPassStrategy(
-          *static_cast<NpuPassStrategy *>(pass_builder_.get())));
     } else if (use_custom_device()) {
       PADDLE_ENFORCE_EQ(
           use_gpu(),
@@ -1114,9 +1091,6 @@ std::string AnalysisConfig::SerializeInfoCache() {
     ss << op_type;
   }

-  ss << use_npu_;
-  ss << npu_device_id_;
-
   ss << thread_local_stream_;

   ss << use_ipu_;
...
@@ -148,8 +148,8 @@ phi::Backend ConvertBackend(paddle_infer::PlaceType backend) {
       return phi::Backend::CUSTOM;
     default:
       PADDLE_THROW(paddle::platform::errors::InvalidArgument(
-          "Paddle Inference not support backend, we now only support GPU, XPU, "
-          "NPU and CPU."));
+          "Paddle Inference not support backend, we now only support GPU, XPU "
+          "and CPU."));
       return phi::Backend::CPU;
   }
 }
@@ -1432,9 +1432,6 @@ void AnalysisPredictor::PrepareArgument() {
   argument_->SetIpuCustomPatterns(config_.ipu_custom_patterns_);
 #endif

-  argument_->SetUseNpu(config_.use_npu_);
-  argument_->SetNPUDeviceId(config_.npu_device_id());
-
   if (config_.use_mkldnn_) {
     LOG(INFO) << "MKLDNN is enabled";
     argument_->SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
...
@@ -130,7 +130,7 @@ T *Tensor::mutable_data(PlaceType place) {
     }
     default:
       PADDLE_THROW(paddle::platform::errors::Unavailable(
-          "Only CPU / CUDA / XPU / NPU places is supported. The place `%d` is "
+          "Only CPU / CUDA / XPU places is supported. The place `%d` is "
           "not supported.",
           static_cast<int>(place)));
       break;
@@ -261,7 +261,7 @@ void Tensor::CopyFromCpu(const T *data) {
                      dev_ctx->stream());
 #else
     PADDLE_THROW(paddle::platform::errors::InvalidArgument(
-        "The analysis predictor supports CPU, GPU, NPU and XPU now."));
+        "The analysis predictor supports CPU, GPU and XPU now."));
 #endif
   }
 }
@@ -468,7 +468,7 @@ void Tensor::CopyToCpuImpl(T *data,
     dev_ctx->GetStream()->Synchronize();
 #else
     PADDLE_THROW(paddle::platform::errors::InvalidArgument(
-        "The analysis predictor supports CPU, GPU, NPU and XPU now."));
+        "The analysis predictor supports CPU, GPU and XPU now."));
 #endif
   }
 }
...
@@ -414,12 +414,6 @@ struct PD_INFER_DECL AnalysisConfig {
   /// \return bool Whether the XPU is turned on.
   ///
   bool use_xpu() const { return use_xpu_; }
-  ///
-  /// \brief A boolean state telling whether the NPU is turned on.
-  ///
-  /// \return bool Whether the NPU is turned on.
-  ///
-  bool use_npu() const { return use_npu_; }
   ///
   /// \brief A boolean state telling whether the IPU is turned on.
   ///
   /// \return bool Whether the IPU is turned on.
@@ -461,12 +455,6 @@ struct PD_INFER_DECL AnalysisConfig {
   /// \return int The XPU device id.
   ///
   int xpu_device_id() const { return xpu_device_id_; }
-  ///
-  /// \brief Get the NPU device id.
-  ///
-  /// \return int The NPU device id.
-  ///
-  int npu_device_id() const { return npu_device_id_; }
   ///
   /// \brief Get the number of IPU device .
   ///
   /// \return int The number of IPU device.
@@ -1083,10 +1071,6 @@ struct PD_INFER_DECL AnalysisConfig {
   bool use_external_stream_{false};
   void* exec_stream_{nullptr};

-  // NPU related
-  bool use_npu_{false};
-  int npu_device_id_{0};
-
   // CustomDevice related
   bool use_custom_device_{false};
   int custom_device_id_{0};
...
@@ -360,7 +360,6 @@ struct PD_INFER_DECL NativeConfig : public PaddlePredictor::Config {
   /// GPU related fields.
   bool use_xpu{false};
   bool use_gpu{false};
-  bool use_npu{false};
   int device{0};
   float fraction_of_gpu_memory{
       -1.f};  ///< Change to a float in (0,1] if needed.
...
@@ -162,10 +162,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
   /// \return A bool variable implying whether we are in xpu mode.
   bool use_xpu() const { return use_xpu_; }

-  /// \brief Check if we are using npu.
-  /// \return A bool variable implying whether we are in npu mode.
-  bool use_npu() const { return use_npu_; }
-
   /// \brief Check if we are using ipu.
   /// \return A bool variable implying whether we are in ipu mode.
   bool use_ipu() const { return use_ipu_; }
@@ -181,7 +177,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
   /// \cond Protected
   bool use_xpu_{false};
   bool use_gpu_{false};
-  bool use_npu_{false};
   bool use_ipu_{false};
   bool use_mkldnn_{false};
   bool use_custom_device_{false};
@@ -293,21 +288,6 @@ class PD_INFER_DECL XpuPassStrategy final : public PassStrategy {
   XpuPassStrategy();
 };

-/// \class NpuPassStrategy
-/// \brief The NPU passes controller, it is used in AnalysisPredictor with NPU
-/// mode.
-class PD_INFER_DECL NpuPassStrategy final : public PassStrategy {
- public:
-  NpuPassStrategy() : PassStrategy({}) { use_npu_ = true; }
-
-  /// \brief Construct by copying another NpuPassStrategy object.
-  /// \param[in] other The NpuPassStrategy object we want to copy.
-  explicit NpuPassStrategy(const NpuPassStrategy &other)
-      : PassStrategy(other.AllPasses()) {
-    use_npu_ = true;
-  }
-};
-
 /// \class CustomDevicePassStrategy
 /// \brief The CustomDevice passes controller, it is used in AnalysisPredictor
 /// with CustomDevice
...
@@ -176,11 +176,6 @@ PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) {
   return config->use_xpu();
 }

-PD_Bool PD_ConfigUseNpu(__pd_keep PD_Config* pd_config) {
-  CHECK_AND_CONVERT_PD_CONFIG;
-  return config->use_npu();
-}
-
 int32_t PD_ConfigGpuDeviceId(__pd_keep PD_Config* pd_config) {
   CHECK_AND_CONVERT_PD_CONFIG;
   return config->gpu_device_id();
@@ -189,10 +184,6 @@ int32_t PD_ConfigXpuDeviceId(__pd_keep PD_Config* pd_config) {
   CHECK_AND_CONVERT_PD_CONFIG;
   return config->xpu_device_id();
 }
-int32_t PD_ConfigNpuDeviceId(__pd_keep PD_Config* pd_config) {
-  CHECK_AND_CONVERT_PD_CONFIG;
-  return config->npu_device_id();
-}

 void PD_ConfigEnableCustomDevice(__pd_keep PD_Config* pd_config,
                                  char* device_type,
...
@@ -222,14 +222,6 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu(
 PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseXpu(
     __pd_keep PD_Config* pd_config);
 ///
-/// \brief A boolean state telling whether the NPU is turned on.
-///
-/// \param[in] pd_onfig config
-/// \return Whether the NPU is turned on.
-///
-PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseNpu(
-    __pd_keep PD_Config* pd_config);
-///
 /// \brief Get the GPU device id.
 ///
 /// \param[in] pd_onfig config
@@ -246,14 +238,6 @@ PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGpuDeviceId(
 PADDLE_CAPI_EXPORT extern int32_t PD_ConfigXpuDeviceId(
     __pd_keep PD_Config* pd_config);
 ///
-/// \brief Get the NPU device id.
-///
-/// \param[in] pd_onfig config
-/// \return The NPU device id.
-///
-PADDLE_CAPI_EXPORT extern int32_t PD_ConfigNpuDeviceId(
-    __pd_keep PD_Config* pd_config);
-///
 /// \brief Turn on custome device.
 ///
 /// \param[in] pd_config config
...
@@ -230,15 +230,6 @@ func (config *Config) UseXpu() bool {
 	return cvtPDBoolToGo(C.PD_ConfigUseXpu(config.c))
 }

-///
-/// \brief A boolean state telling whether the NPU is turned on.
-///
-/// \return bool Whether the NPU is turned on.
-///
-func (config *Config) UseNpu() bool {
-	return cvtPDBoolToGo(C.PD_ConfigUseNpu(config.c))
-}
-
 ///
 /// \brief Get the GPU device id.
 ///
@@ -257,15 +248,6 @@ func (config *Config) XpuDeviceId() int32 {
 	return int32(C.PD_ConfigXpuDeviceId(config.c))
 }

-///
-/// \brief Get the NPU device id.
-///
-/// \return int32 The NPU device id.
-///
-func (config *Config) NpuDeviceId() int32 {
-	return int32(C.PD_ConfigNpuDeviceId(config.c))
-}
-
 ///
 /// \brief Get the initial size in MB of the GPU memory pool.
 ///
...
@@ -190,13 +190,3 @@ class StatRegistry {
   USE_INT_STAT(STAT_gpu13_mem_size); \
   USE_INT_STAT(STAT_gpu14_mem_size); \
   USE_INT_STAT(STAT_gpu15_mem_size)
-
-#define USE_NPU_MEM_STAT              \
-  USE_INT_STAT(STAT_npu0_mem_size);   \
-  USE_INT_STAT(STAT_npu1_mem_size);   \
-  USE_INT_STAT(STAT_npu2_mem_size);   \
-  USE_INT_STAT(STAT_npu3_mem_size);   \
-  USE_INT_STAT(STAT_npu4_mem_size);   \
-  USE_INT_STAT(STAT_npu5_mem_size);   \
-  USE_INT_STAT(STAT_npu6_mem_size);   \
-  USE_INT_STAT(STAT_npu7_mem_size)
@@ -28,7 +28,6 @@ using Place = phi::Place;
 using CPUPlace = phi::CPUPlace;
 using CUDAPlace = phi::GPUPlace;
 using CUDAPinnedPlace = phi::GPUPinnedPlace;
-using NPUPinnedPlace = phi::NPUPinnedPlace;
 using XPUPlace = phi::XPUPlace;
 using IPUPlace = phi::IPUPlace;
 using CustomPlace = phi::CustomPlace;
@@ -87,11 +86,6 @@ typename Visitor::result_type VisitPlace(const Place &place,
       return typename Visitor::result_type();
 #endif
     }
-    case phi::AllocationType::NPUPINNED: {
-      PADDLE_THROW(platform::errors::Unavailable(
-          "Paddle is not compiled with NPU. Cannot visit npu_pinned"));
-      return typename Visitor::result_type();
-    }
     case phi::AllocationType::IPU: {
 #ifdef PADDLE_WITH_IPU
       platform::IPUPlace p(place.GetDeviceId());
...
@@ -673,7 +673,6 @@ void BindNativeConfig(py::module *m) {
       .def(py::init<>())
       .def_readwrite("use_gpu", &NativeConfig::use_gpu)
       .def_readwrite("use_xpu", &NativeConfig::use_xpu)
-      .def_readwrite("use_npu", &NativeConfig::use_npu)
       .def_readwrite("device", &NativeConfig::device)
       .def_readwrite("fraction_of_gpu_memory",
                      &NativeConfig::fraction_of_gpu_memory)
@@ -805,10 +804,8 @@ void BindAnalysisConfig(py::module *m) {
       .def("enable_ort_optimization", &AnalysisConfig::EnableORTOptimization)
       .def("use_gpu", &AnalysisConfig::use_gpu)
       .def("use_xpu", &AnalysisConfig::use_xpu)
-      .def("use_npu", &AnalysisConfig::use_npu)
       .def("gpu_device_id", &AnalysisConfig::gpu_device_id)
       .def("xpu_device_id", &AnalysisConfig::xpu_device_id)
-      .def("npu_device_id", &AnalysisConfig::npu_device_id)
       .def("memory_pool_init_size_mb",
            &AnalysisConfig::memory_pool_init_size_mb)
       .def("fraction_of_gpu_memory_for_pool",
...
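For reference, the surviving Python bindings above can be exercised like this; a sketch using the methods bound in this hunk plus enable_use_gpu (a long-standing Config method, assumed unchanged by this commit) — use_npu and npu_device_id no longer exist afterwards:

import paddle.inference as paddle_infer

config = paddle_infer.Config()
config.enable_use_gpu(256, 0)  # 256 MB initial GPU memory pool on device 0
print(config.use_gpu())        # True
print(config.gpu_device_id())  # 0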
@@ -629,7 +629,6 @@ void BindPlace(pybind11::module &m) {  // NOLINT
            [](platform::Place &self) { return platform::is_custom_place(self); })
       .def("gpu_device_id", [](platform::Place &self) { return self.device; })
       .def("xpu_device_id", [](platform::Place &self) { return self.device; })
-      .def("npu_device_id", [](platform::Place &self) { return self.device; })
       .def("ipu_device_id", [](platform::Place &self) { return self.device; })
       .def("custom_device_id",
            [](platform::Place &self) { return self.device; })
...
@@ -110,23 +110,6 @@ size_t CUDAPinnedMaxChunkSize() {
   return CUDAPinnedMaxAllocSize() / 256;
 }

-size_t NPUPinnedMaxAllocSize() {
-  // For distributed systems, it requires configuring and limiting
-  // the fraction of memory to use.
-  return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
-}
-
-size_t NPUPinnedMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 64 KB.
-  return 1 << 16;
-}
-
-size_t NPUPinnedMaxChunkSize() {
-  // Allow to allocate the maximum chunk size is roughly 1/256 of NPU_PINNED
-  // memory.
-  return NPUPinnedMaxAllocSize() / 256;
-}
-
 #ifdef PADDLE_WITH_XBYAK
 static Xbyak::util::Cpu cpu;
 bool MayIUse(const cpu_isa_t cpu_isa) {
...
@@ -75,15 +75,6 @@ size_t CUDAPinnedMinChunkSize();
 //! Get the maximum chunk size for buddy allocator.
 size_t CUDAPinnedMaxChunkSize();

-//! Get the maximum allocation size for a machine.
-size_t NPUPinnedMaxAllocSize();
-
-//! Get the minimum chunk size for buddy allocator.
-size_t NPUPinnedMinChunkSize();
-
-//! Get the maximum chunk size for buddy allocator.
-size_t NPUPinnedMaxChunkSize();
-
 typedef enum {
   isa_any,
   sse42,
...
@@ -35,8 +35,6 @@ const char *AllocationTypeStr(AllocationType type) {
       return "gpu_pinned";
     case AllocationType::XPU:
       return "xpu";
-    case AllocationType::NPUPINNED:
-      return "npu_pinned";
     case AllocationType::IPU:
       return "ipu";
     default:
@@ -55,7 +53,6 @@ std::string Place::DebugString() const {
     os << AllocationTypeStr(alloc_type_);
   }
   if (alloc_type_ == AllocationType::GPUPINNED ||
-      alloc_type_ == AllocationType::NPUPINNED ||
       alloc_type_ == AllocationType::CPU) {
     os << ")";
   } else {
...
@@ -32,7 +32,6 @@ enum class AllocationType : int8_t {
   GPUPINNED = 3,
   XPU = 4,
   NPU = 5,
-  NPUPINNED = 6,
   IPU = 7,
   CUSTOM = 9,
 };
@@ -163,15 +162,6 @@ class XPUPlace : public Place {
       : Place(AllocationType::XPU, place.GetDeviceId()) {}
 };

-class NPUPinnedPlace : public Place {
- public:
-  NPUPinnedPlace() : Place(AllocationType::NPUPINNED) {}
-  NPUPinnedPlace(const NPUPinnedPlace&) = default;
-  NPUPinnedPlace(const Place& place UNUSED)  // NOLINT
-      : Place(AllocationType::NPUPINNED) {}
-};
-
 class IPUPlace : public Place {
  public:
   IPUPlace() : Place(AllocationType::IPU, 0) {}
...
@@ -161,12 +161,6 @@ void set_constant_with_place<phi::XPUPlace>(const phi::DeviceContext& context,
 #endif
 }

-template <>
-void set_constant_with_place<phi::NPUPinnedPlace>(
-    const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) {
-  PADDLE_THROW(phi::errors::Unimplemented("NPUPinnedPlace is not supported"));
-}
-
 template <>
 void set_constant_with_place<phi::IPUPlace>(const phi::DeviceContext& context,
                                             phi::DenseTensor* tensor,
...
@@ -15,7 +15,6 @@
 import paddle
 from paddle import _legacy_C_ops
 from paddle.distributed import collective
-from paddle.fluid import core
 from paddle.fluid.data_feeder import check_dtype, check_variable_and_dtype
 from paddle.framework import LayerHelper, _create_tensor, in_dygraph_mode
 from paddle.nn import Layer
@@ -551,11 +550,7 @@ def _parallel_linear(
     )

     # NOTE: npu linear function use matmul_v2 but linear use matmul
-    linear_function = (
-        _linear
-        if core.is_compiled_with_custom_device('npu')
-        else paddle.nn.functional.linear
-    )
+    linear_function = paddle.nn.functional.linear
     linear_out = linear_function(
         x,
         linear.weight,
...
@@ -595,9 +595,6 @@ class ShardingOptimizer(MetaOptimizerBase):
             # amp inf_var & clip global_norm_var
             rings = [self.mp_ring_id, self.pp_ring_id]
-            # FIXME(wangxi): some problem with NPU found_finite, need sync with DP
-            if core.is_compiled_with_custom_device('npu'):
-                rings += [self.dp_ring_id]
             FP16Utils.sync_amp_check_nan_inf(main_block, rings)

             gradientclip_helper = GradientClipHelper(None)
@@ -719,10 +716,7 @@ class ShardingOptimizer(MetaOptimizerBase):
         self._recreate_not_persist_param_as_var()
         self._dump_program_for_debug()

-        # GPU need to wait server ready, GPU and NPU is Layered connection
-        if not core.is_compiled_with_custom_device('npu'):
-            self._wait()
+        self._wait()
         return optimize_ops, params_grads

     def _init_pair_comm(self, pair, ring_id):
...
@@ -1988,14 +1988,9 @@ class Executor:
             for var in program.global_block().vars.values():
                 if var.is_data:
                     data_vars.append(var)
-            if core.is_compiled_with_custom_device('npu'):
-                dataset = paddle.fluid.DatasetFactory().create_dataset(
-                    'InMemoryDataset'
-                )
-            else:
-                dataset = paddle.fluid.DatasetFactory().create_dataset(
-                    'FileInstantDataset'
-                )
+            dataset = paddle.fluid.DatasetFactory().create_dataset(
+                'FileInstantDataset'
+            )
             dataset.set_batch_size(1)
             dataset.set_thread(1)
             dataset.set_filelist(['None'])
@@ -2165,14 +2160,9 @@ class Executor:
             for var in program.global_block().vars.values():
                 if var.is_data:
                     data_vars.append(var)
-            if core.is_compiled_with_custom_device('npu'):
-                dataset = paddle.fluid.DatasetFactory().create_dataset(
-                    'InMemoryDataset'
-                )
-            else:
-                dataset = paddle.fluid.DatasetFactory().create_dataset(
-                    'FileInstantDataset'
-                )
+            dataset = paddle.fluid.DatasetFactory().create_dataset(
+                'FileInstantDataset'
+            )
             dataset.set_batch_size(1)
             dataset.set_thread(1)
             dataset.set_filelist(['None'])
...
@@ -597,21 +597,6 @@ def _current_expected_place():
                 "You are using XPU version Paddle, but your XPU device is not set properly. CPU device will be used by default."
             )
             _global_expected_place_ = core.CPUPlace()
-    elif core.is_compiled_with_custom_device("npu"):
-        # TODO(duanyanhui): Optimize DeviceManager and Return all expected places when device registered in DeviceManager is greater than 1.
-        try:
-            device_count = core.get_custom_device_count("npu")
-        except Exception as e:
-            device_count = 0
-        if device_count > 0:
-            _global_expected_place_ = core.CustomPlace(
-                "npu", _custom_device_ids("npu")[0]
-            )
-        else:
-            warnings.warn(
-                "You are using NPU version Paddle, but your NPU device is not set properly. CPU device will be used by default."
-            )
-            _global_expected_place_ = core.CPUPlace()
     else:
         _global_expected_place_ = core.CPUPlace()
@@ -7454,9 +7439,9 @@ def device_guard(device=None):
         device, index = device.split(':')
         if device == 'cpu':
             raise ValueError("Should not set device id for cpu.")
-    if device not in ['cpu', 'gpu', 'xpu', 'npu', '', None]:
+    if device not in ['cpu', 'gpu', 'xpu', '', None]:
         raise ValueError(
-            "The Attr(device) should be 'cpu' 'npu' or 'gpu', and it can also be empty string or None "
+            "The Attr(device) should be 'cpu' or 'gpu', and it can also be empty string or None "
             "when there is no need to specify device. But received %s" % device
         )
     if index:
...
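With 'npu' dropped from the whitelist above, device_guard accepts only 'cpu', 'gpu', 'xpu', or an empty value. A small static-graph sketch of the remaining usage:

import paddle

paddle.enable_static()
with paddle.static.device_guard("cpu"):
    x = paddle.full([1], 1.0)
with paddle.static.device_guard("gpu:0"):  # 'npu:0' now raises ValueError
    y = x + 1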
@@ -4554,9 +4554,7 @@ class PipelineOptimizer:
     def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0):
         self._device = 'cpu'
-        if core.is_compiled_with_custom_device('npu'):
-            self._device = "npu"
-        elif core.is_compiled_with_cuda():
+        if core.is_compiled_with_cuda():
             self._device = "gpu"
         if in_dygraph_mode():
             raise Exception("In dygraph, don't support PipelineOptimizer.")
@@ -4945,8 +4943,8 @@ class PipelineOptimizer:
             else None
         )
         if device:
-            assert device[0:3] == 'gpu' or device[0:3] == 'npu', (
-                "Now, only gpu and npu devices are "
+            assert device[0:3] == 'gpu', (
+                "Now, only gpu devices are "
                 "supported in pipeline parallemism."
             )
         return device
@@ -5148,8 +5146,8 @@ class PipelineOptimizer:
                 continue
             dev_type = device.split(':')[0]
-            assert dev_type == "gpu" or dev_type == 'npu', (
-                "Now only gpu and npu devices are supported "
+            assert dev_type == "gpu", (
+                "Now only gpu devices are supported "
                 "for pipeline parallelism."
             )
@@ -6388,8 +6386,6 @@ class PipelineOptimizer:
             dev_index = int(dev.split(":")[1])
             if core.is_compiled_with_cuda():
                 place_list.append(core.CUDAPlace(dev_index % 1))
-            elif paddle.is_compiled_with_custom_device('npu'):
-                place_list.append(paddle.CustomPlace('npu', dev_index % 1))

         # Step6: Split startup program
         new_startup_program = self._split_startup_program(
@@ -6412,8 +6408,6 @@ class PipelineOptimizer:
         if core.is_compiled_with_cuda():
             place_id = int(os.getenv("FLAGS_selected_gpus", "0"))
-        elif core.is_compiled_with_custom_device('npu'):
-            place_id = int(os.getenv("FLAGS_selected_npus", "0"))

         # A pass to move the recv op to the beginning of
         # the forward/backward phase
         self._mv_head_recv(program_list[self.local_rank])
...
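After these hunks, pipeline stages can only be pinned to gpu devices. A minimal sketch of stage placement with device_guard in the kind of program PipelineOptimizer consumes (the layer sizes are illustrative only):

import paddle
from paddle import fluid

paddle.enable_static()
with fluid.device_guard("gpu:0"):  # stage 0; 'npu:0' would now fail the assert
    x = paddle.static.data(name="x", shape=[None, 8], dtype="float32")
    h = paddle.static.nn.fc(x, size=8)
with fluid.device_guard("gpu:1"):  # stage 1
    loss = paddle.mean(h)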
@@ -16,7 +16,6 @@ from paddle import _C_ops, _legacy_C_ops, get_flags, in_dynamic_mode
 from paddle.device import (
     get_all_custom_device_type,
     is_compiled_with_cuda,
-    is_compiled_with_custom_device,
     is_compiled_with_rocm,
 )
 from paddle.fluid.framework import _global_flags, in_dygraph_mode
@@ -465,13 +464,6 @@ def conv1d(
         l_type = 'depthwise_conv2d'
         use_cudnn = False

-    # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
-    if is_compiled_with_custom_device('npu'):
-        if num_channels == groups and num_channels == num_filters:
-            l_type = 'depthwise_conv2d'
-        else:
-            l_type = 'conv2d'
-
     squeeze_aixs = -3 if channel_last else -2
     x = unsqueeze(x, axis=[squeeze_aixs])
@@ -755,13 +747,6 @@ def conv2d(
     use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]

-    # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
-    if is_compiled_with_custom_device('npu'):
-        if num_channels == groups and num_channels == num_filters:
-            l_type = 'depthwise_conv2d'
-        else:
-            l_type = 'conv2d'
-
     if (
         is_compiled_with_cuda()
         and get_flags("FLAGS_conv2d_disable_cudnn")[
...
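The deleted conv1d/conv2d branches only special-cased kernel selection for NPU; on the remaining backends the depthwise path is still chosen by the generic in_channels == out_channels == groups condition earlier in these functions. For reference, a call that takes the depthwise_conv2d route:

import paddle
import paddle.nn.functional as F

x = paddle.randn([1, 8, 32, 32])
w = paddle.randn([8, 1, 3, 3])           # one 3x3 filter per input channel
y = F.conv2d(x, w, groups=8, padding=1)  # in_channels == out_channels == groups
print(y.shape)                           # [1, 8, 32, 32]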
@@ -16,7 +16,7 @@ import math
 # TODO: define loss functions of neural network
 import paddle
-from paddle import _C_ops, _legacy_C_ops, fluid, in_dynamic_mode
+from paddle import _C_ops, fluid, in_dynamic_mode
 from paddle.framework import core
 from paddle.static.nn.control_flow import Assert
 from paddle.utils import deprecated
@@ -269,51 +269,15 @@ def fluid_softmax_with_cross_entropy(
     if input_dims - 1 == label_dims:
         label = paddle.unsqueeze(label, axis=axis)
     if in_dygraph_mode():
-        if core.is_compiled_with_custom_device("npu"):
-            if not soft_label:
-                valid_label = (
-                    paddle.cast(label != ignore_index, dtype=label.dtype)
-                    * label
-                )
-                softmax, loss = _legacy_C_ops.softmax_with_cross_entropy(
-                    logits,
-                    valid_label,
-                    'soft_label',
-                    soft_label,
-                    'ignore_index',
-                    ignore_index,
-                    'numeric_stable_mode',
-                    numeric_stable_mode,
-                    'axis',
-                    axis,
-                    'use_softmax',
-                    True,
-                )
-            else:
-                softmax, loss = _legacy_C_ops.softmax_with_cross_entropy(
-                    logits,
-                    label,
-                    'soft_label',
-                    soft_label,
-                    'ignore_index',
-                    ignore_index,
-                    'numeric_stable_mode',
-                    numeric_stable_mode,
-                    'axis',
-                    axis,
-                    'use_softmax',
-                    True,
-                )
-        else:
-            softmax, loss = _C_ops.cross_entropy_with_softmax(
-                logits,
-                label,
-                soft_label,
-                True,
-                numeric_stable_mode,
-                ignore_index,
-                axis,
-            )
+        softmax, loss = _C_ops.cross_entropy_with_softmax(
+            logits,
+            label,
+            soft_label,
+            True,
+            numeric_stable_mode,
+            ignore_index,
+            axis,
+        )
     if not return_softmax:
         return loss
     else:
@@ -2734,41 +2698,9 @@ def cross_entropy(
             valid_label = (
                 paddle.cast(label != ignore_index, dtype=label.dtype) * label
             )
-            if core.is_compiled_with_custom_device("npu"):
-                if not soft_label:
-                    _, out = _legacy_C_ops.softmax_with_cross_entropy(
-                        input,
-                        valid_label,
-                        'soft_label',
-                        soft_label,
-                        'ignore_index',
-                        ignore_index,
-                        'numeric_stable_mode',
-                        True,
-                        'axis',
-                        axis,
-                        'use_softmax',
-                        use_softmax,
-                    )
-                else:
-                    _, out = _legacy_C_ops.softmax_with_cross_entropy(
-                        input,
-                        label,
-                        'soft_label',
-                        soft_label,
-                        'ignore_index',
-                        ignore_index,
-                        'numeric_stable_mode',
-                        True,
-                        'axis',
-                        axis,
-                        'use_softmax',
-                        use_softmax,
-                    )
-            else:
-                _, out = _C_ops.cross_entropy_with_softmax(
-                    input, label, soft_label, use_softmax, True, ignore_index, axis
-                )
+            _, out = _C_ops.cross_entropy_with_softmax(
+                input, label, soft_label, use_softmax, True, ignore_index, axis
+            )

         if weight is not None:
...
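With the NPU branches gone, both dygraph paths above always lower to a single _C_ops.cross_entropy_with_softmax call. The equivalent behavior through the public API, for reference (values illustrative):

import paddle
import paddle.nn.functional as F

logits = paddle.randn([4, 10])
labels = paddle.randint(0, 10, shape=[4], dtype='int64')
# Fused softmax + cross entropy; ignore_index mirrors the valid_label
# masking done in the removed branch.
loss = F.cross_entropy(logits, labels, ignore_index=-100)
print(loss)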
@@ -220,24 +220,7 @@ class OptimizerWithMixedPrecision:
         """
         train_program = loss.block.program
         self._train_program = train_program
-
-        # NOTE(zhiqiu): _float_status is only used for NPU.
-        if core.is_compiled_with_custom_device('npu'):
-            float_status = paddle.static.data(
-                name="float_status", shape=[8], dtype='float32'
-            )
-            self._train_program.global_block().append_op(
-                type="alloc_float_status",
-                outputs={"FloatStatus": float_status},
-            )
-            self._train_program.global_block().append_op(
-                type="clear_float_status",
-                inputs={"FloatStatus": float_status},
-                outputs={"FloatStatusOut": float_status},
-            )
-            self._float_status = float_status
-        else:
-            self._float_status = None
+        self._float_status = None

         with program_guard(self._train_program, startup_program):
             self._init_amp_var()
@@ -476,27 +459,17 @@ class OptimizerWithMixedPrecision:
             if self._is_distributed:
                 # if distributed, split check_finite_and_unscale to overlap
                 # unscale with communication
-                if core.is_compiled_with_custom_device('npu'):
-                    with self._train_program._optimized_guard(grads):
-                        _, found_inf = check_finite_and_unscale(
-                            grads,
-                            self._loss_scaling,
-                            name="find_infinite_scale",
-                            float_status=self._float_status,
-                        )
-                        found_infs.append(found_inf)
-                else:
-                    for p, g in params_grads:
-                        with self._train_program._optimized_guard([p, g]):
-                            _, found_inf = check_finite_and_unscale(
-                                [
-                                    g,
-                                ],
-                                self._loss_scaling,
-                                name="find_infinite_scale",
-                                float_status=self._float_status,
-                            )
-                            found_infs.append(found_inf)
+                for p, g in params_grads:
+                    with self._train_program._optimized_guard([p, g]):
+                        _, found_inf = check_finite_and_unscale(
+                            [
+                                g,
+                            ],
+                            self._loss_scaling,
+                            name="find_infinite_scale",
+                            float_status=self._float_status,
+                        )
+                        found_infs.append(found_inf)
             elif self._use_pure_fp16:
                 if fp32_grads:
                     with self._train_program._optimized_guard(fp32_grads):
...
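After this simplification _float_status is always None and the distributed branch unconditionally unscales per (param, grad) pair. For context, this class is normally reached through the static-graph AMP decorator; a sketch (settings illustrative):

import paddle

paddle.enable_static()
sgd = paddle.optimizer.SGD(learning_rate=0.01)
# decorate() wraps the optimizer in OptimizerWithMixedPrecision; its
# minimize() inserts the check_finite_and_unscale ops edited above.
mp_sgd = paddle.static.amp.decorate(sgd, use_dynamic_loss_scaling=True)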
@@ -97,8 +97,6 @@ def _get_sys_unsupported_list(dtype):
     device = None
     if core.is_compiled_with_xpu():
         device = 'XPU'
-    elif core.is_compiled_with_custom_device('npu'):
-        device = 'NPU'
     else:
         device = 'GPU'
     _, _, sys_unsupported_list = core.op_supported_infos(device, var_type)
...
@@ -948,13 +948,6 @@ def conv2d(
     ):
         l_type = 'depthwise_conv2d'

-    # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
-    if core.is_compiled_with_custom_device('npu'):
-        if num_channels == groups and num_channels == num_filters:
-            l_type = 'depthwise_conv2d'
-        else:
-            l_type = 'conv2d'
-
     helper = LayerHelper(l_type, **locals())
     dtype = helper.input_dtype()
...
@@ -212,12 +212,6 @@ class Timeline:
                 self._chrome_trace.emit_pid(
                     "memory usage on %s:cudapinnedplace:%d" % (k, 0), pid
                 )
-            if (k, 0, "NPU") not in self._mem_devices:
-                pid = self._allocate_pid()
-                self._mem_devices[(k, 0, "NPU")] = pid
-                self._chrome_trace.emit_pid(
-                    "memory usage on %s:npu:%d" % (k, 0), pid
-                )

     def _allocate_events(self):
         for k, profile_pb in self._profile_dict.items():
...