Unverified · Commit 0d45ac73 · Authored by: 张春乔 · Committed by: GitHub

Retire Ascend (昇腾) and Cambricon (寒武纪) related code: NPU-related code removal, part 2 (#53568)

Parent 00ded2ea
......@@ -111,8 +111,7 @@ bool MessageBus::Send(int64_t dst_rank,
#else
PADDLE_THROW(platform::errors::Unavailable(
"Fleet executor does not support sending message between different "
"ranks when Paddle is compiled with npu or "
"isn't compiled with distributed for now."));
"ranks when Paddle isn't compiled with distributed for now."));
#endif
return true;
}
......@@ -202,10 +201,9 @@ void MessageBus::ListenPort() {
}
LOG(INFO) << "Message bus's listen port thread starts successful.";
#else
LOG(WARNING)
<< "Fleet executor's ListenPort() is a fake function when Paddle is "
"compiled with npu or Paddle isn't compiled "
"with distributed for now.";
LOG(WARNING) << "Fleet executor's ListenPort() is a fake function when "
"Paddle isn't compiled "
"with distributed for now.";
#endif
}
......
......@@ -89,11 +89,6 @@ struct DLDeviceVisitor
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
}
inline ::DLDevice operator()(const platform::NPUPinnedPlace &place) const {
PADDLE_THROW(platform::errors::Unimplemented(
"platform::NPUPinnedPlace is not supported"));
}
inline ::DLDevice operator()(const platform::CustomPlace &place) const {
PADDLE_THROW(platform::errors::Unimplemented(
"platform::CustomPlace is not supported"));
......
......@@ -50,10 +50,6 @@ static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) {
execution_strategy.num_threads_ = 1;
break;
}
case platform::DeviceType::NPU: {
execution_strategy.num_threads_ = 1;
break;
}
case platform::DeviceType::CUSTOM_DEVICE: {
execution_strategy.num_threads_ = 1;
break;
......
......@@ -196,7 +196,7 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
? OpFuncType::kGpuSync
: OpFuncType::kGpuAsync;
} else {
// Memcpy in npu and custom devices is asynchronous
// Memcpy in custom devices is asynchronous
new_op_func_node.type_ = OpFuncType::kGpuAsync;
}
......@@ -225,7 +225,7 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
}
}
// NOTE(winter-wang): in npu and custom device, D2H kernel is asynchronous.
// NOTE(winter-wang): in custom device, D2H kernel is asynchronous.
// need to explicit synchronization.
if ((platform::is_custom_place(place)) && op_type == kMemcpyD2H) {
dev_ctx->Wait();
......
......@@ -150,7 +150,7 @@ DeviceContext* StreamAnalyzer::ParseDeviceContext(
DeviceContext* dev_ctx = nullptr;
// only gpu/npu need update. xpu not need, because xpu memcpy op kernel is
// only gpu needs update. xpu not need, because xpu memcpy op kernel is
// synchronous.
if (platform::is_gpu_place(place_) || platform::is_custom_place(place_)) {
VLOG(6) << "Parse DeviceContext for " << op_type
......
......@@ -1331,8 +1331,8 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo(
device_name = "XPU";
} else {
PADDLE_THROW(
platform::errors::Unavailable("Only CPU/CUDA/NPU/XPU is supportted. "
"please use CPU/CUDA/NPU/XPU backend."));
platform::errors::Unavailable("Only CPU/CUDA/XPU is supportted. "
"please use CPU/CUDA/XPU backend."));
}
VLOG(1) << string::Sprintf(
......
......@@ -52,12 +52,12 @@ OpSupportedInfos(const std::string& place,
{"CPU", &platform::is_cpu_place},
{"XPU", &platform::is_xpu_place},
};
PADDLE_ENFORCE_NE(is_target_place.count(query_place),
0,
platform::errors::InvalidArgument(
"The argument `place` should be 'GPU', 'CPU', 'XPU', "
"'NPU', but got '%s'.",
place));
PADDLE_ENFORCE_NE(
is_target_place.count(query_place),
0,
platform::errors::InvalidArgument(
"The argument `place` should be 'GPU', 'CPU', 'XPU', but got '%s'.",
place));
std::unordered_set<std::string> all_ops;
const auto& op_info = framework::OpInfoMap::Instance().map();
......@@ -147,7 +147,7 @@ AmpOperators::AmpOperators()
OpSupportedInfos("GPU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(),
unsupported_ops_gpu_bf16.end());
// NOTE: GPU/NPU/XPU is compiled seperatly.
// NOTE: GPU/XPU is compiled seperatly.
#elif defined(PADDLE_WITH_XPU)
auto unsupported_ops_xpu_fp16 = std::get<2>(
OpSupportedInfos("XPU", paddle::framework::proto::VarType::FP16));
......
......@@ -364,10 +364,6 @@ struct Argument {
IpuEnableModelRuntimeExecutor,
bool);
// npu related
DECL_ARGUMENT_FIELD(use_npu, UseNpu, bool);
DECL_ARGUMENT_FIELD(npu_device_id, NPUDeviceId, int);
// mixed precision related
DECL_ARGUMENT_FIELD(model_precision, ModelPrecision, int);
DECL_ARGUMENT_FIELD(mixed_black_list,
......
......@@ -56,8 +56,6 @@ PassStrategy *AnalysisConfig::pass_builder() const {
pass_builder_.reset(new GpuPassStrategy);
} else if (use_xpu_) {
pass_builder_.reset(new XpuPassStrategy);
} else if (use_npu_) {
pass_builder_.reset(new NpuPassStrategy);
} else if (use_ipu_) {
LOG(INFO) << "Create IPU IR passes";
pass_builder_.reset(new IpuPassStrategy);
......@@ -506,8 +504,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(use_opencl_);
// NPU related.
CP_MEMBER(use_npu_);
CP_MEMBER(npu_device_id_);
CP_MEMBER(nnadapter_config_);
// profile related.
......@@ -574,9 +570,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
} else if (use_custom_device_) {
pass_builder_.reset(new CustomDevicePassStrategy(
*static_cast<CustomDevicePassStrategy *>(other.pass_builder())));
} else if (use_npu_) {
pass_builder_.reset(new NpuPassStrategy(
*static_cast<NpuPassStrategy *>(other.pass_builder())));
} else {
pass_builder_.reset(new CpuPassStrategy(
*static_cast<CpuPassStrategy *>(other.pass_builder())));
......@@ -827,7 +820,6 @@ void AnalysisConfig::Update() {
// Transfer pass_builder and copy the existing compatible passes.
if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu())) ||
((use_xpu() ^ pass_builder_->use_xpu())) ||
((use_npu() ^ pass_builder_->use_npu())) ||
((use_ipu() ^ pass_builder_->use_ipu())) ||
((use_custom_device() ^ pass_builder_->use_custom_device()))) {
if (use_gpu()) {
......@@ -841,13 +833,6 @@ void AnalysisConfig::Update() {
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
pass_builder_.reset(new XpuPassStrategy);
} else if (use_npu()) {
PADDLE_ENFORCE_EQ(
use_gpu(),
false,
platform::errors::InvalidArgument(
"Only one choice can be made between GPU and NPU."));
pass_builder_.reset(new NpuPassStrategy);
} else if (use_custom_device()) {
PADDLE_ENFORCE_EQ(
use_gpu(),
......@@ -875,14 +860,6 @@ void AnalysisConfig::Update() {
"Only one choice can be made between CPU and XPU."));
pass_builder_.reset(new XpuPassStrategy(
*static_cast<XpuPassStrategy *>(pass_builder_.get())));
} else if (use_npu()) {
PADDLE_ENFORCE_EQ(
use_gpu(),
false,
platform::errors::InvalidArgument(
"Only one choice can be made between GPU and NPU."));
pass_builder_.reset(new NpuPassStrategy(
*static_cast<NpuPassStrategy *>(pass_builder_.get())));
} else if (use_custom_device()) {
PADDLE_ENFORCE_EQ(
use_gpu(),
......@@ -1114,9 +1091,6 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << op_type;
}
ss << use_npu_;
ss << npu_device_id_;
ss << thread_local_stream_;
ss << use_ipu_;
......
......@@ -148,8 +148,8 @@ phi::Backend ConvertBackend(paddle_infer::PlaceType backend) {
return phi::Backend::CUSTOM;
default:
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Paddle Inference not support backend, we now only support GPU, XPU, "
"NPU and CPU."));
"Paddle Inference not support backend, we now only support GPU, XPU "
"and CPU."));
return phi::Backend::CPU;
}
}
......@@ -1432,9 +1432,6 @@ void AnalysisPredictor::PrepareArgument() {
argument_->SetIpuCustomPatterns(config_.ipu_custom_patterns_);
#endif
argument_->SetUseNpu(config_.use_npu_);
argument_->SetNPUDeviceId(config_.npu_device_id());
if (config_.use_mkldnn_) {
LOG(INFO) << "MKLDNN is enabled";
argument_->SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
......
......@@ -130,7 +130,7 @@ T *Tensor::mutable_data(PlaceType place) {
}
default:
PADDLE_THROW(paddle::platform::errors::Unavailable(
"Only CPU / CUDA / XPU / NPU places is supported. The place `%d` is "
"Only CPU / CUDA / XPU places is supported. The place `%d` is "
"not supported.",
static_cast<int>(place)));
break;
......@@ -261,7 +261,7 @@ void Tensor::CopyFromCpu(const T *data) {
dev_ctx->stream());
#else
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"The analysis predictor supports CPU, GPU, NPU and XPU now."));
"The analysis predictor supports CPU, GPU and XPU now."));
#endif
}
}
......@@ -468,7 +468,7 @@ void Tensor::CopyToCpuImpl(T *data,
dev_ctx->GetStream()->Synchronize();
#else
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"The analysis predictor supports CPU, GPU, NPU and XPU now."));
"The analysis predictor supports CPU, GPU and XPU now."));
#endif
}
}
......
......@@ -414,12 +414,6 @@ struct PD_INFER_DECL AnalysisConfig {
/// \return bool Whether the XPU is turned on.
///
bool use_xpu() const { return use_xpu_; }
///
/// \brief A boolean state telling whether the NPU is turned on.
///
/// \return bool Whether the NPU is turned on.
///
bool use_npu() const { return use_npu_; }
/// \brief A boolean state telling whether the IPU is turned on.
///
/// \return bool Whether the IPU is turned on.
......@@ -461,12 +455,6 @@ struct PD_INFER_DECL AnalysisConfig {
/// \return int The XPU device id.
///
int xpu_device_id() const { return xpu_device_id_; }
///
/// \brief Get the NPU device id.
///
/// \return int The NPU device id.
///
int npu_device_id() const { return npu_device_id_; }
/// \brief Get the number of IPU device .
///
/// \return int The number of IPU device.
......@@ -1083,10 +1071,6 @@ struct PD_INFER_DECL AnalysisConfig {
bool use_external_stream_{false};
void* exec_stream_{nullptr};
// NPU related
bool use_npu_{false};
int npu_device_id_{0};
// CustomDevice related
bool use_custom_device_{false};
int custom_device_id_{0};
......
......@@ -360,7 +360,6 @@ struct PD_INFER_DECL NativeConfig : public PaddlePredictor::Config {
/// GPU related fields.
bool use_xpu{false};
bool use_gpu{false};
bool use_npu{false};
int device{0};
float fraction_of_gpu_memory{
-1.f}; ///< Change to a float in (0,1] if needed.
......
......@@ -162,10 +162,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \return A bool variable implying whether we are in xpu mode.
bool use_xpu() const { return use_xpu_; }
/// \brief Check if we are using npu.
/// \return A bool variable implying whether we are in npu mode.
bool use_npu() const { return use_npu_; }
/// \brief Check if we are using ipu.
/// \return A bool variable implying whether we are in ipu mode.
bool use_ipu() const { return use_ipu_; }
......@@ -181,7 +177,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \cond Protected
bool use_xpu_{false};
bool use_gpu_{false};
bool use_npu_{false};
bool use_ipu_{false};
bool use_mkldnn_{false};
bool use_custom_device_{false};
......@@ -293,21 +288,6 @@ class PD_INFER_DECL XpuPassStrategy final : public PassStrategy {
XpuPassStrategy();
};
/// \class NpuPassStrategy
/// \brief The NPU passes controller, it is used in AnalysisPredictor with NPU
/// mode.
class PD_INFER_DECL NpuPassStrategy final : public PassStrategy {
public:
NpuPassStrategy() : PassStrategy({}) { use_npu_ = true; }
/// \brief Construct by copying another NpuPassStrategy object.
/// \param[in] other The NpuPassStrategy object we want to copy.
explicit NpuPassStrategy(const NpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {
use_npu_ = true;
}
};
/// \class CustomDevicePassStrategy
/// \brief The CustomDevice passes controller, it is used in AnalysisPredictor
/// with CustomDevice
......
......@@ -176,11 +176,6 @@ PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) {
return config->use_xpu();
}
PD_Bool PD_ConfigUseNpu(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
return config->use_npu();
}
int32_t PD_ConfigGpuDeviceId(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
return config->gpu_device_id();
......@@ -189,10 +184,6 @@ int32_t PD_ConfigXpuDeviceId(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
return config->xpu_device_id();
}
int32_t PD_ConfigNpuDeviceId(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
return config->npu_device_id();
}
void PD_ConfigEnableCustomDevice(__pd_keep PD_Config* pd_config,
char* device_type,
......
......@@ -222,14 +222,6 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu(
PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseXpu(
__pd_keep PD_Config* pd_config);
///
/// \brief A boolean state telling whether the NPU is turned on.
///
/// \param[in] pd_onfig config
/// \return Whether the NPU is turned on.
///
PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseNpu(
__pd_keep PD_Config* pd_config);
///
/// \brief Get the GPU device id.
///
/// \param[in] pd_onfig config
......@@ -246,14 +238,6 @@ PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGpuDeviceId(
PADDLE_CAPI_EXPORT extern int32_t PD_ConfigXpuDeviceId(
__pd_keep PD_Config* pd_config);
///
/// \brief Get the NPU device id.
///
/// \param[in] pd_onfig config
/// \return The NPU device id.
///
PADDLE_CAPI_EXPORT extern int32_t PD_ConfigNpuDeviceId(
__pd_keep PD_Config* pd_config);
///
/// \brief Turn on custome device.
///
/// \param[in] pd_config config
......
......@@ -230,15 +230,6 @@ func (config *Config) UseXpu() bool {
return cvtPDBoolToGo(C.PD_ConfigUseXpu(config.c))
}
///
/// \brief A boolean state telling whether the NPU is turned on.
///
/// \return bool Whether the NPU is turned on.
///
func (config *Config) UseNpu() bool {
return cvtPDBoolToGo(C.PD_ConfigUseNpu(config.c))
}
///
/// \brief Get the GPU device id.
///
......@@ -257,15 +248,6 @@ func (config *Config) XpuDeviceId() int32 {
return int32(C.PD_ConfigXpuDeviceId(config.c))
}
///
/// \brief Get the NPU device id.
///
/// \return int32 The NPU device id.
///
func (config *Config) NpuDeviceId() int32 {
return int32(C.PD_ConfigNpuDeviceId(config.c))
}
///
/// \brief Get the initial size in MB of the GPU memory pool.
///
......
......@@ -190,13 +190,3 @@ class StatRegistry {
USE_INT_STAT(STAT_gpu13_mem_size); \
USE_INT_STAT(STAT_gpu14_mem_size); \
USE_INT_STAT(STAT_gpu15_mem_size)
#define USE_NPU_MEM_STAT \
USE_INT_STAT(STAT_npu0_mem_size); \
USE_INT_STAT(STAT_npu1_mem_size); \
USE_INT_STAT(STAT_npu2_mem_size); \
USE_INT_STAT(STAT_npu3_mem_size); \
USE_INT_STAT(STAT_npu4_mem_size); \
USE_INT_STAT(STAT_npu5_mem_size); \
USE_INT_STAT(STAT_npu6_mem_size); \
USE_INT_STAT(STAT_npu7_mem_size)
......@@ -28,7 +28,6 @@ using Place = phi::Place;
using CPUPlace = phi::CPUPlace;
using CUDAPlace = phi::GPUPlace;
using CUDAPinnedPlace = phi::GPUPinnedPlace;
using NPUPinnedPlace = phi::NPUPinnedPlace;
using XPUPlace = phi::XPUPlace;
using IPUPlace = phi::IPUPlace;
using CustomPlace = phi::CustomPlace;
......@@ -87,11 +86,6 @@ typename Visitor::result_type VisitPlace(const Place &place,
return typename Visitor::result_type();
#endif
}
case phi::AllocationType::NPUPINNED: {
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with NPU. Cannot visit npu_pinned"));
return typename Visitor::result_type();
}
case phi::AllocationType::IPU: {
#ifdef PADDLE_WITH_IPU
platform::IPUPlace p(place.GetDeviceId());
......
......@@ -673,7 +673,6 @@ void BindNativeConfig(py::module *m) {
.def(py::init<>())
.def_readwrite("use_gpu", &NativeConfig::use_gpu)
.def_readwrite("use_xpu", &NativeConfig::use_xpu)
.def_readwrite("use_npu", &NativeConfig::use_npu)
.def_readwrite("device", &NativeConfig::device)
.def_readwrite("fraction_of_gpu_memory",
&NativeConfig::fraction_of_gpu_memory)
......@@ -805,10 +804,8 @@ void BindAnalysisConfig(py::module *m) {
.def("enable_ort_optimization", &AnalysisConfig::EnableORTOptimization)
.def("use_gpu", &AnalysisConfig::use_gpu)
.def("use_xpu", &AnalysisConfig::use_xpu)
.def("use_npu", &AnalysisConfig::use_npu)
.def("gpu_device_id", &AnalysisConfig::gpu_device_id)
.def("xpu_device_id", &AnalysisConfig::xpu_device_id)
.def("npu_device_id", &AnalysisConfig::npu_device_id)
.def("memory_pool_init_size_mb",
&AnalysisConfig::memory_pool_init_size_mb)
.def("fraction_of_gpu_memory_for_pool",
......
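
With the `use_npu`/`npu_device_id` bindings above removed, the Python-facing inference `Config` is driven only by the CPU/GPU/XPU/IPU/custom-device options. A minimal, hedged usage sketch (the model directory and pool size are placeholders, and a GPU build of Paddle is assumed; this is not code from the commit):

```python
# Hypothetical usage sketch; "./my_model_dir" and the pool size are placeholders.
from paddle.inference import Config, create_predictor

config = Config("./my_model_dir")   # assumed example model directory
config.enable_use_gpu(256, 0)       # GPU memory pool size in MB, device id 0
# config.enable_xpu() would target XPU instead; config.use_npu() and
# config.npu_device_id() no longer exist after this change.
predictor = create_predictor(config)
print(config.use_gpu(), config.gpu_device_id())
```
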
......@@ -629,7 +629,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
[](platform::Place &self) { return platform::is_custom_place(self); })
.def("gpu_device_id", [](platform::Place &self) { return self.device; })
.def("xpu_device_id", [](platform::Place &self) { return self.device; })
.def("npu_device_id", [](platform::Place &self) { return self.device; })
.def("ipu_device_id", [](platform::Place &self) { return self.device; })
.def("custom_device_id",
[](platform::Place &self) { return self.device; })
......
......@@ -110,23 +110,6 @@ size_t CUDAPinnedMaxChunkSize() {
return CUDAPinnedMaxAllocSize() / 256;
}
size_t NPUPinnedMaxAllocSize() {
// For distributed systems, it requires configuring and limiting
// the fraction of memory to use.
return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
}
size_t NPUPinnedMinChunkSize() {
// Allow to allocate the minimum chunk size is 64 KB.
return 1 << 16;
}
size_t NPUPinnedMaxChunkSize() {
// Allow to allocate the maximum chunk size is roughly 1/256 of NPU_PINNED
// memory.
return NPUPinnedMaxAllocSize() / 256;
}
#ifdef PADDLE_WITH_XBYAK
static Xbyak::util::Cpu cpu;
bool MayIUse(const cpu_isa_t cpu_isa) {
......
......@@ -75,15 +75,6 @@ size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();
//! Get the maximum allocation size for a machine.
size_t NPUPinnedMaxAllocSize();
//! Get the minimum chunk size for buddy allocator.
size_t NPUPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t NPUPinnedMaxChunkSize();
typedef enum {
isa_any,
sse42,
......
......@@ -35,8 +35,6 @@ const char *AllocationTypeStr(AllocationType type) {
return "gpu_pinned";
case AllocationType::XPU:
return "xpu";
case AllocationType::NPUPINNED:
return "npu_pinned";
case AllocationType::IPU:
return "ipu";
default:
......@@ -55,7 +53,6 @@ std::string Place::DebugString() const {
os << AllocationTypeStr(alloc_type_);
}
if (alloc_type_ == AllocationType::GPUPINNED ||
alloc_type_ == AllocationType::NPUPINNED ||
alloc_type_ == AllocationType::CPU) {
os << ")";
} else {
......
......@@ -32,7 +32,6 @@ enum class AllocationType : int8_t {
GPUPINNED = 3,
XPU = 4,
NPU = 5,
NPUPINNED = 6,
IPU = 7,
CUSTOM = 9,
};
......@@ -163,15 +162,6 @@ class XPUPlace : public Place {
: Place(AllocationType::XPU, place.GetDeviceId()) {}
};
class NPUPinnedPlace : public Place {
public:
NPUPinnedPlace() : Place(AllocationType::NPUPINNED) {}
NPUPinnedPlace(const NPUPinnedPlace&) = default;
NPUPinnedPlace(const Place& place UNUSED) // NOLINT
: Place(AllocationType::NPUPINNED) {}
};
class IPUPlace : public Place {
public:
IPUPlace() : Place(AllocationType::IPU, 0) {}
......
......@@ -161,12 +161,6 @@ void set_constant_with_place<phi::XPUPlace>(const phi::DeviceContext& context,
#endif
}
template <>
void set_constant_with_place<phi::NPUPinnedPlace>(
const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) {
PADDLE_THROW(phi::errors::Unimplemented("NPUPinnedPlace is not supported"));
}
template <>
void set_constant_with_place<phi::IPUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
......
......@@ -15,7 +15,6 @@
import paddle
from paddle import _legacy_C_ops
from paddle.distributed import collective
from paddle.fluid import core
from paddle.fluid.data_feeder import check_dtype, check_variable_and_dtype
from paddle.framework import LayerHelper, _create_tensor, in_dygraph_mode
from paddle.nn import Layer
......@@ -551,11 +550,7 @@ def _parallel_linear(
)
# NOTE: npu linear function use matmul_v2 but linear use matmul
linear_function = (
_linear
if core.is_compiled_with_custom_device('npu')
else paddle.nn.functional.linear
)
linear_function = paddle.nn.functional.linear
linear_out = linear_function(
x,
linear.weight,
......
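
With the NPU special case gone, `_parallel_linear` always calls `paddle.nn.functional.linear`. A small self-contained sketch of that call, with arbitrarily chosen shapes for illustration:

```python
import paddle
import paddle.nn.functional as F

x = paddle.randn([2, 8])          # batch of 2, 8 input features
weight = paddle.randn([8, 4])     # [in_features, out_features]
bias = paddle.zeros([4])
y = F.linear(x, weight, bias)     # the op the parallel linear path now always uses
print(y.shape)                    # [2, 4]
```
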
......@@ -595,9 +595,6 @@ class ShardingOptimizer(MetaOptimizerBase):
# amp inf_var & clip global_norm_var
rings = [self.mp_ring_id, self.pp_ring_id]
# FIXME(wangxi): some problem with NPU found_finite, need sync with DP
if core.is_compiled_with_custom_device('npu'):
rings += [self.dp_ring_id]
FP16Utils.sync_amp_check_nan_inf(main_block, rings)
gradientclip_helper = GradientClipHelper(None)
......@@ -719,10 +716,7 @@ class ShardingOptimizer(MetaOptimizerBase):
self._recreate_not_persist_param_as_var()
self._dump_program_for_debug()
# GPU need to wait server ready, GPU and NPU is Layered connection
if not core.is_compiled_with_custom_device('npu'):
self._wait()
self._wait()
return optimize_ops, params_grads
def _init_pair_comm(self, pair, ring_id):
......
......@@ -1988,14 +1988,9 @@ class Executor:
for var in program.global_block().vars.values():
if var.is_data:
data_vars.append(var)
if core.is_compiled_with_custom_device('npu'):
dataset = paddle.fluid.DatasetFactory().create_dataset(
'InMemoryDataset'
)
else:
dataset = paddle.fluid.DatasetFactory().create_dataset(
'FileInstantDataset'
)
dataset = paddle.fluid.DatasetFactory().create_dataset(
'FileInstantDataset'
)
dataset.set_batch_size(1)
dataset.set_thread(1)
dataset.set_filelist(['None'])
......@@ -2165,14 +2160,9 @@ class Executor:
for var in program.global_block().vars.values():
if var.is_data:
data_vars.append(var)
if core.is_compiled_with_custom_device('npu'):
dataset = paddle.fluid.DatasetFactory().create_dataset(
'InMemoryDataset'
)
else:
dataset = paddle.fluid.DatasetFactory().create_dataset(
'FileInstantDataset'
)
dataset = paddle.fluid.DatasetFactory().create_dataset(
'FileInstantDataset'
)
dataset.set_batch_size(1)
dataset.set_thread(1)
dataset.set_filelist(['None'])
......
......@@ -597,21 +597,6 @@ def _current_expected_place():
"You are using XPU version Paddle, but your XPU device is not set properly. CPU device will be used by default."
)
_global_expected_place_ = core.CPUPlace()
elif core.is_compiled_with_custom_device("npu"):
# TODO(duanyanhui): Optimize DeviceManager and Return all expected places when device registered in DeviceManager is greater than 1.
try:
device_count = core.get_custom_device_count("npu")
except Exception as e:
device_count = 0
if device_count > 0:
_global_expected_place_ = core.CustomPlace(
"npu", _custom_device_ids("npu")[0]
)
else:
warnings.warn(
"You are using NPU version Paddle, but your NPU device is not set properly. CPU device will be used by default."
)
_global_expected_place_ = core.CPUPlace()
else:
_global_expected_place_ = core.CPUPlace()
......@@ -7454,9 +7439,9 @@ def device_guard(device=None):
device, index = device.split(':')
if device == 'cpu':
raise ValueError("Should not set device id for cpu.")
if device not in ['cpu', 'gpu', 'xpu', 'npu', '', None]:
if device not in ['cpu', 'gpu', 'xpu', '', None]:
raise ValueError(
"The Attr(device) should be 'cpu' 'npu' or 'gpu', and it can also be empty string or None "
"The Attr(device) should be 'cpu' or 'gpu', and it can also be empty string or None "
"when there is no need to specify device. But received %s" % device
)
if index:
......
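
After this change, `paddle.static.device_guard` only accepts 'cpu', 'gpu', 'xpu' (optionally with an index), an empty string, or None. A short usage sketch under that assumption:

```python
import paddle

paddle.enable_static()
main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    data = paddle.full(shape=[2, 2], fill_value=1.0)
    with paddle.static.device_guard("cpu"):      # pin this op to CPU
        sum_cpu = paddle.sum(data)
    with paddle.static.device_guard("gpu:0"):    # pin this op to GPU 0
        sum_gpu = paddle.sum(data)
    # paddle.static.device_guard("npu:0") now raises ValueError.
```
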
......@@ -4554,9 +4554,7 @@ class PipelineOptimizer:
def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0):
self._device = 'cpu'
if core.is_compiled_with_custom_device('npu'):
self._device = "npu"
elif core.is_compiled_with_cuda():
if core.is_compiled_with_cuda():
self._device = "gpu"
if in_dygraph_mode():
raise Exception("In dygraph, don't support PipelineOptimizer.")
......@@ -4945,8 +4943,8 @@ class PipelineOptimizer:
else None
)
if device:
assert device[0:3] == 'gpu' or device[0:3] == 'npu', (
"Now, only gpu and npu devices are "
assert device[0:3] == 'gpu', (
"Now, only gpu devices are "
"supported in pipeline parallemism."
)
return device
......@@ -5148,8 +5146,8 @@ class PipelineOptimizer:
continue
dev_type = device.split(':')[0]
assert dev_type == "gpu" or dev_type == 'npu', (
"Now only gpu and npu devices are supported "
assert dev_type == "gpu", (
"Now only gpu devices are supported "
"for pipeline parallelism."
)
......@@ -6388,8 +6386,6 @@ class PipelineOptimizer:
dev_index = int(dev.split(":")[1])
if core.is_compiled_with_cuda():
place_list.append(core.CUDAPlace(dev_index % 1))
elif paddle.is_compiled_with_custom_device('npu'):
place_list.append(paddle.CustomPlace('npu', dev_index % 1))
# Step6: Split startup program
new_startup_program = self._split_startup_program(
......@@ -6412,8 +6408,6 @@ class PipelineOptimizer:
if core.is_compiled_with_cuda():
place_id = int(os.getenv("FLAGS_selected_gpus", "0"))
elif core.is_compiled_with_custom_device('npu'):
place_id = int(os.getenv("FLAGS_selected_npus", "0"))
# A pass to move the recv op to the beginning of
# the forward/backward phase
self._mv_head_recv(program_list[self.local_rank])
......
......@@ -16,7 +16,6 @@ from paddle import _C_ops, _legacy_C_ops, get_flags, in_dynamic_mode
from paddle.device import (
get_all_custom_device_type,
is_compiled_with_cuda,
is_compiled_with_custom_device,
is_compiled_with_rocm,
)
from paddle.fluid.framework import _global_flags, in_dygraph_mode
......@@ -465,13 +464,6 @@ def conv1d(
l_type = 'depthwise_conv2d'
use_cudnn = False
# NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
if is_compiled_with_custom_device('npu'):
if num_channels == groups and num_channels == num_filters:
l_type = 'depthwise_conv2d'
else:
l_type = 'conv2d'
squeeze_aixs = -3 if channel_last else -2
x = unsqueeze(x, axis=[squeeze_aixs])
......@@ -755,13 +747,6 @@ def conv2d(
use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]
# NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
if is_compiled_with_custom_device('npu'):
if num_channels == groups and num_channels == num_filters:
l_type = 'depthwise_conv2d'
else:
l_type = 'conv2d'
if (
is_compiled_with_cuda()
and get_flags("FLAGS_conv2d_disable_cudnn")[
......
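
The removed branch only affected which kernel name (`depthwise_conv2d` vs `conv2d`) was picked on NPU; the depthwise case itself is unchanged. A minimal illustration with arbitrary shapes:

```python
import paddle
import paddle.nn.functional as F

x = paddle.randn([1, 8, 32, 32])  # NCHW input with 8 channels
w = paddle.randn([8, 1, 3, 3])    # [out_channels, in_channels/groups, kH, kW]
# in_channels == out_channels == groups, i.e. a depthwise convolution
y = F.conv2d(x, w, stride=1, padding=1, groups=8)
print(y.shape)                    # [1, 8, 32, 32]
```
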
......@@ -16,7 +16,7 @@ import math
# TODO: define loss functions of neural network
import paddle
from paddle import _C_ops, _legacy_C_ops, fluid, in_dynamic_mode
from paddle import _C_ops, fluid, in_dynamic_mode
from paddle.framework import core
from paddle.static.nn.control_flow import Assert
from paddle.utils import deprecated
......@@ -269,51 +269,15 @@ def fluid_softmax_with_cross_entropy(
if input_dims - 1 == label_dims:
label = paddle.unsqueeze(label, axis=axis)
if in_dygraph_mode():
if core.is_compiled_with_custom_device("npu"):
if not soft_label:
valid_label = (
paddle.cast(label != ignore_index, dtype=label.dtype)
* label
)
softmax, loss = _legacy_C_ops.softmax_with_cross_entropy(
logits,
valid_label,
'soft_label',
soft_label,
'ignore_index',
ignore_index,
'numeric_stable_mode',
numeric_stable_mode,
'axis',
axis,
'use_softmax',
True,
)
else:
softmax, loss = _legacy_C_ops.softmax_with_cross_entropy(
logits,
label,
'soft_label',
soft_label,
'ignore_index',
ignore_index,
'numeric_stable_mode',
numeric_stable_mode,
'axis',
axis,
'use_softmax',
True,
)
else:
softmax, loss = _C_ops.cross_entropy_with_softmax(
logits,
label,
soft_label,
True,
numeric_stable_mode,
ignore_index,
axis,
)
softmax, loss = _C_ops.cross_entropy_with_softmax(
logits,
label,
soft_label,
True,
numeric_stable_mode,
ignore_index,
axis,
)
if not return_softmax:
return loss
else:
......@@ -2734,41 +2698,9 @@ def cross_entropy(
valid_label = (
paddle.cast(label != ignore_index, dtype=label.dtype) * label
)
if core.is_compiled_with_custom_device("npu"):
if not soft_label:
_, out = _legacy_C_ops.softmax_with_cross_entropy(
input,
valid_label,
'soft_label',
soft_label,
'ignore_index',
ignore_index,
'numeric_stable_mode',
True,
'axis',
axis,
'use_softmax',
use_softmax,
)
else:
_, out = _legacy_C_ops.softmax_with_cross_entropy(
input,
label,
'soft_label',
soft_label,
'ignore_index',
ignore_index,
'numeric_stable_mode',
True,
'axis',
axis,
'use_softmax',
use_softmax,
)
else:
_, out = _C_ops.cross_entropy_with_softmax(
input, label, soft_label, use_softmax, True, ignore_index, axis
)
_, out = _C_ops.cross_entropy_with_softmax(
input, label, soft_label, use_softmax, True, ignore_index, axis
)
if weight is not None:
......
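
With the NPU-specific `_legacy_C_ops.softmax_with_cross_entropy` branches removed, both functions above always dispatch to the unified `cross_entropy_with_softmax` kernel in dygraph. A hedged end-user sketch of the equivalent call:

```python
import paddle
import paddle.nn.functional as F

logits = paddle.randn([4, 10])                            # 4 samples, 10 classes
labels = paddle.randint(0, 10, shape=[4], dtype='int64')  # hard labels
loss = F.cross_entropy(logits, labels, reduction='none')  # per-sample losses
print(loss.shape)
```
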
......@@ -220,24 +220,7 @@ class OptimizerWithMixedPrecision:
"""
train_program = loss.block.program
self._train_program = train_program
# NOTE(zhiqiu): _float_status is only used for NPU.
if core.is_compiled_with_custom_device('npu'):
float_status = paddle.static.data(
name="float_status", shape=[8], dtype='float32'
)
self._train_program.global_block().append_op(
type="alloc_float_status",
outputs={"FloatStatus": float_status},
)
self._train_program.global_block().append_op(
type="clear_float_status",
inputs={"FloatStatus": float_status},
outputs={"FloatStatusOut": float_status},
)
self._float_status = float_status
else:
self._float_status = None
self._float_status = None
with program_guard(self._train_program, startup_program):
self._init_amp_var()
......@@ -476,27 +459,17 @@ class OptimizerWithMixedPrecision:
if self._is_distributed:
# if distributed, split check_finite_and_unscale to overlap
# unscale with communication
if core.is_compiled_with_custom_device('npu'):
with self._train_program._optimized_guard(grads):
for p, g in params_grads:
with self._train_program._optimized_guard([p, g]):
_, found_inf = check_finite_and_unscale(
grads,
[
g,
],
self._loss_scaling,
name="find_infinite_scale",
float_status=self._float_status,
)
found_infs.append(found_inf)
else:
for p, g in params_grads:
with self._train_program._optimized_guard([p, g]):
_, found_inf = check_finite_and_unscale(
[
g,
],
self._loss_scaling,
name="find_infinite_scale",
float_status=self._float_status,
)
found_infs.append(found_inf)
elif self._use_pure_fp16:
if fp32_grads:
with self._train_program._optimized_guard(fp32_grads):
......
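
Since `_float_status` is now always None, static-graph AMP no longer inserts `alloc_float_status`/`clear_float_status` ops before training. A minimal decorate-and-minimize sketch using the public `paddle.static.amp` API (the network and optimizer choices are illustrative, not taken from this commit):

```python
import paddle

paddle.enable_static()
main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[None, 16], dtype='float32')
    out = paddle.static.nn.fc(x, size=1)
    loss = paddle.mean(out)
    opt = paddle.optimizer.SGD(learning_rate=1e-3)
    # Wraps the optimizer in OptimizerWithMixedPrecision, shown in the hunk above.
    opt = paddle.static.amp.decorate(opt, use_dynamic_loss_scaling=True)
    opt.minimize(loss)
```
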
......@@ -97,8 +97,6 @@ def _get_sys_unsupported_list(dtype):
device = None
if core.is_compiled_with_xpu():
device = 'XPU'
elif core.is_compiled_with_custom_device('npu'):
device = 'NPU'
else:
device = 'GPU'
_, _, sys_unsupported_list = core.op_supported_infos(device, var_type)
......
......@@ -948,13 +948,6 @@ def conv2d(
):
l_type = 'depthwise_conv2d'
# NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
if core.is_compiled_with_custom_device('npu'):
if num_channels == groups and num_channels == num_filters:
l_type = 'depthwise_conv2d'
else:
l_type = 'conv2d'
helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype()
......
......@@ -212,12 +212,6 @@ class Timeline:
self._chrome_trace.emit_pid(
"memory usage on %s:cudapinnedplace:%d" % (k, 0), pid
)
if (k, 0, "NPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "NPU")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:npu:%d" % (k, 0), pid
)
def _allocate_events(self):
for k, profile_pb in self._profile_dict.items():
......