Unverified commit ff4654e2. Author: Leo Chen. Committer: GitHub.

refactor npu device manager (#31154)

Parent 1435b4c0
@@ -1015,8 +1015,9 @@ DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE);
 }  // namespace details

 inline std::string build_npu_error_msg(aclError stat) {
-  std::string s = " ACL error, the error code is : " + stat;
-  return s;
+  std::ostringstream sout;
+  sout << " ACL error, the error code is : " << stat << ". ";
+  return sout.str();
 }

 #define PADDLE_ENFORCE_NPU_SUCCESS(COND) \
......
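The first hunk fixes a formatting bug: in the removed version, " ACL error, the error code is : " + stat adds an integral error code to a string literal, which is pointer arithmetic rather than concatenation, so the numeric code never appears in the message (and large codes index past the literal). Streaming through std::ostringstream formats the code as a number. A minimal illustration of the difference, with a plain int standing in for aclError, which is an ACL typedef not available outside the Ascend toolkit:

    // Sketch of the bug and the fix; FakeAclError and both helpers are
    // illustrative stand-ins, not Paddle or ACL code.
    #include <iostream>
    #include <sstream>
    #include <string>

    using FakeAclError = int;  // stand-in for aclError

    // Old behaviour: "literal + int" offsets the char pointer, so the code
    // is silently dropped (and large codes read past the literal).
    std::string build_msg_old(FakeAclError stat) {
      std::string s = " ACL error, the error code is : " + stat;  // bug
      return s;
    }

    // New behaviour: stream the code so it is formatted as a number.
    std::string build_msg_new(FakeAclError stat) {
      std::ostringstream sout;
      sout << " ACL error, the error code is : " << stat << ". ";
      return sout.str();
    }

    int main() {
      std::cout << build_msg_old(5) << "\n";  // prints a truncated literal
      std::cout << build_msg_new(5) << "\n";  // prints "... is : 5. "
      return 0;
    }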
@@ -45,7 +45,8 @@ DEFINE_bool(check_nan_inf, false,
             "Checking whether operator produce NAN/INF or not. It will be "
             "extremely slow so please use this flag wisely.");

-// NOTE(zhiqiu): better to share the flags, otherwise we will have too many flags.
+// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
+// flags.
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)

 /**
@@ -85,6 +86,14 @@ DEFINE_string(selected_gpus, "",
               "share-memory only.");
 #endif

+#if defined(PADDLE_WITH_ASCEND_CL)
+DEFINE_string(selected_npus, "",
+              "A list of device ids separated by comma, like: 0,1,2,3. "
+              "This option is useful when doing multi process training and "
+              "each process has only one device (NPU). If you want to use "
+              "all visible devices, set this to empty string.");
+#endif
+
 #ifdef PADDLE_WITH_CUDA

 /**
@@ -378,7 +387,8 @@ DEFINE_double(
     "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
     "reserve the rest for page tables, etc");

-// NOTE(zhiqiu): better to share the flags, otherwise we will have too many flags.
+// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
+// flags.
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)

 /**
......
@@ -28,8 +28,8 @@ DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_bool(enable_cublas_tensor_op_math);
-DECLARE_string(selected_gpus);
 DECLARE_uint64(gpu_memory_limit_mb);
+DECLARE_string(selected_npus);

 constexpr static float fraction_reserve_gpu_memory = 0.05f;
@@ -78,7 +78,7 @@ std::vector<int> GetSelectedNPUDevices() {
   // use user specified NPUs in single-node multi-process mode.
   std::vector<int> devices;
   if (!FLAGS_selected_gpus.empty()) {
-    auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
+    auto devices_str = paddle::string::Split(FLAGS_selected_npus, ',');
    for (auto id : devices_str) {
      devices.push_back(atoi(id.c_str()));
    }
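FLAGS_selected_npus follows the same convention as FLAGS_selected_gpus: a comma-separated list of device ids such as 0,1,2,3, with an empty string meaning "use all visible devices", and GetSelectedNPUDevices splits it on commas as shown above. A self-contained sketch of how such a gflags string flag can be defined and turned into a device-id vector; the flag name demo_selected_npus and the helper ParseDeviceList are illustrative, not part of the Paddle sources:

    // Define a comma-separated device-list flag and parse it, mirroring the
    // convention used by FLAGS_selected_npus (demo names only).
    #include <gflags/gflags.h>

    #include <cstdlib>
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    DEFINE_string(demo_selected_npus, "",
                  "Comma-separated device ids, e.g. 0,1,2,3; empty means all "
                  "visible devices (hypothetical demo flag).");

    // Split "0,1,2" into {0, 1, 2}; an empty flag yields an empty vector,
    // which the caller treats as "use every visible device".
    std::vector<int> ParseDeviceList(const std::string& flag_value) {
      std::vector<int> devices;
      std::stringstream ss(flag_value);
      std::string token;
      while (std::getline(ss, token, ',')) {
        if (!token.empty()) devices.push_back(std::atoi(token.c_str()));
      }
      return devices;
    }

    int main(int argc, char* argv[]) {
      gflags::ParseCommandLineFlags(&argc, &argv, true);
      for (int id : ParseDeviceList(FLAGS_demo_selected_npus)) {
        std::cout << "selected device " << id << "\n";
      }
      return 0;
    }

Run it as ./demo --demo_selected_npus=0,2 to select devices 0 and 2; leaving the flag empty selects nothing here, which the caller interprets as "use every visible device".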
@@ -368,5 +368,42 @@ bool IsNPUMallocRecorded(int dev_id) {
   return RecordedNPUMallocHelper::Instance(dev_id)->NeedRecord();
 }

+AclInstance::~AclInstance() {}
+
+AclInstance &AclInstance::Instance() {
+  static AclInstance instance;
+  return instance;
+}
+
+AclInstance::AclInstance() {
+  PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr));
+  VLOG(4) << "Call aclrtSetDevice ";
+  // NOTE(zhiqiu): why set devices here?
+  // Because ACL creates a default context which contains 2 streams
+  // when calling aclrtSetDevice, so usually we do not need to create
+  // contexts explicitly. And, for each device, aclrtSetDevice needs to
+  // be paired with aclrtResetDevice to destroy the default context.
+  // Here, we use this singleton and its static instance to manage the
+  // devices and make sure they are reset before the program exits.
+  devices_ = platform::GetSelectedNPUDevices();
+  for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
+    SetNPUDeviceId(*it);
+    VLOG(4) << "Call aclrtSetDevice " << *it;
+  }
+}
+
+void AclInstance::Finalize() {
+  // NOTE(zhiqiu): DO NOT perform finalization in the destructor, to
+  // avoid problems caused by the destruction order of static objects.
+  for (size_t i = 0; i < devices_.size(); ++i) {
+    auto status = aclrtResetDevice(devices_[i]);
+    VLOG(4) << "Call aclrtResetDevice " << devices_[i]
+            << " status = " << status;
+  }
+  auto status = aclFinalize();
+  VLOG(4) << "Call aclFinalize, status = " << status;
+}
+
 }  // namespace platform
 }  // namespace paddle
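The comments above describe the lifecycle this refactor centralizes: aclInit is called once, aclrtSetDevice creates a default context per device and must eventually be paired with aclrtResetDevice, and teardown happens in an explicit Finalize() rather than in the destructor so it does not depend on the destruction order of static objects. A self-contained sketch of that pattern, with the acl_* calls stubbed out (they are stand-ins, not the real Ascend runtime API):

    // Singleton-with-explicit-Finalize pattern; acl_* are fake stand-ins.
    #include <iostream>
    #include <vector>

    int acl_init() { std::cout << "init runtime\n"; return 0; }
    int acl_set_device(int id) { std::cout << "set device " << id << "\n"; return 0; }
    int acl_reset_device(int id) { std::cout << "reset device " << id << "\n"; return 0; }
    int acl_finalize() { std::cout << "finalize runtime\n"; return 0; }

    class DeviceManager {
     public:
      // Meyers singleton: constructed on first use, non-copyable.
      static DeviceManager &Instance() {
        static DeviceManager instance;
        return instance;
      }
      DeviceManager(const DeviceManager &) = delete;
      DeviceManager &operator=(const DeviceManager &) = delete;

      // Destructor intentionally does nothing: runtime cleanup is deferred
      // to Finalize(), called at a well-defined point instead of during
      // static destruction.
      ~DeviceManager() {}

      void Finalize() {
        for (int id : devices_) acl_reset_device(id);  // pair every set_device
        acl_finalize();
      }

     private:
      DeviceManager() {
        acl_init();
        devices_ = {0, 1};  // stand-in for GetSelectedNPUDevices()
        // Set each device once; iterating in reverse leaves the first id
        // as the current device.
        for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
          acl_set_device(*it);
        }
      }
      std::vector<int> devices_;
    };

    int main() {
      DeviceManager::Instance();             // initialize devices at startup
      /* ... run work on the devices ... */
      DeviceManager::Instance().Finalize();  // explicit, ordered teardown
      return 0;
    }

The design choice the patch comments point at is the explicit Finalize(): the program calls it at a known point before exit, while the empty destructor guarantees nothing runtime-related happens during static destruction.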
@@ -138,46 +138,15 @@ class AclInstance {
  public:
   // NOTE(zhiiu): Commonly, exception in destructor is not recommended, so
   // no PADDLE_ENFORCE here, call acl API directly.
-  ~AclInstance() {}
+  ~AclInstance();
   AclInstance(const AclInstance &o) = delete;
   const AclInstance &operator=(const AclInstance &o) = delete;
-  static AclInstance &Instance() {
-    static AclInstance instance;
-    return instance;
-  }
-
-  void Finalize() {
-    // NOTE(zhiqiu): DO NOT perform finalize in destructor
-    // to avoid problems caused by destructor order of static
-    // object.
-    for (size_t i = 0; i < devices_.size(); ++i) {
-      auto status = aclrtResetDevice(devices_[i]);
-      VLOG(4) << "Call aclrtResetDevice " << devices_[i]
-              << " status = " << status;
-    }
-    auto status = aclFinalize();
-    VLOG(4) << "Call aclFinalize, status = " << status;
-  }
+  static AclInstance &Instance();
+  void Finalize();

  private:
   // forbid calling default constructor
-  AclInstance() {
-    PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr));
-    VLOG(4) << "Call aclrtSetDevice ";
-    // NOTE(zhiqiu): why set devices here?
-    // Because ACL creates a default context which contains 2 streams
-    // when calling aclrtSetDeviceId, so usually we do not need to
-    // create contexts explicitly. And, for each device, aclrtSetDeviceId
-    // need to call parily with aclrtResetDeviceId to destory the default
-    // context. Here, we use this singleton and static instance to manage
-    // the devices to make sure they will be resetted before program exit.
-    devices_ = platform::GetSelectedNPUDevices();
-    for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
-      SetNPUDeviceId(*it);
-      VLOG(4) << "Call aclrtSetDevice " << *it;
-    }
-  }
+  AclInstance();

   std::vector<int> devices_;
 };
......
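The comment retained at the top of the class explains why the destructor stays trivial and avoids PADDLE_ENFORCE: destructors are implicitly noexcept since C++11, so an exception escaping one calls std::terminate instead of propagating. A tiny illustration of that hazard (not Paddle code):

    // A throwing destructor does not report an error; it ends the program.
    #include <iostream>
    #include <stdexcept>

    struct Risky {
      ~Risky() {  // implicitly noexcept since C++11
        throw std::runtime_error("cleanup failed");  // escapes -> std::terminate
      }
    };

    int main() {
      try {
        Risky r;
      } catch (const std::exception& e) {
        // Never reached: the program terminates before the exception can
        // propagate out of the destructor.
        std::cout << "caught: " << e.what() << "\n";
      }
      return 0;
    }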
@@ -87,6 +87,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
 // others
 DECLARE_bool(sync_nccl_allreduce);
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+// device management
+DECLARE_string(selected_npus);
+#endif
+
 #ifdef PADDLE_WITH_DISTRIBUTE
 DECLARE_int32(rpc_send_thread_num);
 DECLARE_int32(rpc_get_thread_num);
@@ -365,6 +371,11 @@ static void RegisterGlobalVarGetterSetter() {
       FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math,
       FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce);
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_npus);
+#endif
+
 #ifdef PADDLE_WITH_DITRIBUTE
   REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_send_thread_num,
                              FLAGS_rpc_get_thread_num,
......
@@ -39,7 +39,8 @@ int main(int argc, char** argv) {
   }
 #endif

-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_ASCEND_CL)
   envs.push_back("fraction_of_gpu_memory_to_use");
   envs.push_back("initial_gpu_memory_in_mb");
   envs.push_back("reallocate_gpu_memory_in_mb");
@@ -63,6 +64,10 @@ int main(int argc, char** argv) {
   undefok.push_back("initial_cpu_memory_in_mb");
 #endif

+#if defined(PADDLE_WITH_CUDA)
+  envs.push_back("selected_npus");
+#endif
+
   char* env_str = nullptr;
   if (envs.size() > 0) {
     std::string env_string = "--tryfromenv=";
......
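This test main collects flag names into envs and joins them into a single --tryfromenv= argument, so each listed flag can be supplied through a FLAGS_<name> environment variable instead of the command line; the Python __bootstrap__ change below uses the same mechanism. A small sketch of how that argument is assembled and handed to gflags, using a hypothetical demo_selected_npus flag:

    // Build a "--tryfromenv=" argument so gflags reads the listed flags from
    // FLAGS_<name> environment variables if they are set (demo names only).
    #include <gflags/gflags.h>

    #include <iostream>
    #include <string>
    #include <vector>

    DEFINE_string(demo_selected_npus, "", "hypothetical demo flag");

    int main(int argc, char** argv) {
      std::vector<std::string> envs = {"demo_selected_npus"};

      // Join the names: "--tryfromenv=demo_selected_npus,..."
      std::string env_string = "--tryfromenv=";
      for (const auto& name : envs) {
        env_string += name + ",";
      }
      env_string.pop_back();  // drop the trailing comma

      // Append the synthesized argument before parsing; gflags then looks
      // up FLAGS_demo_selected_npus in the environment.
      std::vector<char*> new_argv(argv, argv + argc);
      std::vector<char> env_buf(env_string.begin(), env_string.end());
      env_buf.push_back('\0');
      new_argv.push_back(env_buf.data());
      int new_argc = static_cast<int>(new_argv.size());
      char** argv_ptr = new_argv.data();
      gflags::ParseCommandLineFlags(&new_argc, &argv_ptr, true);

      std::cout << "selected npus: " << FLAGS_demo_selected_npus << "\n";
      return 0;
    }

With this in place, FLAGS_demo_selected_npus=0,1 ./demo picks the value up from the environment, and the flag is silently skipped when the variable is unset, which is the "try" in tryfromenv.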
@@ -218,7 +218,7 @@ def __bootstrap__():
         read_env_flags.append('tracer_mkldnn_ops_on')
         read_env_flags.append('tracer_mkldnn_ops_off')

-    if core.is_compiled_with_cuda():
+    if core.is_compiled_with_cuda() or core.is_compiled_with_npu():
         read_env_flags += [
             'fraction_of_gpu_memory_to_use',
             'initial_gpu_memory_in_mb',
@@ -234,6 +234,10 @@ def __bootstrap__():
             'local_exe_sub_scope_limit',
             'gpu_memory_limit_mb',
         ]
+
+    if core.is_compiled_with_npu():
+        read_env_flags += ['selected_npus', ]
+
     core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
     # don't init_p2p when in unittest to save time.
......