Unverified commit ff4654e2, authored by Leo Chen, committed by GitHub

refactor npu device manager (#31154)

Parent 1435b4c0
@@ -1015,8 +1015,9 @@ DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE);
 }  // namespace details

 inline std::string build_npu_error_msg(aclError stat) {
-  std::string s = " ACL error, the error code is : " + stat;
-  return s;
+  std::ostringstream sout;
+  sout << " ACL error, the error code is : " << stat << ". ";
+  return sout.str();
 }

 #define PADDLE_ENFORCE_NPU_SUCCESS(COND)                      \
......
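This hunk fixes a genuine bug, not just style: `aclError` is an integer status code, so with a string *literal* on the left, `"..." + stat` is pointer arithmetic on a `const char*`, not concatenation. The old message silently dropped its first `stat` characters, and was undefined behavior for codes larger than the literal. A minimal standalone sketch of the before/after, with a plain `int` standing in for `aclError`:

```cpp
#include <iostream>
#include <sstream>
#include <string>

int main() {
  int stat = 5;  // stand-in for an aclError status code

  // Old pattern: the literal decays to const char*, so "+ stat" shifts the
  // pointer by 5 bytes; for stat > strlen(literal) this is undefined behavior.
  std::string bad = " ACL error, the error code is : " + stat;
  std::cout << bad << "\n";  // "error, the error code is : " (first 5 chars lost)

  // New pattern: stream the integer, as build_npu_error_msg now does.
  std::ostringstream sout;
  sout << " ACL error, the error code is : " << stat << ". ";
  std::cout << sout.str() << "\n";  // " ACL error, the error code is : 5. "
  return 0;
}
```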
@@ -45,7 +45,8 @@ DEFINE_bool(check_nan_inf, false,
             "Checking whether operator produce NAN/INF or not. It will be "
             "extremely slow so please use this flag wisely.");

-// NOTE(zhiqiu): better to share the flags, otherwise we will have too many flags.
+// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
+// flags.
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
 /**
@@ -85,6 +86,14 @@ DEFINE_string(selected_gpus, "",
               "share-memory only.");
 #endif

+#if defined(PADDLE_WITH_ASCEND_CL)
+DEFINE_string(selected_npus, "",
+              "A list of device ids separated by comma, like: 0,1,2,3. "
+              "This option is useful when doing multi process training and "
+              "each process has only one device (NPU). If you want to use "
+              "all visible devices, set this to empty string.");
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 /**
@@ -378,7 +387,8 @@ DEFINE_double(
     "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
     "reserve the rest for page tables, etc");

-// NOTE(zhiqiu): better to share the flags, otherwise we will have too many flags.
+// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
+// flags.
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
 /**
......
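For readers unfamiliar with how these `DEFINE_string` flags surface at runtime: they are ordinary gflags, so the new `selected_npus` can be passed on the command line (or, as later hunks show, via the environment). A standalone rehearsal; the flag definition is copied from the hunk above, while the surrounding program is purely illustrative and not Paddle code:

```cpp
#include <iostream>

#include <gflags/gflags.h>

// Same definition as the new flag above; everything else is a demo.
DEFINE_string(selected_npus, "",
              "A list of device ids separated by comma, like: 0,1,2,3.");

int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
  std::cout << "selected_npus = \"" << FLAGS_selected_npus << "\"\n";
  return 0;
}
// $ ./demo --selected_npus=0,1,2,3
// selected_npus = "0,1,2,3"
```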
@@ -28,8 +28,8 @@ DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_bool(enable_cublas_tensor_op_math);
-DECLARE_string(selected_gpus);
 DECLARE_uint64(gpu_memory_limit_mb);
+DECLARE_string(selected_npus);

 constexpr static float fraction_reserve_gpu_memory = 0.05f;
@@ -78,7 +78,7 @@ std::vector<int> GetSelectedNPUDevices() {
   // use user specified NPUs in single-node multi-process mode.
   std::vector<int> devices;
-  if (!FLAGS_selected_gpus.empty()) {
-    auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
+  if (!FLAGS_selected_npus.empty()) {
+    auto devices_str = paddle::string::Split(FLAGS_selected_npus, ',');
     for (auto id : devices_str) {
       devices.push_back(atoi(id.c_str()));
     }
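The fix above is easy to miss: `GetSelectedNPUDevices` was reading `FLAGS_selected_gpus`, so NPU device selection silently followed the GPU flag. A self-contained sketch of the corrected logic; `Split` here is a local stand-in for `paddle::string::Split`, and the "all visible devices" fallback is faked with a constant count of 4:

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Local stand-in for paddle::string::Split(str, ',').
std::vector<std::string> Split(const std::string& s, char sep) {
  std::vector<std::string> pieces;
  std::stringstream ss(s);
  std::string piece;
  while (std::getline(ss, piece, sep)) pieces.push_back(piece);
  return pieces;
}

// Mirrors the fixed GetSelectedNPUDevices: a non-empty flag wins, an empty
// flag means "use every visible device" (device count faked as 4 here).
std::vector<int> GetSelectedNPUDevices(const std::string& selected_npus) {
  std::vector<int> devices;
  if (!selected_npus.empty()) {
    for (const auto& id : Split(selected_npus, ',')) {
      // The diff uses atoi(), which silently yields 0 on malformed input;
      // std::stoi throws instead, which is safer for a demo.
      devices.push_back(std::stoi(id));
    }
  } else {
    for (int i = 0; i < 4; ++i) devices.push_back(i);
  }
  return devices;
}

int main() {
  for (int d : GetSelectedNPUDevices("0,2")) std::cout << d << " ";  // 0 2
  std::cout << "\n";
  for (int d : GetSelectedNPUDevices("")) std::cout << d << " ";     // 0 1 2 3
  std::cout << "\n";
}
```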
@@ -368,5 +368,42 @@ bool IsNPUMallocRecorded(int dev_id) {
   return RecordedNPUMallocHelper::Instance(dev_id)->NeedRecord();
 }

+AclInstance::~AclInstance() {}
+
+AclInstance &AclInstance::Instance() {
+  static AclInstance instance;
+  return instance;
+}
+
+AclInstance::AclInstance() {
+  PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr));
+  VLOG(4) << "Call aclrtSetDevice ";
+  // NOTE(zhiqiu): why set devices here?
+  // Because ACL creates a default context which contains 2 streams
+  // when calling aclrtSetDeviceId, so usually we do not need to
+  // create contexts explicitly. And, for each device, aclrtSetDeviceId
+  // needs to be called in pairs with aclrtResetDeviceId to destroy the
+  // default context. Here, we use this singleton and static instance to
+  // manage the devices and make sure they will be reset before the
+  // program exits.
+  devices_ = platform::GetSelectedNPUDevices();
+  for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
+    SetNPUDeviceId(*it);
+    VLOG(4) << "Call aclrtSetDevice " << *it;
+  }
+}
+
+void AclInstance::Finalize() {
+  // NOTE(zhiqiu): DO NOT perform finalize in destructor
+  // to avoid problems caused by destructor order of static
+  // object.
+  for (size_t i = 0; i < devices_.size(); ++i) {
+    auto status = aclrtResetDevice(devices_[i]);
+    VLOG(4) << "Call aclrtResetDevice " << devices_[i]
+            << " status = " << status;
+  }
+  auto status = aclFinalize();
+  VLOG(4) << "Call aclFinalize, status = " << status;
+}
+
 }  // namespace platform
 }  // namespace paddle
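The NOTE in the constructor is the heart of this refactor: `aclrtSetDevice` implicitly creates a default context per device, each such call must eventually be paired with `aclrtResetDevice`, and none of that teardown may run from a static object's destructor, because static destruction order across translation units is unspecified. A compilable miniature of the same lifecycle pattern, with the ACL calls replaced by prints:

```cpp
#include <iostream>
#include <vector>

// Same shape as AclInstance: Meyers singleton, teardown only via an
// explicit Finalize() that the program calls at a well-defined point.
class DeviceManager {
 public:
  static DeviceManager& Instance() {
    static DeviceManager instance;
    return instance;
  }
  DeviceManager(const DeviceManager&) = delete;
  DeviceManager& operator=(const DeviceManager&) = delete;

  void Finalize() {
    // Reset devices in order, then shut the runtime down -- the work that
    // must NOT live in ~DeviceManager(), since by the time static
    // destructors run, other statics it depends on may already be gone.
    for (int d : devices_) std::cout << "reset device " << d << "\n";
    std::cout << "finalize runtime\n";
  }

 private:
  DeviceManager() : devices_{0, 1} {
    std::cout << "init runtime, claim devices 0 and 1\n";
  }
  ~DeviceManager() = default;  // intentionally empty, like ~AclInstance()

  std::vector<int> devices_;
};

int main() {
  DeviceManager::Instance();             // first use initializes
  DeviceManager::Instance().Finalize();  // explicit, ordered teardown
}
```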
@@ -138,46 +138,15 @@ class AclInstance {
  public:
   // NOTE(zhiiu): Commonly, exception in destructor is not recommended, so
   // no PADDLE_ENFORCE here, call acl API directly.
-  ~AclInstance() {}
+  ~AclInstance();
   AclInstance(const AclInstance &o) = delete;
   const AclInstance &operator=(const AclInstance &o) = delete;
-
-  static AclInstance &Instance() {
-    static AclInstance instance;
-    return instance;
-  }
-
-  void Finalize() {
-    // NOTE(zhiqiu): DO NOT perform finalize in destructor
-    // to avoid problems caused by destructor order of static
-    // object.
-    for (size_t i = 0; i < devices_.size(); ++i) {
-      auto status = aclrtResetDevice(devices_[i]);
-      VLOG(4) << "Call aclrtResetDevice " << devices_[i]
-              << " status = " << status;
-    }
-    auto status = aclFinalize();
-    VLOG(4) << "Call aclFinalize, status = " << status;
-  }
+  static AclInstance &Instance();
+  void Finalize();

  private:
   // forbid calling default constructor
-  AclInstance() {
-    PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr));
-    VLOG(4) << "Call aclrtSetDevice ";
-    // NOTE(zhiqiu): why set devices here?
-    // Because ACL creates a default context which contains 2 streams
-    // when calling aclrtSetDeviceId, so usually we do not need to
-    // create contexts explicitly. And, for each device, aclrtSetDeviceId
-    // need to call parily with aclrtResetDeviceId to destory the default
-    // context. Here, we use this singleton and static instance to manage
-    // the devices to make sure they will be resetted before program exit.
-    devices_ = platform::GetSelectedNPUDevices();
-    for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
-      SetNPUDeviceId(*it);
-      VLOG(4) << "Call aclrtSetDevice " << *it;
-    }
-  }
+  AclInstance();
+
   std::vector<int> devices_;
 };
......
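The header change is the mirror image of the previous hunk: every body moves to the `.cc`, leaving declarations only. Beyond tidiness, the usual payoffs of this split are that the header stops dragging the ACL runtime headers into every includer, and that edits to the bodies no longer trigger recompilation of all callers. A two-file split in miniature (the `Runtime` name is hypothetical; the two halves are concatenated here so the sketch compiles as one unit):

```cpp
#include <iostream>

// --- header: declarations only, as npu_info.h now has ---
class Runtime {
 public:
  ~Runtime();
  Runtime(const Runtime&) = delete;
  Runtime& operator=(const Runtime&) = delete;
  static Runtime& Instance();
  void Finalize();

 private:
  Runtime();  // forbid construction from outside, as in the diff
};

// --- .cc: the bodies, moved out exactly as the commit does ---
Runtime::Runtime() { std::cout << "init\n"; }
Runtime::~Runtime() {}
Runtime& Runtime::Instance() {
  static Runtime instance;
  return instance;
}
void Runtime::Finalize() { std::cout << "finalize\n"; }

int main() { Runtime::Instance().Finalize(); }
```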
@@ -87,6 +87,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
 // others
 DECLARE_bool(sync_nccl_allreduce);
 #endif

+#ifdef PADDLE_WITH_ASCEND_CL
+// device management
+DECLARE_string(selected_npus);
+#endif
+
 #ifdef PADDLE_WITH_DISTRIBUTE
 DECLARE_int32(rpc_send_thread_num);
 DECLARE_int32(rpc_get_thread_num);
@@ -365,6 +371,11 @@ static void RegisterGlobalVarGetterSetter() {
       FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math,
       FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce);
 #endif

+#ifdef PADDLE_WITH_ASCEND_CL
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_npus);
+#endif
+
 #ifdef PADDLE_WITH_DITRIBUTE
   REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_send_thread_num,
                              FLAGS_rpc_get_thread_num,
......
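The expansion of `REGISTER_PUBLIC_GLOBAL_VAR` is not shown in this diff; conceptually it adds the flag to a name-indexed registry that pybind later exposes to Python. A much-simplified sketch of that registry idea, getter-only and with no type erasure, all names illustrative:

```cpp
#include <functional>
#include <iostream>
#include <map>
#include <string>

// Pretend global, standing in for the real gflags-defined variable.
static std::string FLAGS_selected_npus = "0,1";

// Getter-only registry; the real one also registers setters and converts
// values for Python.
class GlobalVarRegistry {
 public:
  using Getter = std::function<std::string()>;
  void Register(const std::string& name, Getter getter) {
    getters_[name] = std::move(getter);
  }
  std::string Get(const std::string& name) const { return getters_.at(name)(); }

 private:
  std::map<std::string, Getter> getters_;
};

int main() {
  GlobalVarRegistry registry;
  registry.Register("FLAGS_selected_npus", [] { return FLAGS_selected_npus; });
  std::cout << registry.Get("FLAGS_selected_npus") << "\n";  // 0,1
}
```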
@@ -39,7 +39,8 @@ int main(int argc, char** argv) {
 }
 #endif

-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_ASCEND_CL)
   envs.push_back("fraction_of_gpu_memory_to_use");
   envs.push_back("initial_gpu_memory_in_mb");
   envs.push_back("reallocate_gpu_memory_in_mb");
@@ -63,6 +64,10 @@ int main(int argc, char** argv) {
   undefok.push_back("initial_cpu_memory_in_mb");
 #endif

+#if defined(PADDLE_WITH_ASCEND_CL)
+  envs.push_back("selected_npus");
+#endif
+
   char* env_str = nullptr;
   if (envs.size() > 0) {
     std::string env_string = "--tryfromenv=";
......
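The `envs` vector feeds gflags' `--tryfromenv=` mechanism: for each listed name, gflags looks up the environment variable `FLAGS_<name>` and applies it if set, silently skipping unset ones. A self-contained demo of that round trip; the flag definition mirrors the one added earlier in this commit, and the rest is illustrative rather than Paddle's actual test main:

```cpp
#include <iostream>
#include <string>
#include <vector>

#include <gflags/gflags.h>

DEFINE_string(selected_npus, "", "comma-separated NPU ids");

int main(int argc, char* argv[]) {
  // Build "--tryfromenv=selected_npus,..." the same way the test main does.
  std::vector<std::string> envs = {"selected_npus"};
  std::string env_string = "--tryfromenv=";
  for (size_t i = 0; i < envs.size(); ++i) {
    env_string += envs[i];
    if (i + 1 < envs.size()) env_string += ",";
  }

  std::vector<char*> args = {argv[0], const_cast<char*>(env_string.c_str())};
  int fake_argc = static_cast<int>(args.size());
  char** fake_argv = args.data();
  gflags::ParseCommandLineFlags(&fake_argc, &fake_argv, true);

  std::cout << "FLAGS_selected_npus = " << FLAGS_selected_npus << "\n";
  return 0;
}
// $ FLAGS_selected_npus=0,1 ./demo   ->   FLAGS_selected_npus = 0,1
```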
@@ -218,7 +218,7 @@ def __bootstrap__():
         read_env_flags.append('tracer_mkldnn_ops_on')
         read_env_flags.append('tracer_mkldnn_ops_off')

-    if core.is_compiled_with_cuda():
+    if core.is_compiled_with_cuda() or core.is_compiled_with_npu():
         read_env_flags += [
             'fraction_of_gpu_memory_to_use',
             'initial_gpu_memory_in_mb',
@@ -234,6 +234,10 @@ def __bootstrap__():
             'local_exe_sub_scope_limit',
             'gpu_memory_limit_mb',
         ]

+    if core.is_compiled_with_npu():
+        read_env_flags += ['selected_npus', ]
+
     core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
     # don't init_p2p when in unittest to save time.
......