From ff4654e216df6f7d19c06d22280713dc0cf7fe0e Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Thu, 25 Feb 2021 19:57:04 +0800
Subject: [PATCH] refactor npu device manager (#31154)

refactor npu device manager (#31154)
---
 paddle/fluid/platform/enforce.h                |  5 ++-
 paddle/fluid/platform/flags.cc                 | 14 ++++++-
 paddle/fluid/platform/npu_info.cc              | 41 ++++++++++++++++++-
 paddle/fluid/platform/npu_info.h               | 39 ++----------------
 .../pybind/global_value_getter_setter.cc       | 11 +++++
 paddle/testing/paddle_gtest_main.cc            |  9 +++-
 python/paddle/fluid/__init__.py                |  6 ++-
 7 files changed, 81 insertions(+), 44 deletions(-)

diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index c06616d01d5..3c8d256921d 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -1015,8 +1015,9 @@ DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE);
 }  // namespace details
 
 inline std::string build_npu_error_msg(aclError stat) {
-  std::string s = " ACL error, the error code is : " + stat;
-  return s;
+  std::ostringstream sout;
+  sout << " ACL error, the error code is : " << stat << ". ";
+  return sout.str();
 }
 
 #define PADDLE_ENFORCE_NPU_SUCCESS(COND)                       \
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index e786d01c075..256019bf3dc 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -45,7 +45,8 @@ DEFINE_bool(check_nan_inf, false,
             "Checking whether operator produce NAN/INF or not. It will be "
             "extremely slow so please use this flag wisely.");
 
-// NOTE(zhiqiu): better to share the flags, otherwise we will have too many flags.
+// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
+// flags.
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
 
 /**
@@ -85,6 +86,14 @@ DEFINE_string(selected_gpus, "",
               "share-memory only.");
 #endif
 
+#if defined(PADDLE_WITH_ASCEND_CL)
+DEFINE_string(selected_npus, "",
+              "A list of device ids separated by comma, like: 0,1,2,3. "
+              "This option is useful when doing multi process training and "
+              "each process has only one device (NPU). If you want to use "
+              "all visible devices, set this to empty string.");
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 
 /**
@@ -378,7 +387,8 @@ DEFINE_double(
     "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
     "reserve the rest for page tables, etc");
 
-// NOTE(zhiqiu): better to share the flags, otherwise we will have too many flags.
+// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
+// flags.
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
 
 /**
diff --git a/paddle/fluid/platform/npu_info.cc b/paddle/fluid/platform/npu_info.cc
index 4cb5d9325af..91099d2db2a 100644
--- a/paddle/fluid/platform/npu_info.cc
+++ b/paddle/fluid/platform/npu_info.cc
@@ -28,8 +28,8 @@ DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_bool(enable_cublas_tensor_op_math);
-DECLARE_string(selected_gpus);
 DECLARE_uint64(gpu_memory_limit_mb);
+DECLARE_string(selected_npus);
 
 constexpr static float fraction_reserve_gpu_memory = 0.05f;
 
@@ -78,7 +78,7 @@ std::vector<int> GetSelectedNPUDevices() {
   // use user specified NPUs in single-node multi-process mode.
   std::vector<int> devices;
   if (!FLAGS_selected_gpus.empty()) {
-    auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
+    auto devices_str = paddle::string::Split(FLAGS_selected_npus, ',');
     for (auto id : devices_str) {
       devices.push_back(atoi(id.c_str()));
     }
@@ -368,5 +368,42 @@ bool IsNPUMallocRecorded(int dev_id) {
   return RecordedNPUMallocHelper::Instance(dev_id)->NeedRecord();
 }
 
+AclInstance::~AclInstance() {}
+
+AclInstance &AclInstance::Instance() {
+  static AclInstance instance;
+  return instance;
+}
+
+AclInstance::AclInstance() {
+  PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr));
+  VLOG(4) << "Call aclrtSetDevice ";
+  // NOTE(zhiqiu): why set devices here?
+  // Because ACL creates a default context which contains 2 streams
+  // when calling aclrtSetDevice, so usually we do not need to
+  // create contexts explicitly. And, for each device, aclrtSetDevice
+  // needs to be paired with aclrtResetDevice to destroy the default
+  // context. Here, we use this singleton and static instance to manage
+  // the devices and make sure they are reset before the program exits.
+  devices_ = platform::GetSelectedNPUDevices();
+  for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
+    SetNPUDeviceId(*it);
+    VLOG(4) << "Call aclrtSetDevice " << *it;
+  }
+}
+
+void AclInstance::Finalize() {
+  // NOTE(zhiqiu): DO NOT perform finalize in destructor
+  // to avoid problems caused by the destruction order of
+  // static objects.
+  for (size_t i = 0; i < devices_.size(); ++i) {
+    auto status = aclrtResetDevice(devices_[i]);
+    VLOG(4) << "Call aclrtResetDevice " << devices_[i]
+            << " status = " << status;
+  }
+  auto status = aclFinalize();
+  VLOG(4) << "Call aclFinalize, status = " << status;
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/npu_info.h b/paddle/fluid/platform/npu_info.h
index 7caada68190..648b18531b2 100644
--- a/paddle/fluid/platform/npu_info.h
+++ b/paddle/fluid/platform/npu_info.h
@@ -138,46 +138,15 @@ class AclInstance {
  public:
   // NOTE(zhiiu): Commonly, exception in destructor is not recommended, so
   // no PADDLE_ENFORCE here, call acl API directly.
-  ~AclInstance() {}
+  ~AclInstance();
   AclInstance(const AclInstance &o) = delete;
   const AclInstance &operator=(const AclInstance &o) = delete;
-
-  static AclInstance &Instance() {
-    static AclInstance instance;
-    return instance;
-  }
-
-  void Finalize() {
-    // NOTE(zhiqiu): DO NOT perform finalize in destructor
-    // to avoid problems caused by destructor order of static
-    // object.
-    for (size_t i = 0; i < devices_.size(); ++i) {
-      auto status = aclrtResetDevice(devices_[i]);
-      VLOG(4) << "Call aclrtResetDevice " << devices_[i]
-              << " status = " << status;
-    }
-    auto status = aclFinalize();
-    VLOG(4) << "Call aclFinalize, status = " << status;
-  }
+  static AclInstance &Instance();
+  void Finalize();
 
  private:
   // forbid calling default constructor
-  AclInstance() {
-    PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr));
-    VLOG(4) << "Call aclrtSetDevice ";
-    // NOTE(zhiqiu): why set devices here?
-    // Because ACL creates a default context which contains 2 streams
-    // when calling aclrtSetDeviceId, so usually we do not need to
-    // create contexts explicitly. And, for each device, aclrtSetDeviceId
-    // need to call parily with aclrtResetDeviceId to destory the default
-    // context. Here, we use this singleton and static instance to manage
-    // the devices to make sure they will be resetted before program exit.
-    devices_ = platform::GetSelectedNPUDevices();
-    for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
-      SetNPUDeviceId(*it);
-      VLOG(4) << "Call aclrtSetDevice " << *it;
-    }
-  }
+  AclInstance();
 
   std::vector<int> devices_;
 };
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
index fa44eeb485c..7afa121ea82 100644
--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -87,6 +87,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
 // others
 DECLARE_bool(sync_nccl_allreduce);
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+// device management
+DECLARE_string(selected_npus);
+#endif
+
 #ifdef PADDLE_WITH_DISTRIBUTE
 DECLARE_int32(rpc_send_thread_num);
 DECLARE_int32(rpc_get_thread_num);
@@ -365,6 +371,11 @@ static void RegisterGlobalVarGetterSetter() {
       FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math,
       FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce);
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_npus);
+#endif
+
 #ifdef PADDLE_WITH_DITRIBUTE
   REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_send_thread_num,
                              FLAGS_rpc_get_thread_num,
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 2a1af175599..f5154f0a0cf 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -39,7 +39,8 @@ int main(int argc, char** argv) {
   }
 #endif
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_ASCEND_CL)
   envs.push_back("fraction_of_gpu_memory_to_use");
   envs.push_back("initial_gpu_memory_in_mb");
   envs.push_back("reallocate_gpu_memory_in_mb");
@@ -63,6 +64,10 @@ int main(int argc, char** argv) {
   undefok.push_back("initial_cpu_memory_in_mb");
 #endif
 
+#if defined(PADDLE_WITH_CUDA)
+  envs.push_back("selected_npus");
+#endif
+
   char* env_str = nullptr;
   if (envs.size() > 0) {
     std::string env_string = "--tryfromenv=";
@@ -94,7 +99,7 @@ int main(int argc, char** argv) {
 
   paddle::framework::InitDevices();
   int ret = RUN_ALL_TESTS();
-  
+
 #ifdef PADDLE_WITH_ASCEND_CL
   paddle::platform::AclInstance::Instance().Finalize();
 #endif
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 5c6ce1dc17a..aa1f49cce63 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -218,7 +218,7 @@ def __bootstrap__():
         read_env_flags.append('tracer_mkldnn_ops_on')
         read_env_flags.append('tracer_mkldnn_ops_off')
 
-    if core.is_compiled_with_cuda():
+    if core.is_compiled_with_cuda() or core.is_compiled_with_npu():
         read_env_flags += [
            'fraction_of_gpu_memory_to_use',
            'initial_gpu_memory_in_mb',
@@ -234,6 +234,10 @@ def __bootstrap__():
            'local_exe_sub_scope_limit',
            'gpu_memory_limit_mb',
        ]
+
+    if core.is_compiled_with_npu():
+        read_env_flags += ['selected_npus', ]
+
     core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
     # don't init_p2p when in unittest to save time.
-- 
GitLab
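
For reference, a minimal standalone C++ sketch (not part of the patch) of the pattern the relocated AclInstance implements: a process-wide singleton whose constructor initializes the runtime and sets the selected devices, and whose Finalize() is called explicitly before exit (as paddle_gtest_main.cc does after RUN_ALL_TESTS) instead of relying on the destructor of a static object. The ACL calls are replaced by stub functions so the sketch builds without the Ascend toolkit; DeviceManagerSketch and the Stub* names are hypothetical and do not exist in Paddle.

// Illustrative sketch only: mirrors the singleton-with-explicit-Finalize
// pattern of AclInstance. Real code calls aclInit/aclrtSetDevice/
// aclrtResetDevice/aclFinalize; stubs keep this example self-contained.
#include <iostream>
#include <vector>

static int StubInit() { return 0; }                 // stands in for aclInit(nullptr)
static int StubSetDevice(int /*id*/) { return 0; }  // stands in for aclrtSetDevice
static int StubResetDevice(int /*id*/) { return 0; }  // stands in for aclrtResetDevice
static int StubFinalize() { return 0; }             // stands in for aclFinalize

class DeviceManagerSketch {
 public:
  // Meyers singleton: constructed on first use, like AclInstance::Instance().
  static DeviceManagerSketch &Instance() {
    static DeviceManagerSketch instance;
    return instance;
  }
  DeviceManagerSketch(const DeviceManagerSketch &) = delete;
  DeviceManagerSketch &operator=(const DeviceManagerSketch &) = delete;

  // Tear-down is explicit rather than done in the destructor, so it does not
  // depend on the destruction order of static objects.
  void Finalize() {
    for (int id : devices_) {
      std::cout << "reset device " << id << " -> " << StubResetDevice(id) << "\n";
    }
    std::cout << "finalize runtime -> " << StubFinalize() << "\n";
  }

  ~DeviceManagerSketch() = default;  // intentionally does nothing

 private:
  DeviceManagerSketch() {
    StubInit();         // real code: aclInit(nullptr)
    devices_ = {0, 1};  // real code: platform::GetSelectedNPUDevices()
    // Every set-device call must later be paired with a reset in Finalize().
    for (int id : devices_) StubSetDevice(id);
  }
  std::vector<int> devices_;
};

int main() {
  DeviceManagerSketch::Instance();             // set up devices once
  // ... run tests or other work here ...
  DeviceManagerSketch::Instance().Finalize();  // explicit, ordered tear-down
  return 0;
}

Calling Finalize() from main keeps the set-device/reset-device calls balanced and avoids touching the runtime during static destruction, which is the rationale spelled out in the NOTE comments of the patch.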