From ff4654e216df6f7d19c06d22280713dc0cf7fe0e Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Thu, 25 Feb 2021 19:57:04 +0800
Subject: [PATCH] refactor npu device manager (#31154)

refactor npu device manager (#31154)
---
 paddle/fluid/platform/enforce.h                 |  5 ++-
 paddle/fluid/platform/flags.cc                  | 14 ++++++-
 paddle/fluid/platform/npu_info.cc               | 41 ++++++++++++++++++-
 paddle/fluid/platform/npu_info.h                | 39 ++----------------
 .../pybind/global_value_getter_setter.cc        | 11 +++++
 paddle/testing/paddle_gtest_main.cc             |  9 +++-
 python/paddle/fluid/__init__.py                 |  6 ++-
 7 files changed, 81 insertions(+), 44 deletions(-)

diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index c06616d01d..3c8d256921 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -1015,8 +1015,9 @@ DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE);
 }  // namespace details
 
 inline std::string build_npu_error_msg(aclError stat) {
-  std::string s = " ACL error, the error code is : " + stat;
-  return s;
+  std::ostringstream sout;
+  sout << " ACL error, the error code is : " << stat << ". ";
+  return sout.str();
 }
 
 #define PADDLE_ENFORCE_NPU_SUCCESS(COND)                       \
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index e786d01c07..256019bf3d 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -45,7 +45,8 @@ DEFINE_bool(check_nan_inf, false,
             "Checking whether operator produce NAN/INF or not. It will be "
             "extremely slow so please use this flag wisely.");
 
-// NOTE(zhiqiu): better to share the flags, otherwise we will have too many flags.
+// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
+// flags.
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
 
 /**
@@ -85,6 +86,14 @@ DEFINE_string(selected_gpus, "",
               "share-memory only.");
 #endif
 
+#if defined(PADDLE_WITH_ASCEND_CL)
+DEFINE_string(selected_npus, "",
+              "A list of device ids separated by comma, like: 0,1,2,3. "
+              "This option is useful when doing multi process training and "
+              "each process have only one device (NPU). If you want to use "
+              "all visible devices, set this to empty string.");
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 
 /**
@@ -378,7 +387,8 @@ DEFINE_double(
     "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
     "reserve the rest for page tables, etc");
 
-// NOTE(zhiqiu): better to share the flags, otherwise we will have too many flags.
+// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
+// flags.
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
 
 /**
diff --git a/paddle/fluid/platform/npu_info.cc b/paddle/fluid/platform/npu_info.cc
index 4cb5d9325a..91099d2db2 100644
--- a/paddle/fluid/platform/npu_info.cc
+++ b/paddle/fluid/platform/npu_info.cc
@@ -28,8 +28,8 @@ DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_bool(enable_cublas_tensor_op_math);
-DECLARE_string(selected_gpus);
 DECLARE_uint64(gpu_memory_limit_mb);
+DECLARE_string(selected_npus);
 
 constexpr static float fraction_reserve_gpu_memory = 0.05f;
 
@@ -78,7 +78,7 @@ std::vector<int> GetSelectedNPUDevices() {
   // use user specified NPUs in single-node multi-process mode.
   std::vector<int> devices;
   if (!FLAGS_selected_gpus.empty()) {
-    auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
+    auto devices_str = paddle::string::Split(FLAGS_selected_npus, ',');
     for (auto id : devices_str) {
       devices.push_back(atoi(id.c_str()));
     }
@@ -368,5 +368,42 @@ bool IsNPUMallocRecorded(int dev_id) {
   return RecordedNPUMallocHelper::Instance(dev_id)->NeedRecord();
 }
 
+AclInstance::~AclInstance() {}
+
+AclInstance &AclInstance::Instance() {
+  static AclInstance instance;
+  return instance;
+}
+
+AclInstance::AclInstance() {
+  PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr));
+  VLOG(4) << "Call aclrtSetDevice ";
+  // NOTE(zhiqiu): why set devices here?
+  // Because ACL creates a default context which contains 2 streams
+  // when calling aclrtSetDeviceId, so usually we do not need to
+  // create contexts explicitly. And, for each device, aclrtSetDeviceId
+  // needs to be called in pairs with aclrtResetDeviceId to destroy the
+  // default context. Here, we use this singleton and static instance to
+  // manage the devices to make sure they will be reset before program exit.
+  devices_ = platform::GetSelectedNPUDevices();
+  for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
+    SetNPUDeviceId(*it);
+    VLOG(4) << "Call aclrtSetDevice " << *it;
+  }
+}
+
+void AclInstance::Finalize() {
+  // NOTE(zhiqiu): DO NOT perform finalize in destructor
+  // to avoid problems caused by destructor order of static
+  // object.
+  for (size_t i = 0; i < devices_.size(); ++i) {
+    auto status = aclrtResetDevice(devices_[i]);
+    VLOG(4) << "Call aclrtResetDevice " << devices_[i]
+            << " status = " << status;
+  }
+  auto status = aclFinalize();
+  VLOG(4) << "Call aclFinalize, status = " << status;
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/npu_info.h b/paddle/fluid/platform/npu_info.h
index 7caada6819..648b18531b 100644
--- a/paddle/fluid/platform/npu_info.h
+++ b/paddle/fluid/platform/npu_info.h
@@ -138,46 +138,15 @@ class AclInstance {
  public:
   // NOTE(zhiiu): Commonly, exception in destructor is not recommended, so
   // no PADDLE_ENFORCE here, call acl API directly.
-  ~AclInstance() {}
+  ~AclInstance();
   AclInstance(const AclInstance &o) = delete;
   const AclInstance &operator=(const AclInstance &o) = delete;
-
-  static AclInstance &Instance() {
-    static AclInstance instance;
-    return instance;
-  }
-
-  void Finalize() {
-    // NOTE(zhiqiu): DO NOT perform finalize in destructor
-    // to avoid problems caused by destructor order of static
-    // object.
-    for (size_t i = 0; i < devices_.size(); ++i) {
-      auto status = aclrtResetDevice(devices_[i]);
-      VLOG(4) << "Call aclrtResetDevice " << devices_[i]
-              << " status = " << status;
-    }
-    auto status = aclFinalize();
-    VLOG(4) << "Call aclFinalize, status = " << status;
-  }
+  static AclInstance &Instance();
+  void Finalize();
 
  private:
   // forbid calling default constructor
-  AclInstance() {
-    PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr));
-    VLOG(4) << "Call aclrtSetDevice ";
-    // NOTE(zhiqiu): why set devices here?
-    // Because ACL creates a default context which contains 2 streams
-    // when calling aclrtSetDeviceId, so usually we do not need to
-    // create contexts explicitly. And, for each device, aclrtSetDeviceId
-    // need to call parily with aclrtResetDeviceId to destory the default
-    // context. Here, we use this singleton and static instance to manage
-    // the devices to make sure they will be resetted before program exit.
-    devices_ = platform::GetSelectedNPUDevices();
-    for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
-      SetNPUDeviceId(*it);
-      VLOG(4) << "Call aclrtSetDevice " << *it;
-    }
-  }
+  AclInstance();
   std::vector<int> devices_;
 };
 
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
index fa44eeb485..7afa121ea8 100644
--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -87,6 +87,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
 // others
 DECLARE_bool(sync_nccl_allreduce);
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+// device management
+DECLARE_string(selected_npus);
+#endif
+
 #ifdef PADDLE_WITH_DISTRIBUTE
 DECLARE_int32(rpc_send_thread_num);
 DECLARE_int32(rpc_get_thread_num);
@@ -365,6 +371,11 @@ static void RegisterGlobalVarGetterSetter() {
       FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math,
       FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce);
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_npus);
+#endif
+
 #ifdef PADDLE_WITH_DITRIBUTE
   REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_send_thread_num,
                              FLAGS_rpc_get_thread_num,
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 2a1af17559..f5154f0a0c 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -39,7 +39,8 @@ int main(int argc, char** argv) {
   }
 #endif
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_ASCEND_CL)
   envs.push_back("fraction_of_gpu_memory_to_use");
   envs.push_back("initial_gpu_memory_in_mb");
   envs.push_back("reallocate_gpu_memory_in_mb");
@@ -63,6 +64,10 @@ int main(int argc, char** argv) {
   undefok.push_back("initial_cpu_memory_in_mb");
 #endif
 
+#if defined(PADDLE_WITH_CUDA)
+  envs.push_back("selected_npus");
+#endif
+
   char* env_str = nullptr;
   if (envs.size() > 0) {
     std::string env_string = "--tryfromenv=";
@@ -94,7 +99,7 @@ int main(int argc, char** argv) {
   paddle::framework::InitDevices();
 
   int ret = RUN_ALL_TESTS();
-  
+
 #ifdef PADDLE_WITH_ASCEND_CL
   paddle::platform::AclInstance::Instance().Finalize();
 #endif
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 5c6ce1dc17..aa1f49cce6 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -218,7 +218,7 @@ def __bootstrap__():
         read_env_flags.append('tracer_mkldnn_ops_on')
         read_env_flags.append('tracer_mkldnn_ops_off')
 
-    if core.is_compiled_with_cuda():
+    if core.is_compiled_with_cuda() or core.is_compiled_with_npu():
         read_env_flags += [
             'fraction_of_gpu_memory_to_use',
             'initial_gpu_memory_in_mb',
@@ -234,6 +234,10 @@ def __bootstrap__():
             'local_exe_sub_scope_limit',
             'gpu_memory_limit_mb',
         ]
+
+        if core.is_compiled_with_npu():
+            read_env_flags += ['selected_npus', ]
+
     core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
     # don't init_p2p when in unittest to save time.
--
GitLab
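
Note on the pattern this patch introduces: device ids come from the comma-separated FLAGS_selected_npus string, the devices are activated once in the constructor of a process-wide AclInstance singleton, and teardown happens through an explicit Finalize() call (invoked from main, as paddle_gtest_main.cc does) rather than from a destructor, so cleanup does not depend on static destruction order. The standalone C++ sketch below mirrors that structure but is not part of the patch: ParseSelectedDevices, DeviceManager, and the Fake* helpers are hypothetical stand-ins for paddle::string::Split, AclInstance, and the ACL runtime calls (aclInit/aclrtSetDevice/aclrtResetDevice/aclFinalize), and the hard-coded "0,1" stands in for the flag value, so the example builds and runs without the Ascend toolkit.

// Hypothetical sketch of the device-manager pattern; not part of the patch.
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Stand-ins for aclInit/aclrtSetDevice/aclrtResetDevice/aclFinalize so the
// sketch compiles without the Ascend toolkit.
static void FakeInit() { std::cout << "init runtime\n"; }
static void FakeSetDevice(int id) { std::cout << "set device " << id << "\n"; }
static void FakeResetDevice(int id) { std::cout << "reset device " << id << "\n"; }
static void FakeFinalize() { std::cout << "finalize runtime\n"; }

// Parse a comma-separated id list such as "0,1,2,3", the format held by
// FLAGS_selected_npus (GetSelectedNPUDevices does this with
// paddle::string::Split + atoi).
static std::vector<int> ParseSelectedDevices(const std::string& flag) {
  std::vector<int> ids;
  std::stringstream ss(flag);
  std::string item;
  while (std::getline(ss, item, ',')) {
    if (!item.empty()) ids.push_back(std::atoi(item.c_str()));
  }
  return ids;
}

class DeviceManager {
 public:
  // Meyers singleton: the first call initializes the runtime and the
  // selected devices; later calls return the same instance.
  static DeviceManager& Instance() {
    static DeviceManager instance;
    return instance;
  }
  DeviceManager(const DeviceManager&) = delete;
  DeviceManager& operator=(const DeviceManager&) = delete;

  // Explicit teardown instead of a destructor, so cleanup does not depend
  // on the order in which static objects are destroyed.
  void Finalize() {
    for (int id : devices_) FakeResetDevice(id);
    FakeFinalize();
  }

 private:
  DeviceManager() {
    FakeInit();
    devices_ = ParseSelectedDevices("0,1");  // would come from the flag
    for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
      FakeSetDevice(*it);  // each set call is later paired with a reset
    }
  }
  std::vector<int> devices_;
};

int main() {
  DeviceManager::Instance();             // first use: init + device setup
  // ... run workload ...
  DeviceManager::Instance().Finalize();  // called once before exit
  return 0;
}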