Commit ff4654e2 (unverified)

refactor npu device manager (#31154)

Authored on Feb 25, 2021 by Leo Chen; committed via GitHub on Feb 25, 2021.
Parent: 1435b4c0
Showing 7 changed files with 81 additions and 44 deletions (+81 −44).
paddle/fluid/platform/enforce.h                      +3   -2
paddle/fluid/platform/flags.cc                       +12  -2
paddle/fluid/platform/npu_info.cc                    +39  -2
paddle/fluid/platform/npu_info.h                     +4   -35
paddle/fluid/pybind/global_value_getter_setter.cc    +11  -0
paddle/testing/paddle_gtest_main.cc                  +7   -2
python/paddle/fluid/__init__.py                      +5   -1
paddle/fluid/platform/enforce.h

@@ -1015,8 +1015,9 @@ DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE);
 }  // namespace details

 inline std::string build_npu_error_msg(aclError stat) {
-  std::string s = " ACL error, the error code is : " + stat;
-  return s;
+  std::ostringstream sout;
+  sout << " ACL error, the error code is : " << stat << ". ";
+  return sout.str();
 }

 #define PADDLE_ENFORCE_NPU_SUCCESS(COND) \
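The old line compiled but was wrong: aclError is an integer type, so "string literal" + stat is pointer arithmetic on the literal, not number formatting. The replacement streams the code through std::ostringstream instead. A minimal standalone sketch of the failure mode and the fix (hypothetical values, not Paddle code):

#include <iostream>
#include <sstream>
#include <string>

int main() {
  int stat = 5;  // stands in for an aclError status code

  // Buggy: const char* + int offsets the pointer by `stat`, silently
  // dropping the first 5 characters here (and reading out of bounds
  // for codes larger than the literal's length).
  std::string bad = " ACL error, the error code is : " + stat;
  std::cout << bad << "\n";  // prints "error, the error code is : "

  // Fixed: stream the integer so it is formatted as a number.
  std::ostringstream sout;
  sout << " ACL error, the error code is : " << stat << ". ";
  std::cout << sout.str() << "\n";
  return 0;
}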
paddle/fluid/platform/flags.cc

@@ -45,7 +45,8 @@ DEFINE_bool(check_nan_inf, false,
             "Checking whether operator produce NAN/INF or not. It will be "
             "extremely slow so please use this flag wisely.");

-// NOTE(zhiqiu): better to share the flags, otherwise we will have too many flags.
+// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
+// flags.
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
 /**

@@ -85,6 +86,14 @@ DEFINE_string(selected_gpus, "",
              "share-memory only.");
 #endif

+#if defined(PADDLE_WITH_ASCEND_CL)
+DEFINE_string(selected_npus, "",
+              "A list of device ids separated by comma, like: 0,1,2,3. "
+              "This option is useful when doing multi process training and "
+              "each process have only one device (NPU). If you want to use "
+              "all visible devices, set this to empty string.");
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 /**

@@ -378,7 +387,8 @@ DEFINE_double(
     "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
     "reserve the rest for page tables, etc");

-// NOTE(zhiqiu): better to share the flags, otherwise we will have too many flags.
+// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
+// flags.
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
 /**
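For readers unfamiliar with the gflags idiom used throughout these files: DEFINE_string creates the flag and its FLAGS_* storage in exactly one translation unit, while DECLARE_string (seen in npu_info.cc below) is an extern reference to it. A minimal sketch with a hypothetical flag name, not Paddle code:

// flags_demo.cc, build with: g++ flags_demo.cc -lgflags
#include <gflags/gflags.h>
#include <iostream>

// Defines the flag and its backing variable FLAGS_selected_devices.
DEFINE_string(selected_devices, "",
              "A list of device ids separated by comma, like: 0,1,2,3.");

// A different .cc file would reference it with:
//   DECLARE_string(selected_devices);  // extern declaration, no storage

int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  std::cout << "selected_devices = " << FLAGS_selected_devices << "\n";
  return 0;
}

Running ./a.out --selected_devices=0,1 prints the parsed value.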
paddle/fluid/platform/npu_info.cc

@@ -28,8 +28,8 @@ DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_bool(enable_cublas_tensor_op_math);
-DECLARE_string(selected_gpus);
 DECLARE_uint64(gpu_memory_limit_mb);
+DECLARE_string(selected_npus);

 constexpr static float fraction_reserve_gpu_memory = 0.05f;

@@ -78,7 +78,7 @@ std::vector<int> GetSelectedNPUDevices() {
   // use user specified NPUs in single-node multi-process mode.
   std::vector<int> devices;
   if (!FLAGS_selected_gpus.empty()) {
-    auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
+    auto devices_str = paddle::string::Split(FLAGS_selected_npus, ',');
     for (auto id : devices_str) {
       devices.push_back(atoi(id.c_str()));
     }

@@ -368,5 +368,42 @@ bool IsNPUMallocRecorded(int dev_id) {
   return RecordedNPUMallocHelper::Instance(dev_id)->NeedRecord();
 }

+AclInstance::~AclInstance() {}
+
+AclInstance &AclInstance::Instance() {
+  static AclInstance instance;
+  return instance;
+}
+
+AclInstance::AclInstance() {
+  PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr));
+  VLOG(4) << "Call aclrtSetDevice ";
+  // NOTE(zhiqiu): why set devices here?
+  // Because ACL creates a default context which contains 2 streams
+  // when calling aclrtSetDeviceId, so usually we do not need to
+  // create contexts explicitly. And, for each device, aclrtSetDeviceId
+  // needs to be called in pairs with aclrtResetDeviceId to destroy the
+  // default context. Here, we use this singleton and static instance to
+  // manage the devices to make sure they will be reset before program exit.
+  devices_ = platform::GetSelectedNPUDevices();
+  for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
+    SetNPUDeviceId(*it);
+    VLOG(4) << "Call aclrtSetDevice " << *it;
+  }
+}
+
+void AclInstance::Finalize() {
+  // NOTE(zhiqiu): DO NOT perform finalize in destructor
+  // to avoid problems caused by destructor order of static
+  // object.
+  for (size_t i = 0; i < devices_.size(); ++i) {
+    auto status = aclrtResetDevice(devices_[i]);
+    VLOG(4) << "Call aclrtResetDevice " << devices_[i]
+            << " status = " << status;
+  }
+  auto status = aclFinalize();
+  VLOG(4) << "Call aclFinalize, status = " << status;
+}
+
 }  // namespace platform
 }  // namespace paddle

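The two NOTEs describe a deliberate lifetime pattern: acquire devices in a Meyers singleton, but release them through an explicit Finalize() call rather than the destructor, because the destruction order of static objects across translation units is unspecified and the ACL runtime may already be torn down by the time a static destructor runs. A minimal sketch of the same pattern with a hypothetical device API (not ACL):

#include <cstdio>
#include <vector>

// Stand-ins for a device runtime (hypothetical, not the ACL API).
void runtime_init() { std::puts("runtime_init"); }
void set_device(int id) { std::printf("set_device %d\n", id); }
void reset_device(int id) { std::printf("reset_device %d\n", id); }
void runtime_finalize() { std::puts("runtime_finalize"); }

class DeviceManager {
 public:
  // Meyers singleton: constructed on first use, thread-safe since C++11.
  static DeviceManager& Instance() {
    static DeviceManager instance;
    return instance;
  }
  DeviceManager(const DeviceManager&) = delete;
  DeviceManager& operator=(const DeviceManager&) = delete;

  // Explicit teardown, invoked from a controlled shutdown point instead of
  // the destructor, so nothing depends on static destruction order.
  void Finalize() {
    for (int id : devices_) reset_device(id);
    runtime_finalize();
  }

 private:
  DeviceManager() {
    runtime_init();
    devices_ = {0, 1};  // would come from a selected-devices flag
    for (int id : devices_) set_device(id);
  }
  ~DeviceManager() = default;  // intentionally makes no runtime calls
  std::vector<int> devices_;
};

int main() {
  DeviceManager::Instance();             // initialize devices once
  DeviceManager::Instance().Finalize();  // deterministic teardown
  return 0;
}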
paddle/fluid/platform/npu_info.h

@@ -138,46 +138,15 @@ class AclInstance {
  public:
   // NOTE(zhiqiu): Commonly, exception in destructor is not recommended, so
   // no PADDLE_ENFORCE here, call acl API directly.
-  ~AclInstance() {}
+  ~AclInstance();
   AclInstance(const AclInstance &o) = delete;
   const AclInstance &operator=(const AclInstance &o) = delete;
-  static AclInstance &Instance() {
-    static AclInstance instance;
-    return instance;
-  }
-  void Finalize() {
-    // NOTE(zhiqiu): DO NOT perform finalize in destructor
-    // to avoid problems caused by destructor order of static
-    // object.
-    for (size_t i = 0; i < devices_.size(); ++i) {
-      auto status = aclrtResetDevice(devices_[i]);
-      VLOG(4) << "Call aclrtResetDevice " << devices_[i]
-              << " status = " << status;
-    }
-    auto status = aclFinalize();
-    VLOG(4) << "Call aclFinalize, status = " << status;
-  }
+  static AclInstance &Instance();
+  void Finalize();

  private:
   // forbid calling default constructor
-  AclInstance() {
-    PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr));
-    VLOG(4) << "Call aclrtSetDevice ";
-    // NOTE(zhiqiu): why set devices here?
-    // Because ACL creates a default context which contains 2 streams
-    // when calling aclrtSetDeviceId, so usually we do not need to
-    // create contexts explicitly. And, for each device, aclrtSetDeviceId
-    // needs to be called in pairs with aclrtResetDeviceId to destroy the
-    // default context. Here, we use this singleton and static instance to
-    // manage the devices to make sure they will be reset before program exit.
-    devices_ = platform::GetSelectedNPUDevices();
-    for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
-      SetNPUDeviceId(*it);
-      VLOG(4) << "Call aclrtSetDevice " << *it;
-    }
-  }
+  AclInstance();
   std::vector<int> devices_;
 };

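The net effect of this hunk is the classic interface/implementation split: the header now carries only the four declarations, and the bodies shown above in npu_info.cc become the single definition point. Presumably this is the core of the refactor in the commit title, since header consumers no longer need the ACL calls, VLOG, and PADDLE_ENFORCE_NPU_SUCCESS machinery that the inline bodies pulled in.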
paddle/fluid/pybind/global_value_getter_setter.cc

@@ -87,6 +87,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
 // others
 DECLARE_bool(sync_nccl_allreduce);
 #endif

+#ifdef PADDLE_WITH_ASCEND_CL
+// device management
+DECLARE_string(selected_npus);
+#endif
+
 #ifdef PADDLE_WITH_DISTRIBUTE
 DECLARE_int32(rpc_send_thread_num);
 DECLARE_int32(rpc_get_thread_num);

@@ -365,6 +371,11 @@ static void RegisterGlobalVarGetterSetter() {
                              FLAGS_reallocate_gpu_memory_in_mb,
                              FLAGS_enable_cublas_tensor_op_math,
                              FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce);
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_npus);
+#endif
+
 #ifdef PADDLE_WITH_DITRIBUTE
   REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_send_thread_num,
                              FLAGS_rpc_get_thread_num,

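REGISTER_PUBLIC_GLOBAL_VAR is how a C++ gflag becomes readable and settable from Python by name. A generic sketch of the underlying idea, a string-keyed registry of getter/setter closures, might look like this (simplified; not Paddle's actual macro, which additionally binds the registry through pybind11):

#include <functional>
#include <iostream>
#include <map>
#include <string>

// Simplified registry mapping a flag name to getter/setter closures.
struct VarRegistry {
  std::map<std::string, std::function<std::string()>> getters;
  std::map<std::string, std::function<void(const std::string&)>> setters;

  void Register(const std::string& name, std::string* var) {
    getters[name] = [var] { return *var; };
    setters[name] = [var](const std::string& v) { *var = v; };
  }
};

std::string FLAGS_selected_npus;  // stand-in for the real gflag

int main() {
  VarRegistry registry;
  registry.Register("FLAGS_selected_npus", &FLAGS_selected_npus);

  registry.setters["FLAGS_selected_npus"]("0,1,2,3");  // a "set" from Python
  std::cout << registry.getters["FLAGS_selected_npus"]() << "\n";  // 0,1,2,3
  return 0;
}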
paddle/testing/paddle_gtest_main.cc

@@ -39,7 +39,8 @@ int main(int argc, char** argv) {
   }
 #endif

-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_ASCEND_CL)
   envs.push_back("fraction_of_gpu_memory_to_use");
   envs.push_back("initial_gpu_memory_in_mb");
   envs.push_back("reallocate_gpu_memory_in_mb");

@@ -63,6 +64,10 @@ int main(int argc, char** argv) {
   undefok.push_back("initial_cpu_memory_in_mb");
 #endif

+#if defined(PADDLE_WITH_CUDA)
+  envs.push_back("selected_npus");
+#endif
+
   char* env_str = nullptr;
   if (envs.size() > 0) {
     std::string env_string = "--tryfromenv=";

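The collected names are handed to gflags as one synthetic argument, "--tryfromenv=<comma-joined names>", which makes gflags read each listed flag from an identically named FLAGS_* environment variable and silently skip any that are unset. A minimal sketch of the joining step (assumed shape; the real main also maintains an undefok list):

#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> envs = {"fraction_of_gpu_memory_to_use",
                                   "initial_gpu_memory_in_mb",
                                   "selected_npus"};

  // Join the flag names into a single gflags argument. With --tryfromenv,
  // gflags looks up FLAGS_<name> in the environment for each name and
  // applies it if present, without failing when it is absent.
  if (!envs.empty()) {
    std::string env_string = "--tryfromenv=";
    for (const auto& name : envs) env_string += name + ",";
    env_string.pop_back();  // drop the trailing comma
    std::cout << env_string << "\n";
  }
  return 0;
}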
python/paddle/fluid/__init__.py

@@ -218,7 +218,7 @@ def __bootstrap__():
         read_env_flags.append('tracer_mkldnn_ops_on')
         read_env_flags.append('tracer_mkldnn_ops_off')

-    if core.is_compiled_with_cuda():
+    if core.is_compiled_with_cuda() or core.is_compiled_with_npu():
         read_env_flags += [
             'fraction_of_gpu_memory_to_use',
             'initial_gpu_memory_in_mb',

@@ -234,6 +234,10 @@ def __bootstrap__():
             'local_exe_sub_scope_limit',
             'gpu_memory_limit_mb',
         ]

+    if core.is_compiled_with_npu():
+        read_env_flags += [
+            'selected_npus',
+        ]
+
     core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
     # don't init_p2p when in unittest to save time.
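In practice this closes the loop for users: with 'selected_npus' in read_env_flags and forwarded through --tryfromenv, an NPU build should pick up the FLAGS_selected_npus environment variable (for example FLAGS_selected_npus=0,1 set before importing paddle), mirroring how FLAGS_selected_gpus already selects GPUs in CUDA builds.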