Merge pull request #9821 from reyoung/feature/change_int64

Make cuda_helper.h Pass cpplint

Merge pull request #9821 from reyoung/feature/change_int64
Make cuda_helper.h Pass cpplint
718e1807 · Yu Yang · GitHub · 129859e7 · c64190ec · 718e1807
隐藏空白更改
内联并排

Showing 2 changed files with 22 additions and 15 deletions

paddle/fluid/platform/cuda_helper.h paddle/fluid/platform/cuda_helper.h +11 -7

paddle/fluid/platform/nccl_helper.h paddle/fluid/platform/nccl_helper.h +11 -8

未找到文件。
--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
@@ -33,22 +33,26 @@ constexpr int PADDLE_CUDA_NUM_THREADS = 512;
 USE_CUDA_ATOMIC(Add, float);
 USE_CUDA_ATOMIC(Add, int);
 USE_CUDA_ATOMIC(Add, unsigned int);
-USE_CUDA_ATOMIC(Add, unsigned long long int);
+// The CUDA API uses unsigned long long int; we cannot use uint64_t here
+// because unsigned long long int is not necessarily the same as uint64_t.
+USE_CUDA_ATOMIC(Add, unsigned long long int);  // NOLINT

 CUDA_ATOMIC_WRAPPER(Add, int64_t) {
-  static_assert(sizeof(int64_t) == sizeof(long long int),
+  // Here, we check that long long int has the same size as int64_t.
+  static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
                "long long should be int64");
-  return CudaAtomicAdd(reinterpret_cast<unsigned long long int*>(address),
-                       static_cast<unsigned long long int>(val));
+  return CudaAtomicAdd(
+      reinterpret_cast<unsigned long long int*>(address),  // NOLINT
+      static_cast<unsigned long long int>(val));           // NOLINT
 }

 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
 USE_CUDA_ATOMIC(Add, double);
 #else
 CUDA_ATOMIC_WRAPPER(Add, double) {
-  unsigned long long int* address_as_ull =
-      reinterpret_cast<unsigned long long int*>(address);
-  unsigned long long int old = *address_as_ull, assumed;
+  unsigned long long int* address_as_ull =                 // NOLINT
+      reinterpret_cast<unsigned long long int*>(address);  // NOLINT
+  unsigned long long int old = *address_as_ull, assumed;   // NOLINT

  do {
    assumed = old;

--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -61,7 +61,7 @@ struct NCCLContext {
  ncclComm_t comm_;

  explicit NCCLContext(int dev_id)
-      : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))) {}
+      : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}

  cudaStream_t stream() const { return ctx_->stream(); }

@@ -95,6 +95,7 @@ struct NCCLContextMap {
  std::vector<int> order_;

  explicit NCCLContextMap(const std::vector<platform::Place> &places) {
+    PADDLE_ENFORCE(!places.empty());
    order_.reserve(places.size());
    for (auto &p : places) {
      int dev_id = boost::get<CUDAPlace>(p).device;
@@ -105,15 +106,17 @@ struct NCCLContextMap {
        order_.size(), contexts_.size(),
        "NCCL Context Map does not support contain two or more same device");

-    std::vector<ncclComm_t> comms;
-    comms.resize(order_.size());
+    if (places.size() > 1) {
+      std::vector<ncclComm_t> comms;
+      comms.resize(order_.size());

-    PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
-        &comms[0], static_cast<int>(order_.size()), &order_[0]));
+      PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
+          &comms[0], static_cast<int>(order_.size()), &order_[0]));

-    int i = 0;
-    for (auto &dev_id : order_) {
-      contexts_.at(dev_id).comm_ = comms[i++];
+      int i = 0;
+      for (auto &dev_id : order_) {
+        contexts_.at(dev_id).comm_ = comms[i++];
+      }
    }
  }