diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h index 881d611d4ac26f992036f639097815aff625227b..8758af0804ae08fec6fa64d7387f197f046ce20e 100644 --- a/paddle/fluid/platform/cuda_helper.h +++ b/paddle/fluid/platform/cuda_helper.h @@ -33,22 +33,26 @@ constexpr int PADDLE_CUDA_NUM_THREADS = 512; USE_CUDA_ATOMIC(Add, float); USE_CUDA_ATOMIC(Add, int); USE_CUDA_ATOMIC(Add, unsigned int); -USE_CUDA_ATOMIC(Add, unsigned long long int); +// CUDA API uses unsigned long long int, we cannot use uint64_t here. +// It because unsigned long long int is not necessarily uint64_t +USE_CUDA_ATOMIC(Add, unsigned long long int); // NOLINT CUDA_ATOMIC_WRAPPER(Add, int64_t) { - static_assert(sizeof(int64_t) == sizeof(long long int), + // Here, we check long long int must be int64_t. + static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT "long long should be int64"); - return CudaAtomicAdd(reinterpret_cast(address), - static_cast(val)); + return CudaAtomicAdd( + reinterpret_cast(address), // NOLINT + static_cast(val)); // NOLINT } #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 USE_CUDA_ATOMIC(Add, double); #else CUDA_ATOMIC_WRAPPER(Add, double) { - unsigned long long int* address_as_ull = - reinterpret_cast(address); - unsigned long long int old = *address_as_ull, assumed; + unsigned long long int* address_as_ull = // NOLINT + reinterpret_cast(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT do { assumed = old; diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 3a2a423486170320d82aea7ad1e97138c2df7e69..ca9ab2c7aecff47924f0198802d710b7661f5576 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -61,7 +61,7 @@ struct NCCLContext { ncclComm_t comm_; explicit NCCLContext(int dev_id) - : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))) {} + : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {} cudaStream_t stream() const { return ctx_->stream(); } @@ -95,6 +95,7 @@ struct NCCLContextMap { std::vector order_; explicit NCCLContextMap(const std::vector &places) { + PADDLE_ENFORCE(!places.empty()); order_.reserve(places.size()); for (auto &p : places) { int dev_id = boost::get(p).device; @@ -105,15 +106,17 @@ struct NCCLContextMap { order_.size(), contexts_.size(), "NCCL Context Map does not support contain two or more same device"); - std::vector comms; - comms.resize(order_.size()); + if (places.size() > 1) { + std::vector comms; + comms.resize(order_.size()); - PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( - &comms[0], static_cast(order_.size()), &order_[0])); + PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( + &comms[0], static_cast(order_.size()), &order_[0])); - int i = 0; - for (auto &dev_id : order_) { - contexts_.at(dev_id).comm_ = comms[i++]; + int i = 0; + for (auto &dev_id : order_) { + contexts_.at(dev_id).comm_ = comms[i++]; + } } }