Unverified · Commit 718e1807 authored by Yu Yang, committed by GitHub

Merge pull request #9821 from reyoung/feature/change_int64

Make cuda_helper.h Pass cpplint
```diff
@@ -33,22 +33,26 @@ constexpr int PADDLE_CUDA_NUM_THREADS = 512;
 USE_CUDA_ATOMIC(Add, float);
 USE_CUDA_ATOMIC(Add, int);
 USE_CUDA_ATOMIC(Add, unsigned int);
-USE_CUDA_ATOMIC(Add, unsigned long long int);
+// The CUDA API uses unsigned long long int; we cannot use uint64_t here,
+// because unsigned long long int is not necessarily uint64_t.
+USE_CUDA_ATOMIC(Add, unsigned long long int);  // NOLINT

 CUDA_ATOMIC_WRAPPER(Add, int64_t) {
-  static_assert(sizeof(int64_t) == sizeof(long long int),
+  // Here, we check that long long int must be int64_t.
+  static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
                 "long long should be int64");
-  return CudaAtomicAdd(reinterpret_cast<unsigned long long int*>(address),
-                       static_cast<unsigned long long int>(val));
+  return CudaAtomicAdd(
+      reinterpret_cast<unsigned long long int*>(address),  // NOLINT
+      static_cast<unsigned long long int>(val));           // NOLINT
 }

 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
 USE_CUDA_ATOMIC(Add, double);
 #else
 CUDA_ATOMIC_WRAPPER(Add, double) {
-  unsigned long long int* address_as_ull =
-      reinterpret_cast<unsigned long long int*>(address);
-  unsigned long long int old = *address_as_ull, assumed;
+  unsigned long long int* address_as_ull =                 // NOLINT
+      reinterpret_cast<unsigned long long int*>(address);  // NOLINT
+  unsigned long long int old = *address_as_ull, assumed;   // NOLINT
   do {
     assumed = old;
```
......
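The hunk is truncated just as the compare-and-swap loop begins. For context, the wrapper follows the standard pre-sm_60 emulation of `atomicAdd` on `double` from NVIDIA's CUDA C Programming Guide. The sketch below fills in the rest of that loop under that assumption; the standalone function name `CudaAtomicAddDouble` is illustrative (in the actual header the `CUDA_ATOMIC_WRAPPER` macro presumably expands to a `CudaAtomicAdd` overload), not the exact PaddlePaddle code.

```cpp
// Sketch of the classic CAS loop for atomicAdd on double before compute
// capability 6.0, which the truncated wrapper above follows.
__device__ double CudaAtomicAddDouble(double* address, double val) {
  // Reinterpret the double's storage as a 64-bit integer word, since
  // atomicCAS has no double overload on pre-sm_60 hardware.
  unsigned long long int* address_as_ull =
      reinterpret_cast<unsigned long long int*>(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    // Try to swap in the updated sum; atomicCAS returns the word that was
    // actually at *address, so the loop retries whenever another thread
    // intervened between the read and the swap.
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
```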
```diff
@@ -61,7 +61,7 @@ struct NCCLContext {
   ncclComm_t comm_;

   explicit NCCLContext(int dev_id)
-      : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))) {}
+      : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}

   cudaStream_t stream() const { return ctx_->stream(); }
```
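The added `comm_{nullptr}` initializer matters because `ncclComm_t` is an opaque pointer type (`typedef struct ncclComm* ncclComm_t` in nccl.h), so a member left out of the initializer list holds an indeterminate value rather than null. A minimal sketch of the before/after, with the hypothetical name `Ctx` standing in for `NCCLContext`:

```cpp
#include <nccl.h>

struct Ctx {
  ncclComm_t comm_;          // without an initializer: indeterminate,
                             // so `if (comm_)` checks would be UB
  Ctx() : comm_{nullptr} {}  // the fix: value-initialize to nullptr
};
```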
```diff
@@ -95,6 +95,7 @@ struct NCCLContextMap {
   std::vector<int> order_;

   explicit NCCLContextMap(const std::vector<platform::Place> &places) {
+    PADDLE_ENFORCE(!places.empty());
     order_.reserve(places.size());
     for (auto &p : places) {
       int dev_id = boost::get<CUDAPlace>(p).device;
```
```diff
@@ -105,15 +106,17 @@ struct NCCLContextMap {
         order_.size(), contexts_.size(),
         "NCCL Context Map does not support contain two or more same device");

-    std::vector<ncclComm_t> comms;
-    comms.resize(order_.size());
-
-    PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
-        &comms[0], static_cast<int>(order_.size()), &order_[0]));
-
-    int i = 0;
-    for (auto &dev_id : order_) {
-      contexts_.at(dev_id).comm_ = comms[i++];
+    if (places.size() > 1) {
+      std::vector<ncclComm_t> comms;
+      comms.resize(order_.size());
+      PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
+          &comms[0], static_cast<int>(order_.size()), &order_[0]));
+      int i = 0;
+      for (auto &dev_id : order_) {
+        contexts_.at(dev_id).comm_ = comms[i++];
+      }
     }
   }
```
......
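Taken together, the NCCL hunks make the context map robust for the single-GPU case: `comm_` now defaults to `nullptr`, an empty place list is rejected up front, and `ncclCommInitAll` is only invoked when there is more than one device, since with a single device there is nothing to synchronize. The sketch below shows the underlying single-process, multi-GPU initialization pattern that `NCCLContextMap` wraps; the standalone `main()` scaffolding and error handling are illustrative assumptions, not code from this PR.

```cpp
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <nccl.h>

int main() {
  int n_dev = 0;
  cudaGetDeviceCount(&n_dev);

  // Mirror of the patched constructor's guard: with one device, skip
  // communicator creation entirely and leave the handles null.
  if (n_dev > 1) {
    // Device ordinals play the role of NCCLContextMap's order_ vector.
    std::vector<int> order(n_dev);
    for (int i = 0; i < n_dev; ++i) order[i] = i;

    // ncclCommInitAll creates one communicator per listed device within
    // a single process, which is what PADDLE_ENFORCE checks above.
    std::vector<ncclComm_t> comms(n_dev);
    ncclResult_t ret = ncclCommInitAll(comms.data(), n_dev, order.data());
    if (ret != ncclSuccess) {
      std::fprintf(stderr, "ncclCommInitAll failed: %s\n",
                   ncclGetErrorString(ret));
      return 1;
    }
    for (auto comm : comms) ncclCommDestroy(comm);
  }
  return 0;
}
```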