diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt
index 0cfc82709637f2a1b2f4349ed2a543a3b082309f..21da7a0560ee39ca71540a738424df3e3581f34f 100644
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -14,7 +14,7 @@ if(WITH_DISTRIBUTE)
     DEPS phi_api eager_api gloo_wrapper)
 endif()
 
-if(WITH_NCCL)
+if(WITH_NCCL OR WITH_RCCL)
   cc_library(
     processgroup_nccl
     SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc
diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h
index f38ce8faa7ffb53c8e80855bdd4d6c946767748e..5f1da003313ad36f76282e5b6c48b203c59aa81a 100644
--- a/paddle/fluid/distributed/collective/NCCLTools.h
+++ b/paddle/fluid/distributed/collective/NCCLTools.h
@@ -14,7 +14,13 @@
 #pragma once
 
+#ifdef PADDLE_WITH_CUDA
 #include <cuda_runtime.h>
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
 
 #include <error.h>
 #include <string>
@@ -23,9 +29,19 @@
 #include "paddle/fluid/distributed/collective/Types.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/variable.h"
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
+
 #include "paddle/fluid/platform/device_context.h"
+
+#ifdef PADDLE_WITH_RCCL
+#include "paddle/fluid/platform/dynload/rccl.h"
+#else
 #include "paddle/fluid/platform/dynload/nccl.h"
+#endif
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -56,7 +72,11 @@ class EventManager {
   ~EventManager() {
     if (is_created_) {
       platform::CUDADeviceGuard guard(device_index_);
+#ifdef PADDLE_WITH_HIP
+      hipEventDestroy(event_);
+#else
       cudaEventDestroy(event_);
+#endif
     }
   }
 
@@ -94,24 +114,42 @@ class EventManager {
                           "Event's device %d",
                           device_index, device_index_));
     platform::CUDADeviceGuard guard(device_index_);
+#ifdef PADDLE_WITH_CUDA
     PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, ctx.stream()));
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, ctx.stream()));
+#endif
   }
 
   bool Query() const {
+#ifdef PADDLE_WITH_HIP
+    gpuError_t err = hipEventQuery(event_);
+    if (err == hipSuccess) {
+      return true;
+    }
+    if (err == hipErrorNotReady) {
+      return false;
+    }
+#else
     gpuError_t err = cudaEventQuery(event_);
     if (err == cudaSuccess) {
       return true;
-    } else if (err == cudaErrorNotReady) {
-      return false;
-    } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(err);
+    }
+    if (err == cudaErrorNotReady) {
       return false;
     }
+#endif
+    PADDLE_ENFORCE_GPU_SUCCESS(err);
+    return false;
   }
 
   void Synchronize() const {
     if (is_created_) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_));
+#else
       PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_));
+#endif
     }
   }
 
@@ -124,12 +162,22 @@ class EventManager {
                           "Event's device %d",
                           device_index, device_index_));
       platform::CUDADeviceGuard guard(device_index_);
+
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(ctx.stream(), event_, 0));
+#else
       PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(ctx.stream(), event_, 0));
+#endif
     }
   }
 
  private:
+#ifdef PADDLE_WITH_HIP
+  unsigned int flags_ = hipEventDefault;
+#else
   unsigned int flags_ = cudaEventDefault;
+#endif
+
   bool is_created_{false};
   gpuEvent_t event_{};
   int8_t device_index_{0};
@@ -138,7 +186,13 @@ class EventManager {
   void CreateEvent(int device_index) {
     device_index_ = device_index;
     platform::CUDADeviceGuard guard(device_index);
+
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags(&event_, flags_));
+#else
     PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags(&event_, flags_));
+#endif
+
     is_created_ = true;
   }
 };
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index dc67205c78f56d8d7012a42dd7383413d73d0692..793f8dacbf8d4a79e138c7b85ffe0453e94af750 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -95,7 +95,11 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) {
     // If we use the work to do barrier, we should block cpu
     for (auto& place : places_) {
       platform::CUDADeviceGuard gpuGuard(place);
+#ifdef PADDLE_WITH_CUDA
       PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
+#endif
     }
   }
   return true;
 }
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
index 2325e645b4c46e184b68dfe1fd6341622cd3ac31..c56f75b46518c26e33b1d3066c59eb5c527fc544 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
@@ -30,8 +30,13 @@
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/stream/cuda_stream.h"
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/distributed/collective/NCCLTools.h"
+#endif
+
+#ifdef PADDLE_WITH_RCCL
+#include "paddle/fluid/platform/dynload/rccl.h"
+#else
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif
 