Unverified · Commit c8fac5ee authored by Qi Li, committed by GitHub

[ROCM] update fluid framework for rocm (part5), test=develop (#31014)

Parent 580447d0
@@ -13,7 +13,7 @@
 // limitations under the License.
 #include <functional>
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 #include "gflags/gflags.h"
@@ -53,7 +53,7 @@ void XPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
 }
 #endif
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
     const platform::CUDAPlace &place, size_t max_memory_size)
     : GarbageCollector(place, max_memory_size) {}
@@ -82,18 +82,27 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
                                                size_t max_memory_size)
     : GarbageCollector(place, max_memory_size) {
   platform::CUDADeviceGuard guard(place.device);
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream_));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_));
+#endif
   callback_manager_.reset(new platform::StreamCallbackManager(stream_));
 }
 StreamGarbageCollector::~StreamGarbageCollector() {
   auto place = BOOST_GET_CONST(platform::CUDAPlace, this->dev_ctx_->GetPlace());
   platform::CUDADeviceGuard guard(place.device);
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_));
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream_));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_));
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
+#endif
 }
-cudaStream_t StreamGarbageCollector::stream() const { return stream_; }
+gpuStream_t StreamGarbageCollector::stream() const { return stream_; }
 void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); }
......
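Note: the HIP/CUDA split above repeats one pattern — same call shape, different runtime prefix. Below is a minimal standalone sketch of that stream create/destroy pattern, not Paddle code: WITH_HIP, GPU_CHECK and the gpuStream_t alias are stand-ins for Paddle's PADDLE_WITH_HIP, PADDLE_ENFORCE_CUDA_SUCCESS and paddle::gpuStream_t, and only raw runtime APIs are used.

// Sketch only: error handling simplified to assert().
#include <cassert>
#ifdef WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
#define GPU_CHECK(expr) assert((expr) == hipSuccess)
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#define GPU_CHECK(expr) assert((expr) == cudaSuccess)
#endif

// Create a non-default stream on the current device.
gpuStream_t CreateGpuStream() {
  gpuStream_t stream;
#ifdef WITH_HIP
  GPU_CHECK(hipStreamCreate(&stream));
#else
  GPU_CHECK(cudaStreamCreate(&stream));
#endif
  return stream;
}

// Drain the stream, then destroy it (mirrors the destructor above).
void DestroyGpuStream(gpuStream_t stream) {
#ifdef WITH_HIP
  GPU_CHECK(hipStreamSynchronize(stream));
  GPU_CHECK(hipStreamDestroy(stream));
#else
  GPU_CHECK(cudaStreamSynchronize(stream));
  GPU_CHECK(cudaStreamDestroy(stream));
#endif
}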
@@ -80,7 +80,7 @@ class XPUGarbageCollector : public GarbageCollector {
 };
 #endif
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 class UnsafeFastGPUGarbageCollector : public GarbageCollector {
  public:
  UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place,
@@ -110,13 +110,13 @@ class StreamGarbageCollector : public GarbageCollector {
  void Wait() const override;
-  cudaStream_t stream() const;
+  gpuStream_t stream() const;
  protected:
  void ClearCallback(const std::function<void()> &callback) override;
  private:
-  cudaStream_t stream_;
+  gpuStream_t stream_;
  std::unique_ptr<platform::StreamCallbackManager> callback_manager_;
 };
......
@@ -152,7 +152,7 @@ class HeterObjectPool {
   std::lock_guard<std::mutex> lock(mutex_);
   if (pool_.empty()) {
     num_ += 1;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     VLOG(0) << "pool construct size: " << num_;
 #endif
     return std::make_shared<T>();
......
@@ -21,9 +21,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 #include "paddle/fluid/framework/trainer.h"
-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
+     defined PADDLE_WITH_XPU) && \
     (defined PADDLE_WITH_PSLIB)
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 namespace paddle {
@@ -48,16 +49,25 @@ void HeterBoxTrainer::Initialize(const TrainerDesc& trainer_desc,
       dataset->GetReaders();
   for (int i = 0; i < place_num; ++i) {
     int num = trainer_desc.worker_places(i);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     platform::CUDAPlace place = platform::CUDAPlace(num);
     platform::CUDADeviceGuard guard(place.device);
-    cudaStream_t stream;
+    gpuStream_t stream;
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
+#endif
     copy_streams_.push_back(stream);
     places_.push_back(place);
-    cudaEvent_t event;
+    gpuEvent_t event;
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        hipEventCreateWithFlags(&event, hipEventDisableTiming));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+#endif
     events_.push_back(event);
 #endif
 #ifdef PADDLE_WITH_XPU
@@ -140,8 +150,13 @@ void HeterBoxTrainer::InitTrainerEnv(const ProgramDesc& main_program,
         _ForEachDataType_(HeterMemcpyFunc);
       }
     }
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream));
+    hipEventSynchronize(event);
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
     cudaEventSynchronize(event);
+#endif
   }
   place_ = place;
 }
@@ -150,7 +165,7 @@ template <typename T>
 void HeterBoxTrainer::HeterMemCpy(LoDTensor* thread_tensor,
                                   LoDTensor* root_tensor,
                                   const paddle::platform::Place& thread_place,
-                                  cudaStream_t stream) {
+                                  gpuStream_t stream) {
   T* thread_ptr =
       thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
   T* root_ptr = root_tensor->data<T>();
@@ -171,7 +186,7 @@ void HeterBoxTrainer::InitOtherEnv(const ProgramDesc& main_program) {
   for (size_t i = 0; i < places_.size(); ++i) {
     pull_dense_worker_->AddThreadScope(workers_[i]->GetThreadScope());
     pull_dense_worker_->AddPlace(places_[i]);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     pull_dense_worker_->AddStream(copy_streams_[i]);
 #endif
   }
......
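Note: the HeterBoxTrainer hunks above record a timing-disabled event on a copy stream and then block on it. A rough standalone sketch of that wait-for-stream pattern follows; WITH_HIP is a stand-in for the real build flag, and error checking is omitted for brevity.

#ifdef WITH_HIP
#include <hip/hip_runtime.h>
using gpuEvent_t = hipEvent_t;
using gpuStream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using gpuEvent_t = cudaEvent_t;
using gpuStream_t = cudaStream_t;
#endif

// Block the host until all work queued on `stream` so far has finished,
// using a timing-disabled event (cheaper than a full device sync).
void WaitForStream(gpuStream_t stream) {
  gpuEvent_t event;
#ifdef WITH_HIP
  hipEventCreateWithFlags(&event, hipEventDisableTiming);
  hipEventRecord(event, stream);
  hipEventSynchronize(event);
  hipEventDestroy(event);
#else
  cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
  cudaEventRecord(event, stream);
  cudaEventSynchronize(event);
  cudaEventDestroy(event);
#endif
}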
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <string>
 #include <utility>
 #include <vector>
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #endif
......
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <cuda.h>
-#include <cuda_runtime.h>
 #include <stdio.h>
 #include "gtest/gtest.h"
@@ -34,8 +32,14 @@ TEST(LoD, data) {
   auto& v = lod[0];
   paddle::platform::CUDAPlace gpu(0);
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0, v.CUDAMutableData(gpu),
+                     v.size());
+  hipDeviceSynchronize();
+#else
   test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size());
   cudaDeviceSynchronize();
+#endif
   for (size_t i = 0; i < v.size(); ++i) {
     EXPECT_EQ(v[i], i * 2);
   }
@@ -59,8 +63,14 @@ TEST(LoDTensor, LoDInGPU) {
   auto lod = lod_tensor.lod();
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(test, dim3(1), dim3(8), 0, 0,
+                     lod[0].CUDAMutableData(place), lod[0].size());
+  hipDeviceSynchronize();
+#else
   test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size());
   cudaDeviceSynchronize();
+#endif
   for (size_t i = 0; i < src_lod[0].size(); ++i) {
     EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
......
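Note: the test rewrites above (and the mixed_vector tests further down) work around the launch-syntax difference: CUDA's <<<grid, block, sharedMem, stream>>> chevrons versus HIP's hipLaunchKernelGGL(kernel, grid, block, sharedMem, stream, args...) macro. A minimal sketch of that wrapper pattern, assuming a WITH_HIP stand-in flag and with error checking omitted:

#ifdef WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif

// Trivial single-thread kernel used only to demonstrate the launch syntax.
__global__ void scale_by_two(int* data, int n) {
  for (int i = 0; i < n; ++i) data[i] *= 2;
}

// Launch the kernel on a caller-supplied stream and wait for it.
void LaunchScaleByTwo(int* device_data, int n, gpuStream_t stream) {
#ifdef WITH_HIP
  hipLaunchKernelGGL(scale_by_two, dim3(1), dim3(1), /*sharedMem=*/0, stream,
                     device_data, n);
  hipStreamSynchronize(stream);
#else
  scale_by_two<<<1, 1, 0, stream>>>(device_data, n);
  cudaStreamSynchronize(stream);
#endif
}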
@@ -31,7 +31,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
......
@@ -12,7 +12,13 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef PADDLE_WITH_CUDA
 #include <cuda_runtime.h>
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
 #include <memory>
 #include "glog/logging.h"
@@ -22,6 +28,7 @@
 template <typename T>
 using vec = paddle::framework::Vector<T>;
+using gpuStream_t = paddle::gpuStream_t;
 static __global__ void multiply_10(int* ptr) {
   for (int i = 0; i < 10; ++i) {
@@ -29,7 +36,7 @@ static __global__ void multiply_10(int* ptr) {
   }
 }
-cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) {
+gpuStream_t GetCUDAStream(paddle::platform::CUDAPlace place) {
   return reinterpret_cast<const paddle::platform::CUDADeviceContext*>(
              paddle::platform::DeviceContextPool::Instance().Get(place))
       ->stream();
@@ -43,7 +50,12 @@ TEST(mixed_vector, GPU_VECTOR) {
   ASSERT_EQ(tmp.size(), 10UL);
   paddle::platform::CUDAPlace gpu(0);
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(multiply_10, dim3(1), dim3(1), 0, GetCUDAStream(gpu),
+                     tmp.MutableData(gpu));
+#else
   multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu));
+#endif
   for (int i = 0; i < 10; ++i) {
     ASSERT_EQ(tmp[i], i * 10);
@@ -64,11 +76,23 @@ TEST(mixed_vector, MultiGPU) {
   ASSERT_EQ(tmp.size(), 10UL);
   paddle::platform::CUDAPlace gpu0(0);
   paddle::platform::SetDeviceId(0);
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(multiply_10, dim3(1), dim3(1), 0, GetCUDAStream(gpu0),
+                     tmp.MutableData(gpu0));
+#else
   multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0));
+#endif
   paddle::platform::CUDAPlace gpu1(1);
   auto* gpu1_ptr = tmp.MutableData(gpu1);
   paddle::platform::SetDeviceId(1);
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(multiply_10, dim3(1), dim3(1), 0, GetCUDAStream(gpu1),
+                     gpu1_ptr);
+#else
   multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr);
+#endif
   for (int i = 0; i < 10; ++i) {
     ASSERT_EQ(tmp[i], i * 100);
   }
......
@@ -369,7 +369,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 // TODO(fengjiayi): The following macros
 // seems ugly, do we have better method?
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
 #define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU)
 #else
 #define USE_OP_KERNEL(op_type) \
......
@@ -193,7 +193,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
   try {
     VLOG(4) << place << " " << DebugStringEx(&scope);
     if (platform::is_gpu_place(place)) {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
       PADDLE_THROW(platform::errors::Unavailable(
           "Cannot run operator on place %s, please recompile paddle or "
           "reinstall Paddle with CUDA support.",
@@ -1166,6 +1166,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 #if defined(PADDLE_WITH_CUDA)
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError());
   VLOG(4) << "Operator(" << Type() << "): context wait and get last error";
+#endif
+#if defined(PADDLE_WITH_HIP)
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError());
+  VLOG(4) << "Operator(" << Type() << "): context wait and get last error";
 #endif
 }
......
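Note: the operator.cc hunk above adds a HIP counterpart to the post-run "get last error" check, so a failed kernel launch surfaces near the operator that caused it. A standalone sketch of that check, using plain stderr instead of Paddle's error macro (WITH_HIP is a stand-in for the real build flag):

#include <cstdio>
#ifdef WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

// Query the runtime's sticky "last error" after launching kernels for an
// operator; report it together with the operator name.
void CheckLastGpuError(const char* op_name) {
#ifdef WITH_HIP
  hipError_t err = hipGetLastError();
  if (err != hipSuccess) {
    std::fprintf(stderr, "%s: %s\n", op_name, hipGetErrorString(err));
  }
#else
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    std::fprintf(stderr, "%s: %s\n", op_name, cudaGetErrorString(err));
  }
#endif
}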
@@ -384,7 +384,7 @@ class ExecutionContext {
     return device_context_;
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   const inline platform::CUDADeviceContext& cuda_device_context() const {
     PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()), true,
                       platform::errors::PreconditionNotMet(
......
@@ -37,7 +37,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/event.h"
 #include "paddle/fluid/platform/profiler.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
@@ -60,7 +60,7 @@ static std::once_flag gProfileOnce;
 static bool gProfileStarted = false;
 #endif
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 std::once_flag p2p_init_flag;
 #endif
@@ -132,7 +132,7 @@ class ParallelExecutorPrivate {
     }
   }
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) {
     VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_
             << ", num_trainers:" << bst.num_trainers_
@@ -371,7 +371,7 @@ class ParallelExecutorPrivate {
   std::unordered_map<std::string, bool> is_persistable_;
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   platform::NCCLCommunicator *nccl_ctxs_{nullptr};
 #elif defined(PADDLE_WITH_XPU_BKCL)
   platform::BKCLCommunicator *bkcl_ctxs_{nullptr};
@@ -483,7 +483,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
   }
   std::unique_ptr<GarbageCollector> gc;
   if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (IsFastEagerDeletionModeEnabled()) {
       gc.reset(new UnsafeFastGPUGarbageCollector(
           BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
@@ -572,7 +572,7 @@ bool ParallelExecutor::NeedCreateLocalExeScope() {
 }
 void InitP2P(const std::vector<platform::Place> &places) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   std::call_once(p2p_init_flag, [&]() {
     int count = places.size();
     if (count <= 1) return;
@@ -590,14 +590,24 @@ void InitP2P(const std::vector<platform::Place> &places) {
       for (int j = 0; j < count; ++j) {
         if (devices[i] == devices[j]) continue;
        int can_acess = -1;
+#ifdef PADDLE_WITH_HIP
+        hipError_t ret =
+            hipDeviceCanAccessPeer(&can_acess, devices[i], devices[j]);
+        if (ret != hipSuccess || can_acess != 1) {
+#else
        cudaError_t ret =
            cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]);
        if (ret != cudaSuccess || can_acess != 1) {
+#endif
          LOG(WARNING) << "Cannot enable P2P access from " << devices[i]
                       << " to " << devices[j];
        } else {
          platform::CUDADeviceGuard guard(devices[i]);
+#ifdef PADDLE_WITH_HIP
+          hipDeviceEnablePeerAccess(devices[j], 0);
+#else
          cudaDeviceEnablePeerAccess(devices[j], 0);
+#endif
        }
      }
    }
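Note: the InitP2P hunk above probes and enables peer-to-peer access for every ordered device pair under either runtime. A rough standalone sketch of that loop, with WITH_HIP standing in for the real build flag and failures reduced to a warning, matching the original behavior:

#include <cstdio>
#include <vector>
#ifdef WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

// For each ordered pair (src, dst), ask the runtime whether P2P is possible
// and, if so, enable access from src to dst.
void EnablePeerAccess(const std::vector<int>& devices) {
  for (int src : devices) {
    for (int dst : devices) {
      if (src == dst) continue;
      int can_access = 0;
#ifdef WITH_HIP
      if (hipDeviceCanAccessPeer(&can_access, src, dst) != hipSuccess ||
          can_access != 1) {
        std::fprintf(stderr, "Cannot enable P2P access from %d to %d\n", src, dst);
        continue;
      }
      hipSetDevice(src);
      hipDeviceEnablePeerAccess(dst, 0);
#else
      if (cudaDeviceCanAccessPeer(&can_access, src, dst) != cudaSuccess ||
          can_access != 1) {
        std::fprintf(stderr, "Cannot enable P2P access from %d to %d\n", src, dst);
        continue;
      }
      cudaSetDevice(src);
      cudaDeviceEnablePeerAccess(dst, 0);
#endif
    }
  }
}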
@@ -630,7 +640,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
         BuildStrategy::ReduceStrategy::kAllReduce;
     member_->use_all_reduce_ = true;
   }
-#if defined(PADDLE_WITH_CUDA) && defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32)
   if (member_->IsUseCUDA(member_->use_device_)) {
     PADDLE_ENFORCE_EQ(
         places.size(), 1,
@@ -638,7 +648,8 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   }
 #endif
-#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_NCCL)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL))
   if (member_->IsUseCUDA(member_->use_device_)) {
     PADDLE_ENFORCE_EQ(
         places.size(), 1,
@@ -710,7 +721,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   }
   if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_);
     // Initialize device context's nccl comm, will be used by normal
@@ -774,7 +785,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
   // ncclOp
   std::vector<ir::Graph *> async_graphs(places.size());
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   if (member_->build_strategy_.async_mode_) {
     VLOG(3) << "use local async mode";
     graph = member_->build_strategy_.Apply(
@@ -885,7 +896,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
     final_graphs = async_graphs;
   } else if (member_->build_strategy_.enable_parallel_graph_) {
     VLOG(3) << "use ParallelSSAGraphExecutor";
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     // TODO(Yancey1989): Remove passing in the main_program when
     // allreduce_seq_pass doesn't need it as the attr.
     bool is_inference = details::IsDataParallelInferenceGraph(*graph);
@@ -996,7 +1007,7 @@ void ParallelExecutor::BCastParamsToDevices(
     }
     auto &dims = main_tensor.dims();
     if (paddle::platform::is_gpu_place(main_tensor.place())) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       std::vector<void *> buffers;
       buffers.reserve(member_->places_.size());
       size_t numel = main_tensor.numel();
......
@@ -32,7 +32,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
......
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/trainer.h"
......
@@ -24,7 +24,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
 #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
 #include "paddle/fluid/framework/trainer.h"
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 namespace paddle {
......
@@ -19,7 +19,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/string/string_helper.h"
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #if defined _WIN32 || defined __APPLE__
......
@@ -59,17 +59,19 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
     current_version_[tid] = 0;
   }
   fleet_ptr_ = FleetWrapper::GetInstance();
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   copy_streams_.clear();
 #endif
-#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
   places_.clear();
   thread_scopes_.clear();
 #endif
 }
 void PullDenseWorker::CreatePinVar() {
-#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
   // for (auto& v : dense_value_names_) {
   //   for (auto& name : v.second) {
   for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size();
@@ -84,7 +86,7 @@ void PullDenseWorker::CreatePinVar() {
       auto* ptr = root_scope_->Var(name + "pin");
       InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
      LoDTensor* pin_tensor = ptr->GetMutable<LoDTensor>();
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      pin_tensor->mutable_data<float>(tensor->dims(),
                                      platform::CUDAPinnedPlace());
 #endif
@@ -113,7 +115,8 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
     exit(-1);
   }
   status_vec->resize(0);
-#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
   for (size_t i = 0; i < places_.size(); ++i) {
     // for (auto& v : dense_value_names_) {
@@ -131,7 +134,7 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
       Variable* var = thread_scopes_[i]->FindVar(name);
       LoDTensor* tensor = var->GetMutable<LoDTensor>();
       float* w = tensor->data<float>();
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w,
                    platform::CUDAPinnedPlace(), pin_w,
                    sizeof(float) * tensor->numel(), copy_streams_[i]);
@@ -161,7 +164,8 @@ void PullDenseWorker::PullDense(bool force_update) {
     uint64_t tid = static_cast<uint64_t>(
         dwp_param_.program_config(0).pull_dense_table_id(i));
     if (force_update || CheckUpdateParam(tid)) {
-#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
       VLOG(3) << "pull dense " << force_update << " " << tid;
       fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
                                      &pull_dense_status_, false);
......
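Note: the PullDenseWorker hunks above stage dense parameters in pinned (page-locked) host memory and copy them to the device through memory::Copy on a dedicated copy stream. As a rough standalone sketch of what such a copy amounts to at the runtime level (an assumption about memory::Copy, not a description of its implementation; WITH_HIP stands in for the real build flag and error checking is omitted):

#include <cstddef>
#ifdef WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif

// Asynchronously copy `numel` floats from pinned host memory to the device
// on a dedicated copy stream, then wait for the copy to complete.
void CopyPinnedToDevice(float* device_dst, const float* pinned_src,
                        size_t numel, gpuStream_t stream) {
#ifdef WITH_HIP
  hipMemcpyAsync(device_dst, pinned_src, numel * sizeof(float),
                 hipMemcpyHostToDevice, stream);
  hipStreamSynchronize(stream);
#else
  cudaMemcpyAsync(device_dst, pinned_src, numel * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);
#endif
}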
@@ -297,7 +297,7 @@ bool SaveTensorToDisk(const std::string& file_name,
         tensor->numel() * framework::SizeOfType(tensor->type());
     auto* data_ptr = tensor->data<void>();
     if (platform::is_gpu_place(tensor->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       framework::Tensor temp;
       TensorCopySync(*tensor, platform::CPUPlace(), &temp);
       data_ptr = temp.data<void>();
......
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include <float.h>
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
@@ -38,7 +38,7 @@ void SectionWorker::TrainFiles() {
   std::unique_ptr<GarbageCollector> gc;
   auto unused_vars_ = GetUnusedVars(program_->Block(0), ops_, skip_vars_);
   if (max_memory_size >= 0) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (platform::is_gpu_place(place_)) {
       if (IsFastEagerDeletionModeEnabled()) {
         gc.reset(new UnsafeFastGPUGarbageCollector(
@@ -70,7 +70,11 @@ void SectionWorker::TrainFiles() {
         }
       }
     }
+#ifdef PADDLE_WITH_RCCL
+    hipDeviceSynchronize();
+#else
     cudaDeviceSynchronize();
+#endif
   }
   // backward pass
@@ -89,7 +93,11 @@ void SectionWorker::TrainFiles() {
        }
      }
    }
+#ifdef PADDLE_WITH_RCCL
+    hipDeviceSynchronize();
+#else
    cudaDeviceSynchronize();
+#endif
  }
  // update pass
......