Unverified commit 50967135, authored by Qi Li, committed by GitHub

[ROCM] update fluid framework for rocm (part3), test=develop (#31011)

Parent: cf43a321
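Editor's note: the whole commit follows one mechanical pattern. Every `#ifdef PADDLE_WITH_CUDA` guard is widened to `#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)`, and wherever a concrete runtime call is made, an inner `#ifdef PADDLE_WITH_HIP` picks the HIP spelling; the error-check macro keeps its CUDA name (PADDLE_ENFORCE_CUDA_SUCCESS) on both paths. A minimal standalone sketch of the idiom, with a hypothetical MakeSyncEvent helper standing in for Paddle's wrappers:

    #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    #ifdef PADDLE_WITH_HIP
    #include <hip/hip_runtime.h>
    // HIP spelling; timing disabled makes the event a cheap sync-only marker.
    static hipEvent_t MakeSyncEvent() {
      hipEvent_t ev = nullptr;
      hipEventCreateWithFlags(&ev, hipEventDisableTiming);
      return ev;
    }
    #else
    #include <cuda_runtime.h>
    static cudaEvent_t MakeSyncEvent() {
      cudaEvent_t ev = nullptr;
      cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
      return ev;
    }
    #endif
    #endif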
@@ -45,7 +45,24 @@ if(WITH_GPU)
   endif()
   nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
   nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
+elseif(WITH_ROCM)
+  hip_library(nan_inf_utils SRCS nan_inf_utils_detail.cc nan_inf_utils_detail.cu DEPS framework_proto scope place)
+  hip_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+          dynload_cuda variable_visitor)
+  hip_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+          dynload_cuda variable_visitor place device_memory_aligment)
+  hip_library(grad_merge_all_reduce_op_handle SRCS grad_merge_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor
+          ddim memory dynload_cuda variable_visitor place device_memory_aligment all_reduce_op_handle fused_all_reduce_op_handle)
+  if(WITH_DISTRIBUTE)
+    hip_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim dynload_cuda selected_rows_functor)
+  else()
+    hip_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim dynload_cuda selected_rows_functor)
+  endif()
+  hip_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
+  hip_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 else()
   cc_library(nan_inf_utils SRCS nan_inf_utils_detail.cc DEPS framework_proto scope place)
   cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
@@ -118,7 +135,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass
         coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
         fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass
         sync_batch_norm_pass runtime_context_cache_pass)
-if(NOT APPLE AND NOT WIN32 AND WITH_GPU)
+if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM))
   set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass)
 endif()
 cc_library(build_strategy SRCS build_strategy.cc DEPS pass_builder ${IR_PASS_DEPS})
......
@@ -16,7 +16,7 @@
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/platform/profiler.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
@@ -40,15 +40,20 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
       place_(place),
       var_infos_(vars.begin(), vars.end()),
       gc_(gc) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place)) {
     dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
       platform::CUDADeviceGuard guard(
           BOOST_GET_CONST(platform::CUDAPlace, place).device);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipEventCreateWithFlags(&event_, hipEventDisableTiming));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+#endif
       PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument(
                                           "The cuda envet created is NULL."));
     }
@@ -64,17 +69,21 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
 }
 EagerDeletionOpHandle::~EagerDeletionOpHandle() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (event_) {
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace());
     platform::CUDADeviceGuard guard(gpu_place.device);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event_));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
+#endif
   }
 #endif
 }
 void OpHandleBase::InitCUDA() — see below; here:
 void EagerDeletionOpHandle::InitCUDA() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   int dev_id =
       BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device;
   events_[dev_id] = nullptr;
@@ -141,21 +150,27 @@ void EagerDeletionOpHandle::RunImpl() {
 void EagerDeletionOpHandle::ClearGarbages(
     std::deque<std::shared_ptr<memory::Allocation>> *garbages) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (event_) {
     auto compute_stream = dev_ctx_->stream();
    auto callback_stream =
        reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
    auto callback_func = [=]() {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, compute_stream));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipStreamWaitEvent(callback_stream, event_, 0));
+#else
      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream));
      PADDLE_ENFORCE_CUDA_SUCCESS(
          cudaStreamWaitEvent(callback_stream, event_, 0));
+#endif
    };
    gc_->Add(std::move(*garbages), callback_func);
  } else {
 #endif
    gc_->Add(std::move(*garbages));
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  }
 #endif
 }
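Note on ClearGarbages above: the callback records an event on the compute stream and makes the garbage collector's callback stream wait on it, so freeing cannot overtake kernels that may still read the buffers, and the host never blocks. A standalone HIP sketch of that ordering (error checks elided; the CUDA branch is identical modulo spelling):

    #include <hip/hip_runtime.h>

    // Enqueue a device-side dependency: work later queued on gc_stream
    // will not start until everything already queued on compute finishes.
    void OrderFreeAfterCompute(hipStream_t compute, hipStream_t gc_stream,
                               hipEvent_t ev) {
      hipEventRecord(ev, compute);           // snapshot of compute progress
      hipStreamWaitEvent(gc_stream, ev, 0);  // GPU-side wait; host returns at once
    }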
......
@@ -82,9 +82,9 @@ class EagerDeletionOpHandle : public OpHandleBase {
   std::vector<ir::MemOptVarInfo *> var_infos_;  // not own
   GarbageCollector *gc_;                        // not own
   std::vector<Variable *> vars_;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::CUDADeviceContext *dev_ctx_{nullptr};
-  cudaEvent_t event_{nullptr};
+  gpuEvent_t event_{nullptr};
 #endif
 };
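The member changes from `cudaEvent_t` to `gpuEvent_t` so that one declaration compiles under both toolchains. This presumably maps onto a backend-neutral alias that Paddle defines elsewhere; a sketch of what such an alias looks like (names assumed, not quoted from the tree):

    #ifdef PADDLE_WITH_HIP
    #include <hip/hip_runtime.h>
    using gpuEvent_t = hipEvent_t;    // assumed definition
    using gpuStream_t = hipStream_t;
    #else
    #include <cuda_runtime.h>
    using gpuEvent_t = cudaEvent_t;
    using gpuStream_t = cudaStream_t;
    #endif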
......
@@ -122,7 +122,7 @@ static void TransData(const framework::Tensor *src_item,
                       const platform::DeviceContext &ctx) {
   if (src_item->IsInitialized() && src_item->numel() > 0) {
     if (platform::is_gpu_place(src_item->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item);
 #endif
     } else {
......
@@ -114,7 +114,7 @@ static void TransData(const framework::LoDTensor &src_item,
                       framework::LoDTensor *dst_item) {
   if (src_item.IsInitialized() && src_item.numel() > 0) {
     if (platform::is_gpu_place(src_item.place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       TensorCopy(src_item, platform::CPUPlace(), dst_item);
 #endif
     } else {
......
@@ -26,7 +26,7 @@ namespace details {
 typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
     GradientAndLoDTensor;
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 FusedAllReduceOpHandle::FusedAllReduceOpHandle(
     ir::Node *node, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
......
@@ -33,7 +33,7 @@ namespace platform {
 class NCCLCommunicator;
 }  // namespace platform
 }  // namespace paddle
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #elif defined(PADDLE_WITH_XPU_BKCL)
@@ -44,7 +44,7 @@ namespace paddle {
 namespace framework {
 namespace details {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 struct FusedAllReduceOpHandle : public AllReduceOpHandle {
   FusedAllReduceOpHandle(ir::Node *node,
                          const std::vector<Scope *> &local_scopes,
......
@@ -36,7 +36,7 @@ struct NCCLContextMap;
 }  // namespace platform
 }  // namespace paddle
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -46,7 +46,7 @@ namespace details {
 struct FusedBroadcastOpHandle : public BroadcastOpHandle {
  public:
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   FusedBroadcastOpHandle(ir::Node *node,
                          const std::vector<Scope *> local_scopes,
                          const std::vector<platform::Place> &places,
......
@@ -57,7 +57,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
     nodes_.emplace_back(
         ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
     if (use_device_ == p::kCUDA) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       op_handle_ = new FusedBroadcastOpHandle(
           nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
@@ -166,7 +166,8 @@ TEST(FusedBroadcastTester, CPUSelectedRows) {
   test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
 }
-#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)
+#if (defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)) || \
+    (defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL))
 TEST(FusedBroadcastTester, GPULodTensor) {
   TestFusedBroadcastOpHandle test_op;
   std::vector<size_t> input_scope_idxes = {0, 1};
......
@@ -318,7 +318,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
           << ", place:" << tensor->place() << ", numel:" << tensor->numel();
   if (platform::is_gpu_place(tensor->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     tensor_check<platform::CUDADeviceContext>(op_type, var_name, *tensor,
                                               place);
 #else
......
@@ -82,9 +82,15 @@ __device__ __forceinline__ void PrintNanInfKernel(const T* value,
   }
   __syncthreads;
+#ifdef PADDLE_WITH_HIP
+  if (true && hipThreadIdx_x == 0) {
+    printf("In block %d, there has %u,%u,%u nan,inf,num\n", hipBlockIdx_x,
+           nan_count, inf_count, num_count);
+#else
   if (true && threadIdx.x == 0) {
     printf("In block %d, there has %u,%u,%u nan,inf,num\n", blockIdx.x,
            nan_count, inf_count, num_count);
+#endif
     PADDLE_ENFORCE(false, "===ERROR: in %s find nan or inf===", debug_info);
   }
 }
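In device code the builtins change spelling: `threadIdx.x`/`blockIdx.x` become `hipThreadIdx_x`/`hipBlockIdx_x`. Current HIP also accepts the CUDA-style builtins, but the macro forms were the portable choice for the hipcc versions Paddle targeted at the time, which is presumably why the kernel guards them explicitly. For reference, a trivial kernel written entirely in the HIP spellings:

    #include <hip/hip_runtime.h>

    // Same arithmetic as the usual blockIdx.x * blockDim.x + threadIdx.x.
    __global__ void FillThreadIds(int* out, int n) {
      int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
      if (tid < n) out[tid] = tid;
    }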
@@ -150,9 +156,15 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
                           "op_var2gpu_str, but now failed",
                           op_var));
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        hipMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
+                       hipMemcpyHostToDevice, dev_ctx->stream()));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
                         cudaMemcpyHostToDevice, dev_ctx->stream()));
+#endif
   } else {  // get
     auto iter = op_var2gpu_str.find(op_var);
     PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true,
@@ -168,8 +180,14 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
   size_t blocks =
       std::min(static_cast<size_t>(128),
                static_cast<size_t>((tensor_.numel() + threads - 1) / threads));
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(CheckNanInfKernel, dim3(blocks), dim3(threads), 0,
+                     dev_ctx->stream(), tensor_.data<T>(), tensor_.numel(),
+                     print_num, gpu_str_ptr);
+#else
   CheckNanInfKernel<<<blocks, threads, 0, dev_ctx->stream()>>>(
       tensor_.data<T>(), tensor_.numel(), print_num, gpu_str_ptr);
+#endif
 }
 template <>
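The launch site changes shape rather than just spelling: HIP replaces the `<<<grid, block, shmem, stream>>>` chevrons with `hipLaunchKernelGGL`, which takes the kernel and launch configuration as ordinary arguments followed by the kernel's own arguments. A small sketch with a hypothetical Scale kernel:

    #include <hip/hip_runtime.h>

    __global__ void Scale(float* data, int n, float k) {
      int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
      if (i < n) data[i] *= k;
    }

    void LaunchScale(float* data, int n, float k, hipStream_t stream) {
      dim3 blocks((n + 255) / 256), threads(256);
      // CUDA form:  Scale<<<blocks, threads, 0, stream>>>(data, n, k);
      hipLaunchKernelGGL(Scale, blocks, threads, /*sharedMem=*/0, stream,
                         data, n, k);
    }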
......
@@ -21,7 +21,12 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/nccl.h"
+#endif
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/dynload/rccl.h"
+#endif
 #include "paddle/fluid/platform/nccl_helper.h"
 DECLARE_bool(sync_nccl_allreduce);
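Only the include changes here because RCCL deliberately mirrors the NCCL API: the same `ncclComm_t`, `ncclAllReduce`, `ncclBcast`, and friends, so every call site below compiles unchanged. A hedged sketch of the selection (the rccl header path is an assumption about the ROCm packaging):

    #ifdef PADDLE_WITH_HIP
    #include <hip/hip_runtime.h>
    #include <rccl/rccl.h>           // assumed install path; sometimes <rccl.h>
    using gpuStream_t = hipStream_t;
    #else
    #include <cuda_runtime.h>
    #include <nccl.h>
    using gpuStream_t = cudaStream_t;
    #endif

    // Identical on both backends: RCCL keeps NCCL's names and signatures.
    ncclResult_t AllReduceSum(const float* send, float* recv, size_t n,
                              ncclComm_t comm, gpuStream_t stream) {
      return ncclAllReduce(send, recv, n, ncclFloat, ncclSum, comm, stream);
    }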
@@ -46,10 +51,18 @@ class NCCLOpHandleBase : public OpHandleBase {
   }
   virtual ~NCCLOpHandleBase() {
     for (auto& ev : inter_events_) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
+#endif
     }
     for (auto& ev : exter_events_) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
+#endif
     }
   }
   void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
@@ -95,10 +108,17 @@ class NCCLOpHandleBase : public OpHandleBase {
     }
     platform::SetDeviceId(dev_id);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipEventCreateWithFlags(
+        &inter_events_[dev_id], hipEventDisableTiming));
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipEventCreateWithFlags(
+        &exter_events_[dev_id], hipEventDisableTiming));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
         &inter_events_[dev_id], cudaEventDisableTiming));
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
         &exter_events_[dev_id], cudaEventDisableTiming));
+#endif
     VLOG(10) << "Create events on dev_id:" << dev_id
              << ", inter_event:" << &inter_events_[dev_id]
              << ", exter_event:" << &exter_events_[dev_id];
@@ -175,10 +195,18 @@ class NCCLOpHandleBase : public OpHandleBase {
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
         sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream));
+#ifdef PADDLE_WITH_HIP
+    hipEventRecord(inter_events_.at(dev_id), stream);
+#else
     cudaEventRecord(inter_events_.at(dev_id), stream);
+#endif
     if (FLAGS_sync_nccl_allreduce) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
+#endif
     }
   }
@@ -199,6 +227,18 @@ class NCCLOpHandleBase : public OpHandleBase {
             << ", dev_id:" << dev_id << ", dtype:" << datatype
             << ", place:" << place << ", stream:" << stream;
+#ifdef PADDLE_WITH_HIP
+    hipStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
+        sendbuff, recvbuff, count, datatype, op, comm, stream));
+    hipEventRecord(exter_events_.at(dev_id), stream);
+    if (FLAGS_sync_nccl_allreduce) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+    }
+#else
     cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
@@ -209,6 +249,7 @@ class NCCLOpHandleBase : public OpHandleBase {
     if (FLAGS_sync_nccl_allreduce) {
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
     }
+#endif
   }
   void InterBroadCast(platform::Place place, void* sendbuff, size_t count,
@@ -223,8 +264,11 @@ class NCCLOpHandleBase : public OpHandleBase {
             << ", numel:" << count << ", dev_id:" << dev_id
             << ", dtype:" << datatype << ", place:" << place
             << ", stream:" << stream;
+#ifdef PADDLE_WITH_HIP
+    hipStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
+#else
     cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
+#endif
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
         sendbuff, count, datatype, 0, comm, stream));
   }
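The hierarchical path above chains three collectives through the two event maps: `inter_events_` marks the end of the intra-node reduce so the cross-node allreduce can start, and `exter_events_` marks the cross-node result so the final broadcast can start, with no host synchronization unless FLAGS_sync_nccl_allreduce forces one. A condensed sketch of the chain, assuming one local and one cross-node communicator, each with its own stream (error checks elided):

    #include <hip/hip_runtime.h>
    #include <rccl/rccl.h>  // assumed path; RCCL provides the nccl* symbols

    void HierarchicalAllReduceSum(const float* send, float* recv, size_t n,
                                  ncclComm_t local, ncclComm_t cross,
                                  hipStream_t local_s, hipStream_t cross_s,
                                  hipEvent_t inter, hipEvent_t exter) {
      // 1) Reduce within the node onto a local root.
      ncclReduce(send, recv, n, ncclFloat, ncclSum, /*root=*/0, local, local_s);
      hipEventRecord(inter, local_s);
      // 2) Allreduce across nodes on a different stream; order via the event.
      hipStreamWaitEvent(cross_s, inter, 0);
      ncclAllReduce(recv, recv, n, ncclFloat, ncclSum, cross, cross_s);
      hipEventRecord(exter, cross_s);
      // 3) Broadcast the combined result back within the node.
      hipStreamWaitEvent(local_s, exter, 0);
      ncclBcast(recv, n, ncclFloat, /*root=*/0, local, local_s);
    }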
@@ -241,8 +285,8 @@ class NCCLOpHandleBase : public OpHandleBase {
  private:
   // hierarchical needed events
-  std::unordered_map<int, cudaEvent_t> inter_events_;
-  std::unordered_map<int, cudaEvent_t> exter_events_;
+  std::unordered_map<int, gpuEvent_t> inter_events_;
+  std::unordered_map<int, gpuEvent_t> exter_events_;
 };
 }  // namespace details
......
@@ -31,22 +31,31 @@ std::string OpHandleBase::DebugString() const {
 }
 OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   for (auto &ev : events_) {
     if (ev.second) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
+#endif
     }
   }
 #endif
 }
 void OpHandleBase::InitCUDA() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   for (auto &p : dev_ctxes_) {
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
     platform::SetDeviceId(dev_id);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
+#endif
   }
   if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
     for (auto &out_var : outputs_) {
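InitCUDA above builds one sync-only event per device; note the SetDeviceId before each creation, since an event is bound to the device that is current when it is created. The same loop in isolation, HIP spelling, error checks elided:

    #include <hip/hip_runtime.h>
    #include <unordered_map>
    #include <vector>

    void CreatePerDeviceEvents(const std::vector<int>& dev_ids,
                               std::unordered_map<int, hipEvent_t>* events) {
      for (int id : dev_ids) {
        hipSetDevice(id);  // the event below is tied to this device
        hipEventCreateWithFlags(&(*events)[id], hipEventDisableTiming);
      }
    }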
@@ -124,7 +133,7 @@ void OpHandleBase::InitXPU() {
 }
 void OpHandleBase::Run(DeviceType use_device) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (events_.empty() && use_device == p::kCUDA && dev_ctxes_.size() > 0) {
     InitCUDA();
   }
@@ -158,7 +167,7 @@ void OpHandleBase::Run(DeviceType use_device) {
 }
 void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_NOT_NULL(waited_ctx, platform::errors::InvalidArgument(
                                           "Argument waited_ctx is NULL."));
   if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
@@ -172,7 +181,11 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
     auto stream =
         static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
     for (auto &ev : events_) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0));
+#endif
     }
   }
 #else
@@ -203,12 +216,17 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
     if (in_var_handle) {
       auto &place = in_var_handle->place();
       if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
         auto stream =
             static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
                 ->stream();
+#ifdef PADDLE_WITH_HIP
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#else
         PADDLE_ENFORCE_CUDA_SUCCESS(
             cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#endif
 #else
         PADDLE_THROW(
             platform::errors::PreconditionNotMet("Not compiled with CUDA."));
@@ -226,13 +244,17 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
     if (in_var_handle) {
       auto &place = in_var_handle->place();
       if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
         platform::DeviceContextPool &pool =
             platform::DeviceContextPool::Instance();
         auto stream =
             static_cast<platform::CUDADeviceContext *>(pool.Get(place))
                 ->stream();
+#ifdef PADDLE_WITH_HIP
+        PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+#else
         PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
+#endif
 #else
         PADDLE_THROW(platform::errors::PreconditionNotMet(
             "Not compiled with CUDA."));
@@ -252,12 +274,17 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
     auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
     if (in_var_handle) {
       if (platform::is_gpu_place(in_var_handle->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
         auto stream = static_cast<platform::CUDADeviceContext *>(
                           dev_ctxes_.at(in_var_handle->place()))
                           ->stream();
+#ifdef PADDLE_WITH_HIP
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#else
         PADDLE_ENFORCE_CUDA_SUCCESS(
             cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#endif
 #else
         PADDLE_THROW(
             platform::errors::PreconditionNotMet("Not compiled with CUDA."));
@@ -285,14 +312,19 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
 void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
   callback();
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (!events_.empty()) {  // Use event
     for (auto &p : dev_ctxes_) {
       auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
       auto *cuda_dev_ctx = static_cast<platform::CUDADeviceContext *>(p.second);
       VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id;
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream()));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream()));
+#endif
     }
   }
 #endif
@@ -300,7 +332,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
 void OpHandleBase::RunAndRecordEvent(platform::Place p,
                                      const std::function<void()> &callback) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_cpu_place(p) || events_.empty()) {
     callback();
   } else {
......
@@ -157,8 +157,8 @@ class OpHandleBase {
   std::vector<Scope *> local_exec_scopes_;
   bool skip_running_ = false;
-#ifdef PADDLE_WITH_CUDA
-  std::unordered_map<int, cudaEvent_t> events_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::unordered_map<int, gpuEvent_t> events_;
 #endif
   DISABLE_COPY_AND_ASSIGN(OpHandleBase);
......
@@ -165,7 +165,7 @@ void ReduceOpHandle::RunImpl() {
       }
     });
   } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     auto pre_in = pre_in_var->Get<framework::LoDTensor>();
     VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
     VariableVisitor::GetMutableTensor(out_var).mutable_data(
......
@@ -40,7 +40,7 @@ namespace platform {
 struct NCCLContextMap;
 }  // namespace platform
 }  // namespace paddle
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #elif defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/bkcl_helper.h"
@@ -80,7 +80,7 @@ struct ReduceOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   const platform::NCCLContextMap *nccl_ctxs_;
   ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                  const std::vector<platform::Place> &places,
@@ -127,7 +127,8 @@ struct ReduceOpHandle : public OpHandleBase {
   std::vector<Scope *> GetLocalScopes() override { return local_scopes_; }
-#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP) && \
+    defined PADDLE_WITH_DISTRIBUTE
   template <typename DevCtx, typename DataType>
   void GatherSelectedRows(
       const std::vector<const SelectedRows *> &src_selecte_rows_,
......
@@ -40,7 +40,7 @@ struct TestReduceOpHandle {
   std::vector<p::Place> gpu_list_;
   std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
@@ -48,7 +48,7 @@ struct TestReduceOpHandle {
     for (size_t j = 0; j < ctxs_.size(); ++j) {
       ctxs_[j]->Wait();
     }
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     if (nccl_ctxs_) {
       nccl_ctxs_->WaitAll();
     }
@@ -58,7 +58,7 @@ struct TestReduceOpHandle {
   void InitCtxOnGpu(bool use_gpu) {
     use_gpu_ = use_gpu;
     if (use_gpu) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       int count = p::GetCUDADeviceCount();
       if (count <= 1) {
         LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -83,7 +83,7 @@ struct TestReduceOpHandle {
         gpu_list_.push_back(p);
         ctxs_.emplace_back(new p::CPUDeviceContext(p));
       }
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       nccl_ctxs_.reset(nullptr);
 #endif
     }
@@ -104,7 +104,7 @@ struct TestReduceOpHandle {
     nodes.emplace_back(new ir::Node("node"));
     if (use_gpu_) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
                                           gpu_list_, nccl_ctxs_.get()));
 #else
@@ -112,7 +112,7 @@
           platform::errors::PreconditionNotMet("Not compiled with NCLL."));
 #endif
     } else {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
                                           gpu_list_, nccl_ctxs_.get()));
 #else
@@ -296,7 +296,7 @@ TEST(ReduceTester, TestCPUReduceTestLodTensor) {
   test_op.InitReduceOp(out_scope_idx);
   test_op.TestReduceLodTensors(out_scope_idx);
 }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(ReduceTester, TestGPUReduceTestSelectedRows) {
   TestReduceOpHandle test_op;
......
@@ -70,7 +70,7 @@ struct ScaleLossGradFunctor {
         "Please recompile or reinstall Paddle with XPU support."));
 #endif
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     OutT cast_coeff = static_cast<OutT>(coeff_);
     auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
     memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), out_data,
@@ -95,7 +95,7 @@ void ScaleLossGradOpHandle::RunImpl() {
       local_exec_scopes_[0]->FindVar(var_name)->GetMutable<LoDTensor>();
   tensor->Resize(make_ddim({1}));
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_,
                             this->dev_ctxes_.at(place_));
   this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
......
@@ -84,7 +84,7 @@ void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) {
 }
 void ShareTensorBufferOpHandle::InitCUDA() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   int dev_id =
       BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device;
   events_[dev_id] = nullptr;
......