Unverified commit 50967135, authored by Qi Li, committed by GitHub

[ROCM] update fluid framework for rocm (part3), test=develop (#31011)

Parent cf43a321
......@@ -45,7 +45,24 @@ if(WITH_GPU)
endif()
nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
elseif(WITH_ROCM)
hip_library(nan_inf_utils SRCS nan_inf_utils_detail.cc nan_inf_utils_detail.cu DEPS framework_proto scope place)
hip_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor)
hip_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor place device_memory_aligment)
hip_library(grad_merge_all_reduce_op_handle SRCS grad_merge_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor
ddim memory dynload_cuda variable_visitor place device_memory_aligment all_reduce_op_handle fused_all_reduce_op_handle)
if(WITH_DISTRIBUTE)
hip_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor)
else()
hip_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor)
endif()
hip_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
hip_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
else()
cc_library(nan_inf_utils SRCS nan_inf_utils_detail.cc DEPS framework_proto scope place)
cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
......@@ -118,7 +135,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass
coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass
sync_batch_norm_pass runtime_context_cache_pass)
if(NOT APPLE AND NOT WIN32 AND WITH_GPU)
if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM))
set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass)
endif()
cc_library(build_strategy SRCS build_strategy.cc DEPS pass_builder ${IR_PASS_DEPS})
......
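Note: the hip_library targets added above are compiled for ROCm, and the WITH_ROCM build defines the source-level macros the rest of this patch keys on (PADDLE_WITH_HIP, plus PADDLE_WITH_RCCL when the RCCL collective library is enabled). A minimal sketch of the resulting guard idiom, illustrative only and not code from this commit:

#if defined(PADDLE_WITH_HIP)
// ROCm build: HIP runtime, RCCL for collectives
#elif defined(PADDLE_WITH_CUDA)
// CUDA build: CUDA runtime, NCCL for collectives
#else
// CPU-only build
#endif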
......@@ -16,7 +16,7 @@
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
......@@ -40,15 +40,20 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
place_(place),
var_infos_(vars.begin(), vars.end()),
gc_(gc) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place)) {
dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
platform::CUDADeviceGuard guard(
BOOST_GET_CONST(platform::CUDAPlace, place).device);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipEventCreateWithFlags(&event_, hipEventDisableTiming));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
#endif
PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument(
"The cuda envet created is NULL."));
}
......@@ -64,17 +69,21 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
}
EagerDeletionOpHandle::~EagerDeletionOpHandle() {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (event_) {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace());
platform::CUDADeviceGuard guard(gpu_place.device);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
#endif
}
#endif
}
void EagerDeletionOpHandle::InitCUDA() {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
int dev_id =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device;
events_[dev_id] = nullptr;
......@@ -141,21 +150,27 @@ void EagerDeletionOpHandle::RunImpl() {
void EagerDeletionOpHandle::ClearGarbages(
std::deque<std::shared_ptr<memory::Allocation>> *garbages) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (event_) {
auto compute_stream = dev_ctx_->stream();
auto callback_stream =
reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
auto callback_func = [=]() {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, compute_stream));
PADDLE_ENFORCE_CUDA_SUCCESS(
hipStreamWaitEvent(callback_stream, event_, 0));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(callback_stream, event_, 0));
#endif
};
gc_->Add(std::move(*garbages), callback_func);
} else {
#endif
gc_->Add(std::move(*garbages));
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
}
#endif
}
......
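Note on the ClearGarbages change above: both branches implement the same cross-stream ordering: an event is recorded on the compute stream and the garbage collector's callback stream waits on it, so memory is not released while the compute stream may still be using it. A self-contained sketch of that record-then-wait pattern (hedged: simplified, not the EagerDeletionOpHandle code itself; error checking omitted):

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
// Record pending work on the compute stream, then make the callback stream
// wait for it before any frees enqueued there can run.
void FenceFreeAfterCompute(hipEvent_t event, hipStream_t compute_stream,
                           hipStream_t callback_stream) {
  hipEventRecord(event, compute_stream);
  hipStreamWaitEvent(callback_stream, event, 0);
}
#else
#include <cuda_runtime.h>
void FenceFreeAfterCompute(cudaEvent_t event, cudaStream_t compute_stream,
                           cudaStream_t callback_stream) {
  cudaEventRecord(event, compute_stream);
  cudaStreamWaitEvent(callback_stream, event, 0);
}
#endif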
......@@ -82,9 +82,9 @@ class EagerDeletionOpHandle : public OpHandleBase {
std::vector<ir::MemOptVarInfo *> var_infos_; // not own
GarbageCollector *gc_; // not own
std::vector<Variable *> vars_;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDADeviceContext *dev_ctx_{nullptr};
cudaEvent_t event_{nullptr};
gpuEvent_t event_{nullptr};
#endif
};
......
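Note: gpuEvent_t in the header above is the runtime-neutral event type, cudaEvent_t under CUDA and hipEvent_t under ROCm. Paddle provides this alias in its platform headers; the exact location is not shown in this diff, so the following is only an illustration of what such an alias looks like:

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
using gpuEvent_t = hipEvent_t;     // ROCm build
using gpuStream_t = hipStream_t;
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
using gpuEvent_t = cudaEvent_t;    // CUDA build
using gpuStream_t = cudaStream_t;
#endif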
......@@ -122,7 +122,7 @@ static void TransData(const framework::Tensor *src_item,
const platform::DeviceContext &ctx) {
if (src_item->IsInitialized() && src_item->numel() > 0) {
if (platform::is_gpu_place(src_item->place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item);
#endif
} else {
......
......@@ -114,7 +114,7 @@ static void TransData(const framework::LoDTensor &src_item,
framework::LoDTensor *dst_item) {
if (src_item.IsInitialized() && src_item.numel() > 0) {
if (platform::is_gpu_place(src_item.place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TensorCopy(src_item, platform::CPUPlace(), dst_item);
#endif
} else {
......
......@@ -26,7 +26,7 @@ namespace details {
typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
GradientAndLoDTensor;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
......
......@@ -33,7 +33,7 @@ namespace platform {
class NCCLCommunicator;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/framework/details/nccl_op_handle.h"
#include "paddle/fluid/platform/nccl_helper.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
......@@ -44,7 +44,7 @@ namespace paddle {
namespace framework {
namespace details {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
struct FusedAllReduceOpHandle : public AllReduceOpHandle {
FusedAllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
......
......@@ -36,7 +36,7 @@ struct NCCLContextMap;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......@@ -46,7 +46,7 @@ namespace details {
struct FusedBroadcastOpHandle : public BroadcastOpHandle {
public:
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
FusedBroadcastOpHandle(ir::Node *node,
const std::vector<Scope *> local_scopes,
const std::vector<platform::Place> &places,
......
......@@ -57,7 +57,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
nodes_.emplace_back(
ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
if (use_device_ == p::kCUDA) {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
op_handle_ = new FusedBroadcastOpHandle(
nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
#else
......@@ -166,7 +166,8 @@ TEST(FusedBroadcastTester, CPUSelectedRows) {
test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
}
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)
#if (defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)) || \
(defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL))
TEST(FusedBroadcastTester, GPULodTensor) {
TestFusedBroadcastOpHandle test_op;
std::vector<size_t> input_scope_idxes = {0, 1};
......
......@@ -318,7 +318,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
<< ", place:" << tensor->place() << ", numel:" << tensor->numel();
if (platform::is_gpu_place(tensor->place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
tensor_check<platform::CUDADeviceContext>(op_type, var_name, *tensor,
place);
#else
......
......@@ -82,9 +82,15 @@ __device__ __forceinline__ void PrintNanInfKernel(const T* value,
}
__syncthreads();
#ifdef PADDLE_WITH_HIP
if (true && hipThreadIdx_x == 0) {
printf("In block %d, there has %u,%u,%u nan,inf,num\n", hipBlockIdx_x,
nan_count, inf_count, num_count);
#else
if (true && threadIdx.x == 0) {
printf("In block %d, there has %u,%u,%u nan,inf,num\n", blockIdx.x,
nan_count, inf_count, num_count);
#endif
PADDLE_ENFORCE(false, "===ERROR: in %s find nan or inf===", debug_info);
}
}
......@@ -150,9 +156,15 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
"op_var2gpu_str, but now failed",
op_var));
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
hipMemcpyHostToDevice, dev_ctx->stream()));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
cudaMemcpyHostToDevice, dev_ctx->stream()));
#endif
} else { // get
auto iter = op_var2gpu_str.find(op_var);
PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true,
......@@ -168,8 +180,14 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
size_t blocks =
std::min(static_cast<size_t>(128),
static_cast<size_t>((tensor_.numel() + threads - 1) / threads));
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(CheckNanInfKernel, dim3(blocks), dim3(threads), 0,
dev_ctx->stream(), tensor_.data<T>(), tensor_.numel(),
print_num, gpu_str_ptr);
#else
CheckNanInfKernel<<<blocks, threads, 0, dev_ctx->stream()>>>(
tensor_.data<T>(), tensor_.numel(), print_num, gpu_str_ptr);
#endif
}
template <>
......
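Note on the kernel-launch change above: on ROCm the CUDA triple-chevron launch is replaced by hipLaunchKernelGGL(kernel, grid, block, shared_mem_bytes, stream, args...), and host-to-device copies keep the same argument order via hipMemcpyAsync/hipMemcpyHostToDevice. A small, self-contained sketch of the launch idiom (illustrative, not the CheckNanInfKernel itself; error checking omitted):

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif

// A trivial kernel; threadIdx/blockIdx are spelled the same under nvcc and hipcc.
__global__ void FillKernel(float* data, int n, float value) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = value;
}

void LaunchFill(float* data, int n, float value, gpuStream_t stream) {
  dim3 grid((n + 255) / 256), block(256);
#if defined(PADDLE_WITH_HIP)
  hipLaunchKernelGGL(FillKernel, grid, block, 0, stream, data, n, value);
#else
  FillKernel<<<grid, block, 0, stream>>>(data, n, value);
#endif
}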
......@@ -21,7 +21,12 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/rccl.h"
#endif
#include "paddle/fluid/platform/nccl_helper.h"
DECLARE_bool(sync_nccl_allreduce);
......@@ -46,10 +51,18 @@ class NCCLOpHandleBase : public OpHandleBase {
}
virtual ~NCCLOpHandleBase() {
for (auto& ev : inter_events_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
#endif
}
for (auto& ev : exter_events_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
#endif
}
}
void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
......@@ -95,10 +108,17 @@ class NCCLOpHandleBase : public OpHandleBase {
}
platform::SetDeviceId(dev_id);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventCreateWithFlags(
&inter_events_[dev_id], hipEventDisableTiming));
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventCreateWithFlags(
&exter_events_[dev_id], hipEventDisableTiming));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
&inter_events_[dev_id], cudaEventDisableTiming));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
&exter_events_[dev_id], cudaEventDisableTiming));
#endif
VLOG(10) << "Create events on dev_id:" << dev_id
<< ", inter_event:" << &inter_events_[dev_id]
<< ", exter_event:" << &exter_events_[dev_id];
......@@ -175,10 +195,18 @@ class NCCLOpHandleBase : public OpHandleBase {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream));
#ifdef PADDLE_WITH_HIP
hipEventRecord(inter_events_.at(dev_id), stream);
#else
cudaEventRecord(inter_events_.at(dev_id), stream);
#endif
if (FLAGS_sync_nccl_allreduce) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
#endif
}
}
......@@ -199,6 +227,18 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", dev_id:" << dev_id << ", dtype:" << datatype
<< ", place:" << place << ", stream:" << stream;
#ifdef PADDLE_WITH_HIP
hipStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, count, datatype, op, comm, stream));
hipEventRecord(exter_events_.at(dev_id), stream);
if (FLAGS_sync_nccl_allreduce) {
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
}
#else
cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
......@@ -209,6 +249,7 @@ class NCCLOpHandleBase : public OpHandleBase {
if (FLAGS_sync_nccl_allreduce) {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
}
#endif
}
void InterBroadCast(platform::Place place, void* sendbuff, size_t count,
......@@ -223,8 +264,11 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", numel:" << count << ", dev_id:" << dev_id
<< ", dtype:" << datatype << ", place:" << place
<< ", stream:" << stream;
#ifdef PADDLE_WITH_HIP
hipStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
#else
cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
sendbuff, count, datatype, 0, comm, stream));
}
......@@ -241,8 +285,8 @@ class NCCLOpHandleBase : public OpHandleBase {
private:
// hierarchical needed events
std::unordered_map<int, cudaEvent_t> inter_events_;
std::unordered_map<int, cudaEvent_t> exter_events_;
std::unordered_map<int, gpuEvent_t> inter_events_;
std::unordered_map<int, gpuEvent_t> exter_events_;
};
} // namespace details
......
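Note: the RCCL branch above relies on RCCL exposing the same C API and symbol names as NCCL (ncclAllReduce, ncclReduce, ncclBcast, ncclSum, ...), so the collective calls themselves are unchanged; only the dynload header and the HIP event/stream calls differ. A hedged, self-contained sketch of a collective followed by an event record (assumes an already initialized communicator and stream; the RCCL include path can vary by ROCm install):

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#include <rccl/rccl.h>              // NCCL-compatible API on ROCm
using gpuStream_t = hipStream_t;
using gpuEvent_t = hipEvent_t;
#else
#include <cuda_runtime.h>
#include <nccl.h>
using gpuStream_t = cudaStream_t;
using gpuEvent_t = cudaEvent_t;
#endif

// The collective call is spelled identically on both stacks; only the
// event-record call changes.
void AllReduceAndMark(const void* sendbuff, void* recvbuff, size_t count,
                      ncclDataType_t dtype, ncclComm_t comm,
                      gpuStream_t stream, gpuEvent_t done) {
  ncclAllReduce(sendbuff, recvbuff, count, dtype, ncclSum, comm, stream);
#if defined(PADDLE_WITH_HIP)
  hipEventRecord(done, stream);
#else
  cudaEventRecord(done, stream);
#endif
}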
......@@ -31,22 +31,31 @@ std::string OpHandleBase::DebugString() const {
}
OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (auto &ev : events_) {
if (ev.second) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
#endif
}
}
#endif
}
void OpHandleBase::InitCUDA() {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (auto &p : dev_ctxes_) {
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
platform::SetDeviceId(dev_id);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
#endif
}
if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
for (auto &out_var : outputs_) {
......@@ -124,7 +133,7 @@ void OpHandleBase::InitXPU() {
}
void OpHandleBase::Run(DeviceType use_device) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (events_.empty() && use_device == p::kCUDA && dev_ctxes_.size() > 0) {
InitCUDA();
}
......@@ -158,7 +167,7 @@ void OpHandleBase::Run(DeviceType use_device) {
}
void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_NOT_NULL(waited_ctx, platform::errors::InvalidArgument(
"Argument waited_ctx is NULL."));
if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
......@@ -172,7 +181,11 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
auto stream =
static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
for (auto &ev : events_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0));
#endif
}
}
#else
......@@ -203,12 +216,17 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
if (in_var_handle) {
auto &place = in_var_handle->place();
if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto stream =
static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
->stream();
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#endif
#else
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
......@@ -226,13 +244,17 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
if (in_var_handle) {
auto &place = in_var_handle->place();
if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto stream =
static_cast<platform::CUDADeviceContext *>(pool.Get(place))
->stream();
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
#endif
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Not compiled with CUDA."));
......@@ -252,12 +274,17 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
if (in_var_handle) {
if (platform::is_gpu_place(in_var_handle->place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto stream = static_cast<platform::CUDADeviceContext *>(
dev_ctxes_.at(in_var_handle->place()))
->stream();
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#endif
#else
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
......@@ -285,14 +312,19 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
callback();
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (!events_.empty()) { // Use event
for (auto &p : dev_ctxes_) {
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
auto *cuda_dev_ctx = static_cast<platform::CUDADeviceContext *>(p.second);
VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream()));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream()));
#endif
}
}
#endif
......@@ -300,7 +332,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
void OpHandleBase::RunAndRecordEvent(platform::Place p,
const std::function<void()> &callback) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_cpu_place(p) || events_.empty()) {
callback();
} else {
......
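Note: the OpHandleBase changes above use two different waits, and the distinction is the same on both runtimes: hipStreamWaitEvent/cudaStreamWaitEvent only enqueue a device-side dependency (the host returns immediately), while hipStreamSynchronize/cudaStreamSynchronize block the calling host thread until the stream drains, which is what the wait_for_feed path uses. A minimal hedged illustration:

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
// Non-blocking: later work on `stream` waits for `ev`; the host moves on.
inline void DeviceSideWait(hipStream_t stream, hipEvent_t ev) {
  hipStreamWaitEvent(stream, ev, 0);
}
// Blocking: the host thread waits until everything queued on `stream` is done.
inline void HostSideWait(hipStream_t stream) { hipStreamSynchronize(stream); }
#else
#include <cuda_runtime.h>
inline void DeviceSideWait(cudaStream_t stream, cudaEvent_t ev) {
  cudaStreamWaitEvent(stream, ev, 0);
}
inline void HostSideWait(cudaStream_t stream) { cudaStreamSynchronize(stream); }
#endif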
......@@ -157,8 +157,8 @@ class OpHandleBase {
std::vector<Scope *> local_exec_scopes_;
bool skip_running_ = false;
#ifdef PADDLE_WITH_CUDA
std::unordered_map<int, cudaEvent_t> events_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::unordered_map<int, gpuEvent_t> events_;
#endif
DISABLE_COPY_AND_ASSIGN(OpHandleBase);
......
......@@ -165,7 +165,7 @@ void ReduceOpHandle::RunImpl() {
}
});
} else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto pre_in = pre_in_var->Get<framework::LoDTensor>();
VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
VariableVisitor::GetMutableTensor(out_var).mutable_data(
......
......@@ -40,7 +40,7 @@ namespace platform {
struct NCCLContextMap;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
......@@ -80,7 +80,7 @@ struct ReduceOpHandle : public OpHandleBase {
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
const platform::NCCLContextMap *nccl_ctxs_;
ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
......@@ -127,7 +127,8 @@ struct ReduceOpHandle : public OpHandleBase {
std::vector<Scope *> GetLocalScopes() override { return local_scopes_; }
#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP) && \
defined PADDLE_WITH_DISTRIBUTE
template <typename DevCtx, typename DataType>
void GatherSelectedRows(
const std::vector<const SelectedRows *> &src_selecte_rows_,
......
......@@ -40,7 +40,7 @@ struct TestReduceOpHandle {
std::vector<p::Place> gpu_list_;
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
......@@ -48,7 +48,7 @@ struct TestReduceOpHandle {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
}
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nccl_ctxs_) {
nccl_ctxs_->WaitAll();
}
......@@ -58,7 +58,7 @@ struct TestReduceOpHandle {
void InitCtxOnGpu(bool use_gpu) {
use_gpu_ = use_gpu;
if (use_gpu) {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
......@@ -83,7 +83,7 @@ struct TestReduceOpHandle {
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
}
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
nccl_ctxs_.reset(nullptr);
#endif
}
......@@ -104,7 +104,7 @@ struct TestReduceOpHandle {
nodes.emplace_back(new ir::Node("node"));
if (use_gpu_) {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get()));
#else
......@@ -112,7 +112,7 @@ struct TestReduceOpHandle {
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
} else {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get()));
#else
......@@ -296,7 +296,7 @@ TEST(ReduceTester, TestCPUReduceTestLodTensor) {
test_op.InitReduceOp(out_scope_idx);
test_op.TestReduceLodTensors(out_scope_idx);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(ReduceTester, TestGPUReduceTestSelectedRows) {
TestReduceOpHandle test_op;
......
......@@ -70,7 +70,7 @@ struct ScaleLossGradFunctor {
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
OutT cast_coeff = static_cast<OutT>(coeff_);
auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), out_data,
......@@ -95,7 +95,7 @@ void ScaleLossGradOpHandle::RunImpl() {
local_exec_scopes_[0]->FindVar(var_name)->GetMutable<LoDTensor>();
tensor->Resize(make_ddim({1}));
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_,
this->dev_ctxes_.at(place_));
this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
......
......@@ -84,7 +84,7 @@ void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) {
}
void ShareTensorBufferOpHandle::InitCUDA() {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
int dev_id =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device;
events_[dev_id] = nullptr;
......