Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
50967135
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
50967135
编写于
2月 22, 2021
作者:
Q
Qi Li
提交者:
GitHub
2月 22, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[ROCM] update fluid framework for rocm (part3), test=develop (#31011)
上级
cf43a321
变更
19
隐藏空白更改
内联
并排
Showing
19 changed file
with
175 addition
and
47 deletion
+175
-47
paddle/fluid/framework/details/CMakeLists.txt
paddle/fluid/framework/details/CMakeLists.txt
+18
-1
paddle/fluid/framework/details/eager_deletion_op_handle.cc
paddle/fluid/framework/details/eager_deletion_op_handle.cc
+21
-6
paddle/fluid/framework/details/eager_deletion_op_handle.h
paddle/fluid/framework/details/eager_deletion_op_handle.h
+2
-2
paddle/fluid/framework/details/fetch_async_op_handle.cc
paddle/fluid/framework/details/fetch_async_op_handle.cc
+1
-1
paddle/fluid/framework/details/fetch_op_handle.cc
paddle/fluid/framework/details/fetch_op_handle.cc
+1
-1
paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+1
-1
paddle/fluid/framework/details/fused_all_reduce_op_handle.h
paddle/fluid/framework/details/fused_all_reduce_op_handle.h
+2
-2
paddle/fluid/framework/details/fused_broadcast_op_handle.h
paddle/fluid/framework/details/fused_broadcast_op_handle.h
+2
-2
paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
...fluid/framework/details/fused_broadcast_op_handle_test.cc
+3
-2
paddle/fluid/framework/details/nan_inf_utils_detail.cc
paddle/fluid/framework/details/nan_inf_utils_detail.cc
+1
-1
paddle/fluid/framework/details/nan_inf_utils_detail.cu
paddle/fluid/framework/details/nan_inf_utils_detail.cu
+18
-0
paddle/fluid/framework/details/nccl_op_handle.h
paddle/fluid/framework/details/nccl_op_handle.h
+47
-3
paddle/fluid/framework/details/op_handle_base.cc
paddle/fluid/framework/details/op_handle_base.cc
+41
-9
paddle/fluid/framework/details/op_handle_base.h
paddle/fluid/framework/details/op_handle_base.h
+2
-2
paddle/fluid/framework/details/reduce_op_handle.cc
paddle/fluid/framework/details/reduce_op_handle.cc
+1
-1
paddle/fluid/framework/details/reduce_op_handle.h
paddle/fluid/framework/details/reduce_op_handle.h
+4
-3
paddle/fluid/framework/details/reduce_op_handle_test.cc
paddle/fluid/framework/details/reduce_op_handle_test.cc
+7
-7
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+2
-2
paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
.../fluid/framework/details/share_tensor_buffer_op_handle.cc
+1
-1
未找到文件。
paddle/fluid/framework/details/CMakeLists.txt
浏览文件 @
50967135
...
...
@@ -45,7 +45,24 @@ if(WITH_GPU)
endif
()
nv_library
(
broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda
)
nv_library
(
fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle
)
elseif
(
WITH_ROCM
)
hip_library
(
nan_inf_utils SRCS nan_inf_utils_detail.cc nan_inf_utils_detail.cu DEPS framework_proto scope place
)
hip_library
(
all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor
)
hip_library
(
fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor place device_memory_aligment
)
hip_library
(
grad_merge_all_reduce_op_handle SRCS grad_merge_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor
ddim memory dynload_cuda variable_visitor place device_memory_aligment all_reduce_op_handle fused_all_reduce_op_handle
)
if
(
WITH_DISTRIBUTE
)
hip_library
(
reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor
)
else
()
hip_library
(
reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor
)
endif
()
hip_library
(
broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda
)
hip_library
(
fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle
)
else
()
cc_library
(
nan_inf_utils SRCS nan_inf_utils_detail.cc DEPS framework_proto scope place
)
cc_library
(
all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
...
...
@@ -118,7 +135,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass
coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass
sync_batch_norm_pass runtime_context_cache_pass
)
if
(
NOT APPLE AND NOT WIN32 AND
WITH_GPU
)
if
(
NOT APPLE AND NOT WIN32
AND
(
WITH_GPU OR WITH_ROCM
)
)
set
(
IR_PASS_DEPS
${
IR_PASS_DEPS
}
fusion_group_pass
)
endif
()
cc_library
(
build_strategy SRCS build_strategy.cc DEPS pass_builder
${
IR_PASS_DEPS
}
)
...
...
paddle/fluid/framework/details/eager_deletion_op_handle.cc
浏览文件 @
50967135
...
...
@@ -16,7 +16,7 @@
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/platform/profiler.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
...
...
@@ -40,15 +40,20 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
place_
(
place
),
var_infos_
(
vars
.
begin
(),
vars
.
end
()),
gc_
(
gc
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
platform
::
is_gpu_place
(
place
))
{
dev_ctx_
=
reinterpret_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
if
(
dynamic_cast
<
StreamGarbageCollector
*>
(
gc_
))
{
platform
::
CUDADeviceGuard
guard
(
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
).
device
);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventCreateWithFlags
(
&
event_
,
hipEventDisableTiming
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
&
event_
,
cudaEventDisableTiming
));
#endif
PADDLE_ENFORCE_NOT_NULL
(
event_
,
platform
::
errors
::
InvalidArgument
(
"The cuda envet created is NULL."
));
}
...
...
@@ -64,17 +69,21 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
}
EagerDeletionOpHandle
::~
EagerDeletionOpHandle
()
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
event_
)
{
auto
gpu_place
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
dev_ctx_
->
GetPlace
());
platform
::
CUDADeviceGuard
guard
(
gpu_place
.
device
);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventDestroy
(
event_
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventDestroy
(
event_
));
#endif
}
#endif
}
void
EagerDeletionOpHandle
::
InitCUDA
()
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
int
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
dev_ctxes_
.
begin
()
->
first
).
device
;
events_
[
dev_id
]
=
nullptr
;
...
...
@@ -141,21 +150,27 @@ void EagerDeletionOpHandle::RunImpl() {
void
EagerDeletionOpHandle
::
ClearGarbages
(
std
::
deque
<
std
::
shared_ptr
<
memory
::
Allocation
>>
*
garbages
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
event_
)
{
auto
compute_stream
=
dev_ctx_
->
stream
();
auto
callback_stream
=
reinterpret_cast
<
StreamGarbageCollector
*>
(
gc_
)
->
stream
();
auto
callback_func
=
[
=
]()
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventRecord
(
event_
,
compute_stream
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamWaitEvent
(
callback_stream
,
event_
,
0
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventRecord
(
event_
,
compute_stream
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamWaitEvent
(
callback_stream
,
event_
,
0
));
#endif
};
gc_
->
Add
(
std
::
move
(
*
garbages
),
callback_func
);
}
else
{
#endif
gc_
->
Add
(
std
::
move
(
*
garbages
));
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
}
#endif
}
...
...
paddle/fluid/framework/details/eager_deletion_op_handle.h
浏览文件 @
50967135
...
...
@@ -82,9 +82,9 @@ class EagerDeletionOpHandle : public OpHandleBase {
std
::
vector
<
ir
::
MemOptVarInfo
*>
var_infos_
;
// not own
GarbageCollector
*
gc_
;
// not own
std
::
vector
<
Variable
*>
vars_
;
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform
::
CUDADeviceContext
*
dev_ctx_
{
nullptr
};
cuda
Event_t
event_
{
nullptr
};
gpu
Event_t
event_
{
nullptr
};
#endif
};
...
...
paddle/fluid/framework/details/fetch_async_op_handle.cc
浏览文件 @
50967135
...
...
@@ -122,7 +122,7 @@ static void TransData(const framework::Tensor *src_item,
const
platform
::
DeviceContext
&
ctx
)
{
if
(
src_item
->
IsInitialized
()
&&
src_item
->
numel
()
>
0
)
{
if
(
platform
::
is_gpu_place
(
src_item
->
place
()))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TensorCopy
(
*
src_item
,
platform
::
CUDAPinnedPlace
(),
ctx
,
dst_item
);
#endif
}
else
{
...
...
paddle/fluid/framework/details/fetch_op_handle.cc
浏览文件 @
50967135
...
...
@@ -114,7 +114,7 @@ static void TransData(const framework::LoDTensor &src_item,
framework
::
LoDTensor
*
dst_item
)
{
if
(
src_item
.
IsInitialized
()
&&
src_item
.
numel
()
>
0
)
{
if
(
platform
::
is_gpu_place
(
src_item
.
place
()))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TensorCopy
(
src_item
,
platform
::
CPUPlace
(),
dst_item
);
#endif
}
else
{
...
...
paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
浏览文件 @
50967135
...
...
@@ -26,7 +26,7 @@ namespace details {
typedef
std
::
vector
<
std
::
vector
<
std
::
pair
<
std
::
string
,
const
LoDTensor
*>>>
GradientAndLoDTensor
;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
FusedAllReduceOpHandle
::
FusedAllReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
size_t
num_of_all_reduce
,
...
...
paddle/fluid/framework/details/fused_all_reduce_op_handle.h
浏览文件 @
50967135
...
...
@@ -33,7 +33,7 @@ namespace platform {
class
NCCLCommunicator
;
}
// namespace platform
}
// namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/framework/details/nccl_op_handle.h"
#include "paddle/fluid/platform/nccl_helper.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
...
...
@@ -44,7 +44,7 @@ namespace paddle {
namespace
framework
{
namespace
details
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
struct
FusedAllReduceOpHandle
:
public
AllReduceOpHandle
{
FusedAllReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
...
...
paddle/fluid/framework/details/fused_broadcast_op_handle.h
浏览文件 @
50967135
...
...
@@ -36,7 +36,7 @@ struct NCCLContextMap;
}
// namespace platform
}
// namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
...
...
@@ -46,7 +46,7 @@ namespace details {
struct
FusedBroadcastOpHandle
:
public
BroadcastOpHandle
{
public:
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
FusedBroadcastOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
...
...
paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
浏览文件 @
50967135
...
...
@@ -57,7 +57,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
nodes_
.
emplace_back
(
ir
::
CreateNodeForTest
(
"fused_broadcast"
,
ir
::
Node
::
Type
::
kOperation
));
if
(
use_device_
==
p
::
kCUDA
)
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
op_handle_
=
new
FusedBroadcastOpHandle
(
nodes_
.
back
().
get
(),
local_scopes_
,
place_list_
,
nccl_ctxs_
.
get
());
#else
...
...
@@ -166,7 +166,8 @@ TEST(FusedBroadcastTester, CPUSelectedRows) {
test_op
.
TestFusedBroadcastSelectedRows
(
input_scope_idxes
);
}
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)
#if (defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)) || \
(defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL))
TEST
(
FusedBroadcastTester
,
GPULodTensor
)
{
TestFusedBroadcastOpHandle
test_op
;
std
::
vector
<
size_t
>
input_scope_idxes
=
{
0
,
1
};
...
...
paddle/fluid/framework/details/nan_inf_utils_detail.cc
浏览文件 @
50967135
...
...
@@ -318,7 +318,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
<<
", place:"
<<
tensor
->
place
()
<<
", numel:"
<<
tensor
->
numel
();
if
(
platform
::
is_gpu_place
(
tensor
->
place
()))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
tensor_check
<
platform
::
CUDADeviceContext
>
(
op_type
,
var_name
,
*
tensor
,
place
);
#else
...
...
paddle/fluid/framework/details/nan_inf_utils_detail.cu
浏览文件 @
50967135
...
...
@@ -82,9 +82,15 @@ __device__ __forceinline__ void PrintNanInfKernel(const T* value,
}
__syncthreads
;
#ifdef PADDLE_WITH_HIP
if
(
true
&&
hipThreadIdx_x
==
0
)
{
printf
(
"In block %d, there has %u,%u,%u nan,inf,num
\n
"
,
hipBlockIdx_x
,
nan_count
,
inf_count
,
num_count
);
#else
if
(
true
&&
threadIdx
.
x
==
0
)
{
printf
(
"In block %d, there has %u,%u,%u nan,inf,num
\n
"
,
blockIdx
.
x
,
nan_count
,
inf_count
,
num_count
);
#endif
PADDLE_ENFORCE
(
false
,
"===ERROR: in %s find nan or inf==="
,
debug_info
);
}
}
...
...
@@ -150,9 +156,15 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
"op_var2gpu_str, but now failed"
,
op_var
));
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipMemcpyAsync
(
gpu_str_ptr
,
iter
->
first
.
c_str
(),
op_var
.
length
()
+
1
,
hipMemcpyHostToDevice
,
dev_ctx
->
stream
()));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaMemcpyAsync
(
gpu_str_ptr
,
iter
->
first
.
c_str
(),
op_var
.
length
()
+
1
,
cudaMemcpyHostToDevice
,
dev_ctx
->
stream
()));
#endif
}
else
{
// get
auto
iter
=
op_var2gpu_str
.
find
(
op_var
);
PADDLE_ENFORCE_EQ
(
iter
!=
op_var2gpu_str
.
end
(),
true
,
...
...
@@ -168,8 +180,14 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
size_t
blocks
=
std
::
min
(
static_cast
<
size_t
>
(
128
),
static_cast
<
size_t
>
((
tensor_
.
numel
()
+
threads
-
1
)
/
threads
));
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL
(
CheckNanInfKernel
,
dim3
(
blocks
),
dim3
(
threads
),
0
,
dev_ctx
->
stream
(),
tensor_
.
data
<
T
>
(),
tensor_
.
numel
(),
print_num
,
gpu_str_ptr
);
#else
CheckNanInfKernel
<<<
blocks
,
threads
,
0
,
dev_ctx
->
stream
()
>>>
(
tensor_
.
data
<
T
>
(),
tensor_
.
numel
(),
print_num
,
gpu_str_ptr
);
#endif
}
template
<
>
...
...
paddle/fluid/framework/details/nccl_op_handle.h
浏览文件 @
50967135
...
...
@@ -21,7 +21,12 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/rccl.h"
#endif
#include "paddle/fluid/platform/nccl_helper.h"
DECLARE_bool
(
sync_nccl_allreduce
);
...
...
@@ -46,10 +51,18 @@ class NCCLOpHandleBase : public OpHandleBase {
}
virtual
~
NCCLOpHandleBase
()
{
for
(
auto
&
ev
:
inter_events_
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventDestroy
(
ev
.
second
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventDestroy
(
ev
.
second
));
#endif
}
for
(
auto
&
ev
:
exter_events_
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventDestroy
(
ev
.
second
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventDestroy
(
ev
.
second
));
#endif
}
}
void
SetRunEnv
(
int
run_order
,
bool
use_hierarchical_allreduce
)
{
...
...
@@ -95,10 +108,17 @@ class NCCLOpHandleBase : public OpHandleBase {
}
platform
::
SetDeviceId
(
dev_id
);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventCreateWithFlags
(
&
inter_events_
[
dev_id
],
hipEventDisableTiming
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventCreateWithFlags
(
&
exter_events_
[
dev_id
],
hipEventDisableTiming
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
&
inter_events_
[
dev_id
],
cudaEventDisableTiming
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
&
exter_events_
[
dev_id
],
cudaEventDisableTiming
));
#endif
VLOG
(
10
)
<<
"Create events on dev_id:"
<<
dev_id
<<
", inter_event:"
<<
&
inter_events_
[
dev_id
]
<<
", exter_event:"
<<
&
exter_events_
[
dev_id
];
...
...
@@ -175,10 +195,18 @@ class NCCLOpHandleBase : public OpHandleBase {
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
ncclReduce
(
sendbuff
,
recvbuff
,
count
,
datatype
,
ncclSum
,
0
,
comm
,
stream
));
#ifdef PADDLE_WITH_HIP
hipEventRecord
(
inter_events_
.
at
(
dev_id
),
stream
);
#else
cudaEventRecord
(
inter_events_
.
at
(
dev_id
),
stream
);
#endif
if
(
FLAGS_sync_nccl_allreduce
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamSynchronize
(
stream
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamSynchronize
(
stream
));
#endif
}
}
...
...
@@ -199,6 +227,18 @@ class NCCLOpHandleBase : public OpHandleBase {
<<
", dev_id:"
<<
dev_id
<<
", dtype:"
<<
datatype
<<
", place:"
<<
place
<<
", stream:"
<<
stream
;
#ifdef PADDLE_WITH_HIP
hipStreamWaitEvent
(
stream
,
inter_events_
.
at
(
dev_id
),
0
);
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
ncclAllReduce
(
sendbuff
,
recvbuff
,
count
,
datatype
,
op
,
comm
,
stream
));
hipEventRecord
(
exter_events_
.
at
(
dev_id
),
stream
);
if
(
FLAGS_sync_nccl_allreduce
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamSynchronize
(
stream
));
}
#else
cudaStreamWaitEvent
(
stream
,
inter_events_
.
at
(
dev_id
),
0
);
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
ncclAllReduce
(
...
...
@@ -209,6 +249,7 @@ class NCCLOpHandleBase : public OpHandleBase {
if
(
FLAGS_sync_nccl_allreduce
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamSynchronize
(
stream
));
}
#endif
}
void
InterBroadCast
(
platform
::
Place
place
,
void
*
sendbuff
,
size_t
count
,
...
...
@@ -223,8 +264,11 @@ class NCCLOpHandleBase : public OpHandleBase {
<<
", numel:"
<<
count
<<
", dev_id:"
<<
dev_id
<<
", dtype:"
<<
datatype
<<
", place:"
<<
place
<<
", stream:"
<<
stream
;
#ifdef PADDLE_WITH_HIP
hipStreamWaitEvent
(
stream
,
exter_events_
.
at
(
dev_id
),
0
);
#else
cudaStreamWaitEvent
(
stream
,
exter_events_
.
at
(
dev_id
),
0
);
#endif
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
ncclBcast
(
sendbuff
,
count
,
datatype
,
0
,
comm
,
stream
));
}
...
...
@@ -241,8 +285,8 @@ class NCCLOpHandleBase : public OpHandleBase {
private:
// hierarchical needed events
std
::
unordered_map
<
int
,
cuda
Event_t
>
inter_events_
;
std
::
unordered_map
<
int
,
cuda
Event_t
>
exter_events_
;
std
::
unordered_map
<
int
,
gpu
Event_t
>
inter_events_
;
std
::
unordered_map
<
int
,
gpu
Event_t
>
exter_events_
;
};
}
// namespace details
...
...
paddle/fluid/framework/details/op_handle_base.cc
浏览文件 @
50967135
...
...
@@ -31,22 +31,31 @@ std::string OpHandleBase::DebugString() const {
}
OpHandleBase
::~
OpHandleBase
()
PADDLE_MAY_THROW
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for
(
auto
&
ev
:
events_
)
{
if
(
ev
.
second
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventDestroy
(
ev
.
second
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventDestroy
(
ev
.
second
));
#endif
}
}
#endif
}
void
OpHandleBase
::
InitCUDA
()
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for
(
auto
&
p
:
dev_ctxes_
)
{
int
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
p
.
first
).
device
;
platform
::
SetDeviceId
(
dev_id
);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventCreateWithFlags
(
&
events_
[
dev_id
],
hipEventDisableTiming
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
&
events_
[
dev_id
],
cudaEventDisableTiming
));
#endif
}
if
(
IsMultiDeviceTransfer
()
&&
dev_ctxes_
.
size
()
>
0
)
{
for
(
auto
&
out_var
:
outputs_
)
{
...
...
@@ -124,7 +133,7 @@ void OpHandleBase::InitXPU() {
}
void
OpHandleBase
::
Run
(
DeviceType
use_device
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
events_
.
empty
()
&&
use_device
==
p
::
kCUDA
&&
dev_ctxes_
.
size
()
>
0
)
{
InitCUDA
();
}
...
...
@@ -158,7 +167,7 @@ void OpHandleBase::Run(DeviceType use_device) {
}
void
OpHandleBase
::
RecordWaitEventOnCtx
(
platform
::
DeviceContext
*
waited_ctx
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_NOT_NULL
(
waited_ctx
,
platform
::
errors
::
InvalidArgument
(
"Argument waited_ctx is NULL."
));
if
(
platform
::
is_cpu_place
(
waited_ctx
->
GetPlace
())
||
events_
.
empty
())
{
...
...
@@ -172,7 +181,11 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
waited_ctx
)
->
stream
();
for
(
auto
&
ev
:
events_
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamWaitEvent
(
stream
,
ev
.
second
,
0
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamWaitEvent
(
stream
,
ev
.
second
,
0
));
#endif
}
}
#else
...
...
@@ -203,12 +216,17 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
if
(
in_var_handle
)
{
auto
&
place
=
in_var_handle
->
place
();
if
(
platform
::
is_gpu_place
(
place
))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
dev_ctxes_
.
at
(
place
))
->
stream
();
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamWaitEvent
(
stream
,
in_var_handle
->
GetEvent
(),
0
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamWaitEvent
(
stream
,
in_var_handle
->
GetEvent
(),
0
));
#endif
#else
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"Not compiled with CUDA."
));
...
...
@@ -226,13 +244,17 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
if
(
in_var_handle
)
{
auto
&
place
=
in_var_handle
->
place
();
if
(
platform
::
is_gpu_place
(
place
))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
pool
.
Get
(
place
))
->
stream
();
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamSynchronize
(
stream
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamSynchronize
(
stream
));
#endif
#else
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"Not compiled with CUDA."
));
...
...
@@ -252,12 +274,17 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
auto
*
in_var_handle
=
dynamic_cast
<
VarHandle
*>
(
in_var
);
if
(
in_var_handle
)
{
if
(
platform
::
is_gpu_place
(
in_var_handle
->
place
()))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
dev_ctxes_
.
at
(
in_var_handle
->
place
()))
->
stream
();
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamWaitEvent
(
stream
,
in_var_handle
->
GetEvent
(),
0
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamWaitEvent
(
stream
,
in_var_handle
->
GetEvent
(),
0
));
#endif
#else
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"Not compiled with CUDA."
));
...
...
@@ -285,14 +312,19 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
void
OpHandleBase
::
RunAndRecordEvent
(
const
std
::
function
<
void
()
>
&
callback
)
{
callback
();
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
!
events_
.
empty
())
{
// Use event
for
(
auto
&
p
:
dev_ctxes_
)
{
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
p
.
first
).
device
;
auto
*
cuda_dev_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
p
.
second
);
VLOG
(
10
)
<<
"cudadevicecontext:"
<<
cuda_dev_ctx
<<
", dev_id:"
<<
dev_id
;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventRecord
(
events_
.
at
(
dev_id
),
cuda_dev_ctx
->
stream
()));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventRecord
(
events_
.
at
(
dev_id
),
cuda_dev_ctx
->
stream
()));
#endif
}
}
#endif
...
...
@@ -300,7 +332,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
void
OpHandleBase
::
RunAndRecordEvent
(
platform
::
Place
p
,
const
std
::
function
<
void
()
>
&
callback
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
platform
::
is_cpu_place
(
p
)
||
events_
.
empty
())
{
callback
();
}
else
{
...
...
paddle/fluid/framework/details/op_handle_base.h
浏览文件 @
50967135
...
...
@@ -157,8 +157,8 @@ class OpHandleBase {
std
::
vector
<
Scope
*>
local_exec_scopes_
;
bool
skip_running_
=
false
;
#if
def PADDLE_WITH_CUDA
std
::
unordered_map
<
int
,
cuda
Event_t
>
events_
;
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std
::
unordered_map
<
int
,
gpu
Event_t
>
events_
;
#endif
DISABLE_COPY_AND_ASSIGN
(
OpHandleBase
);
...
...
paddle/fluid/framework/details/reduce_op_handle.cc
浏览文件 @
50967135
...
...
@@ -165,7 +165,7 @@ void ReduceOpHandle::RunImpl() {
}
});
}
else
if
(
paddle
::
platform
::
is_gpu_place
(
lod_tensors
[
0
]
->
place
()))
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
auto
pre_in
=
pre_in_var
->
Get
<
framework
::
LoDTensor
>
();
VariableVisitor
::
ShareDimsAndLoD
(
*
pre_in_var
,
out_var
);
VariableVisitor
::
GetMutableTensor
(
out_var
).
mutable_data
(
...
...
paddle/fluid/framework/details/reduce_op_handle.h
浏览文件 @
50967135
...
...
@@ -40,7 +40,7 @@ namespace platform {
struct
NCCLContextMap
;
}
// namespace platform
}
// namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
...
...
@@ -80,7 +80,7 @@ struct ReduceOpHandle : public OpHandleBase {
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
platform
::
Place
>
places_
;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
const
platform
::
NCCLContextMap
*
nccl_ctxs_
;
ReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
...
...
@@ -127,7 +127,8 @@ struct ReduceOpHandle : public OpHandleBase {
std
::
vector
<
Scope
*>
GetLocalScopes
()
override
{
return
local_scopes_
;
}
#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP) && \
defined PADDLE_WITH_DISTRIBUTE
template
<
typename
DevCtx
,
typename
DataType
>
void
GatherSelectedRows
(
const
std
::
vector
<
const
SelectedRows
*>
&
src_selecte_rows_
,
...
...
paddle/fluid/framework/details/reduce_op_handle_test.cc
浏览文件 @
50967135
...
...
@@ -40,7 +40,7 @@ struct TestReduceOpHandle {
std
::
vector
<
p
::
Place
>
gpu_list_
;
std
::
vector
<
std
::
unique_ptr
<
p
::
DeviceContext
>>
ctxs_
;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
std
::
unique_ptr
<
platform
::
NCCLContextMap
>
nccl_ctxs_
;
#endif
...
...
@@ -48,7 +48,7 @@ struct TestReduceOpHandle {
for
(
size_t
j
=
0
;
j
<
ctxs_
.
size
();
++
j
)
{
ctxs_
[
j
]
->
Wait
();
}
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
if
(
nccl_ctxs_
)
{
nccl_ctxs_
->
WaitAll
();
}
...
...
@@ -58,7 +58,7 @@ struct TestReduceOpHandle {
void
InitCtxOnGpu
(
bool
use_gpu
)
{
use_gpu_
=
use_gpu
;
if
(
use_gpu
)
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
int
count
=
p
::
GetCUDADeviceCount
();
if
(
count
<=
1
)
{
LOG
(
WARNING
)
<<
"Cannot test multi-gpu Broadcast, because the CUDA "
...
...
@@ -83,7 +83,7 @@ struct TestReduceOpHandle {
gpu_list_
.
push_back
(
p
);
ctxs_
.
emplace_back
(
new
p
::
CPUDeviceContext
(
p
));
}
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
nccl_ctxs_
.
reset
(
nullptr
);
#endif
}
...
...
@@ -104,7 +104,7 @@ struct TestReduceOpHandle {
nodes
.
emplace_back
(
new
ir
::
Node
(
"node"
));
if
(
use_gpu_
)
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
op_handle_
.
reset
(
new
ReduceOpHandle
(
nodes
.
back
().
get
(),
local_scopes_
,
gpu_list_
,
nccl_ctxs_
.
get
()));
#else
...
...
@@ -112,7 +112,7 @@ struct TestReduceOpHandle {
platform
::
errors
::
PreconditionNotMet
(
"Not compiled with NCLL."
));
#endif
}
else
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
op_handle_
.
reset
(
new
ReduceOpHandle
(
nodes
.
back
().
get
(),
local_scopes_
,
gpu_list_
,
nccl_ctxs_
.
get
()));
#else
...
...
@@ -296,7 +296,7 @@ TEST(ReduceTester, TestCPUReduceTestLodTensor) {
test_op
.
InitReduceOp
(
out_scope_idx
);
test_op
.
TestReduceLodTensors
(
out_scope_idx
);
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST
(
ReduceTester
,
TestGPUReduceTestSelectedRows
)
{
TestReduceOpHandle
test_op
;
...
...
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
浏览文件 @
50967135
...
...
@@ -70,7 +70,7 @@ struct ScaleLossGradFunctor {
"Please recompile or reinstall Paddle with XPU support."
));
#endif
}
else
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
OutT
cast_coeff
=
static_cast
<
OutT
>
(
coeff_
);
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
ctx_
)
->
stream
();
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place_
),
out_data
,
...
...
@@ -95,7 +95,7 @@ void ScaleLossGradOpHandle::RunImpl() {
local_exec_scopes_
[
0
]
->
FindVar
(
var_name
)
->
GetMutable
<
LoDTensor
>
();
tensor
->
Resize
(
make_ddim
({
1
}));
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
ScaleLossGradFunctor
func
(
coeff_
,
tensor
,
place_
,
out_dtype_
,
this
->
dev_ctxes_
.
at
(
place_
));
this
->
RunAndRecordEvent
([
&
]
{
framework
::
VisitDataType
(
out_dtype_
,
func
);
});
...
...
paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
浏览文件 @
50967135
...
...
@@ -84,7 +84,7 @@ void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) {
}
void
ShareTensorBufferOpHandle
::
InitCUDA
()
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
int
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
dev_ctxes_
.
begin
()
->
first
).
device
;
events_
[
dev_id
]
=
nullptr
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录