Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
50967135
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
50967135
编写于
2月 22, 2021
作者:
Q
Qi Li
提交者:
GitHub
2月 22, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[ROCM] update fluid framework for rocm (part3), test=develop (#31011)
上级
cf43a321
变更
19
隐藏空白更改
内联
并排
Showing
19 changed file
with
175 addition
and
47 deletion
+175
-47
paddle/fluid/framework/details/CMakeLists.txt
paddle/fluid/framework/details/CMakeLists.txt
+18
-1
paddle/fluid/framework/details/eager_deletion_op_handle.cc
paddle/fluid/framework/details/eager_deletion_op_handle.cc
+21
-6
paddle/fluid/framework/details/eager_deletion_op_handle.h
paddle/fluid/framework/details/eager_deletion_op_handle.h
+2
-2
paddle/fluid/framework/details/fetch_async_op_handle.cc
paddle/fluid/framework/details/fetch_async_op_handle.cc
+1
-1
paddle/fluid/framework/details/fetch_op_handle.cc
paddle/fluid/framework/details/fetch_op_handle.cc
+1
-1
paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+1
-1
paddle/fluid/framework/details/fused_all_reduce_op_handle.h
paddle/fluid/framework/details/fused_all_reduce_op_handle.h
+2
-2
paddle/fluid/framework/details/fused_broadcast_op_handle.h
paddle/fluid/framework/details/fused_broadcast_op_handle.h
+2
-2
paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
...fluid/framework/details/fused_broadcast_op_handle_test.cc
+3
-2
paddle/fluid/framework/details/nan_inf_utils_detail.cc
paddle/fluid/framework/details/nan_inf_utils_detail.cc
+1
-1
paddle/fluid/framework/details/nan_inf_utils_detail.cu
paddle/fluid/framework/details/nan_inf_utils_detail.cu
+18
-0
paddle/fluid/framework/details/nccl_op_handle.h
paddle/fluid/framework/details/nccl_op_handle.h
+47
-3
paddle/fluid/framework/details/op_handle_base.cc
paddle/fluid/framework/details/op_handle_base.cc
+41
-9
paddle/fluid/framework/details/op_handle_base.h
paddle/fluid/framework/details/op_handle_base.h
+2
-2
paddle/fluid/framework/details/reduce_op_handle.cc
paddle/fluid/framework/details/reduce_op_handle.cc
+1
-1
paddle/fluid/framework/details/reduce_op_handle.h
paddle/fluid/framework/details/reduce_op_handle.h
+4
-3
paddle/fluid/framework/details/reduce_op_handle_test.cc
paddle/fluid/framework/details/reduce_op_handle_test.cc
+7
-7
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+2
-2
paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
.../fluid/framework/details/share_tensor_buffer_op_handle.cc
+1
-1
未找到文件。
paddle/fluid/framework/details/CMakeLists.txt
浏览文件 @
50967135
...
@@ -45,7 +45,24 @@ if(WITH_GPU)
...
@@ -45,7 +45,24 @@ if(WITH_GPU)
endif
()
endif
()
nv_library
(
broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda
)
nv_library
(
broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda
)
nv_library
(
fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle
)
nv_library
(
fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle
)
elseif
(
WITH_ROCM
)
hip_library
(
nan_inf_utils SRCS nan_inf_utils_detail.cc nan_inf_utils_detail.cu DEPS framework_proto scope place
)
hip_library
(
all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor
)
hip_library
(
fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor place device_memory_aligment
)
hip_library
(
grad_merge_all_reduce_op_handle SRCS grad_merge_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor
ddim memory dynload_cuda variable_visitor place device_memory_aligment all_reduce_op_handle fused_all_reduce_op_handle
)
if
(
WITH_DISTRIBUTE
)
hip_library
(
reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor
)
else
()
hip_library
(
reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor
)
endif
()
hip_library
(
broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda
)
hip_library
(
fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle
)
else
()
else
()
cc_library
(
nan_inf_utils SRCS nan_inf_utils_detail.cc DEPS framework_proto scope place
)
cc_library
(
nan_inf_utils SRCS nan_inf_utils_detail.cc DEPS framework_proto scope place
)
cc_library
(
all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
cc_library
(
all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
...
@@ -118,7 +135,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass
...
@@ -118,7 +135,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass
coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass
fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass
sync_batch_norm_pass runtime_context_cache_pass
)
sync_batch_norm_pass runtime_context_cache_pass
)
if
(
NOT APPLE AND NOT WIN32 AND
WITH_GPU
)
if
(
NOT APPLE AND NOT WIN32
AND
(
WITH_GPU OR WITH_ROCM
)
)
set
(
IR_PASS_DEPS
${
IR_PASS_DEPS
}
fusion_group_pass
)
set
(
IR_PASS_DEPS
${
IR_PASS_DEPS
}
fusion_group_pass
)
endif
()
endif
()
cc_library
(
build_strategy SRCS build_strategy.cc DEPS pass_builder
${
IR_PASS_DEPS
}
)
cc_library
(
build_strategy SRCS build_strategy.cc DEPS pass_builder
${
IR_PASS_DEPS
}
)
...
...
paddle/fluid/framework/details/eager_deletion_op_handle.cc
浏览文件 @
50967135
...
@@ -16,7 +16,7 @@
...
@@ -16,7 +16,7 @@
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#endif
...
@@ -40,15 +40,20 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
...
@@ -40,15 +40,20 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
place_
(
place
),
place_
(
place
),
var_infos_
(
vars
.
begin
(),
vars
.
end
()),
var_infos_
(
vars
.
begin
(),
vars
.
end
()),
gc_
(
gc
)
{
gc_
(
gc
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
platform
::
is_gpu_place
(
place
))
{
if
(
platform
::
is_gpu_place
(
place
))
{
dev_ctx_
=
reinterpret_cast
<
platform
::
CUDADeviceContext
*>
(
dev_ctx_
=
reinterpret_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
if
(
dynamic_cast
<
StreamGarbageCollector
*>
(
gc_
))
{
if
(
dynamic_cast
<
StreamGarbageCollector
*>
(
gc_
))
{
platform
::
CUDADeviceGuard
guard
(
platform
::
CUDADeviceGuard
guard
(
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
).
device
);
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
).
device
);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventCreateWithFlags
(
&
event_
,
hipEventDisableTiming
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
&
event_
,
cudaEventDisableTiming
));
cudaEventCreateWithFlags
(
&
event_
,
cudaEventDisableTiming
));
#endif
PADDLE_ENFORCE_NOT_NULL
(
event_
,
platform
::
errors
::
InvalidArgument
(
PADDLE_ENFORCE_NOT_NULL
(
event_
,
platform
::
errors
::
InvalidArgument
(
"The cuda envet created is NULL."
));
"The cuda envet created is NULL."
));
}
}
...
@@ -64,17 +69,21 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
...
@@ -64,17 +69,21 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
}
}
EagerDeletionOpHandle
::~
EagerDeletionOpHandle
()
{
EagerDeletionOpHandle
::~
EagerDeletionOpHandle
()
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
event_
)
{
if
(
event_
)
{
auto
gpu_place
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
dev_ctx_
->
GetPlace
());
auto
gpu_place
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
dev_ctx_
->
GetPlace
());
platform
::
CUDADeviceGuard
guard
(
gpu_place
.
device
);
platform
::
CUDADeviceGuard
guard
(
gpu_place
.
device
);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventDestroy
(
event_
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventDestroy
(
event_
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventDestroy
(
event_
));
#endif
}
}
#endif
#endif
}
}
void
EagerDeletionOpHandle
::
InitCUDA
()
{
void
EagerDeletionOpHandle
::
InitCUDA
()
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
int
dev_id
=
int
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
dev_ctxes_
.
begin
()
->
first
).
device
;
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
dev_ctxes_
.
begin
()
->
first
).
device
;
events_
[
dev_id
]
=
nullptr
;
events_
[
dev_id
]
=
nullptr
;
...
@@ -141,21 +150,27 @@ void EagerDeletionOpHandle::RunImpl() {
...
@@ -141,21 +150,27 @@ void EagerDeletionOpHandle::RunImpl() {
void
EagerDeletionOpHandle
::
ClearGarbages
(
void
EagerDeletionOpHandle
::
ClearGarbages
(
std
::
deque
<
std
::
shared_ptr
<
memory
::
Allocation
>>
*
garbages
)
{
std
::
deque
<
std
::
shared_ptr
<
memory
::
Allocation
>>
*
garbages
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
event_
)
{
if
(
event_
)
{
auto
compute_stream
=
dev_ctx_
->
stream
();
auto
compute_stream
=
dev_ctx_
->
stream
();
auto
callback_stream
=
auto
callback_stream
=
reinterpret_cast
<
StreamGarbageCollector
*>
(
gc_
)
->
stream
();
reinterpret_cast
<
StreamGarbageCollector
*>
(
gc_
)
->
stream
();
auto
callback_func
=
[
=
]()
{
auto
callback_func
=
[
=
]()
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventRecord
(
event_
,
compute_stream
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamWaitEvent
(
callback_stream
,
event_
,
0
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventRecord
(
event_
,
compute_stream
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventRecord
(
event_
,
compute_stream
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamWaitEvent
(
callback_stream
,
event_
,
0
));
cudaStreamWaitEvent
(
callback_stream
,
event_
,
0
));
#endif
};
};
gc_
->
Add
(
std
::
move
(
*
garbages
),
callback_func
);
gc_
->
Add
(
std
::
move
(
*
garbages
),
callback_func
);
}
else
{
}
else
{
#endif
#endif
gc_
->
Add
(
std
::
move
(
*
garbages
));
gc_
->
Add
(
std
::
move
(
*
garbages
));
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
}
}
#endif
#endif
}
}
...
...
paddle/fluid/framework/details/eager_deletion_op_handle.h
浏览文件 @
50967135
...
@@ -82,9 +82,9 @@ class EagerDeletionOpHandle : public OpHandleBase {
...
@@ -82,9 +82,9 @@ class EagerDeletionOpHandle : public OpHandleBase {
std
::
vector
<
ir
::
MemOptVarInfo
*>
var_infos_
;
// not own
std
::
vector
<
ir
::
MemOptVarInfo
*>
var_infos_
;
// not own
GarbageCollector
*
gc_
;
// not own
GarbageCollector
*
gc_
;
// not own
std
::
vector
<
Variable
*>
vars_
;
std
::
vector
<
Variable
*>
vars_
;
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform
::
CUDADeviceContext
*
dev_ctx_
{
nullptr
};
platform
::
CUDADeviceContext
*
dev_ctx_
{
nullptr
};
cuda
Event_t
event_
{
nullptr
};
gpu
Event_t
event_
{
nullptr
};
#endif
#endif
};
};
...
...
paddle/fluid/framework/details/fetch_async_op_handle.cc
浏览文件 @
50967135
...
@@ -122,7 +122,7 @@ static void TransData(const framework::Tensor *src_item,
...
@@ -122,7 +122,7 @@ static void TransData(const framework::Tensor *src_item,
const
platform
::
DeviceContext
&
ctx
)
{
const
platform
::
DeviceContext
&
ctx
)
{
if
(
src_item
->
IsInitialized
()
&&
src_item
->
numel
()
>
0
)
{
if
(
src_item
->
IsInitialized
()
&&
src_item
->
numel
()
>
0
)
{
if
(
platform
::
is_gpu_place
(
src_item
->
place
()))
{
if
(
platform
::
is_gpu_place
(
src_item
->
place
()))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TensorCopy
(
*
src_item
,
platform
::
CUDAPinnedPlace
(),
ctx
,
dst_item
);
TensorCopy
(
*
src_item
,
platform
::
CUDAPinnedPlace
(),
ctx
,
dst_item
);
#endif
#endif
}
else
{
}
else
{
...
...
paddle/fluid/framework/details/fetch_op_handle.cc
浏览文件 @
50967135
...
@@ -114,7 +114,7 @@ static void TransData(const framework::LoDTensor &src_item,
...
@@ -114,7 +114,7 @@ static void TransData(const framework::LoDTensor &src_item,
framework
::
LoDTensor
*
dst_item
)
{
framework
::
LoDTensor
*
dst_item
)
{
if
(
src_item
.
IsInitialized
()
&&
src_item
.
numel
()
>
0
)
{
if
(
src_item
.
IsInitialized
()
&&
src_item
.
numel
()
>
0
)
{
if
(
platform
::
is_gpu_place
(
src_item
.
place
()))
{
if
(
platform
::
is_gpu_place
(
src_item
.
place
()))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TensorCopy
(
src_item
,
platform
::
CPUPlace
(),
dst_item
);
TensorCopy
(
src_item
,
platform
::
CPUPlace
(),
dst_item
);
#endif
#endif
}
else
{
}
else
{
...
...
paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
浏览文件 @
50967135
...
@@ -26,7 +26,7 @@ namespace details {
...
@@ -26,7 +26,7 @@ namespace details {
typedef
std
::
vector
<
std
::
vector
<
std
::
pair
<
std
::
string
,
const
LoDTensor
*>>>
typedef
std
::
vector
<
std
::
vector
<
std
::
pair
<
std
::
string
,
const
LoDTensor
*>>>
GradientAndLoDTensor
;
GradientAndLoDTensor
;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
FusedAllReduceOpHandle
::
FusedAllReduceOpHandle
(
FusedAllReduceOpHandle
::
FusedAllReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
size_t
num_of_all_reduce
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
size_t
num_of_all_reduce
,
...
...
paddle/fluid/framework/details/fused_all_reduce_op_handle.h
浏览文件 @
50967135
...
@@ -33,7 +33,7 @@ namespace platform {
...
@@ -33,7 +33,7 @@ namespace platform {
class
NCCLCommunicator
;
class
NCCLCommunicator
;
}
// namespace platform
}
// namespace platform
}
// namespace paddle
}
// namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/framework/details/nccl_op_handle.h"
#include "paddle/fluid/framework/details/nccl_op_handle.h"
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
#elif defined(PADDLE_WITH_XPU_BKCL)
...
@@ -44,7 +44,7 @@ namespace paddle {
...
@@ -44,7 +44,7 @@ namespace paddle {
namespace
framework
{
namespace
framework
{
namespace
details
{
namespace
details
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
struct
FusedAllReduceOpHandle
:
public
AllReduceOpHandle
{
struct
FusedAllReduceOpHandle
:
public
AllReduceOpHandle
{
FusedAllReduceOpHandle
(
ir
::
Node
*
node
,
FusedAllReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
...
...
paddle/fluid/framework/details/fused_broadcast_op_handle.h
浏览文件 @
50967135
...
@@ -36,7 +36,7 @@ struct NCCLContextMap;
...
@@ -36,7 +36,7 @@ struct NCCLContextMap;
}
// namespace platform
}
// namespace platform
}
// namespace paddle
}
// namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#endif
...
@@ -46,7 +46,7 @@ namespace details {
...
@@ -46,7 +46,7 @@ namespace details {
struct
FusedBroadcastOpHandle
:
public
BroadcastOpHandle
{
struct
FusedBroadcastOpHandle
:
public
BroadcastOpHandle
{
public:
public:
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
FusedBroadcastOpHandle
(
ir
::
Node
*
node
,
FusedBroadcastOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
local_scopes
,
const
std
::
vector
<
Scope
*>
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
...
...
paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
浏览文件 @
50967135
...
@@ -57,7 +57,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
...
@@ -57,7 +57,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
nodes_
.
emplace_back
(
nodes_
.
emplace_back
(
ir
::
CreateNodeForTest
(
"fused_broadcast"
,
ir
::
Node
::
Type
::
kOperation
));
ir
::
CreateNodeForTest
(
"fused_broadcast"
,
ir
::
Node
::
Type
::
kOperation
));
if
(
use_device_
==
p
::
kCUDA
)
{
if
(
use_device_
==
p
::
kCUDA
)
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
op_handle_
=
new
FusedBroadcastOpHandle
(
op_handle_
=
new
FusedBroadcastOpHandle
(
nodes_
.
back
().
get
(),
local_scopes_
,
place_list_
,
nccl_ctxs_
.
get
());
nodes_
.
back
().
get
(),
local_scopes_
,
place_list_
,
nccl_ctxs_
.
get
());
#else
#else
...
@@ -166,7 +166,8 @@ TEST(FusedBroadcastTester, CPUSelectedRows) {
...
@@ -166,7 +166,8 @@ TEST(FusedBroadcastTester, CPUSelectedRows) {
test_op
.
TestFusedBroadcastSelectedRows
(
input_scope_idxes
);
test_op
.
TestFusedBroadcastSelectedRows
(
input_scope_idxes
);
}
}
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)
#if (defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)) || \
(defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL))
TEST
(
FusedBroadcastTester
,
GPULodTensor
)
{
TEST
(
FusedBroadcastTester
,
GPULodTensor
)
{
TestFusedBroadcastOpHandle
test_op
;
TestFusedBroadcastOpHandle
test_op
;
std
::
vector
<
size_t
>
input_scope_idxes
=
{
0
,
1
};
std
::
vector
<
size_t
>
input_scope_idxes
=
{
0
,
1
};
...
...
paddle/fluid/framework/details/nan_inf_utils_detail.cc
浏览文件 @
50967135
...
@@ -318,7 +318,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
...
@@ -318,7 +318,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
<<
", place:"
<<
tensor
->
place
()
<<
", numel:"
<<
tensor
->
numel
();
<<
", place:"
<<
tensor
->
place
()
<<
", numel:"
<<
tensor
->
numel
();
if
(
platform
::
is_gpu_place
(
tensor
->
place
()))
{
if
(
platform
::
is_gpu_place
(
tensor
->
place
()))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
tensor_check
<
platform
::
CUDADeviceContext
>
(
op_type
,
var_name
,
*
tensor
,
tensor_check
<
platform
::
CUDADeviceContext
>
(
op_type
,
var_name
,
*
tensor
,
place
);
place
);
#else
#else
...
...
paddle/fluid/framework/details/nan_inf_utils_detail.cu
浏览文件 @
50967135
...
@@ -82,9 +82,15 @@ __device__ __forceinline__ void PrintNanInfKernel(const T* value,
...
@@ -82,9 +82,15 @@ __device__ __forceinline__ void PrintNanInfKernel(const T* value,
}
}
__syncthreads
;
__syncthreads
;
#ifdef PADDLE_WITH_HIP
if
(
true
&&
hipThreadIdx_x
==
0
)
{
printf
(
"In block %d, there has %u,%u,%u nan,inf,num
\n
"
,
hipBlockIdx_x
,
nan_count
,
inf_count
,
num_count
);
#else
if
(
true
&&
threadIdx
.
x
==
0
)
{
if
(
true
&&
threadIdx
.
x
==
0
)
{
printf
(
"In block %d, there has %u,%u,%u nan,inf,num
\n
"
,
blockIdx
.
x
,
printf
(
"In block %d, there has %u,%u,%u nan,inf,num
\n
"
,
blockIdx
.
x
,
nan_count
,
inf_count
,
num_count
);
nan_count
,
inf_count
,
num_count
);
#endif
PADDLE_ENFORCE
(
false
,
"===ERROR: in %s find nan or inf==="
,
debug_info
);
PADDLE_ENFORCE
(
false
,
"===ERROR: in %s find nan or inf==="
,
debug_info
);
}
}
}
}
...
@@ -150,9 +156,15 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
...
@@ -150,9 +156,15 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
"op_var2gpu_str, but now failed"
,
"op_var2gpu_str, but now failed"
,
op_var
));
op_var
));
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipMemcpyAsync
(
gpu_str_ptr
,
iter
->
first
.
c_str
(),
op_var
.
length
()
+
1
,
hipMemcpyHostToDevice
,
dev_ctx
->
stream
()));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaMemcpyAsync
(
gpu_str_ptr
,
iter
->
first
.
c_str
(),
op_var
.
length
()
+
1
,
cudaMemcpyAsync
(
gpu_str_ptr
,
iter
->
first
.
c_str
(),
op_var
.
length
()
+
1
,
cudaMemcpyHostToDevice
,
dev_ctx
->
stream
()));
cudaMemcpyHostToDevice
,
dev_ctx
->
stream
()));
#endif
}
else
{
// get
}
else
{
// get
auto
iter
=
op_var2gpu_str
.
find
(
op_var
);
auto
iter
=
op_var2gpu_str
.
find
(
op_var
);
PADDLE_ENFORCE_EQ
(
iter
!=
op_var2gpu_str
.
end
(),
true
,
PADDLE_ENFORCE_EQ
(
iter
!=
op_var2gpu_str
.
end
(),
true
,
...
@@ -168,8 +180,14 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
...
@@ -168,8 +180,14 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
size_t
blocks
=
size_t
blocks
=
std
::
min
(
static_cast
<
size_t
>
(
128
),
std
::
min
(
static_cast
<
size_t
>
(
128
),
static_cast
<
size_t
>
((
tensor_
.
numel
()
+
threads
-
1
)
/
threads
));
static_cast
<
size_t
>
((
tensor_
.
numel
()
+
threads
-
1
)
/
threads
));
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL
(
CheckNanInfKernel
,
dim3
(
blocks
),
dim3
(
threads
),
0
,
dev_ctx
->
stream
(),
tensor_
.
data
<
T
>
(),
tensor_
.
numel
(),
print_num
,
gpu_str_ptr
);
#else
CheckNanInfKernel
<<<
blocks
,
threads
,
0
,
dev_ctx
->
stream
()
>>>
(
CheckNanInfKernel
<<<
blocks
,
threads
,
0
,
dev_ctx
->
stream
()
>>>
(
tensor_
.
data
<
T
>
(),
tensor_
.
numel
(),
print_num
,
gpu_str_ptr
);
tensor_
.
data
<
T
>
(),
tensor_
.
numel
(),
print_num
,
gpu_str_ptr
);
#endif
}
}
template
<
>
template
<
>
...
...
paddle/fluid/framework/details/nccl_op_handle.h
浏览文件 @
50967135
...
@@ -21,7 +21,12 @@
...
@@ -21,7 +21,12 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/rccl.h"
#endif
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
DECLARE_bool
(
sync_nccl_allreduce
);
DECLARE_bool
(
sync_nccl_allreduce
);
...
@@ -46,10 +51,18 @@ class NCCLOpHandleBase : public OpHandleBase {
...
@@ -46,10 +51,18 @@ class NCCLOpHandleBase : public OpHandleBase {
}
}
virtual
~
NCCLOpHandleBase
()
{
virtual
~
NCCLOpHandleBase
()
{
for
(
auto
&
ev
:
inter_events_
)
{
for
(
auto
&
ev
:
inter_events_
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventDestroy
(
ev
.
second
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventDestroy
(
ev
.
second
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventDestroy
(
ev
.
second
));
#endif
}
}
for
(
auto
&
ev
:
exter_events_
)
{
for
(
auto
&
ev
:
exter_events_
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventDestroy
(
ev
.
second
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventDestroy
(
ev
.
second
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventDestroy
(
ev
.
second
));
#endif
}
}
}
}
void
SetRunEnv
(
int
run_order
,
bool
use_hierarchical_allreduce
)
{
void
SetRunEnv
(
int
run_order
,
bool
use_hierarchical_allreduce
)
{
...
@@ -95,10 +108,17 @@ class NCCLOpHandleBase : public OpHandleBase {
...
@@ -95,10 +108,17 @@ class NCCLOpHandleBase : public OpHandleBase {
}
}
platform
::
SetDeviceId
(
dev_id
);
platform
::
SetDeviceId
(
dev_id
);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventCreateWithFlags
(
&
inter_events_
[
dev_id
],
hipEventDisableTiming
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventCreateWithFlags
(
&
exter_events_
[
dev_id
],
hipEventDisableTiming
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
&
inter_events_
[
dev_id
],
cudaEventDisableTiming
));
&
inter_events_
[
dev_id
],
cudaEventDisableTiming
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
&
exter_events_
[
dev_id
],
cudaEventDisableTiming
));
&
exter_events_
[
dev_id
],
cudaEventDisableTiming
));
#endif
VLOG
(
10
)
<<
"Create events on dev_id:"
<<
dev_id
VLOG
(
10
)
<<
"Create events on dev_id:"
<<
dev_id
<<
", inter_event:"
<<
&
inter_events_
[
dev_id
]
<<
", inter_event:"
<<
&
inter_events_
[
dev_id
]
<<
", exter_event:"
<<
&
exter_events_
[
dev_id
];
<<
", exter_event:"
<<
&
exter_events_
[
dev_id
];
...
@@ -175,10 +195,18 @@ class NCCLOpHandleBase : public OpHandleBase {
...
@@ -175,10 +195,18 @@ class NCCLOpHandleBase : public OpHandleBase {
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
ncclReduce
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
ncclReduce
(
sendbuff
,
recvbuff
,
count
,
datatype
,
ncclSum
,
0
,
comm
,
stream
));
sendbuff
,
recvbuff
,
count
,
datatype
,
ncclSum
,
0
,
comm
,
stream
));
#ifdef PADDLE_WITH_HIP
hipEventRecord
(
inter_events_
.
at
(
dev_id
),
stream
);
#else
cudaEventRecord
(
inter_events_
.
at
(
dev_id
),
stream
);
cudaEventRecord
(
inter_events_
.
at
(
dev_id
),
stream
);
#endif
if
(
FLAGS_sync_nccl_allreduce
)
{
if
(
FLAGS_sync_nccl_allreduce
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamSynchronize
(
stream
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamSynchronize
(
stream
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamSynchronize
(
stream
));
#endif
}
}
}
}
...
@@ -199,6 +227,18 @@ class NCCLOpHandleBase : public OpHandleBase {
...
@@ -199,6 +227,18 @@ class NCCLOpHandleBase : public OpHandleBase {
<<
", dev_id:"
<<
dev_id
<<
", dtype:"
<<
datatype
<<
", dev_id:"
<<
dev_id
<<
", dtype:"
<<
datatype
<<
", place:"
<<
place
<<
", stream:"
<<
stream
;
<<
", place:"
<<
place
<<
", stream:"
<<
stream
;
#ifdef PADDLE_WITH_HIP
hipStreamWaitEvent
(
stream
,
inter_events_
.
at
(
dev_id
),
0
);
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
ncclAllReduce
(
sendbuff
,
recvbuff
,
count
,
datatype
,
op
,
comm
,
stream
));
hipEventRecord
(
exter_events_
.
at
(
dev_id
),
stream
);
if
(
FLAGS_sync_nccl_allreduce
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamSynchronize
(
stream
));
}
#else
cudaStreamWaitEvent
(
stream
,
inter_events_
.
at
(
dev_id
),
0
);
cudaStreamWaitEvent
(
stream
,
inter_events_
.
at
(
dev_id
),
0
);
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
ncclAllReduce
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
ncclAllReduce
(
...
@@ -209,6 +249,7 @@ class NCCLOpHandleBase : public OpHandleBase {
...
@@ -209,6 +249,7 @@ class NCCLOpHandleBase : public OpHandleBase {
if
(
FLAGS_sync_nccl_allreduce
)
{
if
(
FLAGS_sync_nccl_allreduce
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamSynchronize
(
stream
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamSynchronize
(
stream
));
}
}
#endif
}
}
void
InterBroadCast
(
platform
::
Place
place
,
void
*
sendbuff
,
size_t
count
,
void
InterBroadCast
(
platform
::
Place
place
,
void
*
sendbuff
,
size_t
count
,
...
@@ -223,8 +264,11 @@ class NCCLOpHandleBase : public OpHandleBase {
...
@@ -223,8 +264,11 @@ class NCCLOpHandleBase : public OpHandleBase {
<<
", numel:"
<<
count
<<
", dev_id:"
<<
dev_id
<<
", numel:"
<<
count
<<
", dev_id:"
<<
dev_id
<<
", dtype:"
<<
datatype
<<
", place:"
<<
place
<<
", dtype:"
<<
datatype
<<
", place:"
<<
place
<<
", stream:"
<<
stream
;
<<
", stream:"
<<
stream
;
#ifdef PADDLE_WITH_HIP
hipStreamWaitEvent
(
stream
,
exter_events_
.
at
(
dev_id
),
0
);
#else
cudaStreamWaitEvent
(
stream
,
exter_events_
.
at
(
dev_id
),
0
);
cudaStreamWaitEvent
(
stream
,
exter_events_
.
at
(
dev_id
),
0
);
#endif
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
ncclBcast
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
ncclBcast
(
sendbuff
,
count
,
datatype
,
0
,
comm
,
stream
));
sendbuff
,
count
,
datatype
,
0
,
comm
,
stream
));
}
}
...
@@ -241,8 +285,8 @@ class NCCLOpHandleBase : public OpHandleBase {
...
@@ -241,8 +285,8 @@ class NCCLOpHandleBase : public OpHandleBase {
private:
private:
// hierarchical needed events
// hierarchical needed events
std
::
unordered_map
<
int
,
cuda
Event_t
>
inter_events_
;
std
::
unordered_map
<
int
,
gpu
Event_t
>
inter_events_
;
std
::
unordered_map
<
int
,
cuda
Event_t
>
exter_events_
;
std
::
unordered_map
<
int
,
gpu
Event_t
>
exter_events_
;
};
};
}
// namespace details
}
// namespace details
...
...
paddle/fluid/framework/details/op_handle_base.cc
浏览文件 @
50967135
...
@@ -31,22 +31,31 @@ std::string OpHandleBase::DebugString() const {
...
@@ -31,22 +31,31 @@ std::string OpHandleBase::DebugString() const {
}
}
OpHandleBase
::~
OpHandleBase
()
PADDLE_MAY_THROW
{
OpHandleBase
::~
OpHandleBase
()
PADDLE_MAY_THROW
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for
(
auto
&
ev
:
events_
)
{
for
(
auto
&
ev
:
events_
)
{
if
(
ev
.
second
)
{
if
(
ev
.
second
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventDestroy
(
ev
.
second
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventDestroy
(
ev
.
second
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventDestroy
(
ev
.
second
));
#endif
}
}
}
}
#endif
#endif
}
}
void
OpHandleBase
::
InitCUDA
()
{
void
OpHandleBase
::
InitCUDA
()
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for
(
auto
&
p
:
dev_ctxes_
)
{
for
(
auto
&
p
:
dev_ctxes_
)
{
int
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
p
.
first
).
device
;
int
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
p
.
first
).
device
;
platform
::
SetDeviceId
(
dev_id
);
platform
::
SetDeviceId
(
dev_id
);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventCreateWithFlags
(
&
events_
[
dev_id
],
hipEventDisableTiming
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
&
events_
[
dev_id
],
cudaEventDisableTiming
));
cudaEventCreateWithFlags
(
&
events_
[
dev_id
],
cudaEventDisableTiming
));
#endif
}
}
if
(
IsMultiDeviceTransfer
()
&&
dev_ctxes_
.
size
()
>
0
)
{
if
(
IsMultiDeviceTransfer
()
&&
dev_ctxes_
.
size
()
>
0
)
{
for
(
auto
&
out_var
:
outputs_
)
{
for
(
auto
&
out_var
:
outputs_
)
{
...
@@ -124,7 +133,7 @@ void OpHandleBase::InitXPU() {
...
@@ -124,7 +133,7 @@ void OpHandleBase::InitXPU() {
}
}
void
OpHandleBase
::
Run
(
DeviceType
use_device
)
{
void
OpHandleBase
::
Run
(
DeviceType
use_device
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
events_
.
empty
()
&&
use_device
==
p
::
kCUDA
&&
dev_ctxes_
.
size
()
>
0
)
{
if
(
events_
.
empty
()
&&
use_device
==
p
::
kCUDA
&&
dev_ctxes_
.
size
()
>
0
)
{
InitCUDA
();
InitCUDA
();
}
}
...
@@ -158,7 +167,7 @@ void OpHandleBase::Run(DeviceType use_device) {
...
@@ -158,7 +167,7 @@ void OpHandleBase::Run(DeviceType use_device) {
}
}
void
OpHandleBase
::
RecordWaitEventOnCtx
(
platform
::
DeviceContext
*
waited_ctx
)
{
void
OpHandleBase
::
RecordWaitEventOnCtx
(
platform
::
DeviceContext
*
waited_ctx
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_NOT_NULL
(
waited_ctx
,
platform
::
errors
::
InvalidArgument
(
PADDLE_ENFORCE_NOT_NULL
(
waited_ctx
,
platform
::
errors
::
InvalidArgument
(
"Argument waited_ctx is NULL."
));
"Argument waited_ctx is NULL."
));
if
(
platform
::
is_cpu_place
(
waited_ctx
->
GetPlace
())
||
events_
.
empty
())
{
if
(
platform
::
is_cpu_place
(
waited_ctx
->
GetPlace
())
||
events_
.
empty
())
{
...
@@ -172,7 +181,11 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
...
@@ -172,7 +181,11 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
auto
stream
=
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
waited_ctx
)
->
stream
();
static_cast
<
platform
::
CUDADeviceContext
*>
(
waited_ctx
)
->
stream
();
for
(
auto
&
ev
:
events_
)
{
for
(
auto
&
ev
:
events_
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamWaitEvent
(
stream
,
ev
.
second
,
0
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamWaitEvent
(
stream
,
ev
.
second
,
0
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamWaitEvent
(
stream
,
ev
.
second
,
0
));
#endif
}
}
}
}
#else
#else
...
@@ -203,12 +216,17 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
...
@@ -203,12 +216,17 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
if
(
in_var_handle
)
{
if
(
in_var_handle
)
{
auto
&
place
=
in_var_handle
->
place
();
auto
&
place
=
in_var_handle
->
place
();
if
(
platform
::
is_gpu_place
(
place
))
{
if
(
platform
::
is_gpu_place
(
place
))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto
stream
=
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
dev_ctxes_
.
at
(
place
))
static_cast
<
platform
::
CUDADeviceContext
*>
(
dev_ctxes_
.
at
(
place
))
->
stream
();
->
stream
();
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamWaitEvent
(
stream
,
in_var_handle
->
GetEvent
(),
0
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamWaitEvent
(
stream
,
in_var_handle
->
GetEvent
(),
0
));
cudaStreamWaitEvent
(
stream
,
in_var_handle
->
GetEvent
(),
0
));
#endif
#else
#else
PADDLE_THROW
(
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"Not compiled with CUDA."
));
platform
::
errors
::
PreconditionNotMet
(
"Not compiled with CUDA."
));
...
@@ -226,13 +244,17 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
...
@@ -226,13 +244,17 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
if
(
in_var_handle
)
{
if
(
in_var_handle
)
{
auto
&
place
=
in_var_handle
->
place
();
auto
&
place
=
in_var_handle
->
place
();
if
(
platform
::
is_gpu_place
(
place
))
{
if
(
platform
::
is_gpu_place
(
place
))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
platform
::
DeviceContextPool
::
Instance
();
auto
stream
=
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
pool
.
Get
(
place
))
static_cast
<
platform
::
CUDADeviceContext
*>
(
pool
.
Get
(
place
))
->
stream
();
->
stream
();
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamSynchronize
(
stream
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamSynchronize
(
stream
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamSynchronize
(
stream
));
#endif
#else
#else
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"Not compiled with CUDA."
));
"Not compiled with CUDA."
));
...
@@ -252,12 +274,17 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
...
@@ -252,12 +274,17 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
auto
*
in_var_handle
=
dynamic_cast
<
VarHandle
*>
(
in_var
);
auto
*
in_var_handle
=
dynamic_cast
<
VarHandle
*>
(
in_var
);
if
(
in_var_handle
)
{
if
(
in_var_handle
)
{
if
(
platform
::
is_gpu_place
(
in_var_handle
->
place
()))
{
if
(
platform
::
is_gpu_place
(
in_var_handle
->
place
()))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
dev_ctxes_
.
at
(
in_var_handle
->
place
()))
dev_ctxes_
.
at
(
in_var_handle
->
place
()))
->
stream
();
->
stream
();
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamWaitEvent
(
stream
,
in_var_handle
->
GetEvent
(),
0
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamWaitEvent
(
stream
,
in_var_handle
->
GetEvent
(),
0
));
cudaStreamWaitEvent
(
stream
,
in_var_handle
->
GetEvent
(),
0
));
#endif
#else
#else
PADDLE_THROW
(
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"Not compiled with CUDA."
));
platform
::
errors
::
PreconditionNotMet
(
"Not compiled with CUDA."
));
...
@@ -285,14 +312,19 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
...
@@ -285,14 +312,19 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
void
OpHandleBase
::
RunAndRecordEvent
(
const
std
::
function
<
void
()
>
&
callback
)
{
void
OpHandleBase
::
RunAndRecordEvent
(
const
std
::
function
<
void
()
>
&
callback
)
{
callback
();
callback
();
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
!
events_
.
empty
())
{
// Use event
if
(
!
events_
.
empty
())
{
// Use event
for
(
auto
&
p
:
dev_ctxes_
)
{
for
(
auto
&
p
:
dev_ctxes_
)
{
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
p
.
first
).
device
;
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
p
.
first
).
device
;
auto
*
cuda_dev_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
p
.
second
);
auto
*
cuda_dev_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
p
.
second
);
VLOG
(
10
)
<<
"cudadevicecontext:"
<<
cuda_dev_ctx
<<
", dev_id:"
<<
dev_id
;
VLOG
(
10
)
<<
"cudadevicecontext:"
<<
cuda_dev_ctx
<<
", dev_id:"
<<
dev_id
;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventRecord
(
events_
.
at
(
dev_id
),
cuda_dev_ctx
->
stream
()));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventRecord
(
events_
.
at
(
dev_id
),
cuda_dev_ctx
->
stream
()));
cudaEventRecord
(
events_
.
at
(
dev_id
),
cuda_dev_ctx
->
stream
()));
#endif
}
}
}
}
#endif
#endif
...
@@ -300,7 +332,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
...
@@ -300,7 +332,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
void
OpHandleBase
::
RunAndRecordEvent
(
platform
::
Place
p
,
void
OpHandleBase
::
RunAndRecordEvent
(
platform
::
Place
p
,
const
std
::
function
<
void
()
>
&
callback
)
{
const
std
::
function
<
void
()
>
&
callback
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
platform
::
is_cpu_place
(
p
)
||
events_
.
empty
())
{
if
(
platform
::
is_cpu_place
(
p
)
||
events_
.
empty
())
{
callback
();
callback
();
}
else
{
}
else
{
...
...
paddle/fluid/framework/details/op_handle_base.h
浏览文件 @
50967135
...
@@ -157,8 +157,8 @@ class OpHandleBase {
...
@@ -157,8 +157,8 @@ class OpHandleBase {
std
::
vector
<
Scope
*>
local_exec_scopes_
;
std
::
vector
<
Scope
*>
local_exec_scopes_
;
bool
skip_running_
=
false
;
bool
skip_running_
=
false
;
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std
::
unordered_map
<
int
,
cuda
Event_t
>
events_
;
std
::
unordered_map
<
int
,
gpu
Event_t
>
events_
;
#endif
#endif
DISABLE_COPY_AND_ASSIGN
(
OpHandleBase
);
DISABLE_COPY_AND_ASSIGN
(
OpHandleBase
);
...
...
paddle/fluid/framework/details/reduce_op_handle.cc
浏览文件 @
50967135
...
@@ -165,7 +165,7 @@ void ReduceOpHandle::RunImpl() {
...
@@ -165,7 +165,7 @@ void ReduceOpHandle::RunImpl() {
}
}
});
});
}
else
if
(
paddle
::
platform
::
is_gpu_place
(
lod_tensors
[
0
]
->
place
()))
{
}
else
if
(
paddle
::
platform
::
is_gpu_place
(
lod_tensors
[
0
]
->
place
()))
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
auto
pre_in
=
pre_in_var
->
Get
<
framework
::
LoDTensor
>
();
auto
pre_in
=
pre_in_var
->
Get
<
framework
::
LoDTensor
>
();
VariableVisitor
::
ShareDimsAndLoD
(
*
pre_in_var
,
out_var
);
VariableVisitor
::
ShareDimsAndLoD
(
*
pre_in_var
,
out_var
);
VariableVisitor
::
GetMutableTensor
(
out_var
).
mutable_data
(
VariableVisitor
::
GetMutableTensor
(
out_var
).
mutable_data
(
...
...
paddle/fluid/framework/details/reduce_op_handle.h
浏览文件 @
50967135
...
@@ -40,7 +40,7 @@ namespace platform {
...
@@ -40,7 +40,7 @@ namespace platform {
struct
NCCLContextMap
;
struct
NCCLContextMap
;
}
// namespace platform
}
// namespace platform
}
// namespace paddle
}
// namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
#elif defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
#include "paddle/fluid/platform/bkcl_helper.h"
...
@@ -80,7 +80,7 @@ struct ReduceOpHandle : public OpHandleBase {
...
@@ -80,7 +80,7 @@ struct ReduceOpHandle : public OpHandleBase {
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
platform
::
Place
>
places_
;
std
::
vector
<
platform
::
Place
>
places_
;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
const
platform
::
NCCLContextMap
*
nccl_ctxs_
;
const
platform
::
NCCLContextMap
*
nccl_ctxs_
;
ReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
ReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
...
@@ -127,7 +127,8 @@ struct ReduceOpHandle : public OpHandleBase {
...
@@ -127,7 +127,8 @@ struct ReduceOpHandle : public OpHandleBase {
std
::
vector
<
Scope
*>
GetLocalScopes
()
override
{
return
local_scopes_
;
}
std
::
vector
<
Scope
*>
GetLocalScopes
()
override
{
return
local_scopes_
;
}
#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP) && \
defined PADDLE_WITH_DISTRIBUTE
template
<
typename
DevCtx
,
typename
DataType
>
template
<
typename
DevCtx
,
typename
DataType
>
void
GatherSelectedRows
(
void
GatherSelectedRows
(
const
std
::
vector
<
const
SelectedRows
*>
&
src_selecte_rows_
,
const
std
::
vector
<
const
SelectedRows
*>
&
src_selecte_rows_
,
...
...
paddle/fluid/framework/details/reduce_op_handle_test.cc
浏览文件 @
50967135
...
@@ -40,7 +40,7 @@ struct TestReduceOpHandle {
...
@@ -40,7 +40,7 @@ struct TestReduceOpHandle {
std
::
vector
<
p
::
Place
>
gpu_list_
;
std
::
vector
<
p
::
Place
>
gpu_list_
;
std
::
vector
<
std
::
unique_ptr
<
p
::
DeviceContext
>>
ctxs_
;
std
::
vector
<
std
::
unique_ptr
<
p
::
DeviceContext
>>
ctxs_
;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
std
::
unique_ptr
<
platform
::
NCCLContextMap
>
nccl_ctxs_
;
std
::
unique_ptr
<
platform
::
NCCLContextMap
>
nccl_ctxs_
;
#endif
#endif
...
@@ -48,7 +48,7 @@ struct TestReduceOpHandle {
...
@@ -48,7 +48,7 @@ struct TestReduceOpHandle {
for
(
size_t
j
=
0
;
j
<
ctxs_
.
size
();
++
j
)
{
for
(
size_t
j
=
0
;
j
<
ctxs_
.
size
();
++
j
)
{
ctxs_
[
j
]
->
Wait
();
ctxs_
[
j
]
->
Wait
();
}
}
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
if
(
nccl_ctxs_
)
{
if
(
nccl_ctxs_
)
{
nccl_ctxs_
->
WaitAll
();
nccl_ctxs_
->
WaitAll
();
}
}
...
@@ -58,7 +58,7 @@ struct TestReduceOpHandle {
...
@@ -58,7 +58,7 @@ struct TestReduceOpHandle {
void
InitCtxOnGpu
(
bool
use_gpu
)
{
void
InitCtxOnGpu
(
bool
use_gpu
)
{
use_gpu_
=
use_gpu
;
use_gpu_
=
use_gpu
;
if
(
use_gpu
)
{
if
(
use_gpu
)
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
int
count
=
p
::
GetCUDADeviceCount
();
int
count
=
p
::
GetCUDADeviceCount
();
if
(
count
<=
1
)
{
if
(
count
<=
1
)
{
LOG
(
WARNING
)
<<
"Cannot test multi-gpu Broadcast, because the CUDA "
LOG
(
WARNING
)
<<
"Cannot test multi-gpu Broadcast, because the CUDA "
...
@@ -83,7 +83,7 @@ struct TestReduceOpHandle {
...
@@ -83,7 +83,7 @@ struct TestReduceOpHandle {
gpu_list_
.
push_back
(
p
);
gpu_list_
.
push_back
(
p
);
ctxs_
.
emplace_back
(
new
p
::
CPUDeviceContext
(
p
));
ctxs_
.
emplace_back
(
new
p
::
CPUDeviceContext
(
p
));
}
}
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
nccl_ctxs_
.
reset
(
nullptr
);
nccl_ctxs_
.
reset
(
nullptr
);
#endif
#endif
}
}
...
@@ -104,7 +104,7 @@ struct TestReduceOpHandle {
...
@@ -104,7 +104,7 @@ struct TestReduceOpHandle {
nodes
.
emplace_back
(
new
ir
::
Node
(
"node"
));
nodes
.
emplace_back
(
new
ir
::
Node
(
"node"
));
if
(
use_gpu_
)
{
if
(
use_gpu_
)
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
op_handle_
.
reset
(
new
ReduceOpHandle
(
nodes
.
back
().
get
(),
local_scopes_
,
op_handle_
.
reset
(
new
ReduceOpHandle
(
nodes
.
back
().
get
(),
local_scopes_
,
gpu_list_
,
nccl_ctxs_
.
get
()));
gpu_list_
,
nccl_ctxs_
.
get
()));
#else
#else
...
@@ -112,7 +112,7 @@ struct TestReduceOpHandle {
...
@@ -112,7 +112,7 @@ struct TestReduceOpHandle {
platform
::
errors
::
PreconditionNotMet
(
"Not compiled with NCLL."
));
platform
::
errors
::
PreconditionNotMet
(
"Not compiled with NCLL."
));
#endif
#endif
}
else
{
}
else
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
op_handle_
.
reset
(
new
ReduceOpHandle
(
nodes
.
back
().
get
(),
local_scopes_
,
op_handle_
.
reset
(
new
ReduceOpHandle
(
nodes
.
back
().
get
(),
local_scopes_
,
gpu_list_
,
nccl_ctxs_
.
get
()));
gpu_list_
,
nccl_ctxs_
.
get
()));
#else
#else
...
@@ -296,7 +296,7 @@ TEST(ReduceTester, TestCPUReduceTestLodTensor) {
...
@@ -296,7 +296,7 @@ TEST(ReduceTester, TestCPUReduceTestLodTensor) {
test_op
.
InitReduceOp
(
out_scope_idx
);
test_op
.
InitReduceOp
(
out_scope_idx
);
test_op
.
TestReduceLodTensors
(
out_scope_idx
);
test_op
.
TestReduceLodTensors
(
out_scope_idx
);
}
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST
(
ReduceTester
,
TestGPUReduceTestSelectedRows
)
{
TEST
(
ReduceTester
,
TestGPUReduceTestSelectedRows
)
{
TestReduceOpHandle
test_op
;
TestReduceOpHandle
test_op
;
...
...
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
浏览文件 @
50967135
...
@@ -70,7 +70,7 @@ struct ScaleLossGradFunctor {
...
@@ -70,7 +70,7 @@ struct ScaleLossGradFunctor {
"Please recompile or reinstall Paddle with XPU support."
));
"Please recompile or reinstall Paddle with XPU support."
));
#endif
#endif
}
else
{
}
else
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
OutT
cast_coeff
=
static_cast
<
OutT
>
(
coeff_
);
OutT
cast_coeff
=
static_cast
<
OutT
>
(
coeff_
);
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
ctx_
)
->
stream
();
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
ctx_
)
->
stream
();
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place_
),
out_data
,
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place_
),
out_data
,
...
@@ -95,7 +95,7 @@ void ScaleLossGradOpHandle::RunImpl() {
...
@@ -95,7 +95,7 @@ void ScaleLossGradOpHandle::RunImpl() {
local_exec_scopes_
[
0
]
->
FindVar
(
var_name
)
->
GetMutable
<
LoDTensor
>
();
local_exec_scopes_
[
0
]
->
FindVar
(
var_name
)
->
GetMutable
<
LoDTensor
>
();
tensor
->
Resize
(
make_ddim
({
1
}));
tensor
->
Resize
(
make_ddim
({
1
}));
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
ScaleLossGradFunctor
func
(
coeff_
,
tensor
,
place_
,
out_dtype_
,
ScaleLossGradFunctor
func
(
coeff_
,
tensor
,
place_
,
out_dtype_
,
this
->
dev_ctxes_
.
at
(
place_
));
this
->
dev_ctxes_
.
at
(
place_
));
this
->
RunAndRecordEvent
([
&
]
{
framework
::
VisitDataType
(
out_dtype_
,
func
);
});
this
->
RunAndRecordEvent
([
&
]
{
framework
::
VisitDataType
(
out_dtype_
,
func
);
});
...
...
paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
浏览文件 @
50967135
...
@@ -84,7 +84,7 @@ void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) {
...
@@ -84,7 +84,7 @@ void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) {
}
}
void
ShareTensorBufferOpHandle
::
InitCUDA
()
{
void
ShareTensorBufferOpHandle
::
InitCUDA
()
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
int
dev_id
=
int
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
dev_ctxes_
.
begin
()
->
first
).
device
;
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
dev_ctxes_
.
begin
()
->
first
).
device
;
events_
[
dev_id
]
=
nullptr
;
events_
[
dev_id
]
=
nullptr
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录