Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
c7b7291b
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
c7b7291b
编写于
3月 06, 2018
作者:
X
Xin Pan
提交者:
GitHub
3月 06, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #8758 from panyx0718/nccl
[Speed] Avoid calling init_nccl on every step.
上级
767acc6c
a4d68ed3
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
54 addition
and
32 deletion
+54
-32
paddle/fluid/operators/nccl/nccl_gpu_common.cc
paddle/fluid/operators/nccl/nccl_gpu_common.cc
+46
-1
paddle/fluid/operators/nccl/nccl_gpu_common.h
paddle/fluid/operators/nccl/nccl_gpu_common.h
+3
-26
paddle/fluid/operators/nccl_op.cu.cc
paddle/fluid/operators/nccl_op.cu.cc
+5
-5
未找到文件。
paddle/fluid/operators/nccl/nccl_gpu_common.cc
浏览文件 @
c7b7291b
...
...
@@ -16,5 +16,50 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h"
namespace
paddle
{
namespace
platform
{}
// namespace platform
namespace
platform
{
namespace {

// File-local state shared by the Communicator member functions defined in
// this file. Those functions take comm_mu before touching any of it.

// TODO(panyx0718): Where to destroy them.
// Communicator handles filled in by ncclCommInitAll; null until the first
// InitAll call allocates the vector.
std::unique_ptr<std::vector<ncclComm_t>> global_comms;

// Maps each GPU id passed to InitAll to its index in that argument list
// (and therefore in *global_comms). Null until InitAll allocates it.
std::unique_ptr<std::unordered_map<int, int>> comm_id_map;

// Set to true by InitAll after ncclCommInitAll has been called.
bool inited{false};

// Size of the GPU list seen by the most recent InitAll. Initialised to the
// largest size_t value (the result of converting -1).
size_t last_num_gpus = static_cast<size_t>(-1);

// TODO(panyx0718): Need to decide whether Paddle supports parallel runs
// with a different number of GPUs. If it does, the current solution is not
// enough.
std::mutex comm_mu;

}  // namespace
// Looks up the communicator index recorded for `device_id`.
//
// The lookup is performed while holding comm_mu. The map is populated by
// InitAll, which stores, for every GPU id it was given, that id's position
// in the argument list. std::unordered_map::at throws std::out_of_range
// when `device_id` has no entry.
//
// NOTE(review): comm_id_map is dereferenced without a null check, so this
// appears to require a prior InitAll call -- confirm against callers.
int Communicator::GetCommId(int device_id) const {
  std::lock_guard<std::mutex> guard(comm_mu);
  const std::unordered_map<int, int>& id_map = *comm_id_map;
  return id_map.at(device_id);
}
// (Re)creates the process-wide NCCL communicators for the devices in `gpus`.
//
// The whole function runs under comm_mu. If a previous call completed
// successfully with the same number of GPUs, the existing communicators are
// reused and the function returns immediately; only the count is compared,
// not the device ids themselves. Otherwise any existing communicators are
// destroyed, the device-id -> index map is rebuilt (gpus[i] maps to i), and
// ncclCommInitAll is called to fill a fresh vector of gpus.size() handles.
//
// Bookkeeping (`inited`, `last_num_gpus`) is only updated once
// ncclCommInitAll has passed PADDLE_ENFORCE. Previously `last_num_gpus` was
// overwritten and `inited` was left true before the call, so a failed
// re-initialisation followed by a retry with the same GPU count would hit the
// early return and keep using destroyed/uninitialised communicators.
//
// @param gpus  Device ids to create communicators for, in communicator order.
void Communicator::InitAll(const std::vector<int>& gpus) {
  std::lock_guard<std::mutex> guard(comm_mu);
  if (inited && last_num_gpus == gpus.size()) {
    return;
  }

  // From here until ncclCommInitAll succeeds there is no valid communicator
  // set, so make sure a failure below cannot be mistaken for "already done".
  inited = false;

  if (global_comms) {
    for (size_t i = 0; i < global_comms->size(); ++i) {
      // FIXME(dzh) : PADDLE_ENFORCE return void
      dynload::ncclCommDestroy((*global_comms)[i]);
    }
  }

  global_comms.reset(new std::vector<ncclComm_t>());
  comm_id_map.reset(new std::unordered_map<int, int>());
  global_comms->resize(gpus.size());
  for (size_t i = 0; i < gpus.size(); ++i) {
    (*comm_id_map)[gpus[i]] = static_cast<int>(i);
  }

  PADDLE_ENFORCE(dynload::ncclCommInitAll(global_comms->data(), gpus.size(),
                                          gpus.data()));

  // Publish the new configuration only after initialisation succeeded.
  last_num_gpus = gpus.size();
  inited = true;
}
// Returns a reference to the shared vector of NCCL communicator handles.
//
// comm_mu is held only while the reference is obtained; the guard is
// released when the function returns, so the caller's subsequent use of the
// vector is not covered by the lock.
//
// NOTE(review): global_comms is dereferenced without a null check, so this
// appears to require a prior InitAll call -- confirm against callers.
const std::vector<ncclComm_t>& Communicator::comms() const {
  std::lock_guard<std::mutex> guard(comm_mu);
  const std::vector<ncclComm_t>& handles = *global_comms;
  return handles;
}
}
// namespace platform
}
// namespace paddle
paddle/fluid/operators/nccl/nccl_gpu_common.h
浏览文件 @
c7b7291b
...
...
@@ -29,39 +29,16 @@ limitations under the License. */
namespace
paddle
{
namespace
platform
{
constexpr
int
kInvalidGPUId
=
-
1
;
struct
Communicator
{
std
::
vector
<
ncclComm_t
>
comms_
;
std
::
unordered_map
<
int
,
int
>
comm_id_map_
;
bool
inited_
;
Communicator
()
{}
int
GetCommId
(
int
device_id
)
const
{
return
comm_id_map_
.
at
(
device_id
);
}
void
InitAll
(
const
std
::
vector
<
int
>&
gpus
)
{
comms_
.
resize
(
gpus
.
size
());
inited_
=
false
;
for
(
size_t
i
=
0
;
i
<
gpus
.
size
();
++
i
)
{
comm_id_map_
[
gpus
[
i
]]
=
i
;
}
PADDLE_ENFORCE
(
dynload
::
ncclCommInitAll
(
comms_
.
data
(),
gpus
.
size
(),
gpus
.
data
()));
inited_
=
true
;
}
int
GetCommId
(
int
device_id
)
const
;
~
Communicator
()
{
if
(
inited_
)
{
for
(
size_t
i
=
0
;
i
<
comms_
.
size
();
++
i
)
{
// FIXME(dzh) : PADDLE_ENFORCE return void
dynload
::
ncclCommDestroy
(
comms_
[
i
]);
}
}
}
void
InitAll
(
const
std
::
vector
<
int
>&
gpus
);
DISABLE_COPY_AND_ASSIGN
(
Communicator
)
;
const
std
::
vector
<
ncclComm_t
>&
comms
()
const
;
};
}
// namespace platform
...
...
paddle/fluid/operators/nccl_op.cu.cc
浏览文件 @
c7b7291b
...
...
@@ -78,7 +78,7 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclAllReduce
(
ins
[
i
]
->
data
<
T
>
(),
outs
[
i
]
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
outs
[
i
]
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
reduction_op_
,
comm
->
comms
_
[
idx
]
,
stream
));
comm
->
comms
().
at
(
idx
)
,
stream
));
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
VLOG
(
1
)
<<
"gpu : "
...
...
@@ -127,7 +127,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
std
::
hash
<
std
::
string
>
hasher
;
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
++
i
)
{
if
(
root
==
platform
::
kInvalidGPUId
)
{
root
=
hasher
(
ins_names
[
i
])
%
comm
->
comms
_
.
size
();
root
=
hasher
(
ins_names
[
i
])
%
comm
->
comms
()
.
size
();
}
T
*
recvbuffer
=
nullptr
;
if
(
root
==
gpu_id
)
{
...
...
@@ -139,7 +139,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclReduce
(
ins
[
i
]
->
data
<
T
>
(),
recvbuffer
,
ins
[
i
]
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
reduction_op_
,
root
,
comm
->
comms
_
[
idx
]
,
NCCLTypeWrapper
<
T
>::
type
,
reduction_op_
,
root
,
comm
->
comms
().
at
(
idx
)
,
stream
));
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
...
...
@@ -176,7 +176,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
VLOG
(
1
)
<<
" before ncclBcast"
;
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
(
void
*
)
ins
[
i
]
->
data
<
T
>
(),
ins
[
i
]
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
root
,
comm
->
comms
_
[
idx
]
,
stream
));
root
,
comm
->
comms
().
at
(
idx
)
,
stream
));
VLOG
(
1
)
<<
" after ncclBcast"
;
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
...
...
@@ -190,7 +190,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
outs
[
i
]
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
outs
[
i
]
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
root
,
comm
->
comms
_
[
idx
]
,
stream
));
NCCLTypeWrapper
<
T
>::
type
,
root
,
comm
->
comms
().
at
(
idx
)
,
stream
));
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
VLOG
(
1
)
<<
"gpu : "
<<
gpu_id
<<
" finished Bcast. recv "
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录