PaddlePaddle/Paddle · Commit f5840d89
Authored May 11, 2018 by typhoonzero
Parent: 04bde96e

follow comments

Showing 7 changed files with 25 additions and 22 deletions (+25 -22).
paddle/fluid/framework/parallel_executor.cc         +1  -1
paddle/fluid/operators/CMakeLists.txt               +1  -1
paddle/fluid/operators/detail/sendrecvop_utils.cc   +2  -2
paddle/fluid/operators/gen_nccl_id_op.cc            +6  -3
paddle/fluid/operators/test_send_nccl_id.cc         +3  -3
paddle/fluid/platform/nccl_helper.h                 +8  -8
python/paddle/fluid/parallel_executor.py            +4  -4
paddle/fluid/framework/parallel_executor.cc
@@ -80,7 +80,7 @@ ParallelExecutor::ParallelExecutor(
   // Bcast Parameters to all GPUs
 #ifdef PADDLE_WITH_CUDA
-  auto *nccl_id_var = scope->FindVar("NCCLID");
+  auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
   ncclUniqueId *nccl_id = nullptr;
   if (nccl_id_var != nullptr) {
     nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
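This one-line change is the pattern for most of the commit: every hard-coded "NCCLID" string is replaced by the NCCL_ID_VARNAME macro that the commit introduces in nccl_helper.h, so the op that generates the id, the serializer, the executor, and the test all name the variable through one definition. A minimal sketch of why the shared macro is safer than a repeated literal (Scope and Variable below are toy stand-ins, not the real Paddle classes):

// Toy stand-ins for Paddle's Scope/Variable; only the lookup pattern matters.
#include <string>
#include <unordered_map>

#define NCCL_ID_VARNAME "NCCLID"  // defined once, referenced everywhere

struct Variable {};  // would carry the ncclUniqueId payload in real code

struct Scope {
  std::unordered_map<std::string, Variable> vars;
  Variable* FindVar(const std::string& name) {
    auto it = vars.find(name);
    return it == vars.end() ? nullptr : &it->second;
  }
};

int main() {
  Scope scope;
  scope.vars[NCCL_ID_VARNAME] = Variable{};      // producer registers the id
  Variable* v = scope.FindVar(NCCL_ID_VARNAME);  // consumer looks it up
  // A typo in the macro name fails to compile; a typo in a string literal
  // would only show up as a nullptr at runtime.
  return v == nullptr;
}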
paddle/fluid/operators/CMakeLists.txt
@@ -187,7 +187,7 @@ if(WITH_DISTRIBUTE)
   if(WITH_GPU)
     op_library(gen_nccl_id_op DEPS nccl_common)
   else()
-    set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op)
+    set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op)
   endif()
   set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")

(The -/+ pair renders identically; given the page's "hide whitespace changes" toggle, this appears to be a whitespace-only re-indentation.)
paddle/fluid/operators/detail/sendrecvop_utils.cc
@@ -162,8 +162,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   if (var->IsType<ncclUniqueId>()) {
     e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
                               NCCL_UNIQUE_ID_BYTES);
-    ncclUniqueId* uid = var->GetMutable<ncclUniqueId>();
-    e.WriteRawBytes(std::string(uid->internal, NCCL_UNIQUE_ID_BYTES));
+    ncclUniqueId& uid = var->Get<ncclUniqueId>();
+    e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES));
     // for serialize NCCL_ID
     ::grpc::Slice slices(e.size());
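Besides adopting the macro, this hunk drops GetMutable<ncclUniqueId>() in favor of const Get<ncclUniqueId>(): serializing the id only reads its NCCL_UNIQUE_ID_BYTES-long byte array. A self-contained sketch of that byte round trip (ncclUniqueId is redeclared locally so the snippet builds without NCCL headers; the real definition in nccl.h is the same fixed-size struct):

#include <cassert>
#include <cstring>
#include <string>

#define NCCL_UNIQUE_ID_BYTES 128  // matches nccl.h
struct ncclUniqueId { char internal[NCCL_UNIQUE_ID_BYTES]; };

// Sender side: a read-only view of the id is all serialization needs.
std::string Serialize(const ncclUniqueId& uid) {
  return std::string(uid.internal, NCCL_UNIQUE_ID_BYTES);
}

// Receiver side: copy the raw bytes back into a fresh id.
ncclUniqueId Deserialize(const std::string& bytes) {
  assert(bytes.size() == NCCL_UNIQUE_ID_BYTES);
  ncclUniqueId uid;
  std::memcpy(uid.internal, bytes.data(), NCCL_UNIQUE_ID_BYTES);
  return uid;
}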
paddle/fluid/operators/gen_nccl_id_op.cc
@@ -52,17 +52,17 @@ class GenNCCLIdOp : public framework::OperatorBase {
  private:
   void GenerateAndSend(framework::Scope* scope,
                        const platform::DeviceContext& dev_ctx) const {
-    auto var = scope->FindVar("NCCLID");
+    auto var = scope->FindVar(NCCL_ID_VARNAME);
     PADDLE_ENFORCE_NOT_NULL(var);
     auto id = var->GetMutable<ncclUniqueId>();
-    platform::dynload::ncclGetUniqueId(id);
+    PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(id));

     std::vector<std::string> endpoint_list =
         Attr<std::vector<std::string>>("endpoint_list");
     detail::RPCClient client;
     for (auto& ep : endpoint_list) {
       VLOG(3) << "sending nccl id to " << ep;
-      client.AsyncSendVariable(ep, dev_ctx, *scope, "NCCLID");
+      client.AsyncSendVariable(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
     }
     client.Wait();
     VLOG(3) << "sending completed...";
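The second -/+ pair above wraps ncclGetUniqueId in PADDLE_ENFORCE. NCCL reports errors through its ncclResult_t return value rather than exceptions, so the old call silently discarded failures and a bad id would only surface later, on the remote trainers, as a broken communicator. A standalone version of the same fail-fast guard, assuming nccl.h is available (CHECK_NCCL is illustrative, not a Paddle macro):

#include <cstdio>
#include <cstdlib>
#include <nccl.h>

// Abort with a readable message the moment any NCCL call fails.
#define CHECK_NCCL(call)                                          \
  do {                                                            \
    ncclResult_t r = (call);                                      \
    if (r != ncclSuccess) {                                       \
      std::fprintf(stderr, "NCCL error %s at %s:%d\n",            \
                   ncclGetErrorString(r), __FILE__, __LINE__);    \
      std::abort();                                               \
    }                                                             \
  } while (0)

int main() {
  ncclUniqueId id;
  CHECK_NCCL(ncclGetUniqueId(&id));  // fail fast instead of sending garbage
  return 0;
}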
@@ -71,6 +71,9 @@ class GenNCCLIdOp : public framework::OperatorBase {
   void GetIdByServer(framework::Scope* scope,
                      const platform::DeviceContext& dev_ctx) const {
     std::string endpoint = Attr<std::string>("endpoint");
+    // NOTE: Can not use unique_ptr here because the default
+    // deleter will call GRPC Server's base class's dtor and
+    // that will cause a wired crash.
     rpc_service_ = new detail::AsyncGRPCServer(endpoint, true);
     framework::ProgramDesc empty_program;
     framework::Executor executor(dev_ctx.GetPlace());
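The NOTE added in this hunk is the classic non-virtual-destructor pitfall: if a smart pointer (or a plain delete) destroys the derived gRPC server through a base-class pointer whose destructor is not virtual, the behavior is undefined, which is why the code keeps a raw new here. A toy reproduction of the hazard, plus the usual unique_ptr workaround of typing the pointer (and hence the default deleter) on the derived class; BaseServer/AsyncServer are illustrative names, not the real gRPC types:

#include <memory>

struct BaseServer {
  ~BaseServer() {}  // non-virtual: deleting a derived object via BaseServer*
};                  // is undefined behavior

struct AsyncServer : BaseServer {
  ~AsyncServer() { /* drain completion queues, join worker threads, ... */ }
};

int main() {
  // Hazard: the derived destructor is never (reliably) run -- UB.
  //   std::unique_ptr<BaseServer> bad(new AsyncServer);

  // Workaround: keep the static type derived, so ~AsyncServer() runs.
  std::unique_ptr<AsyncServer> good(new AsyncServer);
  return 0;
}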
paddle/fluid/operators/test_send_nccl_id.cc
@@ -39,7 +39,7 @@ std::unique_ptr<detail::AsyncGRPCServer> rpc_service;
 void StartServer() {
   f::Scope scope;
   p::CPUPlace place;
-  scope.Var("NCCLID");
+  scope.Var(NCCL_ID_VARNAME);
   p::DeviceContextPool& pool = p::DeviceContextPool::Instance();
   auto& dev_ctx = *pool.Get(p::CPUPlace());

@@ -71,7 +71,7 @@ TEST(SendNcclId, Normal) {
   p::DeviceContextPool& pool = p::DeviceContextPool::Instance();
   auto& dev_ctx = *pool.Get(p::CPUPlace());
-  auto var = scope.Var("NCCLID");
+  auto var = scope.Var(NCCL_ID_VARNAME);
   // var->SetType(f::proto::VarType_Type_RAW);
   auto id = var->GetMutable<ncclUniqueId>();
   p::dynload::ncclGetUniqueId(id);

@@ -80,7 +80,7 @@ TEST(SendNcclId, Normal) {
   std::string ep = string::Sprintf("127.0.0.1:%d", port);
   detail::RPCClient client;
-  client.AsyncSendVariable(ep, dev_ctx, scope, "NCCLID");
+  client.AsyncSendVariable(ep, dev_ctx, scope, NCCL_ID_VARNAME);
   client.Wait();
   server_thread.join();
   auto* ptr = rpc_service.release();
paddle/fluid/platform/nccl_helper.h
@@ -21,6 +21,8 @@
 #include "paddle/fluid/platform/dynload/nccl.h"
 #include "paddle/fluid/platform/enforce.h"

+#define NCCL_ID_VARNAME "NCCLID"
+
 namespace paddle {
 namespace platform {

@@ -76,7 +78,7 @@ struct NCCLContextMap {
   explicit NCCLContextMap(const std::vector<platform::Place>& places,
                           ncclUniqueId* nccl_id = nullptr,
-                          size_t node_count = 0, size_t trainer_id = 0) {
+                          size_t num_trainers = 0, size_t trainer_id = 0) {
     PADDLE_ENFORCE(!places.empty());
     order_.reserve(places.size());
     for (auto& p : places) {

@@ -94,16 +96,14 @@ struct NCCLContextMap {
     std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
     // if pass nccl_id here, can assume we are doing multi node training
     if (nccl_id == nullptr) {
-      {
-        std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
-        PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
-            comms.get(), static_cast<int>(order_.size()), order_.data()));
-      }
+      std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
+      PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
+          comms.get(), static_cast<int>(order_.size()), order_.data()));
     } else {
-      PADDLE_ENFORCE_GT(node_count, 0);
+      PADDLE_ENFORCE_GT(num_trainers, 0);
       // TODO(wuyi): need to ensure each node have same number of GPUs
       {
-        int nranks = node_count * order_.size();
+        int nranks = num_trainers * order_.size();
         NCCLGroupGuard gurad;
         for (auto& gpu_id : order_) {
           int rank = trainer_id * order_.size() + gpu_id;
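The multi-trainer branch fixes a simple global rank layout: with num_trainers nodes each driving order_.size() local GPUs, the world size is num_trainers * ngpus, and GPU g on trainer t takes rank t * ngpus + g, which is exactly the formula in the last line of the hunk (it assumes local device ids run 0..ngpus-1, per the TODO about equal GPU counts). A standalone sketch of that layout against NCCL's public API, assuming CUDA and nccl.h; error checking is elided for brevity:

#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

// One communicator per local GPU, all joining a single global clique whose
// unique id was broadcast beforehand (here via gen_nccl_id_op's RPC path).
void InitMultiNodeComms(const std::vector<int>& gpu_ids, ncclUniqueId id,
                        int num_trainers, int trainer_id,
                        std::vector<ncclComm_t>* comms) {
  int ngpus = static_cast<int>(gpu_ids.size());
  int nranks = num_trainers * ngpus;  // world size across all nodes
  comms->resize(ngpus);
  ncclGroupStart();  // required when one process inits several ranks
  for (int i = 0; i < ngpus; ++i) {
    cudaSetDevice(gpu_ids[i]);
    int rank = trainer_id * ngpus + gpu_ids[i];  // same formula as the diff
    ncclCommInitRank(&(*comms)[i], nranks, id, rank);
  }
  ncclGroupEnd();
}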
python/paddle/fluid/parallel_executor.py
@@ -31,7 +31,7 @@ class ParallelExecutor(object):
                  allow_op_delay=False,
                  share_vars_from=None,
                  use_default_grad_scale=True,
-                 num_nodes=0,
+                 num_trainers=0,
                  trainer_id=0):
         """
         ParallelExecutor can run program in parallel.

@@ -53,10 +53,10 @@ class ParallelExecutor(object):
             gradients of each device and scaled gradients would be
             aggregated. Otherwise, a customized scale value should be fed
             to the network.
-            num_nodes(int, default 0): If greater than 0, NCCL will be
+            num_trainers(int, default 0): If greater than 0, NCCL will be
                 initialized with multpile rank of nodes, each node should have
                 same number of GPUs. Distributed training will be enabled then.
-            trainer_id(int, default 0): Must use together with num_nodes.
+            trainer_id(int, default 0): Must use together with num_trainers.
                 trainer_id is the "rank" of current node starts from 0.

         Returns:

@@ -137,7 +137,7 @@ class ParallelExecutor(object):
             local_scopes, allow_op_delay, use_default_grad_scale,
-            num_nodes, trainer_id)
+            num_trainers, trainer_id)
         self.scope = scope