Commit d9320dcd

Authored May 04, 2018 by typhoonzero
Parent: 7237323c

complete code
Showing 8 changed files with 73 additions and 15 deletions (+73 −15):

- paddle/fluid/framework/parallel_executor.cc (+8 −2)
- paddle/fluid/framework/parallel_executor.h (+2 −1)
- paddle/fluid/operators/detail/send_recv.proto (+1 −0)
- paddle/fluid/operators/detail/sendrecvop_utils.cc (+20 −1)
- paddle/fluid/operators/detail/variable_response.cc (+10 −1)
- paddle/fluid/operators/gen_nccl_id_op.cc (+2 −2)
- paddle/fluid/platform/nccl_helper.h (+26 −6)
- paddle/fluid/pybind/pybind.cc (+4 −2)
paddle/fluid/framework/parallel_executor.cc

```diff
@@ -58,7 +58,7 @@ ParallelExecutor::ParallelExecutor(
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
     Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay,
-    bool use_default_grad_scale)
+    bool use_default_grad_scale, size_t num_trainers, size_t trainer_id)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
@@ -80,7 +80,13 @@ ParallelExecutor::ParallelExecutor(
   // Bcast Parameters to all GPUs
 #ifdef PADDLE_WITH_CUDA
-  member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
+  auto *nccl_id_var = scope->FindVar("NCCLID");
+  ncclUniqueId *nccl_id = nullptr;
+  if (nccl_id_var != nullptr) {
+    nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
+  }
+  member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
+      member_->places_, nccl_id, num_trainers, trainer_id));
 #endif
   if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&
       local_scopes.empty()) {  // Is CUDA
```
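The constructor now looks up a variable named "NCCLID" in the scope and passes it through to NCCLContextMap; when no such variable exists, `nccl_id` stays `nullptr` and initialization behaves as before. A minimal sketch of that control flow, using hypothetical FakeScope/FakeUniqueId stand-ins rather than Paddle's real Scope API:

```cpp
#include <cstddef>
#include <cstdio>

// Hypothetical stand-ins for Paddle's Scope/Variable machinery; only the
// control flow of the diff is modeled: FindVar("NCCLID") returning null
// selects single-node init, non-null selects multi-node init.
struct FakeUniqueId {};
struct FakeScope {
  FakeUniqueId *stored = nullptr;  // nullptr means no "NCCLID" variable
  FakeUniqueId *FindVar(const char * /*name*/) { return stored; }
};

void BuildNCCLContexts(FakeScope *scope, size_t num_trainers,
                       size_t trainer_id) {
  FakeUniqueId *nccl_id = scope->FindVar("NCCLID");
  if (nccl_id == nullptr) {
    std::printf("single-node path (ncclCommInitAll)\n");
  } else {
    std::printf("multi-node path (ncclCommInitRank), trainer %zu of %zu\n",
                trainer_id, num_trainers);
  }
}

int main() {
  FakeScope local;                  // no NCCLID registered
  BuildNCCLContexts(&local, 0, 0);  // -> single-node path
  FakeUniqueId id;
  FakeScope dist{&id};              // NCCLID present
  BuildNCCLContexts(&dist, 2, 1);   // -> multi-node path
  return 0;
}
```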
paddle/fluid/framework/parallel_executor.h

```diff
@@ -40,7 +40,8 @@ class ParallelExecutor {
                     const ProgramDesc &main_program,
                     const std::string &loss_var_name, Scope *scope,
                     const std::vector<Scope *> &local_scopes,
-                    bool allow_op_delay, bool use_default_grad_scale);
+                    bool allow_op_delay, bool use_default_grad_scale,
+                    size_t num_trainers = 0, size_t trainer_id = 0);
 
   ~ParallelExecutor();
```
paddle/fluid/operators/detail/send_recv.proto

```diff
@@ -32,6 +32,7 @@ service SendRecvService {
 enum VarType {
   LOD_TENSOR = 0;
   SELECTED_ROWS = 1;
+  NCCL_ID = 2;
 }
 
 // NOTICE(gongwb):don't modify this proto if you are not
```
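The new NCCL_ID = 2 enumerator has to agree with the literal 2 that sendrecvop_utils.cc writes into kTypeFieldNumber and with the sendrecv::NCCL_ID comparison in variable_response.cc. A tiny sketch of that invariant (a plain C++ enum, not the generated protobuf code):

```cpp
#include <cassert>
#include <cstdint>

// Plain-C++ mirror of the proto enum; the serializer hard-codes the wire
// value 2, so it must stay equal to NCCL_ID here and in send_recv.proto.
enum VarType : std::uint64_t { LOD_TENSOR = 0, SELECTED_ROWS = 1, NCCL_ID = 2 };

int main() {
  // what WriteUint64(VarMsg::kTypeFieldNumber, 2) emits on the wire
  std::uint64_t wire_tag = 2;
  assert(static_cast<VarType>(wire_tag) == NCCL_ID);
  return 0;
}
```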
paddle/fluid/operators/detail/sendrecvop_utils.cc

```diff
@@ -43,13 +43,16 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   void* buf = buffer.get();
   void* payload = nullptr;
-  size_t payload_size;
+  size_t payload_size = 0;
   ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
   e.WriteString(VarMsg::kVarnameFieldNumber, name);
   if (var->IsType<framework::LoDTensor>()) {
     e.WriteUint64(VarMsg::kTypeFieldNumber, 0);
   } else if (var->IsType<framework::SelectedRows>()) {
     e.WriteUint64(VarMsg::kTypeFieldNumber, 1);
+  } else if (var->IsType<ncclUniqueId>()) {
+    // NOTE: sendrecv only support RAW type for NCCL_ID
+    e.WriteUint64(VarMsg::kTypeFieldNumber, 2);
   }
 
   if (!out_name.empty()) {
@@ -139,11 +142,27 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
       payload_size = tensor->numel() * framework::SizeOfType(tensor->type());
       e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
     } break;
+    case framework::proto::VarType_Type_RAW: {
+      e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
+                                NCCL_UNIQUE_ID_BYTES);
+      ncclUniqueId* uid = var->GetMutable<ncclUniqueId>();
+      e.WriteRawBytes(std::string(uid->internal, NCCL_UNIQUE_ID_BYTES));
+    } break;
     default:
       PADDLE_THROW("Serialize does not support type: %s",
                    typeid(var->Type()).name());
       break;
   }
+
+  if (framework::ToVarType(var->Type()) == framework::proto::VarType_Type_RAW) {
+    // for serialize NCCL_ID
+    ::grpc::Slice slices(e.size());
+    memcpy(const_cast<uint8_t*>(slices.begin()), e.data(), e.size());
+    ::grpc::ByteBuffer tmp(&slices, 1);
+    msg->Swap(&tmp);
+    return;
+  }
+
   // steal reference of tensor data
   ::grpc::Slice slices[4];  // metadata, tensor, rows meta, rows
   int num_slices = 2;       // only SelectedRows have rows buffer
```
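The RAW branch serializes an ncclUniqueId as a fixed-size opaque blob: it writes a length prefix of NCCL_UNIQUE_ID_BYTES followed by the raw bytes of uid->internal, then short-circuits past the tensor zero-copy path. A simplified sketch of the copy-out, with a hypothetical FakeId and kIdBytes standing in for ncclUniqueId and NCCL_UNIQUE_ID_BYTES:

```cpp
#include <cstddef>
#include <cstring>
#include <string>

// FakeId/kIdBytes are hypothetical stand-ins for ncclUniqueId and
// NCCL_UNIQUE_ID_BYTES (the real id is likewise an opaque char array).
constexpr std::size_t kIdBytes = 128;
struct FakeId { char internal[kIdBytes]; };

// Copy out exactly kIdBytes; the receiver trusts the fixed size rather
// than any terminator inside the blob, matching WriteRawBytes above.
std::string SerializeRaw(const FakeId &id) {
  return std::string(id.internal, kIdBytes);
}

int main() {
  FakeId id{};
  std::memcpy(id.internal, "unique-id-bytes", 15);
  return SerializeRaw(id).size() == kIdBytes ? 0 : 1;
}
```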
paddle/fluid/operators/detail/variable_response.cc

```diff
@@ -367,9 +367,18 @@ int VariableResponse::Parse(Source* source) {
     }
     case sendrecv::VariableMessage::kSerializedFieldNumber: {
       PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
-                      meta_.type() == sendrecv::LOD_TENSOR) &&
-                     meta_.varname() != "",
+                      meta_.type() == sendrecv::LOD_TENSOR ||
+                      meta_.type() == sendrecv::NCCL_ID) &&
+                         meta_.varname() != "",
                      "meta info should be got first!");
+
+      if (meta_.type() == sendrecv::NCCL_ID) {
+        auto* var = scope_->FindVar(meta_.varname());
+        if (var != nullptr) {
+          ncclUniqueId* id = var->GetMutable<ncclUniqueId>();
+          memcpy(id->internal, meta_.serialized().c_str(),
+                 meta_.serialized().size());
+        }
+      }
+
       int length = 0;
       if (wt != WIRETYPE_LENGTH_DELIMITED ||
```
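On the receive side, the parser memcpy's the serialized bytes straight back into the ncclUniqueId held by the destination variable. A companion sketch to the serializer above, again with hypothetical FakeId/kIdBytes stand-ins, adding the size check the fixed-size framing relies on:

```cpp
#include <cstddef>
#include <cstring>
#include <string>

// Same hypothetical stand-ins as in the serializer sketch above.
constexpr std::size_t kIdBytes = 128;
struct FakeId { char internal[kIdBytes]; };

// Inverse of SerializeRaw: copy the wire bytes back into the fixed-size
// struct, rejecting payloads of the wrong length.
bool DeserializeRaw(const std::string &wire, FakeId *id) {
  if (wire.size() != kIdBytes) return false;
  std::memcpy(id->internal, wire.data(), wire.size());
  return true;
}

int main() {
  std::string wire(kIdBytes, '\x42');  // pretend this arrived over gRPC
  FakeId id{};
  return (DeserializeRaw(wire, &id) && id.internal[0] == '\x42') ? 0 : 1;
}
```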
paddle/fluid/operators/gen_nccl_id_op.cc

```diff
@@ -54,7 +54,7 @@ class GenNCCLIdOp : public framework::OperatorBase {
     auto var = scope->FindVar("NCCLID");
     PADDLE_ENFORCE_NOT_NULL(var);
     auto id = var->GetMutable<ncclUniqueId>();
-    ncclGetUniqueId(id);
+    platform::dynload::ncclGetUniqueId(id);
 
     std::vector<std::string> endpoint_list =
         Attr<std::vector<std::string>>("endpoint_list");
@@ -120,4 +120,4 @@ For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the ser
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(gen_nccl_id_op, ops::GenNCCLIdOp, ops::GenNCCLIdOpMaker);
+REGISTER_OPERATOR(gen_nccl_id, ops::GenNCCLIdOp, ops::GenNCCLIdOpMaker);
```
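Routing the call through platform::dynload keeps NCCL a runtime dependency: the symbol is resolved from the shared library when first used instead of at link time. A minimal sketch of that pattern with dlopen/dlsym, where "libnccl.so" and the simplified signature are assumptions for illustration (the real ncclGetUniqueId takes an ncclUniqueId* and returns ncclResult_t):

```cpp
#include <dlfcn.h>
#include <cstdio>

// Resolve the NCCL symbol at runtime rather than linking against it.
using GetUniqueIdFn = int (*)(void *);  // simplified signature

int main() {
  void *handle = dlopen("libnccl.so", RTLD_LAZY);
  if (handle == nullptr) {
    std::puts("NCCL runtime not found; skipping");
    return 0;
  }
  auto fn = reinterpret_cast<GetUniqueIdFn>(dlsym(handle, "ncclGetUniqueId"));
  if (fn != nullptr) {
    std::puts("resolved ncclGetUniqueId at runtime");
    // a real caller would now pass a properly sized ncclUniqueId buffer
  }
  dlclose(handle);
  return 0;
}
```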
paddle/fluid/platform/nccl_helper.h

```diff
@@ -73,7 +73,9 @@ struct NCCLContextMap {
   std::unordered_map<int, NCCLContext> contexts_;
   std::vector<int> order_;
 
-  explicit NCCLContextMap(const std::vector<platform::Place> &places) {
+  explicit NCCLContextMap(const std::vector<platform::Place> &places,
+                          ncclUniqueId *nccl_id = nullptr,
+                          size_t node_count = 0, size_t trainer_id = 0) {
     PADDLE_ENFORCE(!places.empty());
     order_.reserve(places.size());
     for (auto &p : places) {
@@ -85,18 +87,36 @@ struct NCCLContextMap {
         order_.size(), contexts_.size(),
         "NCCL Context Map does not support contain two or more same device");
-    if (places.size() > 1) {
-      std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
+    if (places.size() <= 1) {
+      return;
+    }
+    std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
+    // if pass nccl_id here, can assume we are doing multi node training
+    if (nccl_id == nullptr) {
       {
         std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
         PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
             comms.get(), static_cast<int>(order_.size()), order_.data()));
       }
-      int i = 0;
-      for (auto &dev_id : order_) {
-        contexts_.at(dev_id).comm_ = comms[i++];
+    } else {
+      PADDLE_ENFORCE_GT(node_count, 0);
+      PADDLE_ENFORCE_EQ(node_count % places.size(), 0,
+                        "must have same number of GPUs on each node");
+      {
+        std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
+        int nranks = node_count * order_.size();
+        for (auto &gpu_id : order_) {
+          int rank = trainer_id * order_.size() + gpu_id;
+          PADDLE_ENFORCE(cudaSetDevice(gpu_id));
+          PADDLE_ENFORCE(ncclCommInitRank(comms.get() + gpu_id, nranks,
+                                          *nccl_id, rank));
+        }
       }
     }
+    int i = 0;
+    for (auto &dev_id : order_) {
+      contexts_.at(dev_id).comm_ = comms[i++];
+    }
   }
 
   NCCLContextMap(const NCCLContextMap &other) = delete;
```
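The multi-node branch derives each communicator's global rank arithmetically: nranks = node_count * gpus_per_node and rank = trainer_id * gpus_per_node + gpu_id, so GPU 2 on trainer 1 of a 2-node, 4-GPU-per-node job gets rank 6 of 8. A worked sketch with illustrative values:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// Worked example of the rank math in the multi-node branch above, with
// illustrative values: 2 nodes x 4 GPUs, run as trainer 1.
int main() {
  std::size_t node_count = 2, trainer_id = 1;
  std::vector<int> order = {0, 1, 2, 3};  // local device ids, as in order_
  int nranks = static_cast<int>(node_count * order.size());
  for (int gpu_id : order) {
    int rank = static_cast<int>(trainer_id * order.size()) + gpu_id;
    std::printf("gpu %d -> global rank %d of %d\n", gpu_id, rank, nranks);
  }
  return 0;  // prints ranks 4..7; trainer 0 would own ranks 0..3
}
```

Note that the rank is computed from the device id itself rather than a loop index, and the communicator slot is `comms.get() + gpu_id`; both appear to assume device ids run 0..n-1 on every node.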
paddle/fluid/pybind/pybind.cc

```diff
@@ -502,11 +502,13 @@ All parameter, weight, gradient are variables in Paddle.
              const std::unordered_set<std::string> &bcast_vars,
              const ProgramDesc &main_program, const std::string &loss_var_name,
              Scope *scope, std::vector<Scope *> &local_scopes,
-             bool allow_op_delay, bool use_default_grad_scale) {
+             bool allow_op_delay, bool use_default_grad_scale,
+             size_t num_trainers, size_t trainer_id) {
            new (&self) ParallelExecutor(
                num_threads, use_event, places, params, bcast_vars,
                main_program, loss_var_name, scope, local_scopes,
-               allow_op_delay, use_default_grad_scale);
+               allow_op_delay, use_default_grad_scale, num_trainers,
+               trainer_id);
          })
      .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
      // NOTE: even we return a vec<Scope*>* to Python use reference policy.
```
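The binding constructs the executor with placement new inside a lambda, so threading the two new parameters through is just a matter of extending the lambda's signature and the forwarded argument list. A self-contained sketch of that placement-new pattern, with a hypothetical Executor type in place of Paddle's actual binding:

```cpp
#include <cstddef>
#include <cstdio>
#include <new>

// Hypothetical Executor standing in for ParallelExecutor; only the
// placement-new construction pattern from the binding is shown.
struct Executor {
  Executor(bool allow_op_delay, std::size_t num_trainers,
           std::size_t trainer_id) {
    std::printf("ctor: delay=%d, %zu trainers, id %zu\n",
                static_cast<int>(allow_op_delay), num_trainers, trainer_id);
  }
};

int main() {
  // In the real binding this storage is owned by the Python object.
  alignas(Executor) unsigned char storage[sizeof(Executor)];
  auto *self = reinterpret_cast<Executor *>(storage);
  new (self) Executor(false, 2, 1);  // the `new (&self) ...` in the lambda
  self->~Executor();                 // lifetime managed by the caller
  return 0;
}
```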