BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit 35744e7b
Authored Mar 15, 2018 by Yu Yang

Polish code

Parent: 22bb262a

Showing 3 changed files with 82 additions and 22 deletions:

paddle/fluid/framework/parallel_executor.cc                     +79  -21
paddle/fluid/framework/parallel_executor.h                       +2   -0
python/paddle/fluid/tests/unittests/test_parallel_executor.py    +1   -1
paddle/fluid/framework/parallel_executor.cc
@@ -20,6 +20,12 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+#ifdef PADDLE_WITH_CUDA
+// FIXME: CHECK the return value of x;
+#define NCCL_INVOKE(x) x
+#endif
+
 struct OpHandle;
 
 struct VarHandle {
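The FIXME above notes that NCCL_INVOKE should check the return value of the call it wraps. A minimal sketch of what such a check could look like, assuming Paddle's dynload wrapper also exposes ncclGetErrorString (this is not part of the commit):

#define NCCL_INVOKE(x)                                                \
  do {                                                                \
    ncclResult_t result = (x); /* every NCCL call returns this */     \
    PADDLE_ENFORCE(result == ncclSuccess, "NCCL error: %s",           \
                   platform::dynload::ncclGetErrorString(result));    \
  } while (0)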
@@ -71,9 +77,51 @@ class ParallelExecutorPrivate {
   std::unordered_map<platform::Place, Scope *, platform::PlaceHash>
       local_scopes_;
-  std::unordered_map<platform::Place, platform::CUDADeviceContext,
-                     platform::PlaceHash>
-      dev_ctxs_;
+
+#ifdef PADDLE_WITH_CUDA
+  struct NCCLContext {
+    std::unique_ptr<platform::CUDADeviceContext> ctx_;
+    ncclComm_t comm;
+
+    explicit NCCLContext(int dev_id) {
+      ctx_.reset(new platform::CUDADeviceContext(platform::CUDAPlace(dev_id)));
+    }
+
+    cudaStream_t stream() const { return ctx_->stream(); }
+
+    int device_id() const {
+      return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
+    }
+
+    static void InitNCCLContext(std::map<int, NCCLContext> &contexts) {
+      std::vector<ncclComm_t> comms;
+      std::vector<int> devs;
+      comms.resize(contexts.size());
+      devs.reserve(contexts.size());
+      for (auto &ctx : contexts) {
+        devs.push_back(ctx.first);
+      }
+
+      NCCL_INVOKE(platform::dynload::ncclCommInitAll(
+          &comms[0], static_cast<int>(contexts.size()), &devs[0]));
+
+      int i = 0;
+      for (auto &ctx : contexts) {
+        ctx.second.comm = comms[i++];
+      }
+    }
+  };
+
+  std::map<int, NCCLContext> communication_streams_;
+
+  NCCLContext &GetNCCLCtx(platform::Place p) {
+    int dev_id = boost::get<platform::CUDAPlace>(p).device;
+    return communication_streams_.at(dev_id);
+  }
+
+#endif
+
   platform::Place main_place_;
 
   std::unordered_map<platform::Place,
                      std::unordered_map<platform::Place,
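InitNCCLContext builds one communicator per device id in a single ncclCommInitAll call, so comms[i] belongs to devs[i] and its rank in the clique is i. A self-contained sketch of that pattern outside Paddle (the device ids here are an assumption for illustration):

#include <vector>
#include <nccl.h>

int main() {
  // One entry per participating GPU, mirroring the keys of `contexts`.
  std::vector<int> devs = {0, 1};
  std::vector<ncclComm_t> comms(devs.size());

  // Creates a clique of communicators in one call; comms[i] is bound to
  // device devs[i], and its rank inside the clique is i.
  if (ncclCommInitAll(comms.data(), static_cast<int>(devs.size()),
                      devs.data()) != ncclSuccess) {
    return 1;
  }
  for (auto &comm : comms) ncclCommDestroy(comm);
  return 0;
}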
@@ -107,8 +155,10 @@ ParallelExecutor::ParallelExecutor(
   member_->main_place_ = places[0];
 
   // Bcast Parameters to all GPUs
-  if (platform::is_gpu_place(member_->main_place_)) {  // Is CUDA
-    // BCastParamsToGPUs(startup_program);
+  if (platform::is_gpu_place(member_->main_place_) &&
+      member_->local_scopes_.size() != 1) {  // Is CUDA
+    BuildNCCLCommunicator();
+    BCastParamsToGPUs(startup_program);
   }
 
   // Startup Program has been run. All local scopes has correct parameters.
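The guard now also requires more than one local scope: with a single place there is nothing to broadcast, so building an NCCL communicator would be wasted work. When it does fire, BuildNCCLCommunicator() must run first so that BCastParamsToGPUs() can look up a per-device NCCLContext via GetNCCLCtx().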
@@ -241,20 +291,20 @@ VarHandle *ParallelExecutor::GetVarHandle(const std::string &each_var_name,
 void ParallelExecutor::BCastParamsToGPUs(
     const ProgramDesc &startup_program) const {
 #ifdef PADDLE_WITH_CUDA
   auto *main_scope = member_->local_scopes_[member_->main_place_];
 
   for (auto *var_desc : startup_program.Block(0).AllVars()) {
     if (var_desc->GetType() == proto::VarType::LOD_TENSOR) {
       auto &main_tensor =
           main_scope->FindVar(var_desc->Name())->Get<LoDTensor>();
       ncclDataType_t data_type = ToNCCLDataType(main_tensor.type());
       auto &dims = main_tensor.dims();
       size_t numel = main_tensor.numel();
 
-      std::vector<std::pair<void *, const platform::DeviceContext *>> mems;
-      mems.emplace_back(
-          const_cast<void *>(main_tensor.data<void>()),
-          new platform::CUDADeviceContext(
-              boost::get<platform::CUDAPlace>(member_->main_place_)));
+      std::vector<std::pair<void *, ParallelExecutorPrivate::NCCLContext *>>
+          mems;
+      mems.emplace_back(const_cast<void *>(main_tensor.data<void>()),
+                        &member_->GetNCCLCtx(member_->main_place_));
 
       for (auto &pair : member_->local_scopes_) {
         if (pair.first == member_->main_place_) {
@@ -265,8 +315,7 @@ void ParallelExecutor::BCastParamsToGPUs(
         auto *t = local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
         t->Resize(dims);
         mems.emplace_back(t->mutable_data(pair.first, main_tensor.type()),
-                          new platform::CUDADeviceContext(
-                              boost::get<platform::CUDAPlace>(pair.first)));
+                          &member_->GetNCCLCtx(member_->main_place_));
       }
 
       // TODO(yy): Invoke ncclBCast here. mems, numel, data_type. The mems[0]
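The TODO sketches the eventual broadcast: mems[0] holds the main place's buffer and acts as the root, and each (buffer, NCCLContext) pair is enqueued on its own communicator and stream. A hedged sketch of that call, assuming root rank 0 and that the dynload wrapper exposes ncclGroupStart/ncclGroupEnd and ncclBcast (none of this is in the commit):

      platform::dynload::ncclGroupStart();
      for (auto &pair : mems) {
        // pair.first is the device buffer; pair.second is its NCCLContext.
        platform::dynload::ncclBcast(pair.first, numel, data_type,
                                     0 /* root rank: mems[0] */,
                                     pair.second->comm,
                                     pair.second->stream());
      }
      platform::dynload::ncclGroupEnd();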
@@ -274,17 +323,26 @@ void ParallelExecutor::BCastParamsToGPUs(
       (void)(data_type);
       (void)(numel);
 
-      // Free Communication Ctx
-      for (auto &pair : mems) {
-        delete pair.second;
-      }
+      // Release Communication Ctx
+
+      // FIXME: Store CUDA DevCtx to member. Since NCCL All Reduce will use
+      // this
     }
   }
 #else
   PADDLE_THROW("Not compiled with CUDA");
 #endif
 }
 
+void ParallelExecutor::BuildNCCLCommunicator() const {
+#ifdef PADDLE_WITH_CUDA
+  for (auto &place_pair : member_->local_scopes_) {
+    auto place = place_pair.first;
+    int dev_id = boost::get<platform::CUDAPlace>(place).device;
+
+    member_->communication_streams_.emplace(
+        dev_id, ParallelExecutorPrivate::NCCLContext(dev_id));
+  }
+
+  ParallelExecutorPrivate::NCCLContext::InitNCCLContext(
+      member_->communication_streams_);
+#endif
+}
+
 std::vector<LoDTensor> ParallelExecutor::Run(
paddle/fluid/framework/parallel_executor.h
@@ -55,6 +55,8 @@ class ParallelExecutor {
   void ConstructDependencyGraph(const std::unordered_set<std::string>& params,
                                 const ProgramDesc& main_program,
                                 const std::string& loss_var_name) const;
 
+  void BuildNCCLCommunicator() const;
+
 };
 
 }  // namespace framework
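Note that BuildNCCLCommunicator() is declared const yet fills member_->communication_streams_; this is legal C++ because member_ is a pointer (the pimpl idiom), so a const method may still mutate the object it points to.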
python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -35,7 +35,7 @@ class ParallelExecutor(unittest.TestCase):
         adam = fluid.optimizer.Adam()
         adam.minimize(loss)
         act_places = []
-        for each in [fluid.CUDAPlace(0), fluid.CUDAPlace(1)]:
+        for each in [fluid.CUDAPlace(0)]:
             p = fluid.core.Place()
             p.set_place(each)
             act_places.append(p)