Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
867c312b
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
867c312b
编写于
11月 27, 2018
作者:
G
gongweibao
提交者:
GitHub
11月 27, 2018
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix allreduce dependency order. (#14586)
上级
56a4912b
变更
7
显示空白变更内容
内联
并排
Showing
7 changed file
with
190 addition
and
8 deletion
+190
-8
paddle/fluid/framework/details/CMakeLists.txt
paddle/fluid/framework/details/CMakeLists.txt
+2
-1
paddle/fluid/framework/details/all_reduce_deps_pass.cc
paddle/fluid/framework/details/all_reduce_deps_pass.cc
+125
-0
paddle/fluid/framework/details/all_reduce_deps_pass.h
paddle/fluid/framework/details/all_reduce_deps_pass.h
+33
-0
paddle/fluid/framework/details/build_strategy.cc
paddle/fluid/framework/details/build_strategy.cc
+21
-0
paddle/fluid/framework/details/build_strategy.h
paddle/fluid/framework/details/build_strategy.h
+1
-0
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+6
-0
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+2
-7
未找到文件。
paddle/fluid/framework/details/CMakeLists.txt
浏览文件 @
867c312b
...
...
@@ -39,11 +39,12 @@ if (WITH_GPU)
endif
()
cc_library
(
sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass
)
cc_library
(
all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass
)
cc_library
(
multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle
)
set
(
SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass
)
set
(
SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass
all_reduce_deps_pass
)
if
(
WITH_GPU
)
list
(
APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass
)
endif
()
...
...
paddle/fluid/framework/details/all_reduce_deps_pass.cc
0 → 100644
浏览文件 @
867c312b
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/op_graph_view.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_proto_maker.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
static
constexpr
char
kAllOpDescs
[]
=
"all_op_descs"
;
VarHandle
*
GetValidInput
(
const
OpHandleBase
*
a
)
{
for
(
auto
p
:
a
->
Inputs
())
{
VarHandle
*
b
=
dynamic_cast
<
VarHandle
*>
(
p
);
if
(
b
)
{
return
b
;
}
}
return
nullptr
;
}
std
::
unique_ptr
<
ir
::
Graph
>
AllReduceDepsPass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
auto
graph_ops
=
ir
::
FilterByNodeWrapper
<
OpHandleBase
>
(
*
graph
);
// get vars order
int
order
=
0
;
std
::
unordered_map
<
std
::
string
,
int
>
vars
;
// TODO(gongwb): use graph topology sort to find the order of operators.
// Note that must assert topology sort is stable
auto
&
ops
=
Get
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
);
for
(
auto
*
op_desc
:
ops
)
{
auto
outputs
=
op_desc
->
Outputs
();
for
(
auto
&
o_it
:
outputs
)
{
for
(
auto
&
v
:
o_it
.
second
)
{
// values
vars
[
v
]
=
order
;
}
}
order
++
;
}
std
::
vector
<
OpHandleBase
*>
dist_ops
;
// get allreduce ops.
for
(
auto
&
op
:
graph_ops
)
{
// FIXME(gongwb):add broad cast.
if
(
op
->
Name
()
==
"all_reduce"
||
op
->
Name
()
==
"reduce"
)
{
dist_ops
.
push_back
(
op
);
}
}
VLOG
(
10
)
<<
"dist_ops size:"
<<
dist_ops
.
size
()
<<
std
::
endl
;
std
::
sort
(
dist_ops
.
begin
(),
dist_ops
.
end
(),
[
&
](
OpHandleBase
*
op1
,
OpHandleBase
*
op2
)
{
VarHandle
*
i0
=
dynamic_cast
<
VarHandle
*>
(
GetValidInput
(
op1
));
VarHandle
*
i1
=
dynamic_cast
<
VarHandle
*>
(
GetValidInput
(
op2
));
PADDLE_ENFORCE
(
i0
!=
nullptr
&&
i1
!=
nullptr
,
"%s convert to %s error"
,
op1
->
DebugString
(),
op2
->
DebugString
());
auto
l_it
=
vars
.
find
(
i0
->
name_
);
auto
r_it
=
vars
.
find
(
i1
->
name_
);
if
(
l_it
->
second
<
r_it
->
second
)
return
true
;
if
(
l_it
->
second
==
r_it
->
second
)
{
return
i0
->
name_
<
i1
->
name_
;
}
return
false
;
});
// add dependency.
auto
&
sorted_ops
=
dist_ops
;
for
(
size_t
i
=
1
;
i
<
sorted_ops
.
size
();
++
i
)
{
auto
*
dep_var
=
new
DummyVarHandle
(
graph
->
CreateControlDepVar
());
auto
*
pre_op
=
sorted_ops
[
i
-
1
];
auto
*
op
=
sorted_ops
[
i
];
pre_op
->
AddOutput
(
dep_var
);
op
->
AddInput
(
dep_var
);
graph
->
Get
<
GraphDepVars
>
(
kGraphDepVars
).
emplace
(
dep_var
);
VLOG
(
10
)
<<
"add all_reduce sequential dependencies between "
<<
pre_op
<<
" and "
<<
op
;
VLOG
(
10
)
<<
"pre_op:"
<<
pre_op
->
DebugString
()
<<
", op:"
<<
op
->
DebugString
();
}
return
graph
;
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
REGISTER_PASS
(
all_reduce_deps_pass
,
paddle
::
framework
::
details
::
AllReduceDepsPass
)
.
RequirePassAttr
(
paddle
::
framework
::
details
::
kAllOpDescs
);
paddle/fluid/framework/details/all_reduce_deps_pass.h
0 → 100644
浏览文件 @
867c312b
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
// TODO(gongwb): overlap allreduce with backward computation.
class
AllReduceDepsPass
:
public
ir
::
Pass
{
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
override
;
};
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/build_strategy.cc
浏览文件 @
867c312b
...
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/sequential_execution_pass.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
...
...
@@ -24,6 +25,10 @@ namespace paddle {
namespace
framework
{
namespace
details
{
static
inline
bool
SeqOnlyAllReduceOps
(
const
BuildStrategy
&
strategy
)
{
return
(
!
strategy
.
enable_sequential_execution_
&&
strategy
.
num_trainers_
>
1
);
}
class
ParallelExecutorPassBuilder
:
public
ir
::
PassBuilder
{
public:
explicit
ParallelExecutorPassBuilder
(
const
BuildStrategy
&
strategy
)
...
...
@@ -70,6 +75,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// Verify that the graph is correct for multi-device executor.
AppendPass
(
"multi_devices_check_pass"
);
if
(
SeqOnlyAllReduceOps
(
strategy
))
{
AppendPass
(
"all_reduce_deps_pass"
);
}
if
(
strategy_
.
remove_unnecessary_lock_
)
{
AppendPass
(
"modify_op_lock_and_record_event_pass"
);
}
...
...
@@ -124,6 +133,17 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass
->
SetNotOwned
<
platform
::
NCCLContextMap
>
(
"nccl_ctxs"
,
nctx
);
#endif
}
else
if
(
pass
->
Type
()
==
"sequential_execution_pass"
)
{
VLOG
(
1
)
<<
"set enable_sequential_execution:"
<<
enable_sequential_execution_
;
pass
->
Erase
(
kAllOpDescs
);
pass
->
Set
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
new
std
::
vector
<
OpDesc
*>
(
main_program
.
Block
(
0
).
AllOps
()));
}
else
if
(
pass
->
Type
()
==
"all_reduce_deps_pass"
)
{
VLOG
(
1
)
<<
"SeqOnlyAllReduceOps:"
<<
SeqOnlyAllReduceOps
(
*
this
)
<<
", num_trainers:"
<<
num_trainers_
;
pass
->
Erase
(
kAllOpDescs
);
pass
->
Set
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
...
...
@@ -144,4 +164,5 @@ USE_PASS(multi_devices_pass);
USE_PASS
(
multi_devices_check_pass
);
USE_PASS
(
multi_devices_print_pass
);
USE_PASS
(
sequential_execution_pass
);
USE_PASS
(
all_reduce_deps_pass
);
USE_PASS
(
modify_op_lock_and_record_event_pass
);
paddle/fluid/framework/details/build_strategy.h
浏览文件 @
867c312b
...
...
@@ -73,6 +73,7 @@ struct BuildStrategy {
bool
fuse_broadcast_op_
{
false
};
int
num_trainers_
{
1
};
bool
remove_unnecessary_lock_
{
false
};
// NOTE:
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
867c312b
...
...
@@ -860,6 +860,12 @@ All parameter, weight, gradient are variables in Paddle.
self
.
remove_unnecessary_lock_
=
b
;
},
R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC"
)
.
def_property
(
"num_trainers"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
num_trainers_
;
},
[](
BuildStrategy
&
self
,
int
num_trainers
)
{
self
.
num_trainers_
=
num_trainers
;
})
.
def_property
(
"fuse_elewise_add_act_ops"
,
[](
const
BuildStrategy
&
self
)
{
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
867c312b
...
...
@@ -124,16 +124,11 @@ class ParallelExecutor(object):
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
exec_strategy
.
num_threads
=
cpu_num
*
2
# Set 1 thread num under nccl2 distribute
# env to make sure all gpus run ops in same order.
if
num_trainers
>
1
:
assert
(
use_cuda
)
# FIXME(gongwb): avoid this set.
exec_strategy
.
num_threads
=
1
if
build_strategy
is
None
:
build_strategy
=
BuildStrategy
()
build_strategy
.
num_trainers
=
num_trainers
main
=
main_program
main
=
main
if
main
else
framework
.
default_main_program
()
if
scope
==
None
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录