PaddlePaddle / Paddle

Commit fd3aad6c
Authored on Jul 23, 2019 by chengduo; committed via GitHub on Jul 23, 2019.
Make fuse_optimizer_op_pass also work when the model contains sparse gradients. (#18664)
* support sparse gradients test=develop
Parent: 6b78e00d

Showing 26 changed files with 693 additions and 474 deletions.
paddle/fluid/framework/details/CMakeLists.txt                                  +1    -1
paddle/fluid/framework/details/build_strategy.cc                               +28   -23
paddle/fluid/framework/details/fused_all_reduce_op_handle.cc                   +4    -16
paddle/fluid/framework/details/multi_devices_helper.h                          +6    -2
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc            +25   -0
paddle/fluid/framework/ir/CMakeLists.txt                                       +1    -1
paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc                         +150  -159
paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h                          +0    -0
paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc    +165  -137
paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h     +16   -13
paddle/fluid/framework/ir/graph_helper.cc                                      +0    -2
paddle/fluid/framework/ir/graph_helper.h                                       +4    -0
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc  +3    -3
paddle/fluid/framework/ir/node.h                                               +1    -1
paddle/fluid/framework/ir/pass.cc                                              +0    -4
paddle/fluid/operators/CMakeLists.txt                                          +1    -0
paddle/fluid/operators/coalesce_tensor_op.cc                                   +24   -39
paddle/fluid/platform/CMakeLists.txt                                           +4    -4
paddle/fluid/platform/device_memory_aligment.cc                                +34   -0
paddle/fluid/platform/device_memory_aligment.h                                 +27   -0
paddle/fluid/pybind/pybind.cc                                                  +1    -1
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py             +6    -0
python/paddle/fluid/tests/unittests/simple_nets.py                             +28   -0
python/paddle/fluid/tests/unittests/test_coalesce_tensor_op.py                 +4    -5
python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py               +83   -32
python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py                +77   -31
paddle/fluid/framework/details/CMakeLists.txt
@@ -95,5 +95,5 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
         fuse_elewise_add_act_pass multi_batch_merge_pass
         fuse_relu_depthwise_conv_pass
         memory_optimize_pass lock_free_optimize_pass
-        alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
+        coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
        fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass record_skip_memory_opt_vars_pass)
paddle/fluid/framework/details/build_strategy.cc
@@ -76,6 +76,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     PADDLE_ENFORCE(!FLAGS_use_mkldnn,
                    "Please compile with MKLDNN first to use MKLDNN");
 #endif
     if (strategy_.enable_sequential_execution_) {
       VLOG(1) << "Add sequential_execution_pass";
       AppendPass("sequential_execution_pass");
@@ -108,30 +109,34 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     }

     // for single card training, fuse_all_reduce_ops is unnecessary.
-    // alloc_continuous_space_for_grad_pass should be before of MultiDevPass.
+    // coalesce_grad_tensor_pass should be before of MultiDevPass.
     if (strategy_.fuse_all_reduce_ops_) {
-      VLOG(1) << "Add alloc_continuous_space_for_grad_pass";
-      AppendPass("alloc_continuous_space_for_grad_pass");
+      VLOG(1) << "Add coalesce_grad_tensor_pass";
+      AppendPass("coalesce_grad_tensor_pass");
     }

     // Fuse all the optimization operators.
-    if (strategy_.is_distribution_) {
-      VLOG(3) << "Currently, fuse_all_optimizer_ops only works under "
-                 "Non-distributed mode.";
-      strategy_.fuse_all_optimizer_ops_ = false;
-    }
-    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce ||
-        strategy_.is_distribution_) {
-      VLOG(3) << "Currently, fuse_all_optimizer_ops only works under AllReduce "
-                 "mode.";
-      strategy_.fuse_all_optimizer_ops_ = false;
-    }
     if (strategy_.fuse_all_optimizer_ops_) {
+      if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce ||
+          strategy_.is_distribution_) {
+        VLOG(3)
+            << "Currently, fuse_all_optimizer_ops only works under AllReduce "
+               "mode.";
+        strategy_.fuse_all_optimizer_ops_ = false;
+      } else {
         // NOTE: fuse_all_xx_ops will count the number of xx operator first,
         // if the number is zero, fuse_all_reduce_ops will do nothing.
         // Currently, only one type of optimization algorithm can be fused.
         VLOG(1) << "Add fuse_adam_op_pass";
         AppendPass("fuse_adam_op_pass");
         VLOG(1) << "Add fuse_sgd_op_pass";
         AppendPass("fuse_sgd_op_pass");
         VLOG(1) << "Add fuse_momentum_op_pass";
         AppendPass("fuse_momentum_op_pass");
+      }
     }

     // Add a graph viz pass to record a graph.
@@ -301,7 +306,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       pass->Erase(kNCCLCtxs);
       pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
 #endif
-    } else if (pass->Type() == "alloc_continuous_space_for_grad_pass" ||
+    } else if (pass->Type() == "coalesce_grad_tensor_pass" ||
                pass->Type() == "fuse_adam_op_pass" ||
                pass->Type() == "fuse_sgd_op_pass" ||
                pass->Type() == "fuse_momentum_op_pass" ||
@@ -321,7 +326,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                            new bool(use_hierarchical_allreduce_));
 #endif
       }
-    } else if (pass->Type() == "alloc_continuous_space_for_grad_pass") {
+    } else if (pass->Type() == "coalesce_grad_tensor_pass") {
       pass->Erase(kPlaces);
       pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
       pass->Erase(kLocalScopes);
@@ -389,7 +394,7 @@ USE_PASS(backward_optimizer_op_deps_pass);
 USE_PASS(modify_op_lock_and_record_event_pass);
 USE_PASS(inplace_pass);
 USE_PASS(lock_free_optimize_pass);
-USE_PASS(alloc_continuous_space_for_grad_pass);
+USE_PASS(coalesce_grad_tensor_pass);
 USE_PASS(graph_to_program_pass);
 USE_PASS(fuse_adam_op_pass);
 USE_PASS(fuse_sgd_op_pass);
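The build_strategy.cc hunks above amount to strategy-driven pass scheduling: coalesce_grad_tensor_pass must be appended before the multi-device passes, and optimizer fusion is silently disabled outside non-distributed AllReduce mode by resetting the flag instead of appending passes. A minimal standalone sketch of that control flow, with simplified stand-in types (Strategy and BuildPipeline here are illustrative, not Paddle's API):

#include <iostream>
#include <string>
#include <vector>

// Simplified stand-in for BuildStrategy (the real class lives in
// paddle/fluid/framework/details/build_strategy.h).
struct Strategy {
  bool fuse_all_reduce_ops = true;
  bool fuse_all_optimizer_ops = true;
  bool use_reduce_mode = false;  // stands in for reduce_ == kReduce
  bool is_distribution = false;
};

// Append pass names in dependency order, resetting flags whose
// preconditions do not hold, the same shape as the hunk above.
std::vector<std::string> BuildPipeline(Strategy *s) {
  std::vector<std::string> passes;
  // coalesce_grad_tensor_pass must run before the multi-device passes.
  if (s->fuse_all_reduce_ops) {
    passes.push_back("coalesce_grad_tensor_pass");
  }
  if (s->fuse_all_optimizer_ops) {
    if (s->use_reduce_mode || s->is_distribution) {
      // Optimizer fusion currently only works in non-distributed
      // AllReduce mode, so the flag is reset and no fusion pass is added.
      s->fuse_all_optimizer_ops = false;
    } else {
      passes.push_back("fuse_adam_op_pass");
      passes.push_back("fuse_sgd_op_pass");
      passes.push_back("fuse_momentum_op_pass");
    }
  }
  return passes;
}

int main() {
  Strategy s;
  for (const auto &name : BuildPipeline(&s)) std::cout << name << "\n";
  return 0;
}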
paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -17,6 +17,7 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/device_memory_aligment.h"
 #include "paddle/fluid/platform/profiler.h"

 DEFINE_bool(skip_fused_all_reduce_check, false, "");
@@ -24,19 +25,6 @@ namespace paddle {
 namespace framework {
 namespace details {

-// Note(zcd): Addresses should be aligned, otherwise, the results may have
-// diff.
-static size_t Alignment(size_t size, const platform::Place &place) {
-  // Allow to allocate the minimum chunk size is 4 KB.
-  size_t alignment = 1 << 12;
-  if (platform::is_gpu_place(place)) {
-    // Allow to allocate the minimum chunk size is 256 B.
-    alignment = 1 << 8;
-  }
-  size_t remaining = size % alignment;
-  return remaining == 0 ? size : size + (alignment - remaining);
-}
-
 typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
     GradientAndLoDTensor;
@@ -121,7 +109,7 @@ void FusedAllReduceOpHandle::RunImpl() {
     for (size_t k = 1; k < g_tensor.size(); ++k) {
       const void *cur_address = g_tensor.at(k - 1).second->data<void>();
       int64_t len = g_tensor.at(k - 1).second->numel();
-      auto offset = Alignment(len * size_of_dtype, places_[0]);
+      auto offset = platform::Alignment(len * size_of_dtype, places_[0]);
       void *infer_next_address = reinterpret_cast<void *>(
           reinterpret_cast<uintptr_t>(cur_address) + offset);
       const void *next_address = g_tensor.at(k).second->data<void>();
@@ -241,8 +229,8 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
     // Get element number
     int64_t len = grad_tensor.at(i).second->numel();
     PADDLE_ENFORCE_GT(len, 0);
-    // Alignment(len)
-    *numel += Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
+    *numel +=
+        platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
   }
 }
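The RunImpl hunk relies on one alignment rule: a tensor occupying len * size_of_dtype bytes is expected to be followed, in the fused buffer, by a neighbor whose address is that size rounded up to the device's minimum chunk. A self-contained sketch of that inference (the 256-byte GPU alignment and the addresses are illustrative assumptions):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Round `size` bytes up to a multiple of `alignment`, the same rule as
// platform::Alignment introduced by this commit.
static size_t AlignUp(size_t size, size_t alignment) {
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
}

int main() {
  const size_t kGpuAlignment = 1 << 8;  // 256 B minimum GPU chunk (assumed)
  // A float32 tensor of 1000 elements occupies 4000 bytes, but the next
  // tensor in the fused buffer is expected to start 4096 bytes later.
  size_t len_bytes = 1000 * sizeof(float);
  uintptr_t cur_address = 0x10000;  // hypothetical base address
  uintptr_t inferred_next = cur_address + AlignUp(len_bytes, kGpuAlignment);
  std::printf("inferred next address: %#lx\n",
              static_cast<unsigned long>(inferred_next));
  return 0;
}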
paddle/fluid/framework/details/multi_devices_helper.h
@@ -62,11 +62,15 @@ typedef std::vector<std::string> FusedGrads;
 constexpr char kFusedGrads[] = "fused_gradients";

 typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
 constexpr char kParamsAndGrads[] = "params_grads";
+constexpr char kParamsAndDenseGrads[] = "params_and_dense_grads";
+constexpr char kParamsAndSparseGrads[] = "params_and_sparse_grads";
+
+typedef std::vector<ProgramDesc> ProgramDescs;
+constexpr char kProgramDescs[] = "program_descs";

 typedef std::vector<std::vector<std::pair<std::string, std::string>>>
     GroupParamsAndGrads;
-constexpr char kGroupParamsAndGrads[] = "group_params_grads";
+constexpr char kGroupParamsAndDenseGrads[] = "group_params_dense_grads";

 }  // namespace details
 }  // namespace framework
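These constexpr names are keys for typed attributes that one pass attaches to the graph and a later consumer reads back; the new kProgramDescs slot, for example, carries initialization programs from the fuse pass to the executor. A toy model of that pattern (C++17; the Graph class here is a simplified stand-in for ir::Graph's attribute storage, not Paddle's implementation):

#include <any>
#include <cassert>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

constexpr char kProgramDescs[] = "program_descs";

// Toy version of ir::Graph's named, typed attribute slots: Set() stores a
// value under a well-known constexpr name, Get<T>() retrieves it by type.
class Graph {
 public:
  template <typename T>
  void Set(const std::string &name, T value) {
    attrs_[name] = std::move(value);
  }
  template <typename T>
  T &Get(const std::string &name) {
    return std::any_cast<T &>(attrs_.at(name));
  }
  bool Has(const std::string &name) const { return attrs_.count(name) != 0; }

 private:
  std::unordered_map<std::string, std::any> attrs_;
};

int main() {
  Graph g;
  g.Set(kProgramDescs, std::vector<std::string>{});  // one pass attaches it
  g.Get<std::vector<std::string>>(kProgramDescs).push_back("grad_init_prog");
  assert(g.Has(kProgramDescs));  // a later consumer finds it by name
  return 0;
}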
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -17,6 +17,8 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -70,6 +72,29 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() {
       InitializeVariable(pair.first, pair.second);
     }
   }
+
+  const ir::Graph &graph = Graph();
+  if (graph.Has(details::kProgramDescs)) {
+    auto &program_descs =
+        graph.Get<details::ProgramDescs>(details::kProgramDescs);
+    // Init vars
+    auto &fused_grad_vars = graph.Get<details::FusedVars>(details::kFusedVars);
+    for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
+      for (auto &var_name : fused_grad_vars) {
+        auto var = local_exec_scopes_[i]->Var(var_name);
+        var->GetMutable<LoDTensor>();
+      }
+    }
+
+    for (auto &program_desc : program_descs) {
+      for (auto &op_desc : program_desc.Block(0).AllOps()) {
+        for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
+          auto op = OpRegistry::CreateOp(*op_desc);
+          op->Run(*local_exec_scopes_[i], places_[i]);
+        }
+      }
+    }
+  }
 }

 void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() {
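The new InitVariables logic replays every init program recorded in the graph once per device-local scope, so the fused gradient buffers exist before the first iteration runs. A rough sketch of that replay loop with simplified stand-in types (Scope, OpDesc, and run_op are illustrative, not Paddle's API):

#include <functional>
#include <string>
#include <vector>

// Simplified stand-ins: each "program" is a list of op descriptions, and
// run_op is whatever executes one op inside one scope.
struct Scope {};
struct OpDesc { std::string type; };
using Program = std::vector<OpDesc>;

void ReplayInitPrograms(
    const std::vector<Program> &programs, std::vector<Scope> *local_scopes,
    const std::function<void(const OpDesc &, Scope *)> &run_op) {
  // Every recorded init op runs once in every device-local scope, mirroring
  // how the executor above creates and runs ops per local_exec_scope.
  for (const auto &program : programs) {
    for (const auto &op_desc : program) {
      for (auto &scope : *local_scopes) {
        run_op(op_desc, &scope);
      }
    }
  }
}

int main() {
  std::vector<Program> programs = {{OpDesc{"coalesce_tensor"}}};
  std::vector<Scope> scopes(2);  // e.g. two devices
  ReplayInitPrograms(programs, &scopes,
                     [](const OpDesc &op, Scope *) { (void)op; });
  return 0;
}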
paddle/fluid/framework/ir/CMakeLists.txt
@@ -45,7 +45,7 @@ cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
 cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
 cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
-cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper)
+cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper)
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc → paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc
(This diff is collapsed in this view.)

paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.h → paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h
(File moved without changes.)
paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
@@ -25,9 +25,6 @@ namespace ir {
 void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
   ir::Graph &result = *graph;

-  auto &places = Get<const std::vector<platform::Place>>(details::kPlaces);
-  auto &local_scopes = Get<const std::vector<Scope *>>(details::kLocalScopes);
-
   const std::string fuse_op_type = GetOpType();
   std::vector<std::string> aux_var_names = GetAuxiliaryVarNames();
   aux_var_names.emplace_back(kParam);
@@ -35,70 +32,91 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
   // Step 1: Get the specified op and auxiliary variables.
   std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
-  std::unordered_map<std::string, std::vector<std::string>> aux_var_set;
-  std::vector<ir::Node *> opt_ops;
+  auto vars_info = GetVarInfo(result);
+  std::vector<ir::Node *> opt_nodes;
+  size_t opt_ops_num = 0;
+  // Note: Only take care about the dense gradients.
   for (auto &node : topo_nodes) {
-    GetSpecifiedOpsAndVars(fuse_op_type, aux_var_names, node, &opt_ops,
-                           &aux_var_set);
-  }
-
-  VLOG(6) << "Find " << fuse_op_type << " operators: " << opt_ops.size();
-  if (opt_ops.size() == 0) {
-    return;
-  }
-
-  if (result.Has(details::kFusedOptType)) {
-    VLOG(6) << "Currently only support fusing one type optimizer op. Has fused "
-            << result.Get<details::FusedOptType>(details::kFusedOptType);
+    if (node->Op()->Type() == fuse_op_type) {
+      auto grad_name = node->Op()->Input(kGrad);
+      PADDLE_ENFORCE_EQ(grad_name.size(), static_cast<size_t>(1));
+      if (IsLoDTensorType(GetTypeOfVar(vars_info, grad_name[0]))) {
+        opt_nodes.emplace_back(node);
+      }
+      ++opt_ops_num;
+    }
+  }
+
+  VLOG(6) << "Find " << fuse_op_type << " operators : " << opt_ops_num
+          << ", and " << opt_nodes.size() << " for dense gradients ";
+  if (opt_nodes.size() == 0 || result.Has(details::kFusedOptType)) {
+    if (result.Has(details::kFusedOptType)) {
+      auto &opt_type =
+          result.Get<details::FusedOptType>(details::kFusedOptType);
+      VLOG(6) << "Currently only support fusing one type optimizer op. "
+                 "Has fused " << opt_type;
+    }
     return;
-  } else {
-    result.Set(details::kFusedOptType, new details::FusedOptType);
   }
+
+  result.Set(details::kFusedOptType, new details::FusedOptType);
   result.Get<details::FusedOptType>(details::kFusedOptType) = fuse_op_type;
+  if (!result.Has(details::kProgramDescs)) {
+    result.Set(details::kProgramDescs, new details::ProgramDescs);
+  }

   // Step 2: Insert fused_var_name to FusedVars, and the FusedVars need be
   // initialized in scopes before execution.
+  if (!result.Has(details::kFusedVars)) {
+    result.Set(details::kFusedVars, new details::FusedVars);
+  }
+  std::unordered_map<std::string, std::vector<std::string>> aux_var_set;
+  GetSpecifiedOpsAndVars(aux_var_names, opt_nodes, &aux_var_set);
   std::unordered_map<std::string, std::string> fused_vars_name;
   fused_vars_name.reserve(aux_var_names.size());
   auto &fused_var_set = result.Get<details::FusedVars>(details::kFusedVars);
   const std::string prefix(details::kFusedVarNamePrefix);
-  // NOTE: the fused_var_name should be unique.
   for (auto &var_name : aux_var_names) {
+    // NOTE: the fused_var_name should be unique.
     auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" +
                           aux_var_set[var_name][0];
     VLOG(6) << var_name << ": " << fused_var_name;
-    fused_vars_name.emplace(var_name, fused_var_name);
     PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0);
     fused_var_set.insert(fused_var_name);
+    fused_vars_name.emplace(var_name, fused_var_name);
   }

   // Step 3: Get the fused Gradient's name
-  if (result.Has(details::kParamsAndGrads)) {
-    auto &params_grads =
-        result.Get<details::ParamsAndGrads>(details::kParamsAndGrads);
-    PADDLE_ENFORCE_EQ(
-        params_grads.size(), aux_var_set.at(kGrad).size(),
-        "The number of gradients and optimizer ops is not equal.");
-    std::unordered_set<std::string> opt_grad_set(aux_var_set.at(kGrad).begin(),
-                                                 aux_var_set.at(kGrad).end());
-    size_t same_grad_num = 0;
-    for (auto &p_g : params_grads) {
-      if (opt_grad_set.count(p_g.second)) {
-        ++same_grad_num;
-      }
-    }
-
-    // NOTE(zcd): the gradient of kParamsAndGrads may be different with the
-    // kGrad.
-    if (same_grad_num == aux_var_set.at(kGrad).size()) {
+  bool grad_fused = false;
+  if (result.Has(details::kParamsAndDenseGrads)) {
+    // NOTE: kParamsAndDenseGrads is generated by
+    // alloc_continue_space_for_grad_pass
+    auto &params_and_dense_grads =
+        result.Get<details::ParamsAndGrads>(details::kParamsAndDenseGrads);
+    PADDLE_ENFORCE_LE(
+        params_and_dense_grads.size(), aux_var_set.at(kGrad).size(),
+        "The number of dense gradients should be little than optimizer ops.");
+
+    std::unordered_set<std::string> opt_grad_set(aux_var_set.at(kGrad).size());
+    for (auto &p_g : params_and_dense_grads) {
+      opt_grad_set.insert(p_g.second);
+    }
+    std::vector<size_t> new_grad_idx;
+    for (size_t idx = 0; idx < aux_var_set.at(kGrad).size(); ++idx) {
+      auto &grad = aux_var_set.at(kGrad).at(idx);
+      if (!opt_grad_set.count(grad)) {
+        new_grad_idx.emplace_back(idx);
+      }
+    }
+
+    // NOTE(zcd): the gradient of kParamsAndDenseGrads may be different
+    // with the kGrad. The gradients of kParamsAndDenseGrads is
+    // collected during backward stage, but in optimization state, the
+    // some gradient's name maybe changed.
+    if (new_grad_idx.size() == 0) {
       if (!result.Has(details::kFusedGrads)) {
         PADDLE_THROW(
-            "The alloc_continuous_space_for_grad_pass should be called before "
-            "this pass.");
+            "The coalesce_grad_tensor_pass should "
+            "be called before this pass.");
       }
       auto &fused_grad = result.Get<details::FusedGrads>(details::kFusedGrads);
       PADDLE_ENFORCE_NE(fused_grad.size(), 0,
@@ -115,136 +133,146 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
       // Sort the parameters and auxiliary variables according
       // to parameters' name to make variables' name correspond correctly.
-      SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops);
+      SortParametersAndAuxVars(params_and_dense_grads, &aux_var_set,
+                               &opt_nodes);
+      grad_fused = true;
+    } else {
+      VLOG(10) << "The number of new gradients is " << new_grad_idx.size();
+      if (new_grad_idx.size() == 1) return;
+      // NOTE(zcd): If the gradients of backward stage and optimization stage
+      // have diff, Only take care of the the gradient of optimization stage.
+      GradientsFilter(new_grad_idx, &opt_nodes, &aux_var_set);
     }
   }

   // Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
-  // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately.
-  aux_var_names.pop_back();
-  InitFusedGradsAndAllocSpaceForGrads(
-      places, local_scopes, aux_var_set.at(kParam), aux_var_set.at(kGrad),
-      fused_vars_name.at(kGrad), &result);
-  InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names,
-                                    aux_var_set, fused_vars_name);
+  // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops
+  // separately.
+  if (!grad_fused) {
+    InitFusedGradsAndAllocSpaceForGrads(aux_var_set.at(kParam),
+                                        aux_var_set.at(kGrad),
+                                        fused_vars_name.at(kGrad), &result);
+  }
+  aux_var_names.pop_back();
+  InitFusedVarsAndAllocSpaceForVars(aux_var_names, aux_var_set,
+                                    fused_vars_name, &result);

   // Step 5: Fuse optimizer Ops and Scale Ops
-  FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result);
+  FuseOptimizerOps(aux_var_set, fused_vars_name, opt_nodes, &result);

   // Step 6: Remove optimizer Ops
-  for (auto &opt_op : opt_ops) {
+  for (auto &opt_op : opt_nodes) {
     graph->RemoveNode(opt_op);
   }
 }

+void FuseOptimizerOpPass::GradientsFilter(
+    const std::vector<size_t> &new_grad_idx, std::vector<Node *> *opt_nodes,
+    std::unordered_map<std::string, std::vector<std::string>> *aux_var_set)
+    const {
+  for (auto &aux_vars : *aux_var_set) {
+    std::vector<std::string> sorted_vars;
+    sorted_vars.reserve(aux_vars.second.size());
+    for (size_t i : new_grad_idx) {
+      sorted_vars.emplace_back(aux_vars.second.at(i));
+    }
+    std::swap(aux_vars.second, sorted_vars);
+    if (VLOG_IS_ON(6)) {
+      std::stringstream out;
+      for (auto &var_name : aux_vars.second) {
+        out << var_name << " ";
+      }
+      VLOG(6) << aux_vars.first << ": " << out.str();
+    }
+  }
+  std::vector<Node *> sorted_ops;
+  for (size_t i : new_grad_idx) {
+    sorted_ops.emplace_back(opt_nodes->at(i));
+  }
+  std::swap(*opt_nodes, sorted_ops);
+}
+
 void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
-    const std::vector<platform::Place> &places,
-    const std::vector<Scope *> &local_scopes,
     const std::vector<std::string> &params,
     const std::vector<std::string> &grads, const std::string &fused_grad_name,
     ir::Graph *result) const {
-  // Get Var Nodes
-  std::unordered_map<std::string, ir::Node *> vars;
-  for (ir::Node *node : result->Nodes()) {
-    if (node->IsVar() && node->Var()) {
-      // Note: The graph may have the same name node. For example, parameter
-      // is the input of operator and it also is the output of optimizer;
-      vars.emplace(node->Var()->Name(), node);
-    }
-  }
+  auto vars_info = GetVarInfo(*result);
   // Set Gradients as Persistable to prevent this var becoming reusable.
   for (auto &grad_var_name : grads) {
-    auto iter = vars.find(grad_var_name);
-    PADDLE_ENFORCE(iter != vars.end());
-    PADDLE_ENFORCE_NOT_NULL(iter->second->Var());
-    PADDLE_ENFORCE(
-        iter->second->Var()->GetType() == proto::VarType::LOD_TENSOR,
+    auto iter = vars_info.find(grad_var_name);
+    PADDLE_ENFORCE(iter != vars_info.end());
+    PADDLE_ENFORCE(!iter->second.empty());
+    PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var());
+    PADDLE_ENFORCE(IsLoDTensorType(iter->second.front()->Var()->GetType()),
                    "Currently the gradient type only should be LoDTensor when "
                    "fusing optimizer ops.");
-    iter->second->Var()->SetPersistable(true);
-  }
-  // Init Grads
-  for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) {
-    auto &scope = *it;
-    VLOG(6) << "Init: " << fused_grad_name;
-    PADDLE_ENFORCE(scope->FindVar(fused_grad_name) == nullptr,
-                   "%s has existed in scope.", fused_grad_name);
-    scope->Var(fused_grad_name)->GetMutable<LoDTensor>();
-    for (auto &grad_var_name : grads) {
-      auto iter = vars.find(grad_var_name);
-      PADDLE_ENFORCE(iter != vars.end());
-      PADDLE_ENFORCE_NOT_NULL(iter->second->Var());
-      scope->Var(grad_var_name)->GetMutable<LoDTensor>();
+    for (auto var : iter->second) {
+      var->Var()->SetPersistable(true);
     }
   }
   // Define Ops
-  ProgramDesc program_desc;
+  result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
+  ProgramDesc &program_desc =
+      result->Get<details::ProgramDescs>(details::kProgramDescs).back();
   auto *global_block = program_desc.MutableBlock(0);
   AppendAllocContinuousSpace(params, grads, fused_grad_name, global_block,
                              false, false);
-  // Run Ops
-  RunInitOps(places, local_scopes, *global_block);
 }

+std::unordered_map<std::string, std::vector<Node *>>
+FuseOptimizerOpPass::GetVarInfo(const Graph &result) const {
+  std::unordered_map<std::string, std::vector<Node *>> vars;
+  for (Node *node : result.Nodes()) {
+    if (node->IsVar() && node->Var()) {
+      // Note: The graph may have the same name node. For example, parameter
+      // is the input of operator and it also is the output of optimizer;
+      vars[node->Var()->Name()].emplace_back(node);
+    }
+  }
+  return vars;
+}
+
+bool FuseOptimizerOpPass::IsLoDTensorType(
+    const proto::VarType::Type &type) const {
+  // Current only support LOD_TENSOR.
+  return type == proto::VarType::LOD_TENSOR;
+}
+
+proto::VarType::Type FuseOptimizerOpPass::GetTypeOfVar(
+    const std::unordered_map<std::string, std::vector<Node *>> &var_nodes,
+    const std::string &name) const {
+  auto grad_iter = var_nodes.find(name);
+  PADDLE_ENFORCE(grad_iter != var_nodes.end());
+  PADDLE_ENFORCE(grad_iter->second.size() > 0);
+  PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var());
+  return grad_iter->second.front()->Var()->GetType();
+}
+
 void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
-    const std::vector<platform::Place> &places,
-    const std::vector<Scope *> &local_scopes,
     const std::vector<std::string> &aux_var_names,
     const std::unordered_map<std::string, std::vector<std::string>>
         &aux_var_set,
-    const std::unordered_map<std::string, std::string> &fused_vars_name)
-    const {
-  // Init Vars
-  for (auto &var_name : aux_var_names) {
-    auto &fused_var_name = fused_vars_name.at(var_name);
-    InitVars(local_scopes, fused_var_name);
-  }
+    const std::unordered_map<std::string, std::string> &fused_vars_name,
+    ir::Graph *result) const {
   // Define Ops
-  ProgramDesc program_desc;
+  result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
+  ProgramDesc &program_desc =
+      result->Get<details::ProgramDescs>(details::kProgramDescs).back();
   auto *global_block = program_desc.MutableBlock(0);
   for (auto &var_name : aux_var_names) {
     AppendAllocContinuousSpace(
         aux_var_set.at(var_name), aux_var_set.at(var_name),
         fused_vars_name.at(var_name), global_block, true);
   }
-  // Run Ops
-  RunInitOps(places, local_scopes, *global_block);
-}
-
-void FuseOptimizerOpPass::RunInitOps(
-    const std::vector<platform::Place> &places,
-    const std::vector<Scope *> &local_scopes,
-    const BlockDesc &global_block) const {
-  for (size_t i = 0; i < local_scopes.size(); ++i) {
-    for (auto &op_desc : global_block.AllOps()) {
-      auto op = OpRegistry::CreateOp(*op_desc);
-      op->Run(*local_scopes[i], places[i]);
-    }
-  }
-}
-
-void FuseOptimizerOpPass::InitVars(const std::vector<Scope *> &local_scopes,
-                                   const std::string &fused_var_name) const {
-  // Alloc parameters and auxiliary vars in the respective scope.
-  size_t idx = local_scopes.size();
-  for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend();
-       ++iter, --idx) {
-    auto &scope = *iter;
-    VLOG(6) << "Init: " << fused_var_name;
-    PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
-                   "%s has exist in scope[%d]", fused_var_name, idx);
-    scope->Var(fused_var_name)->GetMutable<LoDTensor>();
-  }
 }

 void FuseOptimizerOpPass::SortParametersAndAuxVars(
     const std::vector<std::pair<std::string, std::string>> &params_grads,
     std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set,
     std::vector<ir::Node *> *ops) const {
-  PADDLE_ENFORCE_NE(aux_vars_set->count("Param"), static_cast<size_t>(0));
-  auto &param_vec = aux_vars_set->at("Param");
+  PADDLE_ENFORCE_NE(aux_vars_set->count(kParam), static_cast<size_t>(0));
+  auto &param_vec = aux_vars_set->at(kParam);

   std::vector<size_t> param_sort_idx;
   param_sort_idx.reserve(param_vec.size());
@@ -264,11 +292,13 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars(
     }
     std::swap(aux_vars.second, sorted_vars);

-    std::stringstream out;
-    for (auto &var_name : aux_vars.second) {
-      out << var_name << " ";
+    if (VLOG_IS_ON(6)) {
+      std::stringstream out;
+      for (auto &var_name : aux_vars.second) {
+        out << var_name << " ";
+      }
+      VLOG(6) << aux_vars.first << ": " << out.str();
     }
-    VLOG(6) << aux_vars.first << ": " << out.str();
   }

   std::vector<ir::Node *> sorted_ops;
@@ -280,21 +310,19 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars(
 }

 void FuseOptimizerOpPass::GetSpecifiedOpsAndVars(
-    const std::string &op_type, const std::vector<std::string> &aux_vars_name,
-    ir::Node *node, std::vector<ir::Node *> *ops,
+    const std::vector<std::string> &aux_vars_name,
+    const std::vector<ir::Node *> &opt_nodes,
     std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
     const {
-  if (node->Op()->Type() != op_type) return;
-
-  std::stringstream out;
-  for (auto &var_n : aux_vars_name) {
-    auto arg_names = node->Op()->Input(var_n);
-    PADDLE_ENFORCE_EQ(arg_names.size(), static_cast<size_t>(1));
-    (*aux_args_name)[var_n].emplace_back(arg_names[0]);
-    out << var_n << ", " << arg_names[0] << "; ";
+  for (auto &node : opt_nodes) {
+    std::stringstream out;
+    for (auto &var_n : aux_vars_name) {
+      auto arg_names = node->Op()->Input(var_n);
+      PADDLE_ENFORCE_EQ(arg_names.size(), static_cast<size_t>(1));
+      (*aux_args_name)[var_n].emplace_back(arg_names[0]);
+      out << var_n << ", " << arg_names[0] << "; ";
+    }
+    VLOG(7) << out.str();
   }
-  VLOG(7) << out.str();
-  ops->emplace_back(node);
 }

 void FuseOptimizerOpPass::AppendAllocContinuousSpace(
@@ -302,7 +330,7 @@ void FuseOptimizerOpPass::AppendAllocContinuousSpace(
     const std::vector<std::string> &out_args, const std::string &fused_out_arg,
     BlockDesc *global_block, bool copy_data, bool check_name) const {
   auto op_desc = global_block->AppendOp();
-  op_desc->SetType("alloc_continuous_space");
+  op_desc->SetType("coalesce_tensor");
   op_desc->SetInput("Input", in_args);
   op_desc->SetOutput("Output", out_args);
   op_desc->SetOutput("FusedOutput", {fused_out_arg});
@@ -311,10 +339,10 @@ void FuseOptimizerOpPass::AppendAllocContinuousSpace(
 }

 void FuseOptimizerOpPass::InserInputAndOutputForOptOps(
-    const std::vector<ir::Node *> &opt_ops, ir::Node *opt_node) const {
+    const std::vector<ir::Node *> &opt_nodes, ir::Node *opt_node) const {
   std::unordered_set<ir::Node *> inputs;
   std::unordered_set<ir::Node *> outputs;
-  for (auto opt_op : opt_ops) {
+  for (auto opt_op : opt_nodes) {
     // set inputs
     inputs.insert(opt_op->inputs.begin(), opt_op->inputs.end());
     for (auto &input : opt_op->inputs) {
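The core behavioral change in this pass is that optimizer ops are now partitioned by gradient type: only ops whose gradient is a dense LoDTensor are collected for fusion, while sparse gradients (SelectedRows, produced e.g. by sparse embedding lookups) are left alone. A minimal sketch of that filtering idea, with simplified stand-in types (OptOp and VarType are illustrative, not Paddle's ir::Node and proto::VarType):

#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical stand-ins for graph nodes and variable types.
enum class VarType { kLoDTensor, kSelectedRows };

struct OptOp {
  std::string grad_name;
};

// Keep only optimizer ops whose gradient is dense (LoDTensor); sparse
// gradients are skipped so the remaining ops can share one fused buffer.
std::vector<OptOp> FilterDenseGradOps(
    const std::vector<OptOp> &ops,
    const std::unordered_map<std::string, VarType> &var_types) {
  std::vector<OptOp> dense_ops;
  for (const auto &op : ops) {
    auto it = var_types.find(op.grad_name);
    if (it != var_types.end() && it->second == VarType::kLoDTensor) {
      dense_ops.push_back(op);
    }
  }
  return dense_ops;
}

int main() {
  std::unordered_map<std::string, VarType> types = {
      {"fc_w@GRAD", VarType::kLoDTensor},
      {"emb_w@GRAD", VarType::kSelectedRows}};  // sparse, not fused
  auto dense = FilterDenseGradOps({{"fc_w@GRAD"}, {"emb_w@GRAD"}}, types);
  return dense.size() == 1 ? 0 : 1;
}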
paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h
@@ -55,8 +55,8 @@ class FuseOptimizerOpPass : public ir::Pass {
       const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const = 0;

   void GetSpecifiedOpsAndVars(
-      const std::string &op_type, const std::vector<std::string> &aux_vars_name,
-      ir::Node *node, std::vector<ir::Node *> *ops,
+      const std::vector<std::string> &aux_vars_name,
+      const std::vector<ir::Node *> &opt_nodes,
       std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
       const;
@@ -67,27 +67,30 @@ class FuseOptimizerOpPass : public ir::Pass {
                                   bool check_name = true) const;

   void InitFusedGradsAndAllocSpaceForGrads(
-      const std::vector<platform::Place> &places,
-      const std::vector<Scope *> &local_scopes,
       const std::vector<std::string> &params,
       const std::vector<std::string> &grads, const std::string &fused_grad_name,
       ir::Graph *result) const;

   void InitFusedVarsAndAllocSpaceForVars(
-      const std::vector<platform::Place> &places,
-      const std::vector<Scope *> &local_scopes,
       const std::vector<std::string> &aux_var_names,
       const std::unordered_map<std::string, std::vector<std::string>>
           &aux_var_set,
-      const std::unordered_map<std::string, std::string> &fused_vars_name)
-      const;
+      const std::unordered_map<std::string, std::string> &fused_vars_name,
+      ir::Graph *result) const;

-  void RunInitOps(const std::vector<platform::Place> &places,
-                  const std::vector<Scope *> &local_scopes,
-                  const BlockDesc &global_block) const;
+  std::unordered_map<std::string, std::vector<Node *>> GetVarInfo(
+      const Graph &result) const;

-  void InitVars(const std::vector<Scope *> &local_scopes,
-                const std::string &fused_var_name) const;
+  proto::VarType::Type GetTypeOfVar(
+      const std::unordered_map<std::string, std::vector<Node *>> &var_nodes,
+      const std::string &name) const;
+
+  void GradientsFilter(
+      const std::vector<size_t> &new_grad_idx, std::vector<Node *> *opt_nodes,
+      std::unordered_map<std::string, std::vector<std::string>> *aux_var_set)
+      const;
+
+  bool IsLoDTensorType(const proto::VarType::Type &type) const;
 };

 }  // namespace ir
paddle/fluid/framework/ir/graph_helper.cc
@@ -108,8 +108,6 @@ bool VarDescIsConsistency(const Graph &graph) {
       var_name2node_set;
   for (ir::Node *node : graph.Nodes()) {
     if (node->IsVar() && node->Var()) {
-      // Note: The graph may have the same name node. For example, parameter
-      // is the input of operator and it also is the output of optimizer;
       var_name2node_set[node->Var()->Name()].emplace(node);
     }
   }
paddle/fluid/framework/ir/graph_helper.h
@@ -38,6 +38,10 @@ struct NodeComp {
 bool HasCircle(const Graph &graph);

+// Check if the var desc of node is consistency.
+// The graph may have the same name node, for example, parameter
+// is the input of operator and it also is the output of optimizer.
+// For the persistable variable, the var_desc of the nodes with
+// the same node name should be equal.
 bool VarDescIsConsistency(const Graph &graph);

 // Find All Circles for debugging,
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
@@ -38,7 +38,7 @@ class FuseAllReduceOpPass : public ir::Pass {
 #endif
     auto &params_grads =
-        result.Get<details::ParamsAndGrads>(details::kParamsAndGrads);
+        result.Get<details::ParamsAndGrads>(details::kParamsAndDenseGrads);
     size_t num_of_all_reduce = params_grads.size();
     std::unordered_set<std::string> grads;
     grads.reserve(num_of_all_reduce);
@@ -60,8 +60,8 @@ class FuseAllReduceOpPass : public ir::Pass {
                       "it is not supported currently.");
     VLOG(10) << "Insert fused_all_reduce";

     auto &group_params_grads = graph->Get<details::GroupParamsAndGrads>(
-        details::kGroupParamsAndGrads);
+        details::kGroupParamsAndDenseGrads);

     for (auto &group_p_g : group_params_grads) {
       size_t group_size = group_p_g.size();
paddle/fluid/framework/ir/node.h
@@ -49,7 +49,7 @@ class Node {
  public:
   virtual ~Node() {
     if (!wrapper_.empty()) {
-      VLOG(4) << "ir::Node deleting a wrapper node " << Name();
+      VLOG(10) << "ir::Node deleting a wrapper node " << Name();
       wrapper_deleter_();
     }
   }
paddle/fluid/framework/ir/pass.cc
@@ -33,16 +33,12 @@ Graph* Pass::Apply(Graph* graph) const {
     PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.",
                    attr);
   }
-  auto* native_graph = graph;
   ApplyImpl(graph);
   // TODO(panyx0718): Add more verifications.
   PADDLE_ENFORCE(!HasCircle(*graph),
                  "Illegal Pass. Generated graph shouldn't has cycle.");
   PADDLE_ENFORCE(VarDescIsConsistency(*graph),
                  "The VarDescs of persistable variable are not consistency.");
-  PADDLE_ENFORCE(graph == native_graph,
-                 "Pass::Apply() cannot delete the passed graph and shouldn't "
-                 "return a new graph.(For the need of pybind11)");
   applied_ = true;
   return graph;
 }
paddle/fluid/operators/CMakeLists.txt
@@ -88,6 +88,7 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code
 if (WITH_GPU)
     set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
 endif()
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)

 # FIXME(typhoonzero): operator deps may not needed.
 # op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
paddle/fluid/operators/alloc_continuous_space_op.cc → paddle/fluid/operators/coalesce_tensor_op.cc
@@ -18,6 +18,7 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/device_memory_aligment.h"

 namespace paddle {
 namespace operators {
@@ -26,7 +27,7 @@ static framework::proto::VarType::Type kDefaultDtype =
     framework::proto::VarType::Type::VarType_Type_BOOL;

 template <typename DeviceContext, typename T>
-class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
+class CoalesceTensorOp : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
     auto &in_var_names = context.Inputs("Input");
@@ -86,8 +87,8 @@
         framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
                               &sub_tensor);

-        offset += Alignment(len * size_of_dtype, context.GetPlace()) /
-                  size_of_dtype;
+        offset += platform::Alignment(len * size_of_dtype,
+                                      context.GetPlace()) / size_of_dtype;
       }
     } else if (context.Attr<bool>("set_constant")) {
       math::SetConstant<DeviceContext, T> set_constant;
@@ -106,7 +107,8 @@
           ->ShareDataWith(fused_tensor->Slice(
               static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
           .Resize(dim);
-      len = Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype;
+      len = platform::Alignment(len * size_of_dtype, context.GetPlace()) /
+            size_of_dtype;
       offset += len;
       ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")"
          << " address: " << out_tensors[i]->data<void>() << ", ";
@@ -115,19 +117,6 @@
   }

  private:
-  // Note(zcd): Addresses should be aligned, otherwise, the results may have
-  // diff.
-  size_t Alignment(size_t size, const platform::Place &place) const {
-    // Allow to allocate the minimum chunk size is 4 KB.
-    size_t alignment = 1 << 12;
-    if (platform::is_gpu_place(place)) {
-      // Allow to allocate the minimum chunk size is 256 B.
-      alignment = 1 << 8;
-    }
-    size_t remaining = size % alignment;
-    return remaining == 0 ? size : size + (alignment - remaining);
-  }
-
   void GetMemSizeAndDtype(
       const std::vector<const framework::LoDTensor *> &lod_tensors,
       const std::vector<std::string> var_names, size_t *numel,
@@ -156,7 +145,8 @@
       PADDLE_ENFORCE_GT(size, 0);
       ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
          << "), ";
-      *numel += Alignment(static_cast<size_t>(size) * size_of_dtype, place) /
-                size_of_dtype;
+      *numel += platform::Alignment(
+                    static_cast<size_t>(size) * size_of_dtype, place) /
+                size_of_dtype;
     }
@@ -176,17 +166,17 @@ class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("Input",
              "(vector<LoDTensor>) The input tensors of"
-             " alloc_continuous_space operator.")
+             " coalesce_tensor operator.")
         .AsDuplicable();
     AddOutput("Output",
               "(vector<LoDTensor>) The output "
-              "tensors of alloc_continuous_space operator. And the address "
+              "tensors of coalesce_tensor operator. And the address "
               "of output tensors are continuous, they are sliced from the "
              "tensor of FusedOutput.")
        .AsDuplicable();
     AddOutput("FusedOutput",
               "(LoDTensor) The output tensor "
-              "of alloc_continuous_space operator. And the tensors of"
+              "of coalesce_tensor operator. And the tensors of"
               " Output is sliced from the tensor of FusedOutput.");
     AddAttr<bool>("copy_data", "Whether to copy the Input value to Output.")
         .SetDefault(false);
@@ -204,7 +194,7 @@
     AddComment(R"DOC(
 AllocContinuousSpace Operator.

-alloc_continuous_space is used to make the address of Output
+coalesce_tensor is used to make the address of Output
 continuous according to the Input. This Op will alloc a big tensor
 according to the tensors of Input, the dtype is the same with those input tensors,
 the size is the sum of those input tensors' numel, and the dim of the big
@@ -213,7 +203,7 @@ The tensors of Output are sliced from the tensor of FusedOutput.
 Note that, the dtype of Input should be the same, and the dim of Input
 and Output should equal.
 The tensors of Input and Output could be the same or different. And
-alloc_continuous_space allows copying the value of Input to Output, or
+coalesce_tensor allows copying the value of Input to Output, or
 setting the Output with a constant value.

 )DOC");
@@ -223,27 +213,22 @@ setting the Output with a constant value.
 }  // namespace operators
 }  // namespace paddle

-REGISTER_OPERATOR(alloc_continuous_space,
-                  paddle::operators::AllocContinuousSpaceOp,
+REGISTER_OPERATOR(coalesce_tensor, paddle::operators::AllocContinuousSpaceOp,
                   paddle::operators::AllocContinuousSpaceOpMaker);
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_CPU_KERNEL(
-    alloc_continuous_space,
-    ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext,
-                                    plat::float16>,
-    ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext, double>);
+    coalesce_tensor,
+    ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, plat::float16>,
+    ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, int>,
+    ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, float>,
+    ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, double>);

 #ifdef PADDLE_WITH_CUDA
 REGISTER_OP_CUDA_KERNEL(
-    alloc_continuous_space,
-    ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext,
-                                    plat::float16>,
-    ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext, double>);
+    coalesce_tensor,
+    ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, int>,
+    ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, float>,
+    ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, double>);
 #endif
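The kernel walks the inputs twice with the same alignment rule: once (GetMemSizeAndDtype) to size the fused buffer, once (Compute) to assign each output its slice offset. A compact sketch of that bookkeeping, in plain C++ with no Paddle types, assuming a 256-byte GPU alignment and float32 inputs:

#include <cstddef>
#include <cstdio>
#include <vector>

// Round up to the alignment, as platform::Alignment does for GPU places.
static size_t AlignUp(size_t size, size_t alignment) {
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
}

int main() {
  const size_t kAlignment = 256;  // assumed minimum GPU chunk size
  const size_t size_of_dtype = 4;  // float32
  std::vector<size_t> numels = {1000, 300, 4096};  // hypothetical tensors

  // Pass 1: total element count of the fused buffer; each input's
  // footprint is rounded up to the alignment (measured in elements).
  size_t total = 0;
  for (size_t n : numels) {
    total += AlignUp(n * size_of_dtype, kAlignment) / size_of_dtype;
  }
  std::printf("fused buffer numel: %zu\n", total);

  // Pass 2: each output tensor is a slice starting at the running offset.
  size_t offset = 0;
  for (size_t n : numels) {
    std::printf("slice [%zu, %zu)\n", offset, offset + n);
    offset += AlignUp(n * size_of_dtype, kAlignment) / size_of_dtype;
  }
  return 0;
}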
paddle/fluid/platform/CMakeLists.txt
@@ -102,17 +102,17 @@ cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_pri
 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 if (WITH_GPU)
     nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
+    nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
+    nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
 else()
     cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
+    cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
 endif()

 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
 nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
-IF(WITH_GPU)
-    nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
-ENDIF()
 nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
 if(WITH_GPU)
paddle/fluid/platform/device_memory_aligment.cc (new file, mode 100644)

/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/device_memory_aligment.h"

namespace paddle {
namespace platform {
size_t Alignment(size_t size, const platform::Place &place) {
  size_t alignment = 1024;
  if (platform::is_cpu_place(place)) {
    alignment = CpuMinChunkSize();
  } else {
#ifdef PADDLE_WITH_CUDA
    alignment = GpuMinChunkSize();
#else
    PADDLE_THROW("Fluid is not compiled with CUDA");
#endif
  }
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
}
}  // namespace platform
}  // namespace paddle
paddle/fluid/platform/device_memory_aligment.h (new file, mode 100644)

/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <stddef.h>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#endif

namespace paddle {
namespace platform {
size_t Alignment(size_t size, const platform::Place &place);
}  // namespace platform
}  // namespace paddle
paddle/fluid/pybind/pybind.cc
@@ -27,7 +27,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/garbage_collector.h"
-#include "paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.h"
+#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h"
 #include "paddle/fluid/framework/ir/pass_builder.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -38,6 +38,7 @@ class TestParallelExecutorBase(unittest.TestCase):
                                   batch_size=None,
                                   allow_op_delay=False,
                                   feed_dict=None,
+                                  get_data_from_feeder=None,
                                   seed=None,
                                   use_parallel_executor=True,
                                   use_reduce=False,
@@ -74,6 +75,10 @@ class TestParallelExecutorBase(unittest.TestCase):
             if memory_opt:
                 fluid.memory_optimize(main)

+            if get_data_from_feeder is not None:
+                assert feed_dict is None
+                feed_dict = get_data_from_feeder()
+
             place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
             exe = fluid.Executor(place)
             exe.run(startup)
@@ -81,6 +86,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             exec_strategy.allow_op_delay = allow_op_delay
             if use_fast_executor:
                 exec_strategy.use_experimental_executor = True
+
             build_strategy = fluid.BuildStrategy()
             build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
                 if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
python/paddle/fluid/tests/unittests/simple_nets.py
@@ -55,6 +55,34 @@ def fc_with_batchnorm(use_feed=None):
     return loss


+def bow_net(use_feed,
+            dict_dim,
+            is_sparse=False,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2):
+    """
+    BOW net
+    This model is from https://github.com/PaddlePaddle/models:
+    fluid/PaddleNLP/text_classification/nets.py
+    """
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    emb = fluid.layers.embedding(
+        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
+    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+    bow_tanh = fluid.layers.tanh(bow)
+    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
+    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
+    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    return avg_cost
+
+
 def init_data(batch_size=32, img_shape=[784], label_range=9):
     np.random.seed(5)
     assert isinstance(img_shape, list)
python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py → python/paddle/fluid/tests/unittests/test_coalesce_tensor_op.py
@@ -24,7 +24,7 @@ alignment = 256
 class TestAllocContinuousSpace(OpTest):
     def setUp(self):
-        self.op_type = "alloc_continuous_space"
+        self.op_type = "coalesce_tensor"
         self.dtype = np.float32
         attrs = self.init_attr()
         self.copy_data = attrs["copy_data"]
@@ -64,14 +64,13 @@ class TestAllocContinuousSpace(OpTest):
             out[0:length] = input[1].flatten()
             inputs.append(out)

-        alloc_continuous_space_var = np.concatenate([input for input in inputs])
+        coalesce_tensor_var = np.concatenate([input for input in inputs])
         if set_constant:
-            alloc_continuous_space_var = np.ones(
-                (len(alloc_continuous_space_var))) * constant
+            coalesce_tensor_var = np.ones((len(coalesce_tensor_var))) * constant
             outputs = [(out[0],
                         np.ones(out[1].shape).astype(self.dtype) * constant)
                        for out in outputs]
-        return outputs, alloc_continuous_space_var
+        return outputs, coalesce_tensor_var

     def test_check_output(self):
         if core.is_compiled_with_cuda():
python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
@@ -11,71 +11,122 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from simple_nets import simple_fc_net, fc_with_batchnorm, init_data
+from simple_nets import simple_fc_net, fc_with_batchnorm, init_data, bow_net
+from fake_reader import fake_imdb_reader
 from parallel_executor_test_base import TestParallelExecutorBase
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-import numpy as np
-import paddle.dataset.mnist as mnist
+from functools import partial
+import paddle
 import unittest
 import os


-class TestMNIST(TestParallelExecutorBase):
+class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)

-    def _init_data(self, random=True):
-        np.random.seed(5)
-        if random:
-            img = np.random.random(size=[32, 784]).astype(np.float32)
-        else:
-            img = np.ones(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
-        return img, label
-
-    def _compare_fuse_all_reduce_ops(self, model, use_cuda):
+    def compare_fuse_all_reduce_ops(self,
+                                    model,
+                                    use_cuda,
+                                    init_feed_dicta=None,
+                                    get_data_from_feeder=None,
+                                    optimizer=None,
+                                    fuse_all_optimizer_ops=False):
         if use_cuda and not core.is_compiled_with_cuda():
             return
-        img, label = init_data()
-
-        def _optimizer(learning_rate=1e-6):
-            optimizer = fluid.optimizer.SGD(
-                learning_rate=learning_rate,
-                regularization=fluid.regularizer.L2Decay(1e-6))
-            return optimizer

+        feed_dict_data = None
+        if init_feed_dicta is not None:
+            img, label = init_feed_dicta()
+            feed_dict_data = {"image": img, "label": label}

         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
             model,
-            feed_dict={"image": img, "label": label},
+            feed_dict=feed_dict_data,
+            get_data_from_feeder=get_data_from_feeder,
             use_cuda=use_cuda,
             fuse_all_reduce_ops=False,
+            fuse_all_optimizer_ops=fuse_all_optimizer_ops,
             memory_opt=False,
-            optimizer=_optimizer)
+            optimizer=optimizer)
         fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
             model,
-            feed_dict={"image": img, "label": label},
+            feed_dict=feed_dict_data,
+            get_data_from_feeder=get_data_from_feeder,
             use_cuda=use_cuda,
             fuse_all_reduce_ops=True,
+            fuse_all_optimizer_ops=fuse_all_optimizer_ops,
             memory_opt=False,
-            optimizer=_optimizer)
+            optimizer=optimizer)

         for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
             self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
         for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss):
             self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)

-    def test_simple_fc_with_fuse_op(self):
-        self._compare_fuse_all_reduce_ops(simple_fc_net, True)
-        self._compare_fuse_all_reduce_ops(simple_fc_net, False)
+    def optimizer(self, learning_rate=1e-3):
+        optimizer = fluid.optimizer.SGD(
+            learning_rate=learning_rate,
+            regularization=fluid.regularizer.L2Decay(1e-3))
+        return optimizer

-    def test_batchnorm_fc_with_fuse_op(self):
-        self._compare_fuse_all_reduce_ops(fc_with_batchnorm, True)
-        self._compare_fuse_all_reduce_ops(fc_with_batchnorm, False)
+
+class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
+    def _decorate_compare_fused_all_reduce(self, model, use_cuda):
+        self.compare_fuse_all_reduce_ops(
+            model,
+            use_cuda,
+            init_feed_dicta=init_data,
+            optimizer=self.optimizer,
+            fuse_all_optimizer_ops=True)
+
+    def test_simple_fc_with_fuse_all_reduce(self):
+        self._decorate_compare_fused_all_reduce(simple_fc_net, True)
+        self._decorate_compare_fused_all_reduce(simple_fc_net, False)
+
+    def test_batchnorm_fc_with_fuse_all_reduce(self):
+        self._decorate_compare_fused_all_reduce(fc_with_batchnorm, True)
+        self._decorate_compare_fused_all_reduce(fc_with_batchnorm, False)
+
+
+class TestFuseAllReduceOpsAndOptiOps(TestFuseAllReduceOps):
+    def _decorate_compare_fused_all_reduce(self, model, use_cuda):
+        self.compare_fuse_all_reduce_ops(
+            model,
+            use_cuda,
+            init_feed_dicta=init_data,
+            optimizer=self.optimizer,
+            fuse_all_optimizer_ops=True)
+
+
+class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        cls.word_dict_len = 5147
+        batch_size = 64
+        reader = fake_imdb_reader(cls.word_dict_len, batch_size * 100)
+        reader = paddle.batch(reader, batch_size=batch_size)()
+        cls.train_data = next(reader)
+
+    def get_data_from_feeder(self):
+        place = fluid.CPUPlace()
+        feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place)
+        return feeder.feed(self.train_data)
+
+    def _decorate_compare_fused_all_reduce(self, model, use_cuda):
+        self.compare_fuse_all_reduce_ops(
+            model,
+            use_cuda,
+            get_data_from_feeder=self.get_data_from_feeder,
+            optimizer=self.optimizer)
+
+    def test_simple_bow_net_with_fuse_all_reduce(self):
+        model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
+        self._decorate_compare_fused_all_reduce(model, True)
+        self._decorate_compare_fused_all_reduce(model, False)


 if __name__ == '__main__':
python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
@@ -11,30 +11,39 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from simple_nets import simple_fc_net, fc_with_batchnorm, init_data
+from simple_nets import simple_fc_net, fc_with_batchnorm, init_data, bow_net
+from fake_reader import fake_imdb_reader
 from parallel_executor_test_base import TestParallelExecutorBase
+from functools import partial
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import unittest
 import os


-class TestFuseAdamOps(TestParallelExecutorBase):
+class TestFuseOptimizationOps(TestParallelExecutorBase):
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)

+    def _get_feed_dict(self):
+        img, label = init_data()
+        return {"image": img, "label": label}
+
     def _compare_fused_optimizer_ops(self,
                                      model,
                                      use_cuda,
+                                     feed_dict=None,
+                                     get_data_from_feeder=None,
                                      optimizer=fluid.optimizer.Adam):
         if use_cuda and not core.is_compiled_with_cuda():
             return
-        img, label = init_data()
-        feed_dict = {"image": img, "label": label}
         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
             model,
             feed_dict=feed_dict,
+            get_data_from_feeder=get_data_from_feeder,
             use_cuda=use_cuda,
             fuse_all_optimizer_ops=False,
             memory_opt=False,  # avoid the gradient's name changed in Python side.
@@ -42,6 +51,7 @@
         fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
             model,
             feed_dict=feed_dict,
+            get_data_from_feeder=get_data_from_feeder,
             use_cuda=use_cuda,
             fuse_all_optimizer_ops=True,
             memory_opt=False,  # avoid the gradient's name changed in Python side.
@@ -52,48 +62,84 @@
         for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss):
             self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)

+    def _decorate_compare_fused_optimizer_ops(self, model, use_cuda,
+                                              optimizer):
+        self._compare_fused_optimizer_ops(
+            model,
+            use_cuda,
+            feed_dict=self._get_feed_dict(),
+            optimizer=optimizer)
+
+
+class TestFuseAdamOps(TestFuseOptimizationOps):
+    def optimizer(self, learning_rate=1e-4):
+        return fluid.optimizer.Adam(learning_rate=learning_rate)
+
     def test_simple_fc_with_fuse_op(self):
-        self._compare_fused_optimizer_ops(simple_fc_net, True)
-        self._compare_fused_optimizer_ops(simple_fc_net, False)
+        self._decorate_compare_fused_optimizer_ops(
+            simple_fc_net, True, optimizer=self.optimizer)
+        self._decorate_compare_fused_optimizer_ops(
+            simple_fc_net, False, optimizer=self.optimizer)

     def test_batchnorm_fc_with_fuse_op(self):
-        self._compare_fused_optimizer_ops(fc_with_batchnorm, True)
-        self._compare_fused_optimizer_ops(fc_with_batchnorm, False)
+        self._decorate_compare_fused_optimizer_ops(
+            fc_with_batchnorm, True, optimizer=self.optimizer)
+        self._decorate_compare_fused_optimizer_ops(
+            fc_with_batchnorm, False, optimizer=self.optimizer)


 class TestFuseSGDOps(TestFuseAdamOps):
-    def sgd_optimizer(self, learning_rate=1e-3):
+    def optimizer(self, learning_rate=1e-3):
         return fluid.optimizer.SGD(learning_rate=learning_rate)

-    def test_simple_fc_with_fuse_op(self):
-        self._compare_fused_optimizer_ops(
-            simple_fc_net, True, optimizer=self.sgd_optimizer)
-        self._compare_fused_optimizer_ops(
-            simple_fc_net, False, optimizer=self.sgd_optimizer)
-
-    def test_batchnorm_fc_with_fuse_op(self):
-        self._compare_fused_optimizer_ops(
-            fc_with_batchnorm, True, optimizer=self.sgd_optimizer)
-        self._compare_fused_optimizer_ops(
-            fc_with_batchnorm, False, optimizer=self.sgd_optimizer)


 class TestFuseMomentumOps(TestFuseAdamOps):
-    def momentum_optimizer(self, learning_rate=1e-3):
+    def optimizer(self, learning_rate=1e-3):
         return fluid.optimizer.Momentum(
             learning_rate=learning_rate, momentum=0.1)

-    def test_simple_fc_with_fuse_op(self):
-        self._compare_fused_optimizer_ops(
-            simple_fc_net, True, optimizer=self.momentum_optimizer)
-        self._compare_fused_optimizer_ops(
-            simple_fc_net, False, optimizer=self.momentum_optimizer)
-
-    def test_batchnorm_fc_with_fuse_op(self):
-        self._compare_fused_optimizer_ops(
-            fc_with_batchnorm, True, optimizer=self.momentum_optimizer)
-        self._compare_fused_optimizer_ops(
-            fc_with_batchnorm, False, optimizer=self.momentum_optimizer)
+
+class TestSpareFuseAdamOps(TestFuseOptimizationOps):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        cls.word_dict_len = 5147
+        batch_size = 64
+        reader = fake_imdb_reader(cls.word_dict_len, batch_size * 100)
+        reader = paddle.batch(reader, batch_size=batch_size)()
+        cls.train_data = next(reader)
+
+    def _get_data_from_feeder(self):
+        place = fluid.CPUPlace()
+        feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place)
+        return feeder.feed(self.train_data)
+
+    def _decorate_compare_fused_optimizer_ops(self, model, use_cuda,
+                                              optimizer):
+        self._compare_fused_optimizer_ops(
+            model,
+            use_cuda,
+            get_data_from_feeder=self._get_data_from_feeder,
+            optimizer=optimizer)
+
+    def optimizer(self, learning_rate=1e-4):
+        return fluid.optimizer.Adam(learning_rate=learning_rate)
+
+    def test_simple_bow_net_with_fuse_op(self):
+        model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
+        self._decorate_compare_fused_optimizer_ops(
+            model, True, optimizer=self.optimizer)
+        self._decorate_compare_fused_optimizer_ops(
+            model, False, optimizer=self.optimizer)
+
+
+class TestSpareFuseSGDOps(TestSpareFuseAdamOps):
+    def optimizer(self, learning_rate=1e-3):
+        return fluid.optimizer.SGD(learning_rate=learning_rate)
+
+
+class TestSpareFuseMomentumOps(TestSpareFuseAdamOps):
+    def optimizer(self, learning_rate=1e-3):
+        return fluid.optimizer.Momentum(
+            learning_rate=learning_rate, momentum=0.1)


 if __name__ == '__main__':