Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
26e32e09
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
26e32e09
编写于
1月 17, 2019
作者:
X
Xin Pan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
allow compiler to use graph
test=develop
上级
a7e7d952
变更
21
隐藏空白更改
内联
并排
Showing
21 changed files
with
460 additions
and
126 deletions
+460
-126
paddle/fluid/API.spec
paddle/fluid/API.spec
+1
-1
paddle/fluid/framework/details/build_strategy.cc
paddle/fluid/framework/details/build_strategy.cc
+10
-16
paddle/fluid/framework/details/build_strategy.h
paddle/fluid/framework/details/build_strategy.h
+1
-1
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
...uid/framework/details/fast_threaded_ssa_graph_executor.cc
+4
-5
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
...luid/framework/details/fast_threaded_ssa_graph_executor.h
+2
-2
paddle/fluid/framework/details/memory_optimize_helper_test.cc
...le/fluid/framework/details/memory_optimize_helper_test.cc
+4
-22
paddle/fluid/framework/details/memory_optimize_pass.cc
paddle/fluid/framework/details/memory_optimize_pass.cc
+1
-2
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
...le/fluid/framework/details/parallel_ssa_graph_executor.cc
+4
-5
paddle/fluid/framework/details/parallel_ssa_graph_executor.h
paddle/fluid/framework/details/parallel_ssa_graph_executor.h
+2
-2
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
...le/fluid/framework/details/threaded_ssa_graph_executor.cc
+4
-5
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+2
-2
paddle/fluid/framework/ir/graph.h
paddle/fluid/framework/ir/graph.h
+16
-0
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+124
-30
paddle/fluid/framework/parallel_executor.h
paddle/fluid/framework/parallel_executor.h
+5
-4
paddle/fluid/pybind/ir.cc
paddle/fluid/pybind/ir.cc
+2
-1
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+7
-3
python/paddle/fluid/compiler.py
python/paddle/fluid/compiler.py
+60
-23
python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py
...ddle/fluid/contrib/slim/unitest/test_quantization_pass.py
+204
-0
python/paddle/fluid/executor.py
python/paddle/fluid/executor.py
+1
-0
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+2
-1
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+4
-1
未找到文件。
paddle/fluid/API.spec
浏览文件 @
26e32e09
...
...
@@ -43,7 +43,7 @@ paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'start
paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False))
paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program
_or_graph
'], varargs=None, keywords=None, defaults=None)
paddle.fluid.CompiledProgram.with_data_parallel ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.CompiledProgram.with_inference_optimize ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None)
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
...
...
paddle/fluid/framework/details/build_strategy.cc
浏览文件 @
26e32e09
...
...
@@ -171,7 +171,8 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
}
std
::
unique_ptr
<
ir
::
Graph
>
BuildStrategy
::
Apply
(
const
ProgramDesc
&
main_program
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
ir
::
Graph
>
graph
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
string
&
loss_var_name
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
size_t
&
nranks
,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
...
...
@@ -182,7 +183,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
// Create a default one if not finalized by user.
CreatePassesFromStrategy
(
false
);
std
::
unique_ptr
<
ir
::
Graph
>
graph
(
new
ir
::
Graph
(
main_program
)
);
std
::
vector
<
OpDesc
*>
all_ops
=
graph
->
OriginProgram
().
Block
(
0
).
AllOps
(
);
for
(
std
::
shared_ptr
<
ir
::
Pass
>
&
pass
:
pass_builder_
->
AllPasses
())
{
if
(
IsMultiDevPass
(
pass
->
Type
()))
{
pass
->
Erase
(
kPlaces
);
...
...
@@ -204,37 +205,30 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
if
(
graph
->
Has
(
kAllOpDescs
))
{
graph
->
Erase
(
kAllOpDescs
);
}
const
std
::
vector
<
OpDesc
*>
*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
main_program
.
Block
(
0
).
AllOps
());
graph
->
Set
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
all_op_descs
);
// take ownership
graph
->
SetNotOwned
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
&
all_ops
);
// take ownership
pass
->
Erase
(
kAllOpDescs
);
pass
->
SetNotOwned
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
all_op_desc
s
);
pass
->
SetNotOwned
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
&
all_op
s
);
}
else
if
(
pass
->
Type
()
==
"sequential_execution_pass"
)
{
LOG
(
INFO
)
<<
"set enable_sequential_execution:"
<<
enable_sequential_execution_
;
pass
->
Erase
(
kAllOpDescs
);
pass
->
Set
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
new
std
::
vector
<
OpDesc
*>
(
main_program
.
Block
(
0
).
AllOps
()));
pass
->
SetNotOwned
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
&
all_ops
);
}
else
if
(
pass
->
Type
()
==
"all_reduce_deps_pass"
)
{
LOG
(
INFO
)
<<
"SeqOnlyAllReduceOps:"
<<
SeqOnlyAllReduceOps
(
*
this
)
<<
", num_trainers:"
<<
num_trainers_
;
pass
->
Erase
(
kAllOpDescs
);
pass
->
Set
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
new
std
::
vector
<
OpDesc
*>
(
main_program
.
Block
(
0
).
AllOps
()));
pass
->
SetNotOwned
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
&
all_ops
);
}
else
if
(
pass
->
Type
()
==
"inplace_pass"
)
{
if
(
graph
->
Has
(
kAllOpDescs
))
{
graph
->
Erase
(
kAllOpDescs
);
}
graph
->
Set
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
new
std
::
vector
<
OpDesc
*>
(
main_program
.
Block
(
0
).
AllOps
()));
graph
->
SetNotOwned
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
&
all_ops
);
}
else
if
(
pass
->
Type
()
==
"fuse_relu_depthwise_conv_pass"
)
{
if
(
!
use_cuda
)
{
LOG
(
WARNING
)
<<
"fuse_relu_depthwise_conv_pass is only supported on "
...
...
paddle/fluid/framework/details/build_strategy.h
浏览文件 @
26e32e09
...
...
@@ -114,7 +114,7 @@ struct BuildStrategy {
// Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph.
std
::
unique_ptr
<
ir
::
Graph
>
Apply
(
const
ProgramDesc
&
main_program
,
std
::
unique_ptr
<
ir
::
Graph
>
Apply
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
string
&
loss_var_name
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
...
...
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
浏览文件 @
26e32e09
...
...
@@ -24,12 +24,11 @@ namespace details {
FastThreadedSSAGraphExecutor
::
FastThreadedSSAGraphExecutor
(
const
ExecutionStrategy
&
strategy
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
)
const
std
::
vector
<
platform
::
Place
>
&
places
,
ir
::
Graph
*
graph
)
:
strategy_
(
strategy
),
local_scopes_
(
local_scopes
),
places_
(
places
),
graph_
(
std
::
move
(
graph
)
),
graph_
(
graph
),
pool_
(
strategy
.
num_threads_
),
prepare_pool_
(
1
),
// add one more thread for generate op_deps
fetch_ctxs_
(
places
)
{
...
...
@@ -110,14 +109,14 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
}
}
if
(
exception_
.
IsCaught
())
{
ClearFetchOp
(
graph_
.
get
()
,
&
fetch_ops
);
ClearFetchOp
(
graph_
,
&
fetch_ops
);
exception_
.
ReThrow
();
}
}
num_complete
+=
num_comp
;
}
// Wait FetchOps.
ClearFetchOp
(
graph_
.
get
()
,
&
fetch_ops
);
ClearFetchOp
(
graph_
,
&
fetch_ops
);
return
fetches
;
}
...
...
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
浏览文件 @
26e32e09
...
...
@@ -32,7 +32,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
FastThreadedSSAGraphExecutor
(
const
ExecutionStrategy
&
strategy
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
);
ir
::
Graph
*
graph
);
FeedFetchList
Run
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
)
override
;
const
ir
::
Graph
&
Graph
()
const
override
;
...
...
@@ -40,7 +40,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
ExecutionStrategy
strategy_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
platform
::
Place
>
places_
;
std
::
unique_ptr
<
ir
::
Graph
>
graph_
;
ir
::
Graph
*
graph_
;
std
::
unordered_map
<
OpHandleBase
*
,
int
>
op_deps_
;
std
::
vector
<
OpHandleBase
*>
bootstrap_ops_
;
...
...
paddle/fluid/framework/details/memory_optimize_helper_test.cc
浏览文件 @
26e32e09
...
...
@@ -228,9 +228,6 @@ TEST(CFGGraph, IRGraph) {
// prepare ir graph
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
ControlFlowGraph
cfg
(
graph
);
cfg
.
LiveVariableAnalysis
();
...
...
@@ -256,9 +253,6 @@ TEST(CFGGraph, IRGraph) {
TEST
(
SortOpLikeDescOrder
,
NormalTest
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
nodes
=
SortOpLikeDescOrder
(
graph
);
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
...
...
@@ -273,9 +267,6 @@ TEST(SortOpLikeDescOrder, NormalTest) {
TEST
(
SortOpLikeDescOrder
,
RemoveOpDesc
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
nodes
=
graph
.
Nodes
();
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
ir
::
Node
*
found_node
=
nullptr
;
...
...
@@ -324,8 +315,6 @@ TEST(SortOpLikeDescOrder, RemoveOpDesc) {
// 3. add some op_desc
TEST
(
SortOpLikeDescOrder
,
AddOpDesc
)
{
auto
prog
=
FillProgramDesc
();
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
ir
::
Graph
graph
(
prog
);
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
...
...
@@ -342,9 +331,7 @@ TEST(SortOpLikeDescOrder, AddOpDesc) {
// the cached desc differs from the real one
// mimic an intermediate pass modifying the ProgramDesc.
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
std
::
vector
<
OpDesc
*>
op_descs
=
graph
.
OriginProgram
().
Block
(
0
).
AllOps
();
auto
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
...
...
@@ -376,9 +363,6 @@ TEST(SortOpLikeDescOrder, AddOpDesc) {
TEST
(
SortOpLikeDescOrder
,
AddAndDeleteOpDesc
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
ir
::
Node
*
ret
=
nullptr
;
...
...
@@ -392,8 +376,9 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
return
ret
;
};
std
::
vector
<
OpDesc
*>
op_descs
=
graph
.
OriginProgram
().
Block
(
0
).
AllOps
();
// remove sum node
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
ir
::
Node
*
found_node
=
nullptr
;
auto
nodes
=
graph
.
Nodes
();
for
(
auto
node
:
nodes
)
{
...
...
@@ -454,9 +439,7 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
TEST
(
SortOpLikeDescOrder
,
AddAndReplaceOpDescInplace
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
std
::
vector
<
OpDesc
*>
op_descs
=
graph
.
OriginProgram
().
Block
(
0
).
AllOps
();
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
ir
::
Node
*
ret
=
nullptr
;
...
...
@@ -470,7 +453,6 @@ TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
return
ret
;
};
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
// add node
auto
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
...
...
paddle/fluid/framework/details/memory_optimize_pass.cc
浏览文件 @
26e32e09
...
...
@@ -336,5 +336,4 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
}
// namespace paddle
REGISTER_PASS
(
memory_optimize_pass
,
paddle
::
framework
::
details
::
MemoryOptimizePass
)
.
RequireGraphAttr
(
paddle
::
framework
::
details
::
kAllOpDescs
);
paddle
::
framework
::
details
::
MemoryOptimizePass
);
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
浏览文件 @
26e32e09
...
...
@@ -20,8 +20,7 @@ namespace framework {
namespace
details
{
std
::
vector
<
std
::
unique_ptr
<
ir
::
Graph
>>
ParallelSSAGraphExecutor
::
SeparateMultiDevicesGraph
(
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
)
{
ParallelSSAGraphExecutor
::
SeparateMultiDevicesGraph
(
ir
::
Graph
*
graph
)
{
std
::
vector
<
std
::
unique_ptr
<
ir
::
Graph
>>
graphs
;
graphs
.
reserve
(
places_
.
size
());
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
...
...
@@ -78,7 +77,7 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
ParallelSSAGraphExecutor
::
ParallelSSAGraphExecutor
(
const
ExecutionStrategy
&
strategy
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
framework
::
ProgramDesc
&
main_prog
,
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
)
const
framework
::
ProgramDesc
&
main_prog
,
ir
::
Graph
*
graph
)
:
strategy_
(
std
::
move
(
strategy
)),
local_scopes_
(
std
::
move
(
local_scopes
)),
pool_
(
places
.
size
()
>=
2
?
new
::
ThreadPool
(
places
.
size
())
:
nullptr
),
...
...
@@ -86,7 +85,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
main_prog_
(
main_prog
),
// TODO(Yancey1989): Copying graphs is not safe since it deletes the
// attrs.
graphs_
(
SeparateMultiDevicesGraph
(
std
::
move
(
graph
)
))
{
graphs_
(
SeparateMultiDevicesGraph
(
graph
))
{
PADDLE_ENFORCE_EQ
(
places_
.
size
(),
local_scopes_
.
size
());
auto
seq_allreduce_pass
=
...
...
@@ -107,7 +106,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
<<
" to run the operators of the graph on each device."
;
for
(
size_t
i
=
0
;
i
<
places
.
size
();
++
i
)
{
executors_
.
emplace_back
(
new
details
::
ThreadedSSAGraphExecutor
(
strategy_
,
local_scopes_
,
{
places_
[
i
]},
std
::
move
(
graphs_
.
at
(
i
)
)));
strategy_
,
local_scopes_
,
{
places_
[
i
]},
graphs_
.
at
(
i
).
get
(
)));
}
}
...
...
paddle/fluid/framework/details/parallel_ssa_graph_executor.h
浏览文件 @
26e32e09
...
...
@@ -32,7 +32,7 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
framework
::
ProgramDesc
&
main_prog
,
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
);
ir
::
Graph
*
graph
);
~
ParallelSSAGraphExecutor
()
final
=
default
;
const
ir
::
Graph
&
Graph
()
const
override
{
return
*
graphs_
[
0
];
}
...
...
@@ -41,7 +41,7 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
private:
std
::
vector
<
std
::
unique_ptr
<
ir
::
Graph
>>
SeparateMultiDevicesGraph
(
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
);
ir
::
Graph
*
graph
);
ExecutionStrategy
strategy_
;
std
::
vector
<
Scope
*>
local_scopes_
;
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
浏览文件 @
26e32e09
...
...
@@ -23,9 +23,8 @@ namespace framework {
namespace
details
{
ThreadedSSAGraphExecutor
::
ThreadedSSAGraphExecutor
(
const
ExecutionStrategy
&
strategy
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
)
:
graph_
(
std
::
move
(
graph
)),
const
std
::
vector
<
platform
::
Place
>
&
places
,
ir
::
Graph
*
graph
)
:
graph_
(
graph
),
pool_
(
strategy
.
num_threads_
>=
2
?
new
::
ThreadPool
(
strategy
.
num_threads_
)
:
nullptr
),
local_scopes_
(
local_scopes
),
...
...
@@ -110,7 +109,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for
(
auto
&
run_op_future
:
run_op_futures_
)
{
run_op_future
.
wait
();
}
ClearFetchOp
(
graph_
.
get
()
,
&
fetch_ops
);
ClearFetchOp
(
graph_
,
&
fetch_ops
);
exception_holder_
.
ReThrow
();
}
else
{
continue
;
...
...
@@ -135,7 +134,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
PADDLE_ENFORCE
(
ready_ops
.
empty
());
// Wait FetchOps.
ClearFetchOp
(
graph_
.
get
()
,
&
fetch_ops
);
ClearFetchOp
(
graph_
,
&
fetch_ops
);
return
fetch_data
;
}
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
浏览文件 @
26e32e09
...
...
@@ -41,7 +41,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
ThreadedSSAGraphExecutor
(
const
ExecutionStrategy
&
strategy
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
);
ir
::
Graph
*
graph
);
const
ir
::
Graph
&
Graph
()
const
override
{
return
*
graph_
;
}
// Run a SSAGraph by a thread pool
...
...
@@ -55,7 +55,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
details
::
OpHandleBase
*
op
);
private:
std
::
unique_ptr
<
ir
::
Graph
>
graph_
;
ir
::
Graph
*
graph_
;
std
::
unique_ptr
<::
ThreadPool
>
pool_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
platform
::
Place
>
places_
;
...
...
paddle/fluid/framework/ir/graph.h
浏览文件 @
26e32e09
...
...
@@ -195,6 +195,22 @@ class Graph {
return
nullptr
;
}
<<<<<<<
HEAD
=======
// Returns reference to the original program.
// WARN: After a series of passes, the current graph can be quite
// different from OriginProgram. Caller shouldn't assume much from
// the returned OriginProgram.
const
ProgramDesc
&
OriginProgram
()
const
{
return
program_
;
}
void
ResolveHazard
(
const
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
&
var_nodes
);
private:
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
InitFromProgram
(
const
ProgramDesc
&
program
);
>>>>>>>
polish
// This method takes ownership of `node`.
ir
::
Node
*
AddNode
(
ir
::
Node
*
node
)
{
PADDLE_ENFORCE
(
node_set_
.
find
(
node
)
==
node_set_
.
end
());
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
26e32e09
...
...
@@ -184,7 +184,7 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
ParallelExecutor
::
ParallelExecutor
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
unordered_set
<
std
::
string
>
&
bcast_vars
,
const
ProgramDesc
&
main_program
,
const
std
::
string
&
loss_var_name
,
const
std
::
vector
<
ir
::
Graph
*>
&
graphs
,
const
std
::
string
&
loss_var_name
,
Scope
*
scope
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
ExecutionStrategy
&
exec_strategy
,
const
BuildStrategy
&
build_strategy
)
:
member_
(
new
ParallelExecutorPrivate
(
places
))
{
...
...
@@ -216,15 +216,34 @@ ParallelExecutor::ParallelExecutor(
}
}
<<<<<<<
HEAD
std
::
unique_ptr
<
ir
::
Graph
>
temp_owned_graph
(
graph
);
// FIXME(Yancey1989): parallel graph mode gets better performance
// in GPU allreduce distributed training. Need an elegant way to
// choose the execution strategy.
build_strategy
.
enable_parallel_graph_
=
EnableParallelGraphExecution
(
main_program
,
exec_strategy
,
build_strategy
);
EnableParallelGraphExecution
(
*
temp_owned_graph
,
exec_strategy
,
build_strategy
);
if
(
build_strategy
.
enable_parallel_graph_
)
VLOG
(
0
)
<<
"The Executor would execute the graph by ParallelGraph "
"Execution which can get better performance,"
<<
"you can force it off by env FLAGS_enable_parallel_graph=0"
;
=======
// TODO(panyx0718): Update pass interface so we don't need this here.
std
::
vector
<
std
::
unique_ptr
<
ir
::
Graph
>>
temp_owned_graphs
;
for
(
ir
::
Graph
*
g
:
graphs
)
{
temp_owned_graphs
.
emplace_back
(
g
);
}
<<<<<<<
HEAD
>>>>>>>
fix
parallel
graph
mode
program
=======
bool
parallel_graphs
=
(
temp_owned_graphs
.
size
()
>
1
);
if
(
parallel_graphs
)
{
PADDLE_ENFORCE_EQ
(
temp_owned_graphs
.
size
(),
places
.
size
());
}
VLOG
(
1
)
<<
"Enable ParallelGraph Execution: "
<<
parallel_graphs
;
>>>>>>>
polish
if
(
member_
->
use_cuda_
)
{
// Bcast Parameters to all GPUs
...
...
@@ -236,7 +255,7 @@ ParallelExecutor::ParallelExecutor(
if
(
nccl_id_var
!=
nullptr
)
{
nccl_id
=
nccl_id_var
->
GetMutable
<
ncclUniqueId
>
();
}
if
(
build_strategy
.
enable_parallel_graph_
&&
member_
->
nranks_
>
1UL
)
{
if
(
parallel_graphs
&&
member_
->
nranks_
>
1UL
)
{
if
(
nccl_id
==
nullptr
)
{
local_nccl_id_
.
reset
(
new
ncclUniqueId
());
platform
::
dynload
::
ncclGetUniqueId
(
local_nccl_id_
.
get
());
...
...
@@ -258,44 +277,101 @@ ParallelExecutor::ParallelExecutor(
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
<<<<<<<
HEAD
std
::
unique_ptr
<
ir
::
Graph
>
graph
;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
graph
=
build_strategy
.
Apply
(
main_program
,
member_
->
places_
,
loss_var_name
,
temp_owned_graph
=
build_strategy
.
Apply
(
std
::
move
(
temp_owned_graph
),
member_
->
places_
,
loss_var_name
,
member_
->
local_scopes_
,
member_
->
nranks_
,
member_
->
use_cuda_
,
member_
->
nccl_ctxs_
.
get
());
#else
graph
=
build_strategy
.
Apply
(
main_program
,
member_
->
places_
,
loss_var_name
,
temp_owned_graph
=
build_strategy
.
Apply
(
std
::
move
(
temp_owned_graph
)
,
member_
->
places_
,
loss_var_name
,
member_
->
local_scopes_
,
member_
->
nranks_
,
member_
->
use_cuda_
);
=======
std
::
vector
<
ir
::
Graph
*>
compiled_graphs
;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if
(
parallel_graphs
)
{
for
(
size_t
i
=
0
;
i
<
member_
->
places_
.
size
();
++
i
)
{
auto
temp_owned_graph
=
build_strategy
.
Apply
(
std
::
move
(
temp_owned_graphs
[
i
]),
{
member_
->
places_
[
i
]},
loss_var_name
,
{
member_
->
local_scopes_
[
i
]},
member_
->
nranks_
,
member_
->
use_cuda_
,
member_
->
nccl_ctxs_
.
get
());
compiled_graphs
.
push_back
(
temp_owned_graph
.
release
());
}
}
else
{
auto
temp_owned_graph
=
build_strategy
.
Apply
(
std
::
move
(
temp_owned_graphs
[
0
]),
member_
->
places_
,
loss_var_name
,
member_
->
local_scopes_
,
member_
->
nranks_
,
member_
->
use_cuda_
,
member_
->
nccl_ctxs_
.
get
());
compiled_graphs
.
push_back
(
temp_owned_graph
.
release
());
}
#else
auto
temp_owned_graph
=
build_strategy
.
Apply
(
std
::
move
(
temp_owned_graphs
[
0
]),
member_
->
places_
,
loss_var_name
,
member_
->
local_scopes_
,
member_
->
nranks_
,
member_
->
use_cuda_
);
compiled_graphs
.
push_back
(
temp_owned_graph
.
release
());
>>>>>>>
fix
parallel
graph
mode
program
#endif
auto
max_memory_size
=
GetEagerDeletionThreshold
();
VLOG
(
10
)
<<
"Eager Deletion Threshold "
<<
static_cast
<
float
>
(
max_memory_size
)
/
(
1
<<
30
);
if
(
max_memory_size
>=
0
)
{
<<<<<<<
HEAD
graph
=
member_
->
PrepareGCAndRefCnts
(
std
::
move
(
graph
),
static_cast
<
size_t
>
(
max_memory_size
));
static_cast
<
size_t
>
(
max_memory_size
)).
release
();
=======
for
(
size_t
i
=
0
;
i
<
graphs
.
size
();
++
i
)
{
compiled_graphs
[
i
]
=
member_
->
PrepareGCAndRefCnts
(
std
::
unique_ptr
<
ir
::
Graph
>
(
compiled_graphs
[
i
]),
static_cast
<
size_t
>
(
max_memory_size
))
.
release
();
}
>>>>>>>
fix
parallel
graph
mode
program
}
// Step 3. Create vars in each scope. Passes may also create new vars.
// skip control vars and empty vars
std
::
vector
<
details
::
VariableInfo
>
var_infos
;
<<<<<<<
HEAD
for
(
auto
&
node
:
graph
->
Nodes
())
{
if
(
node
->
IsVar
()
&&
!
node
->
IsCtrlVar
()
&&
node
->
Var
())
{
var_infos
.
emplace_back
();
var_infos
.
back
().
name_
=
node
->
Var
()
->
Name
();
var_infos
.
back
().
type_
=
node
->
Var
()
->
GetType
();
var_infos
.
back
().
persistable_
=
node
->
Var
()
->
Persistable
();
=======
for
(
auto
&
graph
:
compiled_graphs
)
{
for
(
auto
&
node
:
graph
->
Nodes
())
{
if
(
node
->
IsVar
()
&&
!
node
->
IsCtrlVar
()
&&
node
->
Var
())
{
var_infos
.
emplace_back
();
var_infos
.
back
().
name_
=
node
->
Var
()
->
Name
();
var_infos
.
back
().
type_
=
node
->
Var
()
->
GetType
();
var_infos
.
back
().
persistable_
=
node
->
Var
()
->
Persistable
();
}
>>>>>>>
fix
parallel
graph
mode
program
}
}
// If the loss_var_name is given, the number of graphs should be exactly one.
if
(
loss_var_name
.
size
())
{
<<<<<<<
HEAD
size_t
graph_num
=
ir
::
GraphNum
(
*
graph
);
=======
size_t
graph_num
=
ir
::
GraphNum
(
*
compiled_graphs
[
0
]);
>>>>>>>
fix
parallel
graph
mode
program
if
(
graph_num
>
1
)
{
LOG
(
WARNING
)
<<
"The number of graph should be only one, "
"but the current graph has "
<<<<<<<
HEAD
<<
ir
::
GraphNum
(
*
graph
)
=======
<<
ir
::
GraphNum
(
*
compiled_graphs
[
0
])
>>>>>>>
fix
parallel
graph
mode
program
<<
" sub_graphs. If you want to see the nodes of the "
"sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
"to specify the output dir. NOTES: if you not do training, "
...
...
@@ -303,26 +379,42 @@ ParallelExecutor::ParallelExecutor(
}
}
<<<<<<<
HEAD
if
(
build_strategy
.
enable_parallel_graph_
)
{
#ifdef PADDLE_WITH_CUDA
// TODO(Yancey1989): Remove passing in the main_program when
// allreduce_seq_pass doesn't need it as the attr.
=======
if
(
parallel_graphs
)
{
>>>>>>>
polish
member_
->
executor_
.
reset
(
new
details
::
ParallelSSAGraphExecutor
(
<<<<<<<
HEAD
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
main_program
,
std
::
move
(
graph
)
));
graph
));
#else
PADDLE_THROW
(
"Paddle should be compiled with CUDA for ParallelGraph Execution."
);
#endif
}
else
{
if
(
exec_strategy
.
type_
==
ExecutionStrategy
::
kDefault
)
{
member_
->
executor_
.
reset
(
new
details
::
ThreadedSSAGraphExecutor
(
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
graph
));
}
else
{
member_
->
executor_
.
reset
(
new
details
::
FastThreadedSSAGraphExecutor
(
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
graph
));
=======
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
compiled_graphs
));
}
else
{
if
(
exec_strategy
.
type_
==
ExecutionStrategy
::
kDefault
)
{
member_
->
executor_
.
reset
(
new
details
::
ThreadedSSAGraphExecutor
(
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
std
::
move
(
graph
)
));
compiled_graphs
[
0
]
));
}
else
{
member_
->
executor_
.
reset
(
new
details
::
FastThreadedSSAGraphExecutor
(
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
std
::
move
(
graph
)));
compiled_graphs
[
0
]));
>>>>>>>
fix
parallel
graph
mode
program
}
}
...
...
@@ -452,24 +544,33 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
}
}
bool
ParallelExecutor
::
EnableParallelGraphExecution
(
const
ProgramDesc
&
main_program
,
const
ExecutionStrategy
&
exec_strategy
,
const
BuildStrategy
&
build_strategy
)
const
{
ParallelExecutor
::~
ParallelExecutor
()
{
for
(
auto
&
p
:
member_
->
places_
)
{
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
)
->
Wait
();
}
delete
member_
;
}
bool
EnableParallelGraphExecution
(
const
ir
::
Graph
&
graph
,
const
ExecutionStrategy
&
exec_strategy
,
const
BuildStrategy
&
build_strategy
)
{
if
(
!
FLAGS_enable_parallel_graph
)
return
false
;
bool
enable_parallel_graph
=
true
;
// TODO(Yancey1989): support sparse update in ParallelGraph mode.
for
(
auto
&
var_desc
:
main_program
.
Block
(
0
).
AllVars
())
{
if
(
var_desc
->
GetType
()
==
proto
::
VarType
::
SELECTED_ROWS
)
{
enable_parallel_graph
=
false
;
}
}
// TODO(Yancey1989): support pserver mode
for
(
auto
&
op_desc
:
main_program
.
Block
(
0
).
AllOps
())
{
if
(
op_desc
->
Type
()
==
"send"
||
op_desc
->
Type
()
==
"recv"
)
{
enable_parallel_graph
=
false
;
break
;
for
(
ir
::
Node
*
node
:
graph
.
Nodes
())
{
if
(
node
->
IsVar
()
&&
node
->
Var
())
{
// TODO(Yancey1989): support sparse update in ParallelGraph mode.
if
(
node
->
Var
()
->
GetType
()
==
proto
::
VarType
::
SELECTED_ROWS
)
{
enable_parallel_graph
=
false
;
break
;
}
}
else
if
(
node
->
IsOp
()
&&
node
->
Op
())
{
// TODO(Yancey1989): support pserver mode
if
(
node
->
Op
()
->
Type
()
==
"send"
||
node
->
Op
()
->
Type
()
==
"recv"
)
{
enable_parallel_graph
=
false
;
break
;
}
}
}
...
...
@@ -481,13 +582,6 @@ bool ParallelExecutor::EnableParallelGraphExecution(
return
enable_parallel_graph
;
}
ParallelExecutor
::~
ParallelExecutor
()
{
for
(
auto
&
p
:
member_
->
places_
)
{
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
)
->
Wait
();
}
delete
member_
;
}
}
// namespace framework
}
// namespace paddle
...
...
paddle/fluid/framework/parallel_executor.h
浏览文件 @
26e32e09
...
...
@@ -46,7 +46,7 @@ class ParallelExecutor {
public:
explicit
ParallelExecutor
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
unordered_set
<
std
::
string
>
&
bcast_vars
,
const
ProgramDesc
&
main_program
,
const
std
::
vector
<
ir
::
Graph
*>
&
graphs
,
const
std
::
string
&
loss_var_name
,
Scope
*
scope
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
ExecutionStrategy
&
exec_strategy
,
...
...
@@ -71,9 +71,6 @@ class ParallelExecutor {
private:
void
BCastParamsToDevices
(
const
std
::
unordered_set
<
std
::
string
>
&
vars
)
const
;
bool
EnableParallelGraphExecution
(
const
ProgramDesc
&
main_program
,
const
ExecutionStrategy
&
exec_strategy
,
const
BuildStrategy
&
build_strategy
)
const
;
ParallelExecutorPrivate
*
member_
;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
...
...
@@ -81,5 +78,9 @@ class ParallelExecutor {
#endif
};
bool
EnableParallelGraphExecution
(
const
ir
::
Graph
&
graph
,
const
ExecutionStrategy
&
exec_strategy
,
const
BuildStrategy
&
build_strategy
);
}
// namespace framework
}
// namespace paddle
paddle/fluid/pybind/ir.cc
浏览文件 @
26e32e09
...
...
@@ -101,7 +101,8 @@ void BindGraph(py::module *m) {
[](
Graph
&
self
,
Node
&
node
)
{
return
self
.
RemoveNode
(
&
node
);
})
.
def
(
"retrieve_node"
,
&
Graph
::
RetrieveNode
,
return_value_policy
::
reference
)
.
def
(
"resolve_hazard"
,
&
Graph
::
ResolveHazard
);
.
def
(
"resolve_hazard"
,
&
Graph
::
ResolveHazard
)
.
def
(
"origin_program_desc"
,
&
Graph
::
OriginProgram
);
}
void
BindNode
(
py
::
module
*
m
)
{
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
26e32e09
...
...
@@ -976,6 +976,9 @@ All parameter, weight, gradient are variables in Paddle.
[](
ir
::
PassBuilder
&
self
,
size_t
idx
)
{
self
.
RemovePass
(
idx
);
});
// -- python binds for parallel executor.
m
.
def
(
"_enable_parallel_graph_execution"
,
framework
::
EnableParallelGraphExecution
);
py
::
class_
<
ParallelExecutor
>
pe
(
m
,
"ParallelExecutor"
);
py
::
class_
<
ExecutionStrategy
>
exec_strategy
(
pe
,
"ExecutionStrategy"
,
R"DOC(
ExecutionStrategy allows the user to more preciously control how to run
...
...
@@ -1213,9 +1216,10 @@ All parameter, weight, gradient are variables in Paddle.
cannot be updated after being finalized.)DOC"
);
pe
.
def
(
py
::
init
<
const
std
::
vector
<
platform
::
Place
>
&
,
const
std
::
unordered_set
<
std
::
string
>
&
,
const
ProgramDesc
&
,
const
std
::
string
&
,
Scope
*
,
std
::
vector
<
Scope
*>
&
,
const
ExecutionStrategy
&
,
const
BuildStrategy
&>
())
const
std
::
unordered_set
<
std
::
string
>
&
,
const
std
::
vector
<
ir
::
Graph
*>
&
,
const
std
::
string
&
,
Scope
*
,
std
::
vector
<
Scope
*>
&
,
const
ExecutionStrategy
&
,
const
BuildStrategy
&>
())
// NOTE: even if we return a vec<Scope*>* to Python using the reference policy,
// we still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope*
...
...
python/paddle/fluid/compiler.py
浏览文件 @
26e32e09
...
...
@@ -17,6 +17,7 @@ import os
import
six
import
sys
from
..
import
compat
as
cpt
from
.
import
framework
from
.
import
core
...
...
@@ -36,7 +37,7 @@ def _place_obj(place):
class
CompiledProgram
(
object
):
"""
Compiles
a Program
for execution.
Compiles
to Graph
for execution.
1. Users first create the program with layers.
2. Optionally, users use CompiledProgram to optimize the program before run.
...
...
@@ -51,7 +52,7 @@ class CompiledProgram(object):
Example:
.. code-block:: python
place = fluid.CUDAPlace(0) if use_
cuda
else fluid.CPUPlace()
place = fluid.CUDAPlace(0) if use_
gpu
else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)
compiled_prog = compiler.CompiledProgram(main).with_data_parallel(
...
...
@@ -62,11 +63,25 @@ class CompiledProgram(object):
fetch_list=[loss.name])
Args:
program: Program instance that contains the model logic.
program_or_graph (Graph|Program): If it's Program, it will be first
lowered to a graph for further optimizations. If it's a graph
(potentially optimized before), it will be directly used for
further optimizations. Note: graph is only supported when compiled
with with_data_parallel option.
"""
def
__init__
(
self
,
program
):
self
.
_program
=
program
def
__init__
(
self
,
program_or_graph
):
if
isinstance
(
program_or_graph
,
core
.
Graph
):
self
.
_graph
=
program_or_graph
self
.
_program
=
None
elif
isinstance
(
program_or_graph
,
framework
.
Program
):
self
.
_graph
=
core
.
Graph
(
program_or_graph
.
desc
)
self
.
_program
=
program_or_graph
else
:
raise
ValueError
(
"Wrong program_to_graph type: %s"
%
type
(
program_or_graph
))
self
.
_program_desc
=
self
.
_graph
.
origin_program_desc
()
self
.
_scope
=
None
self
.
_place
=
None
self
.
_executor
=
None
...
...
@@ -101,6 +116,7 @@ class CompiledProgram(object):
self
"""
assert
not
self
.
_is_data_parallel
,
"Already compiled with parallel."
assert
not
self
.
_is_inference
,
"Cannot compile both data parallel and inference"
self
.
_is_data_parallel
=
True
self
.
_build_strategy
=
build_strategy
self
.
_exec_strategy
=
exec_strategy
...
...
@@ -120,11 +136,13 @@ class CompiledProgram(object):
Returns:
self
"""
assert
not
self
.
_is_data_parallel
,
"Cannot compile both data parallel and inference."
assert
not
self
.
_is_inference
,
"Already compiled with inference"
assert
any
([
isinstance
(
config
,
InferNativeConfig
),
isinstance
(
config
,
InferAnalysisConfig
)
])
self
.
_is_data_parallel
=
False
self
.
_is_inference
=
True
self
.
_infer_config
=
config
return
self
...
...
@@ -173,37 +191,56 @@ class CompiledProgram(object):
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
self
.
_exec_strategy
.
num_threads
=
cpu_num
*
2
trainers_endpoints
=
self
.
_program
.
_trainers_endpoints
# FIXME(dzhwinter): enable_inplace should be after memory_optimize.
# If python memory optimize is turned on, turn off the inplace_pass.
if
self
.
_build_strategy
.
memory_optimize
is
None
:
self
.
_build_strategy
.
memory_optimize
=
False
if
self
.
_program
.
_is_mem_optimized
else
True
self
.
_build_strategy
.
memory_optimize
=
False
if
self
.
_program
and
self
.
_program
.
_is_mem_optimized
else
True
if
self
.
_build_strategy
.
enable_inplace
is
None
:
self
.
_build_strategy
.
enable_inplace
=
False
if
self
.
_program
.
_is_mem_optimized
else
True
self
.
_build_strategy
.
enable_inplace
=
False
if
self
.
_program
and
self
.
_program
.
_is_mem_optimized
else
True
# TODO(wuyi): trainer endpoints should be passed in through
# build_strategy, not program.xxx.
if
self
.
_program
and
self
.
_build_strategy
.
num_trainers
>
1
and
\
self
.
_program
.
_trainers_endpoints
:
tps
=
self
.
_program
.
_trainers_endpoints
if
self
.
_build_strategy
.
num_trainers
>
1
and
trainers_endpoints
:
assert
self
.
_build_strategy
.
num_trainers
==
len
(
t
rainers_endpoint
s
),
"num_trainers == len(end_points)"
self
.
_build_strategy
.
trainers_endpoints
=
t
rainers_endpoint
s
self
.
_persistable_vars
=
set
([
cpt
.
to_text
(
v
.
name
)
for
v
in
[
var
for
var
in
self
.
_program
.
list_vars
()
if
var
.
persistable
and
var
.
type
!=
core
.
VarDesc
.
VarType
.
RAW
]
])
t
p
s
),
"num_trainers == len(end_points)"
self
.
_build_strategy
.
trainers_endpoints
=
t
p
s
self
.
_persistable_vars
=
[]
for
block_id
in
range
(
self
.
_program_desc
.
num_blocks
()):
bdesc
=
self
.
_program_desc
.
block
(
block_id
)
self
.
_persistable_vars
.
extend
([
cpt
.
to_text
(
v
.
name
())
for
v
in
bdesc
.
all_vars
()
if
v
.
persistable
()
and
v
.
type
()
!=
core
.
VarDesc
.
VarType
.
RAW
])
places
=
list
(
map
(
_place_obj
,
self
.
_places
))
# FIXME(Yancey1989): parallel graph mode gets better performance
# in GPU allreduce distributed training. Need an elegant way to
# choose the execution strategy.
enable_parallel_graph
=
\
core
.
_enable_parallel_graph_execution
(
self
.
_graph
,
self
.
_exec_strategy
,
self
.
_build_strategy
)
and
\
self
.
_program
# only supported if compile program not graph.
self
.
_pe_graphs
=
[
self
.
_graph
]
if
enable_parallel_graph
:
for
_
in
range
(
len
(
places
)
-
1
):
self
.
_pe_graphs
.
append
(
core
.
Graph
(
self
.
_program_desc
))
return
core
.
ParallelExecutor
(
places
,
self
.
_persistable_vars
,
self
.
_program
.
desc
,
places
,
set
(
self
.
_persistable_vars
),
self
.
_pe_graphs
,
cpt
.
to_text
(
self
.
_loss_name
)
if
self
.
_loss_name
else
six
.
u
(
''
),
self
.
_scope
,
self
.
_local_scopes
,
self
.
_exec_strategy
,
self
.
_build_strategy
)
def
_compile_inference
(
self
):
assert
self
.
_is_data_parallel
is
False
return
core
.
create_paddle_predictor
(
self
.
_infer_config
)
def
_compile
(
self
,
scope
,
place
):
...
...
python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py
0 → 100644
浏览文件 @
26e32e09
# copyright (c) 2018 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import
unittest
import
random
import
numpy
as
np
import
paddle.fluid
as
fluid
import
six
from
paddle.fluid.framework
import
Program
from
paddle.fluid.framework
import
IrGraph
from
paddle.fluid.contrib.slim.quantization
import
QuantizationTransformPass
from
paddle.fluid
import
core
def linear_fc(num):
    """Build a small fully-connected classifier and return its mean loss.

    Declares an 'image' input (shape [1, 28, 28], float32) and a 'label'
    input (shape [1], int64), stacks `num` fc layers of width 128 with relu
    activation, adds a final fc layer of size 10, and reduces the softmax
    cross-entropy against the label with `mean`.

    Args:
        num: number of hidden fc layers to stack.

    Returns:
        The variable produced by `fluid.layers.mean` over the loss.
    """
    layers = fluid.layers

    image = layers.data(name='image', shape=[1, 28, 28], dtype='float32')
    target = layers.data(name='label', shape=[1], dtype='int64')

    # Hidden stack: each layer consumes the output of the previous one.
    features = image
    for _step in six.moves.xrange(num):
        features = layers.fc(features, size=128, act='relu')

    logits = layers.fc(input=features, size=10)
    per_sample_loss = layers.softmax_with_cross_entropy(logits, label=target)
    return layers.mean(per_sample_loss)
def residual_block(num):
    """Build a network of `num` residual-style blocks and return its mean loss.

    Every block feeds the current tensor through two conv2d + batch_norm
    branches (a 3x3 branch with bias and padding 1, and a 1x1 branch with
    padding 0, both with 16 filters and no activation) and combines them with
    `elementwise_add` using relu. A final fc layer of size 10 and softmax
    cross-entropy against 'label' produce the loss.

    Args:
        num: number of blocks to stack.

    Returns:
        The variable produced by `fluid.layers.mean` over the loss.
    """
    layers = fluid.layers

    def _conv_then_bn(x,
                      channels,
                      kernel,
                      stride,
                      padding,
                      act='relu',
                      bias_attr=False):
        # conv2d without activation, followed by batch_norm carrying `act`.
        conv_out = layers.conv2d(
            input=x,
            filter_size=kernel,
            num_filters=channels,
            stride=stride,
            padding=padding,
            act=None,
            bias_attr=bias_attr)
        return layers.batch_norm(input=conv_out, act=act)

    image = layers.data(name='image', shape=[1, 28, 28], dtype='float32')
    target = layers.data(name='label', shape=[1], dtype='int64')

    trunk = image
    for _step in six.moves.xrange(num):
        main_path = _conv_then_bn(trunk, 16, 3, 1, 1, act=None, bias_attr=True)
        shortcut = _conv_then_bn(trunk, 16, 1, 1, 0, act=None)
        trunk = layers.elementwise_add(x=main_path, y=shortcut, act='relu')

    logits = layers.fc(input=trunk, size=10)
    per_sample_loss = layers.softmax_with_cross_entropy(logits, target)
    return layers.mean(per_sample_loss)
class
TestQuantizationTransformPass
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
quantizable_op_and_inputs
=
{
'conv2d'
:
[
'Input'
,
'Filter'
],
'depthwise_conv2d'
:
[
'Input'
,
'Filter'
],
'mul'
:
[
'X'
,
'Y'
]
}
self
.
quantizable_grad_op_inputs
=
{
'conv2d_grad'
:
[
'Input'
,
'Filter'
],
'depthwise_conv2d_grad'
:
[
'Input'
,
'Filter'
],
'mul_grad'
:
[
'X'
,
'Y'
]
}
def
check_program
(
self
,
transform_pass
,
program
):
quantized_ops
=
set
()
for
block
in
program
.
blocks
:
for
op
in
block
.
ops
:
# check forward
if
op
.
type
in
self
.
quantizable_op_and_inputs
:
for
arg_name
in
op
.
input_arg_names
:
self
.
assertTrue
(
arg_name
.
endswith
(
'.quantized.dequantized'
))
quantized_ops
.
add
(
arg_name
)
for
op
in
block
.
ops
:
# check backward
if
op
.
type
in
self
.
quantizable_grad_op_inputs
:
for
pname
in
self
.
quantizable_grad_op_inputs
[
op
.
type
]:
arg_name
=
op
.
input
(
pname
)[
0
]
self
.
assertTrue
(
arg_name
.
endswith
(
'.quantized.dequantized'
))
self
.
assertTrue
(
arg_name
in
quantized_ops
)
def
linear_fc_quant
(
self
,
quant_type
):
main
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main
,
startup
):
loss
=
linear_fc
(
3
)
opt
=
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
opt
.
minimize
(
loss
)
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
graph
=
IrGraph
(
core
.
Graph
(
main
.
desc
),
for_test
=
False
)
transform_pass
=
QuantizationTransformPass
(
scope
=
fluid
.
global_scope
(),
program_exe
=
exe
,
activation_quantize_type
=
quant_type
)
transform_pass
.
apply
(
graph
)
marked_nodes
=
set
()
for
op
in
graph
.
all_ops
():
if
op
.
name
().
find
(
'quantize'
)
>
-
1
:
marked_nodes
.
add
(
op
)
graph
.
draw
(
'.'
,
'quantize_fc_'
+
quant_type
,
marked_nodes
)
program
=
graph
.
to_program
()
self
.
check_program
(
transform_pass
,
program
)
val_graph
=
IrGraph
(
core
.
Graph
(
program
.
desc
),
for_test
=
False
)
val_marked_nodes
=
set
()
for
op
in
val_graph
.
all_ops
():
if
op
.
name
().
find
(
'quantize'
)
>
-
1
:
val_marked_nodes
.
add
(
op
)
val_graph
.
draw
(
'.'
,
'val_fc_'
+
quant_type
,
val_marked_nodes
)
def
test_linear_fc_quant_abs_max
(
self
):
self
.
act_quant_op_type
=
'fake_quantize_abs_max'
self
.
linear_fc_quant
(
'abs_max'
)
def
test_linear_fc_quant_range_abs_max
(
self
):
self
.
act_quant_op_type
=
'fake_quantize_range_abs_max'
self
.
linear_fc_quant
(
'range_abs_max'
)
def
residual_block_quant
(
self
,
quant_type
):
main
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main
,
startup
):
loss
=
residual_block
(
2
)
opt
=
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
opt
.
minimize
(
loss
)
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
graph
=
IrGraph
(
core
.
Graph
(
main
.
desc
),
for_test
=
False
)
transform_pass
=
QuantizationTransformPass
(
scope
=
fluid
.
global_scope
(),
program_exe
=
exe
,
activation_quantize_type
=
quant_type
)
transform_pass
.
apply
(
graph
)
marked_nodes
=
set
()
for
op
in
graph
.
all_ops
():
if
op
.
name
().
find
(
'quantize'
)
>
-
1
:
marked_nodes
.
add
(
op
)
graph
.
draw
(
'.'
,
'quantize_residual_'
+
quant_type
,
marked_nodes
)
program
=
graph
.
to_program
()
self
.
check_program
(
transform_pass
,
program
)
val_graph
=
IrGraph
(
core
.
Graph
(
program
.
desc
),
for_test
=
False
)
val_marked_nodes
=
set
()
for
op
in
val_graph
.
all_ops
():
if
op
.
name
().
find
(
'quantize'
)
>
-
1
:
val_marked_nodes
.
add
(
op
)
val_graph
.
draw
(
'.'
,
'val_residual_'
+
quant_type
,
val_marked_nodes
)
def
test_residual_block_abs_max
(
self
):
self
.
act_quant_op_type
=
'fake_quantize_abs_max'
self
.
residual_block_quant
(
'abs_max'
)
def
test_residual_block_range_abs_max
(
self
):
self
.
act_quant_op_type
=
'fake_quantize_range_abs_max'
self
.
residual_block_quant
(
'range_abs_max'
)
def
test_execute_graph
(
self
):
main
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main
,
startup
):
loss
=
linear_fc
(
3
)
opt
=
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.0001
)
opt
.
minimize
(
loss
)
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
graph
=
IrGraph
(
core
.
Graph
(
main
.
desc
),
for_test
=
False
)
exe
.
run
(
startup
)
binary
=
fluid
.
CompiledProgram
(
graph
.
graph
).
with_data_parallel
(
loss_name
=
loss
.
name
)
for
i
in
range
(
10
):
loss_val
=
exe
.
run
(
binary
,
feed
=
{
'image'
:
np
.
ones
(
[
32
,
784
],
dtype
=
np
.
float32
),
'label'
:
np
.
ones
(
[
32
,
1
],
dtype
=
np
.
int64
)
},
fetch_list
=
[
loss
])
if
i
==
0
:
start_loss
=
np
.
sum
(
loss_val
)
elif
i
==
9
:
end_loss
=
np
.
sum
(
loss_val
)
self
.
assertLess
(
end_loss
,
start_loss
)
# Run the unittest test runner when this file is executed as a script.
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/executor.py
浏览文件 @
26e32e09
...
...
@@ -538,6 +538,7 @@ class Executor(object):
else
:
# TODO(panyx0718): Can compile program to optimize executor
# performance.
assert
program
.
_program
,
"CompiledProgram is compiled from graph, can only run with_data_parallel."
return
self
.
_run
(
program
.
_program
,
self
.
_default_executor
,
...
...
python/paddle/fluid/framework.py
浏览文件 @
26e32e09
...
...
@@ -2322,7 +2322,7 @@ class Program(object):
@
staticmethod
def
_construct_from_desc
(
desc
):
"""
Construct a program from program desc.
Construct a program from program desc.
(Experiment)
Args:
desc(core.ProgramDesc): The program desc for constructing.
...
...
@@ -2332,6 +2332,7 @@ class Program(object):
"""
p
=
Program
()
p
.
desc
=
desc
# TODO(wangzhen): Block.vars/ops are not filled, should fix it.
p
.
blocks
=
[
Block
(
p
,
i
)
for
i
in
six
.
moves
.
range
(
p
.
desc
.
num_blocks
())]
p
.
_sync_with_cpp
()
return
p
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
26e32e09
...
...
@@ -185,8 +185,11 @@ class ParallelExecutor(object):
places
=
list
(
map
(
place_obj
,
self
.
_places
))
# step7: init ParallelExecutor
# ParallelExecutor API will be deprecated, don't support parallel graph.
self
.
_graphs
=
[
core
.
Graph
(
main
.
desc
)]
self
.
executor
=
core
.
ParallelExecutor
(
places
,
persistable_vars
,
main
.
desc
,
places
,
persistable_vars
,
self
.
_graphs
,
cpt
.
to_text
(
loss_name
)
if
loss_name
else
six
.
u
(
''
),
scope
,
local_scopes
,
exec_strategy
,
build_strategy
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录