Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
885c4e57
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2299
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
885c4e57
编写于
1月 21, 2019
作者:
Y
Yan Chunwei
提交者:
GitHub
1月 21, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fea/infer memory optim2 (#14953)
上级
6597ccb0
变更
46
显示空白变更内容
内联
并排
Showing
46 changed file
with
1450 addition
and
92 deletion
+1450
-92
paddle/fluid/framework/ir/fc_fuse_pass.cc
paddle/fluid/framework/ir/fc_fuse_pass.cc
+1
-0
paddle/fluid/framework/ir/graph_helper.cc
paddle/fluid/framework/ir/graph_helper.cc
+141
-2
paddle/fluid/framework/ir/graph_helper.h
paddle/fluid/framework/ir/graph_helper.h
+17
-0
paddle/fluid/framework/ir/graph_to_program_pass.cc
paddle/fluid/framework/ir/graph_to_program_pass.cc
+24
-7
paddle/fluid/framework/ir/graph_to_program_pass.h
paddle/fluid/framework/ir/graph_to_program_pass.h
+4
-0
paddle/fluid/framework/ir/graph_viz_pass.cc
paddle/fluid/framework/ir/graph_viz_pass.cc
+1
-1
paddle/fluid/framework/ir/node.h
paddle/fluid/framework/ir/node.h
+1
-1
paddle/fluid/framework/naive_executor.cc
paddle/fluid/framework/naive_executor.cc
+5
-2
paddle/fluid/inference/analysis/CMakeLists.txt
paddle/fluid/inference/analysis/CMakeLists.txt
+1
-0
paddle/fluid/inference/analysis/analyzer.cc
paddle/fluid/inference/analysis/analyzer.cc
+10
-7
paddle/fluid/inference/analysis/analyzer.h
paddle/fluid/inference/analysis/analyzer.h
+1
-1
paddle/fluid/inference/analysis/analyzer_tester.cc
paddle/fluid/inference/analysis/analyzer_tester.cc
+4
-0
paddle/fluid/inference/analysis/argument.h
paddle/fluid/inference/analysis/argument.h
+11
-0
paddle/fluid/inference/analysis/helper.h
paddle/fluid/inference/analysis/helper.h
+7
-0
paddle/fluid/inference/analysis/ir_pass_manager.cc
paddle/fluid/inference/analysis/ir_pass_manager.cc
+1
-0
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+1
-1
paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
...e/fluid/inference/analysis/ir_passes/subgraph_detector.cc
+0
-1
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
...id/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+4
-0
paddle/fluid/inference/analysis/passes/CMakeLists.txt
paddle/fluid/inference/analysis/passes/CMakeLists.txt
+10
-3
paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
+12
-2
paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
+3
-0
paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
...uid/inference/analysis/passes/ir_graph_to_program_pass.cc
+45
-0
paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
...luid/inference/analysis/passes/ir_graph_to_program_pass.h
+3
-17
paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
...e/fluid/inference/analysis/passes/memory_optimize_pass.cc
+647
-0
paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
...le/fluid/inference/analysis/passes/memory_optimize_pass.h
+106
-0
paddle/fluid/inference/analysis/passes/passes.cc
paddle/fluid/inference/analysis/passes/passes.cc
+10
-3
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+7
-4
paddle/fluid/inference/api/analysis_config.cc
paddle/fluid/inference/api/analysis_config.cc
+93
-9
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+88
-4
paddle/fluid/inference/api/analysis_predictor.h
paddle/fluid/inference/api/analysis_predictor.h
+10
-0
paddle/fluid/inference/api/analysis_predictor_tester.cc
paddle/fluid/inference/api/analysis_predictor_tester.cc
+51
-0
paddle/fluid/inference/api/demo_ci/run.sh
paddle/fluid/inference/api/demo_ci/run.sh
+1
-0
paddle/fluid/inference/api/helper.h
paddle/fluid/inference/api/helper.h
+13
-2
paddle/fluid/inference/api/paddle_analysis_config.h
paddle/fluid/inference/api/paddle_analysis_config.h
+11
-0
paddle/fluid/inference/api/paddle_pass_builder.cc
paddle/fluid/inference/api/paddle_pass_builder.cc
+5
-0
paddle/fluid/inference/api/paddle_pass_builder.h
paddle/fluid/inference/api/paddle_pass_builder.h
+33
-13
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+2
-2
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+30
-0
paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
...nference/tests/api/analyzer_text_classification_tester.cc
+2
-0
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+5
-4
paddle/fluid/inference/tests/api/tester_helper.h
paddle/fluid/inference/tests/api/tester_helper.h
+3
-3
paddle/fluid/inference/tests/api/trt_models_tester.cc
paddle/fluid/inference/tests/api/trt_models_tester.cc
+5
-0
paddle/fluid/inference/utils/benchmark.h
paddle/fluid/inference/utils/benchmark.h
+1
-1
paddle/fluid/inference/utils/benchmark_tester.cc
paddle/fluid/inference/utils/benchmark_tester.cc
+2
-2
paddle/fluid/operators/controlflow/feed_op.cc
paddle/fluid/operators/controlflow/feed_op.cc
+1
-0
paddle/fluid/string/pretty_log.h
paddle/fluid/string/pretty_log.h
+17
-0
未找到文件。
paddle/fluid/framework/ir/fc_fuse_pass.cc
浏览文件 @
885c4e57
...
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
...
...
paddle/fluid/framework/ir/graph_helper.cc
浏览文件 @
885c4e57
...
...
@@ -18,8 +18,10 @@ limitations under the License. */
#include <fstream>
#include <iosfwd>
#include <ostream>
#include <stack>
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_traits.h"
DEFINE_string
(
print_sub_graph_dir
,
""
,
"FLAGS_print_sub_graph_dir is used "
...
...
@@ -41,7 +43,7 @@ void SortHelper(
}
}
VLOG
(
3
)
<<
"topology sort insert: "
<<
node
->
Name
()
VLOG
(
5
)
<<
"topology sort insert: "
<<
node
->
Name
()
<<
" "
<<
reinterpret_cast
<
void
*>
(
node
)
<<
" input "
<<
node
->
inputs
.
size
();
ret
->
push_back
(
node
);
}
...
...
@@ -99,12 +101,13 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
return
ret
;
}
// Build operator inlink edge table.
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
BuildOperationAdjList
(
const
Graph
&
graph
)
{
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
adj_list
;
for
(
auto
&
n
:
graph
.
Nodes
())
{
if
(
n
->
NodeType
()
!=
ir
::
Node
::
Type
::
kOperation
)
continue
;
if
(
!
n
->
IsOp
()
)
continue
;
if
(
adj_list
.
find
(
n
)
==
adj_list
.
end
())
{
adj_list
[
n
]
=
std
::
unordered_set
<
ir
::
Node
*>
();
}
...
...
@@ -121,6 +124,119 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
return
adj_list
;
}
// Build operator outlink edge table.
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
BuildOperationOutAdjList
(
const
Graph
&
graph
)
{
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
adj_list
;
for
(
auto
&
n
:
graph
.
Nodes
())
{
if
(
!
n
->
IsOp
())
continue
;
if
(
adj_list
.
find
(
n
)
==
adj_list
.
end
())
{
adj_list
[
n
]
=
std
::
unordered_set
<
ir
::
Node
*>
();
}
for
(
auto
&
var
:
n
->
outputs
)
{
for
(
auto
&
adj_n
:
var
->
outputs
)
{
PADDLE_ENFORCE
(
adj_n
->
NodeType
()
==
ir
::
Node
::
Type
::
kOperation
);
VLOG
(
40
)
<<
"adj "
<<
adj_n
->
Name
()
<<
reinterpret_cast
<
void
*>
(
adj_n
)
<<
" -> "
<<
n
->
Name
()
<<
reinterpret_cast
<
void
*>
(
n
)
<<
" via "
<<
var
->
Name
()
<<
reinterpret_cast
<
void
*>
(
var
);
adj_list
[
n
].
insert
(
adj_n
);
}
}
}
return
adj_list
;
}
std
::
vector
<
ir
::
Node
*>
OpDFSSort
(
const
Graph
&
graph
)
{
auto
edge_table
=
BuildOperationOutAdjList
(
graph
);
std
::
stack
<
Node
*>
stack
;
for
(
auto
&
ele
:
edge_table
)
{
if
(
ele
.
first
->
inputs
.
empty
())
{
// find the input ops (those without input vars)
stack
.
push
(
ele
.
first
);
}
else
{
// find the ops with only persistable vars as inputs.
bool
all_persistable
=
true
;
for
(
auto
*
input
:
ele
.
first
->
inputs
)
{
if
(
!
(
input
->
IsVar
()
&&
input
->
Var
()
&&
input
->
Var
()
->
Persistable
()))
{
all_persistable
=
false
;
}
}
if
(
all_persistable
)
{
stack
.
push
(
ele
.
first
);
}
}
}
std
::
vector
<
Node
*>
res
;
// start from the feed op and DFS
std
::
unordered_set
<
Node
*>
unique_set
;
while
(
!
stack
.
empty
())
{
// will start from the last feed by default.
auto
cur
=
stack
.
top
();
stack
.
pop
();
unique_set
.
insert
(
cur
);
res
.
push_back
(
cur
);
for
(
auto
*
op
:
edge_table
[
cur
])
{
if
(
!
unique_set
.
count
(
op
))
{
stack
.
push
(
op
);
}
}
}
return
res
;
}
std
::
vector
<
ir
::
Node
*>
TopologyDfsSortOperations
(
const
Graph
&
graph
)
{
std
::
vector
<
ir
::
Node
*>
nodes
;
std
::
unordered_map
<
Node
*
,
int
>
in_degree
;
auto
set_out_ops_ready
=
[
&
](
Node
*
var
)
{
for
(
auto
*
op
:
var
->
outputs
)
{
--
in_degree
[
op
];
}
};
// build in_degree
for
(
auto
*
node
:
graph
.
Nodes
())
{
if
(
node
->
IsOp
())
{
in_degree
[
node
]
+=
node
->
inputs
.
size
();
}
else
if
(
node
->
IsVar
()
&&
node
->
inputs
.
empty
())
{
// put all the inputs of the whole graph ready.
set_out_ops_ready
(
node
);
}
}
std
::
deque
<
Node
*>
op_queue
;
// first visit
for
(
auto
&
node
:
OpDFSSort
(
graph
))
{
if
(
node
->
IsOp
())
{
op_queue
.
push_back
(
node
);
}
}
// traverse the graph
int
num_ops
=
op_queue
.
size
();
while
(
num_ops
)
{
for
(
auto
it
=
op_queue
.
begin
();
it
!=
op_queue
.
end
();
it
++
)
{
auto
*&
cur_op
=
*
it
;
if
(
!
cur_op
||
in_degree
[
cur_op
]
>
0
)
continue
;
// visit this node
// put all the output var of this op valid.
for
(
auto
*
out_var
:
cur_op
->
outputs
)
{
if
(
!
out_var
)
continue
;
set_out_ops_ready
(
out_var
);
}
VLOG
(
8
)
<<
"visit "
<<
cur_op
->
Name
();
nodes
.
push_back
(
cur_op
);
cur_op
=
nullptr
;
num_ops
--
;
}
}
return
nodes
;
}
size_t
GraphNum
(
const
Graph
&
graph
)
{
std
::
unordered_set
<
ir
::
Node
*>
nodes
(
graph
.
Nodes
());
std
::
unordered_set
<
ir
::
Node
*>
visited_nodes
;
...
...
@@ -203,6 +319,29 @@ size_t GraphNum(const Graph &graph) {
return
graph_count
;
}
// Removes from `graph` every node that has neither inputs nor outputs.
void CleanIndividualNodes(Graph *graph) {
  // Collect the candidates first and remove them afterwards, so RemoveNode is
  // never called while graph->Nodes() is being iterated.
  std::unordered_set<Node *> isolated;
  for (auto *candidate : graph->Nodes()) {
    const bool connected =
        !candidate->inputs.empty() || !candidate->outputs.empty();
    if (connected) continue;
    isolated.insert(candidate);
  }

  for (auto *victim : isolated) {
    graph->RemoveNode(victim);
  }
}
// Dispatches to the sort implementation selected by `sort_kind`:
// SortKind::TS uses TopologySortOperations, every other value uses
// TopologyDfsSortOperations.
std::vector<Node *> TopologyVarientSort(const Graph &graph,
                                        SortKind sort_kind) {
  if (sort_kind == SortKind::TS) {
    return framework::ir::TopologySortOperations(graph);
  }
  return framework::ir::TopologyDfsSortOperations(graph);
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/graph_helper.h
浏览文件 @
885c4e57
...
...
@@ -34,6 +34,23 @@ size_t GraphNum(const Graph &graph);
// `graph` cannot contain circle.
std
::
vector
<
ir
::
Node
*>
TopologySortOperations
(
const
Graph
&
graph
);
// Topological sort, but try to DFS.
std
::
vector
<
ir
::
Node
*>
TopologyDfsSortOperations
(
const
Graph
&
graph
);
// The strategies available for ordering the operators of a graph into a
// sequence.
enum class SortKind {
  TS = 0,    // Topological sort.
  TDFS = 1,  // Topological sort that tries to follow depth-first order.
};
// Several kinds of topological sort.
std
::
vector
<
Node
*>
TopologyVarientSort
(
const
Graph
&
graph
,
SortKind
sort_kind
);
// Remove the nodes that are not connected to any other node.
void
CleanIndividualNodes
(
Graph
*
graph
);
// Build an adjacency list of operations for the `graph`.
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
BuildOperationAdjList
(
const
Graph
&
graph
);
...
...
paddle/fluid/framework/ir/graph_to_program_pass.cc
浏览文件 @
885c4e57
...
...
@@ -20,7 +20,6 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/program_desc.h"
namespace
paddle
{
...
...
@@ -29,6 +28,14 @@ namespace ir {
std
::
unique_ptr
<
Graph
>
GraphToProgramPass
::
ApplyImpl
(
std
::
unique_ptr
<
Graph
>
graph
)
const
{
// Remove the unneeded variables after memory optimization.
std
::
unordered_set
<
std
::
string
>
vars2remove
;
if
(
graph
->
Has
(
kGraphToProgramVarsToRemove
))
{
vars2remove
=
graph
->
Get
<
std
::
unordered_set
<
std
::
string
>>
(
kGraphToProgramVarsToRemove
);
VLOG
(
2
)
<<
"graph to program remove "
<<
vars2remove
.
size
()
<<
" nodes"
;
}
ProgramDesc
&
program
=
Get
<
ProgramDesc
>
(
"program"
);
std
::
unique_ptr
<
proto
::
ProgramDesc
>
program_pb
(
...
...
@@ -40,25 +47,35 @@ std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
std
::
unordered_set
<
std
::
string
>
visited_vars
;
for
(
ir
::
Node
*
n
:
graph
->
Nodes
())
{
if
(
n
->
IsVar
())
{
if
(
n
->
Var
()
&&
visited_vars
.
count
(
n
->
Var
()
->
Name
())
==
0
)
{
if
(
n
->
Var
()
&&
visited_vars
.
count
(
n
->
Var
()
->
Name
())
==
0
&&
!
vars2remove
.
count
(
n
->
Var
()
->
Name
()))
{
visited_vars
.
insert
(
n
->
Var
()
->
Name
());
block
->
add_vars
()
->
MergeFrom
(
*
n
->
Var
()
->
Proto
());
}
}
}
block
->
clear_ops
();
std
::
vector
<
ir
::
Node
*>
nodes
=
TopologySortOperations
(
*
graph
);
for
(
ir
::
Node
*
n
:
nodes
)
{
if
(
!
n
->
Op
())
{
continue
;
std
::
vector
<
ir
::
Node
*>
nodes
;
if
(
Has
(
kGraphToProgramSortKind
))
{
// Inference memory optimization relies on this branch.
int
sort_kind
=
Get
<
int
>
(
kGraphToProgramSortKind
);
nodes
=
TopologyVarientSort
(
*
graph
,
static_cast
<
framework
::
ir
::
SortKind
>
(
sort_kind
));
}
else
{
nodes
=
TopologySortOperations
(
*
graph
);
}
for
(
ir
::
Node
*
n
:
nodes
)
{
if
(
!
n
->
Op
())
continue
;
block
->
add_ops
()
->
MergeFrom
(
*
n
->
Op
()
->
Proto
());
}
program
.
CopyFrom
(
*
program_pb
);
return
graph
;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
...
...
paddle/fluid/framework/ir/graph_to_program_pass.h
浏览文件 @
885c4e57
...
...
@@ -20,6 +20,10 @@ namespace paddle {
namespace
framework
{
namespace
ir
{
const
char
kGraphToProgramVarsToRemove
[]
=
"__graph_to_program_vars_to_remove__"
;
const
char
kGraphToProgramSortKind
[]
=
"__graph_to_program_sort_kind__"
;
class
GraphToProgramPass
:
public
Pass
{
protected:
std
::
unique_ptr
<
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
Graph
>
graph
)
const
override
;
...
...
paddle/fluid/framework/ir/graph_viz_pass.cc
浏览文件 @
885c4e57
paddle/fluid/framework/ir/node.h
浏览文件 @
885c4e57
...
...
@@ -64,7 +64,7 @@ class Node {
std
::
string
Name
()
const
{
return
name_
;
}
VarDesc
*
Var
()
{
VarDesc
*
Var
()
const
{
PADDLE_ENFORCE
(
IsVar
());
return
var_desc_
.
get
();
}
...
...
paddle/fluid/framework/naive_executor.cc
浏览文件 @
885c4e57
...
...
@@ -50,8 +50,8 @@ void NaiveExecutor::Run() {
"running Paddle Inference"
;
#endif // PADDLE_ON_INFERENCE
for
(
auto
&
op
:
ops_
)
{
VLOG
(
3
)
<<
std
::
this_thread
::
get_id
()
<<
" run "
<<
op
->
Type
()
<<
" on scope "
<<
scope_
;
VLOG
(
4
)
<<
std
::
this_thread
::
get_id
()
<<
" run "
<<
op
->
DebugStringEx
(
scope_
)
<<
" on scope "
<<
scope_
;
op
->
SetIsCalledByExecutor
(
false
);
op
->
Run
(
*
scope_
,
place_
);
}
...
...
@@ -69,10 +69,12 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
anc
=
anc
->
parent
();
}
int
num_vars
=
0
;
for
(
auto
&
var
:
global_block
.
AllVars
())
{
if
(
var
->
Name
()
==
framework
::
kEmptyVarName
)
{
continue
;
}
num_vars
++
;
if
(
persistable
==
var
->
Persistable
())
{
if
(
persistable
)
{
...
...
@@ -90,6 +92,7 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
}
}
}
VLOG
(
4
)
<<
"naive executor create "
<<
num_vars
<<
" vars"
;
}
void
NaiveExecutor
::
CreateOps
(
const
ProgramDesc
&
desc
,
int
block_id
,
...
...
paddle/fluid/inference/analysis/CMakeLists.txt
浏览文件 @
885c4e57
...
...
@@ -18,6 +18,7 @@ cc_library(analysis SRCS
analyzer.cc
analysis_pass
DEPS
${
analysis_deps
}
analysis_helper
${
INFER_IR_PASSES
}
)
cc_test
(
test_dot SRCS dot_tester.cc DEPS analysis
)
...
...
paddle/fluid/inference/analysis/analyzer.cc
浏览文件 @
885c4e57
...
...
@@ -15,8 +15,8 @@
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
#include "paddle/fluid/inference/analysis/passes/passes.h"
#include "paddle/fluid/string/pretty_log.h"
namespace
paddle
{
namespace
inference
{
...
...
@@ -24,13 +24,16 @@ namespace analysis {
Analyzer
::
Analyzer
()
{}
void
Analyzer
::
Run
(
Argument
*
argument
)
{
Run
Ir
Analysis
(
argument
);
}
void
Analyzer
::
Run
(
Argument
*
argument
)
{
RunAnalysis
(
argument
);
}
void
Analyzer
::
RunIrAnalysis
(
Argument
*
argument
)
{
std
::
vector
<
std
::
string
>
passes
({
"ir_analysis_compose_pass"
});
for
(
auto
&
pass
:
passes
)
{
PassRegistry
::
Global
().
Retreive
(
pass
)
->
Run
(
argument
);
void
Analyzer
::
RunAnalysis
(
Argument
*
argument
)
{
PADDLE_ENFORCE
(
argument
->
analysis_passes_valid
(),
"analsis_passes is not valid in the argument."
);
for
(
auto
&
pass
:
argument
->
analysis_passes
())
{
string
::
PrettyLogH1
(
"--- Running analysis [%s]"
,
pass
);
auto
*
ptr
=
PassRegistry
::
Global
().
Retreive
(
pass
);
PADDLE_ENFORCE_NOT_NULL
(
ptr
,
"no analysis pass called %s"
,
pass
);
ptr
->
Run
(
argument
);
}
}
...
...
paddle/fluid/inference/analysis/analyzer.h
浏览文件 @
885c4e57
...
...
@@ -54,7 +54,7 @@ class Analyzer final {
DISABLE_COPY_AND_ASSIGN
(
Analyzer
);
protected:
void
Run
Ir
Analysis
(
Argument
*
argument
);
void
RunAnalysis
(
Argument
*
argument
);
};
}
// namespace analysis
...
...
paddle/fluid/inference/analysis/analyzer_tester.cc
浏览文件 @
885c4e57
...
...
@@ -32,6 +32,8 @@ TEST(Analyzer, analysis_without_tensorrt) {
argument
.
SetModelDir
(
FLAGS_inference_model_dir
);
argument
.
SetIrAnalysisPasses
({
"infer_clean_graph_pass"
});
argument
.
SetUseGPU
(
false
);
argument
.
SetAnalysisPasses
({
"ir_graph_build_pass"
,
"ir_analysis_pass"
,
"ir_params_sync_among_devices_pass"
});
Analyzer
analyser
;
analyser
.
Run
(
&
argument
);
...
...
@@ -44,6 +46,8 @@ TEST(Analyzer, analysis_with_tensorrt) {
argument
.
SetModelDir
(
FLAGS_inference_model_dir
);
argument
.
SetIrAnalysisPasses
({
"infer_clean_graph_pass"
});
argument
.
SetUseGPU
(
false
);
argument
.
SetAnalysisPasses
({
"ir_graph_build_pass"
,
"ir_analysis_pass"
,
"ir_params_sync_among_devices_pass"
});
Analyzer
analyser
;
analyser
.
Run
(
&
argument
);
...
...
paddle/fluid/inference/analysis/argument.h
浏览文件 @
885c4e57
...
...
@@ -110,16 +110,20 @@ struct Argument {
// The overall Scope to work on.
DECL_ARGUMENT_UNIQUE_FIELD
(
scope
,
Scope
,
framework
::
Scope
);
// The default program, loaded from disk.
DECL_ARGUMENT_UNIQUE_FIELD
(
main_program
,
MainProgram
,
framework
::
ProgramDesc
);
// The ir passes to perform in analysis phase.
DECL_ARGUMENT_FIELD
(
ir_analysis_passes
,
IrAnalysisPasses
,
std
::
vector
<
std
::
string
>
);
DECL_ARGUMENT_FIELD
(
analysis_passes
,
AnalysisPasses
,
std
::
vector
<
std
::
string
>
);
// Pass a set of op types to enable its mkldnn kernel
DECL_ARGUMENT_FIELD
(
mkldnn_enabled_op_types
,
MKLDNNEnabledOpTypes
,
std
::
unordered_set
<
std
::
string
>
);
// Passed from config.
DECL_ARGUMENT_FIELD
(
use_gpu
,
UseGPU
,
bool
);
DECL_ARGUMENT_FIELD
(
gpu_device_id
,
GPUDeviceId
,
int
);
DECL_ARGUMENT_FIELD
(
use_tensorrt
,
UseTensorRT
,
bool
);
...
...
@@ -127,6 +131,13 @@ struct Argument {
DECL_ARGUMENT_FIELD
(
tensorrt_workspace_size
,
TensorRtWorkspaceSize
,
int
);
DECL_ARGUMENT_FIELD
(
tensorrt_min_subgraph_size
,
TensorRtMinSubgraphSize
,
int
);
// Memory optimized related.
DECL_ARGUMENT_FIELD
(
enable_memory_optim
,
EnableMemoryOptim
,
bool
);
DECL_ARGUMENT_FIELD
(
memory_optim_force_update
,
MemoryOptimForceUpdate
,
bool
);
// Indicates which kind of sort algorithm is used for the operators; the memory
// optimization relies on the sort algorithm.
DECL_ARGUMENT_FIELD
(
memory_optim_sort_kind
,
MemoryOptimSortKind
,
int
);
// The program transformed by IR analysis phase.
DECL_ARGUMENT_UNIQUE_FIELD
(
ir_analyzed_program
,
IrAnalyzedProgram
,
framework
::
proto
::
ProgramDesc
);
...
...
paddle/fluid/inference/analysis/helper.h
浏览文件 @
885c4e57
...
...
@@ -28,6 +28,13 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
#ifdef _WIN32
#define GCC_ATTRIBUTE(attr__) ;
#else
#define GCC_ATTRIBUTE(attr__) __attribute__((attr__));
#endif
#define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result)
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
...
...
paddle/fluid/inference/analysis/ir_pass_manager.cc
浏览文件 @
885c4e57
...
...
@@ -83,6 +83,7 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
PADDLE_ENFORCE
(
graph
.
get
());
// Apply all the passes
for
(
const
auto
&
pass
:
passes_
)
{
if
(
pass
->
Type
()
==
"graph_viz_pass"
)
continue
;
PrettyLogEndl
(
Style
::
H2
(),
"--- Running IR pass [%s]"
,
pass
->
Type
());
graph
=
pass
->
Apply
(
std
::
move
(
graph
));
}
...
...
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
浏览文件 @
885c4e57
cc_library
(
subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc
)
if
(
TENSORRT_FOUND
)
if
(
WITH_GPU AND
TENSORRT_FOUND
)
cc_library
(
tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller
)
set
(
analysis_deps
${
analysis_deps
}
...
...
paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
浏览文件 @
885c4e57
...
...
@@ -413,7 +413,6 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() {
auto
subgraphs
=
SubgraphDetector
(
graph_
,
node_inside_subgraph_teller_
)();
for
(
auto
&
subgraph
:
subgraphs
)
{
if
(
subgraph
.
size
()
<=
(
size_t
)
min_subgraph_size_
)
continue
;
LOG
(
INFO
)
<<
"detect a subgraph size "
<<
subgraph
.
size
();
std
::
unordered_set
<
Node
*>
subgraph_uniq
(
subgraph
.
begin
(),
subgraph
.
end
());
// replace this sub-graph with the first node. Two steps: 1. Create a Block
// Node that contains this subgraph 2. Mark the nodes inside the sub-graph
...
...
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
浏览文件 @
885c4e57
...
...
@@ -21,6 +21,7 @@
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/string/pretty_log.h"
namespace
paddle
{
namespace
inference
{
...
...
@@ -77,6 +78,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
framework
::
BlockDesc
block_desc
(
nullptr
,
&
block_proto
);
block_desc
.
Proto
()
->
set_parent_idx
(
-
1
);
block_desc
.
Proto
()
->
set_idx
(
0
);
string
::
PrettyLogDetail
(
"--- detect a sub-graph with %d nodes"
,
subgraph
.
size
());
for
(
auto
*
node
:
subgraph
)
{
auto
*
op
=
block_desc
.
AppendOp
();
*
op
->
Proto
()
=
*
node
->
Op
()
->
Proto
();
...
...
paddle/fluid/inference/analysis/passes/CMakeLists.txt
浏览文件 @
885c4e57
cc_library
(
ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager
)
cc_library
(
ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager
)
cc_library
(
memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass
)
cc_library
(
ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager
)
cc_library
(
analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices
_pass
)
cc_library
(
ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program
_pass
)
set
(
analysis_deps
${
analysis_deps
}
cc_library
(
analysis_passes SRCS passes.cc DEPS
ir_graph_build_pass
ir_analysis_pass
ir_params_sync_among_devices_pass
memory_optim_pass
ir_graph_to_program_pass
)
set
(
analysis_deps
${
analysis_deps
}
analysis_passes
subgraph_detector
CACHE INTERNAL
""
)
paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
浏览文件 @
885c4e57
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
namespace
paddle
{
...
...
@@ -31,9 +32,18 @@ void IrAnalysisPass::RunImpl(Argument* argument) {
IRPassManager
the_ir_manager
(
argument
);
graph
=
the_ir_manager
.
Apply
(
std
::
move
(
graph
));
PADDLE_ENFORCE_GT
(
graph
->
Nodes
().
size
(),
0
);
argument
->
SetIrAnalyzedProgram
(
new
framework
::
proto
::
ProgramDesc
(
the_ir_manager
.
AcquireProgram
(
&
graph
,
argument
->
main_program
())));
argument
->
SetMainGraph
(
graph
.
release
());
CollectFusionStatis
(
argument
);
}
// Copies the fusion statistics attached to the main graph (attribute
// kFuseStatisAttr) into `argument`. Logs and does nothing when the graph
// carries no such attribute.
void IrAnalysisPass::CollectFusionStatis(Argument* argument) {
  const bool has_statis =
      argument->main_graph().Has(framework::ir::kFuseStatisAttr);
  if (has_statis) {
    argument->SetFusionStatis(
        argument->main_graph().Get<Argument::fusion_statis_t>(
            framework::ir::kFuseStatisAttr));
  } else {
    LOG(INFO) << "argument has no fuse statis";
  }
}
std
::
string
IrAnalysisPass
::
repr
()
const
{
return
"ir-analysis-pass"
;
}
...
...
paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
浏览文件 @
885c4e57
...
...
@@ -29,6 +29,9 @@ namespace analysis {
class
IrAnalysisPass
:
public
AnalysisPass
{
public:
void
RunImpl
(
Argument
*
argument
)
override
;
void
CollectFusionStatis
(
Argument
*
argument
);
std
::
string
repr
()
const
override
;
};
...
...
paddle/fluid/inference/analysis/passes/ir_
analysis_compose
_pass.cc
→
paddle/fluid/inference/analysis/passes/ir_
graph_to_program
_pass.cc
浏览文件 @
885c4e57
...
...
@@ -12,49 +12,32 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/string/pretty_log.h"
#include "paddle/fluid/framework/program_desc.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
void
IrAnalysisComposePass
::
RunImpl
(
Argument
*
argument
)
{
ARGUMENT_CHECK_FIELD
(
argument
,
ir_analysis_passes
);
ApplyIrPasses
(
argument
);
CollectFusionStatis
(
argument
);
}
std
::
string
IrAnalysisComposePass
::
repr
()
const
{
return
"ir-analysis-compose-pass"
;
}
void
IrGraphToProgramPass
::
RunImpl
(
Argument
*
argument
)
{
auto
pass
=
framework
::
ir
::
PassRegistry
::
Instance
().
Get
(
"graph_to_program_pass"
);
void
IrAnalysisComposePass
::
ApplyIrPasses
(
Argument
*
argument
)
{
std
::
vector
<
std
::
string
>
passes
({
"ir_graph_build_pass"
,
"ir_analysis_pass"
,
"ir_params_sync_among_devices_pass"
,
});
for
(
const
auto
&
pass
:
passes
)
{
VLOG
(
2
)
<<
"Run pass "
<<
pass
;
auto
*
the_pass
=
PassRegistry
::
Global
().
Retreive
(
pass
);
the_pass
->
Run
(
argument
);
if
(
argument
->
memory_optim_sort_kind_valid
())
{
pass
->
Set
(
framework
::
ir
::
kGraphToProgramSortKind
,
new
int
(
argument
->
memory_optim_sort_kind
()));
}
}
void
IrAnalysisComposePass
::
CollectFusionStatis
(
Argument
*
argument
)
{
if
(
!
argument
->
main_graph
().
Has
(
framework
::
ir
::
kFuseStatisAttr
))
{
LOG
(
INFO
)
<<
"argument has no fuse statis"
;
return
;
}
argument
->
SetFusionStatis
(
argument
->
main_graph
().
Get
<
Argument
::
fusion_statis_t
>
(
framework
::
ir
::
kFuseStatisAttr
));
std
::
unique_ptr
<
Graph
>
graph
(
argument
->
main_graph_ptr
());
framework
::
ProgramDesc
desc
(
argument
->
main_program
());
pass
->
SetNotOwned
(
"program"
,
&
desc
)
;
auto
thegraph
=
pass
->
Apply
(
std
::
move
(
graph
))
;
thegraph
.
release
();
// the argument still own the graph.
argument
->
SetIrAnalyzedProgram
(
new
framework
::
proto
::
ProgramDesc
(
*
desc
.
Proto
()
));
}
}
// namespace analysis
...
...
paddle/fluid/inference/analysis/passes/ir_
analysis_compose
_pass.h
→
paddle/fluid/inference/analysis/passes/ir_
graph_to_program
_pass.h
浏览文件 @
885c4e57
...
...
@@ -14,31 +14,17 @@
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/passes.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
/*
* The analysis pass to run a list of IR passes (like a function call).
* Currently, it should be the first pass of analysis phase.
*/
class
IrAnalysisComposePass
:
public
AnalysisPass
{
class
IrGraphToProgramPass
:
public
AnalysisPass
{
public:
void
RunImpl
(
Argument
*
argument
)
override
;
std
::
string
repr
()
const
override
;
void
RunImpl
(
Argument
*
argument
)
override
;
private:
void
ApplyIrPasses
(
Argument
*
argument
);
void
CollectFusionStatis
(
Argument
*
argument
);
// Assign a Scope for IR passes to modify the weights.
void
AssignScopeToModify
(
Argument
*
argument
);
std
::
string
repr
()
const
override
{
return
"ir-graph-to-param-pass"
;
}
};
}
// namespace analysis
...
...
paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
0 → 100644
浏览文件 @
885c4e57
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include <algorithm>
#include <fstream>
#include <limits>
#include <map>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/string/pretty_log.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
using
framework
::
ir
::
Graph
;
using
framework
::
ir
::
Node
;
using
framework
::
ir
::
TopologyVarientSort
;
using
space_table_t
=
MemoryOptimizePass
::
space_table_t
;
// Collect the lifecycles of the tensors.
// Traverse the graph in topological order.
// The traversal order also affect the lifecycles, so different sort_kind is
// used.
void
MemoryOptimizePass
::
CollectLifeCycle
(
std
::
unordered_map
<
std
::
string
,
lifecycle_t
>*
lifecycles
,
int
sort_kind
)
const
{
max_lifecycle_
=
0
;
for
(
auto
*
op_node
:
framework
::
ir
::
TopologyVarientSort
(
*
graph_
,
static_cast
<
framework
::
ir
::
SortKind
>
(
sort_kind
)))
{
if
(
!
op_node
->
IsOp
())
continue
;
auto
reads
=
op_node
->
inputs
;
auto
writes
=
op_node
->
outputs
;
std
::
vector
<
Node
*>
requires
(
reads
.
begin
(),
reads
.
end
());
requires
.
insert
(
requires
.
end
(),
writes
.
begin
(),
writes
.
end
());
// Disable reuse of feed variables.
if
(
op_node
->
Name
()
==
"feed"
)
{
for
(
auto
*
node
:
op_node
->
outputs
)
{
auto
var
=
node
->
Name
();
lifecycles
->
emplace
(
var
,
std
::
make_pair
(
0
,
std
::
numeric_limits
<
int
>::
max
()));
}
}
else
{
// Normal operators.
for
(
const
Node
*
node
:
requires
)
{
if
(
node
->
Var
()
->
Persistable
())
continue
;
std
::
string
var
=
node
->
Name
();
if
(
!
lifecycles
->
count
(
var
))
{
(
*
lifecycles
)[
var
]
=
std
::
make_pair
(
max_lifecycle_
,
max_lifecycle_
);
}
else
{
(
*
lifecycles
)[
var
].
second
=
std
::
max
(
max_lifecycle_
,
lifecycles
->
at
(
var
).
second
);
// max()
}
}
}
++
max_lifecycle_
;
}
}
// TODO(Superjomn) Make this a general help method.
//
// Returns the size in bytes of one element of the given data type.
// Raises through PADDLE_THROW for any data type that is not listed below.
int DataTypeToSpace(framework::proto::VarType_Type type) {
  switch (type) {
    case framework::proto::VarType_Type_BOOL:
      return sizeof(bool);
    case framework::proto::VarType_Type_INT16:
      return sizeof(int16_t);
    case framework::proto::VarType_Type_INT32:
      return sizeof(int32_t);
    case framework::proto::VarType_Type_INT64:
      return sizeof(int64_t);
    case framework::proto::VarType_Type_FP16:
      // A half precision float occupies two bytes.
      return sizeof(int16_t);
    case framework::proto::VarType_Type_FP32:
      return sizeof(float);
    case framework::proto::VarType_Type_FP64:
      return sizeof(double);
    default:
      PADDLE_THROW("Unknown data type");
  }
}
// Collect the memory size of the tensors.
//
// Every non-persistable LoDTensor variable of the graph is registered in
// `tensor_nodes`, and its size in bytes (element size times the average
// element count taken from `batch_var_ave_dim`) is stored in `space_table`.
// `batch_var_ave_dim.at()` is used, so a variable missing from that map makes
// the call throw.
void MemoryOptimizePass::CollectVarMemorySize(
    const std::unordered_map<std::string, size_t>& batch_var_ave_dim,
    std::unordered_map<std::string, Node*>* tensor_nodes,
    space_table_t* space_table) const {
  for (auto* candidate : graph_->Nodes()) {
    if (!candidate->IsVar()) continue;
    if (candidate->Var()->GetType() !=
        framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) {
      continue;
    }
    // Parameters will not be reused.
    if (candidate->Var()->Persistable()) continue;

    const auto& var_name = candidate->Name();
    (*tensor_nodes)[var_name] = candidate;
    (*space_table)[var_name] =
        DataTypeToSpace(candidate->Var()->GetDataType()) *
        batch_var_ave_dim.at(var_name);
  }
}
// Find a suitable free tensor to reuse: the one whose memory size is the
// closest to the memory demand, no matter whether it is larger or smaller.
//
// Args:
//   @tensor: the tensor that requires memory.
//   @space_required: the memory demand of `tensor`.
//   @tensor_nodes: the tensor nodes in the ir::Graph (not read here).
//   @free_existing_tensors: the allocated tensors that are free.
//   @space_table: the memory space of tensors.
//   @var_clusters: the clusters of variables; only a candidate from the
//     cluster that contains `tensor` is considered.
//   @tensor2use: output, the tensor chosen for reuse.
//
// Returns:
//   true if found some existing tensor to reuse.
//   false if no suitable tensor to reuse, one need to allocate a new tensor
//   for this requirement.
bool FindSuitableTensorToReuse(
    const std::string& tensor, int space_required,
    const std::unordered_map<std::string, Node*>& tensor_nodes,
    std::unordered_set<std::string>* free_existing_tensors,
    const space_table_t& space_table,
    const std::vector<std::unordered_set<std::string>>& var_clusters,
    std::string* tensor2use) __SHOULD_USE_RESULT__;

bool FindSuitableTensorToReuse(
    const std::string& tensor, int space_required,
    const std::unordered_map<std::string, Node*>& tensor_nodes,
    std::unordered_set<std::string>* free_existing_tensors,
    const space_table_t& space_table,
    const std::vector<std::unordered_set<std::string>>& var_clusters,
    std::string* tensor2use) {
  VLOG(5) << "Split Tensors to " << var_clusters.size() << " clusters";

  // find the cluster this var belongs to.
  const std::unordered_set<std::string>* cluster = nullptr;
  for (const auto& c : var_clusters) {
    if (c.count(tensor)) {
      cluster = &c;
      break;
    }
  }
  PADDLE_ENFORCE_NOT_NULL(cluster,
                          "something wrong in memory optimization, the "
                          "variable %s not in the clusters.",
                          tensor);

  // Same conversion the former `space - space_required` applied implicitly.
  const size_t required = static_cast<size_t>(space_required);

  const std::string* best_candidate = nullptr;
  size_t best_diff = 0;
  for (const auto& candidate : *free_existing_tensors) {
    // This is not a temporary tensor.
    const auto space_it = space_table.find(candidate);
    if (space_it == space_table.end()) continue;
    // Not in the same cluster.
    if (!cluster->count(candidate)) continue;

    // Absolute difference of two unsigned sizes. Subtracting the larger value
    // from the smaller one would wrap around to a huge number and make every
    // smaller candidate look like the worst possible fit.
    const size_t space = space_it->second;
    const size_t diff = space > required ? space - required : required - space;
    if (best_candidate == nullptr || diff < best_diff) {
      best_candidate = &candidate;
      best_diff = diff;
    }
  }

  if (best_candidate == nullptr) return false;
  *tensor2use = *best_candidate;
  return true;
}
// Allocate new tensor instead of reusing the existing one.
//
// `name` is put into the free set, its entry in `space_table` (which must
// already exist) is raised to at least `space_required`, and `reuse_table`
// maps the tensor to itself. `tensor_nodes` is not read.
void AllocateNewTensor(
    const std::string& name, size_t space_required,
    const std::unordered_map<std::string, Node*>& tensor_nodes,
    std::unordered_set<std::string>* free_existing_tensors,
    space_table_t* space_table,
    std::unordered_map<std::string, std::string>* reuse_table) {
  // The newly born tensor is free to be used.
  free_existing_tensors->insert(name);

  // Register the space it has; the recorded size never shrinks.
  PADDLE_ENFORCE(space_table->count(name));
  size_t& reserved = space_table->at(name);
  if (reserved < space_required) {
    reserved = space_required;
  }

  // The allocated new tensor use the memory of itself.
  (*reuse_table)[name] = name;
}
// Free a tensor and make it reusable.
//
// Args:
//   @tensor: the tensor to free.
//   @free_existing_tensors: the free and allocated tensors.
//   @reuse_table: a map from a fake tensor to the existing allocated tensor.
//
// The names "feed" and "fetch" are ignored. A tensor that has no entry in
// `reuse_table` was never given memory by the plan, so there is nothing to
// release and the call is a no-op instead of throwing std::out_of_range.
void FreeATensor(const std::string& tensor,
                 std::unordered_set<std::string>* free_existing_tensors,
                 std::unordered_map<std::string, std::string>* reuse_table) {
  if (tensor == "feed" || tensor == "fetch") return;

  const auto planned = reuse_table->find(tensor);
  if (planned == reuse_table->end()) return;

  // `planned->second` is the really allocated tensor behind `tensor`.
  free_existing_tensors->insert(planned->second);
}
// Reuse a free existing tensor.
//
// `tensor2reuse` must currently be in `free_existing_tensors`; it is taken
// out of the free set and recorded in `reuse_table` as the memory behind
// `tensor`.
void ReuseATensor(const std::string& tensor, const std::string& tensor2reuse,
                  size_t memory_size,
                  std::unordered_set<std::string>* free_existing_tensors,
                  std::unordered_map<std::string, std::string>* reuse_table,
                  space_table_t* reused_space_table) {
  const auto free_slot = free_existing_tensors->find(tensor2reuse);
  PADDLE_ENFORCE(free_slot != free_existing_tensors->end());
  free_existing_tensors->erase(free_slot);

  (*reuse_table)[tensor] = tensor2reuse;

  // Update the memory size of a reused tensor, the memory will grow if the
  // required memory is larger.
  size_t& reserved = reused_space_table->at(tensor2reuse);
  if (reserved < memory_size) {
    reserved = memory_size;
  }
}
// Calculate the memory usage of a reuse plan.
//
// A tensor that maps to itself in `reuse_table` counts as allocated, any
// other tensor counts as saved; both sums use the sizes in `space_table`.
// `var_batch_ave_size` is not read.
//
// Args:
//   @allocated: output, total bytes of the tensors that own their memory.
//   @saved: output, total bytes of the tensors that reuse another tensor.
void EvaluateMemoryUsage(
    const std::unordered_map<std::string, std::string>& reuse_table,
    const space_table_t& space_table,
    const std::unordered_map<std::string, size_t>& var_batch_ave_size,
    size_t* allocated, size_t* saved) {
  *allocated = 0;
  *saved = 0;

  // Iterate by reference: copying each entry would duplicate two strings.
  for (const auto& elem : reuse_table) {
    if (elem.first == elem.second) {
      *allocated += space_table.at(elem.first);
      VLOG(4) << elem.first << " <-> " << elem.second << " "
              << space_table.at(elem.first) << " "
              << space_table.at(elem.second);
    } else {
      *saved += space_table.at(elem.first);
      VLOG(4) << "reuse " << elem.first << " -> " << elem.second;
    }
  }
  VLOG(4) << "allocated " << *allocated;
  VLOG(4) << "saved " << *saved;
}
// Return saved ratio.
void
MemoryOptimizePass
::
MakeReusePlan
(
const
std
::
vector
<
std
::
unordered_set
<
std
::
string
>>&
var_clusters
,
const
std
::
unordered_map
<
std
::
string
,
size_t
>&
var_batch_ave_size
,
const
space_table_t
&
space_table
,
std
::
unordered_map
<
std
::
string
,
std
::
string
>*
reuse_table
,
int
sort_kind
,
MemoryAllocation
*
memory_allocation
)
const
{
// Clear the existing plan.
reuse_table
->
clear
();
// The `space_table` stores the real memory size for each tensor.
// The `reused_space_table` stores the maximum memory size required by a
// tensor during the memory reusing, the small tensor might be reused by a
// larger tensor, and the memory size of the small one will grow.
auto
reused_space_table
=
space_table
;
std
::
unordered_map
<
std
::
string
,
lifecycle_t
>
life_cycles
;
std
::
unordered_map
<
std
::
string
,
Node
*>
tensor_nodes
;
// The allocated tensors whose memory can be reused, they will live across the
// program execution.
std
::
unordered_set
<
std
::
string
>
existing_tensors
;
// The existing tensor that has been allocated, and is also free to reuse.
std
::
unordered_set
<
std
::
string
>
free_existing_tensors
;
CollectLifeCycle
(
&
life_cycles
,
sort_kind
);
for
(
int
age
=
0
;
age
<
max_lifecycle_
;
++
age
)
{
std
::
unordered_set
<
std
::
string
>
born_tensors
;
std
::
unordered_set
<
std
::
string
>
dead_tensors
;
// Gather the dead and born tensors.
for
(
auto
elem_it
=
life_cycles
.
begin
();
elem_it
!=
life_cycles
.
end
();
elem_it
++
)
{
if
(
elem_it
->
second
.
first
==
-
1
)
{
continue
;
}
const
auto
&
tensor
=
elem_it
->
first
;
const
auto
&
lifecycle
=
elem_it
->
second
;
VLOG
(
4
)
<<
"process "
<<
tensor
<<
" reuse "
<<
lifecycle
.
first
<<
"->"
<<
lifecycle
.
second
;
// Collect newly born tensors.
if
(
lifecycle
.
first
==
age
)
{
born_tensors
.
insert
(
tensor
);
}
// Collect dead tensors whose memory can be reused.
else
if
(
lifecycle
.
second
<
age
)
{
// NOLINT
dead_tensors
.
insert
(
tensor
);
// remove to avoid duplicate process.
elem_it
->
second
.
first
=
-
1
;
// avoid duplicate search
}
}
// Reuse the dead tensors for born tensors
for
(
const
auto
&
tensor
:
born_tensors
)
{
// Skip the feed and fetch tensor for that they share data with others.
std
::
string
tensor2reuse
;
if
(
!
space_table
.
count
(
tensor
))
continue
;
size_t
space_required
=
space_table
.
at
(
tensor
);
if
(
FindSuitableTensorToReuse
(
tensor
,
space_required
,
tensor_nodes
,
&
free_existing_tensors
,
reused_space_table
,
var_clusters
,
&
tensor2reuse
))
{
if
(
tensor
!=
tensor2reuse
)
{
VLOG
(
4
)
<<
tensor
<<
" -> "
<<
tensor2reuse
;
}
ReuseATensor
(
tensor
,
tensor2reuse
,
space_required
,
&
free_existing_tensors
,
reuse_table
,
&
reused_space_table
);
}
else
{
VLOG
(
4
)
<<
"allocate "
<<
tensor
;
AllocateNewTensor
(
tensor
,
space_required
,
tensor_nodes
,
&
free_existing_tensors
,
&
reused_space_table
,
reuse_table
);
ReuseATensor
(
tensor
,
tensor
,
space_required
,
&
free_existing_tensors
,
reuse_table
,
&
reused_space_table
);
}
}
for
(
const
auto
&
tensor
:
dead_tensors
)
{
// free its memory.
FreeATensor
(
tensor
,
&
free_existing_tensors
,
reuse_table
);
}
}
EvaluateMemoryUsage
(
*
reuse_table
,
reused_space_table
,
var_batch_ave_size
,
&
(
memory_allocation
->
allocated
),
&
(
memory_allocation
->
saved
));
memory_allocation
->
sort_kind
=
sort_kind
;
}
// Fill `var_node_table` with a name -> node entry for every variable node of
// `graph`. When several variable nodes share a name, the one visited last
// wins.
void BuildVarNodeTable(Graph* graph,
                       std::unordered_map<std::string, Node*>* var_node_table) {
  for (auto* candidate : graph->Nodes()) {
    if (!candidate->IsVar()) continue;
    (*var_node_table)[candidate->Name()] = candidate;
  }
}
// NOTE The optimized opdesc doesn't match ir::Graph.
void
UpdateOpDescsByReuse
(
Graph
*
graph
,
const
std
::
unordered_map
<
std
::
string
,
std
::
string
>&
reuse_table
,
int
sort_kind
)
{
// TODO(Superjomn) change here to be compatible with the runtime order.
for
(
auto
*
node
:
TopologyVarientSort
(
*
graph
,
static_cast
<
framework
::
ir
::
SortKind
>
(
sort_kind
)))
{
if
(
node
->
IsOp
())
{
// Replace the original inputs/outputs with the reused tensors.
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
std
::
string
>>
in_args
,
out_args
;
for
(
auto
argument
:
node
->
Op
()
->
Inputs
())
{
for
(
const
auto
&
x
:
argument
.
second
)
{
auto
name
=
x
;
if
(
reuse_table
.
count
(
x
)
&&
reuse_table
.
at
(
x
)
!=
x
)
{
name
=
reuse_table
.
at
(
x
);
}
in_args
[
argument
.
first
].
push_back
(
name
);
VLOG
(
4
)
<<
node
->
Name
()
<<
" input "
<<
x
<<
" -> "
<<
name
;
}
}
for
(
auto
argument
:
node
->
Op
()
->
Outputs
())
{
for
(
const
auto
&
x
:
argument
.
second
)
{
auto
name
=
x
;
if
(
reuse_table
.
count
(
x
)
&&
reuse_table
.
at
(
x
)
!=
x
)
{
name
=
reuse_table
.
at
(
x
);
}
out_args
[
argument
.
first
].
push_back
(
name
);
VLOG
(
4
)
<<
node
->
Name
()
<<
" output "
<<
x
<<
" -> "
<<
name
;
}
}
// Update arguments.
for
(
auto
&
arg
:
in_args
)
{
node
->
Op
()
->
SetInput
(
arg
.
first
,
arg
.
second
);
}
for
(
auto
&
arg
:
out_args
)
{
node
->
Op
()
->
SetOutput
(
arg
.
first
,
arg
.
second
);
}
node
->
Op
()
->
Flush
();
}
}
}
// Apply a reuse plan to the graph: the operator descriptions are rewritten
// through UpdateOpDescsByReuse, and every variable that reuses another tensor
// is added to `vars2remove`.
void MemoryOptimizePass::PerformReusePlan(
    const std::unordered_map<std::string, std::string>& reuse_table,
    int sort_kind, std::unordered_set<std::string>* vars2remove) const {
  // NOTE(review): this table is filled but not read again in this function.
  std::unordered_map<std::string, Node*> var_node_table;
  BuildVarNodeTable(graph_, &var_node_table);

  UpdateOpDescsByReuse(graph_, reuse_table, sort_kind);

  for (const auto& mapping : reuse_table) {
    const bool reuses_other_tensor = mapping.first != mapping.second;
    if (reuses_other_tensor) {
      vars2remove->insert(mapping.first);
    }
  }
  VLOG(2) << "to remove vars " << vars2remove->size();
}
// Split `line` into the fields separated by `delim`.
//
// Empty fields between two delimiters are kept. An empty remainder after the
// last delimiter (and therefore an empty `line`) produces no field.
std::vector<std::string> split(const std::string& line, char delim) {
  std::vector<std::string> fields;
  std::string::size_type field_begin = 0;
  while (true) {
    const std::string::size_type delim_pos = line.find(delim, field_begin);
    if (delim_pos == std::string::npos) break;
    fields.emplace_back(line.substr(field_begin, delim_pos - field_begin));
    field_begin = delim_pos + 1;
  }
  // The text after the last delimiter only counts when it is not empty.
  if (field_begin < line.size()) {
    fields.emplace_back(line.substr(field_begin));
  }
  return fields;
}
// Deserialize the batch var shapes from the cache file.
//
// Every line of the file describes one batch and has the form
//   name:dim,dim,...;name:dim,dim,...;
// The function raises through PADDLE_ENFORCE when the file cannot be opened
// or when a record does not consist of exactly a name and a shape.
//
// Returns one map (variable name -> shape) per line of the file.
std::vector<std::map<std::string, std::vector<int>>> DeseralizeBatchVarShapes(
    const std::string& path) {
  std::ifstream file(path);
  PADDLE_ENFORCE(file.is_open(), "failed to open %s to read cache", path);

  std::string line;
  std::vector<std::map<std::string, std::vector<int>>> batch_shapes;

  while (std::getline(file, line)) {
    std::map<std::string, std::vector<int>> batch;
    for (const auto& var_info : split(line, ';')) {
      auto fields = split(var_info, ':');
      PADDLE_ENFORCE_EQ(fields.size(), 2UL);

      std::vector<int> shape;
      for (const auto& v : split(fields[1], ',')) {
        shape.push_back(std::stoi(v));
      }
      // `fields` and `shape` are not needed afterwards, so hand their
      // contents over instead of copying them.
      batch[std::move(fields[0])] = std::move(shape);
    }
    batch_shapes.push_back(std::move(batch));
  }
  return batch_shapes;
}
// Calculate the average dim of each tensor from the batch shape cache.
//
// For every variable the element counts (product of the dims) of all its
// recorded shapes are summed up and divided by the total number of batches.
//
// Returns a map from variable name to that average element count; it is
// empty when `batches` is empty.
std::unordered_map<std::string, size_t> GetBatchAverageSize(
    const std::vector<std::map<std::string, std::vector<int>>>& batches) {
  std::unordered_map<std::string, size_t> var2size;

  for (const auto& batch : batches) {
    for (const auto& item : batch) {
      // Multiply in 64 bits: the product of two int dims easily exceeds the
      // range of int for large tensors.
      int64_t num_elements = 1;
      for (const int dim : item.second) {
        num_elements *= dim;
      }
      var2size[item.first] += static_cast<size_t>(num_elements);
    }
  }

  // The average size of the batches for each variable. `var2size` is empty
  // when there is no batch, so no division by zero can happen.
  const size_t num_batch = batches.size();
  for (auto& item : var2size) {
    item.second /= num_batch;
  }

  return var2size;
}
// Analyze the batch shapes loaded from the cache file.
// By splitting the variables into different clusters according to their batch
// sizes, we can pre-schedule the changes of different LoDTensors when input
// sequences of different lengths are entered.
// This should work fine for the models operating on sentences.
//
// Two variables end up in the same cluster when the sequence of their first
// dims over all batches is the same.
std::vector<std::unordered_set<std::string>> AnalysisBatchShapesByBatchSize(
    const std::vector<std::map<std::string, std::vector<int>>>& batches) {
  // Collect the batch size of each shape into one signature string per
  // variable, e.g. "4,16,7,". The separator keeps sequences such as (1, 23)
  // and (12, 3) apart.
  std::unordered_map<std::string, std::string> var_batchsize_signatures;
  for (const auto& batch : batches) {
    for (const auto& ele : batch) {
      // A shape without any dim has no first dim to read; 0 is recorded so
      // that the variable still lands in a cluster.
      const int batch_size = ele.second.empty() ? 0 : ele.second.front();
      // TODO(Superjomn) might consume large memory here, use combine hash.
      auto& signature = var_batchsize_signatures[ele.first];
      signature += std::to_string(batch_size);
      signature += ',';
    }
  }

  // Split to sets by batch size sequences. The signature itself is the key,
  // so two different sequences can never be merged by a hash collision.
  std::unordered_map<std::string, std::unordered_set<std::string>> shape_sets;
  for (const auto& ele : var_batchsize_signatures) {
    shape_sets[ele.second].insert(ele.first);
  }

  std::vector<std::unordered_set<std::string>> res;
  res.reserve(shape_sets.size());
  for (auto& ele : shape_sets) {
    res.emplace_back(std::move(ele.second));
  }

  VLOG(3) << "Cluster by batch_size and get " << res.size() << " clusters";
  return res;
}
// Split the tensors into clusters by their memory size.
// This should work fine for the overall models.
//
// Args:
//   @space_table: the memory size of each tensor.
//   @batches: not read here.
//   @interval: the width of one size range; must be greater than zero.
//
// Returns the non-empty clusters in ascending order of their size range;
// cluster k holds the tensors with k * interval <= size < (k + 1) * interval.
std::vector<std::unordered_set<std::string>> AnalysisBatchShapesBySimilarSize(
    const space_table_t& space_table,
    const std::vector<std::map<std::string, std::vector<int>>>& batches,
    int interval = 200000) {
  PADDLE_ENFORCE_GT(interval, 0);

  // Put every tensor directly into the bucket of its size range. The ordered
  // map yields the buckets by ascending range, and only ranges that contain
  // a tensor exist, so one pass over the table is enough.
  const size_t bucket_width = static_cast<size_t>(interval);
  std::map<size_t, std::unordered_set<std::string>> buckets;
  size_t max_size = 0;
  for (const auto& item : space_table) {
    max_size = std::max(item.second, max_size);
    buckets[item.second / bucket_width].insert(item.first);
  }
  VLOG(4) << "tensor max size " << max_size;

  std::vector<std::unordered_set<std::string>> res;
  res.reserve(buckets.size());
  for (auto& bucket : buckets) {
    res.push_back(std::move(bucket.second));
  }

  VLOG(3) << "Cluster by interval and get " << res.size() << " cluster";
  return res;
}
// The human readable name of this pass.
std::string MemoryOptimizePass::repr() const {
  return std::string("memory optimize pass");
}
// Run the memory optimization on the main graph of `argument`.
//
// Nothing happens when memory optimization is disabled, when a force update
// of the cache is requested, or when the memory cache file does not exist.
// Otherwise eight reuse plans are tried (two traversal orders, each with the
// clustering by batch size and the clustering by size with 1KB, 1MB and
// unlimited intervals). The plan with the highest saving ratio is rebuilt and
// applied to the graph.
void MemoryOptimizePass::RunImpl(Argument* argument) {
  // When force update, should not optimize memory.
  if (!argument->enable_memory_optim() ||
      argument->memory_optim_force_update()) {
    return;
  }
  graph_ = argument->main_graph_ptr();

  auto path = GetMemoryCachePath(
      argument->model_dir_valid() ? argument->model_dir() : "",
      argument->model_program_path_valid() ? argument->model_program_path()
                                           : "");
  VLOG(3) << "Load memory cache from " << path;
  if (!inference::IsFileExists(path)) return;

  VLOG(4) << "Performing memory optimize";
  auto batches = DeseralizeBatchVarShapes(path);
  auto var_batch_ave_size = GetBatchAverageSize(batches);

  std::unordered_map<std::string, Node*> tensor_nodes;
  space_table_t space_table;
  CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table);

  std::unordered_map<std::string, std::string> reuse_table;
  double max_saving_ratio = 0.;

  // Make a plan for the given clusters; every call overwrites `reuse_table`.
  auto plan_with =
      [&](const std::vector<std::unordered_set<std::string>>& clusters,
          int sort_kind) -> MemoryAllocation {
    MemoryAllocation allocation;
    MakeReusePlan(clusters, var_batch_ave_size, space_table, &reuse_table,
                  sort_kind, &allocation);
    return allocation;
  };

  // interval 1kb, interval 1MB, no intervals.
  const int size_intervals[] = {1024, 1024 * 1024,
                                std::numeric_limits<int>::max()};

  // The strategies only run when they are invoked below; the order matters
  // because the first strategy that reaches the best ratio is kept.
  std::vector<std::function<MemoryAllocation()>> strategies;
  for (int sort_kind = 0; sort_kind < 2; sort_kind++) {
    strategies.emplace_back([&, sort_kind]() -> MemoryAllocation {
      return plan_with(AnalysisBatchShapesByBatchSize(batches), sort_kind);
    });
    for (const int interval : size_intervals) {
      strategies.emplace_back([&, sort_kind, interval]() -> MemoryAllocation {
        return plan_with(
            AnalysisBatchShapesBySimilarSize(space_table, batches, interval),
            sort_kind);
      });
    }
  }

  // Try all strategies to get the best result.
  std::function<MemoryAllocation()>* best_strategy{nullptr};
  for (auto& strategy : strategies) {
    auto allocation = strategy();
    const float saving_ratio = allocation.GetSavingRatio();
    string::PrettyLogDetail("--- get strategy saving %f memory for workspace",
                            saving_ratio);
    if (saving_ratio > max_saving_ratio) {
      max_saving_ratio = saving_ratio;
      best_strategy = &strategy;
    }
  }
  if (!best_strategy) {
    LOG(ERROR)
        << "This model makes poor memory optimize, skip memory optimize";
    return;
  }

  // Run the winner once more so that `reuse_table` holds its plan again.
  auto memory_allocation = (*best_strategy)();

  string::PrettyLogH2(
      "--- Saved %.2f%s memory for workspace(temporary variables)",
      memory_allocation.GetSavingRatio() * 100, "%");
  string::PrettyLogDetail("--- Allocated %d MB",
                          memory_allocation.allocated / 1024. / 1024.);
  string::PrettyLogDetail("--- Saved %d MB",
                          memory_allocation.saved / 1024. / 1024.);

  argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove,
                             new std::unordered_set<std::string>);
  auto& vars2remove =
      argument->main_graph().Get<std::unordered_set<std::string>>(
          framework::ir::kGraphToProgramVarsToRemove);

  PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove);
  argument->SetMemoryOptimSortKind(memory_allocation.sort_kind);
}
// Get the memory saving ratio of temporary variables:
// saved / (allocated + saved).
// Returns 0 when nothing was allocated or saved, instead of the NaN a
// division of zero by zero would produce.
float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const {
  const double total =
      static_cast<double>(allocated) + static_cast<double>(saved);
  if (total == 0.) return 0.f;
  return static_cast<double>(saved) / total;
}
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
0 → 100644
浏览文件 @
885c4e57
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
/*
 * Memory optimization pass for inference with pre-analysis of memory usage
 * without GC.
 * Different from training, the inference memory reuse strategies don't
 * include GC because its overhead is too high when the batch size equals one.
 *
 * The inference memory reuse tries to pre-determine the tensor reusing strategy
 * without runtime overhead.
 *
 * To improve the strategy's performance, a warm-up run is introduced:
 * - Before officially deploying the inference program, one should warm it up
 *   and generate some runtime cache,
 * - Run the inference program with several batches of data; it will persist
 *   some runtime information about the memory of tensors to disk, we call
 *   this information the memory reusing cache,
 * - With the memory reusing cache, the user can deploy the inference program
 *   to a service; before running the model, the inference program will load
 *   the memory cache, analyze it, generate the best memory reusing strategy,
 *   and adjust the execution of the network.
 *
 * With the warm-up and memory reusing cache design, the memory reusing
 * algorithm can analyze the real memory consumption of the tensors, even with
 * the flexible LoDTensor and special shape changing operators such as
 * sequence-pooling.
 */
class MemoryOptimizePass : public AnalysisPass {
 public:
  // Maps a tensor name to its memory size.
  using space_table_t = std::unordered_map<std::string, size_t>;
  // The (first step, last step) at which a tensor is used.
  using lifecycle_t = std::pair<int, int>;

  // The outcome of one reuse plan. The members are zero-initialized so that a
  // default constructed object never carries indeterminate values.
  struct MemoryAllocation {
    size_t allocated{0};  // allocated memory in byte.
    size_t saved{0};      // saved memory in byte.
    int sort_kind{0};     // the kind of the corresponding sorting algorithm.

    // Get the memory saving ratio of temporary variables.
    float GetSavingRatio() const;
  };

  virtual ~MemoryOptimizePass() = default;

 protected:
  void RunImpl(Argument *argument) override;

 private:
  // Collect the (first step, last step) lifecycle of the variables into
  // `lifecycles`, visiting the operators in the order given by `sort_kind`.
  void CollectLifeCycle(
      std::unordered_map<std::string, lifecycle_t> *lifecycles,
      int sort_kind) const;

  // Collect the tensor nodes of the graph and their memory size.
  void CollectVarMemorySize(
      const std::unordered_map<std::string, size_t> &batch_var_ave_dim,
      std::unordered_map<std::string, framework::ir::Node *> *tensor_nodes,
      space_table_t *space_table) const;

  // Make a reuse plan. The plan is written to `reuse_table`, and the
  // allocated and saved memory are reported through `memory_allocation`.
  void MakeReusePlan(
      const std::vector<std::unordered_set<std::string>> &var_clusters,
      const std::unordered_map<std::string, size_t> &var_batch_ave_size,
      const space_table_t &space_table,
      std::unordered_map<std::string, std::string> *reuse_table, int sort_kind,
      MemoryAllocation *memory_allocation) const;

  // Apply `reuse_table` to the graph and collect the variables that became
  // obsolete into `vars2remove`.
  void PerformReusePlan(
      const std::unordered_map<std::string, std::string> &reuse_table,
      int sort_kind, std::unordered_set<std::string> *vars2remove) const;

 public:
  std::string repr() const override;

 private:
  // Both members are mutable because they are assigned in const methods.
  mutable framework::ir::Graph *graph_{nullptr};
  mutable int max_lifecycle_{-1};
};
// Compose the path of the memory cache file: `model_path` (or `prog_path`
// when `model_path` is empty) followed by ".memory_cache".
//
// Declared inline because the definition lives in a header; a static
// function would be duplicated in every translation unit that includes it.
inline std::string GetMemoryCachePath(const std::string &model_path,
                                      const std::string &prog_path) {
  const std::string &base = model_path.empty() ? prog_path : model_path;
  return base + ".memory_cache";
}
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/analysis/passes/passes.cc
浏览文件 @
885c4e57
...
...
@@ -13,24 +13,31 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/passes.h"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
PassRegistry
::
PassRegistry
()
{
// Register manually to avoid the trivial `USE_OP` like macro for easier use
// and link.
passes_
.
emplace
(
"ir_analysis_pass"
,
std
::
unique_ptr
<
AnalysisPass
>
(
new
IrAnalysisPass
));
passes_
.
emplace
(
"ir_graph_build_pass"
,
std
::
unique_ptr
<
AnalysisPass
>
(
new
IrGraphBuildPass
));
passes_
.
emplace
(
"
ir_analysis_compos
e_pass"
,
std
::
unique_ptr
<
AnalysisPass
>
(
new
IrAnalysisCompos
ePass
));
passes_
.
emplace
(
"
memory_optimiz
e_pass"
,
std
::
unique_ptr
<
AnalysisPass
>
(
new
MemoryOptimiz
ePass
));
passes_
.
emplace
(
"ir_params_sync_among_devices_pass"
,
std
::
unique_ptr
<
AnalysisPass
>
(
new
IrParamsSyncAmongDevicesPass
));
passes_
.
emplace
(
"ir_graph_to_program_pass"
,
std
::
unique_ptr
<
IrGraphToProgramPass
>
(
new
IrGraphToProgramPass
));
}
}
// namespace analysis
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
885c4e57
...
...
@@ -18,8 +18,10 @@ if(APPLE)
endif
(
APPLE
)
set
(
inference_deps paddle_inference_api paddle_fluid_api analysis pass
ir_pass_manager naive_executor analysis_predictor
${
GLOB_PASS_LIB
}
)
set
(
inference_deps
${
analysis_deps
}
paddle_inference_api paddle_fluid_api
analysis pass naive_executor
${
GLOB_PASS_LIB
}
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
set
(
inference_deps
${
inference_deps
}
tensorrt_engine tensorrt_converter
)
...
...
@@ -29,7 +31,8 @@ add_subdirectory(details)
cc_library
(
analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder
)
cc_library
(
paddle_pass_builder SRCS paddle_pass_builder.cc
)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager
)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor
reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager
${
inference_deps
}
)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
analysis_config paddle_pass_builder zero_copy_tensor
...
...
@@ -44,7 +47,7 @@ if(WITH_TESTING)
ARGS --word2vec_dirname=
${
WORD2VEC_MODEL_DIR
}
--book_dirname=
${
PYTHON_TESTS_DIR
}
/book
)
set_tests_properties
(
test_api_impl PROPERTIES DEPENDS test_image_classification
)
endif
()
cc_test
(
test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor
${
inference_deps
}
cc_test
(
test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor
benchmark
${
inference_deps
}
ARGS --dirname=
${
WORD2VEC_MODEL_DIR
}
)
if
(
WITH_ANAKIN AND WITH_MKL
)
# only needed in CI
...
...
paddle/fluid/inference/api/analysis_config.cc
浏览文件 @
885c4e57
...
...
@@ -44,16 +44,22 @@ PassStrategy *contrib::AnalysisConfig::pass_builder() const {
// Construct a config for a model given by its directory, then call Update().
contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) {
  this->model_dir_ = model_dir;
  this->Update();
}
// Construct a config for a model given by its program file and its
// parameters file, then call Update().
contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file,
                                        const std::string &params_file) {
  this->prog_file_ = prog_file;
  this->params_file_ = params_file;
  this->Update();
}
void
contrib
::
AnalysisConfig
::
SetModel
(
const
std
::
string
&
prog_file_path
,
const
std
::
string
&
params_file_path
)
{
prog_file_
=
prog_file_path
;
params_file_
=
params_file_path
;
Update
();
}
void
contrib
::
AnalysisConfig
::
EnableUseGpu
(
uint64_t
memory_pool_init_size_mb
,
int
device_id
)
{
...
...
@@ -62,11 +68,17 @@ void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
memory_pool_init_size_mb_
=
memory_pool_init_size_mb
;
device_id_
=
device_id
;
#else
LOG
(
ERROR
)
<<
"Please compile with gpu to EnableGpu"
;
LOG
(
ERROR
)
<<
"Please compile with gpu to EnableGpu
()
"
;
use_gpu_
=
false
;
#endif
Update
();
}
void
contrib
::
AnalysisConfig
::
DisableGpu
()
{
use_gpu_
=
false
;
Update
();
}
void
contrib
::
AnalysisConfig
::
DisableGpu
()
{
use_gpu_
=
false
;
}
contrib
::
AnalysisConfig
::
AnalysisConfig
(
const
contrib
::
AnalysisConfig
&
other
)
{
#define CP_MEMBER(member__) member__ = other.member__;
...
...
@@ -81,6 +93,9 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
CP_MEMBER
(
use_gpu_
);
CP_MEMBER
(
device_id_
);
CP_MEMBER
(
memory_pool_init_size_mb_
);
CP_MEMBER
(
enable_memory_optim_
);
CP_MEMBER
(
memory_optim_force_update_
);
// TensorRT releated.
CP_MEMBER
(
use_tensorrt_
);
CP_MEMBER
(
tensorrt_workspace_size_
);
...
...
@@ -109,6 +124,8 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
}
#undef CP_MEMBER
Update
();
}
void
contrib
::
AnalysisConfig
::
EnableMKLDNN
()
{
...
...
@@ -119,33 +136,64 @@ void contrib::AnalysisConfig::EnableMKLDNN() {
LOG
(
ERROR
)
<<
"Please compile with MKLDNN first to use MKLDNN"
;
use_mkldnn_
=
false
;
#endif
Update
();
}
void
contrib
::
AnalysisConfig
::
EnableTensorRtEngine
(
int
workspace_size
,
int
max_batch_size
,
int
min_subgraph_size
)
{
#ifdef PADDLE_WITH_CUDA
if
(
!
use_gpu
())
{
LOG
(
ERROR
)
<<
"To use TensorRT engine, please call EnableGpu() first"
;
return
;
}
use_tensorrt_
=
true
;
tensorrt_workspace_size_
=
workspace_size
;
tensorrt_max_batchsize_
=
max_batch_size
;
tensorrt_min_subgraph_size_
=
min_subgraph_size
;
Update
();
#else
LOG
(
ERROR
)
<<
"To use TensorRT engine, please compile inference lib with GPU first."
;
#endif
}
// TODO(Superjomn) refactor this, buggy.
void
contrib
::
AnalysisConfig
::
Update
()
{
auto
info
=
SerializeInfoCache
();
if
(
info
==
serialized_info_cache_
)
return
;
if
(
use_gpu_
)
{
// Transfer pass_builder and copy the existing compatible passes.
if
(
!
pass_builder_
||
((
use_gpu
()
^
pass_builder_
->
use_gpu
())))
{
if
(
use_gpu
())
{
pass_builder_
.
reset
(
new
GpuPassStrategy
);
if
(
use_tensorrt_
)
{
// Append after the Affine_channel_conv_fuse pass.
pass_builder
()
->
InsertPass
(
3
,
"tensorrt_subgraph_pass"
);
}
}
else
{
pass_builder_
.
reset
(
new
CpuPassStrategy
);
}
if
(
use_tensorrt_
)
{
if
(
!
use_gpu_
)
{
LOG
(
ERROR
)
<<
"TensorRT engine is not available when EnableGpu() not actived."
;
}
else
{
if
(
use_gpu
())
{
pass_builder_
.
reset
(
new
GpuPassStrategy
(
*
static_cast
<
GpuPassStrategy
*>
(
pass_builder_
.
get
())));
}
else
{
pass_builder_
.
reset
(
new
CpuPassStrategy
(
*
static_cast
<
CpuPassStrategy
*>
(
pass_builder_
.
get
())));
}
}
if
(
use_tensorrt_
)
{
const
auto
&
passes
=
pass_builder_
->
AllPasses
();
if
(
std
::
find
(
passes
.
begin
(),
passes
.
end
(),
"tensorrt_subgraph_pass"
)
==
std
::
end
(
passes
))
{
// Append after the Affine_channel_conv_fuse pass.
pass_builder
()
->
InsertPass
(
3
,
"tensorrt_subgraph_pass"
);
}
...
...
@@ -165,6 +213,10 @@ void contrib::AnalysisConfig::Update() {
#endif
}
if
(
enable_memory_optim_
)
{
pass_builder
()
->
AppendAnalysisPass
(
"memory_optimize_pass"
);
}
if
(
ir_debug_
)
{
pass_builder
()
->
TurnOnDebug
();
}
...
...
@@ -172,24 +224,43 @@ void contrib::AnalysisConfig::Update() {
// Flattens the configuration into a single string. Update() compares this
// string with serialized_info_cache_ to decide whether anything changed.
//
// Every field is followed by a separator. Without one, neighbouring fields
// run together and different configurations can serialize to the same text:
// device_id_ = 1 with memory_pool_init_size_mb_ = 1100 and device_id_ = 11
// with memory_pool_init_size_mb_ = 100 would both contribute "11100", so a
// real change could be mistaken for "nothing changed".
std::string contrib::AnalysisConfig::SerializeInfoCache() {
  const char kSep = ';';
  std::stringstream ss;

  // Model location.
  ss << model_dir_ << kSep;
  ss << prog_file_ << kSep;
  ss << params_file_ << kSep;

  // Device settings.
  ss << use_gpu_ << kSep;
  ss << device_id_ << kSep;
  ss << memory_pool_init_size_mb_ << kSep;

  // TensorRT settings.
  ss << use_tensorrt_ << kSep;
  ss << tensorrt_workspace_size_ << kSep;
  ss << tensorrt_max_batchsize_ << kSep;
  ss << tensorrt_min_subgraph_size_ << kSep;

  // Memory optimization settings.
  ss << enable_memory_optim_ << kSep;
  ss << memory_optim_force_update_ << kSep;

  // MKLDNN settings; the extra separator after the loop terminates the list.
  ss << use_mkldnn_ << kSep;
  for (auto &item : mkldnn_enabled_op_types_) ss << item << kSep;
  ss << kSep;

  // Remaining switches.
  ss << model_from_memory_ << kSep;
  ss << enable_ir_optim_ << kSep;
  ss << use_feed_fetch_ops_ << kSep;
  ss << ir_debug_ << kSep;
  ss << specify_input_name_ << kSep;
  ss << cpu_math_library_num_threads_ << kSep;

  return ss.str();
}
// Stores the requested thread count in cpu_math_library_num_threads_ and then
// runs Update().
void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads(
    int cpu_math_library_num_threads) {
  cpu_math_library_num_threads_ = cpu_math_library_num_threads;

  Update();
}
float
contrib
::
AnalysisConfig
::
fraction_of_gpu_memory_for_pool
()
const
{
...
...
@@ -207,6 +278,17 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
#endif
}
// Switches memory optimization on.
//
// force_update_cache is stored in memory_optim_force_update_; the predictor
// reads that flag when deciding whether to collect variable shapes again.
void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) {
  memory_optim_force_update_ = force_update_cache;
  enable_memory_optim_ = true;

  Update();
}
// Tells whether memory optimization has been switched on for this config.
bool contrib::AnalysisConfig::enable_memory_optim() const {
  return enable_memory_optim_;
}
void
contrib
::
AnalysisConfig
::
SetModelBuffer
(
const
char
*
prog_buffer
,
size_t
prog_buffer_size
,
const
char
*
param_buffer
,
...
...
@@ -214,6 +296,8 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
prog_file_
=
std
::
string
(
prog_buffer
,
prog_buffer
+
prog_buffer_size
);
params_file_
=
std
::
string
(
param_buffer
,
param_buffer
+
param_buffer_size
);
model_from_memory_
=
true
;
Update
();
}
}
// namespace paddle
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
885c4e57
...
...
@@ -24,18 +24,21 @@
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#endif
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
#if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#endif
DECLARE_bool
(
profile
);
namespace
paddle
{
...
...
@@ -189,6 +192,12 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
LOG
(
ERROR
)
<<
"fail to get fetches"
;
return
false
;
}
// Collect variable shapes for memory optimization.
if
(
need_collect_var_shapes_for_memory_optim
())
{
CollectVarShapes
();
}
VLOG
(
3
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
// All the containers in the scope will be hold in inference, but the
...
...
@@ -317,6 +326,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_
.
SetUseGPU
(
config_
.
use_gpu
());
argument_
.
SetGPUDeviceId
(
config_
.
gpu_device_id
());
argument_
.
SetEnableMemoryOptim
(
config_
.
enable_memory_optim
());
argument_
.
SetMemoryOptimForceUpdate
(
config_
.
memory_optim_force_update_
);
argument_
.
SetModelFromMemory
(
config_
.
model_from_memory_
);
// Analyze inference_program
if
(
!
config_
.
model_dir
().
empty
())
{
...
...
@@ -331,6 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}
if
(
config_
.
use_gpu
()
&&
config_
.
tensorrt_engine_enabled
())
{
LOG
(
INFO
)
<<
"TensorRT subgraph engine is enabled"
;
argument_
.
SetUseTensorRT
(
true
);
argument_
.
SetTensorRtWorkspaceSize
(
config_
.
tensorrt_workspace_size_
);
argument_
.
SetTensorRtMaxBatchSize
(
config_
.
tensorrt_max_batchsize_
);
...
...
@@ -338,12 +350,17 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}
if
(
config_
.
use_mkldnn_
)
{
LOG
(
INFO
)
<<
"MKLDNN is enabled"
;
argument_
.
SetMKLDNNEnabledOpTypes
(
config_
.
mkldnn_enabled_op_types_
);
}
auto
passes
=
config_
.
pass_builder
()
->
AllPasses
();
if
(
!
config_
.
ir_optim
())
passes
.
clear
();
if
(
!
config_
.
ir_optim
())
{
passes
.
clear
();
LOG
(
INFO
)
<<
"ir_optim is turned off, no IR pass will be executed"
;
}
argument_
.
SetIrAnalysisPasses
(
passes
);
argument_
.
SetAnalysisPasses
(
config_
.
pass_builder
()
->
AnalysisPasses
());
argument_
.
SetScopeNotOwned
(
const_cast
<
framework
::
Scope
*>
(
scope_
.
get
()));
Analyzer
().
Run
(
&
argument_
);
...
...
@@ -558,6 +575,13 @@ AnalysisPredictor::~AnalysisPredictor() {
if
(
sub_scope_
)
{
scope_
->
DeleteScope
(
sub_scope_
);
}
// TODO(Superjomn) deduce the directory path.
std
::
string
out_path
=
inference
::
analysis
::
GetMemoryCachePath
(
config_
.
model_dir
(),
config_
.
prog_file
());
if
(
need_collect_var_shapes_for_memory_optim
())
{
SerializeBatchVarShapes
(
out_path
);
}
}
std
::
unique_ptr
<
PaddlePredictor
>
AnalysisPredictor
::
Clone
()
{
...
...
@@ -567,6 +591,66 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
return
std
::
unique_ptr
<
PaddlePredictor
>
(
x
);
}
// Takes a snapshot of the shapes of the tensor variables of block 0 and
// appends it to batch_var_shapes_. One snapshot is one "batch".
//
// Nothing is recorded once max_shape_collect_count_ snapshots exist. Every
// local variable name of block 0 must be present in sub_scope_, otherwise
// PADDLE_ENFORCE_NOT_NULL fails.
void AnalysisPredictor::CollectVarShapes() {
  VLOG(4) << "Collecting var shapes";
  if (batch_var_shapes_.size() >= max_shape_collect_count_) return;

  std::map<std::string, std::vector<int>> var_shapes;
  // Bind each name by const reference: the loop only reads it, so there is no
  // reason to copy a std::string per variable.
  for (const auto &var_name : inference_program_->Block(0).LocalVarNames()) {
    auto *var = sub_scope_->FindVar(var_name);
    PADDLE_ENFORCE_NOT_NULL(var);

    const bool holds_tensor =
        var->Type() == framework::VarTypeTrait<framework::LoDTensor>::kId ||
        var->Type() == framework::VarTypeTrait<framework::Tensor>::kId;
    if (!holds_tensor) continue;

    // NOTE(review): a variable whose type is plain Tensor is also read through
    // Get<LoDTensor>() here -- confirm that this is intended.
    auto &tensor = var->Get<framework::LoDTensor>();
    auto shape = framework::vectorize(tensor.dims());
    // vectorize() may yield a wider integer type; assign() narrows to int.
    var_shapes[var_name].assign(shape.begin(), shape.end());
  }

  batch_var_shapes_.push_back(var_shapes);
  LOG_FIRST_N(INFO, 1) << "Collected " << batch_var_shapes_.size()
                       << " batch of var shapes for analysis";
}
void
AnalysisPredictor
::
SerializeBatchVarShapes
(
const
std
::
string
&
path
)
{
LOG
(
INFO
)
<<
"serialize batch var shapes to "
<<
path
;
std
::
ofstream
file
(
path
);
if
(
!
file
.
is_open
())
{
LOG
(
ERROR
)
<<
"failed to serialize the var shapes to "
<<
path
;
return
;
}
// The sirialized data format:
// <tensor_name>:dim0,dim1,dim2,;
for
(
auto
&
batch
:
batch_var_shapes_
)
{
for
(
auto
&
ele
:
batch
)
{
file
<<
ele
.
first
<<
":"
;
for
(
size_t
i
=
0
;
i
<
ele
.
second
.
size
()
-
1
;
i
++
)
{
file
<<
ele
.
second
[
i
]
<<
","
;
}
file
<<
ele
.
second
.
back
()
<<
";"
;
}
file
<<
"
\n
"
;
}
}
// Decides whether variable shapes have to be collected for memory
// optimization, and remembers the answer in need_collect_var_shapes_.
//
// Collection is needed only when memory optimization is enabled and either
// the memory cache file does not exist yet or the config asks for a forced
// update of the cache.
bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
  // A non-negative value means the decision was already made (0 or 1).
  if (need_collect_var_shapes_ >= 0) return need_collect_var_shapes_;

  bool need = false;
  if (config_.enable_memory_optim()) {
    const bool cache_missing =
        !inference::IsFileExists(inference::analysis::GetMemoryCachePath(
            config_.model_dir(), config_.prog_file()));
    need = cache_missing || config_.memory_optim_force_update_;
  }

  need_collect_var_shapes_ = need ? 1 : 0;
  return need;
}
template
<
>
std
::
unique_ptr
<
PaddlePredictor
>
CreatePaddlePredictor
<
contrib
::
AnalysisConfig
>
(
const
contrib
::
AnalysisConfig
&
config
)
{
...
...
paddle/fluid/inference/api/analysis_predictor.h
浏览文件 @
885c4e57
...
...
@@ -75,6 +75,11 @@ class AnalysisPredictor : public PaddlePredictor {
void
SetMkldnnThreadID
(
int
tid
);
protected:
// For memory optimization.
bool
need_collect_var_shapes_for_memory_optim
();
void
CollectVarShapes
();
void
SerializeBatchVarShapes
(
const
std
::
string
&
path
);
bool
PrepareProgram
(
const
std
::
shared_ptr
<
framework
::
ProgramDesc
>
&
program
);
bool
PrepareScope
(
const
std
::
shared_ptr
<
framework
::
Scope
>
&
parent_scope
);
bool
CreateExecutor
();
...
...
@@ -118,6 +123,11 @@ class AnalysisPredictor : public PaddlePredictor {
// A mutex help to make Clone thread safe.
std
::
mutex
clone_mutex_
;
// For memory optimization.
const
size_t
max_shape_collect_count_
{
1000
};
int
need_collect_var_shapes_
{
-
1
};
// -1 for default, 0 for false, 1 for true.
std
::
vector
<
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>>
batch_var_shapes_
;
private:
// Some status here that help to determine the status inside the predictor.
bool
status_program_optimized_
{
false
};
...
...
paddle/fluid/inference/api/analysis_predictor_tester.cc
浏览文件 @
885c4e57
...
...
@@ -16,8 +16,10 @@
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_string
(
dirname
,
""
,
"dirname to tests."
);
...
...
@@ -191,4 +193,53 @@ TEST(AnalysisPredictor, Clone) {
}
}
// Runs the same input through an analysis predictor with memory optimization
// and through a native predictor, and compares the two outputs.
TEST(AnalysisPredictor, memory_optim) {
  // 1. Configuration: CPU only, memory optimization with a forced cache
  // update, IR debugging on.
  AnalysisConfig config(FLAGS_dirname);
  config.DisableGpu();
  config.EnableMemoryOptim(true);
  config.pass_builder()->TurnOnDebug();

  auto native_predictor =
      CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());

  // 2. Dummy input data: four identical INT64 tensors of shape {4, 1}.
  int64_t raw_values[4] = {1, 2, 3, 4};
  PaddleTensor tensor;
  tensor.shape = std::vector<int>({4, 1});
  tensor.data.Reset(raw_values, sizeof(raw_values));
  tensor.dtype = PaddleDType::INT64;

  std::vector<PaddleTensor> inputs(4, tensor);
  std::vector<PaddleTensor> analysis_output, native_output;

  {
    // The first predictor helps to cache the memory optimization strategy.
    auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
    // Run several times to check the parameters are not reused by mistake.
    for (int round = 0; round < 5; round++) {
      ASSERT_TRUE(predictor->Run(inputs, &analysis_output));
    }
  }

  {
    analysis_output.clear();
    // The second predictor performs the memory optimization; the cache is no
    // longer forced to update.
    config.EnableMemoryOptim(false);
    auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);

    // Run with memory optimization.
    ASSERT_TRUE(predictor->Run(inputs, &analysis_output));
  }

  // Run native.
  ASSERT_TRUE(native_predictor->Run(inputs, &native_output));

  LOG(INFO) << "the output "
            << inference::DescribeTensor(analysis_output.front());
  LOG(INFO) << "the native output "
            << inference::DescribeTensor(native_output.front());

  inference::CompareResult(analysis_output, native_output);
}
}
// namespace paddle
paddle/fluid/inference/api/demo_ci/run.sh
浏览文件 @
885c4e57
#!/bin/bash
set
-x
PADDLE_ROOT
=
$1
TURN_ON_MKL
=
$2
# use MKL or Openblas
...
...
paddle/fluid/inference/api/helper.h
浏览文件 @
885c4e57
...
...
@@ -15,7 +15,10 @@
#pragma once
#include <glog/logging.h>
#include <fstream>
#if !defined(_WIN32)
#include <sys/time.h>
#endif
#include <algorithm>
#include <chrono> // NOLINT
#include <iterator>
...
...
@@ -182,7 +185,8 @@ static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) {
return
true
;
}
static
std
::
string
DescribeTensor
(
const
PaddleTensor
&
tensor
)
{
static
std
::
string
DescribeTensor
(
const
PaddleTensor
&
tensor
,
int
max_num_of_data
=
15
)
{
std
::
stringstream
os
;
os
<<
"Tensor ["
<<
tensor
.
name
<<
"]
\n
"
;
os
<<
" - type: "
;
...
...
@@ -253,5 +257,12 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
}
}
// Returns true when `path` can be opened for reading, false otherwise.
// The probe stream is a temporary, so the file is closed again right away.
static bool IsFileExists(const std::string &path) {
  return std::ifstream(path).is_open();
}
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/api/paddle_analysis_config.h
浏览文件 @
885c4e57
...
...
@@ -192,6 +192,13 @@ struct AnalysisConfig {
*/
bool model_from_memory() const {
  // Plain accessor for the model_from_memory_ flag.
  return model_from_memory_;
}
/** Turn on memory optimization.
* NOTE: still in development; it will be released later.
*/
void
EnableMemoryOptim
(
bool
force_update_cache
=
false
);
/** Tell whether the memory optimization is activated. */
bool
enable_memory_optim
()
const
;
friend
class
::
paddle
::
AnalysisPredictor
;
/** NOTE just for developer, not an official API, easily to be broken.
...
...
@@ -232,6 +239,10 @@ struct AnalysisConfig {
// subgraph, 3 as default value.
int
tensorrt_min_subgraph_size_
{
3
};
// memory reuse related.
bool
enable_memory_optim_
{
false
};
bool
memory_optim_force_update_
{
false
};
bool
use_mkldnn_
{
false
};
std
::
unordered_set
<
std
::
string
>
mkldnn_enabled_op_types_
;
...
...
paddle/fluid/inference/api/paddle_pass_builder.cc
浏览文件 @
885c4e57
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
#include <glog/logging.h>
namespace
paddle
{
...
...
@@ -65,4 +66,8 @@ void GpuPassStrategy::EnableMKLDNN() {
LOG
(
ERROR
)
<<
"GPU not support MKLDNN yet"
;
}
// Adds `pass` to the end of the analysis-pass list.
void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
  analysis_passes_.emplace_back(pass);
}
}
// namespace paddle
paddle/fluid/inference/api/paddle_pass_builder.h
浏览文件 @
885c4e57
...
...
@@ -45,6 +45,9 @@ class PaddlePassBuilder {
/** Delete all the passes that has type `pass_type`. */
void
DeletePass
(
const
std
::
string
&
pass_type
);
/** Append an analysis pass. */
void
AppendAnalysisPass
(
const
std
::
string
&
pass
);
/** Visualize the computation graph after each pass by generating a DOT
* language file, one can draw them with the Graphviz toolkit.
*/
...
...
@@ -54,8 +57,18 @@ class PaddlePassBuilder {
std
::
string
DebugString
();
/** Read-only access to the list held in passes_. */
const std::vector<std::string> &AllPasses() const {
  return passes_;
}
/** Returns a copy of the analysis passes with "ir_graph_to_program_pass"
 * added at the end.
 */
std::vector<std::string> AnalysisPasses() const {
  std::vector<std::string> result(analysis_passes_.begin(),
                                  analysis_passes_.end());
  // ir_graph_to_program_pass must be the last pass so that any modification
  // of the IR is persisted to the program.
  result.emplace_back("ir_graph_to_program_pass");
  return result;
}
protected:
std
::
vector
<
std
::
string
>
analysis_passes_
{
{
"ir_graph_build_pass"
,
"ir_analysis_pass"
,
"ir_params_sync_among_devices_pass"
}};
std
::
vector
<
std
::
string
>
passes_
;
};
...
...
@@ -69,7 +82,7 @@ class PassStrategy : public PaddlePassBuilder {
/** The MKLDNN control exists in both CPU and GPU mode, because there can be
* still some CPU kernels running in CPU mode.
*/
virtual
void
EnableMKLDNN
()
=
0
;
virtual
void
EnableMKLDNN
()
{}
/** Returns the use_gpu_ flag of this strategy. */
bool use_gpu() const {
  return use_gpu_;
}
...
...
@@ -77,6 +90,7 @@ class PassStrategy : public PaddlePassBuilder {
protected:
bool
use_gpu_
{
false
};
bool
use_mkldnn_
{
false
};
};
/** The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
...
...
@@ -107,25 +121,31 @@ class CpuPassStrategy : public PassStrategy {
use_gpu_
=
false
;
}
explicit
CpuPassStrategy
(
const
CpuPassStrategy
&
other
)
:
PassStrategy
(
other
.
AllPasses
())
{}
virtual
~
CpuPassStrategy
()
=
default
;
void
EnableMKLDNN
()
override
{
// TODO(Superjomn) Consider the way to mix CPU with GPU.
#ifdef PADDLE_WITH_MKLDNN
if
(
!
use_mkldnn_
)
{
passes_
.
insert
(
passes_
.
begin
(),
"mkldnn_placement_pass"
);
for
(
auto
&
pass
:
std
::
vector
<
std
::
string
>
(
{
"depthwise_conv_mkldnn_pass"
,
//
for
(
auto
&
pass
:
std
::
vector
<
std
::
string
>
(
{
"depthwise_conv_mkldnn_pass"
,
//
"conv_bias_mkldnn_fuse_pass"
,
//
"conv3d_bias_mkldnn_fuse_pass"
,
//
"conv_relu_mkldnn_fuse_pass"
,
//
"conv_elementwise_add_mkldnn_fuse_pass"
}))
{
passes_
.
push_back
(
pass
);
}
}
use_mkldnn_
=
true
;
#else
use_mkldnn_
=
false
;
#endif
}
CpuPassStrategy
(
const
CpuPassStrategy
&
other
)
:
PassStrategy
(
other
.
passes_
)
{}
};
/** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
...
...
@@ -150,7 +170,7 @@ class GpuPassStrategy : public PassStrategy {
use_gpu_
=
true
;
}
GpuPassStrategy
(
const
GpuPassStrategy
&
other
)
explicit
GpuPassStrategy
(
const
GpuPassStrategy
&
other
)
:
PassStrategy
(
other
.
AllPasses
())
{
use_gpu_
=
true
;
}
...
...
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
885c4e57
...
...
@@ -19,7 +19,7 @@ endfunction()
function
(
inference_analysis_api_test target install_dir filename
)
inference_analysis_test
(
${
target
}
SRCS
${
filename
}
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
benchmark
ARGS --infer_model=
${
install_dir
}
/model --infer_data=
${
install_dir
}
/data.txt
)
endfunction
()
...
...
@@ -62,7 +62,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
# normal DAM
set
(
DAM_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/dam"
)
download_model_and_data
(
${
DAM_INSTALL_DIR
}
"DAM_model.tar.gz"
"DAM_data.txt.tar.gz"
)
inference_analysis_api_test
(
test_analyzer_dam
${
DAM_INSTALL_DIR
}
analyzer_dam_tester.cc SERIAL
)
inference_analysis_api_test
(
test_analyzer_dam
${
DAM_INSTALL_DIR
}
analyzer_dam_tester.cc
EXTRA_DEPS legacy_allocator
SERIAL
)
# small DAM
set
(
DAM_SMALL_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/small_dam"
)
...
...
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
浏览文件 @
885c4e57
...
...
@@ -126,6 +126,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
std
::
string
turn_mask_pre
=
"turn_mask_"
;
auto
one_batch
=
data
->
NextBatch
();
PADDLE_ENFORCE
(
!
one_batch
.
response
.
empty
());
int
size
=
one_batch
.
response
[
0
].
size
();
CHECK_EQ
(
size
,
kMaxTurnLen
);
// turn tensor assignment
...
...
@@ -200,6 +201,7 @@ void profile(bool use_mkldnn = false) {
std
::
vector
<
PaddleTensor
>
outputs
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
...
...
@@ -250,7 +252,35 @@ void compare(bool use_mkldnn = false) {
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
// Compares the results of NativeConfig and AnalysisConfig when memory
// optimization is enabled. Two rounds are run: the first forces the memory
// cache to be updated, the second uses that cache.
TEST(Analyzer_dam, compare_with_memory_optim) {
  // The small dam crashes (core dump) in CI although it works locally, so the
  // test body only runs when max_turn_num is 9.
  if (FLAGS_max_turn_num != 9) return;

  contrib::AnalysisConfig force_update_cfg, cached_cfg;
  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);

  // First round: force the memory cache to be updated.
  SetConfig(&force_update_cfg);
  force_update_cfg.EnableMemoryOptim(true);
  CompareNativeAndAnalysis(
      reinterpret_cast<const PaddlePredictor::Config *>(&force_update_cfg),
      input_slots_all);

  // Second round: use the memory cache and perform memory optimization.
  SetConfig(&cached_cfg);
  cached_cfg.EnableMemoryOptim();
  CompareNativeAndAnalysis(
      reinterpret_cast<const PaddlePredictor::Config *>(&cached_cfg),
      input_slots_all);
}
// Runs compare() with its default argument (use_mkldnn = false).
TEST(Analyzer_dam, compare) {
  compare();
}
#ifdef PADDLE_WITH_MKLDNN
// Same comparison with MKLDNN switched on; only built when MKLDNN is compiled
// in.
TEST(Analyzer_dam, compare_mkldnn) {
  compare(true /* use_mkldnn */);
}
#endif
...
...
paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
浏览文件 @
885c4e57
...
...
@@ -69,6 +69,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
TEST
(
Analyzer_Text_Classification
,
profile
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
cfg
.
pass_builder
()
->
TurnOnDebug
();
std
::
vector
<
PaddleTensor
>
outputs
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
...
...
@@ -98,6 +99,7 @@ TEST(Analyzer_Text_Classification, profile) {
TEST
(
Analyzer_Text_Classification
,
compare
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
cfg
.
EnableMemoryOptim
();
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
...
...
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
浏览文件 @
885c4e57
...
...
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
...
...
@@ -55,7 +56,7 @@ void SetConfig(AnalysisConfig *cfg) {
FLAGS_infer_model
+
"/__params__"
);
cfg
->
DisableGpu
();
cfg
->
SwitchIrDebug
();
cfg
->
SwitchSpecifyInputNames
();
cfg
->
SwitchSpecifyInputNames
(
false
);
// TODO(TJ): fix fusion gru
cfg
->
pass_builder
()
->
DeletePass
(
"fc_gru_fuse_pass"
);
}
...
...
@@ -86,6 +87,7 @@ void profile(bool use_mkldnn = false) {
if
(
use_mkldnn
)
{
cfg
.
EnableMKLDNN
();
}
// cfg.pass_builder()->TurnOnDebug();
std
::
vector
<
PaddleTensor
>
outputs
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
...
...
@@ -103,8 +105,7 @@ void profile(bool use_mkldnn = false) {
size_t
numel
=
output
.
data
.
length
()
/
PaddleDtypeSize
(
output
.
dtype
);
CHECK_EQ
(
numel
,
refer
.
data
.
size
());
for
(
size_t
i
=
0
;
i
<
numel
;
++
i
)
{
CHECK_LT
(
fabs
(
static_cast
<
float
*>
(
output
.
data
.
data
())[
i
]
-
refer
.
data
[
i
]),
EXPECT_NEAR
(
static_cast
<
float
*>
(
output
.
data
.
data
())[
i
],
refer
.
data
[
i
],
1e-5
);
}
}
...
...
paddle/fluid/inference/tests/api/tester_helper.h
浏览文件 @
885c4e57
...
...
@@ -15,6 +15,7 @@
#pragma once
#include <gtest/gtest.h>
#include <algorithm>
#include <string>
#include <thread> // NOLINT
...
...
@@ -28,9 +29,8 @@
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/tests/api/config_printer.h"
#include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/inference/utils/benchmark.h"
...
...
@@ -91,7 +91,7 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
float
*
pdata
=
static_cast
<
float
*>
(
out
.
data
.
data
());
float
*
pdata_ref
=
static_cast
<
float
*>
(
ref_out
.
data
.
data
());
for
(
size_t
j
=
0
;
j
<
size
;
++
j
)
{
EXPECT_NEAR
(
pdata_ref
[
j
],
pdata
[
j
]
,
FLAGS_accuracy
);
CHECK_LE
(
std
::
abs
(
pdata_ref
[
j
]
-
pdata
[
j
])
,
FLAGS_accuracy
);
}
break
;
}
...
...
paddle/fluid/inference/tests/api/trt_models_tester.cc
浏览文件 @
885c4e57
...
...
@@ -157,5 +157,10 @@ TEST(AnalysisPredictor, use_gpu) {
}
}
// Profiles the "mobilenet" model found under FLAGS_infer_model.
TEST(TensorRT_mobilenet, profile) {
  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
  // NOTE(review): the meaning of the two boolean arguments is defined by
  // profile(), which is not visible in this hunk.
  profile(model_dir, true, false);
}
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/utils/benchmark.h
浏览文件 @
885c4e57
...
...
@@ -11,8 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#pragma once
#include <fstream>
#include <iostream>
#include <string>
...
...
paddle/fluid/inference/utils/benchmark_tester.cc
浏览文件 @
885c4e57
...
...
@@ -16,7 +16,7 @@
#include <glog/logging.h>
#include <gtest/gtest.h>
using
namespace
paddle
::
inference
;
using
namespace
paddle
::
inference
;
// NOLINT
TEST
(
Benchmark
,
basic
)
{
Benchmark
benchmark
;
benchmark
.
SetName
(
"key0"
);
...
...
paddle/fluid/operators/controlflow/feed_op.cc
浏览文件 @
885c4e57
...
...
@@ -50,6 +50,7 @@ class FeedOp : public framework::OperatorBase {
<<
out_name
;
auto
&
feed_list
=
feed_var
->
Get
<
framework
::
FeedFetchList
>
();
PADDLE_ENFORCE_LT
(
static_cast
<
size_t
>
(
col
),
feed_list
.
size
());
auto
&
feed_item
=
feed_list
.
at
(
static_cast
<
size_t
>
(
col
));
auto
*
out_item
=
out_var
->
GetMutable
<
framework
::
FeedFetchType
>
();
...
...
paddle/fluid/string/pretty_log.h
浏览文件 @
885c4e57
...
...
@@ -66,5 +66,22 @@ static void PrettyLog(const std::string &style, const char *fmt,
std
::
cerr
<<
style
<<
Sprintf
(
fmt
,
args
...)
<<
reset
();
}
// Forwards `fmt` and `args` to PrettyLogEndl using the Style::info() style.
template <typename... Args>
static void PrettyLogInfo(const char *fmt, const Args &... args) {
  PrettyLogEndl(Style::info(), fmt, args...);
}
// Forwards `fmt` and `args` to PrettyLogEndl using the Style::detail() style.
template <typename... Args>
static void PrettyLogDetail(const char *fmt, const Args &... args) {
  PrettyLogEndl(Style::detail(), fmt, args...);
}
// Forwards `fmt` and `args` to PrettyLogEndl using the Style::H1() style.
template <typename... Args>
static void PrettyLogH1(const char *fmt, const Args &... args) {
  PrettyLogEndl(Style::H1(), fmt, args...);
}
// Forwards `fmt` and `args` to PrettyLogEndl using the Style::H2() style.
template <typename... Args>
static void PrettyLogH2(const char *fmt, const Args &... args) {
  PrettyLogEndl(Style::H2(), fmt, args...);
}
}
// namespace string
}
// namespace paddle
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录