Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
s920243400
PaddleDetection
提交
885c4e57
P
PaddleDetection
项目概览
s920243400
/
PaddleDetection
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleDetection
通知
2
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
885c4e57
编写于
1月 21, 2019
作者:
Y
Yan Chunwei
提交者:
GitHub
1月 21, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fea/infer memory optim2 (#14953)
上级
6597ccb0
变更
46
隐藏空白更改
内联
并排
Showing
46 changed file
with
1450 addition
and
92 deletion
+1450
-92
paddle/fluid/framework/ir/fc_fuse_pass.cc
paddle/fluid/framework/ir/fc_fuse_pass.cc
+1
-0
paddle/fluid/framework/ir/graph_helper.cc
paddle/fluid/framework/ir/graph_helper.cc
+141
-2
paddle/fluid/framework/ir/graph_helper.h
paddle/fluid/framework/ir/graph_helper.h
+17
-0
paddle/fluid/framework/ir/graph_to_program_pass.cc
paddle/fluid/framework/ir/graph_to_program_pass.cc
+24
-7
paddle/fluid/framework/ir/graph_to_program_pass.h
paddle/fluid/framework/ir/graph_to_program_pass.h
+4
-0
paddle/fluid/framework/ir/graph_viz_pass.cc
paddle/fluid/framework/ir/graph_viz_pass.cc
+1
-1
paddle/fluid/framework/ir/node.h
paddle/fluid/framework/ir/node.h
+1
-1
paddle/fluid/framework/naive_executor.cc
paddle/fluid/framework/naive_executor.cc
+5
-2
paddle/fluid/inference/analysis/CMakeLists.txt
paddle/fluid/inference/analysis/CMakeLists.txt
+1
-0
paddle/fluid/inference/analysis/analyzer.cc
paddle/fluid/inference/analysis/analyzer.cc
+10
-7
paddle/fluid/inference/analysis/analyzer.h
paddle/fluid/inference/analysis/analyzer.h
+1
-1
paddle/fluid/inference/analysis/analyzer_tester.cc
paddle/fluid/inference/analysis/analyzer_tester.cc
+4
-0
paddle/fluid/inference/analysis/argument.h
paddle/fluid/inference/analysis/argument.h
+11
-0
paddle/fluid/inference/analysis/helper.h
paddle/fluid/inference/analysis/helper.h
+7
-0
paddle/fluid/inference/analysis/ir_pass_manager.cc
paddle/fluid/inference/analysis/ir_pass_manager.cc
+1
-0
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+1
-1
paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
...e/fluid/inference/analysis/ir_passes/subgraph_detector.cc
+0
-1
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
...id/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+4
-0
paddle/fluid/inference/analysis/passes/CMakeLists.txt
paddle/fluid/inference/analysis/passes/CMakeLists.txt
+10
-3
paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
+12
-2
paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
+3
-0
paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
...uid/inference/analysis/passes/ir_graph_to_program_pass.cc
+45
-0
paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
...luid/inference/analysis/passes/ir_graph_to_program_pass.h
+3
-17
paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
...e/fluid/inference/analysis/passes/memory_optimize_pass.cc
+647
-0
paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
...le/fluid/inference/analysis/passes/memory_optimize_pass.h
+106
-0
paddle/fluid/inference/analysis/passes/passes.cc
paddle/fluid/inference/analysis/passes/passes.cc
+10
-3
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+7
-4
paddle/fluid/inference/api/analysis_config.cc
paddle/fluid/inference/api/analysis_config.cc
+93
-9
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+88
-4
paddle/fluid/inference/api/analysis_predictor.h
paddle/fluid/inference/api/analysis_predictor.h
+10
-0
paddle/fluid/inference/api/analysis_predictor_tester.cc
paddle/fluid/inference/api/analysis_predictor_tester.cc
+51
-0
paddle/fluid/inference/api/demo_ci/run.sh
paddle/fluid/inference/api/demo_ci/run.sh
+1
-0
paddle/fluid/inference/api/helper.h
paddle/fluid/inference/api/helper.h
+13
-2
paddle/fluid/inference/api/paddle_analysis_config.h
paddle/fluid/inference/api/paddle_analysis_config.h
+11
-0
paddle/fluid/inference/api/paddle_pass_builder.cc
paddle/fluid/inference/api/paddle_pass_builder.cc
+5
-0
paddle/fluid/inference/api/paddle_pass_builder.h
paddle/fluid/inference/api/paddle_pass_builder.h
+33
-13
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+2
-2
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+30
-0
paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
...nference/tests/api/analyzer_text_classification_tester.cc
+2
-0
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+5
-4
paddle/fluid/inference/tests/api/tester_helper.h
paddle/fluid/inference/tests/api/tester_helper.h
+3
-3
paddle/fluid/inference/tests/api/trt_models_tester.cc
paddle/fluid/inference/tests/api/trt_models_tester.cc
+5
-0
paddle/fluid/inference/utils/benchmark.h
paddle/fluid/inference/utils/benchmark.h
+1
-1
paddle/fluid/inference/utils/benchmark_tester.cc
paddle/fluid/inference/utils/benchmark_tester.cc
+2
-2
paddle/fluid/operators/controlflow/feed_op.cc
paddle/fluid/operators/controlflow/feed_op.cc
+1
-0
paddle/fluid/string/pretty_log.h
paddle/fluid/string/pretty_log.h
+17
-0
未找到文件。
paddle/fluid/framework/ir/fc_fuse_pass.cc
浏览文件 @
885c4e57
...
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
...
...
paddle/fluid/framework/ir/graph_helper.cc
浏览文件 @
885c4e57
...
...
@@ -18,8 +18,10 @@ limitations under the License. */
#include <fstream>
#include <iosfwd>
#include <ostream>
#include <stack>
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_traits.h"
DEFINE_string
(
print_sub_graph_dir
,
""
,
"FLAGS_print_sub_graph_dir is used "
...
...
@@ -41,7 +43,7 @@ void SortHelper(
}
}
VLOG
(
3
)
<<
"topology sort insert: "
<<
node
->
Name
()
VLOG
(
5
)
<<
"topology sort insert: "
<<
node
->
Name
()
<<
" "
<<
reinterpret_cast
<
void
*>
(
node
)
<<
" input "
<<
node
->
inputs
.
size
();
ret
->
push_back
(
node
);
}
...
...
@@ -99,12 +101,13 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
return
ret
;
}
// Build operator inlink edge table.
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
BuildOperationAdjList
(
const
Graph
&
graph
)
{
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
adj_list
;
for
(
auto
&
n
:
graph
.
Nodes
())
{
if
(
n
->
NodeType
()
!=
ir
::
Node
::
Type
::
kOperation
)
continue
;
if
(
!
n
->
IsOp
()
)
continue
;
if
(
adj_list
.
find
(
n
)
==
adj_list
.
end
())
{
adj_list
[
n
]
=
std
::
unordered_set
<
ir
::
Node
*>
();
}
...
...
@@ -121,6 +124,119 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
return
adj_list
;
}
// Build the operator out-link edge table: every operation node of `graph` is
// mapped to the set of operation nodes that consume at least one of its
// output variables. Operators whose outputs are never consumed still get an
// (empty) entry.
std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationOutAdjList(
    const Graph &graph) {
  std::map<ir::Node *, std::unordered_set<ir::Node *>> out_links;

  for (auto &producer : graph.Nodes()) {
    if (!producer->IsOp()) continue;

    // operator[] creates the empty entry on first access, so an operator
    // without consumers is still present in the table.
    auto &consumers = out_links[producer];

    for (auto &out_var : producer->outputs) {
      for (auto &consumer : out_var->outputs) {
        // Whatever reads a variable node is required to be an operation.
        PADDLE_ENFORCE(consumer->NodeType() == ir::Node::Type::kOperation);
        VLOG(40) << "adj " << consumer->Name()
                 << reinterpret_cast<void *>(consumer) << " -> "
                 << producer->Name() << reinterpret_cast<void *>(producer)
                 << " via " << out_var->Name()
                 << reinterpret_cast<void *>(out_var);
        consumers.insert(consumer);
      }
    }
  }
  return out_links;
}
std
::
vector
<
ir
::
Node
*>
OpDFSSort
(
const
Graph
&
graph
)
{
auto
edge_table
=
BuildOperationOutAdjList
(
graph
);
std
::
stack
<
Node
*>
stack
;
for
(
auto
&
ele
:
edge_table
)
{
if
(
ele
.
first
->
inputs
.
empty
())
{
// find the input ops (those without input vars)
stack
.
push
(
ele
.
first
);
}
else
{
// find the ops with only persistable vars as inputs.
bool
all_persistable
=
true
;
for
(
auto
*
input
:
ele
.
first
->
inputs
)
{
if
(
!
(
input
->
IsVar
()
&&
input
->
Var
()
&&
input
->
Var
()
->
Persistable
()))
{
all_persistable
=
false
;
}
}
if
(
all_persistable
)
{
stack
.
push
(
ele
.
first
);
}
}
}
std
::
vector
<
Node
*>
res
;
// start from the feed op and DFS
std
::
unordered_set
<
Node
*>
unique_set
;
while
(
!
stack
.
empty
())
{
// will start from the last feed by default.
auto
cur
=
stack
.
top
();
stack
.
pop
();
unique_set
.
insert
(
cur
);
res
.
push_back
(
cur
);
for
(
auto
*
op
:
edge_table
[
cur
])
{
if
(
!
unique_set
.
count
(
op
))
{
stack
.
push
(
op
);
}
}
}
return
res
;
}
std
::
vector
<
ir
::
Node
*>
TopologyDfsSortOperations
(
const
Graph
&
graph
)
{
std
::
vector
<
ir
::
Node
*>
nodes
;
std
::
unordered_map
<
Node
*
,
int
>
in_degree
;
auto
set_out_ops_ready
=
[
&
](
Node
*
var
)
{
for
(
auto
*
op
:
var
->
outputs
)
{
--
in_degree
[
op
];
}
};
// build in_degree
for
(
auto
*
node
:
graph
.
Nodes
())
{
if
(
node
->
IsOp
())
{
in_degree
[
node
]
+=
node
->
inputs
.
size
();
}
else
if
(
node
->
IsVar
()
&&
node
->
inputs
.
empty
())
{
// put all the inputs of the whole graph ready.
set_out_ops_ready
(
node
);
}
}
std
::
deque
<
Node
*>
op_queue
;
// first visit
for
(
auto
&
node
:
OpDFSSort
(
graph
))
{
if
(
node
->
IsOp
())
{
op_queue
.
push_back
(
node
);
}
}
// traverse the graph
int
num_ops
=
op_queue
.
size
();
while
(
num_ops
)
{
for
(
auto
it
=
op_queue
.
begin
();
it
!=
op_queue
.
end
();
it
++
)
{
auto
*&
cur_op
=
*
it
;
if
(
!
cur_op
||
in_degree
[
cur_op
]
>
0
)
continue
;
// visit this node
// put all the output var of this op valid.
for
(
auto
*
out_var
:
cur_op
->
outputs
)
{
if
(
!
out_var
)
continue
;
set_out_ops_ready
(
out_var
);
}
VLOG
(
8
)
<<
"visit "
<<
cur_op
->
Name
();
nodes
.
push_back
(
cur_op
);
cur_op
=
nullptr
;
num_ops
--
;
}
}
return
nodes
;
}
size_t
GraphNum
(
const
Graph
&
graph
)
{
std
::
unordered_set
<
ir
::
Node
*>
nodes
(
graph
.
Nodes
());
std
::
unordered_set
<
ir
::
Node
*>
visited_nodes
;
...
...
@@ -203,6 +319,29 @@ size_t GraphNum(const Graph &graph) {
return
graph_count
;
}
// Remove from `graph` every node that has neither inputs nor outputs.
void CleanIndividualNodes(Graph *graph) {
  // Gather the isolated nodes first; RemoveNode() is only called after the
  // scan over the graph's nodes has finished.
  std::unordered_set<Node *> isolated;
  for (auto *candidate : graph->Nodes()) {
    const bool connected =
        !candidate->inputs.empty() || !candidate->outputs.empty();
    if (connected) continue;
    isolated.insert(candidate);
  }

  for (auto *victim : isolated) {
    graph->RemoveNode(victim);
  }
}
// Order the operators of `graph` with the algorithm selected by `sort_kind`:
// SortKind::TS uses TopologySortOperations(), every other value uses
// TopologyDfsSortOperations().
std::vector<Node *> TopologyVarientSort(const Graph &graph,
                                        SortKind sort_kind) {
  if (sort_kind == SortKind::TS) {
    return framework::ir::TopologySortOperations(graph);
  }
  return framework::ir::TopologyDfsSortOperations(graph);
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/graph_helper.h
浏览文件 @
885c4e57
...
...
@@ -34,6 +34,23 @@ size_t GraphNum(const Graph &graph);
// `graph` must not contain cycles.
std
::
vector
<
ir
::
Node
*>
TopologySortOperations
(
const
Graph
&
graph
);
// Topological sort, but try to DFS.
std
::
vector
<
ir
::
Node
*>
TopologyDfsSortOperations
(
const
Graph
&
graph
);
// The available strategies for ordering the operators of a graph into a
// sequence.
enum class SortKind {
  // Plain topological sort.
  TS = 0,
  // Topological sort that tries to follow a depth-first order.
  TDFS = 1
};
// Several kinds of topological sort.
std
::
vector
<
Node
*>
TopologyVarientSort
(
const
Graph
&
graph
,
SortKind
sort_kind
);
// Remove the nodes that are not connected to any other node.
void
CleanIndividualNodes
(
Graph
*
graph
);
// Build an adjacency list of operations for the `graph`.
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
BuildOperationAdjList
(
const
Graph
&
graph
);
...
...
paddle/fluid/framework/ir/graph_to_program_pass.cc
浏览文件 @
885c4e57
...
...
@@ -20,7 +20,6 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/program_desc.h"
namespace
paddle
{
...
...
@@ -29,6 +28,14 @@ namespace ir {
std
::
unique_ptr
<
Graph
>
GraphToProgramPass
::
ApplyImpl
(
std
::
unique_ptr
<
Graph
>
graph
)
const
{
// Remove the unneeded variables after memory optimization.
std
::
unordered_set
<
std
::
string
>
vars2remove
;
if
(
graph
->
Has
(
kGraphToProgramVarsToRemove
))
{
vars2remove
=
graph
->
Get
<
std
::
unordered_set
<
std
::
string
>>
(
kGraphToProgramVarsToRemove
);
VLOG
(
2
)
<<
"graph to program remove "
<<
vars2remove
.
size
()
<<
" nodes"
;
}
ProgramDesc
&
program
=
Get
<
ProgramDesc
>
(
"program"
);
std
::
unique_ptr
<
proto
::
ProgramDesc
>
program_pb
(
...
...
@@ -40,25 +47,35 @@ std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
std
::
unordered_set
<
std
::
string
>
visited_vars
;
for
(
ir
::
Node
*
n
:
graph
->
Nodes
())
{
if
(
n
->
IsVar
())
{
if
(
n
->
Var
()
&&
visited_vars
.
count
(
n
->
Var
()
->
Name
())
==
0
)
{
if
(
n
->
Var
()
&&
visited_vars
.
count
(
n
->
Var
()
->
Name
())
==
0
&&
!
vars2remove
.
count
(
n
->
Var
()
->
Name
()))
{
visited_vars
.
insert
(
n
->
Var
()
->
Name
());
block
->
add_vars
()
->
MergeFrom
(
*
n
->
Var
()
->
Proto
());
}
}
}
block
->
clear_ops
();
std
::
vector
<
ir
::
Node
*>
nodes
=
TopologySortOperations
(
*
graph
);
std
::
vector
<
ir
::
Node
*>
nodes
;
if
(
Has
(
kGraphToProgramSortKind
))
{
// Inference Memory Optimize relays on this branch.
int
sort_kind
=
Get
<
int
>
(
kGraphToProgramSortKind
);
nodes
=
TopologyVarientSort
(
*
graph
,
static_cast
<
framework
::
ir
::
SortKind
>
(
sort_kind
));
}
else
{
nodes
=
TopologySortOperations
(
*
graph
);
}
for
(
ir
::
Node
*
n
:
nodes
)
{
if
(
!
n
->
Op
())
{
continue
;
}
if
(
!
n
->
Op
())
continue
;
block
->
add_ops
()
->
MergeFrom
(
*
n
->
Op
()
->
Proto
());
}
program
.
CopyFrom
(
*
program_pb
);
return
graph
;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
...
...
paddle/fluid/framework/ir/graph_to_program_pass.h
浏览文件 @
885c4e57
...
...
@@ -20,6 +20,10 @@ namespace paddle {
namespace
framework
{
namespace
ir
{
// Graph attribute key: the names of the variables (stored as a
// std::unordered_set<std::string>) that GraphToProgramPass leaves out of the
// generated program.
constexpr char kGraphToProgramVarsToRemove[] =
    "__graph_to_program_vars_to_remove__";
// Pass attribute key: the SortKind, stored as an int, that GraphToProgramPass
// uses to order the operators.
constexpr char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__";
class
GraphToProgramPass
:
public
Pass
{
protected:
std
::
unique_ptr
<
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
Graph
>
graph
)
const
override
;
...
...
paddle/fluid/framework/ir/graph_viz_pass.cc
浏览文件 @
885c4e57
...
...
@@ -135,4 +135,4 @@ GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes(
}
// namespace paddle
REGISTER_PASS
(
graph_viz_pass
,
paddle
::
framework
::
ir
::
GraphVizPass
)
.
RequirePassAttr
(
paddle
::
framework
::
ir
::
kGraphVizPath
);
.
RequirePassAttr
(
paddle
::
framework
::
ir
::
kGraphVizPath
);
\ No newline at end of file
paddle/fluid/framework/ir/node.h
浏览文件 @
885c4e57
...
...
@@ -64,7 +64,7 @@ class Node {
std
::
string
Name
()
const
{
return
name_
;
}
VarDesc
*
Var
()
{
VarDesc
*
Var
()
const
{
PADDLE_ENFORCE
(
IsVar
());
return
var_desc_
.
get
();
}
...
...
paddle/fluid/framework/naive_executor.cc
浏览文件 @
885c4e57
...
...
@@ -50,8 +50,8 @@ void NaiveExecutor::Run() {
"running Paddle Inference"
;
#endif // PADDLE_ON_INFERENCE
for
(
auto
&
op
:
ops_
)
{
VLOG
(
3
)
<<
std
::
this_thread
::
get_id
()
<<
" run "
<<
op
->
Type
()
<<
" on scope "
<<
scope_
;
VLOG
(
4
)
<<
std
::
this_thread
::
get_id
()
<<
" run "
<<
op
->
DebugStringEx
(
scope_
)
<<
" on scope "
<<
scope_
;
op
->
SetIsCalledByExecutor
(
false
);
op
->
Run
(
*
scope_
,
place_
);
}
...
...
@@ -69,10 +69,12 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
anc
=
anc
->
parent
();
}
int
num_vars
=
0
;
for
(
auto
&
var
:
global_block
.
AllVars
())
{
if
(
var
->
Name
()
==
framework
::
kEmptyVarName
)
{
continue
;
}
num_vars
++
;
if
(
persistable
==
var
->
Persistable
())
{
if
(
persistable
)
{
...
...
@@ -90,6 +92,7 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
}
}
}
VLOG
(
4
)
<<
"naive executor create "
<<
num_vars
<<
" vars"
;
}
void
NaiveExecutor
::
CreateOps
(
const
ProgramDesc
&
desc
,
int
block_id
,
...
...
paddle/fluid/inference/analysis/CMakeLists.txt
浏览文件 @
885c4e57
...
...
@@ -18,6 +18,7 @@ cc_library(analysis SRCS
analyzer.cc
analysis_pass
DEPS
${
analysis_deps
}
analysis_helper
${
INFER_IR_PASSES
}
)
cc_test
(
test_dot SRCS dot_tester.cc DEPS analysis
)
...
...
paddle/fluid/inference/analysis/analyzer.cc
浏览文件 @
885c4e57
...
...
@@ -15,8 +15,8 @@
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
#include "paddle/fluid/inference/analysis/passes/passes.h"
#include "paddle/fluid/string/pretty_log.h"
namespace
paddle
{
namespace
inference
{
...
...
@@ -24,13 +24,16 @@ namespace analysis {
Analyzer
::
Analyzer
()
{}
void
Analyzer
::
Run
(
Argument
*
argument
)
{
Run
Ir
Analysis
(
argument
);
}
void
Analyzer
::
Run
(
Argument
*
argument
)
{
RunAnalysis
(
argument
);
}
void
Analyzer
::
RunIrAnalysis
(
Argument
*
argument
)
{
std
::
vector
<
std
::
string
>
passes
({
"ir_analysis_compose_pass"
});
for
(
auto
&
pass
:
passes
)
{
PassRegistry
::
Global
().
Retreive
(
pass
)
->
Run
(
argument
);
void
Analyzer
::
RunAnalysis
(
Argument
*
argument
)
{
PADDLE_ENFORCE
(
argument
->
analysis_passes_valid
(),
"analsis_passes is not valid in the argument."
);
for
(
auto
&
pass
:
argument
->
analysis_passes
())
{
string
::
PrettyLogH1
(
"--- Running analysis [%s]"
,
pass
);
auto
*
ptr
=
PassRegistry
::
Global
().
Retreive
(
pass
);
PADDLE_ENFORCE_NOT_NULL
(
ptr
,
"no analysis pass called %s"
,
pass
);
ptr
->
Run
(
argument
);
}
}
...
...
paddle/fluid/inference/analysis/analyzer.h
浏览文件 @
885c4e57
...
...
@@ -54,7 +54,7 @@ class Analyzer final {
DISABLE_COPY_AND_ASSIGN
(
Analyzer
);
protected:
void
Run
Ir
Analysis
(
Argument
*
argument
);
void
RunAnalysis
(
Argument
*
argument
);
};
}
// namespace analysis
...
...
paddle/fluid/inference/analysis/analyzer_tester.cc
浏览文件 @
885c4e57
...
...
@@ -32,6 +32,8 @@ TEST(Analyzer, analysis_without_tensorrt) {
argument
.
SetModelDir
(
FLAGS_inference_model_dir
);
argument
.
SetIrAnalysisPasses
({
"infer_clean_graph_pass"
});
argument
.
SetUseGPU
(
false
);
argument
.
SetAnalysisPasses
({
"ir_graph_build_pass"
,
"ir_analysis_pass"
,
"ir_params_sync_among_devices_pass"
});
Analyzer
analyser
;
analyser
.
Run
(
&
argument
);
...
...
@@ -44,6 +46,8 @@ TEST(Analyzer, analysis_with_tensorrt) {
argument
.
SetModelDir
(
FLAGS_inference_model_dir
);
argument
.
SetIrAnalysisPasses
({
"infer_clean_graph_pass"
});
argument
.
SetUseGPU
(
false
);
argument
.
SetAnalysisPasses
({
"ir_graph_build_pass"
,
"ir_analysis_pass"
,
"ir_params_sync_among_devices_pass"
});
Analyzer
analyser
;
analyser
.
Run
(
&
argument
);
...
...
paddle/fluid/inference/analysis/argument.h
浏览文件 @
885c4e57
...
...
@@ -110,16 +110,20 @@ struct Argument {
// The overall Scope to work on.
DECL_ARGUMENT_UNIQUE_FIELD
(
scope
,
Scope
,
framework
::
Scope
);
// The default program, loaded from disk.
DECL_ARGUMENT_UNIQUE_FIELD
(
main_program
,
MainProgram
,
framework
::
ProgramDesc
);
// The ir passes to perform in analysis phase.
DECL_ARGUMENT_FIELD
(
ir_analysis_passes
,
IrAnalysisPasses
,
std
::
vector
<
std
::
string
>
);
DECL_ARGUMENT_FIELD
(
analysis_passes
,
AnalysisPasses
,
std
::
vector
<
std
::
string
>
);
// Pass a set of op types to enable its mkldnn kernel
DECL_ARGUMENT_FIELD
(
mkldnn_enabled_op_types
,
MKLDNNEnabledOpTypes
,
std
::
unordered_set
<
std
::
string
>
);
// Passed from config.
DECL_ARGUMENT_FIELD
(
use_gpu
,
UseGPU
,
bool
);
DECL_ARGUMENT_FIELD
(
gpu_device_id
,
GPUDeviceId
,
int
);
DECL_ARGUMENT_FIELD
(
use_tensorrt
,
UseTensorRT
,
bool
);
...
...
@@ -127,6 +131,13 @@ struct Argument {
DECL_ARGUMENT_FIELD
(
tensorrt_workspace_size
,
TensorRtWorkspaceSize
,
int
);
DECL_ARGUMENT_FIELD
(
tensorrt_min_subgraph_size
,
TensorRtMinSubgraphSize
,
int
);
// Memory optimized related.
DECL_ARGUMENT_FIELD
(
enable_memory_optim
,
EnableMemoryOptim
,
bool
);
DECL_ARGUMENT_FIELD
(
memory_optim_force_update
,
MemoryOptimForceUpdate
,
bool
);
// Indicates which sort algorithm is used to order the operators; the memory
// optimization relies on this ordering.
DECL_ARGUMENT_FIELD
(
memory_optim_sort_kind
,
MemoryOptimSortKind
,
int
);
// The program transformed by IR analysis phase.
DECL_ARGUMENT_UNIQUE_FIELD
(
ir_analyzed_program
,
IrAnalyzedProgram
,
framework
::
proto
::
ProgramDesc
);
...
...
paddle/fluid/inference/analysis/helper.h
浏览文件 @
885c4e57
...
...
@@ -28,6 +28,13 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
#ifdef _WIN32
#define GCC_ATTRIBUTE(attr__) ;
#else
#define GCC_ATTRIBUTE(attr__) __attribute__((attr__));
#endif
#define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result)
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
...
...
paddle/fluid/inference/analysis/ir_pass_manager.cc
浏览文件 @
885c4e57
...
...
@@ -83,6 +83,7 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
PADDLE_ENFORCE
(
graph
.
get
());
// Apply all the passes
for
(
const
auto
&
pass
:
passes_
)
{
if
(
pass
->
Type
()
==
"graph_viz_pass"
)
continue
;
PrettyLogEndl
(
Style
::
H2
(),
"--- Running IR pass [%s]"
,
pass
->
Type
());
graph
=
pass
->
Apply
(
std
::
move
(
graph
));
}
...
...
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
浏览文件 @
885c4e57
cc_library
(
subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc
)
if
(
TENSORRT_FOUND
)
if
(
WITH_GPU AND
TENSORRT_FOUND
)
cc_library
(
tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller
)
set
(
analysis_deps
${
analysis_deps
}
...
...
paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
浏览文件 @
885c4e57
...
...
@@ -413,7 +413,6 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() {
auto
subgraphs
=
SubgraphDetector
(
graph_
,
node_inside_subgraph_teller_
)();
for
(
auto
&
subgraph
:
subgraphs
)
{
if
(
subgraph
.
size
()
<=
(
size_t
)
min_subgraph_size_
)
continue
;
LOG
(
INFO
)
<<
"detect a subgraph size "
<<
subgraph
.
size
();
std
::
unordered_set
<
Node
*>
subgraph_uniq
(
subgraph
.
begin
(),
subgraph
.
end
());
// replace this sub-graph with the first node. Two steps: 1. Create a Block
// Node that contains this subgraph 2. Mark the nodes inside the sub-graph
...
...
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
浏览文件 @
885c4e57
...
...
@@ -21,6 +21,7 @@
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/string/pretty_log.h"
namespace
paddle
{
namespace
inference
{
...
...
@@ -77,6 +78,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
framework
::
BlockDesc
block_desc
(
nullptr
,
&
block_proto
);
block_desc
.
Proto
()
->
set_parent_idx
(
-
1
);
block_desc
.
Proto
()
->
set_idx
(
0
);
string
::
PrettyLogDetail
(
"--- detect a sub-graph with %d nodes"
,
subgraph
.
size
());
for
(
auto
*
node
:
subgraph
)
{
auto
*
op
=
block_desc
.
AppendOp
();
*
op
->
Proto
()
=
*
node
->
Op
()
->
Proto
();
...
...
paddle/fluid/inference/analysis/passes/CMakeLists.txt
浏览文件 @
885c4e57
cc_library
(
ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager
)
cc_library
(
ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager
)
cc_library
(
memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass
)
cc_library
(
ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager
)
cc_library
(
analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass
)
cc_library
(
ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass
)
cc_library
(
analysis_passes SRCS passes.cc DEPS
ir_graph_build_pass
ir_analysis_pass
ir_params_sync_among_devices_pass
memory_optim_pass
ir_graph_to_program_pass
)
set
(
analysis_deps
${
analysis_deps
}
ir_graph_build_pass
ir_analysis_pass
analysis_passes
subgraph_detector
CACHE INTERNAL
""
)
paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
浏览文件 @
885c4e57
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
namespace
paddle
{
...
...
@@ -31,9 +32,18 @@ void IrAnalysisPass::RunImpl(Argument* argument) {
IRPassManager
the_ir_manager
(
argument
);
graph
=
the_ir_manager
.
Apply
(
std
::
move
(
graph
));
PADDLE_ENFORCE_GT
(
graph
->
Nodes
().
size
(),
0
);
argument
->
SetIrAnalyzedProgram
(
new
framework
::
proto
::
ProgramDesc
(
the_ir_manager
.
AcquireProgram
(
&
graph
,
argument
->
main_program
())));
argument
->
SetMainGraph
(
graph
.
release
());
CollectFusionStatis
(
argument
);
}
// Copy the fusion statistics stored on the main graph (attribute
// kFuseStatisAttr) into `argument`. When the graph has no such attribute this
// is only logged and `argument` is left untouched.
void IrAnalysisPass::CollectFusionStatis(Argument* argument) {
  const bool has_statis =
      argument->main_graph().Has(framework::ir::kFuseStatisAttr);
  if (has_statis) {
    argument->SetFusionStatis(
        argument->main_graph().Get<Argument::fusion_statis_t>(
            framework::ir::kFuseStatisAttr));
    return;
  }
  LOG(INFO) << "argument has no fuse statis";
}
// Short textual name of this pass.
std::string IrAnalysisPass::repr() const {
  return std::string("ir-analysis-pass");
}
...
...
paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
浏览文件 @
885c4e57
...
...
@@ -29,6 +29,9 @@ namespace analysis {
class
IrAnalysisPass
:
public
AnalysisPass
{
public:
void
RunImpl
(
Argument
*
argument
)
override
;
void
CollectFusionStatis
(
Argument
*
argument
);
std
::
string
repr
()
const
override
;
};
...
...
paddle/fluid/inference/analysis/passes/ir_
analysis_compose
_pass.cc
→
paddle/fluid/inference/analysis/passes/ir_
graph_to_program
_pass.cc
浏览文件 @
885c4e57
...
...
@@ -12,49 +12,32 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/string/pretty_log.h"
#include "paddle/fluid/framework/program_desc.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
void
IrAnalysisComposePass
::
RunImpl
(
Argument
*
argument
)
{
ARGUMENT_CHECK_FIELD
(
argument
,
ir_analysis_passes
);
ApplyIrPasses
(
argument
);
CollectFusionStatis
(
argument
);
}
std
::
string
IrAnalysisComposePass
::
repr
()
const
{
return
"ir-analysis-compose-pass"
;
}
void
IrGraphToProgramPass
::
RunImpl
(
Argument
*
argument
)
{
auto
pass
=
framework
::
ir
::
PassRegistry
::
Instance
().
Get
(
"graph_to_program_pass"
);
void
IrAnalysisComposePass
::
ApplyIrPasses
(
Argument
*
argument
)
{
std
::
vector
<
std
::
string
>
passes
({
"ir_graph_build_pass"
,
"ir_analysis_pass"
,
"ir_params_sync_among_devices_pass"
,
});
for
(
const
auto
&
pass
:
passes
)
{
VLOG
(
2
)
<<
"Run pass "
<<
pass
;
auto
*
the_pass
=
PassRegistry
::
Global
().
Retreive
(
pass
);
the_pass
->
Run
(
argument
);
if
(
argument
->
memory_optim_sort_kind_valid
())
{
pass
->
Set
(
framework
::
ir
::
kGraphToProgramSortKind
,
new
int
(
argument
->
memory_optim_sort_kind
()));
}
}
void
IrAnalysisComposePass
::
CollectFusionStatis
(
Argument
*
argument
)
{
if
(
!
argument
->
main_graph
().
Has
(
framework
::
ir
::
kFuseStatisAttr
))
{
LOG
(
INFO
)
<<
"argument has no fuse statis"
;
return
;
}
argument
->
SetFusionStatis
(
argument
->
main_graph
().
Get
<
Argument
::
fusion_statis_t
>
(
framework
::
ir
::
kFuseStatisAttr
));
std
::
unique_ptr
<
Graph
>
graph
(
argument
->
main_graph_ptr
());
framework
::
ProgramDesc
desc
(
argument
->
main_program
());
pass
->
SetNotOwned
(
"program"
,
&
desc
)
;
auto
thegraph
=
pass
->
Apply
(
std
::
move
(
graph
))
;
thegraph
.
release
();
// the argument still own the graph.
argument
->
SetIrAnalyzedProgram
(
new
framework
::
proto
::
ProgramDesc
(
*
desc
.
Proto
()
));
}
}
// namespace analysis
...
...
paddle/fluid/inference/analysis/passes/ir_
analysis_compose
_pass.h
→
paddle/fluid/inference/analysis/passes/ir_
graph_to_program
_pass.h
浏览文件 @
885c4e57
...
...
@@ -14,31 +14,17 @@
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/passes.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
/*
* The analysis pass to run a list of IR passes (like a function call).
* Currently, it should be the first pass of analysis phase.
*/
class
IrAnalysisComposePass
:
public
AnalysisPass
{
class
IrGraphToProgramPass
:
public
AnalysisPass
{
public:
void
RunImpl
(
Argument
*
argument
)
override
;
std
::
string
repr
()
const
override
;
void
RunImpl
(
Argument
*
argument
)
override
;
private:
void
ApplyIrPasses
(
Argument
*
argument
);
void
CollectFusionStatis
(
Argument
*
argument
);
// Assign a Scope for IR passes to modify the weights.
void
AssignScopeToModify
(
Argument
*
argument
);
std
::
string
repr
()
const
override
{
return
"ir-graph-to-param-pass"
;
}
};
}
// namespace analysis
...
...
paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
0 → 100644
浏览文件 @
885c4e57
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include <algorithm>
#include <fstream>
#include <limits>
#include <map>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/string/pretty_log.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
using
framework
::
ir
::
Graph
;
using
framework
::
ir
::
Node
;
using
framework
::
ir
::
TopologyVarientSort
;
using
space_table_t
=
MemoryOptimizePass
::
space_table_t
;
// Collect the lifecycles of the tensors.
// Traverse the graph in topological order.
// The traversal order also affect the lifecycles, so different sort_kind is
// used.
void
MemoryOptimizePass
::
CollectLifeCycle
(
std
::
unordered_map
<
std
::
string
,
lifecycle_t
>*
lifecycles
,
int
sort_kind
)
const
{
max_lifecycle_
=
0
;
for
(
auto
*
op_node
:
framework
::
ir
::
TopologyVarientSort
(
*
graph_
,
static_cast
<
framework
::
ir
::
SortKind
>
(
sort_kind
)))
{
if
(
!
op_node
->
IsOp
())
continue
;
auto
reads
=
op_node
->
inputs
;
auto
writes
=
op_node
->
outputs
;
std
::
vector
<
Node
*>
requires
(
reads
.
begin
(),
reads
.
end
());
requires
.
insert
(
requires
.
end
(),
writes
.
begin
(),
writes
.
end
());
// Disable reuse of feed variables.
if
(
op_node
->
Name
()
==
"feed"
)
{
for
(
auto
*
node
:
op_node
->
outputs
)
{
auto
var
=
node
->
Name
();
lifecycles
->
emplace
(
var
,
std
::
make_pair
(
0
,
std
::
numeric_limits
<
int
>::
max
()));
}
}
else
{
// Normal operators.
for
(
const
Node
*
node
:
requires
)
{
if
(
node
->
Var
()
->
Persistable
())
continue
;
std
::
string
var
=
node
->
Name
();
if
(
!
lifecycles
->
count
(
var
))
{
(
*
lifecycles
)[
var
]
=
std
::
make_pair
(
max_lifecycle_
,
max_lifecycle_
);
}
else
{
(
*
lifecycles
)[
var
].
second
=
std
::
max
(
max_lifecycle_
,
lifecycles
->
at
(
var
).
second
);
// max()
}
}
}
++
max_lifecycle_
;
}
}
// TODO(Superjomn) Make this a general help method.
//
// Return the size in bytes of one element of the given tensor data type.
// Throws via PADDLE_THROW for a data type that is not listed below.
int DataTypeToSpace(framework::proto::VarType_Type type) {
  switch (type) {
    case framework::proto::VarType_Type_BOOL:
      return sizeof(bool);
    case framework::proto::VarType_Type_FP32:
      return sizeof(float);
    case framework::proto::VarType_Type_FP64:
      return sizeof(double);
    case framework::proto::VarType_Type_FP16:
      // Half precision floats occupy two bytes.
      return sizeof(int16_t);
    case framework::proto::VarType_Type_INT16:
      return sizeof(int16_t);
    case framework::proto::VarType_Type_INT32:
      return sizeof(int32_t);
    case framework::proto::VarType_Type_INT64:
      return sizeof(int64_t);
    case framework::proto::VarType_Type_UINT8:
      return sizeof(uint8_t);
    case framework::proto::VarType_Type_INT8:
      return sizeof(int8_t);
    default:
      PADDLE_THROW("Unknown data type");
  }
}
// Gather every temporary LoDTensor variable of the graph together with the
// memory it needs.
//
// Args:
//   @batch_var_ave_dim: average number of elements of each variable; a
//     collected variable must have an entry (looked up with `at`).
//   @tensor_nodes: output; variable name -> graph node.
//   @space_table: output; variable name -> size in bytes, computed as the
//     element size of the data type times the average number of elements.
void MemoryOptimizePass::CollectVarMemorySize(
    const std::unordered_map<std::string, size_t>& batch_var_ave_dim,
    std::unordered_map<std::string, Node*>* tensor_nodes,
    space_table_t* space_table) const {
  for (auto* node : graph_->Nodes()) {
    // Only LoDTensor variables take part in the memory reuse.
    if (!node->IsVar()) continue;
    if (node->Var()->GetType() !=
        framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) {
      continue;
    }
    // Parameters will not be reused.
    if (node->Var()->Persistable()) continue;

    (*tensor_nodes)[node->Name()] = node;
    (*space_table)[node->Name()] =
        DataTypeToSpace(node->Var()->GetDataType()) *
        batch_var_ave_dim.at(node->Name());
  }
}
// Find a suitable free tensor whose memory can be reused by `tensor`.
// The suitable tensor is the free one, in the same cluster, whose memory size
// is the closest to the memory demand (either larger or smaller).
//
// Args:
//   @tensor: the tensor that requires memory.
//   @space_required: the memory demand of `tensor` in bytes.
//   @tensor_nodes: the tensor nodes in the ir::Graph (not used here).
//   @free_existing_tensors: the tensors that are allocated and free.
//   @space_table: the memory space of tensors.
//   @var_clusters: the clusters of variables; reuse only happens inside the
//     cluster `tensor` belongs to.
//   @tensor2use: output; the tensor to reuse, only written on success.
//
// Returns:
//   true if found some existing tensor to reuse.
//   false if there is no suitable tensor to reuse, one needs to allocate a new
//   tensor for this requirement.
bool FindSuitableTensorToReuse(
    const std::string& tensor, int space_required,
    const std::unordered_map<std::string, Node*>& tensor_nodes,
    std::unordered_set<std::string>* free_existing_tensors,
    const space_table_t& space_table,
    const std::vector<std::unordered_set<std::string>>& var_clusters,
    std::string* tensor2use) __SHOULD_USE_RESULT__;

bool FindSuitableTensorToReuse(
    const std::string& tensor, int space_required,
    const std::unordered_map<std::string, Node*>& tensor_nodes,
    std::unordered_set<std::string>* free_existing_tensors,
    const space_table_t& space_table,
    const std::vector<std::unordered_set<std::string>>& var_clusters,
    std::string* tensor2use) {
  VLOG(5) << "Split Tensors to " << var_clusters.size() << " clusters";

  // find the cluster this var belongs to.
  const std::unordered_set<std::string>* cluster = nullptr;
  for (const auto& c : var_clusters) {
    if (c.count(tensor)) {
      cluster = &c;
      break;
    }
  }
  PADDLE_ENFORCE_NOT_NULL(cluster,
                          "something wrong in memory optimization, the "
                          "variable %s not in the clusters.",
                          tensor);

  const size_t required = static_cast<size_t>(space_required);
  // The best candidate so far; an explicit pointer is used instead of a magic
  // "infinite" distance so that any distance is a valid result.
  const std::string* best_candidate = nullptr;
  size_t best_diff = 0;

  for (const auto& candidate : *free_existing_tensors) {
    // This is not a temporary tensor.
    auto space_it = space_table.find(candidate);
    if (space_it == space_table.end()) continue;
    // Not in the same cluster.
    if (!cluster->count(candidate)) continue;

    const size_t space = space_it->second;
    // Absolute difference of two unsigned values; a plain subtraction would
    // wrap around when the candidate is smaller than the demand.
    const size_t space_diff =
        space > required ? space - required : required - space;
    if (best_candidate == nullptr || space_diff < best_diff) {
      best_candidate = &candidate;
      best_diff = space_diff;
    }
  }

  if (best_candidate == nullptr) return false;
  *tensor2use = *best_candidate;
  return true;
}
// Register `name` as a freshly allocated tensor instead of reusing an
// existing one.
//
// Args:
//   @name: the tensor to allocate.
//   @space_required: the memory demand in bytes.
//   @tensor_nodes: the tensor nodes in the ir::Graph (not used here).
//   @free_existing_tensors: the free and allocated tensors; `name` is added.
//   @space_table: the memory space of tensors; must already contain `name`,
//     its entry is raised to `space_required` if that is larger.
//   @reuse_table: a map from a tensor to the tensor whose memory it uses.
void AllocateNewTensor(
    const std::string& name, size_t space_required,
    const std::unordered_map<std::string, Node*>& tensor_nodes,
    std::unordered_set<std::string>* free_existing_tensors,
    space_table_t* space_table,
    std::unordered_map<std::string, std::string>* reuse_table) {
  // A tensor that has just been created is free to be used.
  free_existing_tensors->insert(name);

  // Record the space it owns.
  PADDLE_ENFORCE(space_table->count(name));
  size_t& registered_space = space_table->at(name);
  registered_space = std::max(registered_space, space_required);

  // A newly allocated tensor lives in its own memory.
  (*reuse_table)[name] = name;
}
// Release a tensor so that the memory it occupies becomes reusable.
// @tensor: the tensor to free; "feed" and "fetch" are ignored.
// @free_existing_tensors: the free and allocated tensors; receives the tensor
//   that really owns the memory used by `tensor`.
// @reuse_table: a map from a tensor to the existing allocated tensor whose
//   memory it uses; must contain `tensor` (looked up with `at`).
void FreeATensor(const std::string& tensor,
                 std::unordered_set<std::string>* free_existing_tensors,
                 std::unordered_map<std::string, std::string>* reuse_table) {
  const bool is_io_holder = (tensor == "feed") || (tensor == "fetch");
  if (!is_io_holder) {
    // Give back the really allocated tensor, not the alias.
    free_existing_tensors->insert(reuse_table->at(tensor));
  }
}
// Let `tensor` live in the memory of the free tensor `tensor2reuse`.
//
// Args:
//   @tensor: the tensor that needs memory.
//   @tensor2reuse: the free tensor to take over; it must currently be in
//     `free_existing_tensors` and is removed from it.
//   @memory_size: the memory demand of `tensor` in bytes.
//   @free_existing_tensors: the free and allocated tensors.
//   @reuse_table: a map from a tensor to the tensor whose memory it uses.
//   @reused_space_table: the memory size of each allocated tensor after reuse.
void ReuseATensor(const std::string& tensor, const std::string& tensor2reuse,
                  size_t memory_size,
                  std::unordered_set<std::string>* free_existing_tensors,
                  std::unordered_map<std::string, std::string>* reuse_table,
                  space_table_t* reused_space_table) {
  // The reused tensor is no longer free.
  auto it = free_existing_tensors->find(tensor2reuse);
  PADDLE_ENFORCE(it != free_existing_tensors->end());
  free_existing_tensors->erase(it);

  (*reuse_table)[tensor] = tensor2reuse;

  // The memory of the reused tensor grows when the new tenant requires more
  // than what is currently recorded.
  size_t& recorded_size = reused_space_table->at(tensor2reuse);
  recorded_size = std::max(recorded_size, memory_size);
}
// Calculate the memory usage of a reuse plan.
//
// Args:
//   @reuse_table: a map from a tensor to the tensor whose memory it uses.
//   @space_table: the memory space of tensors; must have an entry for every
//     key of `reuse_table`.
//   @var_batch_ave_size: not used here.
//   @allocated: output; total bytes of the tensors that own their memory
//     (entries that map to themselves).
//   @saved: output; total bytes of the tensors that live in the memory of
//     another tensor.
void EvaluateMemoryUsage(
    const std::unordered_map<std::string, std::string>& reuse_table,
    const space_table_t& space_table,
    const std::unordered_map<std::string, size_t>& var_batch_ave_size,
    size_t* allocated, size_t* saved) {
  *allocated = 0;
  *saved = 0;

  // Iterate by reference: each element is a pair of strings.
  for (const auto& elem : reuse_table) {
    const size_t own_space = space_table.at(elem.first);
    if (elem.first == elem.second) {
      *allocated += own_space;
      VLOG(4) << elem.first << " <-> " << elem.second << " " << own_space
              << " " << space_table.at(elem.second);
    } else {
      *saved += own_space;
      VLOG(4) << "reuse " << elem.first << " -> " << elem.second;
    }
  }
  VLOG(4) << "allocated " << *allocated;
  VLOG(4) << "saved " << *saved;
}
// Return saved ratio.
void
MemoryOptimizePass
::
MakeReusePlan
(
const
std
::
vector
<
std
::
unordered_set
<
std
::
string
>>&
var_clusters
,
const
std
::
unordered_map
<
std
::
string
,
size_t
>&
var_batch_ave_size
,
const
space_table_t
&
space_table
,
std
::
unordered_map
<
std
::
string
,
std
::
string
>*
reuse_table
,
int
sort_kind
,
MemoryAllocation
*
memory_allocation
)
const
{
// Clear the existing plan.
reuse_table
->
clear
();
// The `space_table` stores the real memory size for each tensor.
// The `reused_space_table` stores the maximum memory size required by a
// tensor during the memory reusing, the small tensor might be reused by a
// larger tensor, and the memory size of the small one will grow.
auto
reused_space_table
=
space_table
;
std
::
unordered_map
<
std
::
string
,
lifecycle_t
>
life_cycles
;
std
::
unordered_map
<
std
::
string
,
Node
*>
tensor_nodes
;
// The allocated tensors whose memory can be reused, they will live across the
// program execution.
std
::
unordered_set
<
std
::
string
>
existing_tensors
;
// The existing tensor that has been allocated, and is also free to reuse.
std
::
unordered_set
<
std
::
string
>
free_existing_tensors
;
CollectLifeCycle
(
&
life_cycles
,
sort_kind
);
for
(
int
age
=
0
;
age
<
max_lifecycle_
;
++
age
)
{
std
::
unordered_set
<
std
::
string
>
born_tensors
;
std
::
unordered_set
<
std
::
string
>
dead_tensors
;
// Gather the dead and born tensors.
for
(
auto
elem_it
=
life_cycles
.
begin
();
elem_it
!=
life_cycles
.
end
();
elem_it
++
)
{
if
(
elem_it
->
second
.
first
==
-
1
)
{
continue
;
}
const
auto
&
tensor
=
elem_it
->
first
;
const
auto
&
lifecycle
=
elem_it
->
second
;
VLOG
(
4
)
<<
"process "
<<
tensor
<<
" reuse "
<<
lifecycle
.
first
<<
"->"
<<
lifecycle
.
second
;
// Collect newly born tensors.
if
(
lifecycle
.
first
==
age
)
{
born_tensors
.
insert
(
tensor
);
}
// Collect dead tensors whose memory can be reused.
else
if
(
lifecycle
.
second
<
age
)
{
// NOLINT
dead_tensors
.
insert
(
tensor
);
// remove to avoid duplicate process.
elem_it
->
second
.
first
=
-
1
;
// avoid duplicate search
}
}
// Reuse the dead tensors for born tensors
for
(
const
auto
&
tensor
:
born_tensors
)
{
// Skip the feed and fetch tensor for that they share data with others.
std
::
string
tensor2reuse
;
if
(
!
space_table
.
count
(
tensor
))
continue
;
size_t
space_required
=
space_table
.
at
(
tensor
);
if
(
FindSuitableTensorToReuse
(
tensor
,
space_required
,
tensor_nodes
,
&
free_existing_tensors
,
reused_space_table
,
var_clusters
,
&
tensor2reuse
))
{
if
(
tensor
!=
tensor2reuse
)
{
VLOG
(
4
)
<<
tensor
<<
" -> "
<<
tensor2reuse
;
}
ReuseATensor
(
tensor
,
tensor2reuse
,
space_required
,
&
free_existing_tensors
,
reuse_table
,
&
reused_space_table
);
}
else
{
VLOG
(
4
)
<<
"allocate "
<<
tensor
;
AllocateNewTensor
(
tensor
,
space_required
,
tensor_nodes
,
&
free_existing_tensors
,
&
reused_space_table
,
reuse_table
);
ReuseATensor
(
tensor
,
tensor
,
space_required
,
&
free_existing_tensors
,
reuse_table
,
&
reused_space_table
);
}
}
for
(
const
auto
&
tensor
:
dead_tensors
)
{
// free its memory.
FreeATensor
(
tensor
,
&
free_existing_tensors
,
reuse_table
);
}
}
EvaluateMemoryUsage
(
*
reuse_table
,
reused_space_table
,
var_batch_ave_size
,
&
(
memory_allocation
->
allocated
),
&
(
memory_allocation
->
saved
));
memory_allocation
->
sort_kind
=
sort_kind
;
}
// Index the variable nodes of `graph` by name.
// @graph: the graph to scan.
// @var_node_table: output; variable name -> node. When several variable nodes
//   share a name, the one visited last is kept.
void BuildVarNodeTable(Graph* graph,
                       std::unordered_map<std::string, Node*>* var_node_table) {
  for (auto* node : graph->Nodes()) {
    if (!node->IsVar()) continue;
    (*var_node_table)[node->Name()] = node;
  }
}
// Rewrite the input and output arguments of every operator so that a reused
// tensor is replaced by the tensor whose memory it uses.
// NOTE The optimized opdesc doesn't match ir::Graph.
//
// Args:
//   @graph: the graph whose operator descriptions are updated in place.
//   @reuse_table: a map from a tensor to the tensor whose memory it uses;
//     names without an entry are kept as they are.
//   @sort_kind: cast to framework::ir::SortKind to select the traversal order.
void UpdateOpDescsByReuse(
    Graph* graph,
    const std::unordered_map<std::string, std::string>& reuse_table,
    int sort_kind) {
  using arg_table_t =
      std::unordered_map<std::string, std::vector<std::string>>;

  // Map a variable name to the name of the tensor that really holds it.
  auto resolve = [&reuse_table](const std::string& x) -> const std::string& {
    auto it = reuse_table.find(x);
    return it == reuse_table.end() ? x : it->second;
  };

  // TODO(Superjomn) change here to be compatible with the runtime order.
  for (auto* node : TopologyVarientSort(
           *graph, static_cast<framework::ir::SortKind>(sort_kind))) {
    if (!node->IsOp()) continue;

    // Replace the original inputs/outputs with the reused tensors. The new
    // arguments are collected first and applied afterwards, so the op
    // description is not modified while it is being iterated.
    arg_table_t in_args, out_args;
    for (const auto& argument : node->Op()->Inputs()) {
      for (const auto& x : argument.second) {
        const std::string& name = resolve(x);
        in_args[argument.first].push_back(name);
        VLOG(4) << node->Name() << " input " << x << " -> " << name;
      }
    }
    for (const auto& argument : node->Op()->Outputs()) {
      for (const auto& x : argument.second) {
        const std::string& name = resolve(x);
        out_args[argument.first].push_back(name);
        VLOG(4) << node->Name() << " output " << x << " -> " << name;
      }
    }

    // Update arguments.
    for (auto& arg : in_args) {
      node->Op()->SetInput(arg.first, arg.second);
    }
    for (auto& arg : out_args) {
      node->Op()->SetOutput(arg.first, arg.second);
    }
    node->Op()->Flush();
  }
}
// Apply a reuse plan to the graph.
//
// Args:
//   @reuse_table: a map from a tensor to the tensor whose memory it uses.
//   @sort_kind: the traversal order used when the plan was made.
//   @vars2remove: output; receives every tensor that now lives in the memory
//     of another tensor.
void MemoryOptimizePass::PerformReusePlan(
    const std::unordered_map<std::string, std::string>& reuse_table,
    int sort_kind, std::unordered_set<std::string>* vars2remove) const {
  // Point the operators to the reused tensors. (A name -> node table used to
  // be built here as well but was never read, so the extra pass over the
  // graph has been dropped.)
  UpdateOpDescsByReuse(graph_, reuse_table, sort_kind);

  for (const auto& item : reuse_table) {
    if (item.first != item.second) {
      vars2remove->insert(item.first);
    }
  }
  VLOG(2) << "to remove vars " << vars2remove->size();
}
// Split `line` into the fields separated by `delim`.
// An empty field between two delimiters is kept; a trailing delimiter does
// not produce a trailing empty field, and an empty line yields no field.
std::vector<std::string> split(const std::string& line, char delim) {
  std::stringstream line_stream(line);
  std::vector<std::string> res;
  for (std::string field; std::getline(line_stream, field, delim);) {
    res.push_back(field);
  }
  return res;
}
// Load the shapes of the variables of each batch from the cache file.
//
// Every line of the file describes one batch as `name:d0,d1,...` records
// separated by ';'.
//
// Args:
//   @path: the cache file; opening it is enforced.
//
// Returns:
//   One map per line of the file, from a variable name to its shape.
std::vector<std::map<std::string, std::vector<int>>> DeseralizeBatchVarShapes(
    const std::string& path) {
  std::ifstream file(path);
  PADDLE_ENFORCE(file.is_open(), "failed to open %s to read cache", path);

  std::vector<std::map<std::string, std::vector<int>>> batch_shapes;
  for (std::string line; std::getline(file, line);) {
    std::map<std::string, std::vector<int>> batch;
    const std::vector<std::string> var_infos = split(line, ';');
    for (const auto& var_info : var_infos) {
      // Each record is made of exactly a name and a shape.
      const std::vector<std::string> fields = split(var_info, ':');
      PADDLE_ENFORCE_EQ(fields.size(), 2UL);

      std::vector<int> shape;
      for (const auto& v : split(fields[1], ',')) {
        shape.push_back(std::stoi(v));
      }
      batch[fields.front()] = shape;
    }
    batch_shapes.push_back(batch);
  }
  return batch_shapes;
}
// Calculate the average number of elements of each tensor from the batch
// shape cache.
//
// Args:
//   @batches: for each batch, a map from a variable name to its shape.
//
// Returns:
//   A map from a variable name to the sum of its element counts divided by the
//   total number of batches (integer division). A variable that is missing
//   from some batches is still divided by the total number of batches.
std::unordered_map<std::string, size_t> GetBatchAverageSize(
    const std::vector<std::map<std::string, std::vector<int>>>& batches) {
  std::unordered_map<std::string, size_t> var2size;

  for (const auto& batch : batches) {
    for (const auto& item : batch) {
      // The product is accumulated in size_t: the element count of a large
      // tensor does not fit in an `int`.
      const size_t numel = std::accumulate(
          item.second.begin(), item.second.end(), static_cast<size_t>(1),
          [](size_t acc, int dim) { return acc * static_cast<size_t>(dim); });
      var2size[item.first] += numel;
    }
  }

  // The average size of the batches for each variable. `var2size` is empty
  // when there is no batch, so there is no division by zero.
  const size_t num_batch = batches.size();
  for (auto& item : var2size) {
    item.second /= num_batch;
  }
  return var2size;
}
// Analyze the batch shapes loaded from the cache file.
// The variables are split into clusters by the sequence of their batch sizes
// (the first dimension of the shape in each batch): two variables are in the
// same cluster only if they have the same sequence. This way the changes of
// different LoDTensors can be pre-scheduled when input sequences of different
// lengths are entered.
// This should work fine for the models operating on sentences.
//
// Args:
//   @batches: for each batch, a map from a variable name to its shape. Every
//     shape is assumed to have at least one dimension.
//
// Returns:
//   The clusters of variable names, in unspecified order.
std::vector<std::unordered_set<std::string>> AnalysisBatchShapesByBatchSize(
    const std::vector<std::map<std::string, std::vector<int>>>& batches) {
  // Build for each variable a signature made of its batch sizes. Each size is
  // followed by a separator so that different sequences such as (1, 23) and
  // (12, 3) give different signatures.
  std::unordered_map<std::string, std::string> var_batchsize_signatures;
  for (const auto& batch : batches) {
    for (const auto& ele : batch) {
      const int batch_size = ele.second.front();
      // TODO(Superjomn) might consume large memory here, use combine hash.
      std::string& signature = var_batchsize_signatures[ele.first];
      signature += std::to_string(batch_size);
      signature += ',';
    }
  }

  // Split to sets by batch size sequences. The signature itself is the key, a
  // hash collision can therefore not merge two unrelated clusters.
  std::unordered_map<std::string /*signature*/,
                     std::unordered_set<std::string>>
      shape_sets;
  for (const auto& ele : var_batchsize_signatures) {
    shape_sets[ele.second].insert(ele.first);
  }

  std::vector<std::unordered_set<std::string>> res;
  res.reserve(shape_sets.size());
  for (auto& ele : shape_sets) {
    res.emplace_back(std::move(ele.second));
  }

  VLOG(3) << "Cluster by batch_size and get " << res.size() << " clusters";
  return res;
}
// Split the tensors into clusters by their memory size: the tensors whose
// size falls in the same range [k * interval, (k + 1) * interval) belong to
// the same cluster.
// This should work fine for the overall models.
//
// Args:
//   @space_table: the memory space of tensors.
//   @batches: the batch shapes loaded from the cache file (not used here).
//   @interval: the width of a size range in bytes, must be positive.
//
// Returns:
//   The non-empty clusters, ordered by increasing size range.
std::vector<std::unordered_set<std::string>> AnalysisBatchShapesBySimilarSize(
    const space_table_t& space_table,
    const std::vector<std::map<std::string, std::vector<int>>>& batches,
    int interval = 200000) {
  PADDLE_ENFORCE_GT(interval, 0);
  const size_t step = static_cast<size_t>(interval);

  // Put each tensor directly into the bucket of its size range. The ordered
  // map keeps the buckets sorted by range, and a single pass over the table
  // replaces one scan of the whole table per range.
  size_t max_size = 0;
  std::map<size_t /*range index*/, std::unordered_set<std::string>> buckets;
  for (const auto& item : space_table) {
    max_size = std::max(item.second, max_size);
    buckets[item.second / step].insert(item.first);
  }
  VLOG(4) << "tensor max size " << max_size;

  std::vector<std::unordered_set<std::string>> res;
  res.reserve(buckets.size());
  for (auto& bucket : buckets) {
    res.push_back(std::move(bucket.second));
  }

  VLOG(3) << "Cluster by interval and get " << res.size() << " cluster";
  return res;
}
// The human readable name of this pass.
std::string MemoryOptimizePass::repr() const {
  return std::string("memory optimize pass");
}
// Run the memory optimization on the main graph of `argument`.
//
// The pass does nothing when the memory optimization is disabled, when a
// forced update of the cache is requested, or when the memory cache file does
// not exist. Otherwise it loads the cache, tries several clustering
// strategies with each sort kind, keeps the one that saves the most memory
// and applies its reuse plan to the graph.
void MemoryOptimizePass::RunImpl(Argument* argument) {
  // When force update, should not optimize memory.
  if (!argument->enable_memory_optim()) return;
  if (argument->memory_optim_force_update()) return;

  graph_ = argument->main_graph_ptr();

  auto path = GetMemoryCachePath(
      argument->model_dir_valid() ? argument->model_dir() : "",
      argument->model_program_path_valid() ? argument->model_program_path()
                                           : "");
  VLOG(3) << "Load memory cache from " << path;
  if (!inference::IsFileExists(path)) return;

  VLOG(4) << "Performing memory optimize";
  auto batches = DeseralizeBatchVarShapes(path);
  auto var_batch_ave_size = GetBatchAverageSize(batches);

  std::unordered_map<std::string, Node*> tensor_nodes;
  space_table_t space_table;
  CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table);

  // Every strategy overwrites this table with its own plan.
  std::unordered_map<std::string, std::string> reuse_table;
  double max_saving_ratio = 0.;

  using strategy_t = std::function<MemoryAllocation()>;
  std::vector<strategy_t> strategies;

  // Size intervals of the "similar size" clustering: 1kb, 1MB and no
  // intervals at all.
  const int size_intervals[] = {1024, 1024 * 1024,
                                std::numeric_limits<int>::max()};

  for (int sort_kind = 0; sort_kind < 2; sort_kind++) {
    // Cluster by the batch size sequences.
    strategies.emplace_back([&, sort_kind] {
      auto clustered_vars_by_batch_size =
          AnalysisBatchShapesByBatchSize(batches);
      MemoryAllocation allocation;
      MakeReusePlan(clustered_vars_by_batch_size, var_batch_ave_size,
                    space_table, &reuse_table, sort_kind, &allocation);
      return allocation;
    });

    // Cluster by similar memory size, once per interval.
    for (const int interval : size_intervals) {
      strategies.emplace_back([&, sort_kind, interval] {
        auto clustered_vars_by_ave_size =
            AnalysisBatchShapesBySimilarSize(space_table, batches, interval);
        MemoryAllocation allocation;
        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
                      space_table, &reuse_table, sort_kind, &allocation);
        return allocation;
      });
    }
  }

  // Try all strategies to get the best result.
  strategy_t* best_strategy{nullptr};
  for (auto& strategy : strategies) {
    auto allocation = strategy();
    const float saving_ratio = allocation.GetSavingRatio();
    string::PrettyLogDetail("--- get strategy saving %f memory for workspace",
                            saving_ratio);
    if (saving_ratio > max_saving_ratio) {
      max_saving_ratio = saving_ratio;
      best_strategy = &strategy;
    }
  }
  if (!best_strategy) {
    LOG(ERROR)
        << "This model makes poor memory optimize, skip memory optimize";
    return;
  }

  // Run the best strategy again: `reuse_table` currently holds the plan of
  // the strategy that was tried last.
  auto memory_allocation = (*best_strategy)();

  string::PrettyLogH2(
      "--- Saved %.2f%s memory for workspace(temporary variables)",
      memory_allocation.GetSavingRatio() * 100, "%");
  string::PrettyLogDetail("--- Allocated %d MB",
                          memory_allocation.allocated / 1024. / 1024.);
  string::PrettyLogDetail("--- Saved %d MB",
                          memory_allocation.saved / 1024. / 1024.);

  // Attach to the graph the set of variables that the reuse plan replaces,
  // under the kGraphToProgramVarsToRemove key.
  argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove,
                             new std::unordered_set<std::string>);
  auto& vars2remove =
      argument->main_graph().Get<std::unordered_set<std::string>>(
          framework::ir::kGraphToProgramVarsToRemove);

  PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove);
  argument->SetMemoryOptimSortKind(memory_allocation.sort_kind);
}
float
MemoryOptimizePass
::
MemoryAllocation
::
GetSavingRatio
()
const
{
return
(
saved
/
1024.
)
/
(
allocated
/
1024.
+
saved
/
1024.
);
}
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
0 → 100644
浏览文件 @
885c4e57
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
/*
 * Memory optimization pass for inference, based on a pre-analysis of the
 * memory usage and without GC.
 * Unlike training, the inference memory reuse strategies do not include GC,
 * because its overhead is too high when the batch size equals one.
 *
 * The inference memory reuse tries to determine the tensor reusing strategy
 * in advance, so that there is no runtime overhead.
 *
 * To improve the quality of the strategy, a warm-up run is introduced:
 *   - Before the inference program is officially deployed, it should be
 *     warmed up to generate some runtime cache,
 *   - The inference program is run with several batches of data and persists
 *     to disk some runtime information about the memory of the tensors; this
 *     information is called the memory reusing cache,
 *   - With the memory reusing cache, the inference can be deployed to a
 *     service: before running the model, the inference program loads the
 *     memory cache, analyzes it, generates the best memory reusing strategy
 *     and adjusts the execution of the network.
 *
 * With the warm-up and the memory reusing cache, the memory reusing algorithm
 * can analyze the real memory consumption of the tensors, even with the
 * flexible LoDTensor and special shape changing operators such as
 * sequence-pooling.
 */
class MemoryOptimizePass : public AnalysisPass {
 public:
  // Tensor name -> memory size in byte.
  using space_table_t = std::unordered_map<std::string, size_t>;
  // The (first, last) execution steps at which a tensor is used.
  using lifecycle_t = std::pair<int, int>;

  // The outcome of one memory reuse plan.
  struct MemoryAllocation {
    size_t allocated;  // allocated memory in byte.
    size_t saved;      // saved memory in byte.
    int sort_kind;     // the kind of the corresponding sorting algorithm.

    // Get the memory saving ratio of temporary variables.
    float GetSavingRatio() const;
  };

  virtual ~MemoryOptimizePass() = default;

  std::string repr() const override;

 protected:
  void RunImpl(Argument *argument) override;

 private:
  // Collect the lifecycle of each tensor for the given traversal order.
  void CollectLifeCycle(
      std::unordered_map<std::string, lifecycle_t> *lifecycles,
      int sort_kind) const;

  // Collect the tensor nodes of the graph and their memory size.
  void CollectVarMemorySize(
      const std::unordered_map<std::string, size_t> &batch_var_ave_dim,
      std::unordered_map<std::string, framework::ir::Node *> *tensor_nodes,
      space_table_t *space_table) const;

  // Make a reuse plan; the allocated and saved memory are reported through
  // `memory_allocation`.
  void MakeReusePlan(
      const std::vector<std::unordered_set<std::string>> &var_clusters,
      const std::unordered_map<std::string, size_t> &var_batch_ave_size,
      const space_table_t &space_table,
      std::unordered_map<std::string, std::string> *reuse_table, int sort_kind,
      MemoryAllocation *memory_allocation) const;

  // Apply a reuse plan to the graph and report the variables to remove.
  void PerformReusePlan(
      const std::unordered_map<std::string, std::string> &reuse_table,
      int sort_kind, std::unordered_set<std::string> *vars2remove) const;

  // Both members are `mutable` because the const methods above update them.
  mutable framework::ir::Graph *graph_{nullptr};
  mutable int max_lifecycle_{-1};
};
// Get the path of the memory reusing cache file of a model.
//
// Args:
//   @model_path: the directory of the model, might be empty.
//   @prog_path: the path of the program file, used when `model_path` is empty.
//
// Returns:
//   The chosen path followed by ".memory_cache".
//
// Declared `inline` rather than `static`: the function is defined in a header,
// and `inline` avoids one private copy per translation unit.
inline std::string GetMemoryCachePath(const std::string &model_path,
                                      const std::string &prog_path) {
  const std::string &base = model_path.empty() ? prog_path : model_path;
  return base + ".memory_cache";
}
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/analysis/passes/passes.cc
浏览文件 @
885c4e57
...
...
@@ -13,24 +13,31 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/passes.h"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
PassRegistry
::
PassRegistry
()
{
// Register manually to avoid the trivial `USE_OP` like macro for easier use
// and link.
passes_
.
emplace
(
"ir_analysis_pass"
,
std
::
unique_ptr
<
AnalysisPass
>
(
new
IrAnalysisPass
));
passes_
.
emplace
(
"ir_graph_build_pass"
,
std
::
unique_ptr
<
AnalysisPass
>
(
new
IrGraphBuildPass
));
passes_
.
emplace
(
"
ir_analysis_compos
e_pass"
,
std
::
unique_ptr
<
AnalysisPass
>
(
new
IrAnalysisCompos
ePass
));
passes_
.
emplace
(
"
memory_optimiz
e_pass"
,
std
::
unique_ptr
<
AnalysisPass
>
(
new
MemoryOptimiz
ePass
));
passes_
.
emplace
(
"ir_params_sync_among_devices_pass"
,
std
::
unique_ptr
<
AnalysisPass
>
(
new
IrParamsSyncAmongDevicesPass
));
passes_
.
emplace
(
"ir_graph_to_program_pass"
,
std
::
unique_ptr
<
IrGraphToProgramPass
>
(
new
IrGraphToProgramPass
));
}
}
// namespace analysis
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
885c4e57
...
...
@@ -18,8 +18,10 @@ if(APPLE)
endif
(
APPLE
)
set
(
inference_deps paddle_inference_api paddle_fluid_api analysis pass
ir_pass_manager naive_executor analysis_predictor
${
GLOB_PASS_LIB
}
)
set
(
inference_deps
${
analysis_deps
}
paddle_inference_api paddle_fluid_api
analysis pass naive_executor
${
GLOB_PASS_LIB
}
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
set
(
inference_deps
${
inference_deps
}
tensorrt_engine tensorrt_converter
)
...
...
@@ -29,7 +31,8 @@ add_subdirectory(details)
cc_library
(
analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder
)
cc_library
(
paddle_pass_builder SRCS paddle_pass_builder.cc
)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager
)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor
reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager
${
inference_deps
}
)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
analysis_config paddle_pass_builder zero_copy_tensor
...
...
@@ -44,7 +47,7 @@ if(WITH_TESTING)
ARGS --word2vec_dirname=
${
WORD2VEC_MODEL_DIR
}
--book_dirname=
${
PYTHON_TESTS_DIR
}
/book
)
set_tests_properties
(
test_api_impl PROPERTIES DEPENDS test_image_classification
)
endif
()
cc_test
(
test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor
${
inference_deps
}
cc_test
(
test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor
benchmark
${
inference_deps
}
ARGS --dirname=
${
WORD2VEC_MODEL_DIR
}
)
if
(
WITH_ANAKIN AND WITH_MKL
)
# only needed in CI
...
...
paddle/fluid/inference/api/analysis_config.cc
浏览文件 @
885c4e57
...
...
@@ -44,16 +44,22 @@ PassStrategy *contrib::AnalysisConfig::pass_builder() const {
// Construct a config from a model directory: store it in `model_dir_`, then
// call Update().
contrib
::
AnalysisConfig
::
AnalysisConfig
(
const
std
::
string
&
model_dir
)
{
model_dir_
=
model_dir
;
Update
();
}
// Construct a config from a program file and a parameters file: store them in
// `prog_file_` and `params_file_`, then call Update().
contrib
::
AnalysisConfig
::
AnalysisConfig
(
const
std
::
string
&
prog_file
,
const
std
::
string
&
params_file
)
{
prog_file_
=
prog_file
;
params_file_
=
params_file
;
Update
();
}
// Set the program file and the parameters file of the model: store them in
// `prog_file_` and `params_file_`, then call Update().
void
contrib
::
AnalysisConfig
::
SetModel
(
const
std
::
string
&
prog_file_path
,
const
std
::
string
&
params_file_path
)
{
prog_file_
=
prog_file_path
;
params_file_
=
params_file_path
;
Update
();
}
void
contrib
::
AnalysisConfig
::
EnableUseGpu
(
uint64_t
memory_pool_init_size_mb
,
int
device_id
)
{
...
...
@@ -62,11 +68,17 @@ void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
memory_pool_init_size_mb_
=
memory_pool_init_size_mb
;
device_id_
=
device_id
;
#else
LOG
(
ERROR
)
<<
"Please compile with gpu to EnableGpu"
;
LOG
(
ERROR
)
<<
"Please compile with gpu to EnableGpu
()
"
;
use_gpu_
=
false
;
#endif
Update
();
}
void
contrib
::
AnalysisConfig
::
DisableGpu
()
{
use_gpu_
=
false
;
Update
();
}
void
contrib
::
AnalysisConfig
::
DisableGpu
()
{
use_gpu_
=
false
;
}
contrib
::
AnalysisConfig
::
AnalysisConfig
(
const
contrib
::
AnalysisConfig
&
other
)
{
#define CP_MEMBER(member__) member__ = other.member__;
...
...
@@ -81,6 +93,9 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
CP_MEMBER
(
use_gpu_
);
CP_MEMBER
(
device_id_
);
CP_MEMBER
(
memory_pool_init_size_mb_
);
CP_MEMBER
(
enable_memory_optim_
);
CP_MEMBER
(
memory_optim_force_update_
);
// TensorRT releated.
CP_MEMBER
(
use_tensorrt_
);
CP_MEMBER
(
tensorrt_workspace_size_
);
...
...
@@ -109,6 +124,8 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
}
#undef CP_MEMBER
Update
();
}
void
contrib
::
AnalysisConfig
::
EnableMKLDNN
()
{
...
...
@@ -119,33 +136,64 @@ void contrib::AnalysisConfig::EnableMKLDNN() {
LOG
(
ERROR
)
<<
"Please compile with MKLDNN first to use MKLDNN"
;
use_mkldnn_
=
false
;
#endif
Update
();
}
// Turn on the TensorRT engine.
// When compiled with PADDLE_WITH_CUDA: logs an error and returns without any
// change if use_gpu() is false; otherwise records the three settings in the
// corresponding members, sets `use_tensorrt_` and calls Update().
// When compiled without PADDLE_WITH_CUDA: only logs an error.
void
contrib
::
AnalysisConfig
::
EnableTensorRtEngine
(
int
workspace_size
,
int
max_batch_size
,
int
min_subgraph_size
)
{
#ifdef PADDLE_WITH_CUDA
// The GPU must be enabled before the TensorRT engine can be turned on.
if
(
!
use_gpu
())
{
LOG
(
ERROR
)
<<
"To use TensorRT engine, please call EnableGpu() first"
;
return
;
}
use_tensorrt_
=
true
;
tensorrt_workspace_size_
=
workspace_size
;
tensorrt_max_batchsize_
=
max_batch_size
;
tensorrt_min_subgraph_size_
=
min_subgraph_size
;
Update
();
#else
// Built without CUDA: no member is modified.
LOG
(
ERROR
)
<<
"To use TensorRT engine, please compile inference lib with GPU first."
;
#endif
}
// TODO(Superjomn) refactor this, buggy.
void
contrib
::
AnalysisConfig
::
Update
()
{
auto
info
=
SerializeInfoCache
();
if
(
info
==
serialized_info_cache_
)
return
;
if
(
use_gpu_
)
{
pass_builder_
.
reset
(
new
GpuPassStrategy
);
// Transfer pass_builder and copy the existing compatible passes.
if
(
!
pass_builder_
||
((
use_gpu
()
^
pass_builder_
->
use_gpu
())))
{
if
(
use_gpu
())
{
pass_builder_
.
reset
(
new
GpuPassStrategy
);
if
(
use_tensorrt_
)
{
// Append after the Affine_channel_conv_fuse pass.
pass_builder
()
->
InsertPass
(
3
,
"tensorrt_subgraph_pass"
);
}
}
else
{
pass_builder_
.
reset
(
new
CpuPassStrategy
);
}
}
else
{
pass_builder_
.
reset
(
new
CpuPassStrategy
);
if
(
use_gpu
())
{
pass_builder_
.
reset
(
new
GpuPassStrategy
(
*
static_cast
<
GpuPassStrategy
*>
(
pass_builder_
.
get
())));
}
else
{
pass_builder_
.
reset
(
new
CpuPassStrategy
(
*
static_cast
<
CpuPassStrategy
*>
(
pass_builder_
.
get
())));
}
}
if
(
use_tensorrt_
)
{
if
(
!
use_gpu_
)
{
LOG
(
ERROR
)
<<
"TensorRT engine is not available when EnableGpu() not actived."
;
}
else
{
const
auto
&
passes
=
pass_builder_
->
AllPasses
();
if
(
std
::
find
(
passes
.
begin
(),
passes
.
end
(),
"tensorrt_subgraph_pass"
)
==
std
::
end
(
passes
))
{
// Append after the Affine_channel_conv_fuse pass.
pass_builder
()
->
InsertPass
(
3
,
"tensorrt_subgraph_pass"
);
}
...
...
@@ -165,6 +213,10 @@ void contrib::AnalysisConfig::Update() {
#endif
}
if
(
enable_memory_optim_
)
{
pass_builder
()
->
AppendAnalysisPass
(
"memory_optimize_pass"
);
}
if
(
ir_debug_
)
{
pass_builder
()
->
TurnOnDebug
();
}
...
...
@@ -172,24 +224,43 @@ void contrib::AnalysisConfig::Update() {
// Concatenates the current configuration fields into one string. Update()
// compares the result with serialized_info_cache_ to decide whether anything
// changed. Fields are written back to back in a fixed order; the only
// delimiter is the ";" that follows the MKLDNN op-type list.
std::string contrib::AnalysisConfig::SerializeInfoCache() {
  std::stringstream buffer;

  // Model location.
  buffer << model_dir_ << prog_file_ << params_file_;

  // GPU settings.
  buffer << use_gpu_ << device_id_ << memory_pool_init_size_mb_;

  // TensorRT settings.
  buffer << use_tensorrt_ << tensorrt_workspace_size_
         << tensorrt_max_batchsize_ << tensorrt_min_subgraph_size_;

  // Memory optimization settings.
  buffer << enable_memory_optim_ << memory_optim_force_update_;

  // MKLDNN settings; the op-type list is terminated with ";".
  buffer << use_mkldnn_;
  for (auto &op_type : mkldnn_enabled_op_types_) {
    buffer << op_type;
  }
  buffer << ";";

  // Remaining switches.
  buffer << model_from_memory_ << enable_ir_optim_ << use_feed_fetch_ops_
         << ir_debug_ << specify_input_name_ << cpu_math_library_num_threads_;

  return buffer.str();
}
// Stores the requested thread count in cpu_math_library_num_threads_ and then
// calls Update() so the state derived from the configuration is refreshed.
void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads(
    int cpu_math_library_num_threads) {
  cpu_math_library_num_threads_ = cpu_math_library_num_threads;
  Update();
}
float
contrib
::
AnalysisConfig
::
fraction_of_gpu_memory_for_pool
()
const
{
...
...
@@ -207,6 +278,17 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
#endif
}
// Switches memory optimization on and remembers whether the cache should be
// force-updated (`force_update_cache`), then calls Update().
void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) {
  memory_optim_force_update_ = force_update_cache;
  enable_memory_optim_ = true;
  Update();
}
// Returns true when memory optimization has been switched on.
bool contrib::AnalysisConfig::enable_memory_optim() const {
  return enable_memory_optim_;
}
void
contrib
::
AnalysisConfig
::
SetModelBuffer
(
const
char
*
prog_buffer
,
size_t
prog_buffer_size
,
const
char
*
param_buffer
,
...
...
@@ -214,6 +296,8 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
prog_file_
=
std
::
string
(
prog_buffer
,
prog_buffer
+
prog_buffer_size
);
params_file_
=
std
::
string
(
param_buffer
,
param_buffer
+
param_buffer_size
);
model_from_memory_
=
true
;
Update
();
}
}
// namespace paddle
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
885c4e57
...
...
@@ -24,18 +24,21 @@
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#endif
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
#if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#endif
DECLARE_bool
(
profile
);
namespace
paddle
{
...
...
@@ -189,6 +192,12 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
LOG
(
ERROR
)
<<
"fail to get fetches"
;
return
false
;
}
// Collect variable shapes for memory optimization.
if
(
need_collect_var_shapes_for_memory_optim
())
{
CollectVarShapes
();
}
VLOG
(
3
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
// All the containers in the scope will be hold in inference, but the
...
...
@@ -317,6 +326,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_
.
SetUseGPU
(
config_
.
use_gpu
());
argument_
.
SetGPUDeviceId
(
config_
.
gpu_device_id
());
argument_
.
SetEnableMemoryOptim
(
config_
.
enable_memory_optim
());
argument_
.
SetMemoryOptimForceUpdate
(
config_
.
memory_optim_force_update_
);
argument_
.
SetModelFromMemory
(
config_
.
model_from_memory_
);
// Analyze inference_program
if
(
!
config_
.
model_dir
().
empty
())
{
...
...
@@ -331,6 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}
if
(
config_
.
use_gpu
()
&&
config_
.
tensorrt_engine_enabled
())
{
LOG
(
INFO
)
<<
"TensorRT subgraph engine is enabled"
;
argument_
.
SetUseTensorRT
(
true
);
argument_
.
SetTensorRtWorkspaceSize
(
config_
.
tensorrt_workspace_size_
);
argument_
.
SetTensorRtMaxBatchSize
(
config_
.
tensorrt_max_batchsize_
);
...
...
@@ -338,12 +350,17 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}
if
(
config_
.
use_mkldnn_
)
{
LOG
(
INFO
)
<<
"MKLDNN is enabled"
;
argument_
.
SetMKLDNNEnabledOpTypes
(
config_
.
mkldnn_enabled_op_types_
);
}
auto
passes
=
config_
.
pass_builder
()
->
AllPasses
();
if
(
!
config_
.
ir_optim
())
passes
.
clear
();
if
(
!
config_
.
ir_optim
())
{
passes
.
clear
();
LOG
(
INFO
)
<<
"ir_optim is turned off, no IR pass will be executed"
;
}
argument_
.
SetIrAnalysisPasses
(
passes
);
argument_
.
SetAnalysisPasses
(
config_
.
pass_builder
()
->
AnalysisPasses
());
argument_
.
SetScopeNotOwned
(
const_cast
<
framework
::
Scope
*>
(
scope_
.
get
()));
Analyzer
().
Run
(
&
argument_
);
...
...
@@ -558,6 +575,13 @@ AnalysisPredictor::~AnalysisPredictor() {
if
(
sub_scope_
)
{
scope_
->
DeleteScope
(
sub_scope_
);
}
// TODO(Superjomn) deduce the directory path.
std
::
string
out_path
=
inference
::
analysis
::
GetMemoryCachePath
(
config_
.
model_dir
(),
config_
.
prog_file
());
if
(
need_collect_var_shapes_for_memory_optim
())
{
SerializeBatchVarShapes
(
out_path
);
}
}
std
::
unique_ptr
<
PaddlePredictor
>
AnalysisPredictor
::
Clone
()
{
...
...
@@ -567,6 +591,66 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
return
std
::
unique_ptr
<
PaddlePredictor
>
(
x
);
}
// Records the shapes of the tensor variables of block 0 after a run and
// appends them, as one name -> shape map, to batch_var_shapes_. Collection
// stops once max_shape_collect_count_ batches have been recorded.
void AnalysisPredictor::CollectVarShapes() {
  VLOG(4) << "Collecting var shapes";
  if (batch_var_shapes_.size() >= max_shape_collect_count_) return;

  std::map<std::string, std::vector<int>> var_shapes;
  // Iterate by const reference so each variable name is not copied.
  for (const auto &var_name : inference_program_->Block(0).LocalVarNames()) {
    auto *var = sub_scope_->FindVar(var_name);
    PADDLE_ENFORCE_NOT_NULL(var);

    // NOTE(review): a variable whose type id is Tensor is also read through
    // Get<framework::LoDTensor>() below -- confirm that this is valid.
    if (var->Type() == framework::VarTypeTrait<framework::LoDTensor>::kId ||
        var->Type() == framework::VarTypeTrait<framework::Tensor>::kId) {
      auto &tensor = var->Get<framework::LoDTensor>();
      auto shape = framework::vectorize(tensor.dims());
      // Dimensions are stored as int in the shape map.
      var_shapes[var_name].assign(shape.begin(), shape.end());
    }
  }

  batch_var_shapes_.push_back(var_shapes);
  LOG_FIRST_N(INFO, 1) << "Collected " << batch_var_shapes_.size()
                       << " batch of var shapes for analysis";
}
void
AnalysisPredictor
::
SerializeBatchVarShapes
(
const
std
::
string
&
path
)
{
LOG
(
INFO
)
<<
"serialize batch var shapes to "
<<
path
;
std
::
ofstream
file
(
path
);
if
(
!
file
.
is_open
())
{
LOG
(
ERROR
)
<<
"failed to serialize the var shapes to "
<<
path
;
return
;
}
// The sirialized data format:
// <tensor_name>:dim0,dim1,dim2,;
for
(
auto
&
batch
:
batch_var_shapes_
)
{
for
(
auto
&
ele
:
batch
)
{
file
<<
ele
.
first
<<
":"
;
for
(
size_t
i
=
0
;
i
<
ele
.
second
.
size
()
-
1
;
i
++
)
{
file
<<
ele
.
second
[
i
]
<<
","
;
}
file
<<
ele
.
second
.
back
()
<<
";"
;
}
file
<<
"
\n
"
;
}
}
// Tells whether variable shapes have to be collected for memory optimization.
// The decision is computed once and cached in need_collect_var_shapes_
// (negative means "not decided yet"). Shapes are needed when memory
// optimization is enabled and either the cache file does not exist or a
// forced cache update was requested.
bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
  if (need_collect_var_shapes_ >= 0) return need_collect_var_shapes_;

  bool need = false;
  if (config_.enable_memory_optim()) {
    // Check if the cache exists.
    const bool cache_missing =
        !inference::IsFileExists(inference::analysis::GetMemoryCachePath(
            config_.model_dir(), config_.prog_file()));
    need = cache_missing || config_.memory_optim_force_update_;
  }

  need_collect_var_shapes_ = need ? 1 : 0;
  return need;
}
template
<
>
std
::
unique_ptr
<
PaddlePredictor
>
CreatePaddlePredictor
<
contrib
::
AnalysisConfig
>
(
const
contrib
::
AnalysisConfig
&
config
)
{
...
...
paddle/fluid/inference/api/analysis_predictor.h
浏览文件 @
885c4e57
...
...
@@ -75,6 +75,11 @@ class AnalysisPredictor : public PaddlePredictor {
void
SetMkldnnThreadID
(
int
tid
);
protected:
// For memory optimization.
bool
need_collect_var_shapes_for_memory_optim
();
void
CollectVarShapes
();
void
SerializeBatchVarShapes
(
const
std
::
string
&
path
);
bool
PrepareProgram
(
const
std
::
shared_ptr
<
framework
::
ProgramDesc
>
&
program
);
bool
PrepareScope
(
const
std
::
shared_ptr
<
framework
::
Scope
>
&
parent_scope
);
bool
CreateExecutor
();
...
...
@@ -118,6 +123,11 @@ class AnalysisPredictor : public PaddlePredictor {
// A mutex that helps make Clone thread-safe.
std
::
mutex
clone_mutex_
;
// For memory optimization.
const
size_t
max_shape_collect_count_
{
1000
};
int
need_collect_var_shapes_
{
-
1
};
// -1 for default, 0 for false, 1 for true.
std
::
vector
<
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>>
batch_var_shapes_
;
private:
// Status flags that help determine the predictor's internal state.
bool
status_program_optimized_
{
false
};
...
...
paddle/fluid/inference/api/analysis_predictor_tester.cc
浏览文件 @
885c4e57
...
...
@@ -16,8 +16,10 @@
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_string
(
dirname
,
""
,
"dirname to tests."
);
...
...
@@ -191,4 +193,53 @@ TEST(AnalysisPredictor, Clone) {
}
}
// Runs the same dummy input through analysis predictors configured with
// memory optimization and through a native predictor, then compares the
// analysis output with the native output.
TEST(AnalysisPredictor, memory_optim) {
  AnalysisConfig config(FLAGS_dirname);
  config.DisableGpu();
  config.EnableMemoryOptim(true);
  config.pass_builder()->TurnOnDebug();

  auto native_predictor =
      CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());

  // Dummy input data: four copies of one INT64 tensor of shape {4, 1}.
  int64_t raw_values[4] = {1, 2, 3, 4};
  PaddleTensor sample;
  sample.shape = std::vector<int>({4, 1});
  sample.data.Reset(raw_values, sizeof(raw_values));
  sample.dtype = PaddleDType::INT64;

  std::vector<PaddleTensor> inputs(4, sample);
  std::vector<PaddleTensor> output;
  std::vector<PaddleTensor> output1;

  {
    // The first predictor helps to cache the memory optimization strategy.
    auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
    // Run several times to check the parameters are not reused by mistake.
    for (int round = 0; round < 5; ++round) {
      ASSERT_TRUE(predictor->Run(inputs, &output));
    }
  }

  {
    output.clear();
    // The second predictor performs the memory optimization.
    config.EnableMemoryOptim(false);
    auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
    // Run with memory optimization.
    ASSERT_TRUE(predictor->Run(inputs, &output));
  }

  // Run native.
  ASSERT_TRUE(native_predictor->Run(inputs, &output1));

  LOG(INFO) << "the output " << inference::DescribeTensor(output.front());
  LOG(INFO) << "the native output "
            << inference::DescribeTensor(output1.front());

  inference::CompareResult(output, output1);
}
}
// namespace paddle
paddle/fluid/inference/api/demo_ci/run.sh
浏览文件 @
885c4e57
#!/bin/bash
set
-x
PADDLE_ROOT
=
$1
TURN_ON_MKL
=
$2
# use MKL or Openblas
...
...
paddle/fluid/inference/api/helper.h
浏览文件 @
885c4e57
...
...
@@ -15,7 +15,10 @@
#pragma once
#include <glog/logging.h>
#include <fstream>
#if !defined(_WIN32)
#include <sys/time.h>
#endif
#include <algorithm>
#include <chrono> // NOLINT
#include <iterator>
...
...
@@ -182,7 +185,8 @@ static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) {
return
true
;
}
static
std
::
string
DescribeTensor
(
const
PaddleTensor
&
tensor
)
{
static
std
::
string
DescribeTensor
(
const
PaddleTensor
&
tensor
,
int
max_num_of_data
=
15
)
{
std
::
stringstream
os
;
os
<<
"Tensor ["
<<
tensor
.
name
<<
"]
\n
"
;
os
<<
" - type: "
;
...
...
@@ -253,5 +257,12 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
}
}
// Reports whether the file at `path` can be opened for reading.
static bool IsFileExists(const std::string &path) {
  // The temporary stream is closed automatically when it is destroyed.
  return std::ifstream(path).is_open();
}
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/api/paddle_analysis_config.h
浏览文件 @
885c4e57
...
...
@@ -192,6 +192,13 @@ struct AnalysisConfig {
*/
bool
model_from_memory
()
const
{
return
model_from_memory_
;
}
/** Turn on memory optimization.
* NOTE: still in development; will be released later.
*/
void
EnableMemoryOptim
(
bool
force_update_cache
=
false
);
/** Tell whether the memory optimization is activated. */
bool
enable_memory_optim
()
const
;
friend
class
::
paddle
::
AnalysisPredictor
;
/** NOTE just for developer, not an official API, easily to be broken.
...
...
@@ -232,6 +239,10 @@ struct AnalysisConfig {
// subgraph, 3 as default value.
int
tensorrt_min_subgraph_size_
{
3
};
// memory reuse related.
bool
enable_memory_optim_
{
false
};
bool
memory_optim_force_update_
{
false
};
bool
use_mkldnn_
{
false
};
std
::
unordered_set
<
std
::
string
>
mkldnn_enabled_op_types_
;
...
...
paddle/fluid/inference/api/paddle_pass_builder.cc
浏览文件 @
885c4e57
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
#include <glog/logging.h>
namespace
paddle
{
...
...
@@ -65,4 +66,8 @@ void GpuPassStrategy::EnableMKLDNN() {
LOG
(
ERROR
)
<<
"GPU not support MKLDNN yet"
;
}
// Adds `pass` to the end of the list of analysis passes.
void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
  analysis_passes_.emplace_back(pass);
}
}
// namespace paddle
paddle/fluid/inference/api/paddle_pass_builder.h
浏览文件 @
885c4e57
...
...
@@ -45,6 +45,9 @@ class PaddlePassBuilder {
/** Delete all the passes that have type `pass_type`. */
void
DeletePass
(
const
std
::
string
&
pass_type
);
/** Append an analysis pass. */
void
AppendAnalysisPass
(
const
std
::
string
&
pass
);
/** Visualize the computation graph after each pass by generating a DOT
* language file, one can draw them with the Graphviz toolkit.
*/
...
...
@@ -54,8 +57,18 @@ class PaddlePassBuilder {
std
::
string
DebugString
();
const
std
::
vector
<
std
::
string
>
&
AllPasses
()
const
{
return
passes_
;
}
// Returns a copy of the analysis pass list with "ir_graph_to_program_pass"
// appended, so that it is always the last pass and any modification of the
// IR persists to the program.
std::vector<std::string> AnalysisPasses() const {
  std::vector<std::string> result(analysis_passes_);
  result.emplace_back("ir_graph_to_program_pass");
  return result;
}
protected:
std
::
vector
<
std
::
string
>
analysis_passes_
{
{
"ir_graph_build_pass"
,
"ir_analysis_pass"
,
"ir_params_sync_among_devices_pass"
}};
std
::
vector
<
std
::
string
>
passes_
;
};
...
...
@@ -69,7 +82,7 @@ class PassStrategy : public PaddlePassBuilder {
/** The MKLDNN control exists in both CPU and GPU mode, because some CPU
* kernels may still run in GPU mode.
*/
virtual
void
EnableMKLDNN
()
=
0
;
virtual
void
EnableMKLDNN
()
{}
bool
use_gpu
()
const
{
return
use_gpu_
;
}
...
...
@@ -77,6 +90,7 @@ class PassStrategy : public PaddlePassBuilder {
protected:
bool
use_gpu_
{
false
};
bool
use_mkldnn_
{
false
};
};
/** The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
...
...
@@ -107,25 +121,31 @@ class CpuPassStrategy : public PassStrategy {
use_gpu_
=
false
;
}
explicit
CpuPassStrategy
(
const
CpuPassStrategy
&
other
)
:
PassStrategy
(
other
.
AllPasses
())
{}
virtual
~
CpuPassStrategy
()
=
default
;
void
EnableMKLDNN
()
override
{
// TODO(Superjomn) Consider the way to mix CPU with GPU.
#ifdef PADDLE_WITH_MKLDNN
passes_
.
insert
(
passes_
.
begin
(),
"mkldnn_placement_pass"
);
for
(
auto
&
pass
:
std
::
vector
<
std
::
string
>
({
"depthwise_conv_mkldnn_pass"
,
//
"conv_bias_mkldnn_fuse_pass"
,
//
"conv3d_bias_mkldnn_fuse_pass"
,
//
"conv_relu_mkldnn_fuse_pass"
,
//
"conv_elementwise_add_mkldnn_fuse_pass"
}))
{
passes_
.
push_back
(
pass
);
if
(
!
use_mkldnn_
)
{
passes_
.
insert
(
passes_
.
begin
(),
"mkldnn_placement_pass"
);
for
(
auto
&
pass
:
std
::
vector
<
std
::
string
>
(
{
"depthwise_conv_mkldnn_pass"
,
//
"conv_bias_mkldnn_fuse_pass"
,
//
"conv3d_bias_mkldnn_fuse_pass"
,
//
"conv_relu_mkldnn_fuse_pass"
,
//
"conv_elementwise_add_mkldnn_fuse_pass"
}))
{
passes_
.
push_back
(
pass
);
}
}
use_mkldnn_
=
true
;
#else
use_mkldnn_
=
false
;
#endif
}
CpuPassStrategy
(
const
CpuPassStrategy
&
other
)
:
PassStrategy
(
other
.
passes_
)
{}
};
/** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
...
...
@@ -150,7 +170,7 @@ class GpuPassStrategy : public PassStrategy {
use_gpu_
=
true
;
}
GpuPassStrategy
(
const
GpuPassStrategy
&
other
)
explicit
GpuPassStrategy
(
const
GpuPassStrategy
&
other
)
:
PassStrategy
(
other
.
AllPasses
())
{
use_gpu_
=
true
;
}
...
...
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
885c4e57
...
...
@@ -19,7 +19,7 @@ endfunction()
function
(
inference_analysis_api_test target install_dir filename
)
inference_analysis_test
(
${
target
}
SRCS
${
filename
}
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
benchmark
ARGS --infer_model=
${
install_dir
}
/model --infer_data=
${
install_dir
}
/data.txt
)
endfunction
()
...
...
@@ -62,7 +62,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
# normal DAM
set
(
DAM_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/dam"
)
download_model_and_data
(
${
DAM_INSTALL_DIR
}
"DAM_model.tar.gz"
"DAM_data.txt.tar.gz"
)
inference_analysis_api_test
(
test_analyzer_dam
${
DAM_INSTALL_DIR
}
analyzer_dam_tester.cc SERIAL
)
inference_analysis_api_test
(
test_analyzer_dam
${
DAM_INSTALL_DIR
}
analyzer_dam_tester.cc
EXTRA_DEPS legacy_allocator
SERIAL
)
# small DAM
set
(
DAM_SMALL_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/small_dam"
)
...
...
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
浏览文件 @
885c4e57
...
...
@@ -126,6 +126,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
std
::
string
turn_mask_pre
=
"turn_mask_"
;
auto
one_batch
=
data
->
NextBatch
();
PADDLE_ENFORCE
(
!
one_batch
.
response
.
empty
());
int
size
=
one_batch
.
response
[
0
].
size
();
CHECK_EQ
(
size
,
kMaxTurnLen
);
// turn tensor assignment
...
...
@@ -200,6 +201,7 @@ void profile(bool use_mkldnn = false) {
std
::
vector
<
PaddleTensor
>
outputs
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
...
...
@@ -250,7 +252,35 @@ void compare(bool use_mkldnn = false) {
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
// Compares the results of NativeConfig and AnalysisConfig with memory
// optimization enabled. The comparison runs twice: first with a forced cache
// update, then with a fresh config that uses the cache.
TEST(Analyzer_dam, compare_with_memory_optim) {
  // The small dam will core in CI, but works in local.
  if (FLAGS_max_turn_num != 9) return;

  contrib::AnalysisConfig cfg, cfg1;
  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);

  // Run the first time to force an update of the memory cache.
  SetConfig(&cfg);
  cfg.EnableMemoryOptim(true);
  const auto *forced_cfg =
      reinterpret_cast<const PaddlePredictor::Config *>(&cfg);
  CompareNativeAndAnalysis(forced_cfg, input_slots_all);

  // Run a second time to use the memory cache and perform memory
  // optimization.
  SetConfig(&cfg1);
  cfg1.EnableMemoryOptim();
  const auto *cached_cfg =
      reinterpret_cast<const PaddlePredictor::Config *>(&cfg1);
  CompareNativeAndAnalysis(cached_cfg, input_slots_all);
}
TEST
(
Analyzer_dam
,
compare
)
{
compare
();
}
#ifdef PADDLE_WITH_MKLDNN
TEST
(
Analyzer_dam
,
compare_mkldnn
)
{
compare
(
true
/* use_mkldnn */
);
}
#endif
...
...
paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
浏览文件 @
885c4e57
...
...
@@ -69,6 +69,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
TEST
(
Analyzer_Text_Classification
,
profile
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
cfg
.
pass_builder
()
->
TurnOnDebug
();
std
::
vector
<
PaddleTensor
>
outputs
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
...
...
@@ -98,6 +99,7 @@ TEST(Analyzer_Text_Classification, profile) {
TEST
(
Analyzer_Text_Classification
,
compare
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
cfg
.
EnableMemoryOptim
();
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
...
...
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
浏览文件 @
885c4e57
...
...
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
...
...
@@ -55,7 +56,7 @@ void SetConfig(AnalysisConfig *cfg) {
FLAGS_infer_model
+
"/__params__"
);
cfg
->
DisableGpu
();
cfg
->
SwitchIrDebug
();
cfg
->
SwitchSpecifyInputNames
();
cfg
->
SwitchSpecifyInputNames
(
false
);
// TODO(TJ): fix fusion gru
cfg
->
pass_builder
()
->
DeletePass
(
"fc_gru_fuse_pass"
);
}
...
...
@@ -86,6 +87,7 @@ void profile(bool use_mkldnn = false) {
if
(
use_mkldnn
)
{
cfg
.
EnableMKLDNN
();
}
// cfg.pass_builder()->TurnOnDebug();
std
::
vector
<
PaddleTensor
>
outputs
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
...
...
@@ -103,9 +105,8 @@ void profile(bool use_mkldnn = false) {
size_t
numel
=
output
.
data
.
length
()
/
PaddleDtypeSize
(
output
.
dtype
);
CHECK_EQ
(
numel
,
refer
.
data
.
size
());
for
(
size_t
i
=
0
;
i
<
numel
;
++
i
)
{
CHECK_LT
(
fabs
(
static_cast
<
float
*>
(
output
.
data
.
data
())[
i
]
-
refer
.
data
[
i
]),
1e-5
);
EXPECT_NEAR
(
static_cast
<
float
*>
(
output
.
data
.
data
())[
i
],
refer
.
data
[
i
],
1e-5
);
}
}
}
...
...
paddle/fluid/inference/tests/api/tester_helper.h
浏览文件 @
885c4e57
...
...
@@ -15,6 +15,7 @@
#pragma once
#include <gtest/gtest.h>
#include <algorithm>
#include <string>
#include <thread> // NOLINT
...
...
@@ -28,9 +29,8 @@
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/tests/api/config_printer.h"
#include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/inference/utils/benchmark.h"
...
...
@@ -91,7 +91,7 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
float
*
pdata
=
static_cast
<
float
*>
(
out
.
data
.
data
());
float
*
pdata_ref
=
static_cast
<
float
*>
(
ref_out
.
data
.
data
());
for
(
size_t
j
=
0
;
j
<
size
;
++
j
)
{
EXPECT_NEAR
(
pdata_ref
[
j
],
pdata
[
j
]
,
FLAGS_accuracy
);
CHECK_LE
(
std
::
abs
(
pdata_ref
[
j
]
-
pdata
[
j
])
,
FLAGS_accuracy
);
}
break
;
}
...
...
paddle/fluid/inference/tests/api/trt_models_tester.cc
浏览文件 @
885c4e57
...
...
@@ -157,5 +157,10 @@ TEST(AnalysisPredictor, use_gpu) {
}
}
// Profiles the "mobilenet" model located under FLAGS_infer_model.
TEST(TensorRT_mobilenet, profile) {
  std::string model_dir = FLAGS_infer_model;
  model_dir += "/";
  model_dir += "mobilenet";
  profile(model_dir, true, false);
}
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/utils/benchmark.h
浏览文件 @
885c4e57
...
...
@@ -11,8 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#pragma once
#include <fstream>
#include <iostream>
#include <string>
...
...
paddle/fluid/inference/utils/benchmark_tester.cc
浏览文件 @
885c4e57
...
...
@@ -16,7 +16,7 @@
#include <glog/logging.h>
#include <gtest/gtest.h>
using
namespace
paddle
::
inference
;
using
namespace
paddle
::
inference
;
// NOLINT
TEST
(
Benchmark
,
basic
)
{
Benchmark
benchmark
;
benchmark
.
SetName
(
"key0"
);
...
...
@@ -36,4 +36,4 @@ TEST(Benchmark, PersistToFile) {
benchmark
.
PersistToFile
(
"1.log"
);
benchmark
.
PersistToFile
(
"1.log"
);
benchmark
.
PersistToFile
(
"1.log"
);
}
\ No newline at end of file
}
paddle/fluid/operators/controlflow/feed_op.cc
浏览文件 @
885c4e57
...
...
@@ -50,6 +50,7 @@ class FeedOp : public framework::OperatorBase {
<<
out_name
;
auto
&
feed_list
=
feed_var
->
Get
<
framework
::
FeedFetchList
>
();
PADDLE_ENFORCE_LT
(
static_cast
<
size_t
>
(
col
),
feed_list
.
size
());
auto
&
feed_item
=
feed_list
.
at
(
static_cast
<
size_t
>
(
col
));
auto
*
out_item
=
out_var
->
GetMutable
<
framework
::
FeedFetchType
>
();
...
...
paddle/fluid/string/pretty_log.h
浏览文件 @
885c4e57
...
...
@@ -66,5 +66,22 @@ static void PrettyLog(const std::string &style, const char *fmt,
std
::
cerr
<<
style
<<
Sprintf
(
fmt
,
args
...)
<<
reset
();
}
// Formats `fmt` with `args` and logs the result through PrettyLogEndl using
// the Style::info() style.
template <typename... Args>
static void PrettyLogInfo(const char *fmt, const Args &... args) {
  PrettyLogEndl(Style::info(), fmt, args...);
}
// Formats `fmt` with `args` and logs the result through PrettyLogEndl using
// the Style::detail() style.
template <typename... Args>
static void PrettyLogDetail(const char *fmt, const Args &... args) {
  PrettyLogEndl(Style::detail(), fmt, args...);
}
// Formats `fmt` with `args` and logs the result through PrettyLogEndl using
// the Style::H1() style.
template <typename... Args>
static void PrettyLogH1(const char *fmt, const Args &... args) {
  PrettyLogEndl(Style::H1(), fmt, args...);
}
// Formats `fmt` with `args` and logs the result through PrettyLogEndl using
// the Style::H2() style.
template <typename... Args>
static void PrettyLogH2(const char *fmt, const Args &... args) {
  PrettyLogEndl(Style::H2(), fmt, args...);
}
}
// namespace string
}
// namespace paddle
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录