Commit a66115be (Crayon鑫 / Paddle, forked from PaddlePaddle / Paddle)
Authored on Jan 26, 2019 by Qiao Longfei

    Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle
    into add-async-ssa-graph-executor

    test=develop

Parents: fab8457e, d303270a

Showing 16 changed files with 255 additions and 168 deletions
paddle/fluid/framework/CMakeLists.txt                              +0    -4
paddle/fluid/framework/details/execution_strategy.h                +1    -1
paddle/fluid/inference/analysis/argument.h                         +3    -1
paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc     +117  -71
paddle/fluid/inference/analysis/passes/memory_optimize_pass.h      +1    -1
paddle/fluid/inference/api/analysis_config.cc                      +20   -4
paddle/fluid/inference/api/analysis_predictor.cc                   +13   -11
paddle/fluid/inference/api/analysis_predictor.h                    +1    -1
paddle/fluid/inference/api/paddle_analysis_config.h                +5    -13
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc            +21   -3
paddle/fluid/operators/ngraph/CMakeLists.txt                       +1    -0
paddle/fluid/operators/ngraph/ngraph_bridge.cc                     +18   -18
paddle/fluid/operators/ngraph/ngraph_bridge.h                      +6    -6
paddle/fluid/operators/ngraph/ngraph_engine.cc                     +6    -7
python/paddle/fluid/contrib/int8_inference/utility.py              +29   -5
python/paddle/fluid/contrib/tests/test_calibration.py              +13   -22
paddle/fluid/framework/CMakeLists.txt

@@ -129,10 +129,6 @@ cc_test(version_test SRCS version_test.cc DEPS version)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
-if(WITH_NGRAPH)
-  cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
-endif(WITH_NGRAPH)
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
paddle/fluid/framework/details/execution_strategy.h

@@ -28,7 +28,7 @@ struct ExecutionStrategy {
   // If we set this to 1, we will delete all variables when finish a batch. and
   // this will loss 15%+ performance.
   // Please be aware about this parameters.
-  size_t num_iteration_per_drop_scope_{100};
+  size_t num_iteration_per_drop_scope_{1};
   ExecutorType type_{kDefault};
   bool dry_run_{false};
   size_t num_iteration_per_run_{1};  // only use with async_ssa_graph_executor
paddle/fluid/inference/analysis/argument.h

@@ -133,7 +133,9 @@ struct Argument {
   // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
-  DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool);
+  DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool);
+  DECL_ARGUMENT_FIELD(static_memory_optim_force_update,
+                      StaticMemoryOptimForceUpdate, bool);
   // Indicate which kind of sort algorithm is used for operators, the memory
   // optimization relays on the sort algorithm.
   DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int);
paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc

@@ -444,6 +444,26 @@ std::vector<std::map<std::string, std::vector<int>>> DeseralizeBatchVarShapes(
   return batch_shapes;
 }
 
+// Replace the -1 in shape to a real number to fake the shape.
+std::vector<std::map<std::string, std::vector<int>>> FakeBatchVarShapes(
+    const framework::ProgramDesc& program) {
+  std::vector<std::map<std::string, std::vector<int>>> res;
+  res.emplace_back();
+  auto& record = res.front();
+  const int fake_batch_size = 3;
+  for (auto* var : program.Block(0).AllVars()) {
+    if (var->GetType() ==
+        framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) {
+      auto shape = var->GetShape();
+      for (auto& v : shape) {
+        if (v < 0) v = fake_batch_size;
+      }
+      record[var->Name()].assign(shape.begin(), shape.end());
+    }
+  }
+  return res;
+}
+
 // Calculate the average dim of each tensor from the batch shape cache.
 std::unordered_map<std::string, size_t> GetBatchAverageSize(
     const std::vector<std::map<std::string, std::vector<int>>>& batches) {

@@ -478,6 +498,7 @@ std::vector<std::unordered_set<std::string>> AnalysisBatchShapesByBatchSize(
   std::unordered_map<std::string, std::stringstream> var_batchsize_hashes;
   for (auto& batch : batches) {
     for (auto& ele : batch) {
+      PADDLE_ENFORCE(!ele.second.empty());
       int batch_size = ele.second.front();
       // TODO(Superjomn) might consume large memory here, use combine hash.
       var_batchsize_hashes[ele.first] << batch_size;

@@ -538,9 +559,21 @@ std::vector<std::unordered_set<std::string>> AnalysisBatchShapesBySimilarSize(
 std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; }
 
+std::pair<size_t, size_t> GetRange(
+    const std::unordered_map<std::string, size_t>& ave_size) {
+  auto res = std::make_pair(std::numeric_limits<size_t>::max(),
+                            std::numeric_limits<size_t>::min());
+  for (auto& item : ave_size) {
+    res.first = std::min(item.second, res.first);
+    res.second = std::max(item.second, res.second);
+  }
+  return res;
+}
+
 void MemoryOptimizePass::RunImpl(Argument* argument) {
   // When force update, should not optimize memory.
   if (!argument->enable_memory_optim() ||
-      argument->memory_optim_force_update())
+      argument->static_memory_optim_force_update())
     return;
   graph_ = argument->main_graph_ptr();

@@ -549,21 +582,38 @@ void MemoryOptimizePass::RunImpl(Argument* argument) {
       argument->model_program_path_valid() ? argument->model_program_path()
                                            : "");
   VLOG(3) << "Load memory cache from " << path;
-  if (inference::IsFileExists(path)) {
-    VLOG(4) << "Performing memory optimize";
-    auto batches = DeseralizeBatchVarShapes(path);
-    auto var_batch_ave_size = GetBatchAverageSize(batches);
+  std::vector<std::map<std::string, std::vector<int>>> batches;
+
+  if (argument->static_memory_optim() && inference::IsFileExists(path)) {
+    string::PrettyLogInfo("--- Performing static memory optimize");
+    batches = DeseralizeBatchVarShapes(path);
+  } else {
+    string::PrettyLogInfo("--- Performing dynamic memory optimize");
+    batches = FakeBatchVarShapes(argument->main_program());
+  }
+  auto var_batch_ave_size = GetBatchAverageSize(batches);
+
+  // Get min and max memory size.
+  const auto range = GetRange(var_batch_ave_size);
+  const int cluster_size = std::max(
+      static_cast<int>((range.second - range.first) / 100 /*cluster num*/),
+      1024);
+  const int cluster_size1 = std::max(
+      static_cast<int>((range.second - range.first) / 1000 /*cluster num*/),
+      1024);
 
-    std::unordered_map<std::string, Node*> tensor_nodes;
-    space_table_t space_table;
-    CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table);
+  std::unordered_map<std::string, Node*> tensor_nodes;
+  space_table_t space_table;
+  CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table);
 
-    std::unordered_map<std::string, std::string> reuse_table;
-    double max_saving_ratio = 0.;
+  std::unordered_map<std::string, std::string> reuse_table;
+  double max_saving_ratio = 0.;
 
-    std::vector<std::function<MemoryAllocation()>> strategies;
+  std::vector<std::function<MemoryAllocation()>> strategies;
 
-    for (int sort_kind = 0; sort_kind < 2; sort_kind++) {
+  for (int sort_kind = 0; sort_kind < 2; sort_kind++) {
+    if (argument->static_memory_optim()) {
+      // This strategy only make scene in static memory optimize.
       strategies.emplace_back([&, sort_kind] {
         auto clustered_vars_by_batch_size =
             AnalysisBatchShapesByBatchSize(batches);

@@ -572,71 +622,67 @@ void MemoryOptimizePass::RunImpl(Argument* argument) {
                       space_table, &reuse_table, sort_kind, &allocation);
         return allocation;
       });
+    }
 
-      strategies.emplace_back([&, sort_kind] {
-        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
-            space_table, batches, 1024);  // interval 1kb
-        MemoryAllocation allocation;
-        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
-                      space_table, &reuse_table, sort_kind, &allocation);
-        return allocation;
-      });
+    strategies.emplace_back([&, sort_kind] {
+      auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
+          space_table, batches, cluster_size);
+      MemoryAllocation allocation;
+      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
+                    space_table, &reuse_table, sort_kind, &allocation);
+      return allocation;
+    });
 
-      strategies.emplace_back([&, sort_kind] {
-        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
-            space_table, batches, 1024 * 1024);  // interval 1MB
-        MemoryAllocation allocation;
-        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
-                      space_table, &reuse_table, sort_kind, &allocation);
-        return allocation;
-      });
+    strategies.emplace_back([&, sort_kind] {
+      auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
+          space_table, batches, cluster_size1);
+      MemoryAllocation allocation;
+      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
+                    space_table, &reuse_table, sort_kind, &allocation);
+      return allocation;
+    });
 
-      strategies.emplace_back([&, sort_kind] {
-        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
-            space_table, batches,
-            std::numeric_limits<int>::max());  // no intervals
-        MemoryAllocation allocation;
-        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
-                      space_table, &reuse_table, sort_kind, &allocation);
-        return allocation;
-      });
-    }
+    strategies.emplace_back([&, sort_kind] {
+      auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
+          space_table, batches,
+          std::numeric_limits<int>::max());  // no intervals
+      MemoryAllocation allocation;
+      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
+                    space_table, &reuse_table, sort_kind, &allocation);
+      return allocation;
+    });
+  }
 
-    std::function<MemoryAllocation()>* best_strategy{nullptr};
+  std::function<MemoryAllocation()>* best_strategy{nullptr};
 
-    // Try all strategies to get the best result.
-    for (auto& strategy : strategies) {
-      auto allocation = strategy();
-      string::PrettyLogDetail("--- get strategy saving %f memory for workspace",
-                              allocation.GetSavingRatio());
-      if (allocation.GetSavingRatio() > max_saving_ratio) {
-        max_saving_ratio = allocation.GetSavingRatio();
-        best_strategy = &strategy;
-      }
-    }
-    if (!best_strategy) {
-      LOG(ERROR) << "This model makes poor memory optimize, skip memory optimize";
-      return;
-    }
-    auto memory_allocation = (*best_strategy)();
+  // Try all strategies to get the best result.
+  for (auto& strategy : strategies) {
+    auto allocation = strategy();
+    string::PrettyLogDetail("--- get strategy saving %f memory for workspace",
+                            allocation.GetSavingRatio());
+    if (allocation.GetSavingRatio() > max_saving_ratio) {
+      max_saving_ratio = allocation.GetSavingRatio();
+      best_strategy = &strategy;
+    }
+  }
+  if (!best_strategy) {
+    LOG(ERROR) << "This model makes poor memory optimize, skip memory optimize";
+    return;
+  }
+  auto memory_allocation = (*best_strategy)();
 
-    string::PrettyLogH2(
-        "--- Saved %.2f%s memory for workspace(temporary variables)",
-        memory_allocation.GetSavingRatio() * 100, "%");
-    string::PrettyLogDetail("--- Allocated %d MB",
-                            memory_allocation.allocated / 1024. / 1024.);
-    string::PrettyLogDetail("--- Saved %d MB",
-                            memory_allocation.saved / 1024. / 1024.);
+  string::PrettyLogInfo(
+      "--- Saved %.2f%s memory for workspace(temporary variables)",
+      memory_allocation.GetSavingRatio() * 100, "%");
 
-    argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove,
-                               new std::unordered_set<std::string>);
-    auto& vars2remove =
-        argument->main_graph().Get<std::unordered_set<std::string>>(
-            framework::ir::kGraphToProgramVarsToRemove);
-
-    PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove);
-    argument->SetMemoryOptimSortKind(memory_allocation.sort_kind);
-  }
+  argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove,
+                             new std::unordered_set<std::string>);
+  auto& vars2remove =
+      argument->main_graph().Get<std::unordered_set<std::string>>(
+          framework::ir::kGraphToProgramVarsToRemove);
+
+  PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove);
+  argument->SetMemoryOptimSortKind(memory_allocation.sort_kind);
 }
 
 float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const {
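For readers following the pass logic above: the changed code now picks its shape source before clustering, so static optimization replays variable shapes recorded in the cache file, while dynamic optimization fabricates shapes by substituting a fixed batch size (3) for every -1 dimension. The following is a minimal, self-contained sketch of that selection only; BatchShapes, FakeShapes and ChooseBatches are hypothetical stand-ins for framework::ProgramDesc, FakeBatchVarShapes and the branch inside MemoryOptimizePass::RunImpl, not Paddle APIs.

#include <map>
#include <string>
#include <vector>

// One "batch record" maps a variable name to a fully known shape.
using BatchShapes = std::vector<std::map<std::string, std::vector<int>>>;

// Mirrors FakeBatchVarShapes: replace every -1 dim with a placeholder batch size.
BatchShapes FakeShapes(const std::map<std::string, std::vector<int>>& var_shapes) {
  const int fake_batch_size = 3;  // same constant the pass uses
  BatchShapes res(1);
  for (const auto& kv : var_shapes) {
    std::vector<int> shape = kv.second;
    for (int& d : shape) {
      if (d < 0) d = fake_batch_size;
    }
    res.front()[kv.first] = shape;
  }
  return res;
}

// Mirrors the branch added to RunImpl: only replay the cached shapes when
// static optimization is requested and the cache file actually exists.
BatchShapes ChooseBatches(bool static_optim, bool cache_exists,
                          const BatchShapes& cached_shapes,
                          const std::map<std::string, std::vector<int>>& var_shapes) {
  if (static_optim && cache_exists) {
    return cached_shapes;          // static: recorded batch shapes
  }
  return FakeShapes(var_shapes);   // dynamic: shapes faked from the program
}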
paddle/fluid/inference/analysis/passes/memory_optimize_pass.h

@@ -15,7 +15,7 @@
 #pragma once
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace inference {
paddle/fluid/inference/api/analysis_config.cc

@@ -95,7 +95,8 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
   CP_MEMBER(memory_pool_init_size_mb_);
 
   CP_MEMBER(enable_memory_optim_);
-  CP_MEMBER(memory_optim_force_update_);
+  CP_MEMBER(static_memory_optim_);
+  CP_MEMBER(static_memory_optim_force_update_);
   // TensorRT releated.
   CP_MEMBER(use_tensorrt_);
   CP_MEMBER(tensorrt_workspace_size_);

@@ -238,7 +239,8 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() {
   ss << tensorrt_min_subgraph_size_;
 
   ss << enable_memory_optim_;
-  ss << memory_optim_force_update_;
+  ss << static_memory_optim_;
+  ss << static_memory_optim_force_update_;
 
   ss << use_mkldnn_;
   for (auto &item : mkldnn_enabled_op_types_) ss << item;

@@ -278,9 +280,11 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
 #endif
 }
 
-void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) {
+void contrib::AnalysisConfig::EnableMemoryOptim(
+    bool static_optim, bool force_update_static_cache) {
   enable_memory_optim_ = true;
-  memory_optim_force_update_ = force_update_cache;
+  static_memory_optim_ = static_optim;
+  static_memory_optim_force_update_ = force_update_static_cache;
   Update();
 }

@@ -300,4 +304,16 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
   Update();
 }
 
+NativeConfig contrib::AnalysisConfig::ToNativeConfig() const {
+  NativeConfig config;
+  config.model_dir = model_dir_;
+  config.prog_file = prog_file_;
+  config.param_file = params_file_;
+  config.use_gpu = use_gpu_;
+  config.device = device_id_;
+  config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
+  config.specify_input_name = specify_input_name_;
+  return config;
+}
+
 }  // namespace paddle
paddle/fluid/inference/api/analysis_predictor.cc

@@ -298,15 +298,15 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
 bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                                  framework::Scope *scope) {
   VLOG(3) << "Predictor::get_fetch";
-  outputs->resize(fetchs_.size());
-  for (size_t i = 0; i < fetchs_.size(); ++i) {
-    int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
+  outputs->resize(fetches_.size());
+  for (size_t i = 0; i < fetches_.size(); ++i) {
+    int idx = boost::get<int>(fetches_[i]->GetAttr("col"));
     PADDLE_ENFORCE((size_t)idx == i);
     framework::LoDTensor &fetch =
         framework::GetFetchVariable(*scope, "fetch", idx);
     auto type = fetch.type();
     auto output = &(outputs->at(i));
-    output->name = fetchs_[idx]->Input("X")[0];
+    output->name = fetches_[idx]->Input("X")[0];
     if (type == framework::proto::VarType::FP32) {
       GetFetchOne<float>(fetch, output);
       output->dtype = PaddleDType::FLOAT32;

@@ -327,7 +327,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   argument_.SetUseGPU(config_.use_gpu());
   argument_.SetGPUDeviceId(config_.gpu_device_id());
   argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
-  argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_);
+  argument_.SetStaticMemoryOptim(config_.static_memory_optim_);
+  argument_.SetStaticMemoryOptimForceUpdate(
+      config_.static_memory_optim_force_update_);
   argument_.SetModelFromMemory(config_.model_from_memory_);
   // Analyze inference_program
   if (!config_.model_dir().empty()) {

@@ -422,10 +424,10 @@ void AnalysisPredictor::PrepareFeedFetch() {
       feed_names_[op->Output("Out")[0]] = idx;
     } else if (op->Type() == "fetch") {
       int idx = boost::get<int>(op->GetAttr("col"));
-      if (fetchs_.size() <= static_cast<size_t>(idx)) {
-        fetchs_.resize(idx + 1);
+      if (fetches_.size() <= static_cast<size_t>(idx)) {
+        fetches_.resize(idx + 1);
       }
-      fetchs_[idx] = op;
+      fetches_[idx] = op;
     }
   }

@@ -638,12 +640,12 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
   // check if the cache exists
   if (!config_.enable_memory_optim()) {
     need = false;
-  } else if (config_.enable_memory_optim() &&
+  } else if (config_.static_memory_optim_ &&
             !inference::IsFileExists(inference::analysis::GetMemoryCachePath(
                 config_.model_dir(), config_.prog_file()))) {
     need = true;
-  } else if (config_.enable_memory_optim() &&
-             config_.memory_optim_force_update_) {
+  } else if (config_.static_memory_optim_ &&
+             config_.static_memory_optim_force_update_) {
     need = true;
   }
paddle/fluid/inference/api/analysis_predictor.h

@@ -115,7 +115,7 @@ class AnalysisPredictor : public PaddlePredictor {
   std::shared_ptr<framework::ProgramDesc> inference_program_;
   std::vector<framework::OpDesc *> feeds_;
   std::map<std::string, size_t> feed_names_;
-  std::vector<framework::OpDesc *> fetchs_;
+  std::vector<framework::OpDesc *> fetches_;
   // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
   // concurrency problems, wrong results and memory leak, so cache them.
   std::vector<framework::LoDTensor> feed_tensors_;
paddle/fluid/inference/api/paddle_analysis_config.h

@@ -162,17 +162,7 @@ struct AnalysisConfig {
   /** Transform the AnalysisConfig to NativeConfig.
    */
-  NativeConfig ToNativeConfig() const {
-    NativeConfig config;
-    config.model_dir = model_dir_;
-    config.prog_file = prog_file_;
-    config.param_file = params_file_;
-    config.use_gpu = use_gpu_;
-    config.device = device_id_;
-    config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
-    config.specify_input_name = specify_input_name_;
-    return config;
-  }
+  NativeConfig ToNativeConfig() const;
   /** Specify the operator type list to use MKLDNN acceleration.
    * @param op_list the operator type list.
    */

@@ -195,7 +185,8 @@ struct AnalysisConfig {
   /** Turn on memory optimize
    * NOTE still in development, will release latter.
    */
-  void EnableMemoryOptim(bool force_update_cache = false);
+  void EnableMemoryOptim(bool static_optim = false,
+                         bool force_update_static_cache = false);
   /** Tell whether the memory optimization is activated. */
   bool enable_memory_optim() const;

@@ -241,7 +232,8 @@ struct AnalysisConfig {
   // memory reuse related.
   bool enable_memory_optim_{false};
-  bool memory_optim_force_update_{false};
+  bool static_memory_optim_{false};
+  bool static_memory_optim_force_update_{false};
   bool use_mkldnn_{false};
   std::unordered_set<std::string> mkldnn_enabled_op_types_;
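As a usage note on the new signature: the single force_update_cache flag becomes a (static_optim, force_update_static_cache) pair, with dynamic optimization as the all-defaults case. Below is a hedged sketch of how a caller might drive it, modeled on the updated DAM tester; the ConfigureMemoryOptim helper and its parameters are illustrative only, and the paddle::contrib namespace qualification is assumed from how the tests use contrib::AnalysisConfig.

#include "paddle/fluid/inference/api/paddle_analysis_config.h"

// Illustrative helper only; the real tests call EnableMemoryOptim directly.
void ConfigureMemoryOptim(paddle::contrib::AnalysisConfig* cfg,
                          bool use_static_optim, bool rebuild_cache) {
  if (use_static_optim) {
    // Static optimization relies on the recorded shape cache; pass
    // force_update_static_cache = true on the first run to (re)build it.
    cfg->EnableMemoryOptim(true, rebuild_cache);
  } else {
    // Dynamic optimization: both arguments default to false.
    cfg->EnableMemoryOptim();
  }
}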
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc

@@ -253,7 +253,7 @@ void compare(bool use_mkldnn = false) {
 }
 
 // Compare result of NativeConfig and AnalysisConfig with memory optimization.
-TEST(Analyzer_dam, compare_with_memory_optim) {
+TEST(Analyzer_dam, compare_with_static_memory_optim) {
   // The small dam will core in CI, but works in local.
   if (FLAGS_max_turn_num == 9) {
     contrib::AnalysisConfig cfg, cfg1;

@@ -263,7 +263,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) {
     SetInput(&input_slots_all);
 
     // Run the first time to force to update memory cache
     SetConfig(&cfg);
-    cfg.EnableMemoryOptim(true);
+    cfg.EnableMemoryOptim(true, true /*force update*/);
 
     CompareNativeAndAnalysis(
         reinterpret_cast<const PaddlePredictor::Config *>(&cfg),

@@ -271,7 +271,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) {
     // Run second time to use the memory cache and perform memory optimization.
     SetConfig(&cfg1);
-    cfg1.EnableMemoryOptim();
+    cfg1.EnableMemoryOptim(true, false /*do not force update*/);
 
     CompareNativeAndAnalysis(
         reinterpret_cast<const PaddlePredictor::Config *>(&cfg1),

@@ -279,6 +279,24 @@ TEST(Analyzer_dam, compare_with_memory_optim) {
   }
 }
 
+TEST(Analyzer_dam, compare_with_dynamic_memory_optim) {
+  // The small dam will core in CI, but works in local.
+  if (FLAGS_max_turn_num == 9) {
+    contrib::AnalysisConfig cfg, cfg1;
+    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+
+    std::vector<std::vector<PaddleTensor>> input_slots_all;
+    SetInput(&input_slots_all);
+
+    // Run the first time to force to update memory cache
+    SetConfig(&cfg);
+    cfg.EnableMemoryOptim();
+
+    CompareNativeAndAnalysis(
+        reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+        input_slots_all);
+  }
+}
+
 TEST(Analyzer_dam, compare) { compare(); }
 
 #ifdef PADDLE_WITH_MKLDNN
paddle/fluid/operators/ngraph/CMakeLists.txt

 if(WITH_NGRAPH)
+  cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
   cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto)
   op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context)
 endif()
paddle/fluid/framework/ngraph_bridge.cc → paddle/fluid/operators/ngraph/ngraph_bridge.cc

@@ -17,39 +17,39 @@ limitations under the License. */
 #include <vector>
 
 #include "ngraph/ngraph.hpp"
-#include "paddle/fluid/framework/ngraph_bridge.h"
-#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
 #include "paddle/fluid/operators/ngraph/ngraph_ops.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
-namespace framework {
+namespace operators {
 
+namespace NG_OPS = paddle::operators::ngraphs;
 std::map<std::string,
-         std::function<void(const std::shared_ptr<OperatorBase>&,
+         std::function<void(const std::shared_ptr<framework::OperatorBase>&,
                             std::shared_ptr<std::unordered_map<
                                 std::string, std::shared_ptr<ngraph::Node>>>)>>
     NgraphBridge::NG_NODE_MAP = {
-        {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode},
-        {"mean", paddle::operators::ngraphs::BuildMeanNode},
-        {"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode},
-        {"mul", paddle::operators::ngraphs::BuildMulNode},
-        {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
-        {"softmax", paddle::operators::ngraphs::BuildSoftmaxNode},
-        {"softmax_grad", paddle::operators::ngraphs::BuildSoftmaxGradNode},
-        {"scale", paddle::operators::ngraphs::BuildScaleNode},
-        {"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
-        {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>},
-        {"top_k", paddle::operators::ngraphs::BuildTopKNode}};
+        {"elementwise_add", NG_OPS::BuildElementwiseAddNode},
+        {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode},
+        {"fill_constant", NG_OPS::BuildFillConstantNode},
+        {"mean", NG_OPS::BuildMeanNode},
+        {"mean_grad", NG_OPS::BuildMeanGradNode},
+        {"mul", NG_OPS::BuildMulNode},
+        {"mul_grad", NG_OPS::BuildMulGradNode},
+        {"softmax", NG_OPS::BuildSoftmaxNode},
+        {"softmax_grad", NG_OPS::BuildSoftmaxGradNode},
+        {"scale", NG_OPS::BuildScaleNode},
+        {"relu", NG_OPS::BuildUnaryNode<ngraph::op::Relu>},
+        {"tanh", NG_OPS::BuildUnaryNode<ngraph::op::Tanh>},
+        {"top_k", NG_OPS::BuildTopKNode}};
 
-void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
+void NgraphBridge::BuildNgNode(
+    const std::shared_ptr<framework::OperatorBase>& op) {
   auto& op_type = op->Type();
   NG_NODE_MAP[op_type](op, ngb_node_map_);
 }
 
-}  // namespace framework
+}  // namespace operators
 }  // namespace paddle
paddle/fluid/framework/ngraph_bridge.h → paddle/fluid/operators/ngraph/ngraph_bridge.h

@@ -21,16 +21,16 @@ limitations under the License. */
 #include "ngraph/node.hpp"
 
-namespace paddle {
-namespace framework {
+#include "paddle/fluid/framework/operator.h"
 
-class OperatorBase;
+namespace paddle {
+namespace operators {
 
 class NgraphBridge {
  public:
   static std::map<
       std::string,
-      std::function<void(const std::shared_ptr<OperatorBase>&,
+      std::function<void(const std::shared_ptr<framework::OperatorBase>&,
                          std::shared_ptr<std::unordered_map<
                              std::string, std::shared_ptr<ngraph::Node>>>)>>
       NG_NODE_MAP;

@@ -41,7 +41,7 @@ class NgraphBridge {
           var_node_map)
       : ngb_node_map_(var_node_map) {}
 
-  void BuildNgNode(const std::shared_ptr<OperatorBase>& op);
+  void BuildNgNode(const std::shared_ptr<framework::OperatorBase>& op);
 
  private:
   std::shared_ptr<

@@ -49,5 +49,5 @@ class NgraphBridge {
       ngb_node_map_;
 };
 
-}  // namespace framework
+}  // namespace operators
 }  // namespace paddle
paddle/fluid/operators/ngraph/ngraph_engine.cc

@@ -24,11 +24,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/ngraph_bridge.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
 #include "paddle/fluid/operators/ngraph/ngraph_engine.h"
 
 namespace paddle {

@@ -88,15 +88,14 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
   int pivot = left;
   while (pivot < right) {
     auto op_type = ops.at(pivot)->Type();
-    if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) ==
-        paddle::framework::NgraphBridge::NG_NODE_MAP.end()) {
+    if (NgraphBridge::NG_NODE_MAP.find(op_type) ==
+        NgraphBridge::NG_NODE_MAP.end()) {
       ++pivot;
     } else {
       int start = pivot, end = start;
       while (pivot < right &&
-             (paddle::framework::NgraphBridge::NG_NODE_MAP.find(
-                  ops.at(pivot)->Type()) !=
-              paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
+             (NgraphBridge::NG_NODE_MAP.find(ops.at(pivot)->Type()) !=
+              NgraphBridge::NG_NODE_MAP.end())) {
         ++pivot;
         ++end;
       }

@@ -283,7 +282,7 @@ void NgraphEngine::BuildNgNodes() {
       }
     }
   }
-  framework::NgraphBridge ngb(var_node_map_);
+  NgraphBridge ngb(var_node_map_);
   for (auto& op : fused_ops_) {
     ngb.BuildNgNode(op);
   }
python/paddle/fluid/contrib/int8_inference/utility.py

@@ -32,10 +32,13 @@ class Calibrator(object):
     def __init__(self, *args, **kwargs):
         self.program = kwargs['program']
         self.iterations = kwargs['iterations']
         self.pretrained_model = kwargs['pretrained_model']
-        self.debug = kwargs['debug']
+        self.debug = kwargs['debug'] if 'debug' in kwargs else False
         self.algo = kwargs['algo']
+        self.output = kwargs['output']
+        self.feed_var_names = kwargs['feed_var_names']
+        self.fetch_list = kwargs['fetch_list']
+        self.exe = kwargs['exe']
 
         self._conv_input_var_name = []
         self._conv_output_var_name = []

@@ -54,17 +57,38 @@ class Calibrator(object):
         self._u8_output_var = []
         self._s8_output_var = []
         self._persistable_vars = []
+        self._sampling_data = {}
 
     def generate_sampling_program(self):
        self.__init_analysis()
        self.__generate_output_program()
 
-    def generate_quantized_data(self, sampling_data):
-        self.__sampling(sampling_data)
+    def save_int8_model(self):
+        self.__sampling(self._sampling_data)
         self.__save_scale()
         self.__update_program()
         self.__update_output_program_attr()
         self.__display_debug()
+        self.__save_offline_model()
+
+    def sample_data(self):
+        '''
+        Sampling the tensor data of variable.
+        '''
+        for i in self.sampling_program.list_vars():
+            if i.name in self.sampling_vars:
+                np_data = np.array(fluid.global_scope().find_var(i.name)
+                                   .get_tensor())
+                if i.name not in self._sampling_data:
+                    self._sampling_data[i.name] = []
+                self._sampling_data[i.name].append(np_data)
+
+    def __save_offline_model(self):
+        '''
+        Save the quantized model to the disk.
+        '''
+        fluid.io.save_inference_model(self.output, self.feed_var_names,
+                                      self.fetch_list, self.exe,
+                                      self.sampling_program)
 
     def __display_debug(self):
         if self.debug:
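For orientation, the reworked Calibrator now owns the sampling buffer and the final save step. The Python sketch below shows the intended calling sequence under stated assumptions: the calibrate helper and every argument value (exe, infer_program, model_path, feed_names, fetch_targets, val_reader, output_path) are placeholders supplied by the caller, typically obtained from fluid.io.load_inference_model as in the updated test_calibration.py that follows.

import int8_inference.utility as int8_utility  # assumes sys.path includes the int8_inference dir


def calibrate(exe, infer_program, model_path, feed_names, fetch_targets,
              val_reader, output_path, iterations=50, algo='direct'):
    # Sketch of the new workflow; the real feeding logic lives in the test below.
    calibrator = int8_utility.Calibrator(
        program=infer_program,
        pretrained_model=model_path,
        iterations=iterations,
        algo=algo,
        exe=exe,
        output=output_path,
        feed_var_names=feed_names,
        fetch_list=fetch_targets)

    calibrator.generate_sampling_program()
    for batch_id, data in enumerate(val_reader()):
        # The caller still runs the sampling program itself ...
        exe.run(calibrator.sampling_program,
                feed={feed_names[0]: data},
                fetch_list=fetch_targets)
        # ... but buffering the sampled tensors is now the calibrator's job.
        calibrator.sample_data()
        if batch_id + 1 >= iterations:
            break
    # Sampling, scale computation, program rewriting and saving all happen here.
    calibrator.save_int8_model()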
python/paddle/fluid/contrib/tests/test_calibration.py

@@ -26,7 +26,7 @@ import paddle.fluid.profiler as profiler
 from PIL import Image, ImageEnhance
 import math
 sys.path.append('..')
-import int8_inference.utility as ut
+import int8_inference.utility as int8_utility
 
 random.seed(0)
 np.random.seed(0)

@@ -120,13 +120,13 @@ class TestCalibration(unittest.TestCase):
     def setUp(self):
         # TODO(guomingz): Put the download process in the cmake.
         # Download and unzip test data set
-        imagenet_dl_url = 'http://paddle-inference-dist.bj.bcebos.com/int8/calibration_test_data.tar.gz'
+        imagenet_dl_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz'
         zip_file_name = imagenet_dl_url.split('/')[-1]
         cmd = 'rm -rf data {} && mkdir data && wget {} && tar xvf {} -C data'.format(
             zip_file_name, imagenet_dl_url, zip_file_name)
         os.system(cmd)
         # resnet50 fp32 data
-        resnet50_fp32_model_url = 'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz'
+        resnet50_fp32_model_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz'
         resnet50_zip_name = resnet50_fp32_model_url.split('/')[-1]
         resnet50_unzip_folder_name = 'resnet50_fp32'
         cmd = 'rm -rf {} {} && mkdir {} && wget {} && tar xvf {} -C {}'.format(

@@ -135,8 +135,7 @@ class TestCalibration(unittest.TestCase):
             resnet50_zip_name, resnet50_unzip_folder_name)
         os.system(cmd)
-        self.iterations = 100
-        self.skip_batch_num = 5
+        self.iterations = 50
 
     def run_program(self, model_path, generate_int8=False, algo='direct'):
         image_shape = [3, 224, 224]

@@ -163,16 +162,15 @@ class TestCalibration(unittest.TestCase):
             print("Start calibration ...")
-            calibrator = ut.Calibrator(
+            calibrator = int8_utility.Calibrator(
                 program=infer_program,
                 pretrained_model=model_path,
                 iterations=100,
-                debug=False,
-                algo=algo)
-            sampling_data = {}
+                algo=algo,
+                exe=exe,
+                output=int8_model,
+                feed_var_names=feed_dict,
+                fetch_list=fetch_targets)
+            calibrator.generate_sampling_program()
 
         test_info = []
         cnt = 0
         for batch_id, data in enumerate(val_reader()):

@@ -192,13 +190,7 @@ class TestCalibration(unittest.TestCase):
                       feed_dict[1]: label},
                 fetch_list=fetch_targets)
             if generate_int8:
-                for i in calibrator.sampling_program.list_vars():
-                    if i.name in calibrator.sampling_vars:
-                        np_data = np.array(fluid.global_scope().find_var(i.name)
-                                           .get_tensor())
-                        if i.name not in sampling_data:
-                            sampling_data[i.name] = []
-                        sampling_data[i.name].append(np_data)
+                calibrator.sample_data()
 
             test_info.append(np.mean(acc1) * len(data))
             cnt += len(data)

@@ -209,9 +201,8 @@ class TestCalibration(unittest.TestCase):
                 break
 
         if generate_int8:
-            calibrator.generate_quantized_data(sampling_data)
-            fluid.io.save_inference_model(int8_model, feed_dict, fetch_targets,
-                                          exe, calibrator.sampling_program)
+            calibrator.save_int8_model()
             print(
                 "Calibration is done and the corresponding files were generated at {}".
                 format(os.path.abspath("calibration_out")))