Commit a6fbf7ec

Merge branch 'feature/refine_generate_proposals_op' into rewrite_allocation

Authored Sep 28, 2018 by Yu Yang
Parents: 58ed412f, 593ad763

Showing 45 changed files with 2,535 additions and 308 deletions. (The captured
view hides whitespace-only changes, so a few removed/added line pairs below
render with identical content.)
Changed files:

cmake/external/anakin.cmake                                        +1   -0
paddle/fluid/API.spec                                              +6   -2
paddle/fluid/framework/CMakeLists.txt                              +5   -2
paddle/fluid/framework/ir/CMakeLists.txt                           +6   -6
paddle/fluid/framework/naive_executor.cc                           +150 -0
paddle/fluid/framework/naive_executor.h                            +63  -0
paddle/fluid/framework/naive_executor_test.cc                      +70  -0
paddle/fluid/framework/operator.cc                                 +8   -2
paddle/fluid/framework/scope.cc                                    +31  -0
paddle/fluid/inference/CMakeLists.txt                              +1   -1
paddle/fluid/inference/analysis/CMakeLists.txt                     +1   -1
paddle/fluid/inference/api/CMakeLists.txt                          +13  -7
paddle/fluid/inference/api/analysis_predictor.cc                   +211 -31
paddle/fluid/inference/api/analysis_predictor.h                    +49  -10
paddle/fluid/inference/api/analysis_predictor_tester.cc            +67  -0
paddle/fluid/inference/api/api.cc                                  +22  -16
paddle/fluid/inference/api/api_impl.cc                             +1   -1
paddle/fluid/inference/api/api_impl.h                              +13  -9
paddle/fluid/inference/api/api_impl_tester.cc                      +3   -3
paddle/fluid/inference/api/details/zero_copy_tensor.cc             +111 -0
paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc       +46  -0
paddle/fluid/inference/api/helper.h                                +138 -0
paddle/fluid/inference/api/paddle_inference_api.h                  +53  -1
paddle/fluid/inference/tests/api/analyzer_lac_tester.cc            +6   -1
paddle/fluid/inference/tests/api/analyzer_ner_tester.cc            +4   -1
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc           +282 -3
paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc      +2   -1
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc            +3   -1
paddle/fluid/inference/tests/api/tester_helper.h                   +2   -4
paddle/fluid/operators/detection/generate_proposals_op.cc          +97  -97
paddle/fluid/operators/detection/generate_proposals_op.cu          +88  -74
paddle/fluid/operators/gather.h                                    +2   -4
paddle/fluid/string/pretty_log.h                                   +4   -4
python/CMakeLists.txt                                              +1   -0
python/paddle/fluid/contrib/__init__.py                            +3   -0
python/paddle/fluid/contrib/quantize/__init__.py                   +20  -0
python/paddle/fluid/contrib/quantize/quantize_transpiler.py        +557 -0
python/paddle/fluid/contrib/tests/CMakeLists.txt                   +6   -0
python/paddle/fluid/contrib/tests/test_quantize_transpiler.py      +272 -0
python/paddle/fluid/tests/unittests/CMakeLists.txt                 +0   -1
python/paddle/fluid/tests/unittests/test_dist_base.py              +2   -4
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py        +7   -8
python/paddle/fluid/transpiler/__init__.py                         +6   -2
python/paddle/fluid/transpiler/memory_optimization_transpiler.py   +101 -11
python/setup.py.in                                                 +1   -0
cmake/external/anakin.cmake

@@ -52,6 +52,7 @@ ExternalProject_Add(
     PREFIX ${ANAKIN_SOURCE_DIR}
     UPDATE_COMMAND ""
     CMAKE_ARGS ${CMAKE_ARGS_PREFIX}
+               -DUSE_LOGGER=YES
                -DUSE_X86_PLACE=YES
                -DBUILD_WITH_UNIT_TEST=NO
                -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
paddle/fluid/API.spec

@@ -21,7 +21,7 @@ paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'en...
 paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
-paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
+paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False))
 paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.DistributeTranspilerConfig.__init__
 paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))

@@ -299,13 +299,17 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init',...
 paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000))
+paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
+paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
-paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
+paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False))
 paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
paddle/fluid/framework/CMakeLists.txt

@@ -56,9 +56,9 @@ else()
   cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
 endif()
 if(NOT WIN32)
-  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
+  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
 else()
-  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
+  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
 endif(NOT WIN32)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)

@@ -141,12 +141,15 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
+cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
+
 if(WITH_DISTRIBUTE)
   cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
   cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
+  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass elementwise_add_op)
 endif()
 if(NOT WIN32)
paddle/fluid/framework/ir/CMakeLists.txt

@@ -28,9 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap...
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(fc_fuse_pass inference)
-if (WITH_MKLDNN)
-  pass_library(conv_relu_mkldnn_fuse_pass inference)
-endif()
+if (WITH_MKLDNN)
+  pass_library(conv_relu_mkldnn_fuse_pass inference)
+endif()
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
 pass_library(fc_lstm_fuse_pass inference)

@@ -49,6 +49,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r...
 cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
-if (WITH_MKLDNN)
-  cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
-endif()
+if (WITH_MKLDNN)
+  cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
+endif()
paddle/fluid/framework/naive_executor.cc (new file, mode 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/string/pretty_log.h"

namespace paddle {
namespace framework {

// This code can be shared with Executor.
static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
  if (var_type == proto::VarType::LOD_TENSOR) {
    var->GetMutable<LoDTensor>();
  } else if (var_type == proto::VarType::SELECTED_ROWS) {
    var->GetMutable<SelectedRows>();
  } else if (var_type == proto::VarType::FEED_MINIBATCH) {
    var->GetMutable<FeedFetchList>();
  } else if (var_type == proto::VarType::FETCH_LIST) {
    var->GetMutable<FeedFetchList>();
  } else if (var_type == proto::VarType::STEP_SCOPES) {
    var->GetMutable<std::vector<framework::Scope>>();
  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
    var->GetMutable<LoDRankTable>();
  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
    var->GetMutable<LoDTensorArray>();
  } else if (var_type == proto::VarType::PLACE_LIST) {
    var->GetMutable<platform::PlaceList>();
  } else if (var_type == proto::VarType::READER) {
    var->GetMutable<ReaderHolder>();
  } else if (var_type == proto::VarType::CHANNEL) {
    var->GetMutable<ChannelHolder>();
  } else if (var_type == proto::VarType::RAW) {
    // GetMutable will be called in operator
  } else {
    PADDLE_THROW(
        "Variable type %d is not in "
        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
        "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
        var_type);
  }
}

void NaiveExecutor::Prepare(Scope *parent_scope,
                            const ProgramDesc &program_desc, int block_id,
                            bool with_feed_fetch_ops) {
  if (!parent_scope) {
    scope_ = new framework::Scope;
  } else {
    scope_ = &parent_scope->NewScope();
  }
  CreateVariables(program_desc, scope_, block_id);
  CreateOps(program_desc, block_id, with_feed_fetch_ops);
}

void NaiveExecutor::Run() {
  for (auto &op : ops_) {
    VLOG(4) << "run " << op->Type();
    op->Run(*scope_, place_);
  }
}

void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope,
                                    int block_id) {
  PADDLE_ENFORCE(scope);
  auto &global_block = desc.Block(block_id);

  const Scope *ancestor_scope = scope;
  while (ancestor_scope->parent()) {
    ancestor_scope = ancestor_scope->parent();
  }

  if (ancestor_scope != scope) {
    for (auto &var : global_block.AllVars()) {
      if (var->Name() == framework::kEmptyVarName) {
        continue;
      }
      // Create persistable vars in ancestor scope.
      if (var->Persistable()) {
        auto *ptr = const_cast<Scope *>(ancestor_scope)->Var(var->Name());
        InitializeVariable(ptr, var->GetType());
        VLOG(3) << "Create Variable " << var->Name()
                << " global, which pointer is " << ptr;
      } else {
        // Create temporary variables in local scope.
        auto *ptr = scope->Var(var->Name());
        InitializeVariable(ptr, var->GetType());
        VLOG(3) << "Create Variable " << var->Name()
                << " locally, which pointer is " << ptr;
      }
    }
  } else {
    for (auto &var : global_block.AllVars()) {
      auto *ptr = scope->Var(var->Name());
      InitializeVariable(ptr, var->GetType());
      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
              << ptr;
    }
  }
}

void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id,
                              bool with_feed_fetch_ops) {
  for (const auto &op_desc : desc.Block(block_id).AllOps()) {
    if (!with_feed_fetch_ops &&
        (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) {
      string::PrettyLogEndl(string::Style::detail(), "--- skip [%s], %s -> %s",
                            op_desc->Input("X")[0], op_desc->Type(),
                            op_desc->Output("Out")[0]);
      continue;
    }
    ops_.emplace_back(OpRegistry::CreateOp(*op_desc));
  }
}

LoDTensor *NaiveExecutor::FindTensor(const std::string &name) {
  PADDLE_ENFORCE(scope_, "Need to init scope first");
  auto *var = scope_->FindVar(name);
  // Pass `name` through so the "%s" in the message is actually filled in.
  PADDLE_ENFORCE(var, "No variable [%s] in the scope", name);
  auto *tensor = const_cast<LoDTensor *>(&var->Get<LoDTensor>());
  return tensor;
}

void NaiveExecutor::CleanFeedFetchOps() {
  std::vector<std::unique_ptr<OperatorBase>> ops;
  for (auto &op : ops_) {
    if (op->Type() != "feed" && op->Type() != "fetch") {
      ops.emplace_back(std::move(op));
    }
  }
  ops_.swap(ops);
}

}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/naive_executor.h (new file, mode 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"

namespace paddle {
namespace framework {

/*
 * Simple, intuitive and effective. Only a single thread is supported, and
 * currently designed for inference.
 */
class NaiveExecutor {
 public:
  explicit NaiveExecutor(const platform::Place &place) : place_(place) {}

  // Create the child scope and the variables.
  // @with_feed_fetch_ops: whether to work with the feed and fetch operators.
  void Prepare(Scope *parent_scope, const ProgramDesc &program_desc,
               int block_id, bool with_feed_fetch_ops);

  // Run all the operators.
  void Run();

  // Get a tensor to operate on directly, without the need for feed_ops.
  LoDTensor *FindTensor(const std::string &name);

  Scope *scope() { return scope_; }

  void CleanFeedFetchOps();

 protected:
  void CreateVariables(const ProgramDesc &desc, Scope *scope, int block_id);

  void CreateOps(const ProgramDesc &desc, int block_id,
                 bool with_feed_fetch_ops);

 private:
  const platform::Place place_;
  // Cache the required resources to avoid recreating them.
  std::vector<std::unique_ptr<OperatorBase>> ops_;
  Scope *scope_;
};

}  // namespace framework
}  // namespace paddle
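
The header above fixes NaiveExecutor's lifecycle: Prepare() creates the scope
and variables and instantiates the operators, FindTensor() hands out raw
LoDTensor pointers so callers can write inputs and read outputs without
feed/fetch ops, and Run() executes the prepared operators in program order. A
minimal usage sketch of that sequence (the tensor names "x" and "y" are
placeholders; the unit test in the next file exercises the real flow end to
end):

  // Sketch only; assumes a framework::ProgramDesc `program` describing the
  // model, with variables named "x" and "y" (hypothetical names).
  paddle::framework::NaiveExecutor exe(paddle::platform::CPUPlace());
  exe.Prepare(/*parent_scope=*/nullptr, program, /*block_id=*/0,
              /*with_feed_fetch_ops=*/false);  // feed/fetch ops are skipped
  auto *x = exe.FindTensor("x");               // write the input in place
  x->Resize({1, 4});
  x->mutable_data<float>(paddle::platform::CPUPlace());
  exe.Run();                                   // run every prepared op once
  auto *y = exe.FindTensor("y");               // read the output in place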
paddle/fluid/framework/naive_executor_test.cc (new file, mode 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/naive_executor.h"
#include <gtest/gtest.h>
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"

namespace paddle {
namespace framework {

TEST(NaiveExecutor, Basic) {
  ProgramDesc program;
  auto *main_block = program.MutableBlock(0);
  auto *a = main_block->Var("a");  // input
  auto *b = main_block->Var("b");  // input
  auto *c = main_block->Var("c");  // output
  a->SetType(proto::VarType::LOD_TENSOR);
  b->SetType(proto::VarType::LOD_TENSOR);
  c->SetType(proto::VarType::LOD_TENSOR);

  auto *add = main_block->AppendOp();
  add->SetType("elementwise_add");
  add->SetInput("X", {"a"});
  add->SetInput("Y", {"b"});
  add->SetOutput("Out", {"c"});

  auto place = platform::CPUPlace();
  NaiveExecutor exe(place);
  exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/);
  auto *a_tensor = exe.FindTensor("a");
  auto *b_tensor = exe.FindTensor("b");
  auto *c_tensor = exe.FindTensor("c");

  a_tensor->Resize({1, 4});
  b_tensor->Resize({1, 4});
  c_tensor->Resize({1, 4});
  b_tensor->mutable_data<float>(place);
  a_tensor->mutable_data<float>(place);

  float a_arr[] = {0, 1, 2, 3};
  float b_arr[] = {0.0, .1, .2, .3};

  std::copy_n(a_arr, 4, a_tensor->mutable_data<float>(place));
  std::copy_n(b_arr, 4, b_tensor->mutable_data<float>(place));

  exe.Run();

  auto *c_data = c_tensor->mutable_data<float>(place);
  for (int i = 0; i < 4; i++) {
    EXPECT_NEAR(c_data[i], 1.1 * i, 1e-3);
  }
}

}  // namespace framework
}  // namespace paddle

USE_OP(elementwise_add);
paddle/fluid/framework/operator.cc

@@ -154,9 +154,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     platform::SetDeviceId(dev_id);
 #endif
   }
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  platform::RecordEvent record_event(Type(), pool.Get(place));
+
+  if (platform::IsProfileEnabled()) {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    platform::RecordEvent record_event(Type(), pool.Get(place));
+  }
+
   RunImpl(scope, place);
+
+  if (VLOG_IS_ON(3)) {
+    VLOG(3) << place << " " << DebugStringEx(&scope);
+  }
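
The hunk above makes the profiler's RecordEvent conditional on
platform::IsProfileEnabled(), so un-profiled runs no longer pay for event
bookkeeping on every operator, and it gates the expensive DebugStringEx() dump
behind VLOG_IS_ON(3). Because RecordEvent is an RAII guard, making it
conditional requires an explicit block. A small self-contained sketch of the
same pattern, using a hypothetical ScopedTimer in place of Paddle's
RecordEvent:

  #include <chrono>
  #include <iostream>
  #include <string>

  // Hypothetical stand-in for platform::RecordEvent: an RAII guard that
  // reports its own lifetime when destroyed.
  class ScopedTimer {
   public:
    explicit ScopedTimer(const std::string &name)
        : name_(name), start_(std::chrono::steady_clock::now()) {}
    ~ScopedTimer() {
      auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                    std::chrono::steady_clock::now() - start_)
                    .count();
      std::cout << name_ << " took " << us << " us\n";
    }

   private:
    std::string name_;
    std::chrono::steady_clock::time_point start_;
  };

  bool profiling_enabled = false;  // stand-in for platform::IsProfileEnabled()

  void RunOp() { /* the operator body */ }

  int main() {
    if (profiling_enabled) {
      ScopedTimer t("op");  // the guard dies at the closing brace below,
      RunOp();              // so the measured work must run inside the block
    } else {
      RunOp();
    }
  }

Note that an RAII guard only measures what runs inside its enclosing block;
scoping the guard inside an if without also running the work there ends the
measurement before the work starts.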
paddle/fluid/framework/scope.cc

@@ -20,6 +20,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"

+// The mutex is not needed by training and inference, only for distribution.
+#if PADDLE_WITH_DISTRIBUTE
+#define WITH_LOCK 1
+#else
+#define WITH_LOCK 0
+#endif
+
 DEFINE_bool(benchmark, false,
             "Doing memory benchmark. It will make deleting scope synchronized, "
             "and add some memory usage logs."

@@ -49,18 +56,24 @@ int64_t GetEagerDeletionThreshold() {
 Scope::~Scope() { DropKids(); }

 Scope& Scope::NewScope() const {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   kids_.push_back(new Scope(this));
   return *kids_.back();
 }

 Variable* Scope::Var(const std::string& name) {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   return VarInternal(name);
 }

 Variable* Scope::Var(std::string* name) {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   if (name != nullptr) {
     *name = new_name;

@@ -69,29 +82,39 @@ Variable* Scope::Var(std::string* name) {
 }

 Variable* Scope::FindVar(const std::string& name) const {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   return FindVarInternal(name);
 }

 const Scope* Scope::FindScope(const Variable* var) const {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   return FindScopeInternal(var);
 }

 void Scope::DropKids() {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   for (Scope* s : kids_) delete s;
   kids_.clear();
 }

 bool Scope::HasKid(const Scope* scope) const {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   return it != this->kids_.end();
 }

 std::vector<std::string> Scope::LocalVarNames() const {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   std::vector<std::string> known_vars;
   known_vars.reserve(this->vars_.size());
   for (auto& p : vars_) {

@@ -101,7 +124,9 @@ std::vector<std::string> Scope::LocalVarNames() const {
 }

 void Scope::DeleteScope(Scope* scope) const {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
   this->kids_.erase(it);

@@ -114,7 +139,9 @@ void Scope::DeleteScope(Scope* scope) const {
 }

 void Scope::EraseVars(const std::vector<std::string>& var_names) {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   std::set<std::string> var_set(var_names.begin(), var_names.end());
   for (auto it = vars_.begin(); it != vars_.end();) {
     if (var_set.find(it->first) != var_set.end()) {

@@ -127,12 +154,16 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
 void Scope::Rename(const std::string& origin_name,
                    const std::string& new_name) const {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   RenameInternal(origin_name, new_name);
 }

 std::string Scope::Rename(const std::string& origin_name) const {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   RenameInternal(origin_name, new_name);
   return new_name;
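
Every method above repeats the same three-line #if WITH_LOCK /
std::unique_lock / #endif stanza. A sketch of one way such a compile-time lock
toggle can be centralized behind a single alias; this is only an illustration
of the idiom, not how Paddle structures it:

  #include <mutex>

  #define WITH_LOCK 1  // mirrors the macro defined in the hunk above

  // Compiles to a real lock when enabled, and to a no-op otherwise, so call
  // sites need one line instead of an #if/#endif pair around each lock.
  #if WITH_LOCK
  using MaybeLock = std::unique_lock<std::mutex>;
  #else
  struct MaybeLock {
    explicit MaybeLock(std::mutex &) {}  // no-op when locking is disabled
  };
  #endif

  class Registry {
   public:
    void Set(int v) {
      MaybeLock lock(mutex_);  // a single guarded line per method
      value_ = v;
    }

   private:
    mutable std::mutex mutex_;
    int value_{0};
  };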
paddle/fluid/inference/CMakeLists.txt

@@ -53,7 +53,7 @@ if(NOT APPLE)
 endif()

 if(WITH_TESTING)
-  # tests/book depends the models that generated by python/paddle/fluid/tests/book
+  # tests/book depends the models that generated by python/paddle/fluid/tests/book
   add_subdirectory(tests/book)
   if(WITH_INFERENCE_API_TEST)
     add_subdirectory(tests/api)
paddle/fluid/inference/analysis/CMakeLists.txt

 cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass)
 set(analysis_deps
-  framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log)
+  framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log)
 cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
   analyzer.cc
paddle/fluid/inference/api/CMakeLists.txt

@@ -18,10 +18,10 @@ if(APPLE)
 endif(APPLE)

-set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB})
+set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})

 if(WITH_GPU AND TENSORRT_FOUND)
-  set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
+  set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
 endif()

 function(inference_api_test TARGET_NAME)

@@ -43,8 +43,10 @@ function(inference_api_test TARGET_NAME)
   endif(WITH_TESTING)
 endfunction(inference_api_test)

-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope)
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
+cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
+cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)

 cc_test(test_paddle_inference_api
     SRCS api_tester.cc
     DEPS paddle_inference_api)

@@ -52,18 +54,22 @@ cc_test(test_paddle_inference_api
 inference_api_test(test_api_impl SRC api_impl_tester.cc
     ARGS test_word2vec test_image_classification)

+set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
+        ARGS --dirname=${PYTHON_TESTS_DIR}/book)
+
 if(WITH_GPU AND TENSORRT_FOUND)
   cc_library(paddle_inference_tensorrt_subgraph_engine
       SRCS api_tensorrt_subgraph_engine.cc
-      DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter)
+      DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy)

   inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
 endif()

 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
   # compile the libinference_anakin_api.a and anakin.so.
-  cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml)
-  cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
+  cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml scope zero_copy_tensor_dummy)
+  cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber scope)
   function(anakin_target target_name)
     target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
   endfunction()
paddle/fluid/inference/api/analysis_predictor.cc

@@ -16,11 +16,15 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/api/timer.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/platform/profiler.h"

@@ -28,8 +32,11 @@ DECLARE_bool(profile);
 namespace paddle {

 using contrib::AnalysisConfig;

-bool AnalysisPredictor::Init(
-    const std::shared_ptr<framework::Scope>& parent_scope) {
+bool AnalysisPredictor::Init(
+    const std::shared_ptr<framework::Scope> &parent_scope,
+    const std::shared_ptr<framework::ProgramDesc> &program) {
   VLOG(3) << "Predictor::init()";
 #if !defined(_WIN32)
   if (FLAGS_profile) {

@@ -43,7 +50,8 @@ bool AnalysisPredictor::Init(
   if (config_.use_gpu) {
     place_ = paddle::platform::CUDAPlace(config_.device);
-    LOG(WARNING) << "ir optimize only supports CPU currently";
+    LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim "
+                    "is turned false.";
     config_.enable_ir_optim = false;
   } else {
     place_ = paddle::platform::CPUPlace();

@@ -56,37 +64,134 @@ bool AnalysisPredictor::Init(
     scope_.reset(new paddle::framework::Scope());
   }

-  executor_.reset(new paddle::framework::Executor(place_));
-
-  // Initialize the inference program
-  if (!config_.model_dir.empty()) {
-    // Parameters are saved in separate files sited in
-    // the specified `dirname`.
-    inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(),
-                                                 config_.model_dir);
-  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
-    // All parameters are saved in a single file.
-    // The file names should be consistent with that used
-    // in Python API `fluid.io.save_inference_model`.
-    inference_program_ = paddle::inference::Load(
-        executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
-  } else {
-    LOG(ERROR) << "fail to load inference model from " << config_.model_dir;
-  }
-
-  OptimizeInferenceProgram();
-  if (config_._use_mkldnn) {
-    executor_->EnableMKLDNN(*inference_program_);
-  }
-
-  ctx_ = executor_->Prepare(*inference_program_, 0);
-
-  VLOG(5) << "to create variables";
-  PADDLE_ENFORCE(scope_.get());
-  executor_->CreateVariables(*inference_program_,
-                             sub_scope_ ? sub_scope_ : scope_.get(), 0);
+  executor_.reset(new paddle::framework::NaiveExecutor(place_));
+
+  if (!program) {
+    if (!LoadProgramDesc()) return false;
+    OptimizeInferenceProgram();
+  } else {
+    inference_program_ = program;
+  }
+  executor_->Prepare(scope_.get(), *inference_program_, 0,
+                     config_.use_feed_fetch_ops);

   // Get the feed_target_names and fetch_target_names
   PrepareFeedFetch();
   return true;
 }

+bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
+                            std::vector<PaddleTensor> *output_data,
+                            int batch_size) {
+  VLOG(3) << "Predictor::predict";
+  inference::Timer timer;
+  timer.tic();
+  // set feed variable
+  std::vector<framework::LoDTensor> feeds;
+  framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
+  if (!SetFeed(inputs, scope)) {
+    LOG(ERROR) << "fail to set feed";
+    return false;
+  }
+
+  // Run the inference program
+  // if share variables, we need not create variables
+  executor_->Run();
+
+  // get fetch variable
+  if (!GetFetch(output_data, scope)) {
+    LOG(ERROR) << "fail to get fetches";
+    return false;
+  }
+  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+  return true;
+}
+
+bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
+                                framework::Scope *scope) {
+  VLOG(3) << "Predictor::set_feed";
+  if (inputs.size() != feeds_.size()) {
+    LOG(ERROR) << "wrong feed input size, need " << feeds_.size()
+               << " but get " << inputs.size();
+    return false;
+  }
+
+  // Cache the inputs memory for better concurrency performance.
+  feed_tensors_.resize(inputs.size());
+
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    auto &input = feed_tensors_[i];
+    framework::DDim ddim = framework::make_ddim(inputs[i].shape);
+    void *input_ptr;
+    if (inputs[i].dtype == PaddleDType::INT64) {
+      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+    } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
+      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+    } else {
+      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
+      return false;
+    }
+
+    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
+                inputs[i].data.length());
+    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
+    framework::LoD lod;
+    for (auto &level : inputs[i].lod) {
+      lod.emplace_back(level);
+    }
+    input.set_lod(lod);
+    int idx = -1;
+    if (config_.specify_input_name) {
+      idx = feed_names_[inputs[i].name];
+    } else {
+      idx = boost::get<int>(feeds_[i]->GetAttr("col"));
+    }
+    framework::SetFeedVariable(scope, input, "feed", idx);
+  }
+  return true;
+}
+
+template <typename T>
+void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
+                                    PaddleTensor *output) {
+  // set shape.
+  auto shape = framework::vectorize(fetch.dims());
+  output->shape.assign(shape.begin(), shape.end());
+  // set data.
+  const T *data = fetch.data<T>();
+  int num_elems = inference::VecReduceToInt(shape);
+  output->data.Resize(num_elems * sizeof(T));
+  // The tensor output by the fetch op should always be in CPU memory, so just
+  // copy.
+  memcpy(output->data.data(), data, num_elems * sizeof(T));
+  // set lod
+  output->lod.clear();
+  for (auto &level : fetch.lod()) {
+    output->lod.emplace_back(level.begin(), level.end());
+  }
+}
+
+bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
+                                 framework::Scope *scope) {
+  VLOG(3) << "Predictor::get_fetch";
+  outputs->resize(fetchs_.size());
+  for (size_t i = 0; i < fetchs_.size(); ++i) {
+    int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
+    PADDLE_ENFORCE((size_t)idx == i);
+    framework::LoDTensor &fetch =
+        framework::GetFetchVariable(*scope, "fetch", idx);
+    auto type = fetch.type();
+    auto output = &(outputs->at(i));
+    if (type == typeid(float)) {
+      GetFetchOne<float>(fetch, output);
+      output->dtype = PaddleDType::FLOAT32;
+    } else if (type == typeid(int64_t)) {
+      GetFetchOne<int64_t>(fetch, output);
+      output->dtype = PaddleDType::INT64;
+    } else {
+      LOG(ERROR) << "unknown type, only support float32 and int64 now.";
+    }
+  }
+  return true;
+}

@@ -107,6 +212,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
         new std::string(config_.prog_file));
     argument_.fluid_model_param_path.reset(new std::string(config_.param_file));
   }
+
   argument_.origin_program_desc.reset(
       new ProgramDesc(*inference_program_->Proto()));
   PADDLE_ENFORCE(

@@ -127,9 +233,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
 }

 template <>
-std::unique_ptr<PaddlePredictor>
-CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>(
-    const contrib::AnalysisConfig &config) {
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+    const AnalysisConfig &config) {
   VLOG(3) << "create AnalysisConfig";
   if (config.use_gpu) {
     // 1. GPU memory

@@ -150,15 +255,90 @@ CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>(
   }

   std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config));
-  if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) {
+  if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) {
     return nullptr;
   }
   return predictor;
 }

+void AnalysisPredictor::PrepareFeedFetch() {
+  for (auto *op : inference_program_->Block(0).AllOps()) {
+    if (op->Type() == "feed") {
+      int idx = boost::get<int>(op->GetAttr("col"));
+      if (feeds_.size() <= static_cast<size_t>(idx)) {
+        feeds_.resize(idx + 1);
+      }
+      feeds_[idx] = op;
+      feed_names_[op->Output("Out")[0]] = idx;
+    } else if (op->Type() == "fetch") {
+      int idx = boost::get<int>(op->GetAttr("col"));
+      if (fetchs_.size() <= static_cast<size_t>(idx)) {
+        fetchs_.resize(idx + 1);
+      }
+      fetchs_[idx] = op;
+    }
+  }
+}
+
+std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
+    const std::string &name) {
+  PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
+  std::unique_ptr<ZeroCopyTensor> res(
+      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
+  res->input_or_output_ = true;
+  res->SetName(name);
+  return res;
+}
+
+std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
+    const std::string &name) {
+  PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
+  std::unique_ptr<ZeroCopyTensor> res(
+      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
+  res->input_or_output_ = false;
+  res->SetName(name);
+  return res;
+}
+
+bool AnalysisPredictor::ZeroCopyRun() {
+  executor_->Run();
+  return true;
+}
+
+bool AnalysisPredictor::LoadProgramDesc() {
+  // Initialize the inference program
+  std::unique_ptr<framework::Executor> tmp_exe(
+      new framework::Executor(platform::CPUPlace()));
+  if (!config_.model_dir.empty()) {
+    // Parameters are saved in separate files sited in
+    // the specified `dirname`.
+    inference_program_ = paddle::inference::Load(
+        static_cast<framework::Executor *>(tmp_exe.get()), scope_.get(),
+        config_.model_dir);
+  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
+    // All parameters are saved in a single file.
+    // The file names should be consistent with that used
+    // in Python API `fluid.io.save_inference_model`.
+    inference_program_ = paddle::inference::Load(
+        static_cast<framework::Executor *>(tmp_exe.get()), scope_.get(),
+        config_.prog_file, config_.param_file);
+  } else {
+    LOG(ERROR) << string::Sprintf(
+        "not valid model path '%s' or program path '%s'.", config_.model_dir,
+        config_.param_file);
+    return false;
+  }
+  return true;
+}
+
+std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
+  auto *x = new AnalysisPredictor(config_);
+  x->Init(scope_, inference_program_);
+  return std::unique_ptr<PaddlePredictor>(x);
+}
+
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
-    const contrib::AnalysisConfig &config) {
+    const contrib::AnalysisConfig &config) {
   return CreatePaddlePredictor<contrib::AnalysisConfig,
                                PaddleEngineKind::kAnalysis>(config);
 }
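
With NaiveExecutor wired in, AnalysisPredictor now serves two call paths: the
classic Run(), which copies PaddleTensor inputs into feed variables and copies
fetch variables back out, and the zero-copy path (GetInputTensor /
ZeroCopyRun / GetOutputTensor) exercised by the new tester further below. A
sketch of the classic path; the model path, input name, and shape are
placeholders, not values fixed by this commit:

  // Sketch only: "/path/to/model", "firstw", and the 4x1 int64 shape are
  // hypothetical.
  contrib::AnalysisConfig config;
  config.model_dir = "/path/to/model";
  auto predictor =
      CreatePaddlePredictor<contrib::AnalysisConfig,
                            PaddleEngineKind::kAnalysis>(config);

  PaddleTensor input;
  input.name = "firstw";
  input.shape = {4, 1};
  input.dtype = PaddleDType::INT64;
  input.data.Resize(4 * sizeof(int64_t));  // PaddleBuf owns this buffer
  // ... fill input.data.data() with 4 int64 ids ...

  std::vector<PaddleTensor> outputs;
  predictor->Run({input}, &outputs);  // SetFeed -> executor_->Run() -> GetFetch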
paddle/fluid/inference/api/analysis_predictor.h

@@ -12,42 +12,81 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #pragma once

+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/string/printf.h"

 namespace paddle {

 using inference::analysis::Argument;
 using inference::analysis::Analyzer;
 using framework::proto::ProgramDesc;
+using framework::NaiveExecutor;
+using contrib::AnalysisConfig;

 /* This predictor is based on the original native predictor with IR and
  * Analysis support. It will optimize IR and Parameters in the runtime.
  * TODO(Superjomn) Replace the Native predictor?
  */
-class AnalysisPredictor : public NativePaddlePredictor {
+class AnalysisPredictor : public PaddlePredictor {
  public:
-  explicit AnalysisPredictor(const contrib::AnalysisConfig &config)
-      : NativePaddlePredictor(config), config_(config) {}
+  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {}

-  bool Init(const std::shared_ptr<framework::Scope> &parent_scope);
+  bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
+            const std::shared_ptr<framework::ProgramDesc> &program = nullptr);

-  bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data,
-           int batch_size = -1) override {
-    return NativePaddlePredictor::Run(inputs, output_data, batch_size);
-  }
+  bool Run(const std::vector<PaddleTensor> &inputs,
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = -1) override;

+  std::unique_ptr<ZeroCopyTensor> GetInputTensor(
+      const std::string &name) override;
+  std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
+      const std::string &name) override;
+
+  bool ZeroCopyRun() override;
+
+  void PrepareFeedFetch();
+
   void OptimizeInferenceProgram();

-  Argument &analysis_argument() { return argument_; }
+  Argument &analysis_argument() { return argument_; }
+
+  std::unique_ptr<PaddlePredictor> Clone() override;
+
+  framework::Scope *scope() { return executor_->scope(); }
+  framework::ProgramDesc &program() { return *inference_program_; }
+
+ protected:
+  bool LoadProgramDesc();
+
+  bool SetFeed(const std::vector<PaddleTensor> &input_datas,
+               framework::Scope *scope);
+  bool GetFetch(std::vector<PaddleTensor> *output_data,
+                framework::Scope *scope);
+  template <typename T>
+  void GetFetchOne(const framework::LoDTensor &fetchs,
+                   PaddleTensor *output_data);

  private:
   contrib::AnalysisConfig config_;
   Argument argument_;
+  std::unique_ptr<NaiveExecutor> executor_;
+  platform::Place place_;
+  std::shared_ptr<framework::Scope> scope_;
+  framework::Scope *sub_scope_{nullptr};
+  std::shared_ptr<framework::ProgramDesc> inference_program_;
+  std::vector<framework::OpDesc *> feeds_;
+  std::map<std::string, size_t> feed_names_;
+  std::vector<framework::OpDesc *> fetchs_;
+  // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
+  // concurrency problems, so cache them.
+  std::vector<framework::LoDTensor> feed_tensors_;
 };

 }  // namespace paddle
paddle/fluid/inference/api/analysis_predictor_tester.cc (new file, mode 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

DEFINE_string(dirname, "", "dirname to tests.");

namespace paddle {
namespace inference {

using contrib::AnalysisConfig;

TEST(AnalysisPredictor, ZeroCopy) {
  AnalysisConfig config;
  config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
  config.use_feed_fetch_ops = false;
  auto predictor =
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config);

  auto w0 = predictor->GetInputTensor("firstw");
  auto w1 = predictor->GetInputTensor("secondw");
  auto w2 = predictor->GetInputTensor("thirdw");
  auto w3 = predictor->GetInputTensor("forthw");

  w0->Reshape({4, 1});
  w1->Reshape({4, 1});
  w2->Reshape({4, 1});
  w3->Reshape({4, 1});

  auto *w0_data = w0->mutable_data<int64_t>(PaddlePlace::kCPU);
  auto *w1_data = w1->mutable_data<int64_t>(PaddlePlace::kCPU);
  auto *w2_data = w2->mutable_data<int64_t>(PaddlePlace::kCPU);
  auto *w3_data = w3->mutable_data<int64_t>(PaddlePlace::kCPU);

  for (int i = 0; i < 4; i++) {
    w0_data[i] = i;
    w1_data[i] = i;
    w2_data[i] = i;
    w3_data[i] = i;
  }

  predictor->ZeroCopyRun();

  auto out = predictor->GetOutputTensor("fc_1.tmp_2");
  PaddlePlace place;
  int size = 0;
  auto *out_data = out->data<float>(&place, &size);
  LOG(INFO) << "output size: " << size / sizeof(float);
  LOG(INFO) << "output_data: " << out_data;
}

}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/api/api.cc

-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.

+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle_inference_api.h"

 namespace paddle {

@@ -26,7 +32,7 @@ int PaddleDtypeSize(PaddleDType dtype) {
   }
 }

-PaddleBuf::PaddleBuf(PaddleBuf&& other)
+PaddleBuf::PaddleBuf(PaddleBuf &&other)
     : data_(other.data_),
       length_(other.length_),
       memory_owned_(other.memory_owned_) {

@@ -35,9 +41,9 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
   other.length_ = 0;
 }

-PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
+PaddleBuf::PaddleBuf(const PaddleBuf &other) { *this = other; }

-PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
+PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) {
   if (!other.memory_owned_) {
     data_ = other.data_;
     length_ = other.length_;

@@ -51,7 +57,7 @@ PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
   return *this;
 }

-PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {
+PaddleBuf &PaddleBuf::operator=(PaddleBuf &&other) {
   // only the buffer with external memory can be copied
   data_ = other.data_;
   length_ = other.length_;

@@ -75,7 +81,7 @@ void PaddleBuf::Resize(size_t length) {
   }
 }

-void PaddleBuf::Reset(void* data, size_t length) {
+void PaddleBuf::Reset(void *data, size_t length) {
   Free();
   memory_owned_ = false;
   data_ = data;

@@ -85,7 +91,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
 void PaddleBuf::Free() {
   if (memory_owned_ && data_) {
     PADDLE_ENFORCE_GT(length_, 0);
-    free(static_cast<char*>(data_));
+    free(static_cast<char *>(data_));
     data_ = nullptr;
     length_ = 0;
   }
paddle/fluid/inference/api/api_impl.cc

@@ -145,7 +145,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
   VLOG(4) << "Run prepared context";
   executor_->RunPreparedContext(ctx_.get(), scope,
                                 false, /* don't create local scope each time*/
-                                false /* don't create variable eatch time */);
+                                false /* don't create variable each time */);
   VLOG(4) << "Finish prepared context";
   // get fetch variable
   if (!GetFetch(output_data, scope)) {
paddle/fluid/inference/api/api_impl.h

 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at

-http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0

-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */

 #pragma once

@@ -30,6 +30,8 @@
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/profiler.h"

@@ -52,6 +54,8 @@ class NativePaddlePredictor : public PaddlePredictor {
   ~NativePaddlePredictor() override;

+  framework::Scope *scope() { return sub_scope_ ? sub_scope_ : scope_.get(); }
+
  protected:
   bool SetFeed(const std::vector<PaddleTensor> &input_datas,
                framework::Scope *scope);
paddle/fluid/inference/api/api_impl_tester.cc

@@ -43,7 +43,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
 NativeConfig GetConfig() {
   NativeConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
   LOG(INFO) << "dirname  " << config.model_dir;
   config.fraction_of_gpu_memory = 0.15;
 #ifdef PADDLE_WITH_CUDA

@@ -110,7 +110,7 @@ void MainImageClassification(bool use_gpu) {
   NativeConfig config = GetConfig();
   config.use_gpu = use_gpu;
   config.model_dir =
-      FLAGS_dirname + "image_classification_resnet.inference.model";
+      FLAGS_dirname + "/image_classification_resnet.inference.model";

   const bool is_combined = false;
   std::vector<std::vector<int64_t>> feed_target_shapes =

@@ -214,7 +214,7 @@ void MainThreadsImageClassification(bool use_gpu) {
   NativeConfig config = GetConfig();
   config.use_gpu = use_gpu;
   config.model_dir =
-      FLAGS_dirname + "image_classification_resnet.inference.model";
+      FLAGS_dirname + "/image_classification_resnet.inference.model";

   auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
   std::vector<framework::LoDTensor> jobs(num_jobs);
paddle/fluid/inference/api/details/zero_copy_tensor.cc (new file, mode 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {

void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
  PADDLE_ENFORCE(!name_.empty(),
                 "Need to SetName first, so that the corresponding tensor can "
                 "be retrieved.");
  PADDLE_ENFORCE(input_or_output_,
                 "Can't reshape the output tensor, it is readonly");
  PADDLE_ENFORCE(scope_);
  auto *scope = static_cast<framework::Scope *>(scope_);
  auto *var = scope->FindVar(name_);
  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
  auto *tensor = var->GetMutable<framework::LoDTensor>();
  tensor->Resize(framework::make_ddim(shape));
}

template <typename T>
T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
  switch (static_cast<int>(place)) {
    case static_cast<int>(PaddlePlace::kCPU): {
      return tensor->mutable_data<T>(platform::CPUPlace());
    }
    case static_cast<int>(PaddlePlace::kGPU): {
      return tensor->mutable_data<T>(platform::CUDAPlace());
    }
    default:
      PADDLE_THROW("Unsupported place: %d", static_cast<int>(place));
      break;
  }
  return nullptr;
}

template <typename T>
T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
  auto *res = tensor->data<T>();

  if (platform::is_cpu_place(tensor->place())) {
    *place = PaddlePlace::kCPU;
  } else if (platform::is_gpu_place(tensor->place())) {
    *place = PaddlePlace::kGPU;
  } else {
    *place = PaddlePlace::kUNK;
  }

  *size = tensor->numel();
  return res;
}

template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place);
template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place);

void *ZeroCopyTensor::FindTensor() const {
  PADDLE_ENFORCE(!name_.empty(),
                 "Need to SetName first, so that the corresponding tensor can "
                 "be retrieved.");
  PADDLE_ENFORCE(scope_);
  auto *scope = static_cast<framework::Scope *>(scope_);
  auto *var = scope->FindVar(name_);
  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
  auto *tensor = var->GetMutable<framework::LoDTensor>();
  return tensor;
}

std::vector<int64_t> ZeroCopyTensor::shape() {
  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
  PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_);
  return framework::vectorize(tensor->dims());
}

void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
  framework::LoD lod;
  for (auto &level : x) {
    lod.emplace_back(level);
  }
  tensor->set_lod(lod);
}

std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
  std::vector<std::vector<size_t>> res;
  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
  for (auto &level : tensor->lod()) {
    res.emplace_back(level);
  }
  return res;
}

}  // namespace paddle
paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
(new file, mode 100644)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {

void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {}

template <typename T>
T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
  return nullptr;
}

template <typename T>
T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
  return nullptr;
}

template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
template float *ZeroCopyTensor::mutable_data(PaddlePlace place);
template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);

void *ZeroCopyTensor::FindTensor() const { return nullptr; }

std::vector<int64_t> ZeroCopyTensor::shape() { return {}; }

void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}

std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
  return std::vector<std::vector<size_t>>();
}

}  // namespace paddle
paddle/fluid/inference/api/helper.h
...
...
@@ -21,8 +21,10 @@
#include <sstream>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace inference {
...
...
@@ -93,6 +95,20 @@ static void TensorAssignData(PaddleTensor *tensor,
  }
}

template <typename T>
static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
                                    const std::vector<std::vector<T>> &data) {
  int size{0};
  auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
  int c = 0;
  for (const auto &f : data) {
    for (T v : f) {
      ptr[c++] = v;
    }
  }
  return size;
}
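For reference, a minimal sketch of how this helper is meant to be fed; the tensor name "data" and the 2x3 shape are placeholders, not taken from any model in this change:

// Hypothetical usage sketch: flatten two float rows into a ZeroCopyTensor.
// Assumes `predictor` was built with use_feed_fetch_ops = false.
auto input = predictor->GetInputTensor("data");  // "data" is a placeholder
input->Reshape({2, 3});  // Reshape before writing, as required by the API
std::vector<std::vector<float>> rows = {{1, 2, 3}, {4, 5, 6}};
ZeroCopyTensorAssignData(input.get(), rows);  // copies row by row via ptr[c++]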
static std::string DescribeTensor(const PaddleTensor &tensor) {
  std::stringstream os;
  os << "Tensor [" << tensor.name << "]\n";
...
...
@@ -138,5 +154,127 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
  }
}

template <typename T>
std::string LoDTensorSummary(const framework::LoDTensor &tensor) {
  std::stringstream ss;
  ss << "\n---- tensor ---" << '\n';
  ss << "lod: [";
  for (const auto &level : tensor.lod()) {
    ss << "[ ";
    for (auto i : level) {
      ss << i << ", ";
    }
    ss << "]";
  }
  ss << "]\n";

  ss << "shape: [";
  int size = 1;
  for (int i = 0; i < tensor.dims().size(); i++) {
    int dim = tensor.dims()[i];
    ss << dim << ", ";
    size *= dim;
  }
  ss << "]\n";

  ss << "data: ";
  for (int i = 0; i < std::min(20, size); i++) {
    ss << tensor.data<T>()[i] << " ";
  }
  ss << "\n";

  return ss.str();
}

static bool CompareLoD(const framework::LoD &a, const framework::LoD &b) {
  if (a.size() != b.size()) {
    LOG(ERROR) << string::Sprintf("lod size not match %d != %d", a.size(),
                                  b.size());
    return false;
  }
  for (size_t i = 0; i < a.size(); i++) {
    auto &al = a[i];
    auto &bl = b[i];
    if (al.size() != bl.size()) {
      LOG(ERROR) << string::Sprintf("level size %d != %d", al.size(),
                                    bl.size());
      return false;
    }
  }
  return true;
}

static bool CompareShape(const std::vector<int64_t> &a,
                         const std::vector<int64_t> &b) {
  if (a.size() != b.size()) {
    LOG(ERROR) << string::Sprintf("shape size not match %d != %d", a.size(),
                                  b.size());
    return false;
  }
  for (size_t i = 0; i < a.size(); i++) {
    if (a[i] != b[i]) {
      LOG(ERROR) << string::Sprintf("shape %d-th element not match %d != %d",
                                    i, a[i], b[i]);
      return false;
    }
  }
  return true;
}

static bool CompareTensorData(const framework::LoDTensor &a,
                              const framework::LoDTensor &b) {
  auto a_shape = framework::vectorize(a.dims());
  auto b_shape = framework::vectorize(b.dims());
  size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1,
                                  [](int a, int b) { return a * b; });
  size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1,
                                  [](int a, int b) { return a * b; });
  if (a_size != b_size) {
    LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d",
                                  a_size, b_size);
  }

  for (size_t i = 0; i < a_size; i++) {
    if (a.type() == typeid(float)) {
      const auto *a_data = a.data<float>();
      const auto *b_data = b.data<float>();
      if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
        LOG(ERROR) << string::Sprintf(
            "tensor data %d-th element not match, %f != %f", i, a_data[i],
            b_data[i]);
        return false;
      }
    } else if (a.type() == typeid(int64_t)) {
      const auto *a_data = a.data<int64_t>();
      const auto *b_data = b.data<int64_t>();
      if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
        LOG(ERROR) << string::Sprintf(
            "tensor data %d-th element not match, %f != %f", i, a_data[i],
            b_data[i]);
        return false;
      }
    }
  }

  return true;
}

static bool CompareTensor(const framework::LoDTensor &a,
                          const framework::LoDTensor &b) {
  if (!CompareLoD(a.lod(), b.lod())) {
    return false;
  }
  if (!CompareShape(framework::vectorize(a.dims()),
                    framework::vectorize(b.dims()))) {
    return false;
  }
  if (!CompareTensorData(a, b)) {
    return false;
  }
  return true;
}

}  // namespace inference
}  // namespace paddle
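A minimal sketch of how the comparison helpers above compose; the tensors here are built by hand on CPU purely for illustration:

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/helper.h"

// Hedged sketch: build two small CPU float tensors and compare them.
// CompareTensor checks LoD, then shape, then element data (tolerance 1e-3).
bool OutputsMatch() {
  paddle::framework::LoDTensor a, b;
  a.Resize(paddle::framework::make_ddim({2, 2}));
  b.Resize(paddle::framework::make_ddim({2, 2}));
  auto *pa = a.mutable_data<float>(paddle::platform::CPUPlace());
  auto *pb = b.mutable_data<float>(paddle::platform::CPUPlace());
  for (int i = 0; i < 4; ++i) pa[i] = pb[i] = 0.5f * i;
  return paddle::inference::CompareTensor(a, b);
}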
paddle/fluid/inference/api/paddle_inference_api.h
...
...
@@ -101,6 +101,40 @@ struct PaddleTensor {
  std::vector<std::vector<size_t>> lod;  // Tensor+LoD equals LoDTensor
};

enum class PaddlePlace { kUNK = -1, kCPU, kGPU };

// Tensor without copy, currently only supported by AnalysisPredictor.
class ZeroCopyTensor {
 public:
  void Reshape(const std::vector<int>& shape);

  // Get the memory in CPU or GPU with the specific data type; call Reshape
  // first to tell the data size. The returned pointer can be written to
  // directly, so this is the way to feed an input tensor.
  template <typename T>
  T* mutable_data(PaddlePlace place);
  // Get the memory directly; the place and memory size are returned through
  // the pointer arguments. This is the way to read an output tensor.
  template <typename T>
  T* data(PaddlePlace* place, int* size);

  std::vector<int64_t> shape();

  void SetLoD(const std::vector<std::vector<size_t>>& x);
  std::vector<std::vector<size_t>> lod() const;

 protected:
  ZeroCopyTensor(void* scope) : scope_{scope} {}
  void SetName(const std::string& name) { name_ = name; }
  void* FindTensor() const;

 private:
  std::string name_;
  bool input_or_output_;
  friend class AnalysisPredictor;
  void* scope_{nullptr};
};
/*
* A simple Inference API for Paddle.
*/
...
...
@@ -120,6 +154,19 @@ class PaddlePredictor {
                   std::vector<PaddleTensor>* output_data,
                   int batch_size = -1) = 0;

  // Zero-copy input and output optimization.
  // Get the input or output tensors and operate on their memory directly,
  // without copies.
  virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
      const std::string& name) {
    return nullptr;
  }
  virtual std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
      const std::string& name) {
    return nullptr;
  }
  virtual bool ZeroCopyRun() { return false; }

  // Clone a predictor that shares the model weights; the cloned predictor
  // should be thread-safe.
  virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
...
...
@@ -218,7 +265,12 @@ struct AnalysisConfig : public NativeConfig {
  IrPassMode ir_mode{IrPassMode::kExclude};
  std::vector<std::string> ir_passes;

  // NOTE this is just for internal development; please don't use it.
  // NOT stable yet.
  bool use_feed_fetch_ops{true};

  // NOTE this is just for internal development; please don't use it.
  // NOT stable yet.
  bool _use_mkldnn{false};
};
...
...
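Taken together, the additions above enable a feed/fetch-free inference loop. A hedged sketch of the intended call sequence; the model path and the tensor names "x" and "y" are placeholders, not taken from a real model:

// End-to-end zero-copy sketch (assumes a model whose input is float[1, 4]).
contrib::AnalysisConfig config;
config.model_dir = "/path/to/model";  // placeholder path
config.use_feed_fetch_ops = false;    // required for ZeroCopyTensor access

auto predictor =
    CreatePaddlePredictor<contrib::AnalysisConfig,
                          PaddleEngineKind::kAnalysis>(config);

auto in = predictor->GetInputTensor("x");  // "x" is a placeholder name
in->Reshape({1, 4});
float *in_data = in->mutable_data<float>(PaddlePlace::kCPU);
for (int i = 0; i < 4; ++i) in_data[i] = 1.0f;  // write the input in place

predictor->ZeroCopyRun();  // runs without feed/fetch operators

auto out = predictor->GetOutputTensor("y");  // "y" is a placeholder name
PaddlePlace place;
int size = 0;
float *out_data = out->data<float>(&place, &size);  // read output in place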
paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
...
...
@@ -18,6 +18,8 @@ namespace paddle {
namespace inference {
namespace analysis {

using contrib::AnalysisConfig;

struct DataRecord {
  std::vector<int64_t> data;
  std::vector<size_t> lod;
...
...
@@ -78,6 +80,7 @@ struct DataRecord {
      }
    }
  }

  DataRecord NextBatch() {
    DataRecord data;
    data.data = batched_datas[batch_iter];
...
...
@@ -155,7 +158,9 @@ TEST(Analyzer_LAC, fuse_statis) {
  SetConfig(&cfg);

  int num_ops;
-  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  auto fuse_statis = GetFuseStatis(
+      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
...
...
paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
...
...
@@ -16,6 +16,7 @@
namespace paddle {
namespace inference {

using contrib::AnalysisConfig;

struct DataRecord {
  std::vector<std::vector<int64_t>> word_data_all, mention_data_all;
...
...
@@ -145,7 +146,9 @@ TEST(Analyzer_Chinese_ner, fuse_statis) {
  SetConfig(&cfg);

  int num_ops;
-  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  auto fuse_statis = GetFuseStatis(
+      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
...
...
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
...
...
@@ -12,12 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_bool(with_precision_check, true, "turn on test");

namespace paddle {
namespace inference {

using namespace framework;  // NOLINT
using namespace contrib;    // NOLINT

struct DataRecord {
  std::vector<std::vector<std::vector<float>>> link_step_data_all;
...
...
@@ -29,10 +33,12 @@ struct DataRecord {
  size_t batch_iter{0};
  size_t batch_size{1};
  DataRecord() = default;

  explicit DataRecord(const std::string &path, int batch_size = 1)
      : batch_size(batch_size) {
    Load(path);
  }

  DataRecord NextBatch() {
    DataRecord data;
    size_t batch_end = batch_iter + batch_size;
...
...
@@ -101,6 +107,7 @@ struct DataRecord {
    num_samples = num_lines;
  }
};

void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
                   int batch_size) {
  PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
...
...
@@ -149,7 +156,55 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
}

void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor,
                           ZeroCopyTensor *cell_init_tensor,
                           ZeroCopyTensor *data_tensor,
                           ZeroCopyTensor *hidden_init_tensor,
                           ZeroCopyTensor *week_tensor,
                           ZeroCopyTensor *minute_tensor,
                           DataRecord *data_record, int batch_size) {
  auto one_batch = data_record->NextBatch();
  std::vector<int> rnn_link_data_shape(
      {static_cast<int>(one_batch.rnn_link_data.size()),
       static_cast<int>(one_batch.rnn_link_data.front().size())});
  lod_attention_tensor->Reshape({1, 2});
  lod_attention_tensor->SetLoD({one_batch.lod1, one_batch.lod2});

  cell_init_tensor->Reshape({batch_size, 15});
  cell_init_tensor->SetLoD({one_batch.lod3});

  hidden_init_tensor->Reshape({batch_size, 15});
  hidden_init_tensor->SetLoD({one_batch.lod3});

  data_tensor->Reshape(rnn_link_data_shape);
  data_tensor->SetLoD({one_batch.lod1});

  week_tensor->Reshape(
      {static_cast<int>(one_batch.rnn_week_datas.size()),
       static_cast<int>(one_batch.rnn_week_datas.front().size())});
  week_tensor->SetLoD({one_batch.lod3});

  minute_tensor->Reshape(
      {static_cast<int>(one_batch.rnn_minute_datas.size()),
       static_cast<int>(one_batch.rnn_minute_datas.front().size())});
  minute_tensor->SetLoD({one_batch.lod3});

  // assign data
  float arr0[] = {0, 0};
  std::vector<float> zeros(batch_size * 15, 0);
  std::copy_n(arr0, 2,
              lod_attention_tensor->mutable_data<float>(PaddlePlace::kCPU));
  std::copy_n(arr0, 2, data_tensor->mutable_data<float>(PaddlePlace::kCPU));
  std::copy_n(zeros.begin(), zeros.size(),
              cell_init_tensor->mutable_data<float>(PaddlePlace::kCPU));
  std::copy_n(zeros.begin(), zeros.size(),
              hidden_init_tensor->mutable_data<float>(PaddlePlace::kCPU));
  ZeroCopyTensorAssignData(data_tensor, one_batch.rnn_link_data);
  ZeroCopyTensorAssignData(week_tensor, one_batch.rnn_week_datas);
  ZeroCopyTensorAssignData(minute_tensor, one_batch.rnn_minute_datas);
}

-void SetConfig(contrib::AnalysisConfig *cfg) {
+void SetConfig(AnalysisConfig *cfg) {
  cfg->prog_file = FLAGS_infer_model + "/__model__";
  cfg->param_file = FLAGS_infer_model + "/param";
  cfg->use_gpu = false;
...
...
@@ -187,7 +242,9 @@ TEST(Analyzer_rnn1, fuse_statis) {
  SetConfig(&cfg);

  int num_ops;
-  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  auto fuse_statis = GetFuseStatis(
+      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
  EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
...
...
@@ -214,7 +271,229 @@ TEST(Analyzer_rnn1, multi_thread) {
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
}

bool CompareTensors(framework::Scope &a_scope, framework::Scope &b_scope,
                    const std::vector<std::string> &tensors) {
  for (auto &x : tensors) {
    auto *a_var = a_scope.FindVar(x);
    auto *b_var = b_scope.FindVar(x);
    if (a_var && b_var) {
      if (a_var->Type() == typeid(framework::LoDTensor) ||
          a_var->Type() == typeid(framework::Tensor)) {
        LOG(INFO) << "comparing tensor " << x;
        auto &a_t = a_var->Get<framework::LoDTensor>();
        auto &b_t = b_var->Get<framework::LoDTensor>();
        if (!inference::CompareTensor(a_t, b_t)) {
          LOG(ERROR) << string::Sprintf("tensor %s not match in two scopes",
                                        x);
        }
      } else {
        LOG(INFO) << "skip no tensor " << x;
      }
    } else {
      LOG(INFO) << "skip tensor " << x;
    }
  }
  return true;
}

// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
// on the complex RNN1 model.
TEST(Analyzer_rnn1, ZeroCopy) {
  AnalysisConfig config;
  SetConfig(&config);
  config.use_feed_fetch_ops = false;

  PaddlePlace place;
  int output_size{0};

  auto predictor =
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config);

  config.use_feed_fetch_ops = true;
  auto native_predictor =
      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);

  config.use_feed_fetch_ops = true;  // the analysis predictor needs feed/fetch.
  auto analysis_predictor =
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config);

#define NEW_TENSOR(name__) \
  auto name__##_tensor = predictor->GetInputTensor(#name__);
  NEW_TENSOR(data_lod_attention);
  NEW_TENSOR(cell_init);
  NEW_TENSOR(data);
  NEW_TENSOR(week);
  NEW_TENSOR(minute);
  NEW_TENSOR(hidden_init);

  // Prepare data for AnalysisPredictor
  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
  PrepareZeroCopyInputs(data_lod_attention_tensor.get(),
                        cell_init_tensor.get(), data_tensor.get(),
                        hidden_init_tensor.get(), week_tensor.get(),
                        minute_tensor.get(), &data, FLAGS_batch_size);

  // Prepare data for NativePredictor
  std::vector<std::vector<PaddleTensor>> native_inputs;
  SetInput(&native_inputs);
  std::vector<PaddleTensor> native_outputs;
  std::vector<PaddleTensor> analysis_outputs;

  auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1");
  // Run analysis predictor

  int num_ops;
  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  ASSERT_EQ(fuse_statis.at("fc_fuse"), 1);
  ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
  ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
  ASSERT_EQ(num_ops, 13);  // After graph optimization, only 13 operators exist.

  Timer timer;
  double total_time{0};
  double native_total_time{0};
  double analysis_total_time{0.};

  for (int i = 0; i < FLAGS_repeat; i++) {
    timer.tic();
    predictor->ZeroCopyRun();
    total_time += timer.toc();
  }

  auto *output_data = output_tensor->data<float>(&place, &output_size);
  ASSERT_GT(output_size, 0);  // more than one output!

  for (int i = 0; i < FLAGS_repeat; i++) {
    // Run native predictor.
    timer.tic();
    ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
    native_total_time += timer.toc();
  }

  for (int i = 0; i < FLAGS_repeat; i++) {
    timer.tic();
    ASSERT_TRUE(
        analysis_predictor->Run(native_inputs.front(), &analysis_outputs));
    analysis_total_time += timer.toc();
  }

  if (!FLAGS_with_precision_check) {
    return;
  }
  int native_output_size = VecReduceToInt(native_outputs.front().shape);

  EXPECT_EQ(native_output_size, output_size);

  // Compare tensors between analysis and zerocopy
  auto *p0 = static_cast<AnalysisPredictor *>(predictor.get());
  auto *p1 = static_cast<AnalysisPredictor *>(analysis_predictor.get());
  auto *p2 = static_cast<NativePaddlePredictor *>(native_predictor.get());

  std::vector<std::string> tensor_names;
  for (auto &var_desc : p0->program().Block(0).AllVars()) {
    tensor_names.push_back(var_desc->Name());
  }

  LOG(INFO) << "Comparing tensors";
  ASSERT_TRUE(
      CompareTensors(*p0->scope(), *p1->scope(), {"final_output.tmp_1"}));
  ASSERT_TRUE(
      CompareTensors(*p0->scope(), *p2->scope(), {"final_output.tmp_1"}));

  LOG(INFO) << "output1 "
            << inference::LoDTensorSummary<float>(
                   p0->scope()
                       ->FindVar("final_output.tmp_1")
                       ->Get<framework::LoDTensor>());
  LOG(INFO) << "output2 "
            << inference::LoDTensorSummary<float>(
                   p1->scope()
                       ->FindVar("final_output.tmp_1")
                       ->Get<framework::LoDTensor>());
  LOG(INFO) << "output3 "
            << inference::LoDTensorSummary<float>(
                   p2->scope()
                       ->FindVar("final_output.tmp_1")
                       ->Get<framework::LoDTensor>());

  for (int i = 0; i < output_size; i++) {
    LOG(INFO) << output_data[i] << " "
              << static_cast<float *>(native_outputs.front().data.data())[i]
              << " "
              << static_cast<float *>(analysis_outputs.front().data.data())[i];
    EXPECT_NEAR(output_data[i],
                static_cast<float *>(native_outputs.front().data.data())[i],
                1e-3);
  }

  LOG(INFO) << "batch_size: " << FLAGS_batch_size;
  LOG(INFO) << "zero average time: "
            << total_time / (FLAGS_repeat * FLAGS_batch_size);
  LOG(INFO) << "analysis average time: "
            << analysis_total_time / (FLAGS_repeat * FLAGS_batch_size);
  LOG(INFO) << "native average time: "
            << native_total_time / (FLAGS_repeat * FLAGS_batch_size);
}

TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
  AnalysisConfig config;
  SetConfig(&config);
  config.use_feed_fetch_ops = false;

#define NEW_TENSOR(name__) \
  auto name__##_tensor = predictor->GetInputTensor(#name__);

  auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
  double total_time_of_threads{0};
  std::vector<std::thread> threads;
  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
  for (int tid = 0; tid < FLAGS_num_threads; tid++) {
    predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
  }

  for (int tid = 0; tid < FLAGS_num_threads; tid++) {
    threads.emplace_back([config, &total_time_of_threads, &predictors, tid] {
      // auto predictor = base_predictor->Clone();
      auto &predictor = predictors[tid];
      NEW_TENSOR(data_lod_attention);
      NEW_TENSOR(cell_init);
      NEW_TENSOR(data);
      NEW_TENSOR(week);
      NEW_TENSOR(minute);
      NEW_TENSOR(hidden_init);

      // Prepare data for AnalysisPredictor
      DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
      Timer timer;
      double total_time{0};

      for (int i = 0; i < FLAGS_repeat; i++) {
        PrepareZeroCopyInputs(data_lod_attention_tensor.get(),
                              cell_init_tensor.get(), data_tensor.get(),
                              hidden_init_tensor.get(), week_tensor.get(),
                              minute_tensor.get(), &data, FLAGS_batch_size);
        timer.tic();
        predictor->ZeroCopyRun();
        total_time += timer.toc();
      }

      total_time_of_threads += total_time;

      LOG(INFO) << "thread time: " << total_time / FLAGS_repeat;
    });
  }

  for (auto &t : threads) {
    t.join();
  }

  LOG(INFO) << "average time: "
            << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
}

}  // namespace inference
...
...
paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
...
...
@@ -182,7 +182,8 @@ TEST(Analyzer_seq_conv1, fuse_statis) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  int num_ops;
-  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  GetFuseStatis(predictor.get(), &num_ops);
}
// Compare result of NativeConfig and AnalysisConfig
...
...
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
...
...
@@ -19,6 +19,7 @@ limitations under the License. */
namespace paddle {
namespace inference {
namespace analysis {

using contrib::AnalysisConfig;

struct Record {
  std::vector<float> data;
...
...
@@ -114,7 +115,8 @@ TEST(Analyzer_vis, fuse_statis) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  int num_ops;
-  GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  GetFuseStatis(predictor.get(), &num_ops);
}
// Compare result of NativeConfig and AnalysisConfig
...
...
paddle/fluid/inference/tests/api/tester_helper.h
...
...
@@ -86,11 +86,9 @@ std::unique_ptr<PaddlePredictor> CreateTestPredictor(
size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }

-std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config,
-                                                   int *num_ops) {
-  auto predictor = CreateTestPredictor(config);
-  AnalysisPredictor *analysis_predictor =
-      dynamic_cast<AnalysisPredictor *>(predictor.get());
+std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
+                                                   int *num_ops) {
+  auto *analysis_predictor = static_cast<AnalysisPredictor *>(predictor);
  auto &fuse_statis = analysis_predictor->analysis_argument()
                          .Get<std::unordered_map<std::string, int>>(
                              framework::ir::kFuseStatisAttr);
...
...
paddle/fluid/operators/detection/generate_proposals_op.cc
...
...
@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h"
...
...
@@ -25,21 +27,17 @@ namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;

-struct AppendProposalsFunctor {
-  LoDTensor *out_;
-  int64_t offset_;
-  Tensor *to_add_;
-
-  AppendProposalsFunctor(LoDTensor *out, int64_t offset, Tensor *to_add)
-      : out_(out), offset_(offset), to_add_(to_add) {}
-
-  template <typename T>
-  void apply() const {
-    auto *out_data = out_->data<T>();
-    auto *to_add_data = to_add_->data<T>();
-    memcpy(out_data + offset_, to_add_data, to_add_->numel() * sizeof(T));
-  }
-};
+static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
+
+static void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
+  auto *out_data = dst->data<void>();
+  auto *to_add_data = src.data<void>();
+  size_t size_of_t = framework::SizeOfType(src.type());
+  offset *= size_of_t;
+  std::memcpy(
+      reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(out_data) + offset),
+      to_add_data, src.numel() * size_of_t);
+}

class GenerateProposalsOp : public framework::OperatorWithKernel {
 public:
...
...
@@ -75,8 +73,9 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
};

template <class T>
-void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
-              Tensor *bbox_deltas, Tensor *variances, Tensor *proposals) {
+static inline void BoxCoder(const platform::DeviceContext &ctx,
+                            Tensor *all_anchors, Tensor *bbox_deltas,
+                            Tensor *variances, Tensor *proposals) {
  T *proposals_data = proposals->mutable_data<T>(ctx.GetPlace());

  int64_t row = all_anchors->dims()[0];
...
...
@@ -108,11 +107,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
                      anchor_center_y;
      bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
                                            bbox_deltas_data[i * len + 2],
-                                        std::log(1000.0 / 16.0))) *
+                                        kBBoxClipDefault)) *
                   anchor_width;
      bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
                                             bbox_deltas_data[i * len + 3],
-                                         std::log(1000.0 / 16.0))) *
+                                         kBBoxClipDefault)) *
                    anchor_height;
    } else {
      bbox_center_x =
...
...
@@ -120,10 +119,10 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
      bbox_center_y =
          bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
      bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
-                                        std::log(1000.0 / 16.0))) *
+                                        kBBoxClipDefault)) *
                   anchor_width;
      bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
-                                         std::log(1000.0 / 16.0))) *
+                                         kBBoxClipDefault)) *
                    anchor_height;
    }
...
...
@@ -136,30 +135,32 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
}

template <class T>
-void ClipTiledBoxes(const platform::DeviceContext &ctx, const Tensor &im_info,
-                    Tensor *boxes) {
+static inline void ClipTiledBoxes(const platform::DeviceContext &ctx,
+                                  const Tensor &im_info, Tensor *boxes) {
  T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
  const T *im_info_data = im_info.data<T>();
+  T zero(0);
  for (int64_t i = 0; i < boxes->numel(); ++i) {
    if (i % 4 == 0) {
      boxes_data[i] =
-          std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f);
+          std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
    } else if (i % 4 == 1) {
      boxes_data[i] =
-          std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f);
+          std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
    } else if (i % 4 == 2) {
      boxes_data[i] =
-          std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f);
+          std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
    } else {
      boxes_data[i] =
-          std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f);
+          std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
    }
  }
}

template <class T>
-void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
-                 float min_size, const Tensor &im_info, Tensor *keep) {
+static inline void FilterBoxes(const platform::DeviceContext &ctx,
+                               Tensor *boxes, float min_size,
+                               const Tensor &im_info, Tensor *keep) {
  const T *im_info_data = im_info.data<T>();
  T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
  T im_scale = im_info_data[2];
...
...
@@ -185,24 +186,24 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
  keep->Resize({keep_len});
}

-bool SortScorePairDescend(const std::pair<float, int> &pair1,
-                          const std::pair<float, int> &pair2) {
-  return pair1.first > pair2.first;
-}
-
template <class T>
-void GetMaxScoreIndex(const std::vector<T> &scores,
-                      std::vector<std::pair<T, int>> *sorted_indices) {
+static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
+    const std::vector<T> &scores) {
+  std::vector<std::pair<T, int>> sorted_indices;
+  sorted_indices.reserve(scores.size());
  for (size_t i = 0; i < scores.size(); ++i) {
-    sorted_indices->push_back(std::make_pair(scores[i], i));
+    sorted_indices.emplace_back(scores[i], i);
  }
  // Sort the score pair according to the scores in descending order
-  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
-                   SortScorePairDescend);
+  std::stable_sort(sorted_indices.begin(), sorted_indices.end(),
+                   [](const std::pair<T, int> &a, const std::pair<T, int> &b) {
+                     return a.first < b.first;
+                   });
+  return sorted_indices;
}
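Note the direction: the replacement sorts ascending, and the refactored NMS below consumes candidates from the back of the vector, so boxes are still visited highest-score-first while removal from the tail becomes O(1) instead of the old erase-at-front. A tiny standalone illustration of the pattern (generic, not Paddle-specific):

#include <algorithm>
#include <utility>
#include <vector>

// Ascending sort + pop_back == descending traversal with O(1) removal.
void VisitDescending(std::vector<std::pair<float, int>> v) {
  std::sort(v.begin(), v.end());  // ascending by score
  while (!v.empty()) {
    int idx = v.back().second;  // highest remaining score
    (void)idx;                  // ... process idx here ...
    v.pop_back();
  }
}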
template <class T>
-T BBoxArea(const T *box, const bool normalized) {
+static inline T BBoxArea(const T *box, bool normalized) {
  if (box[2] < box[0] || box[3] < box[1]) {
    // If coordinate values are invalid
    // (e.g. xmax < xmin or ymax < ymin), return 0.
...
...
@@ -220,7 +221,7 @@ T BBoxArea(const T *box, const bool normalized) {
}

template <class T>
-T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
+static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) {
  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
      box2[3] < box1[1]) {
    return static_cast<T>(0.);
...
...
@@ -229,8 +230,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
  const T inter_ymin = std::max(box1[1], box2[1]);
  const T inter_xmax = std::min(box1[2], box2[2]);
  const T inter_ymax = std::min(box1[3], box2[3]);
-  const T inter_w = std::max(0.0f, inter_xmax - inter_xmin + 1);
-  const T inter_h = std::max(0.0f, inter_ymax - inter_ymin + 1);
+  const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1);
+  const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1);
  const T inter_area = inter_w * inter_h;
  const T bbox1_area = BBoxArea<T>(box1, normalized);
  const T bbox2_area = BBoxArea<T>(box2, normalized);
...
...
@@ -238,9 +239,21 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
  }
}

template <typename T>
static inline Tensor VectorToTensor(const std::vector<T> &selected_indices,
                                    int selected_num) {
  Tensor keep_nms;
  keep_nms.Resize({selected_num});
  auto *keep_data = keep_nms.mutable_data<T>(platform::CPUPlace());
  for (int i = 0; i < selected_num; ++i) {
    keep_data[i] = selected_indices[i];
  }
  return keep_nms;
}

template <class T>
-Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
-           const T nms_threshold, const float eta) {
+static inline Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox,
+                         Tensor *scores, T nms_threshold, float eta) {
  PADDLE_ENFORCE_NOT_NULL(bbox);
  int64_t num_boxes = bbox->dims()[0];
// 4: [xmin ymin xmax ymax]
...
...
@@ -248,20 +261,18 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
  std::vector<T> scores_data(num_boxes);
  std::copy_n(scores->data<T>(), num_boxes, scores_data.begin());
-  std::vector<std::pair<T, int>> sorted_indices;
-  GetMaxScoreIndex<T>(scores_data, &sorted_indices);
+  std::vector<std::pair<T, int>> sorted_indices =
+      GetSortedScoreIndex<T>(scores_data);

  std::vector<int> selected_indices;
  int selected_num = 0;
  T adaptive_threshold = nms_threshold;
  const T *bbox_data = bbox->data<T>();
-  bool flag;
  while (sorted_indices.size() != 0) {
-    int idx = sorted_indices.front().second;
-    flag = true;
-    for (size_t k = 0; k < selected_indices.size(); ++k) {
+    int idx = sorted_indices.back().second;
+    bool flag = true;
+    for (int kept_idx : selected_indices) {
      if (flag) {
-        const int kept_idx = selected_indices[k];
        T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
                                      bbox_data + kept_idx * box_size, false);
        flag = (overlap <= adaptive_threshold);
...
...
@@ -271,32 +282,29 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
    }
    if (flag) {
      selected_indices.push_back(idx);
-      selected_num++;
+      ++selected_num;
    }
-    sorted_indices.erase(sorted_indices.begin());
+    sorted_indices.erase(sorted_indices.end() - 1);
    if (flag && eta < 1 && adaptive_threshold > 0.5) {
      adaptive_threshold *= eta;
    }
  }
-  Tensor keep_nms;
-  keep_nms.Resize({selected_num});
-  int *keep_data = keep_nms.mutable_data<int>(ctx.GetPlace());
-  for (int i = 0; i < selected_num; ++i) {
-    keep_data[i] = selected_indices[i];
-  }
-  return keep_nms;
+  return VectorToTensor(selected_indices, selected_num);
}

-template <typename DeviceContext, typename T>
+template <typename T>
class GenerateProposalsKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto *scores = context.Input<Tensor>("Scores");
    auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
    auto *im_info = context.Input<Tensor>("ImInfo");
-    auto *anchors = context.Input<Tensor>("Anchors");
-    auto *variances = context.Input<Tensor>("Variances");
+    auto anchors = detail::Ref(context.Input<Tensor>("Anchors"),
+                               "Cannot find input Anchors(%s) in scope",
+                               context.Inputs("Anchors")[0]);
+    auto variances = detail::Ref(context.Input<Tensor>("Variances"),
+                                 "Cannot find input Variances(%s) in scope",
+                                 context.Inputs("Variances")[0]);
    auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
    auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
...
...
@@ -307,15 +315,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
    float min_size = context.Attr<float>("min_size");
    float eta = context.Attr<float>("eta");

-    auto &dev_ctx = context.template device_context<DeviceContext>();
+    auto &dev_ctx =
+        context.template device_context<platform::CPUDeviceContext>();

-    auto scores_dim = scores->dims();
+    auto &scores_dim = scores->dims();
    int64_t num = scores_dim[0];
    int64_t c_score = scores_dim[1];
    int64_t h_score = scores_dim[2];
    int64_t w_score = scores_dim[3];

-    auto bbox_dim = bbox_deltas->dims();
+    auto &bbox_dim = bbox_deltas->dims();
    int64_t c_bbox = bbox_dim[1];
    int64_t h_bbox = bbox_dim[2];
    int64_t w_bbox = bbox_dim[3];
...
...
@@ -330,17 +339,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
    scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
                                dev_ctx.GetPlace());

-    math::Transpose<DeviceContext, T, 4> trans;
+    math::Transpose<platform::CPUDeviceContext, T, 4> trans;
    std::vector<int> axis = {0, 2, 3, 1};
    trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
    trans(dev_ctx, *scores, &scores_swap, axis);

    framework::LoD lod;
-    std::vector<size_t> lod0(1, 0);
-    Tensor *anchor = const_cast<framework::Tensor *>(anchors);
-    anchor->Resize({anchors->numel() / 4, 4});
-    Tensor *var = const_cast<framework::Tensor *>(variances);
-    var->Resize({var->numel() / 4, 4});
+    lod.resize(1);
+    auto &lod0 = lod[0];
+    lod0.push_back(0);
+    anchors.Resize({anchors.numel() / 4, 4});
+    variances.Resize({variances.numel() / 4, 4});

    int64_t num_proposals = 0;
    for (int64_t i = 0; i < num; ++i) {
...
@@ -352,24 +361,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
scores_slice
.
Resize
({
h_score
*
w_score
*
c_score
,
1
});
std
::
pair
<
Tensor
,
Tensor
>
tensor_pair
=
ProposalForOneImage
(
dev_ctx
,
im_info_slice
,
*
anchor
,
*
var
,
ProposalForOneImage
(
dev_ctx
,
im_info_slice
,
anchors
,
variances
,
bbox_deltas_slice
,
scores_slice
,
pre_nms_top_n
,
post_nms_top_n
,
nms_thresh
,
min_size
,
eta
);
Tensor
proposals
=
tensor_pair
.
first
;
Tensor
scores
=
tensor_pair
.
second
;
framework
::
VisitDataType
(
framework
::
ToDataType
(
rpn_rois
->
type
()),
AppendProposalsFunctor
(
rpn_rois
,
4
*
num_proposals
,
&
proposals
));
framework
::
VisitDataType
(
framework
::
ToDataType
(
rpn_roi_probs
->
type
()),
AppendProposalsFunctor
(
rpn_roi_probs
,
num_proposals
,
&
scores
));
Tensor
&
proposals
=
tensor_pair
.
first
;
Tensor
&
scores
=
tensor_pair
.
second
;
AppendProposals
(
rpn_rois
,
4
*
num_proposals
,
proposals
);
AppendProposals
(
rpn_roi_probs
,
num_proposals
,
scores
);
num_proposals
+=
proposals
.
dims
()[
0
];
lod0
.
emplace
_back
(
num_proposals
);
lod0
.
push
_back
(
num_proposals
);
}
lod
.
emplace_back
(
lod0
);
rpn_rois
->
set_lod
(
lod
);
rpn_roi_probs
->
set_lod
(
lod
);
rpn_rois
->
Resize
({
num_proposals
,
4
});
...
...
@@ -377,7 +379,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
  }

  std::pair<Tensor, Tensor> ProposalForOneImage(
-      const DeviceContext &ctx, const Tensor &im_info_slice,
+      const platform::CPUDeviceContext &ctx, const Tensor &im_info_slice,
      const Tensor &anchors, const Tensor &variances,
      const Tensor &bbox_deltas_slice,  // [M, 4]
      const Tensor &scores_slice,       // [N, 1]
...
...
@@ -392,10 +394,9 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
    for (int i = 0; i < scores_slice.numel(); ++i) {
      index[i] = i;
    }
-    std::function<bool(const int64_t &, const int64_t &)> compare =
-        [scores_data](const int64_t &i, const int64_t &j) {
-          return scores_data[i] > scores_data[j];
-        };
+    auto compare = [scores_data](const int64_t &i, const int64_t &j) {
+      return scores_data[i] > scores_data[j];
+    };

    if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) {
      std::sort(index, index + scores_slice.numel(), compare);
...
...
@@ -469,12 +470,12 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
Generate Proposals OP

This operator proposes rois according to each box with their probability to be a foreground object and
-the box can be calculated by anchors. Bbox_deltais and scores are the output of RPN. Final proposals
+the box can be calculated by anchors. Bbox_details and scores are the output of RPN. Final proposals
could be used to train detection net.

Scores is the probability for each box to be an object. In format of (N, A, H, W) where N is batch size, A is number
of anchors, H and W are height and width of the feature map.
-BboxDeltas is the differece between predicted box locatoin and anchor location. In format of (N, 4*A, H, W)
+BboxDeltas is the differece between predicted box location and anchor location. In format of (N, 4*A, H, W)

For generating proposals, this operator transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) and
calculate box locations as proposals candidates. Then clip boxes to image and remove predicted boxes with small area.
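The decoding step described above can be summarized outside the kernel. A minimal scalar sketch of the variance-free case, where the exp() argument is clamped to kBBoxClipDefault = log(1000/16) so the decoded width/height stay finite:

#include <algorithm>
#include <cmath>

// Hedged sketch of the anchor-delta decoding used by BoxCoder (no variances).
// Boxes are [xmin, ymin, xmax, ymax].
void DecodeBox(const float anchor[4], const float delta[4], float out[4]) {
  const float kClip = std::log(1000.0f / 16.0f);  // mirrors kBBoxClipDefault
  float w = anchor[2] - anchor[0] + 1, h = anchor[3] - anchor[1] + 1;
  float cx = anchor[0] + 0.5f * w, cy = anchor[1] + 0.5f * h;
  float dcx = cx + delta[0] * w;  // shift the center
  float dcy = cy + delta[1] * h;
  float dw = std::exp(std::min(delta[2], kClip)) * w;  // clipped rescale
  float dh = std::exp(std::min(delta[3], kClip)) * h;
  out[0] = dcx - 0.5f * dw;
  out[1] = dcy - 0.5f * dh;
  out[2] = dcx + 0.5f * dw - 1;
  out[3] = dcy + 0.5f * dh - 1;
}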
...
...
@@ -490,6 +491,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(generate_proposals, ops::GenerateProposalsOp,
                  ops::GenerateProposalsOpMaker,
                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    generate_proposals,
-    ops::GenerateProposalsKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(generate_proposals, ops::GenerateProposalsKernel<float>,
+                       ops::GenerateProposalsKernel<double>);
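For orientation, the greedy adaptive-threshold NMS that the refactor above preserves can be sketched independently of the Tensor plumbing (iou() stands in for JaccardOverlap; the candidate list is sorted ascending by score, as GetSortedScoreIndex produces):

#include <array>
#include <utility>
#include <vector>

// Hedged sketch of the NMS loop: greedy selection from the highest score,
// with the overlap threshold decaying by eta after each kept box.
std::vector<int> GreedyNMS(const std::vector<std::array<float, 4>> &boxes,
                           std::vector<std::pair<float, int>> sorted,
                           float threshold, float eta,
                           float (*iou)(const float *, const float *)) {
  std::vector<int> kept;
  float adaptive = threshold;
  while (!sorted.empty()) {
    int idx = sorted.back().second;  // best remaining score
    sorted.pop_back();
    bool keep = true;
    for (int k : kept) {
      if (iou(boxes[idx].data(), boxes[k].data()) > adaptive) {
        keep = false;
        break;
      }
    }
    if (keep) {
      kept.push_back(idx);
      if (eta < 1 && adaptive > 0.5f) adaptive *= eta;  // decay threshold
    }
  }
  return kept;
}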
paddle/fluid/operators/detection/generate_proposals_op.cu
...
...
@@ -19,8 +19,10 @@ limitations under the License. */
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
...
...
@@ -37,21 +39,25 @@ namespace {
int const kThreadsPerBlock = sizeof(uint64_t) * 8;

-template <typename T>
-__global__ void RangeInitKernel(const T start, const T delta, const int size,
-                                T *out) {
-  CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; }
-}
+static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
+
+struct RangeInitFunctor {
+  int start_;
+  int delta_;
+  int *out_;
+  __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; }
+};

template <typename T>
-void SortDescending(const platform::CUDADeviceContext &ctx,
-                    const Tensor &value, Tensor *value_out,
-                    Tensor *index_out) {
-  int num = value.numel();
+static void SortDescending(const platform::CUDADeviceContext &ctx,
+                           const Tensor &value, Tensor *value_out,
+                           Tensor *index_out) {
+  int num = static_cast<int>(value.numel());
  Tensor index_in_t;
  int *idx_in = index_in_t.mutable_data<int>({num}, ctx.GetPlace());
-  int block = 512;
-  auto stream = ctx.stream();
-  RangeInitKernel<<<DIVUP(num, block), block, 0, stream>>>(0, 1, num, idx_in);
+  platform::ForRange<platform::CUDADeviceContext> for_range(ctx, num);
+  for_range(RangeInitFunctor{0, 1, idx_in});

  int *idx_out = index_out->mutable_data<int>({num}, ctx.GetPlace());
  const T *keys_in = value.data<T>();
...
...
@@ -73,22 +79,27 @@ void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value,
}

-template <typename T>
-__device__ __forceinline__ T Min(T x, T y) {
-  return x < y ? x : y;
-}
-
-template <typename T>
-__device__ __forceinline__ T Max(T x, T y) {
-  return x > y ? x : y;
-}
-
template <typename T>
-__global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
-                                       const T *var, const int *index,
-                                       const T *im_info, const int num,
-                                       T *proposals) {
-  T kBBoxClipDefault = log(1000.0 / 16.0);
-  CUDA_1D_KERNEL_LOOP(i, num) {
+struct BoxDecodeAndClipFunctor {
+  const T *anchor;
+  const T *deltas;
+  const T *var;
+  const int *index;
+  const T *im_info;
+
+  T *proposals;
+
+  BoxDecodeAndClipFunctor(const T *anchor, const T *deltas, const T *var,
+                          const int *index, const T *im_info, T *proposals)
+      : anchor(anchor),
+        deltas(deltas),
+        var(var),
+        index(index),
+        im_info(im_info),
+        proposals(proposals) {}
+
+  T bbox_clip_default{static_cast<T>(kBBoxClipDefault)};
+
+  __device__ void operator()(size_t i) {
    int k = index[i] * 4;
    T axmin = anchor[k];
    T aymin = anchor[k + 1];
...
@@ -105,17 +116,17 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
    T dxmax = deltas[k + 2];
    T dymax = deltas[k + 3];

-    T d_cx = 0., d_cy = 0., d_w = 0., d_h = 0.;
+    T d_cx, d_cy, d_w, d_h;
    if (var) {
      d_cx = cx + dxmin * w * var[k];
      d_cy = cy + dymin * h * var[k + 1];
-      d_w = exp(Min<T>(dxmax * var[k + 2], kBBoxClipDefault)) * w;
-      d_h = exp(Min<T>(dymax * var[k + 3], kBBoxClipDefault)) * h;
+      d_w = exp(Min(dxmax * var[k + 2], bbox_clip_default)) * w;
+      d_h = exp(Min(dymax * var[k + 3], bbox_clip_default)) * h;
    } else {
      d_cx = cx + dxmin * w;
      d_cy = cy + dymin * h;
-      d_w = exp(Min<T>(dxmax, kBBoxClipDefault)) * w;
-      d_h = exp(Min<T>(dymax, kBBoxClipDefault)) * h;
+      d_w = exp(Min(dxmax, bbox_clip_default)) * w;
+      d_h = exp(Min(dymax, bbox_clip_default)) * h;
    }

    T oxmin = d_cx - d_w * 0.5;
...
...
@@ -123,17 +134,21 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
    T oxmax = d_cx + d_w * 0.5 - 1.;
    T oymax = d_cy + d_h * 0.5 - 1.;

-    proposals[i * 4] = Max<T>(Min<T>(oxmin, im_info[1] - 1.), 0.);
-    proposals[i * 4 + 1] = Max<T>(Min<T>(oymin, im_info[0] - 1.), 0.);
-    proposals[i * 4 + 2] = Max<T>(Min<T>(oxmax, im_info[1] - 1.), 0.);
-    proposals[i * 4 + 3] = Max<T>(Min<T>(oymax, im_info[0] - 1.), 0.);
+    proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.);
+    proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.);
+    proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.);
+    proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.);
  }
+
+  __device__ __forceinline__ T Min(T a, T b) const { return a > b ? b : a; }
+
+  __device__ __forceinline__ T Max(T a, T b) const { return a > b ? a : b; }
+};

template <typename T, int BlockSize>
-__global__ void FilterBBoxes(const T *bboxes, const T *im_info,
-                             const T min_size, const int num, int *keep_num,
-                             int *keep) {
+static __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
+                                    const T min_size, const int num,
+                                    int *keep_num, int *keep) {
  T im_h = im_info[0];
  T im_w = im_info[1];
  T im_scale = im_info[2];
...
...
@@ -178,7 +193,7 @@ __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
  }
}

-__device__ inline float IoU(const float *a, const float *b) {
+static __device__ inline float IoU(const float *a, const float *b) {
  float left = max(a[0], b[0]), right = min(a[2], b[2]);
  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
...
...
@@ -188,8 +203,9 @@ __device__ inline float IoU(const float *a, const float *b) {
  return inter_s / (s_a + s_b - inter_s);
}

-__global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
-                          const float *dev_boxes, uint64_t *dev_mask) {
+static __global__ void NMSKernel(const int n_boxes,
+                                 const float nms_overlap_thresh,
+                                 const float *dev_boxes, uint64_t *dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;
...
...
@@ -231,9 +247,9 @@ __global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
}

template <typename T>
-void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
-         const Tensor &sorted_indices, const T nms_threshold,
-         Tensor *keep_out) {
+static void NMS(const platform::CUDADeviceContext &ctx,
+                const Tensor &proposals, const Tensor &sorted_indices,
+                const T nms_threshold, Tensor *keep_out) {
  int boxes_num = proposals.dims()[0];
  PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]);
...
...
@@ -244,14 +260,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
  const T *boxes = proposals.data<T>();
  auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-  int size_bytes = boxes_num * col_blocks * sizeof(uint64_t);
-  auto d_mask_allocation = memory::Alloc(place, size_bytes);
-  uint64_t *d_mask = reinterpret_cast<uint64_t *>(d_mask_allocation->ptr());
-  NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes, d_mask);
-  auto h_mask_allocation = memory::Alloc(platform::CPUPlace(), size_bytes);
-  uint64_t *h_mask = reinterpret_cast<uint64_t *>(h_mask_allocation->ptr());
-  memory::Copy(platform::CPUPlace(), h_mask, place, d_mask, size_bytes, 0);
+  framework::Vector<uint64_t> mask(boxes_num * col_blocks);
+  NMSKernel<<<blocks, threads>>>(
+      boxes_num, nms_threshold, boxes,
+      mask.CUDAMutableData(boost::get<platform::CUDAPlace>(ctx.GetPlace())));

  std::vector<uint64_t> remv(col_blocks);
  memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
...
...
@@ -265,7 +277,7 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
    if (!(remv[nblock] & (1ULL << inblock))) {
      ++num_to_keep;
      keep_vec.push_back(i);
-      uint64_t *p = &h_mask[0] + i * col_blocks;
+      uint64_t *p = &mask[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
...
...
@@ -274,12 +286,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
  int *keep = keep_out->mutable_data<int>({num_to_keep}, ctx.GetPlace());
  memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(),
               sizeof(int) * num_to_keep, 0);
-  memory::Free(place, d_mask);
-  memory::Free(platform::CPUPlace(), h_mask);
}

template <typename T>
-std::pair<Tensor, Tensor> ProposalForOneImage(
+static std::pair<Tensor, Tensor> ProposalForOneImage(
    const platform::CUDADeviceContext &ctx, const Tensor &im_info,
    const Tensor &anchors, const Tensor &variances,
    const Tensor &bbox_deltas,  // [M, 4]
...
...
@@ -298,18 +308,20 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
  // 2. box decode and clipping
  Tensor proposals;
  proposals.mutable_data<T>({pre_nms_num, 4}, ctx.GetPlace());
-  int block = 512;
-  auto stream = ctx.stream();
-  BoxDecodeAndClipKernel<T><<<DIVUP(pre_nms_num, block), block, 0, stream>>>(
-      anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
-      index_sort.data<int>(), im_info.data<T>(), pre_nms_num,
-      proposals.data<T>());
+
+  {
+    platform::ForRange<platform::CUDADeviceContext> for_range(ctx,
+                                                              pre_nms_num);
+    for_range(BoxDecodeAndClipFunctor<T>{
+        anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
+        index_sort.data<int>(), im_info.data<T>(), proposals.data<T>()});
+  }

  // 3. filter
  Tensor keep_index, keep_num_t;
  keep_index.mutable_data<int>({pre_nms_num}, ctx.GetPlace());
  keep_num_t.mutable_data<int>({1}, ctx.GetPlace());
  min_size = std::max(min_size, 1.0f);
+  auto stream = ctx.stream();
  FilterBBoxes<T, 512><<<1, 512, 0, stream>>>(
      proposals.data<T>(), im_info.data<T>(), min_size, pre_nms_num,
      keep_num_t.data<int>(), keep_index.data<int>());
...
...
@@ -353,8 +365,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
    auto *scores = context.Input<Tensor>("Scores");
    auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
    auto *im_info = context.Input<Tensor>("ImInfo");
-    auto *anchors = context.Input<Tensor>("Anchors");
-    auto *variances = context.Input<Tensor>("Variances");
+    auto anchors = detail::Ref(context.Input<Tensor>("Anchors"),
+                               "Cannot find input Anchors(%s) in scope",
+                               context.Inputs("Anchors")[0]);
+    auto variances = detail::Ref(context.Input<Tensor>("Variances"),
+                                 "Cannot find input Variances(%s) in scope",
+                                 context.Inputs("Variances")[0]);
    auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
    auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
...
...
@@ -390,10 +406,8 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
    trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
    trans(dev_ctx, *scores, &scores_swap, axis);

-    Tensor *anchor = const_cast<framework::Tensor *>(anchors);
-    anchor->Resize({anchors->numel() / 4, 4});
-    Tensor *var = const_cast<framework::Tensor *>(variances);
-    var->Resize({var->numel() / 4, 4});
+    anchors.Resize({anchors.numel() / 4, 4});
+    variances.Resize({variances.numel() / 4, 4});

    rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
                              context.GetPlace());
...
...
@@ -402,7 +416,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
    T *rpn_rois_data = rpn_rois->data<T>();
    T *rpn_roi_probs_data = rpn_roi_probs->data<T>();

-    auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
+    auto &place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());

    int64_t num_proposals = 0;
    std::vector<size_t> offset(1, 0);
...
...
@@ -415,12 +429,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
      scores_slice.Resize({h_score * w_score * c_score, 1});

      std::pair<Tensor, Tensor> box_score_pair =
-          ProposalForOneImage<T>(dev_ctx, im_info_slice, *anchor, *var,
+          ProposalForOneImage<T>(dev_ctx, im_info_slice, anchors, variances,
                                 bbox_deltas_slice, scores_slice,
                                 pre_nms_top_n, post_nms_top_n, nms_thresh,
                                 min_size, eta);

-      Tensor proposals = box_score_pair.first;
-      Tensor scores = box_score_pair.second;
+      Tensor &proposals = box_score_pair.first;
+      Tensor &scores = box_score_pair.second;

      memory::Copy(place, rpn_rois_data + num_proposals * 4, place,
                   proposals.data<T>(), sizeof(T) * proposals.numel(), 0);
...
...
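The pattern recurring through this .cu diff is replacing hand-rolled __global__ kernels with platform::ForRange plus a device functor (RangeInitFunctor, BoxDecodeAndClipFunctor). A hedged, generic sketch of that pattern; SaxpyFunctor and Saxpy are hypothetical names, not part of the diff:

#include "paddle/fluid/platform/for_range.h"

// Hypothetical functor: y[i] = a * x[i] + y[i].
struct SaxpyFunctor {
  float a_;
  const float *x_;
  float *y_;
  __device__ void operator()(size_t i) { y_[i] = a_ * x_[i] + y_[i]; }
};

void Saxpy(const paddle::platform::CUDADeviceContext &ctx, float a,
           const float *x, float *y, int n) {
  // ForRange chooses the launch configuration and invokes the functor once
  // per index in [0, n), so the call site carries no <<<grid, block>>> math.
  paddle::platform::ForRange<paddle::platform::CUDADeviceContext> for_range(
      ctx, n);
  for_range(SaxpyFunctor{a, x, y});
}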
paddle/fluid/operators/gather.h
...
...
@@ -39,11 +39,9 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
  // check index of shape 1-D
  PADDLE_ENFORCE(index.dims().size() == 1);
-  int index_size = index.dims()[0];
+  int64_t index_size = index.dims()[0];

  auto src_dims = src.dims();
-  framework::DDim output_dims(src_dims);
-  output_dims[0] = index_size;

  const T *p_src = src.data<T>();
  const int *p_index = index.data<int>();
...
...
@@ -55,7 +53,7 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
  const size_t slice_bytes = slice_size * sizeof(T);

-  for (int i = 0; i < index_size; ++i) {
+  for (int64_t i = 0; i < index_size; ++i) {
    int index_ = p_index[i];
    memcpy(p_output + i * slice_size, p_src + index_ * slice_size,
           slice_bytes);
  }
...
...
paddle/fluid/string/pretty_log.h
View file @ a6fbf7ec
...
...
@@ -56,13 +56,13 @@ struct Style {
};

template <typename... Args>
static void PrettyLogEndl(const std::string &style, const char *fmt,
                          const Args &... args) {
  std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl;
}

template <typename... Args>
static void PrettyLog(const std::string &style, const char *fmt,
                      const Args &... args) {
  std::cerr << style << Sprintf(fmt, args...) << reset();
}
...
...
python/CMakeLists.txt
View file @ a6fbf7ec
...
...
@@ -87,6 +87,7 @@ if (WITH_TESTING)
    endif()
  endif()
  add_subdirectory(paddle/fluid/tests)
+ add_subdirectory(paddle/fluid/contrib/tests)
endif()

install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
        DESTINATION opt/paddle/share/wheels
...
python/paddle/fluid/contrib/__init__.py
View file @ a6fbf7ec
...
...
@@ -20,8 +20,11 @@ from . import memory_usage_calc
from .memory_usage_calc import *
from . import op_frequence
from .op_frequence import *
+from . import quantize
+from .quantize import *

__all__ = []
__all__ += decoder.__all__
__all__ += memory_usage_calc.__all__
__all__ += op_frequence.__all__
+__all__ += quantize.__all__
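With these re-exports in place, the quantize API should also become reachable from the contrib package itself, assuming QuantizeTranspiler is listed in quantize_transpiler.__all__ (that file's diff is collapsed below):

    # Equivalent to importing from paddle.fluid.contrib.quantize.quantize_transpiler,
    # provided the symbol is exported via __all__ as the re-export chain suggests.
    from paddle.fluid.contrib import QuantizeTranspiler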
python/paddle/fluid/contrib/quantize/__init__.py
0 → 100644
View file @ a6fbf7ec
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

from . import quantize_transpiler
from .quantize_transpiler import *

__all__ = quantize_transpiler.__all__
python/paddle/fluid/contrib/quantize/quantize_transpiler.py
0 → 100644
View file @ a6fbf7ec
This diff is collapsed. Click to expand.
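Since the new quantize_transpiler.py is collapsed here, the workflow it implements can only be pieced together from the test added below. As a rough orientation, a call-sequence sketch under that assumption (train_program, test_program, and place are placeholders, not names from the collapsed file):

    import paddle.fluid as fluid
    from paddle.fluid.contrib.quantize import QuantizeTranspiler

    # Rewrite a training program with fake quantize/dequantize ops
    # for quantization-aware training.
    t = QuantizeTranspiler(activation_quantize_type='abs_max')
    t.training_transpile(train_program)
    # ... train as usual ...
    # Fold the fake ops for inference (weights still float), then
    # store conv/fc weights as int8.
    t.freeze_program(test_program, place)
    t.convert_to_int8(test_program, place)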
python/paddle/fluid/contrib/tests/CMakeLists.txt
0 → 100644
View file @ a6fbf7ec
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")

foreach(src ${TEST_OPS})
    py_test(${src} SRCS ${src}.py)
endforeach()
python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
0 → 100644
View file @ a6fbf7ec
# copyright (c) 2018 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import numpy as np
import six
import unittest
import paddle
import paddle.fluid as fluid
from paddle.fluid.contrib.quantize.quantize_transpiler import _original_var_name
from paddle.fluid.contrib.quantize.quantize_transpiler import QuantizeTranspiler


def linear_fc(num):
    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    hidden = data
    for _ in six.moves.xrange(num):
        hidden = fluid.layers.fc(hidden, size=128, act='relu')
    loss = fluid.layers.cross_entropy(input=hidden, label=label)
    loss = fluid.layers.mean(loss)
    return loss


def residual_block(num):
    def conv_bn_layer(input,
                      ch_out,
                      filter_size,
                      stride,
                      padding,
                      act='relu',
                      bias_attr=False):
        tmp = fluid.layers.conv2d(
            input=input,
            filter_size=filter_size,
            num_filters=ch_out,
            stride=stride,
            padding=padding,
            act=None,
            bias_attr=bias_attr)
        return fluid.layers.batch_norm(input=tmp, act=act)

    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    hidden = data
    for _ in six.moves.xrange(num):
        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
    fc = fluid.layers.fc(input=hidden, size=10)
    loss = fluid.layers.cross_entropy(input=fc, label=label)
    loss = fluid.layers.mean(loss)
    return loss


def conv_net(img, label):
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=img,
        filter_size=5,
        num_filters=20,
        pool_size=2,
        pool_stride=2,
        act="relu")
    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
        num_filters=50,
        pool_size=2,
        pool_stride=2,
        act="relu")
    prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_loss = fluid.layers.mean(loss)
    return avg_loss


class TestQuantizeTranspiler(unittest.TestCase):
    def setUp(self):
        # since quant_op and dequant_op is not ready, use cos and sin for test
        self.weight_quant_op_type = 'fake_quantize_abs_max'
        self.dequant_op_type = 'fake_dequantize_max_abs'
        self.quantizable_op_and_inputs = {
            'conv2d': ['Input', 'Filter'],
            'depthwise_conv2d': ['Input', 'Filter'],
            'mul': ['X', 'Y']
        }
        self.quantizable_op_grad_and_inputs = {
            'conv2d_grad': ['Input', 'Filter'],
            'depthwise_conv2d_grad': ['Input', 'Filter'],
            'mul_grad': ['X', 'Y']
        }

    def check_program(self, program):
        quantized_ops = {}
        persistable_vars = [
            v.name
            for v in filter(lambda var: var.persistable, program.list_vars())
        ]
        for block in program.blocks:
            for idx, op in enumerate(block.ops):
                # check forward
                if op.type in self.quantizable_op_and_inputs:
                    for i, arg_name in enumerate(op.input_arg_names):
                        quant_op_type = self.weight_quant_op_type if \
                            _original_var_name(arg_name) \
                            in persistable_vars else self.act_quant_op_type
                        self.assertTrue(
                            arg_name.endswith('.quantized.dequantized'))
                        if arg_name not in quantized_ops:
                            self.assertEqual(block.ops[idx - 2 * i - 1].type,
                                             self.dequant_op_type)
                            self.assertEqual(block.ops[idx - 2 * i - 2].type,
                                             quant_op_type)
                            quantized_ops[arg_name] = block.ops[idx - 2 * i - 2]
                        else:
                            op_idx = block.ops.index(quantized_ops[arg_name])
                            self.assertLess(op_idx, idx)

                # check backward
                if op.type in self.quantizable_op_grad_and_inputs:
                    for pname in self.quantizable_op_grad_and_inputs[op.type]:
                        arg_name = op.input(pname)[0]
                        self.assertTrue(
                            arg_name.endswith('.quantized.dequantized'))
                        self.assertTrue(arg_name in quantized_ops)

    def linear_fc_quant(self, quant_type):
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(main, startup):
            loss = linear_fc(3)
            opt = fluid.optimizer.Adam(learning_rate=0.001)
            opt.minimize(loss)
        t = QuantizeTranspiler(activation_quantize_type=quant_type)
        t.training_transpile(main)
        self.check_program(main)

    def test_linear_fc_quant_abs_max(self):
        self.act_quant_op_type = 'fake_quantize_abs_max'
        self.linear_fc_quant('abs_max')

    def test_linear_fc_quant_range_abs_max(self):
        self.act_quant_op_type = 'fake_quantize_range_abs_max'
        self.linear_fc_quant('range_abs_max')

    def residual_block_quant(self, quant_type):
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(main, startup):
            loss = residual_block(2)
            opt = fluid.optimizer.Adam(learning_rate=0.001)
            opt.minimize(loss)
        t = QuantizeTranspiler(activation_quantize_type=quant_type)
        t.training_transpile(main)
        self.check_program(main)

    def test_residual_block_abs_max(self):
        self.act_quant_op_type = 'fake_quantize_abs_max'
        self.residual_block_quant('abs_max')

    def test_residual_block_range_abs_max(self):
        self.act_quant_op_type = 'fake_quantize_range_abs_max'
        self.residual_block_quant('range_abs_max')

    def freeze_program(self, use_cuda):
        def build_program(main, startup, is_test):
            with fluid.unique_name.guard():
                with fluid.program_guard(main, startup):
                    img = fluid.layers.data(
                        name='image', shape=[1, 28, 28], dtype='float32')
                    label = fluid.layers.data(
                        name='label', shape=[1], dtype='int64')
                    loss = conv_net(img, label)
                    if not is_test:
                        opt = fluid.optimizer.Adam(learning_rate=0.001)
                        opt.minimize(loss)
            return [img, label], loss

        main = fluid.Program()
        startup = fluid.Program()
        test_program = fluid.Program()

        feeds, loss = build_program(main, startup, False)
        build_program(test_program, startup, True)
        test_program = test_program.clone(for_test=True)

        quant_transpiler = QuantizeTranspiler()
        quant_transpiler.training_transpile(main)
        quant_transpiler.training_transpile(test_program)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)

        iter = 5
        batch_size = 8
        class_num = 10
        exe.run(startup)

        train_reader = paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.mnist.train(), buf_size=500),
            batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)
        feeder = fluid.DataFeeder(feed_list=feeds, place=place)

        with fluid.program_guard(main):
            for _ in range(iter):
                data = next(train_reader())
                loss_v = exe.run(program=main,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])

        with fluid.program_guard(test_program):
            test_data = next(test_reader())
            w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
                                             test_program)
            # Testing during training
            test_loss1, w_quant = exe.run(program=test_program,
                                          feed=feeder.feed(test_data),
                                          fetch_list=[loss, w_var])

            # Freeze program for inference, but the weight of fc/conv is still float type.
            quant_transpiler.freeze_program(test_program, place)
            test_loss2, = exe.run(program=test_program,
                                  feed=feeder.feed(test_data),
                                  fetch_list=[loss])
            self.assertAlmostEqual(test_loss1, test_loss2, delta=1e-3)
            w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0')
                                .get_tensor())
            self.assertEqual(np.sum(w_freeze), np.sum(w_quant))

            # Convert parameter to 8-bit.
            quant_transpiler.convert_to_int8(test_program, place)
            # Save the 8-bit parameter and model file.
            fluid.io.save_inference_model('model_8bit', ['image', 'label'],
                                          [loss], exe, test_program)
            # Test whether the 8-bit parameter and model file can be loaded successfully.
            [infer, feed, fetch] = fluid.io.load_inference_model('model_8bit',
                                                                 exe)
            # Check the loaded 8-bit weight.
            w_8bit = np.array(fluid.global_scope().find_var('conv2d_1.w_0.int8')
                              .get_tensor())

            self.assertEqual(w_8bit.dtype, np.int8)
            self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))

    def test_freeze_program_cuda(self):
        if fluid.core.is_compiled_with_cuda():
            with fluid.unique_name.guard():
                self.freeze_program(True)

    def test_freeze_program_cpu(self):
        with fluid.unique_name.guard():
            self.freeze_program(False)


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/CMakeLists.txt
View file @ a6fbf7ec
...
...
@@ -28,7 +28,6 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl
list(REMOVE_ITEM TEST_OPS op_test)     # op_test is a helper python file, not a test
list(REMOVE_ITEM TEST_OPS decorators)  # decorators is a helper python file, not a test

if(APPLE)
    if(NOT WITH_DISTRIBUTE)
        list(REMOVE_ITEM TEST_OPS test_desc_clone)
...
...
python/paddle/fluid/tests/unittests/test_dist_base.py
浏览文件 @
a6fbf7ec
...
...
@@ -50,9 +50,7 @@ class TestDistRunnerBase(object):
    def run_pserver(self, args):
        self.get_model(batch_size=2)
-       if args.mem_opt:
-           fluid.memory_optimize(fluid.default_main_program())
+       # NOTE: pserver should not call memory optimize
        t = self.get_transpiler(args.trainer_id,
                                fluid.default_main_program(), args.endpoints,
                                args.trainers, args.sync_mode)
...
...
@@ -70,7 +68,7 @@ class TestDistRunnerBase(object):
        self.get_model(batch_size=2)

        if args.mem_opt:
-           fluid.memory_optimize(fluid.default_main_program())
+           fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
        if args.is_dist:
            t = self.get_transpiler(args.trainer_id,
                                    fluid.default_main_program(),
...
...
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
View file @ a6fbf7ec
...
...
@@ -26,14 +26,13 @@ class TestDistSeResneXt2x2(TestDistBase):
        self.check_with_place("dist_se_resnext.py", delta=100)


-# TODO(typhoonzero): fix this test
-# class TestDistseResnXt2x2WithMemopt(TestDistBase):
-#     def _setup_config(self):
-#         self._sync_mode = True
-#         self._mem_opt = True
-#
-#     def test_dist_train(self):
-#         self.check_with_place("dist_se_resnext.py", delta=1e-7)
+class TestDistseResnXt2x2WithMemopt(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._mem_opt = True
+
+    def test_dist_train(self):
+        self.check_with_place("dist_se_resnext.py", delta=100)


class TestDistSeResneXt2x2Async(TestDistBase):
...
...
python/paddle/fluid/transpiler/__init__.py
View file @ a6fbf7ec
...
...
@@ -20,6 +20,10 @@ from .memory_optimization_transpiler import memory_optimize, release_memory
from .ps_dispatcher import HashName, RoundRobin

__all__ = [
-   "DistributeTranspiler", "memory_optimize", "release_memory", "HashName",
-   "RoundRobin", "DistributeTranspilerConfig"
+   "DistributeTranspiler",
+   "memory_optimize",
+   "release_memory",
+   "HashName",
+   "RoundRobin",
+   "DistributeTranspilerConfig",
]
python/paddle/fluid/transpiler/memory_optimization_transpiler.py
View file @ a6fbf7ec
...
...
@@ -14,10 +14,10 @@
from __future__ import print_function

-from collections import defaultdict, OrderedDict, Callable
+from collections import defaultdict, MutableSet
from .. import core
from ... import compat as cpt
-from ..framework import Program, default_main_program, Parameter, Variable
+from ..framework import Program, default_main_program, Parameter, Variable, core
from ..backward import _rename_arg_
from functools import reduce
from six.moves import range
...
...
@@ -44,17 +44,82 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
PRINT_LOG = False


+class OrderedSet(MutableSet):
+    def __init__(self, iterable=None):
+        self.end = end = []
+        end += [None, end, end]  # sentinel node for doubly linked list
+        self.map = {}  # key --> [key, prev, next]
+        if iterable is not None:
+            self |= iterable
+
+    def __len__(self):
+        return len(self.map)
+
+    def __contains__(self, key):
+        return key in self.map
+
+    def add(self, key):
+        if key not in self.map:
+            end = self.end
+            curr = end[1]
+            curr[2] = end[1] = self.map[key] = [key, curr, end]
+
+    def update(self, other):
+        for e in other:
+            self.add(e)
+
+    def discard(self, key):
+        if key in self.map:
+            key, prev, next = self.map.pop(key)
+            prev[2] = next
+            next[1] = prev
+
+    def remove(self, key):
+        self.discard(key)
+
+    def __iter__(self):
+        end = self.end
+        curr = end[2]
+        while curr is not end:
+            yield curr[0]
+            curr = curr[2]
+
+    def __reversed__(self):
+        end = self.end
+        curr = end[1]
+        while curr is not end:
+            yield curr[0]
+            curr = curr[1]
+
+    def pop(self, last=True):
+        if not self:
+            raise KeyError('set is empty')
+        key = self.end[1][0] if last else self.end[2][0]
+        self.discard(key)
+        return key
+
+    def __repr__(self):
+        if not self:
+            return '%s()' % (self.__class__.__name__, )
+        return '%s(%r)' % (self.__class__.__name__, list(self))
+
+    def __eq__(self, other):
+        if isinstance(other, OrderedSet):
+            return len(self) == len(other) and list(self) == list(other)
+        return set(self) == set(other)
+
+
class ControlFlowGraph(object):
    def __init__(self, program, ops, forward_num, skip_opt):
        self._program = program
        self._ops = ops
        self._forward_num = forward_num
-       self._successors = defaultdict(set)
-       self._presuccessors = defaultdict(set)
-       self._uses = defaultdict(set)
-       self._defs = defaultdict(set)
-       self._live_in = defaultdict(set)
-       self._live_out = defaultdict(set)
+       self._successors = defaultdict(OrderedSet)
+       self._presuccessors = defaultdict(OrderedSet)
+       self._uses = defaultdict(OrderedSet)
+       self._defs = defaultdict(OrderedSet)
+       self._live_in = defaultdict(OrderedSet)
+       self._live_out = defaultdict(OrderedSet)
        self._skip_opt = skip_opt
        self.pool = []
...
...
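Replacing set with OrderedSet makes iteration follow insertion order, so the liveness analysis picks cache variables deterministically and the sorted() calls can be dropped in the hunks below. A standalone illustration of the class above, using made-up variable names:

    # OrderedSet iterates in insertion order; a plain set's order is unspecified.
    s = OrderedSet(['fc_0.tmp_1', 'fc_0.tmp_0', 'conv2d_0.tmp_0'])
    print(list(s))   # ['fc_0.tmp_1', 'fc_0.tmp_0', 'conv2d_0.tmp_0']
    s.discard('fc_0.tmp_0')
    print(s.pop())   # 'conv2d_0.tmp_0' -- pops the most recently added by default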
@@ -116,7 +181,7 @@ class ControlFlowGraph(object):
# NOTE: must sort the in_diff set for cases that get different cache var.
# FIXME(typhoonzero): maybe use a "sorted set" is better than this.
            can_optimize = [
-               x for x in sorted(list(in_diff))
+               x for x in in_diff
                if self._check_var_validity(block_desc, x, is_forward)
            ]
            if can_optimize:
...
...
@@ -224,7 +289,7 @@ class ControlFlowGraph(object):
            if self.pool:
                # NOTE: must sort the in_diff set for cases that get different cache var.
                defs_can_optimize = [
-                   x for x in sorted(list(self._defs[i]))
+                   x for x in self._defs[i]
                    if self._check_var_validity(block_desc, x, is_forward)
                ]
                out_pair = [
...
...
@@ -381,7 +446,19 @@ def _get_cfgs(input_program):
    return cfgs


-def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
+def _is_opt_role_op(op):
+    op_maker = core.op_proto_and_checker_maker
+    optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
+    if op_maker.kOpRoleAttrName() in op.attr_names and \
+            int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
+        return True
+
+
+def memory_optimize(input_program,
+                    skip_opt_set=None,
+                    print_log=False,
+                    level=0,
+                    skip_grads=False):
"""Optimize memory by reusing var memory.
    Note: it doesn't support subblock nested in subblock.
...
...
@@ -398,6 +475,19 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
        raise ValueError("only support opt_level 0 or 1.")
    global PRINT_LOG
    PRINT_LOG = print_log
+   if skip_grads:
+       grad_set = set()
+       OP_ROLE_VAR = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
+       for op in input_program.global_block().ops:
+           if _is_opt_role_op(op):
+               if op.attr(OP_ROLE_VAR):
+                   grad_name = op.attr(OP_ROLE_VAR)[1]
+                   grad_set.add(grad_name)
+       if not skip_opt_set:
+           skip_opt_set = grad_set
+       else:
+           skip_opt_set.update(grad_set)
    cfgs = _get_cfgs(input_program)
    for cfg in cfgs:
        cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
...
...
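For context, the new skip_grads path collects every gradient name attached to optimizer-role ops and folds it into skip_opt_set, so trainers can exclude gradients from memory reuse without enumerating them by hand; test_dist_base.py above now does exactly this. A minimal usage sketch (network and optimizer construction elided):

    import paddle.fluid as fluid

    # Assumes default_main_program() already holds a network plus an optimizer
    # pass, so gradient vars carry the Optimize op-role attribute that
    # _is_opt_role_op() checks for.
    fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)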
python/setup.py.in
View file @ a6fbf7ec
...
...
@@ -106,6 +106,7 @@ packages=['paddle',
'paddle.fluid.layers',
'paddle.fluid.contrib',
'paddle.fluid.contrib.decoder',
+         'paddle.fluid.contrib.quantize',
'paddle.fluid.transpiler',
'paddle.fluid.transpiler.details']
...
...