Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
a6fbf7ec
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
a6fbf7ec
编写于
9月 28, 2018
作者:
Y
Yu Yang
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'feature/refine_generate_proposals_op' into rewrite_allocation
上级
58ed412f
593ad763
变更
45
显示空白变更内容
内联
并排
Showing
45 changed file
with
2535 addition
and
308 deletion
+2535
-308
cmake/external/anakin.cmake
cmake/external/anakin.cmake
+1
-0
paddle/fluid/API.spec
paddle/fluid/API.spec
+6
-2
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+5
-2
paddle/fluid/framework/ir/CMakeLists.txt
paddle/fluid/framework/ir/CMakeLists.txt
+6
-6
paddle/fluid/framework/naive_executor.cc
paddle/fluid/framework/naive_executor.cc
+150
-0
paddle/fluid/framework/naive_executor.h
paddle/fluid/framework/naive_executor.h
+63
-0
paddle/fluid/framework/naive_executor_test.cc
paddle/fluid/framework/naive_executor_test.cc
+70
-0
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+8
-2
paddle/fluid/framework/scope.cc
paddle/fluid/framework/scope.cc
+31
-0
paddle/fluid/inference/CMakeLists.txt
paddle/fluid/inference/CMakeLists.txt
+1
-1
paddle/fluid/inference/analysis/CMakeLists.txt
paddle/fluid/inference/analysis/CMakeLists.txt
+1
-1
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+13
-7
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+211
-31
paddle/fluid/inference/api/analysis_predictor.h
paddle/fluid/inference/api/analysis_predictor.h
+49
-10
paddle/fluid/inference/api/analysis_predictor_tester.cc
paddle/fluid/inference/api/analysis_predictor_tester.cc
+67
-0
paddle/fluid/inference/api/api.cc
paddle/fluid/inference/api/api.cc
+22
-16
paddle/fluid/inference/api/api_impl.cc
paddle/fluid/inference/api/api_impl.cc
+1
-1
paddle/fluid/inference/api/api_impl.h
paddle/fluid/inference/api/api_impl.h
+13
-9
paddle/fluid/inference/api/api_impl_tester.cc
paddle/fluid/inference/api/api_impl_tester.cc
+3
-3
paddle/fluid/inference/api/details/zero_copy_tensor.cc
paddle/fluid/inference/api/details/zero_copy_tensor.cc
+111
-0
paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
+46
-0
paddle/fluid/inference/api/helper.h
paddle/fluid/inference/api/helper.h
+138
-0
paddle/fluid/inference/api/paddle_inference_api.h
paddle/fluid/inference/api/paddle_inference_api.h
+53
-1
paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+6
-1
paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+4
-1
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+282
-3
paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
...le/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+2
-1
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+3
-1
paddle/fluid/inference/tests/api/tester_helper.h
paddle/fluid/inference/tests/api/tester_helper.h
+2
-4
paddle/fluid/operators/detection/generate_proposals_op.cc
paddle/fluid/operators/detection/generate_proposals_op.cc
+97
-97
paddle/fluid/operators/detection/generate_proposals_op.cu
paddle/fluid/operators/detection/generate_proposals_op.cu
+88
-74
paddle/fluid/operators/gather.h
paddle/fluid/operators/gather.h
+2
-4
paddle/fluid/string/pretty_log.h
paddle/fluid/string/pretty_log.h
+4
-4
python/CMakeLists.txt
python/CMakeLists.txt
+1
-0
python/paddle/fluid/contrib/__init__.py
python/paddle/fluid/contrib/__init__.py
+3
-0
python/paddle/fluid/contrib/quantize/__init__.py
python/paddle/fluid/contrib/quantize/__init__.py
+20
-0
python/paddle/fluid/contrib/quantize/quantize_transpiler.py
python/paddle/fluid/contrib/quantize/quantize_transpiler.py
+557
-0
python/paddle/fluid/contrib/tests/CMakeLists.txt
python/paddle/fluid/contrib/tests/CMakeLists.txt
+6
-0
python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
...on/paddle/fluid/contrib/tests/test_quantize_transpiler.py
+272
-0
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+0
-1
python/paddle/fluid/tests/unittests/test_dist_base.py
python/paddle/fluid/tests/unittests/test_dist_base.py
+2
-4
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+7
-8
python/paddle/fluid/transpiler/__init__.py
python/paddle/fluid/transpiler/__init__.py
+6
-2
python/paddle/fluid/transpiler/memory_optimization_transpiler.py
...paddle/fluid/transpiler/memory_optimization_transpiler.py
+101
-11
python/setup.py.in
python/setup.py.in
+1
-0
未找到文件。
cmake/external/anakin.cmake
浏览文件 @
a6fbf7ec
...
@@ -52,6 +52,7 @@ ExternalProject_Add(
...
@@ -52,6 +52,7 @@ ExternalProject_Add(
PREFIX
${
ANAKIN_SOURCE_DIR
}
PREFIX
${
ANAKIN_SOURCE_DIR
}
UPDATE_COMMAND
""
UPDATE_COMMAND
""
CMAKE_ARGS
${
CMAKE_ARGS_PREFIX
}
CMAKE_ARGS
${
CMAKE_ARGS_PREFIX
}
-DUSE_LOGGER=YES
-DUSE_X86_PLACE=YES
-DUSE_X86_PLACE=YES
-DBUILD_WITH_UNIT_TEST=NO
-DBUILD_WITH_UNIT_TEST=NO
-DPROTOBUF_ROOT=
${
THIRD_PARTY_PATH
}
/install/protobuf
-DPROTOBUF_ROOT=
${
THIRD_PARTY_PATH
}
/install/protobuf
...
...
paddle/fluid/API.spec
浏览文件 @
a6fbf7ec
...
@@ -21,7 +21,7 @@ paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'en
...
@@ -21,7 +21,7 @@ paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'en
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'
], varargs=None, keywords=None, defaults=(None, False, 0
))
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'
, 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False
))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))
...
@@ -299,13 +299,17 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init',
...
@@ -299,13 +299,17 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init',
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000))
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'
], varargs=None, keywords=None, defaults=(None, False, 0
))
paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'
, 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False
))
paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
...
...
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
a6fbf7ec
...
@@ -56,9 +56,9 @@ else()
...
@@ -56,9 +56,9 @@ else()
cc_test
(
mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor
)
cc_test
(
mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor
)
endif
()
endif
()
if
(
NOT WIN32
)
if
(
NOT WIN32
)
cc_library
(
lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version
)
cc_library
(
lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version
)
else
()
else
()
cc_library
(
lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version
)
cc_library
(
lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version
)
endif
(
NOT WIN32
)
endif
(
NOT WIN32
)
cc_test
(
lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory
)
cc_test
(
lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory
)
...
@@ -141,12 +141,15 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
...
@@ -141,12 +141,15 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library
(
feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog
)
cc_library
(
feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog
)
cc_library
(
naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass
)
if
(
WITH_DISTRIBUTE
)
if
(
WITH_DISTRIBUTE
)
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass
)
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass
)
set
(
DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
set
(
DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
set_source_files_properties
(
executor.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
set_source_files_properties
(
executor.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
else
()
else
()
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass
)
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass
)
cc_test
(
test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass elementwise_add_op
)
endif
()
endif
()
if
(
NOT WIN32
)
if
(
NOT WIN32
)
...
...
paddle/fluid/framework/ir/CMakeLists.txt
浏览文件 @
a6fbf7ec
...
@@ -28,9 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap
...
@@ -28,9 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap
pass_library
(
graph_to_program_pass base
)
pass_library
(
graph_to_program_pass base
)
pass_library
(
graph_viz_pass base
)
pass_library
(
graph_viz_pass base
)
pass_library
(
fc_fuse_pass inference
)
pass_library
(
fc_fuse_pass inference
)
if
(
WITH_MKLDNN
)
if
(
WITH_MKLDNN
)
pass_library
(
conv_relu_mkldnn_fuse_pass inference
)
pass_library
(
conv_relu_mkldnn_fuse_pass inference
)
endif
()
endif
()
pass_library
(
attention_lstm_fuse_pass inference
)
pass_library
(
attention_lstm_fuse_pass inference
)
pass_library
(
infer_clean_graph_pass inference
)
pass_library
(
infer_clean_graph_pass inference
)
pass_library
(
fc_lstm_fuse_pass inference
)
pass_library
(
fc_lstm_fuse_pass inference
)
...
@@ -49,6 +49,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
...
@@ -49,6 +49,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
cc_test
(
graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass
)
cc_test
(
graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass
)
cc_test
(
test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector
)
cc_test
(
test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector
)
cc_test
(
test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto
)
cc_test
(
test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto
)
if
(
WITH_MKLDNN
)
if
(
WITH_MKLDNN
)
cc_test
(
test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass
)
cc_test
(
test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass
)
endif
()
endif
()
paddle/fluid/framework/naive_executor.cc
0 → 100644
浏览文件 @
a6fbf7ec
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/string/pretty_log.h"
namespace
paddle
{
namespace
framework
{
// These code can be shared with Executor.
static
void
InitializeVariable
(
Variable
*
var
,
proto
::
VarType
::
Type
var_type
)
{
if
(
var_type
==
proto
::
VarType
::
LOD_TENSOR
)
{
var
->
GetMutable
<
LoDTensor
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
SELECTED_ROWS
)
{
var
->
GetMutable
<
SelectedRows
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
FEED_MINIBATCH
)
{
var
->
GetMutable
<
FeedFetchList
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
FETCH_LIST
)
{
var
->
GetMutable
<
FeedFetchList
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
STEP_SCOPES
)
{
var
->
GetMutable
<
std
::
vector
<
framework
::
Scope
>>
();
}
else
if
(
var_type
==
proto
::
VarType
::
LOD_RANK_TABLE
)
{
var
->
GetMutable
<
LoDRankTable
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
LOD_TENSOR_ARRAY
)
{
var
->
GetMutable
<
LoDTensorArray
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
PLACE_LIST
)
{
var
->
GetMutable
<
platform
::
PlaceList
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
READER
)
{
var
->
GetMutable
<
ReaderHolder
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
CHANNEL
)
{
var
->
GetMutable
<
ChannelHolder
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
RAW
)
{
// GetMutable will be called in operator
}
else
{
PADDLE_THROW
(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]"
,
var_type
);
}
}
void
NaiveExecutor
::
Prepare
(
Scope
*
parent_scope
,
const
ProgramDesc
&
program_desc
,
int
block_id
,
bool
with_feed_fetch_ops
)
{
if
(
!
parent_scope
)
{
scope_
=
new
framework
::
Scope
;
}
else
{
scope_
=
&
parent_scope
->
NewScope
();
}
CreateVariables
(
program_desc
,
scope_
,
block_id
);
CreateOps
(
program_desc
,
block_id
,
with_feed_fetch_ops
);
}
void
NaiveExecutor
::
Run
()
{
for
(
auto
&
op
:
ops_
)
{
VLOG
(
4
)
<<
"run "
<<
op
->
Type
();
op
->
Run
(
*
scope_
,
place_
);
}
}
void
NaiveExecutor
::
CreateVariables
(
const
ProgramDesc
&
desc
,
Scope
*
scope
,
int
block_id
)
{
PADDLE_ENFORCE
(
scope
);
auto
&
global_block
=
desc
.
Block
(
block_id
);
const
Scope
*
ancestor_scope
=
scope
;
while
(
ancestor_scope
->
parent
())
{
ancestor_scope
=
ancestor_scope
->
parent
();
}
if
(
ancestor_scope
!=
scope
)
{
for
(
auto
&
var
:
global_block
.
AllVars
())
{
if
(
var
->
Name
()
==
framework
::
kEmptyVarName
)
{
continue
;
}
// Create persistable vars in ancestor scope.
if
(
var
->
Persistable
())
{
auto
*
ptr
=
const_cast
<
Scope
*>
(
ancestor_scope
)
->
Var
(
var
->
Name
());
InitializeVariable
(
ptr
,
var
->
GetType
());
VLOG
(
3
)
<<
"Create Variable "
<<
var
->
Name
()
<<
" global, which pointer is "
<<
ptr
;
}
else
{
// Create temporary variables in local scope.
auto
*
ptr
=
scope
->
Var
(
var
->
Name
());
InitializeVariable
(
ptr
,
var
->
GetType
());
VLOG
(
3
)
<<
"Create Variable "
<<
var
->
Name
()
<<
" locally, which pointer is "
<<
ptr
;
}
}
}
else
{
for
(
auto
&
var
:
global_block
.
AllVars
())
{
auto
*
ptr
=
scope
->
Var
(
var
->
Name
());
InitializeVariable
(
ptr
,
var
->
GetType
());
VLOG
(
3
)
<<
"Create variable "
<<
var
->
Name
()
<<
", which pointer is "
<<
ptr
;
}
}
}
void
NaiveExecutor
::
CreateOps
(
const
ProgramDesc
&
desc
,
int
block_id
,
bool
with_feed_fetch_ops
)
{
for
(
const
auto
&
op_desc
:
desc
.
Block
(
block_id
).
AllOps
())
{
if
(
!
with_feed_fetch_ops
&&
(
op_desc
->
Type
()
==
"feed"
||
op_desc
->
Type
()
==
"fetch"
))
{
string
::
PrettyLogEndl
(
string
::
Style
::
detail
(),
"--- skip [%s], %s -> %s"
,
op_desc
->
Input
(
"X"
)[
0
],
op_desc
->
Type
(),
op_desc
->
Output
(
"Out"
)[
0
]);
continue
;
}
ops_
.
emplace_back
(
OpRegistry
::
CreateOp
(
*
op_desc
));
}
}
LoDTensor
*
NaiveExecutor
::
FindTensor
(
const
std
::
string
&
name
)
{
PADDLE_ENFORCE
(
scope_
,
"Need to init scope first"
);
auto
*
var
=
scope_
->
FindVar
(
name
);
PADDLE_ENFORCE
(
var
,
"No variable [%s] in the scope"
);
auto
*
tensor
=
const_cast
<
LoDTensor
*>
(
&
var
->
Get
<
LoDTensor
>
());
return
tensor
;
}
void
NaiveExecutor
::
CleanFeedFetchOps
()
{
std
::
vector
<
std
::
unique_ptr
<
OperatorBase
>>
ops
;
for
(
auto
&
op
:
ops_
)
{
if
(
op
->
Type
()
!=
"feed"
&&
op
->
Type
()
!=
"fetch"
)
{
ops
.
emplace_back
(
std
::
move
(
op
));
}
}
ops_
.
swap
(
ops
);
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/naive_executor.h
0 → 100644
浏览文件 @
a6fbf7ec
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace
paddle
{
namespace
framework
{
/*
* Simple, intuitive and effective. Only single thread is supported, and
* currently designed for inference.
*/
class
NaiveExecutor
{
public:
explicit
NaiveExecutor
(
const
platform
::
Place
&
place
)
:
place_
(
place
)
{}
// Create child scope.
// Create variables.
// @with_feed_fetch_ops: whether to work with the feed and fetch operators.
void
Prepare
(
Scope
*
parent_scope
,
const
ProgramDesc
&
program_desc
,
int
block_id
,
bool
with_feed_fetch_ops
);
// Run all the operators.
void
Run
();
// Get an tensor to operating directly, without the need for feed_ops.
LoDTensor
*
FindTensor
(
const
std
::
string
&
name
);
Scope
*
scope
()
{
return
scope_
;
}
void
CleanFeedFetchOps
();
protected:
void
CreateVariables
(
const
ProgramDesc
&
desc
,
Scope
*
scope
,
int
block_id
);
void
CreateOps
(
const
ProgramDesc
&
desc
,
int
block_id
,
bool
with_feed_fetch_ops
);
private:
const
platform
::
Place
place_
;
// Catch the required resource to avoid recreate.
std
::
vector
<
std
::
unique_ptr
<
OperatorBase
>>
ops_
;
Scope
*
scope_
;
};
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/naive_executor_test.cc
0 → 100644
浏览文件 @
a6fbf7ec
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/naive_executor.h"
#include <gtest/gtest.h>
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
namespace
paddle
{
namespace
framework
{
TEST
(
NaiveExecutor
,
Basic
)
{
ProgramDesc
program
;
auto
*
main_block
=
program
.
MutableBlock
(
0
);
auto
*
a
=
main_block
->
Var
(
"a"
);
// input
auto
*
b
=
main_block
->
Var
(
"b"
);
// input
auto
*
c
=
main_block
->
Var
(
"c"
);
// input
a
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
b
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
c
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
auto
*
add
=
main_block
->
AppendOp
();
add
->
SetType
(
"elementwise_add"
);
add
->
SetInput
(
"X"
,
{
"a"
});
add
->
SetInput
(
"Y"
,
{
"b"
});
add
->
SetOutput
(
"Out"
,
{
"c"
});
auto
place
=
platform
::
CPUPlace
();
NaiveExecutor
exe
(
place
);
exe
.
Prepare
(
nullptr
,
program
,
0
,
false
/*with feed fetch ops*/
);
auto
*
a_tensor
=
exe
.
FindTensor
(
"a"
);
auto
*
b_tensor
=
exe
.
FindTensor
(
"b"
);
auto
*
c_tensor
=
exe
.
FindTensor
(
"c"
);
a_tensor
->
Resize
({
1
,
4
});
b_tensor
->
Resize
({
1
,
4
});
c_tensor
->
Resize
({
1
,
4
});
b_tensor
->
mutable_data
<
float
>
(
place
);
a_tensor
->
mutable_data
<
float
>
(
place
);
float
a_arr
[]
=
{
0
,
1
,
2
,
3
};
float
b_arr
[]
=
{
0.0
,
.1
,
.2
,
.3
};
std
::
copy_n
(
a_arr
,
4
,
a_tensor
->
mutable_data
<
float
>
(
place
));
std
::
copy_n
(
b_arr
,
4
,
b_tensor
->
mutable_data
<
float
>
(
place
));
exe
.
Run
();
auto
*
c_data
=
c_tensor
->
mutable_data
<
float
>
(
place
);
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
EXPECT_NEAR
(
c_data
[
i
],
1.1
*
i
,
1e-3
);
}
}
}
// namespace framework
}
// namespace paddle
USE_OP
(
elementwise_add
);
paddle/fluid/framework/operator.cc
浏览文件 @
a6fbf7ec
...
@@ -154,9 +154,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
...
@@ -154,9 +154,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform
::
SetDeviceId
(
dev_id
);
platform
::
SetDeviceId
(
dev_id
);
#endif
#endif
}
}
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
if
(
platform
::
IsProfileEnabled
())
{
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
platform
::
RecordEvent
record_event
(
Type
(),
pool
.
Get
(
place
));
platform
::
RecordEvent
record_event
(
Type
(),
pool
.
Get
(
place
));
}
RunImpl
(
scope
,
place
);
RunImpl
(
scope
,
place
);
if
(
VLOG_IS_ON
(
3
))
{
if
(
VLOG_IS_ON
(
3
))
{
VLOG
(
3
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
VLOG
(
3
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
}
}
...
...
paddle/fluid/framework/scope.cc
浏览文件 @
a6fbf7ec
...
@@ -20,6 +20,13 @@ limitations under the License. */
...
@@ -20,6 +20,13 @@ limitations under the License. */
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/printf.h"
// The mutex is not needed by training and inference, only for distribution.
#if PADDLE_WITH_DISTRIBUTE
#define WITH_LOCK 1
#else
#define WITH_LOCK 0
#endif
DEFINE_bool
(
benchmark
,
false
,
DEFINE_bool
(
benchmark
,
false
,
"Doing memory benchmark. It will make deleting scope synchronized, "
"Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs."
"and add some memory usage logs."
...
@@ -49,18 +56,24 @@ int64_t GetEagerDeletionThreshold() {
...
@@ -49,18 +56,24 @@ int64_t GetEagerDeletionThreshold() {
Scope
::~
Scope
()
{
DropKids
();
}
Scope
::~
Scope
()
{
DropKids
();
}
Scope
&
Scope
::
NewScope
()
const
{
Scope
&
Scope
::
NewScope
()
const
{
#if WITH_LOCK
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
#endif
kids_
.
push_back
(
new
Scope
(
this
));
kids_
.
push_back
(
new
Scope
(
this
));
return
*
kids_
.
back
();
return
*
kids_
.
back
();
}
}
Variable
*
Scope
::
Var
(
const
std
::
string
&
name
)
{
Variable
*
Scope
::
Var
(
const
std
::
string
&
name
)
{
#if WITH_LOCK
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
#endif
return
VarInternal
(
name
);
return
VarInternal
(
name
);
}
}
Variable
*
Scope
::
Var
(
std
::
string
*
name
)
{
Variable
*
Scope
::
Var
(
std
::
string
*
name
)
{
#if WITH_LOCK
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
#endif
auto
new_name
=
string
::
Sprintf
(
"%p.%d"
,
this
,
vars_
.
size
());
auto
new_name
=
string
::
Sprintf
(
"%p.%d"
,
this
,
vars_
.
size
());
if
(
name
!=
nullptr
)
{
if
(
name
!=
nullptr
)
{
*
name
=
new_name
;
*
name
=
new_name
;
...
@@ -69,29 +82,39 @@ Variable* Scope::Var(std::string* name) {
...
@@ -69,29 +82,39 @@ Variable* Scope::Var(std::string* name) {
}
}
Variable
*
Scope
::
FindVar
(
const
std
::
string
&
name
)
const
{
Variable
*
Scope
::
FindVar
(
const
std
::
string
&
name
)
const
{
#if WITH_LOCK
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
#endif
return
FindVarInternal
(
name
);
return
FindVarInternal
(
name
);
}
}
const
Scope
*
Scope
::
FindScope
(
const
Variable
*
var
)
const
{
const
Scope
*
Scope
::
FindScope
(
const
Variable
*
var
)
const
{
#if WITH_LOCK
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
#endif
return
FindScopeInternal
(
var
);
return
FindScopeInternal
(
var
);
}
}
void
Scope
::
DropKids
()
{
void
Scope
::
DropKids
()
{
#if WITH_LOCK
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
#endif
for
(
Scope
*
s
:
kids_
)
delete
s
;
for
(
Scope
*
s
:
kids_
)
delete
s
;
kids_
.
clear
();
kids_
.
clear
();
}
}
bool
Scope
::
HasKid
(
const
Scope
*
scope
)
const
{
bool
Scope
::
HasKid
(
const
Scope
*
scope
)
const
{
#if WITH_LOCK
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
#endif
auto
it
=
std
::
find
(
this
->
kids_
.
begin
(),
this
->
kids_
.
end
(),
scope
);
auto
it
=
std
::
find
(
this
->
kids_
.
begin
(),
this
->
kids_
.
end
(),
scope
);
return
it
!=
this
->
kids_
.
end
();
return
it
!=
this
->
kids_
.
end
();
}
}
std
::
vector
<
std
::
string
>
Scope
::
LocalVarNames
()
const
{
std
::
vector
<
std
::
string
>
Scope
::
LocalVarNames
()
const
{
#if WITH_LOCK
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
#endif
std
::
vector
<
std
::
string
>
known_vars
;
std
::
vector
<
std
::
string
>
known_vars
;
known_vars
.
reserve
(
this
->
vars_
.
size
());
known_vars
.
reserve
(
this
->
vars_
.
size
());
for
(
auto
&
p
:
vars_
)
{
for
(
auto
&
p
:
vars_
)
{
...
@@ -101,7 +124,9 @@ std::vector<std::string> Scope::LocalVarNames() const {
...
@@ -101,7 +124,9 @@ std::vector<std::string> Scope::LocalVarNames() const {
}
}
void
Scope
::
DeleteScope
(
Scope
*
scope
)
const
{
void
Scope
::
DeleteScope
(
Scope
*
scope
)
const
{
#if WITH_LOCK
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
#endif
auto
it
=
std
::
find
(
this
->
kids_
.
begin
(),
this
->
kids_
.
end
(),
scope
);
auto
it
=
std
::
find
(
this
->
kids_
.
begin
(),
this
->
kids_
.
end
(),
scope
);
PADDLE_ENFORCE
(
it
!=
this
->
kids_
.
end
(),
"Cannot find %p as kid scope"
,
scope
);
PADDLE_ENFORCE
(
it
!=
this
->
kids_
.
end
(),
"Cannot find %p as kid scope"
,
scope
);
this
->
kids_
.
erase
(
it
);
this
->
kids_
.
erase
(
it
);
...
@@ -114,7 +139,9 @@ void Scope::DeleteScope(Scope* scope) const {
...
@@ -114,7 +139,9 @@ void Scope::DeleteScope(Scope* scope) const {
}
}
void
Scope
::
EraseVars
(
const
std
::
vector
<
std
::
string
>&
var_names
)
{
void
Scope
::
EraseVars
(
const
std
::
vector
<
std
::
string
>&
var_names
)
{
#if WITH_LOCK
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
#endif
std
::
set
<
std
::
string
>
var_set
(
var_names
.
begin
(),
var_names
.
end
());
std
::
set
<
std
::
string
>
var_set
(
var_names
.
begin
(),
var_names
.
end
());
for
(
auto
it
=
vars_
.
begin
();
it
!=
vars_
.
end
();)
{
for
(
auto
it
=
vars_
.
begin
();
it
!=
vars_
.
end
();)
{
if
(
var_set
.
find
(
it
->
first
)
!=
var_set
.
end
())
{
if
(
var_set
.
find
(
it
->
first
)
!=
var_set
.
end
())
{
...
@@ -127,12 +154,16 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
...
@@ -127,12 +154,16 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
void
Scope
::
Rename
(
const
std
::
string
&
origin_name
,
void
Scope
::
Rename
(
const
std
::
string
&
origin_name
,
const
std
::
string
&
new_name
)
const
{
const
std
::
string
&
new_name
)
const
{
#if WITH_LOCK
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
#endif
RenameInternal
(
origin_name
,
new_name
);
RenameInternal
(
origin_name
,
new_name
);
}
}
std
::
string
Scope
::
Rename
(
const
std
::
string
&
origin_name
)
const
{
std
::
string
Scope
::
Rename
(
const
std
::
string
&
origin_name
)
const
{
#if WITH_LOCK
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
#endif
auto
new_name
=
string
::
Sprintf
(
"%p.%d"
,
this
,
vars_
.
size
());
auto
new_name
=
string
::
Sprintf
(
"%p.%d"
,
this
,
vars_
.
size
());
RenameInternal
(
origin_name
,
new_name
);
RenameInternal
(
origin_name
,
new_name
);
return
new_name
;
return
new_name
;
...
...
paddle/fluid/inference/CMakeLists.txt
浏览文件 @
a6fbf7ec
paddle/fluid/inference/analysis/CMakeLists.txt
浏览文件 @
a6fbf7ec
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
a6fbf7ec
...
@@ -18,10 +18,10 @@ if(APPLE)
...
@@ -18,10 +18,10 @@ if(APPLE)
endif
(
APPLE
)
endif
(
APPLE
)
set
(
inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager
${
GLOB_PASS_LIB
}
)
set
(
inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager
naive_executor
${
GLOB_PASS_LIB
}
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
set
(
inference_deps
${
inference_deps
}
paddle_inference_tensorrt_subgraph_engine
)
set
(
inference_deps
${
inference_deps
}
paddle_inference_tensorrt_subgraph_engine
analysis_predictor
)
endif
()
endif
()
function
(
inference_api_test TARGET_NAME
)
function
(
inference_api_test TARGET_NAME
)
...
@@ -43,8 +43,10 @@ function(inference_api_test TARGET_NAME)
...
@@ -43,8 +43,10 @@ function(inference_api_test TARGET_NAME)
endif
(
WITH_TESTING
)
endif
(
WITH_TESTING
)
endfunction
(
inference_api_test
)
endfunction
(
inference_api_test
)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor
)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope
)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis
)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor
)
cc_library
(
zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api
)
cc_library
(
zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api
)
cc_test
(
test_paddle_inference_api
cc_test
(
test_paddle_inference_api
SRCS api_tester.cc
SRCS api_tester.cc
DEPS paddle_inference_api
)
DEPS paddle_inference_api
)
...
@@ -52,18 +54,22 @@ cc_test(test_paddle_inference_api
...
@@ -52,18 +54,22 @@ cc_test(test_paddle_inference_api
inference_api_test
(
test_api_impl SRC api_impl_tester.cc
inference_api_test
(
test_api_impl SRC api_impl_tester.cc
ARGS test_word2vec test_image_classification
)
ARGS test_word2vec test_image_classification
)
set
(
PYTHON_TESTS_DIR
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/tests
)
cc_test
(
test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor
${
inference_deps
}
paddle_inference_api
ARGS --dirname=
${
PYTHON_TESTS_DIR
}
/book
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
cc_library
(
paddle_inference_tensorrt_subgraph_engine
cc_library
(
paddle_inference_tensorrt_subgraph_engine
SRCS api_tensorrt_subgraph_engine.cc
SRCS api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter
)
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter
zero_copy_tensor_dummy
)
inference_api_test
(
test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec
)
inference_api_test
(
test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec
)
endif
()
endif
()
if
(
WITH_ANAKIN AND WITH_MKL
)
# only needed in CI
if
(
WITH_ANAKIN AND WITH_MKL
)
# only needed in CI
# compile the libinference_anakin_api.a and anakin.so.
# compile the libinference_anakin_api.a and anakin.so.
cc_library
(
inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml
)
cc_library
(
inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml
scope zero_copy_tensor_dummy
)
cc_library
(
inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber
)
cc_library
(
inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber
scope
)
function
(
anakin_target target_name
)
function
(
anakin_target target_name
)
target_compile_options
(
${
target_name
}
BEFORE PUBLIC
${
ANAKIN_COMPILE_EXTRA_FLAGS
}
)
target_compile_options
(
${
target_name
}
BEFORE PUBLIC
${
ANAKIN_COMPILE_EXTRA_FLAGS
}
)
endfunction
()
endfunction
()
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
a6fbf7ec
...
@@ -16,11 +16,15 @@
...
@@ -16,11 +16,15 @@
#include <memory>
#include <memory>
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
...
@@ -28,8 +32,11 @@ DECLARE_bool(profile);
...
@@ -28,8 +32,11 @@ DECLARE_bool(profile);
namespace
paddle
{
namespace
paddle
{
using
contrib
::
AnalysisConfig
;
bool
AnalysisPredictor
::
Init
(
bool
AnalysisPredictor
::
Init
(
const
std
::
shared_ptr
<
framework
::
Scope
>&
parent_scope
)
{
const
std
::
shared_ptr
<
framework
::
Scope
>
&
parent_scope
,
const
std
::
shared_ptr
<
framework
::
ProgramDesc
>
&
program
)
{
VLOG
(
3
)
<<
"Predictor::init()"
;
VLOG
(
3
)
<<
"Predictor::init()"
;
#if !defined(_WIN32)
#if !defined(_WIN32)
if
(
FLAGS_profile
)
{
if
(
FLAGS_profile
)
{
...
@@ -43,7 +50,8 @@ bool AnalysisPredictor::Init(
...
@@ -43,7 +50,8 @@ bool AnalysisPredictor::Init(
if
(
config_
.
use_gpu
)
{
if
(
config_
.
use_gpu
)
{
place_
=
paddle
::
platform
::
CUDAPlace
(
config_
.
device
);
place_
=
paddle
::
platform
::
CUDAPlace
(
config_
.
device
);
LOG
(
WARNING
)
<<
"ir optimize only supports CPU currently"
;
LOG
(
WARNING
)
<<
"ir optimize only supports CPU currently, enable_ir_optim "
"is turned false."
;
config_
.
enable_ir_optim
=
false
;
config_
.
enable_ir_optim
=
false
;
}
else
{
}
else
{
place_
=
paddle
::
platform
::
CPUPlace
();
place_
=
paddle
::
platform
::
CPUPlace
();
...
@@ -56,37 +64,134 @@ bool AnalysisPredictor::Init(
...
@@ -56,37 +64,134 @@ bool AnalysisPredictor::Init(
scope_
.
reset
(
new
paddle
::
framework
::
Scope
());
scope_
.
reset
(
new
paddle
::
framework
::
Scope
());
}
}
executor_
.
reset
(
new
paddle
::
framework
::
Executor
(
place_
));
executor_
.
reset
(
new
paddle
::
framework
::
Naive
Executor
(
place_
));
// Initialize the inference program
if
(
!
program
)
{
if
(
!
config_
.
model_dir
.
empty
())
{
if
(
!
LoadProgramDesc
())
return
false
;
// Parameters are saved in separate files sited in
OptimizeInferenceProgram
();
// the specified `dirname`.
inference_program_
=
paddle
::
inference
::
Load
(
executor_
.
get
(),
scope_
.
get
(),
config_
.
model_dir
);
}
else
if
(
!
config_
.
prog_file
.
empty
()
&&
!
config_
.
param_file
.
empty
())
{
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
inference_program_
=
paddle
::
inference
::
Load
(
executor_
.
get
(),
scope_
.
get
(),
config_
.
prog_file
,
config_
.
param_file
);
}
else
{
}
else
{
LOG
(
ERROR
)
<<
"fail to load inference model from "
<<
config_
.
model_dir
;
inference_program_
=
program
;
}
executor_
->
Prepare
(
scope_
.
get
(),
*
inference_program_
,
0
,
config_
.
use_feed_fetch_ops
);
// Get the feed_target_names and fetch_target_names
PrepareFeedFetch
();
return
true
;
}
bool
AnalysisPredictor
::
Run
(
const
std
::
vector
<
PaddleTensor
>
&
inputs
,
std
::
vector
<
PaddleTensor
>
*
output_data
,
int
batch_size
)
{
VLOG
(
3
)
<<
"Predictor::predict"
;
inference
::
Timer
timer
;
timer
.
tic
();
// set feed variable
std
::
vector
<
framework
::
LoDTensor
>
feeds
;
framework
::
Scope
*
scope
=
sub_scope_
?
sub_scope_
:
scope_
.
get
();
if
(
!
SetFeed
(
inputs
,
scope
))
{
LOG
(
ERROR
)
<<
"fail to set feed"
;
return
false
;
return
false
;
}
}
// Run the inference program
// if share variables, we need not create variables
executor_
->
Run
();
OptimizeInferenceProgram
();
// get fetch variable
if
(
config_
.
_use_mkldnn
)
{
if
(
!
GetFetch
(
output_data
,
scope
))
{
executor_
->
EnableMKLDNN
(
*
inference_program_
);
LOG
(
ERROR
)
<<
"fail to get fetches"
;
return
false
;
}
}
ctx_
=
executor_
->
Prepare
(
*
inference_program_
,
0
);
VLOG
(
3
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
return
true
;
}
VLOG
(
5
)
<<
"to create variables"
;
bool
AnalysisPredictor
::
SetFeed
(
const
std
::
vector
<
PaddleTensor
>
&
inputs
,
PADDLE_ENFORCE
(
scope_
.
get
());
framework
::
Scope
*
scope
)
{
executor_
->
CreateVariables
(
*
inference_program_
,
VLOG
(
3
)
<<
"Predictor::set_feed"
;
sub_scope_
?
sub_scope_
:
scope_
.
get
(),
0
);
if
(
inputs
.
size
()
!=
feeds_
.
size
())
{
// Get the feed_target_names and fetch_target_names
LOG
(
ERROR
)
<<
"wrong feed input size, need "
<<
feeds_
.
size
()
<<
" but get "
PrepareFeedFetch
();
<<
inputs
.
size
();
return
false
;
}
// Cache the inputs memory for better concurrency performance.
feed_tensors_
.
resize
(
inputs
.
size
());
for
(
size_t
i
=
0
;
i
<
inputs
.
size
();
++
i
)
{
auto
&
input
=
feed_tensors_
[
i
];
framework
::
DDim
ddim
=
framework
::
make_ddim
(
inputs
[
i
].
shape
);
void
*
input_ptr
;
if
(
inputs
[
i
].
dtype
==
PaddleDType
::
INT64
)
{
input_ptr
=
input
.
mutable_data
<
int64_t
>
(
ddim
,
platform
::
CPUPlace
());
}
else
if
(
inputs
[
i
].
dtype
==
PaddleDType
::
FLOAT32
)
{
input_ptr
=
input
.
mutable_data
<
float
>
(
ddim
,
platform
::
CPUPlace
());
}
else
{
LOG
(
ERROR
)
<<
"unsupported feed type "
<<
inputs
[
i
].
dtype
;
return
false
;
}
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std
::
memcpy
(
static_cast
<
void
*>
(
input_ptr
),
inputs
[
i
].
data
.
data
(),
inputs
[
i
].
data
.
length
());
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
framework
::
LoD
lod
;
for
(
auto
&
level
:
inputs
[
i
].
lod
)
{
lod
.
emplace_back
(
level
);
}
input
.
set_lod
(
lod
);
int
idx
=
-
1
;
if
(
config_
.
specify_input_name
)
{
idx
=
feed_names_
[
inputs
[
i
].
name
];
}
else
{
idx
=
boost
::
get
<
int
>
(
feeds_
[
i
]
->
GetAttr
(
"col"
));
}
framework
::
SetFeedVariable
(
scope
,
input
,
"feed"
,
idx
);
}
return
true
;
}
template
<
typename
T
>
void
AnalysisPredictor
::
GetFetchOne
(
const
framework
::
LoDTensor
&
fetch
,
PaddleTensor
*
output
)
{
// set shape.
auto
shape
=
framework
::
vectorize
(
fetch
.
dims
());
output
->
shape
.
assign
(
shape
.
begin
(),
shape
.
end
());
// set data.
const
T
*
data
=
fetch
.
data
<
T
>
();
int
num_elems
=
inference
::
VecReduceToInt
(
shape
);
output
->
data
.
Resize
(
num_elems
*
sizeof
(
T
));
// The fetched tensor output by fetch op, should always in CPU memory, so just
// copy.
memcpy
(
output
->
data
.
data
(),
data
,
num_elems
*
sizeof
(
T
));
// set lod
output
->
lod
.
clear
();
for
(
auto
&
level
:
fetch
.
lod
())
{
output
->
lod
.
emplace_back
(
level
.
begin
(),
level
.
end
());
}
}
bool
AnalysisPredictor
::
GetFetch
(
std
::
vector
<
PaddleTensor
>
*
outputs
,
framework
::
Scope
*
scope
)
{
VLOG
(
3
)
<<
"Predictor::get_fetch"
;
outputs
->
resize
(
fetchs_
.
size
());
for
(
size_t
i
=
0
;
i
<
fetchs_
.
size
();
++
i
)
{
int
idx
=
boost
::
get
<
int
>
(
fetchs_
[
i
]
->
GetAttr
(
"col"
));
PADDLE_ENFORCE
((
size_t
)
idx
==
i
);
framework
::
LoDTensor
&
fetch
=
framework
::
GetFetchVariable
(
*
scope
,
"fetch"
,
idx
);
auto
type
=
fetch
.
type
();
auto
output
=
&
(
outputs
->
at
(
i
));
if
(
type
==
typeid
(
float
))
{
GetFetchOne
<
float
>
(
fetch
,
output
);
output
->
dtype
=
PaddleDType
::
FLOAT32
;
}
else
if
(
type
==
typeid
(
int64_t
))
{
GetFetchOne
<
int64_t
>
(
fetch
,
output
);
output
->
dtype
=
PaddleDType
::
INT64
;
}
else
{
LOG
(
ERROR
)
<<
"unknown type, only support float32 and int64 now."
;
}
}
return
true
;
return
true
;
}
}
...
@@ -107,6 +212,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
...
@@ -107,6 +212,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
new
std
::
string
(
config_
.
prog_file
));
new
std
::
string
(
config_
.
prog_file
));
argument_
.
fluid_model_param_path
.
reset
(
new
std
::
string
(
config_
.
param_file
));
argument_
.
fluid_model_param_path
.
reset
(
new
std
::
string
(
config_
.
param_file
));
}
}
argument_
.
origin_program_desc
.
reset
(
argument_
.
origin_program_desc
.
reset
(
new
ProgramDesc
(
*
inference_program_
->
Proto
()));
new
ProgramDesc
(
*
inference_program_
->
Proto
()));
PADDLE_ENFORCE
(
PADDLE_ENFORCE
(
...
@@ -127,9 +233,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
...
@@ -127,9 +233,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}
}
template
<
>
template
<
>
std
::
unique_ptr
<
PaddlePredictor
>
std
::
unique_ptr
<
PaddlePredictor
>
CreatePaddlePredictor
<
CreatePaddlePredictor
<
contrib
::
AnalysisConfig
,
PaddleEngineKind
::
kAnalysis
>
(
AnalysisConfig
,
PaddleEngineKind
::
kAnalysis
>
(
const
AnalysisConfig
&
config
)
{
const
contrib
::
AnalysisConfig
&
config
)
{
VLOG
(
3
)
<<
"create AnalysisConfig"
;
VLOG
(
3
)
<<
"create AnalysisConfig"
;
if
(
config
.
use_gpu
)
{
if
(
config
.
use_gpu
)
{
// 1. GPU memeroy
// 1. GPU memeroy
...
@@ -150,15 +255,90 @@ CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>(
...
@@ -150,15 +255,90 @@ CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>(
}
}
std
::
unique_ptr
<
PaddlePredictor
>
predictor
(
new
AnalysisPredictor
(
config
));
std
::
unique_ptr
<
PaddlePredictor
>
predictor
(
new
AnalysisPredictor
(
config
));
if
(
!
dynamic_cast
<
AnalysisPredictor
*>
(
predictor
.
get
())
->
Init
(
nullptr
))
{
if
(
!
dynamic_cast
<
AnalysisPredictor
*>
(
predictor
.
get
())
->
Init
(
nullptr
))
{
return
nullptr
;
return
nullptr
;
}
}
return
predictor
;
return
predictor
;
}
}
void
AnalysisPredictor
::
PrepareFeedFetch
()
{
for
(
auto
*
op
:
inference_program_
->
Block
(
0
).
AllOps
())
{
if
(
op
->
Type
()
==
"feed"
)
{
int
idx
=
boost
::
get
<
int
>
(
op
->
GetAttr
(
"col"
));
if
(
feeds_
.
size
()
<=
static_cast
<
size_t
>
(
idx
))
{
feeds_
.
resize
(
idx
+
1
);
}
feeds_
[
idx
]
=
op
;
feed_names_
[
op
->
Output
(
"Out"
)[
0
]]
=
idx
;
}
else
if
(
op
->
Type
()
==
"fetch"
)
{
int
idx
=
boost
::
get
<
int
>
(
op
->
GetAttr
(
"col"
));
if
(
fetchs_
.
size
()
<=
static_cast
<
size_t
>
(
idx
))
{
fetchs_
.
resize
(
idx
+
1
);
}
fetchs_
[
idx
]
=
op
;
}
}
}
std
::
unique_ptr
<
ZeroCopyTensor
>
AnalysisPredictor
::
GetInputTensor
(
const
std
::
string
&
name
)
{
PADDLE_ENFORCE
(
executor_
->
scope
()
->
FindVar
(
name
),
"no name called %s"
,
name
);
std
::
unique_ptr
<
ZeroCopyTensor
>
res
(
new
ZeroCopyTensor
(
static_cast
<
void
*>
(
executor_
->
scope
())));
res
->
input_or_output_
=
true
;
res
->
SetName
(
name
);
return
res
;
}
std
::
unique_ptr
<
ZeroCopyTensor
>
AnalysisPredictor
::
GetOutputTensor
(
const
std
::
string
&
name
)
{
PADDLE_ENFORCE
(
executor_
->
scope
()
->
FindVar
(
name
),
"no name called %s"
,
name
);
std
::
unique_ptr
<
ZeroCopyTensor
>
res
(
new
ZeroCopyTensor
(
static_cast
<
void
*>
(
executor_
->
scope
())));
res
->
input_or_output_
=
false
;
res
->
SetName
(
name
);
return
res
;
}
bool
AnalysisPredictor
::
ZeroCopyRun
()
{
executor_
->
Run
();
return
true
;
}
bool
AnalysisPredictor
::
LoadProgramDesc
()
{
// Initialize the inference program
std
::
unique_ptr
<
framework
::
Executor
>
tmp_exe
(
new
framework
::
Executor
(
platform
::
CPUPlace
()));
if
(
!
config_
.
model_dir
.
empty
())
{
// Parameters are saved in separate files sited in
// the specified `dirname`.
inference_program_
=
paddle
::
inference
::
Load
(
static_cast
<
framework
::
Executor
*>
(
tmp_exe
.
get
()),
scope_
.
get
(),
config_
.
model_dir
);
}
else
if
(
!
config_
.
prog_file
.
empty
()
&&
!
config_
.
param_file
.
empty
())
{
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
inference_program_
=
paddle
::
inference
::
Load
(
static_cast
<
framework
::
Executor
*>
(
tmp_exe
.
get
()),
scope_
.
get
(),
config_
.
prog_file
,
config_
.
param_file
);
}
else
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"not valid model path '%s' or program path '%s'."
,
config_
.
model_dir
,
config_
.
param_file
);
return
false
;
}
return
true
;
}
std
::
unique_ptr
<
PaddlePredictor
>
AnalysisPredictor
::
Clone
()
{
auto
*
x
=
new
AnalysisPredictor
(
config_
);
x
->
Init
(
scope_
,
inference_program_
);
return
std
::
unique_ptr
<
PaddlePredictor
>
(
x
);
}
template
<
>
template
<
>
std
::
unique_ptr
<
PaddlePredictor
>
CreatePaddlePredictor
<
contrib
::
AnalysisConfig
>
(
std
::
unique_ptr
<
PaddlePredictor
>
CreatePaddlePredictor
<
contrib
::
AnalysisConfig
>
(
const
contrib
::
AnalysisConfig
&
config
)
{
const
contrib
::
AnalysisConfig
&
config
)
{
return
CreatePaddlePredictor
<
contrib
::
AnalysisConfig
,
return
CreatePaddlePredictor
<
contrib
::
AnalysisConfig
,
PaddleEngineKind
::
kAnalysis
>
(
config
);
PaddleEngineKind
::
kAnalysis
>
(
config
);
}
}
...
...
paddle/fluid/inference/api/analysis_predictor.h
浏览文件 @
a6fbf7ec
...
@@ -12,42 +12,81 @@
...
@@ -12,42 +12,81 @@
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#pragma once
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h"
namespace
paddle
{
namespace
paddle
{
using
inference
::
analysis
::
Argument
;
using
inference
::
analysis
::
Argument
;
using
inference
::
analysis
::
Analyzer
;
using
inference
::
analysis
::
Analyzer
;
using
framework
::
proto
::
ProgramDesc
;
using
framework
::
proto
::
ProgramDesc
;
using
framework
::
NaiveExecutor
;
using
contrib
::
AnalysisConfig
;
/* This predictor is based on the original native predictor with IR and Analysis
/* This predictor is based on the original native predictor with IR and Analysis
* support. It will optimize IR and Parameters in the runtime.
* support. It will optimize IR and Parameters in the runtime.
* TODO(Superjomn) Replace the Navive predictor?
* TODO(Superjomn) Replace the Navive predictor?
*/
*/
class
AnalysisPredictor
:
public
Native
PaddlePredictor
{
class
AnalysisPredictor
:
public
PaddlePredictor
{
public:
public:
explicit
AnalysisPredictor
(
const
contrib
::
AnalysisConfig
&
config
)
explicit
AnalysisPredictor
(
const
AnalysisConfig
&
config
)
:
config_
(
config
)
{}
:
NativePaddlePredictor
(
config
),
config_
(
config
)
{}
bool
Init
(
const
std
::
shared_ptr
<
framework
::
Scope
>&
parent_scope
);
bool
Init
(
const
std
::
shared_ptr
<
framework
::
Scope
>
&
parent_scope
,
const
std
::
shared_ptr
<
framework
::
ProgramDesc
>
&
program
=
nullptr
);
bool
Run
(
const
std
::
vector
<
PaddleTensor
>&
inputs
,
bool
Run
(
const
std
::
vector
<
PaddleTensor
>
&
inputs
,
std
::
vector
<
PaddleTensor
>*
output_data
,
std
::
vector
<
PaddleTensor
>
*
output_data
,
int
batch_size
=
-
1
)
override
{
int
batch_size
=
-
1
)
override
;
return
NativePaddlePredictor
::
Run
(
inputs
,
output_data
,
batch_size
);
}
std
::
unique_ptr
<
ZeroCopyTensor
>
GetInputTensor
(
const
std
::
string
&
name
)
override
;
std
::
unique_ptr
<
ZeroCopyTensor
>
GetOutputTensor
(
const
std
::
string
&
name
)
override
;
bool
ZeroCopyRun
()
override
;
void
PrepareFeedFetch
();
void
OptimizeInferenceProgram
();
void
OptimizeInferenceProgram
();
Argument
&
analysis_argument
()
{
return
argument_
;
}
Argument
&
analysis_argument
()
{
return
argument_
;
}
std
::
unique_ptr
<
PaddlePredictor
>
Clone
()
override
;
framework
::
Scope
*
scope
()
{
return
executor_
->
scope
();
}
framework
::
ProgramDesc
&
program
()
{
return
*
inference_program_
;
}
protected:
bool
LoadProgramDesc
();
bool
SetFeed
(
const
std
::
vector
<
PaddleTensor
>
&
input_datas
,
framework
::
Scope
*
scope
);
bool
GetFetch
(
std
::
vector
<
PaddleTensor
>
*
output_data
,
framework
::
Scope
*
scope
);
template
<
typename
T
>
void
GetFetchOne
(
const
framework
::
LoDTensor
&
fetchs
,
PaddleTensor
*
output_data
);
private:
private:
contrib
::
AnalysisConfig
config_
;
contrib
::
AnalysisConfig
config_
;
Argument
argument_
;
Argument
argument_
;
std
::
unique_ptr
<
NaiveExecutor
>
executor_
;
platform
::
Place
place_
;
std
::
shared_ptr
<
framework
::
Scope
>
scope_
;
framework
::
Scope
*
sub_scope_
{
nullptr
};
std
::
shared_ptr
<
framework
::
ProgramDesc
>
inference_program_
;
std
::
vector
<
framework
::
OpDesc
*>
feeds_
;
std
::
map
<
std
::
string
,
size_t
>
feed_names_
;
std
::
vector
<
framework
::
OpDesc
*>
fetchs_
;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, so cache them.
std
::
vector
<
framework
::
LoDTensor
>
feed_tensors_
;
};
};
}
// namespace paddle
}
// namespace paddle
paddle/fluid/inference/api/analysis_predictor_tester.cc
0 → 100644
浏览文件 @
a6fbf7ec
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
DEFINE_string
(
dirname
,
""
,
"dirname to tests."
);
namespace
paddle
{
namespace
inference
{
using
contrib
::
AnalysisConfig
;
TEST
(
AnalysisPredictor
,
ZeroCopy
)
{
AnalysisConfig
config
;
config
.
model_dir
=
FLAGS_dirname
+
"/word2vec.inference.model"
;
config
.
use_feed_fetch_ops
=
false
;
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
,
PaddleEngineKind
::
kAnalysis
>
(
config
);
auto
w0
=
predictor
->
GetInputTensor
(
"firstw"
);
auto
w1
=
predictor
->
GetInputTensor
(
"secondw"
);
auto
w2
=
predictor
->
GetInputTensor
(
"thirdw"
);
auto
w3
=
predictor
->
GetInputTensor
(
"forthw"
);
w0
->
Reshape
({
4
,
1
});
w1
->
Reshape
({
4
,
1
});
w2
->
Reshape
({
4
,
1
});
w3
->
Reshape
({
4
,
1
});
auto
*
w0_data
=
w0
->
mutable_data
<
int64_t
>
(
PaddlePlace
::
kCPU
);
auto
*
w1_data
=
w1
->
mutable_data
<
int64_t
>
(
PaddlePlace
::
kCPU
);
auto
*
w2_data
=
w2
->
mutable_data
<
int64_t
>
(
PaddlePlace
::
kCPU
);
auto
*
w3_data
=
w3
->
mutable_data
<
int64_t
>
(
PaddlePlace
::
kCPU
);
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
w0_data
[
i
]
=
i
;
w1_data
[
i
]
=
i
;
w2_data
[
i
]
=
i
;
w3_data
[
i
]
=
i
;
}
predictor
->
ZeroCopyRun
();
auto
out
=
predictor
->
GetOutputTensor
(
"fc_1.tmp_2"
);
PaddlePlace
place
;
int
size
=
0
;
auto
*
out_data
=
out
->
data
<
float
>
(
&
place
,
&
size
);
LOG
(
INFO
)
<<
"output size: "
<<
size
/
sizeof
(
float
);
LOG
(
INFO
)
<<
"output_data: "
<<
out_data
;
}
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/api/api.cc
浏览文件 @
a6fbf7ec
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
//
you may not use this file except in compliance with the License.
// Licensed under the Apache License, Version 2.0 (the "License");
You may obtain a copy of the License at
// you may not use this file except in compliance with the License.
http://www.apache.org/licenses/LICENSE-2.0
// You may obtain a copy of the License at
Unless required by applicable law or agreed to in writing, software
//
distributed under the License is distributed on an "AS IS" BASIS,
// http://www.apache.org/licenses/LICENSE-2.0
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//
See the License for the specific language governing permissions and
// Unless required by applicable law or agreed to in writing, software
limitations under the License. */
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle_inference_api.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -26,7 +32,7 @@ int PaddleDtypeSize(PaddleDType dtype) {
...
@@ -26,7 +32,7 @@ int PaddleDtypeSize(PaddleDType dtype) {
}
}
}
}
PaddleBuf
::
PaddleBuf
(
PaddleBuf
&&
other
)
PaddleBuf
::
PaddleBuf
(
PaddleBuf
&&
other
)
:
data_
(
other
.
data_
),
:
data_
(
other
.
data_
),
length_
(
other
.
length_
),
length_
(
other
.
length_
),
memory_owned_
(
other
.
memory_owned_
)
{
memory_owned_
(
other
.
memory_owned_
)
{
...
@@ -35,9 +41,9 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
...
@@ -35,9 +41,9 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
other
.
length_
=
0
;
other
.
length_
=
0
;
}
}
PaddleBuf
::
PaddleBuf
(
const
PaddleBuf
&
other
)
{
*
this
=
other
;
}
PaddleBuf
::
PaddleBuf
(
const
PaddleBuf
&
other
)
{
*
this
=
other
;
}
PaddleBuf
&
PaddleBuf
::
operator
=
(
const
PaddleBuf
&
other
)
{
PaddleBuf
&
PaddleBuf
::
operator
=
(
const
PaddleBuf
&
other
)
{
if
(
!
other
.
memory_owned_
)
{
if
(
!
other
.
memory_owned_
)
{
data_
=
other
.
data_
;
data_
=
other
.
data_
;
length_
=
other
.
length_
;
length_
=
other
.
length_
;
...
@@ -51,7 +57,7 @@ PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
...
@@ -51,7 +57,7 @@ PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
return
*
this
;
return
*
this
;
}
}
PaddleBuf
&
PaddleBuf
::
operator
=
(
PaddleBuf
&&
other
)
{
PaddleBuf
&
PaddleBuf
::
operator
=
(
PaddleBuf
&&
other
)
{
// only the buffer with external memory can be copied
// only the buffer with external memory can be copied
data_
=
other
.
data_
;
data_
=
other
.
data_
;
length_
=
other
.
length_
;
length_
=
other
.
length_
;
...
@@ -75,7 +81,7 @@ void PaddleBuf::Resize(size_t length) {
...
@@ -75,7 +81,7 @@ void PaddleBuf::Resize(size_t length) {
}
}
}
}
void
PaddleBuf
::
Reset
(
void
*
data
,
size_t
length
)
{
void
PaddleBuf
::
Reset
(
void
*
data
,
size_t
length
)
{
Free
();
Free
();
memory_owned_
=
false
;
memory_owned_
=
false
;
data_
=
data
;
data_
=
data
;
...
@@ -85,7 +91,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
...
@@ -85,7 +91,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
void
PaddleBuf
::
Free
()
{
void
PaddleBuf
::
Free
()
{
if
(
memory_owned_
&&
data_
)
{
if
(
memory_owned_
&&
data_
)
{
PADDLE_ENFORCE_GT
(
length_
,
0
);
PADDLE_ENFORCE_GT
(
length_
,
0
);
free
(
static_cast
<
char
*>
(
data_
));
free
(
static_cast
<
char
*>
(
data_
));
data_
=
nullptr
;
data_
=
nullptr
;
length_
=
0
;
length_
=
0
;
}
}
...
...
paddle/fluid/inference/api/api_impl.cc
浏览文件 @
a6fbf7ec
...
@@ -145,7 +145,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
...
@@ -145,7 +145,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
VLOG
(
4
)
<<
"Run prepared context"
;
VLOG
(
4
)
<<
"Run prepared context"
;
executor_
->
RunPreparedContext
(
ctx_
.
get
(),
scope
,
executor_
->
RunPreparedContext
(
ctx_
.
get
(),
scope
,
false
,
/* don't create local scope each time*/
false
,
/* don't create local scope each time*/
false
/* don't create variable ea
t
ch time */
);
false
/* don't create variable each time */
);
VLOG
(
4
)
<<
"Finish prepared context"
;
VLOG
(
4
)
<<
"Finish prepared context"
;
// get fetch variable
// get fetch variable
if
(
!
GetFetch
(
output_data
,
scope
))
{
if
(
!
GetFetch
(
output_data
,
scope
))
{
...
...
paddle/fluid/inference/api/api_impl.h
浏览文件 @
a6fbf7ec
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#pragma once
#pragma once
...
@@ -30,6 +30,8 @@
...
@@ -30,6 +30,8 @@
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
...
@@ -52,6 +54,8 @@ class NativePaddlePredictor : public PaddlePredictor {
...
@@ -52,6 +54,8 @@ class NativePaddlePredictor : public PaddlePredictor {
~
NativePaddlePredictor
()
override
;
~
NativePaddlePredictor
()
override
;
framework
::
Scope
*
scope
()
{
return
sub_scope_
?
sub_scope_
:
scope_
.
get
();
}
protected:
protected:
bool
SetFeed
(
const
std
::
vector
<
PaddleTensor
>
&
input_datas
,
bool
SetFeed
(
const
std
::
vector
<
PaddleTensor
>
&
input_datas
,
framework
::
Scope
*
scope
);
framework
::
Scope
*
scope
);
...
...
paddle/fluid/inference/api/api_impl_tester.cc
浏览文件 @
a6fbf7ec
...
@@ -43,7 +43,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
...
@@ -43,7 +43,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
NativeConfig
GetConfig
()
{
NativeConfig
GetConfig
()
{
NativeConfig
config
;
NativeConfig
config
;
config
.
model_dir
=
FLAGS_dirname
+
"word2vec.inference.model"
;
config
.
model_dir
=
FLAGS_dirname
+
"
/
word2vec.inference.model"
;
LOG
(
INFO
)
<<
"dirname "
<<
config
.
model_dir
;
LOG
(
INFO
)
<<
"dirname "
<<
config
.
model_dir
;
config
.
fraction_of_gpu_memory
=
0.15
;
config
.
fraction_of_gpu_memory
=
0.15
;
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
...
@@ -110,7 +110,7 @@ void MainImageClassification(bool use_gpu) {
...
@@ -110,7 +110,7 @@ void MainImageClassification(bool use_gpu) {
NativeConfig
config
=
GetConfig
();
NativeConfig
config
=
GetConfig
();
config
.
use_gpu
=
use_gpu
;
config
.
use_gpu
=
use_gpu
;
config
.
model_dir
=
config
.
model_dir
=
FLAGS_dirname
+
"image_classification_resnet.inference.model"
;
FLAGS_dirname
+
"
/
image_classification_resnet.inference.model"
;
const
bool
is_combined
=
false
;
const
bool
is_combined
=
false
;
std
::
vector
<
std
::
vector
<
int64_t
>>
feed_target_shapes
=
std
::
vector
<
std
::
vector
<
int64_t
>>
feed_target_shapes
=
...
@@ -214,7 +214,7 @@ void MainThreadsImageClassification(bool use_gpu) {
...
@@ -214,7 +214,7 @@ void MainThreadsImageClassification(bool use_gpu) {
NativeConfig
config
=
GetConfig
();
NativeConfig
config
=
GetConfig
();
config
.
use_gpu
=
use_gpu
;
config
.
use_gpu
=
use_gpu
;
config
.
model_dir
=
config
.
model_dir
=
FLAGS_dirname
+
"image_classification_resnet.inference.model"
;
FLAGS_dirname
+
"
/
image_classification_resnet.inference.model"
;
auto
main_predictor
=
CreatePaddlePredictor
<
NativeConfig
>
(
config
);
auto
main_predictor
=
CreatePaddlePredictor
<
NativeConfig
>
(
config
);
std
::
vector
<
framework
::
LoDTensor
>
jobs
(
num_jobs
);
std
::
vector
<
framework
::
LoDTensor
>
jobs
(
num_jobs
);
...
...
paddle/fluid/inference/api/details/zero_copy_tensor.cc
0 → 100644
浏览文件 @
a6fbf7ec
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
void
ZeroCopyTensor
::
Reshape
(
const
std
::
vector
<
int
>
&
shape
)
{
PADDLE_ENFORCE
(
!
name_
.
empty
(),
"Need to SetName first, so that the corresponding tensor can "
"be retrieved."
);
PADDLE_ENFORCE
(
input_or_output_
,
"Can't reshape the output tensor, it is readonly"
);
PADDLE_ENFORCE
(
scope_
);
auto
*
scope
=
static_cast
<
framework
::
Scope
*>
(
scope_
);
auto
*
var
=
scope
->
FindVar
(
name_
);
PADDLE_ENFORCE
(
var
,
"No tensor called [%s] in the runtime scope"
,
name_
);
auto
*
tensor
=
var
->
GetMutable
<
framework
::
LoDTensor
>
();
tensor
->
Resize
(
framework
::
make_ddim
(
shape
));
}
template
<
typename
T
>
T
*
ZeroCopyTensor
::
mutable_data
(
PaddlePlace
place
)
{
auto
*
tensor
=
static_cast
<
framework
::
LoDTensor
*>
(
FindTensor
());
switch
(
static_cast
<
int
>
(
place
))
{
case
static_cast
<
int
>
(
PaddlePlace
::
kCPU
):
{
return
tensor
->
mutable_data
<
T
>
(
platform
::
CPUPlace
());
}
case
static_cast
<
int
>
(
PaddlePlace
::
kGPU
):
{
return
tensor
->
mutable_data
<
T
>
(
platform
::
CUDAPlace
());
}
default:
PADDLE_THROW
(
"Unsupported place: %d"
,
static_cast
<
int
>
(
place
));
break
;
}
return
nullptr
;
}
template
<
typename
T
>
T
*
ZeroCopyTensor
::
data
(
PaddlePlace
*
place
,
int
*
size
)
{
auto
*
tensor
=
static_cast
<
framework
::
LoDTensor
*>
(
FindTensor
());
auto
*
res
=
tensor
->
data
<
T
>
();
if
(
platform
::
is_cpu_place
(
tensor
->
place
()))
{
*
place
=
PaddlePlace
::
kCPU
;
}
else
if
(
platform
::
is_gpu_place
(
tensor
->
place
()))
{
*
place
=
PaddlePlace
::
kGPU
;
}
else
{
*
place
=
PaddlePlace
::
kUNK
;
}
*
size
=
tensor
->
numel
();
return
res
;
}
template
float
*
ZeroCopyTensor
::
data
<
float
>(
PaddlePlace
*
place
,
int
*
size
);
template
int64_t
*
ZeroCopyTensor
::
data
<
int64_t
>(
PaddlePlace
*
place
,
int
*
size
);
template
float
*
ZeroCopyTensor
::
mutable_data
<
float
>(
PaddlePlace
place
);
template
int64_t
*
ZeroCopyTensor
::
mutable_data
<
int64_t
>(
PaddlePlace
place
);
void
*
ZeroCopyTensor
::
FindTensor
()
const
{
PADDLE_ENFORCE
(
!
name_
.
empty
(),
"Need to SetName first, so that the corresponding tensor can "
"be retrieved."
);
PADDLE_ENFORCE
(
scope_
);
auto
*
scope
=
static_cast
<
framework
::
Scope
*>
(
scope_
);
auto
*
var
=
scope
->
FindVar
(
name_
);
PADDLE_ENFORCE
(
var
,
"No tensor called [%s] in the runtime scope"
,
name_
);
auto
*
tensor
=
var
->
GetMutable
<
framework
::
LoDTensor
>
();
return
tensor
;
}
std
::
vector
<
int64_t
>
ZeroCopyTensor
::
shape
()
{
auto
*
tensor
=
static_cast
<
framework
::
LoDTensor
*>
(
FindTensor
());
PADDLE_ENFORCE
(
tensor
,
"not found tensor called %s in the scope"
,
name_
);
return
framework
::
vectorize
(
tensor
->
dims
());
}
void
ZeroCopyTensor
::
SetLoD
(
const
std
::
vector
<
std
::
vector
<
size_t
>>
&
x
)
{
auto
*
tensor
=
static_cast
<
framework
::
LoDTensor
*>
(
FindTensor
());
framework
::
LoD
lod
;
for
(
auto
&
level
:
x
)
{
lod
.
emplace_back
(
level
);
}
tensor
->
set_lod
(
lod
);
}
std
::
vector
<
std
::
vector
<
size_t
>>
ZeroCopyTensor
::
lod
()
const
{
std
::
vector
<
std
::
vector
<
size_t
>>
res
;
auto
*
tensor
=
static_cast
<
framework
::
LoDTensor
*>
(
FindTensor
());
for
(
auto
&
level
:
tensor
->
lod
())
{
res
.
emplace_back
(
level
);
}
return
res
;
}
}
// namespace paddle
paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
0 → 100644
浏览文件 @
a6fbf7ec
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace
paddle
{
void
ZeroCopyTensor
::
Reshape
(
const
std
::
vector
<
int
>
&
shape
)
{}
template
<
typename
T
>
T
*
ZeroCopyTensor
::
mutable_data
(
PaddlePlace
place
)
{
return
nullptr
;
}
template
<
typename
T
>
T
*
ZeroCopyTensor
::
data
(
PaddlePlace
*
place
,
int
*
size
)
{
return
nullptr
;
}
template
float
*
ZeroCopyTensor
::
data
<
float
>(
PaddlePlace
*
place
,
int
*
size
);
template
int64_t
*
ZeroCopyTensor
::
data
<
int64_t
>(
PaddlePlace
*
place
,
int
*
size
);
template
float
*
ZeroCopyTensor
::
mutable_data
(
PaddlePlace
place
);
template
int64_t
*
ZeroCopyTensor
::
mutable_data
(
PaddlePlace
place
);
void
*
ZeroCopyTensor
::
FindTensor
()
const
{
return
nullptr
;
}
std
::
vector
<
int64_t
>
ZeroCopyTensor
::
shape
()
{
return
{};
}
void
ZeroCopyTensor
::
SetLoD
(
const
std
::
vector
<
std
::
vector
<
size_t
>>
&
x
)
{}
std
::
vector
<
std
::
vector
<
size_t
>>
ZeroCopyTensor
::
lod
()
const
{
return
std
::
vector
<
std
::
vector
<
size_t
>>
();
}
}
// namespace paddle
paddle/fluid/inference/api/helper.h
浏览文件 @
a6fbf7ec
...
@@ -21,8 +21,10 @@
...
@@ -21,8 +21,10 @@
#include <sstream>
#include <sstream>
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/string/printf.h"
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
...
@@ -93,6 +95,20 @@ static void TensorAssignData(PaddleTensor *tensor,
...
@@ -93,6 +95,20 @@ static void TensorAssignData(PaddleTensor *tensor,
}
}
}
}
template
<
typename
T
>
static
int
ZeroCopyTensorAssignData
(
ZeroCopyTensor
*
tensor
,
const
std
::
vector
<
std
::
vector
<
T
>>
&
data
)
{
int
size
{
0
};
auto
*
ptr
=
tensor
->
mutable_data
<
T
>
(
PaddlePlace
::
kCPU
);
int
c
=
0
;
for
(
const
auto
&
f
:
data
)
{
for
(
T
v
:
f
)
{
ptr
[
c
++
]
=
v
;
}
}
return
size
;
}
static
std
::
string
DescribeTensor
(
const
PaddleTensor
&
tensor
)
{
static
std
::
string
DescribeTensor
(
const
PaddleTensor
&
tensor
)
{
std
::
stringstream
os
;
std
::
stringstream
os
;
os
<<
"Tensor ["
<<
tensor
.
name
<<
"]
\n
"
;
os
<<
"Tensor ["
<<
tensor
.
name
<<
"]
\n
"
;
...
@@ -138,5 +154,127 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
...
@@ -138,5 +154,127 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
}
}
}
}
template
<
typename
T
>
std
::
string
LoDTensorSummary
(
const
framework
::
LoDTensor
&
tensor
)
{
std
::
stringstream
ss
;
ss
<<
"
\n
---- tensor ---"
<<
'\n'
;
ss
<<
"lod: ["
;
for
(
const
auto
&
level
:
tensor
.
lod
())
{
ss
<<
"[ "
;
for
(
auto
i
:
level
)
{
ss
<<
i
<<
", "
;
}
ss
<<
"]"
;
}
ss
<<
"]
\n
"
;
ss
<<
"shape: ["
;
int
size
=
1
;
for
(
int
i
=
0
;
i
<
tensor
.
dims
().
size
();
i
++
)
{
int
dim
=
tensor
.
dims
()[
i
];
ss
<<
dim
<<
", "
;
size
*=
dim
;
}
ss
<<
"]
\n
"
;
ss
<<
"data: "
;
for
(
int
i
=
0
;
i
<
std
::
min
(
20
,
size
);
i
++
)
{
ss
<<
tensor
.
data
<
T
>
()[
i
]
<<
" "
;
}
ss
<<
"
\n
"
;
return
ss
.
str
();
}
static
bool
CompareLoD
(
const
framework
::
LoD
&
a
,
const
framework
::
LoD
&
b
)
{
if
(
a
.
size
()
!=
b
.
size
())
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"lod size not match %d != %d"
,
a
.
size
(),
b
.
size
());
return
false
;
}
for
(
size_t
i
=
0
;
i
<
a
.
size
();
i
++
)
{
auto
&
al
=
a
[
i
];
auto
&
bl
=
b
[
i
];
if
(
al
.
size
()
!=
bl
.
size
())
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"level size %d != %d"
,
al
.
size
(),
bl
.
size
());
return
false
;
}
}
return
true
;
}
static
bool
CompareShape
(
const
std
::
vector
<
int64_t
>
&
a
,
const
std
::
vector
<
int64_t
>
&
b
)
{
if
(
a
.
size
()
!=
b
.
size
())
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"shape size not match %d != %d"
,
a
.
size
(),
b
.
size
());
return
false
;
}
for
(
size_t
i
=
0
;
i
<
a
.
size
();
i
++
)
{
if
(
a
[
i
]
!=
b
[
i
])
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"shape %d-th element not match %d != %d"
,
i
,
a
[
i
],
b
[
i
]);
return
false
;
}
}
return
true
;
}
static
bool
CompareTensorData
(
const
framework
::
LoDTensor
&
a
,
const
framework
::
LoDTensor
&
b
)
{
auto
a_shape
=
framework
::
vectorize
(
a
.
dims
());
auto
b_shape
=
framework
::
vectorize
(
b
.
dims
());
size_t
a_size
=
std
::
accumulate
(
a_shape
.
begin
(),
a_shape
.
end
(),
1
,
[](
int
a
,
int
b
)
{
return
a
*
b
;
});
size_t
b_size
=
std
::
accumulate
(
b_shape
.
begin
(),
b_shape
.
end
(),
1
,
[](
int
a
,
int
b
)
{
return
a
*
b
;
});
if
(
a_size
!=
b_size
)
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"tensor data size not match, %d != %d"
,
a_size
,
b_size
);
}
for
(
size_t
i
=
0
;
i
<
a_size
;
i
++
)
{
if
(
a
.
type
()
==
typeid
(
float
))
{
const
auto
*
a_data
=
a
.
data
<
float
>
();
const
auto
*
b_data
=
b
.
data
<
float
>
();
if
(
std
::
abs
(
a_data
[
i
]
-
b_data
[
i
])
>
1e-3
)
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"tensor data %d-th element not match, %f != %f"
,
i
,
a_data
[
i
],
b_data
[
i
]);
return
false
;
}
}
else
if
(
a
.
type
()
==
typeid
(
int64_t
))
{
const
auto
*
a_data
=
a
.
data
<
int64_t
>
();
const
auto
*
b_data
=
b
.
data
<
int64_t
>
();
if
(
std
::
abs
(
a_data
[
i
]
-
b_data
[
i
])
>
1e-3
)
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"tensor data %d-th element not match, %f != %f"
,
i
,
a_data
[
i
],
b_data
[
i
]);
return
false
;
}
}
}
return
true
;
}
static
bool
CompareTensor
(
const
framework
::
LoDTensor
&
a
,
const
framework
::
LoDTensor
&
b
)
{
if
(
!
CompareLoD
(
a
.
lod
(),
b
.
lod
()))
{
return
false
;
}
if
(
!
CompareShape
(
framework
::
vectorize
(
a
.
dims
()),
framework
::
vectorize
(
b
.
dims
())))
{
return
false
;
}
if
(
!
CompareTensorData
(
a
,
b
))
{
return
false
;
}
return
true
;
}
}
// namespace inference
}
// namespace inference
}
// namespace paddle
}
// namespace paddle
paddle/fluid/inference/api/paddle_inference_api.h
浏览文件 @
a6fbf7ec
...
@@ -101,6 +101,40 @@ struct PaddleTensor {
...
@@ -101,6 +101,40 @@ struct PaddleTensor {
std
::
vector
<
std
::
vector
<
size_t
>>
lod
;
// Tensor+LoD equals LoDTensor
std
::
vector
<
std
::
vector
<
size_t
>>
lod
;
// Tensor+LoD equals LoDTensor
};
};
enum
class
PaddlePlace
{
kUNK
=
-
1
,
kCPU
,
kGPU
};
// Tensor without copy, currently only supports AnalysisPredictor.
class
ZeroCopyTensor
{
public:
void
Reshape
(
const
std
::
vector
<
int
>&
shape
);
// Get the memory in CPU or GPU with specific data type, should Reshape first
// to tell the data size.
// Once can directly call this data to feed the data.
// This is for write the input tensor.
template
<
typename
T
>
T
*
mutable_data
(
PaddlePlace
place
);
// Get the memory directly, will return the place and memory size by pointer.
// This is for reading the output tensor.
template
<
typename
T
>
T
*
data
(
PaddlePlace
*
place
,
int
*
size
);
std
::
vector
<
int64_t
>
shape
();
void
SetLoD
(
const
std
::
vector
<
std
::
vector
<
size_t
>>&
x
);
std
::
vector
<
std
::
vector
<
size_t
>>
lod
()
const
;
protected:
ZeroCopyTensor
(
void
*
scope
)
:
scope_
{
scope
}
{}
void
SetName
(
const
std
::
string
&
name
)
{
name_
=
name
;
}
void
*
FindTensor
()
const
;
private:
std
::
string
name_
;
bool
input_or_output_
;
friend
class
AnalysisPredictor
;
void
*
scope_
{
nullptr
};
};
/*
/*
* A simple Inference API for Paddle.
* A simple Inference API for Paddle.
*/
*/
...
@@ -120,6 +154,19 @@ class PaddlePredictor {
...
@@ -120,6 +154,19 @@ class PaddlePredictor {
std
::
vector
<
PaddleTensor
>*
output_data
,
std
::
vector
<
PaddleTensor
>*
output_data
,
int
batch_size
=
-
1
)
=
0
;
int
batch_size
=
-
1
)
=
0
;
// Zero copy input and output optimization.
// Get the input or output tensors, and operate on their memory directly,
// without copy.
virtual
std
::
unique_ptr
<
ZeroCopyTensor
>
GetInputTensor
(
const
std
::
string
&
name
)
{
return
nullptr
;
}
virtual
std
::
unique_ptr
<
ZeroCopyTensor
>
GetOutputTensor
(
const
std
::
string
&
name
)
{
return
nullptr
;
}
virtual
bool
ZeroCopyRun
()
{
return
false
;
}
// Clone a predictor that share the model weights, the Cloned predictor should
// Clone a predictor that share the model weights, the Cloned predictor should
// be thread-safe.
// be thread-safe.
virtual
std
::
unique_ptr
<
PaddlePredictor
>
Clone
()
=
0
;
virtual
std
::
unique_ptr
<
PaddlePredictor
>
Clone
()
=
0
;
...
@@ -218,7 +265,12 @@ struct AnalysisConfig : public NativeConfig {
...
@@ -218,7 +265,12 @@ struct AnalysisConfig : public NativeConfig {
IrPassMode
ir_mode
{
IrPassMode
::
kExclude
};
IrPassMode
ir_mode
{
IrPassMode
::
kExclude
};
std
::
vector
<
std
::
string
>
ir_passes
;
std
::
vector
<
std
::
string
>
ir_passes
;
// NOTE this is just for internal development, please not use it.
// NOT stable yet.
bool
use_feed_fetch_ops
{
true
};
// NOTE this is just for internal development, please not use it. NOT
// stable
// yet.
bool
_use_mkldnn
{
false
};
bool
_use_mkldnn
{
false
};
};
};
...
...
paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
浏览文件 @
a6fbf7ec
...
@@ -18,6 +18,8 @@ namespace paddle {
...
@@ -18,6 +18,8 @@ namespace paddle {
namespace
inference
{
namespace
inference
{
namespace
analysis
{
namespace
analysis
{
using
contrib
::
AnalysisConfig
;
struct
DataRecord
{
struct
DataRecord
{
std
::
vector
<
int64_t
>
data
;
std
::
vector
<
int64_t
>
data
;
std
::
vector
<
size_t
>
lod
;
std
::
vector
<
size_t
>
lod
;
...
@@ -78,6 +80,7 @@ struct DataRecord {
...
@@ -78,6 +80,7 @@ struct DataRecord {
}
}
}
}
}
}
DataRecord
NextBatch
()
{
DataRecord
NextBatch
()
{
DataRecord
data
;
DataRecord
data
;
data
.
data
=
batched_datas
[
batch_iter
];
data
.
data
=
batched_datas
[
batch_iter
];
...
@@ -155,7 +158,9 @@ TEST(Analyzer_LAC, fuse_statis) {
...
@@ -155,7 +158,9 @@ TEST(Analyzer_LAC, fuse_statis) {
SetConfig
(
&
cfg
);
SetConfig
(
&
cfg
);
int
num_ops
;
int
num_ops
;
auto
fuse_statis
=
GetFuseStatis
(
cfg
,
&
num_ops
);
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
auto
fuse_statis
=
GetFuseStatis
(
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
()),
&
num_ops
);
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_gru_fuse"
));
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_gru_fuse"
));
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
1
);
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
1
);
...
...
paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
浏览文件 @
a6fbf7ec
...
@@ -16,6 +16,7 @@
...
@@ -16,6 +16,7 @@
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
using
contrib
::
AnalysisConfig
;
struct
DataRecord
{
struct
DataRecord
{
std
::
vector
<
std
::
vector
<
int64_t
>>
word_data_all
,
mention_data_all
;
std
::
vector
<
std
::
vector
<
int64_t
>>
word_data_all
,
mention_data_all
;
...
@@ -145,7 +146,9 @@ TEST(Analyzer_Chinese_ner, fuse_statis) {
...
@@ -145,7 +146,9 @@ TEST(Analyzer_Chinese_ner, fuse_statis) {
SetConfig
(
&
cfg
);
SetConfig
(
&
cfg
);
int
num_ops
;
int
num_ops
;
auto
fuse_statis
=
GetFuseStatis
(
cfg
,
&
num_ops
);
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
auto
fuse_statis
=
GetFuseStatis
(
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
()),
&
num_ops
);
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_gru_fuse"
));
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_gru_fuse"
));
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
1
);
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
1
);
...
...
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
浏览文件 @
a6fbf7ec
...
@@ -12,12 +12,16 @@
...
@@ -12,12 +12,16 @@
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_bool
(
with_precision_check
,
true
,
"turn on test"
);
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
using
namespace
framework
;
// NOLINT
using
namespace
framework
;
// NOLINT
using
namespace
contrib
;
// NOLINT
struct
DataRecord
{
struct
DataRecord
{
std
::
vector
<
std
::
vector
<
std
::
vector
<
float
>>>
link_step_data_all
;
std
::
vector
<
std
::
vector
<
std
::
vector
<
float
>>>
link_step_data_all
;
...
@@ -29,10 +33,12 @@ struct DataRecord {
...
@@ -29,10 +33,12 @@ struct DataRecord {
size_t
batch_iter
{
0
};
size_t
batch_iter
{
0
};
size_t
batch_size
{
1
};
size_t
batch_size
{
1
};
DataRecord
()
=
default
;
DataRecord
()
=
default
;
explicit
DataRecord
(
const
std
::
string
&
path
,
int
batch_size
=
1
)
explicit
DataRecord
(
const
std
::
string
&
path
,
int
batch_size
=
1
)
:
batch_size
(
batch_size
)
{
:
batch_size
(
batch_size
)
{
Load
(
path
);
Load
(
path
);
}
}
DataRecord
NextBatch
()
{
DataRecord
NextBatch
()
{
DataRecord
data
;
DataRecord
data
;
size_t
batch_end
=
batch_iter
+
batch_size
;
size_t
batch_end
=
batch_iter
+
batch_size
;
...
@@ -101,6 +107,7 @@ struct DataRecord {
...
@@ -101,6 +107,7 @@ struct DataRecord {
num_samples
=
num_lines
;
num_samples
=
num_lines
;
}
}
};
};
void
PrepareInputs
(
std
::
vector
<
PaddleTensor
>
*
input_slots
,
DataRecord
*
data
,
void
PrepareInputs
(
std
::
vector
<
PaddleTensor
>
*
input_slots
,
DataRecord
*
data
,
int
batch_size
)
{
int
batch_size
)
{
PaddleTensor
lod_attention_tensor
,
init_zero_tensor
,
lod_tensor_tensor
,
PaddleTensor
lod_attention_tensor
,
init_zero_tensor
,
lod_tensor_tensor
,
...
@@ -149,7 +156,55 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
...
@@ -149,7 +156,55 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
}
}
}
void
SetConfig
(
contrib
::
AnalysisConfig
*
cfg
)
{
void
PrepareZeroCopyInputs
(
ZeroCopyTensor
*
lod_attention_tensor
,
ZeroCopyTensor
*
cell_init_tensor
,
ZeroCopyTensor
*
data_tensor
,
ZeroCopyTensor
*
hidden_init_tensor
,
ZeroCopyTensor
*
week_tensor
,
ZeroCopyTensor
*
minute_tensor
,
DataRecord
*
data_record
,
int
batch_size
)
{
auto
one_batch
=
data_record
->
NextBatch
();
std
::
vector
<
int
>
rnn_link_data_shape
(
{
static_cast
<
int
>
(
one_batch
.
rnn_link_data
.
size
()),
static_cast
<
int
>
(
one_batch
.
rnn_link_data
.
front
().
size
())});
lod_attention_tensor
->
Reshape
({
1
,
2
});
lod_attention_tensor
->
SetLoD
({
one_batch
.
lod1
,
one_batch
.
lod2
});
cell_init_tensor
->
Reshape
({
batch_size
,
15
});
cell_init_tensor
->
SetLoD
({
one_batch
.
lod3
});
hidden_init_tensor
->
Reshape
({
batch_size
,
15
});
hidden_init_tensor
->
SetLoD
({
one_batch
.
lod3
});
data_tensor
->
Reshape
(
rnn_link_data_shape
);
data_tensor
->
SetLoD
({
one_batch
.
lod1
});
week_tensor
->
Reshape
(
{
static_cast
<
int
>
(
one_batch
.
rnn_week_datas
.
size
()),
static_cast
<
int
>
(
one_batch
.
rnn_week_datas
.
front
().
size
())});
week_tensor
->
SetLoD
({
one_batch
.
lod3
});
minute_tensor
->
Reshape
(
{
static_cast
<
int
>
(
one_batch
.
rnn_minute_datas
.
size
()),
static_cast
<
int
>
(
one_batch
.
rnn_minute_datas
.
front
().
size
())});
minute_tensor
->
SetLoD
({
one_batch
.
lod3
});
// assign data
float
arr0
[]
=
{
0
,
0
};
std
::
vector
<
float
>
zeros
(
batch_size
*
15
,
0
);
std
::
copy_n
(
arr0
,
2
,
lod_attention_tensor
->
mutable_data
<
float
>
(
PaddlePlace
::
kCPU
));
std
::
copy_n
(
arr0
,
2
,
data_tensor
->
mutable_data
<
float
>
(
PaddlePlace
::
kCPU
));
std
::
copy_n
(
zeros
.
begin
(),
zeros
.
size
(),
cell_init_tensor
->
mutable_data
<
float
>
(
PaddlePlace
::
kCPU
));
std
::
copy_n
(
zeros
.
begin
(),
zeros
.
size
(),
hidden_init_tensor
->
mutable_data
<
float
>
(
PaddlePlace
::
kCPU
));
ZeroCopyTensorAssignData
(
data_tensor
,
one_batch
.
rnn_link_data
);
ZeroCopyTensorAssignData
(
week_tensor
,
one_batch
.
rnn_week_datas
);
ZeroCopyTensorAssignData
(
minute_tensor
,
one_batch
.
rnn_minute_datas
);
}
void
SetConfig
(
AnalysisConfig
*
cfg
)
{
cfg
->
prog_file
=
FLAGS_infer_model
+
"/__model__"
;
cfg
->
prog_file
=
FLAGS_infer_model
+
"/__model__"
;
cfg
->
param_file
=
FLAGS_infer_model
+
"/param"
;
cfg
->
param_file
=
FLAGS_infer_model
+
"/param"
;
cfg
->
use_gpu
=
false
;
cfg
->
use_gpu
=
false
;
...
@@ -187,7 +242,9 @@ TEST(Analyzer_rnn1, fuse_statis) {
...
@@ -187,7 +242,9 @@ TEST(Analyzer_rnn1, fuse_statis) {
SetConfig
(
&
cfg
);
SetConfig
(
&
cfg
);
int
num_ops
;
int
num_ops
;
auto
fuse_statis
=
GetFuseStatis
(
cfg
,
&
num_ops
);
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
auto
fuse_statis
=
GetFuseStatis
(
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
()),
&
num_ops
);
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
1
);
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
1
);
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_nobias_lstm_fuse"
),
2
);
// bi-directional LSTM
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_nobias_lstm_fuse"
),
2
);
// bi-directional LSTM
...
@@ -214,7 +271,229 @@ TEST(Analyzer_rnn1, multi_thread) {
...
@@ -214,7 +271,229 @@ TEST(Analyzer_rnn1, multi_thread) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
4
/* num_threads */
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
}
bool
CompareTensors
(
framework
::
Scope
&
a_scope
,
framework
::
Scope
&
b_scope
,
const
std
::
vector
<
std
::
string
>
&
tensors
)
{
for
(
auto
&
x
:
tensors
)
{
auto
*
a_var
=
a_scope
.
FindVar
(
x
);
auto
*
b_var
=
b_scope
.
FindVar
(
x
);
if
(
a_var
&&
b_var
)
{
if
(
a_var
->
Type
()
==
typeid
(
framework
::
LoDTensor
)
||
a_var
->
Type
()
==
typeid
(
framework
::
Tensor
))
{
LOG
(
INFO
)
<<
"comparing tensor "
<<
x
;
auto
&
a_t
=
a_var
->
Get
<
framework
::
LoDTensor
>
();
auto
&
b_t
=
b_var
->
Get
<
framework
::
LoDTensor
>
();
if
(
!
inference
::
CompareTensor
(
a_t
,
b_t
))
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"tensor %s not match in two scopes"
,
x
);
}
}
else
{
LOG
(
INFO
)
<<
"skip no tensor "
<<
x
;
}
}
else
{
LOG
(
INFO
)
<<
"skip tensor "
<<
x
;
}
}
return
true
;
}
// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
// on the complex RNN1 model.
TEST
(
Analyzer_rnn1
,
ZeroCopy
)
{
AnalysisConfig
config
;
SetConfig
(
&
config
);
config
.
use_feed_fetch_ops
=
false
;
PaddlePlace
place
;
int
output_size
{
0
};
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
,
PaddleEngineKind
::
kAnalysis
>
(
config
);
config
.
use_feed_fetch_ops
=
true
;
auto
native_predictor
=
CreatePaddlePredictor
<
NativeConfig
,
PaddleEngineKind
::
kNative
>
(
config
);
config
.
use_feed_fetch_ops
=
true
;
// the analysis predictor needs feed/fetch.
auto
analysis_predictor
=
CreatePaddlePredictor
<
AnalysisConfig
,
PaddleEngineKind
::
kAnalysis
>
(
config
);
#define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__);
NEW_TENSOR
(
data_lod_attention
);
NEW_TENSOR
(
cell_init
);
NEW_TENSOR
(
data
);
NEW_TENSOR
(
week
);
NEW_TENSOR
(
minute
);
NEW_TENSOR
(
hidden_init
);
// Prepare data for AnalysisPredictor
DataRecord
data
(
FLAGS_infer_data
,
FLAGS_batch_size
);
PrepareZeroCopyInputs
(
data_lod_attention_tensor
.
get
(),
cell_init_tensor
.
get
(),
data_tensor
.
get
(),
hidden_init_tensor
.
get
(),
week_tensor
.
get
(),
minute_tensor
.
get
(),
&
data
,
FLAGS_batch_size
);
// Prepare data for NativePredictor
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
native_inputs
;
SetInput
(
&
native_inputs
);
std
::
vector
<
PaddleTensor
>
native_outputs
;
std
::
vector
<
PaddleTensor
>
analysis_outputs
;
auto
output_tensor
=
predictor
->
GetOutputTensor
(
"final_output.tmp_1"
);
// Run analysis predictor
int
num_ops
;
auto
fuse_statis
=
GetFuseStatis
(
predictor
.
get
(),
&
num_ops
);
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
ASSERT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
1
);
ASSERT_EQ
(
fuse_statis
.
at
(
"fc_nobias_lstm_fuse"
),
2
);
// bi-directional LSTM
ASSERT_EQ
(
fuse_statis
.
at
(
"seq_concat_fc_fuse"
),
1
);
ASSERT_EQ
(
num_ops
,
13
);
// After graph optimization, only 13 operators exists.
Timer
timer
;
double
total_time
{
0
};
double
native_total_time
{
0
};
double
analysis_total_time
{
0.
};
for
(
int
i
=
0
;
i
<
FLAGS_repeat
;
i
++
)
{
timer
.
tic
();
predictor
->
ZeroCopyRun
();
total_time
+=
timer
.
toc
();
}
auto
*
output_data
=
output_tensor
->
data
<
float
>
(
&
place
,
&
output_size
);
ASSERT_GT
(
output_size
,
0
);
// more than one output!
for
(
int
i
=
0
;
i
<
FLAGS_repeat
;
i
++
)
{
// Run native predictor.
timer
.
tic
();
ASSERT_TRUE
(
native_predictor
->
Run
(
native_inputs
.
front
(),
&
native_outputs
));
native_total_time
+=
timer
.
toc
();
}
for
(
int
i
=
0
;
i
<
FLAGS_repeat
;
i
++
)
{
timer
.
tic
();
ASSERT_TRUE
(
analysis_predictor
->
Run
(
native_inputs
.
front
(),
&
analysis_outputs
));
analysis_total_time
+=
timer
.
toc
();
}
if
(
!
FLAGS_with_precision_check
)
{
return
;
}
int
native_output_size
=
VecReduceToInt
(
native_outputs
.
front
().
shape
);
EXPECT_EQ
(
native_output_size
,
output_size
);
// Compare tensors between analysis and zerocopy
auto
*
p0
=
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
());
auto
*
p1
=
static_cast
<
AnalysisPredictor
*>
(
analysis_predictor
.
get
());
auto
*
p2
=
static_cast
<
NativePaddlePredictor
*>
(
native_predictor
.
get
());
std
::
vector
<
std
::
string
>
tensor_names
;
for
(
auto
&
var_desc
:
p0
->
program
().
Block
(
0
).
AllVars
())
{
tensor_names
.
push_back
(
var_desc
->
Name
());
}
LOG
(
INFO
)
<<
"Comparing tensors"
;
ASSERT_TRUE
(
CompareTensors
(
*
p0
->
scope
(),
*
p1
->
scope
(),
{
"final_output.tmp_1"
}));
ASSERT_TRUE
(
CompareTensors
(
*
p0
->
scope
(),
*
p2
->
scope
(),
{
"final_output.tmp_1"
}));
LOG
(
INFO
)
<<
"output1 "
<<
inference
::
LoDTensorSummary
<
float
>
(
p0
->
scope
()
->
FindVar
(
"final_output.tmp_1"
)
->
Get
<
framework
::
LoDTensor
>
());
LOG
(
INFO
)
<<
"output2 "
<<
inference
::
LoDTensorSummary
<
float
>
(
p1
->
scope
()
->
FindVar
(
"final_output.tmp_1"
)
->
Get
<
framework
::
LoDTensor
>
());
LOG
(
INFO
)
<<
"output3 "
<<
inference
::
LoDTensorSummary
<
float
>
(
p2
->
scope
()
->
FindVar
(
"final_output.tmp_1"
)
->
Get
<
framework
::
LoDTensor
>
());
for
(
int
i
=
0
;
i
<
output_size
;
i
++
)
{
LOG
(
INFO
)
<<
output_data
[
i
]
<<
" "
<<
static_cast
<
float
*>
(
native_outputs
.
front
().
data
.
data
())[
i
]
<<
" "
<<
static_cast
<
float
*>
(
analysis_outputs
.
front
().
data
.
data
())[
i
];
EXPECT_NEAR
(
output_data
[
i
],
static_cast
<
float
*>
(
native_outputs
.
front
().
data
.
data
())[
i
],
1e-3
);
}
LOG
(
INFO
)
<<
"batch_size: "
<<
FLAGS_batch_size
;
LOG
(
INFO
)
<<
"zero average time: "
<<
total_time
/
(
FLAGS_repeat
*
FLAGS_batch_size
);
LOG
(
INFO
)
<<
"analysis average time: "
<<
analysis_total_time
/
(
FLAGS_repeat
*
FLAGS_batch_size
);
LOG
(
INFO
)
<<
"native average time: "
<<
native_total_time
/
(
FLAGS_repeat
*
FLAGS_batch_size
);
}
TEST
(
Analyzer_rnn1
,
ZeroCopyMultiThread
)
{
AnalysisConfig
config
;
SetConfig
(
&
config
);
config
.
use_feed_fetch_ops
=
false
;
#define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__);
auto
base_predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
config
);
double
total_time_of_threads
{
0
};
std
::
vector
<
std
::
thread
>
threads
;
std
::
vector
<
std
::
unique_ptr
<
PaddlePredictor
>>
predictors
;
for
(
int
tid
=
0
;
tid
<
FLAGS_num_threads
;
tid
++
)
{
predictors
.
emplace_back
(
CreatePaddlePredictor
<
AnalysisConfig
>
(
config
));
}
for
(
int
tid
=
0
;
tid
<
FLAGS_num_threads
;
tid
++
)
{
threads
.
emplace_back
([
config
,
&
total_time_of_threads
,
&
predictors
,
tid
]
{
// auto predictor = base_predictor->Clone();
auto
&
predictor
=
predictors
[
tid
];
NEW_TENSOR
(
data_lod_attention
);
NEW_TENSOR
(
cell_init
);
NEW_TENSOR
(
data
);
NEW_TENSOR
(
week
);
NEW_TENSOR
(
minute
);
NEW_TENSOR
(
hidden_init
);
// Prepare data for AnalysisPredictor
DataRecord
data
(
FLAGS_infer_data
,
FLAGS_batch_size
);
Timer
timer
;
double
total_time
{
0
};
for
(
int
i
=
0
;
i
<
FLAGS_repeat
;
i
++
)
{
PrepareZeroCopyInputs
(
data_lod_attention_tensor
.
get
(),
cell_init_tensor
.
get
(),
data_tensor
.
get
(),
hidden_init_tensor
.
get
(),
week_tensor
.
get
(),
minute_tensor
.
get
(),
&
data
,
FLAGS_batch_size
);
timer
.
tic
();
predictor
->
ZeroCopyRun
();
total_time
+=
timer
.
toc
();
}
total_time_of_threads
+=
total_time
;
LOG
(
INFO
)
<<
"thread time: "
<<
total_time
/
FLAGS_repeat
;
});
}
for
(
auto
&
t
:
threads
)
{
t
.
join
();
}
LOG
(
INFO
)
<<
"average time: "
<<
total_time_of_threads
/
FLAGS_num_threads
/
FLAGS_repeat
;
}
}
}
// namespace inference
}
// namespace inference
...
...
paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
浏览文件 @
a6fbf7ec
...
@@ -182,7 +182,8 @@ TEST(Analyzer_seq_conv1, fuse_statis) {
...
@@ -182,7 +182,8 @@ TEST(Analyzer_seq_conv1, fuse_statis) {
AnalysisConfig
cfg
;
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
SetConfig
(
&
cfg
);
int
num_ops
;
int
num_ops
;
auto
fuse_statis
=
GetFuseStatis
(
cfg
,
&
num_ops
);
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
GetFuseStatis
(
predictor
.
get
(),
&
num_ops
);
}
}
// Compare result of NativeConfig and AnalysisConfig
// Compare result of NativeConfig and AnalysisConfig
...
...
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
浏览文件 @
a6fbf7ec
...
@@ -19,6 +19,7 @@ limitations under the License. */
...
@@ -19,6 +19,7 @@ limitations under the License. */
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
namespace
analysis
{
namespace
analysis
{
using
contrib
::
AnalysisConfig
;
struct
Record
{
struct
Record
{
std
::
vector
<
float
>
data
;
std
::
vector
<
float
>
data
;
...
@@ -114,7 +115,8 @@ TEST(Analyzer_vis, fuse_statis) {
...
@@ -114,7 +115,8 @@ TEST(Analyzer_vis, fuse_statis) {
AnalysisConfig
cfg
;
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
SetConfig
(
&
cfg
);
int
num_ops
;
int
num_ops
;
GetFuseStatis
(
cfg
,
&
num_ops
);
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
GetFuseStatis
(
predictor
.
get
(),
&
num_ops
);
}
}
// Compare result of NativeConfig and AnalysisConfig
// Compare result of NativeConfig and AnalysisConfig
...
...
paddle/fluid/inference/tests/api/tester_helper.h
浏览文件 @
a6fbf7ec
...
@@ -86,11 +86,9 @@ std::unique_ptr<PaddlePredictor> CreateTestPredictor(
...
@@ -86,11 +86,9 @@ std::unique_ptr<PaddlePredictor> CreateTestPredictor(
size_t
GetSize
(
const
PaddleTensor
&
out
)
{
return
VecReduceToInt
(
out
.
shape
);
}
size_t
GetSize
(
const
PaddleTensor
&
out
)
{
return
VecReduceToInt
(
out
.
shape
);
}
std
::
unordered_map
<
std
::
string
,
int
>
GetFuseStatis
(
AnalysisConfig
config
,
std
::
unordered_map
<
std
::
string
,
int
>
GetFuseStatis
(
PaddlePredictor
*
predictor
,
int
*
num_ops
)
{
int
*
num_ops
)
{
auto
predictor
=
CreateTestPredictor
(
config
);
auto
*
analysis_predictor
=
static_cast
<
AnalysisPredictor
*>
(
predictor
);
AnalysisPredictor
*
analysis_predictor
=
dynamic_cast
<
AnalysisPredictor
*>
(
predictor
.
get
());
auto
&
fuse_statis
=
analysis_predictor
->
analysis_argument
()
auto
&
fuse_statis
=
analysis_predictor
->
analysis_argument
()
.
Get
<
std
::
unordered_map
<
std
::
string
,
int
>>
(
.
Get
<
std
::
unordered_map
<
std
::
string
,
int
>>
(
framework
::
ir
::
kFuseStatisAttr
);
framework
::
ir
::
kFuseStatisAttr
);
...
...
paddle/fluid/operators/detection/generate_proposals_op.cc
浏览文件 @
a6fbf7ec
...
@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include <cmath>
#include <cstring>
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/
framework/var_type
.h"
#include "paddle/fluid/
operators/detail/safe_ref
.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function.h"
...
@@ -25,21 +27,17 @@ namespace operators {
...
@@ -25,21 +27,17 @@ namespace operators {
using
Tensor
=
framework
::
Tensor
;
using
Tensor
=
framework
::
Tensor
;
using
LoDTensor
=
framework
::
LoDTensor
;
using
LoDTensor
=
framework
::
LoDTensor
;
struct
AppendProposalsFunctor
{
static
const
double
kBBoxClipDefault
=
std
::
log
(
1000.0
/
16.0
);
LoDTensor
*
out_
;
int64_t
offset_
;
Tensor
*
to_add_
;
AppendProposalsFunctor
(
LoDTensor
*
out
,
int64_t
offset
,
Tensor
*
to_add
)
static
void
AppendProposals
(
Tensor
*
dst
,
int64_t
offset
,
const
Tensor
&
src
)
{
:
out_
(
out
),
offset_
(
offset
),
to_add_
(
to_add
)
{}
auto
*
out_data
=
dst
->
data
<
void
>
();
auto
*
to_add_data
=
src
.
data
<
void
>
();
template
<
typename
T
>
size_t
size_of_t
=
framework
::
SizeOfType
(
src
.
type
());
void
apply
()
const
{
offset
*=
size_of_t
;
auto
*
out_data
=
out_
->
data
<
T
>
();
std
::
memcpy
(
auto
*
to_add_data
=
to_add_
->
data
<
T
>
();
reinterpret_cast
<
void
*>
(
reinterpret_cast
<
uintptr_t
>
(
out_data
)
+
offset
),
memcpy
(
out_data
+
offset_
,
to_add_data
,
to_add_
->
numel
()
*
sizeof
(
T
));
to_add_data
,
src
.
numel
()
*
size_of_t
);
}
}
};
class
GenerateProposalsOp
:
public
framework
::
OperatorWithKernel
{
class
GenerateProposalsOp
:
public
framework
::
OperatorWithKernel
{
public:
public:
...
@@ -75,8 +73,9 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
...
@@ -75,8 +73,9 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
};
};
template
<
class
T
>
template
<
class
T
>
void
BoxCoder
(
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
all_anchors
,
static
inline
void
BoxCoder
(
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
bbox_deltas
,
Tensor
*
variances
,
Tensor
*
proposals
)
{
Tensor
*
all_anchors
,
Tensor
*
bbox_deltas
,
Tensor
*
variances
,
Tensor
*
proposals
)
{
T
*
proposals_data
=
proposals
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
proposals_data
=
proposals
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
int64_t
row
=
all_anchors
->
dims
()[
0
];
int64_t
row
=
all_anchors
->
dims
()[
0
];
...
@@ -108,11 +107,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
...
@@ -108,11 +107,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
anchor_center_y
;
anchor_center_y
;
bbox_width
=
std
::
exp
(
std
::
min
<
T
>
(
variances_data
[
i
*
len
+
2
]
*
bbox_width
=
std
::
exp
(
std
::
min
<
T
>
(
variances_data
[
i
*
len
+
2
]
*
bbox_deltas_data
[
i
*
len
+
2
],
bbox_deltas_data
[
i
*
len
+
2
],
std
::
log
(
1000.0
/
16.0
)
))
*
kBBoxClipDefault
))
*
anchor_width
;
anchor_width
;
bbox_height
=
std
::
exp
(
std
::
min
<
T
>
(
variances_data
[
i
*
len
+
3
]
*
bbox_height
=
std
::
exp
(
std
::
min
<
T
>
(
variances_data
[
i
*
len
+
3
]
*
bbox_deltas_data
[
i
*
len
+
3
],
bbox_deltas_data
[
i
*
len
+
3
],
std
::
log
(
1000.0
/
16.0
)
))
*
kBBoxClipDefault
))
*
anchor_height
;
anchor_height
;
}
else
{
}
else
{
bbox_center_x
=
bbox_center_x
=
...
@@ -120,10 +119,10 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
...
@@ -120,10 +119,10 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
bbox_center_y
=
bbox_center_y
=
bbox_deltas_data
[
i
*
len
+
1
]
*
anchor_height
+
anchor_center_y
;
bbox_deltas_data
[
i
*
len
+
1
]
*
anchor_height
+
anchor_center_y
;
bbox_width
=
std
::
exp
(
std
::
min
<
T
>
(
bbox_deltas_data
[
i
*
len
+
2
],
bbox_width
=
std
::
exp
(
std
::
min
<
T
>
(
bbox_deltas_data
[
i
*
len
+
2
],
std
::
log
(
1000.0
/
16.0
)
))
*
kBBoxClipDefault
))
*
anchor_width
;
anchor_width
;
bbox_height
=
std
::
exp
(
std
::
min
<
T
>
(
bbox_deltas_data
[
i
*
len
+
3
],
bbox_height
=
std
::
exp
(
std
::
min
<
T
>
(
bbox_deltas_data
[
i
*
len
+
3
],
std
::
log
(
1000.0
/
16.0
)
))
*
kBBoxClipDefault
))
*
anchor_height
;
anchor_height
;
}
}
...
@@ -136,30 +135,32 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
...
@@ -136,30 +135,32 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
}
}
template
<
class
T
>
template
<
class
T
>
void
ClipTiledBoxes
(
const
platform
::
DeviceContext
&
ctx
,
const
Tensor
&
im_info
,
static
inline
void
ClipTiledBoxes
(
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
boxes
)
{
const
Tensor
&
im_info
,
Tensor
*
boxes
)
{
T
*
boxes_data
=
boxes
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
boxes_data
=
boxes
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
const
T
*
im_info_data
=
im_info
.
data
<
T
>
();
const
T
*
im_info_data
=
im_info
.
data
<
T
>
();
T
zero
(
0
);
for
(
int64_t
i
=
0
;
i
<
boxes
->
numel
();
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
boxes
->
numel
();
++
i
)
{
if
(
i
%
4
==
0
)
{
if
(
i
%
4
==
0
)
{
boxes_data
[
i
]
=
boxes_data
[
i
]
=
std
::
max
(
std
::
min
(
boxes_data
[
i
],
im_info_data
[
1
]
-
1
),
0.0
f
);
std
::
max
(
std
::
min
(
boxes_data
[
i
],
im_info_data
[
1
]
-
1
),
zero
);
}
else
if
(
i
%
4
==
1
)
{
}
else
if
(
i
%
4
==
1
)
{
boxes_data
[
i
]
=
boxes_data
[
i
]
=
std
::
max
(
std
::
min
(
boxes_data
[
i
],
im_info_data
[
0
]
-
1
),
0.0
f
);
std
::
max
(
std
::
min
(
boxes_data
[
i
],
im_info_data
[
0
]
-
1
),
zero
);
}
else
if
(
i
%
4
==
2
)
{
}
else
if
(
i
%
4
==
2
)
{
boxes_data
[
i
]
=
boxes_data
[
i
]
=
std
::
max
(
std
::
min
(
boxes_data
[
i
],
im_info_data
[
1
]
-
1
),
0.0
f
);
std
::
max
(
std
::
min
(
boxes_data
[
i
],
im_info_data
[
1
]
-
1
),
zero
);
}
else
{
}
else
{
boxes_data
[
i
]
=
boxes_data
[
i
]
=
std
::
max
(
std
::
min
(
boxes_data
[
i
],
im_info_data
[
0
]
-
1
),
0.0
f
);
std
::
max
(
std
::
min
(
boxes_data
[
i
],
im_info_data
[
0
]
-
1
),
zero
);
}
}
}
}
}
}
template
<
class
T
>
template
<
class
T
>
void
FilterBoxes
(
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
boxes
,
static
inline
void
FilterBoxes
(
const
platform
::
DeviceContext
&
ctx
,
float
min_size
,
const
Tensor
&
im_info
,
Tensor
*
keep
)
{
Tensor
*
boxes
,
float
min_size
,
const
Tensor
&
im_info
,
Tensor
*
keep
)
{
const
T
*
im_info_data
=
im_info
.
data
<
T
>
();
const
T
*
im_info_data
=
im_info
.
data
<
T
>
();
T
*
boxes_data
=
boxes
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
boxes_data
=
boxes
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
im_scale
=
im_info_data
[
2
];
T
im_scale
=
im_info_data
[
2
];
...
@@ -185,24 +186,24 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
...
@@ -185,24 +186,24 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
keep
->
Resize
({
keep_len
});
keep
->
Resize
({
keep_len
});
}
}
bool
SortScorePairDescend
(
const
std
::
pair
<
float
,
int
>
&
pair1
,
const
std
::
pair
<
float
,
int
>
&
pair2
)
{
return
pair1
.
first
>
pair2
.
first
;
}
template
<
class
T
>
template
<
class
T
>
void
GetMaxScoreIndex
(
const
std
::
vector
<
T
>
&
scores
,
static
inline
std
::
vector
<
std
::
pair
<
T
,
int
>>
GetSortedScoreIndex
(
std
::
vector
<
std
::
pair
<
T
,
int
>>
*
sorted_indices
)
{
const
std
::
vector
<
T
>
&
scores
)
{
std
::
vector
<
std
::
pair
<
T
,
int
>>
sorted_indices
;
sorted_indices
.
reserve
(
scores
.
size
());
for
(
size_t
i
=
0
;
i
<
scores
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
scores
.
size
();
++
i
)
{
sorted_indices
->
push_back
(
std
::
make_pair
(
scores
[
i
],
i
)
);
sorted_indices
.
emplace_back
(
scores
[
i
],
i
);
}
}
// Sort the score pair according to the scores in descending order
// Sort the score pair according to the scores in descending order
std
::
stable_sort
(
sorted_indices
->
begin
(),
sorted_indices
->
end
(),
std
::
stable_sort
(
sorted_indices
.
begin
(),
sorted_indices
.
end
(),
SortScorePairDescend
);
[](
const
std
::
pair
<
T
,
int
>
&
a
,
const
std
::
pair
<
T
,
int
>
&
b
)
{
return
a
.
first
<
b
.
first
;
});
return
sorted_indices
;
}
}
template
<
class
T
>
template
<
class
T
>
T
BBoxArea
(
const
T
*
box
,
const
bool
normalized
)
{
static
inline
T
BBoxArea
(
const
T
*
box
,
bool
normalized
)
{
if
(
box
[
2
]
<
box
[
0
]
||
box
[
3
]
<
box
[
1
])
{
if
(
box
[
2
]
<
box
[
0
]
||
box
[
3
]
<
box
[
1
])
{
// If coordinate values are is invalid
// If coordinate values are is invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
// (e.g. xmax < xmin or ymax < ymin), return 0.
...
@@ -220,7 +221,7 @@ T BBoxArea(const T *box, const bool normalized) {
...
@@ -220,7 +221,7 @@ T BBoxArea(const T *box, const bool normalized) {
}
}
template
<
class
T
>
template
<
class
T
>
T
JaccardOverlap
(
const
T
*
box1
,
const
T
*
box2
,
const
bool
normalized
)
{
static
inline
T
JaccardOverlap
(
const
T
*
box1
,
const
T
*
box2
,
bool
normalized
)
{
if
(
box2
[
0
]
>
box1
[
2
]
||
box2
[
2
]
<
box1
[
0
]
||
box2
[
1
]
>
box1
[
3
]
||
if
(
box2
[
0
]
>
box1
[
2
]
||
box2
[
2
]
<
box1
[
0
]
||
box2
[
1
]
>
box1
[
3
]
||
box2
[
3
]
<
box1
[
1
])
{
box2
[
3
]
<
box1
[
1
])
{
return
static_cast
<
T
>
(
0.
);
return
static_cast
<
T
>
(
0.
);
...
@@ -229,8 +230,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
...
@@ -229,8 +230,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
const
T
inter_ymin
=
std
::
max
(
box1
[
1
],
box2
[
1
]);
const
T
inter_ymin
=
std
::
max
(
box1
[
1
],
box2
[
1
]);
const
T
inter_xmax
=
std
::
min
(
box1
[
2
],
box2
[
2
]);
const
T
inter_xmax
=
std
::
min
(
box1
[
2
],
box2
[
2
]);
const
T
inter_ymax
=
std
::
min
(
box1
[
3
],
box2
[
3
]);
const
T
inter_ymax
=
std
::
min
(
box1
[
3
],
box2
[
3
]);
const
T
inter_w
=
std
::
max
(
0.0
f
,
inter_xmax
-
inter_xmin
+
1
);
const
T
inter_w
=
std
::
max
(
T
(
0
)
,
inter_xmax
-
inter_xmin
+
1
);
const
T
inter_h
=
std
::
max
(
0.0
f
,
inter_ymax
-
inter_ymin
+
1
);
const
T
inter_h
=
std
::
max
(
T
(
0
)
,
inter_ymax
-
inter_ymin
+
1
);
const
T
inter_area
=
inter_w
*
inter_h
;
const
T
inter_area
=
inter_w
*
inter_h
;
const
T
bbox1_area
=
BBoxArea
<
T
>
(
box1
,
normalized
);
const
T
bbox1_area
=
BBoxArea
<
T
>
(
box1
,
normalized
);
const
T
bbox2_area
=
BBoxArea
<
T
>
(
box2
,
normalized
);
const
T
bbox2_area
=
BBoxArea
<
T
>
(
box2
,
normalized
);
...
@@ -238,9 +239,21 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
...
@@ -238,9 +239,21 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
}
}
}
}
template
<
typename
T
>
static
inline
Tensor
VectorToTensor
(
const
std
::
vector
<
T
>
&
selected_indices
,
int
selected_num
)
{
Tensor
keep_nms
;
keep_nms
.
Resize
({
selected_num
});
auto
*
keep_data
=
keep_nms
.
mutable_data
<
T
>
(
platform
::
CPUPlace
());
for
(
int
i
=
0
;
i
<
selected_num
;
++
i
)
{
keep_data
[
i
]
=
selected_indices
[
i
];
}
return
keep_nms
;
}
template
<
class
T
>
template
<
class
T
>
Tensor
NMS
(
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
bbox
,
Tensor
*
scores
,
static
inline
Tensor
NMS
(
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
bbox
,
const
T
nms_threshold
,
const
float
eta
)
{
Tensor
*
scores
,
T
nms_threshold
,
float
eta
)
{
PADDLE_ENFORCE_NOT_NULL
(
bbox
);
PADDLE_ENFORCE_NOT_NULL
(
bbox
);
int64_t
num_boxes
=
bbox
->
dims
()[
0
];
int64_t
num_boxes
=
bbox
->
dims
()[
0
];
// 4: [xmin ymin xmax ymax]
// 4: [xmin ymin xmax ymax]
...
@@ -248,20 +261,18 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
...
@@ -248,20 +261,18 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
std
::
vector
<
T
>
scores_data
(
num_boxes
);
std
::
vector
<
T
>
scores_data
(
num_boxes
);
std
::
copy_n
(
scores
->
data
<
T
>
(),
num_boxes
,
scores_data
.
begin
());
std
::
copy_n
(
scores
->
data
<
T
>
(),
num_boxes
,
scores_data
.
begin
());
std
::
vector
<
std
::
pair
<
T
,
int
>>
sorted_indices
;
std
::
vector
<
std
::
pair
<
T
,
int
>>
sorted_indices
=
GetMaxScoreIndex
<
T
>
(
scores_data
,
&
sorted_indices
);
GetSortedScoreIndex
<
T
>
(
scores_data
);
std
::
vector
<
int
>
selected_indices
;
std
::
vector
<
int
>
selected_indices
;
int
selected_num
=
0
;
int
selected_num
=
0
;
T
adaptive_threshold
=
nms_threshold
;
T
adaptive_threshold
=
nms_threshold
;
const
T
*
bbox_data
=
bbox
->
data
<
T
>
();
const
T
*
bbox_data
=
bbox
->
data
<
T
>
();
bool
flag
;
while
(
sorted_indices
.
size
()
!=
0
)
{
while
(
sorted_indices
.
size
()
!=
0
)
{
int
idx
=
sorted_indices
.
front
().
second
;
int
idx
=
sorted_indices
.
back
().
second
;
flag
=
true
;
bool
flag
=
true
;
for
(
size_t
k
=
0
;
k
<
selected_indices
.
size
();
++
k
)
{
for
(
int
kept_idx
:
selected_indices
)
{
if
(
flag
)
{
if
(
flag
)
{
const
int
kept_idx
=
selected_indices
[
k
];
T
overlap
=
JaccardOverlap
<
T
>
(
bbox_data
+
idx
*
box_size
,
T
overlap
=
JaccardOverlap
<
T
>
(
bbox_data
+
idx
*
box_size
,
bbox_data
+
kept_idx
*
box_size
,
false
);
bbox_data
+
kept_idx
*
box_size
,
false
);
flag
=
(
overlap
<=
adaptive_threshold
);
flag
=
(
overlap
<=
adaptive_threshold
);
...
@@ -271,32 +282,29 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
...
@@ -271,32 +282,29 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
}
}
if
(
flag
)
{
if
(
flag
)
{
selected_indices
.
push_back
(
idx
);
selected_indices
.
push_back
(
idx
);
selected_num
++
;
++
selected_num
;
}
}
sorted_indices
.
erase
(
sorted_indices
.
begin
());
sorted_indices
.
erase
(
sorted_indices
.
end
());
if
(
flag
&&
eta
<
1
&&
adaptive_threshold
>
0.5
)
{
if
(
flag
&&
eta
<
1
&&
adaptive_threshold
>
0.5
)
{
adaptive_threshold
*=
eta
;
adaptive_threshold
*=
eta
;
}
}
}
}
Tensor
keep_nms
;
return
VectorToTensor
(
selected_indices
,
selected_num
);
keep_nms
.
Resize
({
selected_num
});
int
*
keep_data
=
keep_nms
.
mutable_data
<
int
>
(
ctx
.
GetPlace
());
for
(
int
i
=
0
;
i
<
selected_num
;
++
i
)
{
keep_data
[
i
]
=
selected_indices
[
i
];
}
return
keep_nms
;
}
}
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
T
>
class
GenerateProposalsKernel
:
public
framework
::
OpKernel
<
T
>
{
class
GenerateProposalsKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
scores
=
context
.
Input
<
Tensor
>
(
"Scores"
);
auto
*
scores
=
context
.
Input
<
Tensor
>
(
"Scores"
);
auto
*
bbox_deltas
=
context
.
Input
<
Tensor
>
(
"BboxDeltas"
);
auto
*
bbox_deltas
=
context
.
Input
<
Tensor
>
(
"BboxDeltas"
);
auto
*
im_info
=
context
.
Input
<
Tensor
>
(
"ImInfo"
);
auto
*
im_info
=
context
.
Input
<
Tensor
>
(
"ImInfo"
);
auto
*
anchors
=
context
.
Input
<
Tensor
>
(
"Anchors"
);
auto
anchors
=
detail
::
Ref
(
context
.
Input
<
Tensor
>
(
"Anchors"
),
auto
*
variances
=
context
.
Input
<
Tensor
>
(
"Variances"
);
"Cannot find input Anchors(%s) in scope"
,
context
.
Inputs
(
"Anchors"
)[
0
]);
auto
variances
=
detail
::
Ref
(
context
.
Input
<
Tensor
>
(
"Variances"
),
"Cannot find input Variances(%s) in scope"
,
context
.
Inputs
(
"Variances"
)[
0
]);
auto
*
rpn_rois
=
context
.
Output
<
LoDTensor
>
(
"RpnRois"
);
auto
*
rpn_rois
=
context
.
Output
<
LoDTensor
>
(
"RpnRois"
);
auto
*
rpn_roi_probs
=
context
.
Output
<
LoDTensor
>
(
"RpnRoiProbs"
);
auto
*
rpn_roi_probs
=
context
.
Output
<
LoDTensor
>
(
"RpnRoiProbs"
);
...
@@ -307,15 +315,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
...
@@ -307,15 +315,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
float
min_size
=
context
.
Attr
<
float
>
(
"min_size"
);
float
min_size
=
context
.
Attr
<
float
>
(
"min_size"
);
float
eta
=
context
.
Attr
<
float
>
(
"eta"
);
float
eta
=
context
.
Attr
<
float
>
(
"eta"
);
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
auto
&
dev_ctx
=
context
.
template
device_context
<
platform
::
CPUDeviceContext
>();
auto
scores_dim
=
scores
->
dims
();
auto
&
scores_dim
=
scores
->
dims
();
int64_t
num
=
scores_dim
[
0
];
int64_t
num
=
scores_dim
[
0
];
int64_t
c_score
=
scores_dim
[
1
];
int64_t
c_score
=
scores_dim
[
1
];
int64_t
h_score
=
scores_dim
[
2
];
int64_t
h_score
=
scores_dim
[
2
];
int64_t
w_score
=
scores_dim
[
3
];
int64_t
w_score
=
scores_dim
[
3
];
auto
bbox_dim
=
bbox_deltas
->
dims
();
auto
&
bbox_dim
=
bbox_deltas
->
dims
();
int64_t
c_bbox
=
bbox_dim
[
1
];
int64_t
c_bbox
=
bbox_dim
[
1
];
int64_t
h_bbox
=
bbox_dim
[
2
];
int64_t
h_bbox
=
bbox_dim
[
2
];
int64_t
w_bbox
=
bbox_dim
[
3
];
int64_t
w_bbox
=
bbox_dim
[
3
];
...
@@ -330,17 +339,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
...
@@ -330,17 +339,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
scores_swap
.
mutable_data
<
T
>
({
num
,
h_score
,
w_score
,
c_score
},
scores_swap
.
mutable_data
<
T
>
({
num
,
h_score
,
w_score
,
c_score
},
dev_ctx
.
GetPlace
());
dev_ctx
.
GetPlace
());
math
::
Transpose
<
DeviceContext
,
T
,
4
>
trans
;
math
::
Transpose
<
platform
::
CPU
DeviceContext
,
T
,
4
>
trans
;
std
::
vector
<
int
>
axis
=
{
0
,
2
,
3
,
1
};
std
::
vector
<
int
>
axis
=
{
0
,
2
,
3
,
1
};
trans
(
dev_ctx
,
*
bbox_deltas
,
&
bbox_deltas_swap
,
axis
);
trans
(
dev_ctx
,
*
bbox_deltas
,
&
bbox_deltas_swap
,
axis
);
trans
(
dev_ctx
,
*
scores
,
&
scores_swap
,
axis
);
trans
(
dev_ctx
,
*
scores
,
&
scores_swap
,
axis
);
framework
::
LoD
lod
;
framework
::
LoD
lod
;
std
::
vector
<
size_t
>
lod0
(
1
,
0
);
lod
.
resize
(
1
);
Tensor
*
anchor
=
const_cast
<
framework
::
Tensor
*>
(
anchors
)
;
auto
&
lod0
=
lod
[
0
]
;
anchor
->
Resize
({
anchors
->
numel
()
/
4
,
4
}
);
lod0
.
push_back
(
0
);
Tensor
*
var
=
const_cast
<
framework
::
Tensor
*>
(
variances
);
anchors
.
Resize
({
anchors
.
numel
()
/
4
,
4
}
);
var
->
Resize
({
var
->
numel
()
/
4
,
4
});
var
iances
.
Resize
({
variances
.
numel
()
/
4
,
4
});
int64_t
num_proposals
=
0
;
int64_t
num_proposals
=
0
;
for
(
int64_t
i
=
0
;
i
<
num
;
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
num
;
++
i
)
{
...
@@ -352,24 +361,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
...
@@ -352,24 +361,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
scores_slice
.
Resize
({
h_score
*
w_score
*
c_score
,
1
});
scores_slice
.
Resize
({
h_score
*
w_score
*
c_score
,
1
});
std
::
pair
<
Tensor
,
Tensor
>
tensor_pair
=
std
::
pair
<
Tensor
,
Tensor
>
tensor_pair
=
ProposalForOneImage
(
dev_ctx
,
im_info_slice
,
*
anchor
,
*
var
,
ProposalForOneImage
(
dev_ctx
,
im_info_slice
,
anchors
,
variances
,
bbox_deltas_slice
,
scores_slice
,
pre_nms_top_n
,
bbox_deltas_slice
,
scores_slice
,
pre_nms_top_n
,
post_nms_top_n
,
nms_thresh
,
min_size
,
eta
);
post_nms_top_n
,
nms_thresh
,
min_size
,
eta
);
Tensor
proposals
=
tensor_pair
.
first
;
Tensor
&
proposals
=
tensor_pair
.
first
;
Tensor
scores
=
tensor_pair
.
second
;
Tensor
&
scores
=
tensor_pair
.
second
;
framework
::
VisitDataType
(
framework
::
ToDataType
(
rpn_rois
->
type
()),
AppendProposalsFunctor
(
rpn_rois
,
4
*
num_proposals
,
&
proposals
));
framework
::
VisitDataType
(
framework
::
ToDataType
(
rpn_roi_probs
->
type
()),
AppendProposalsFunctor
(
rpn_roi_probs
,
num_proposals
,
&
scores
));
AppendProposals
(
rpn_rois
,
4
*
num_proposals
,
proposals
);
AppendProposals
(
rpn_roi_probs
,
num_proposals
,
scores
);
num_proposals
+=
proposals
.
dims
()[
0
];
num_proposals
+=
proposals
.
dims
()[
0
];
lod0
.
emplace
_back
(
num_proposals
);
lod0
.
push
_back
(
num_proposals
);
}
}
lod
.
emplace_back
(
lod0
);
rpn_rois
->
set_lod
(
lod
);
rpn_rois
->
set_lod
(
lod
);
rpn_roi_probs
->
set_lod
(
lod
);
rpn_roi_probs
->
set_lod
(
lod
);
rpn_rois
->
Resize
({
num_proposals
,
4
});
rpn_rois
->
Resize
({
num_proposals
,
4
});
...
@@ -377,7 +379,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
...
@@ -377,7 +379,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
}
}
std
::
pair
<
Tensor
,
Tensor
>
ProposalForOneImage
(
std
::
pair
<
Tensor
,
Tensor
>
ProposalForOneImage
(
const
DeviceContext
&
ctx
,
const
Tensor
&
im_info_slice
,
const
platform
::
CPU
DeviceContext
&
ctx
,
const
Tensor
&
im_info_slice
,
const
Tensor
&
anchors
,
const
Tensor
&
variances
,
const
Tensor
&
anchors
,
const
Tensor
&
variances
,
const
Tensor
&
bbox_deltas_slice
,
// [M, 4]
const
Tensor
&
bbox_deltas_slice
,
// [M, 4]
const
Tensor
&
scores_slice
,
// [N, 1]
const
Tensor
&
scores_slice
,
// [N, 1]
...
@@ -392,8 +394,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
...
@@ -392,8 +394,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
for
(
int
i
=
0
;
i
<
scores_slice
.
numel
();
++
i
)
{
for
(
int
i
=
0
;
i
<
scores_slice
.
numel
();
++
i
)
{
index
[
i
]
=
i
;
index
[
i
]
=
i
;
}
}
std
::
function
<
bool
(
const
int64_t
&
,
const
int64_t
&
)
>
compare
=
auto
compare
=
[
scores_data
](
const
int64_t
&
i
,
const
int64_t
&
j
)
{
[
scores_data
](
const
int64_t
&
i
,
const
int64_t
&
j
)
{
return
scores_data
[
i
]
>
scores_data
[
j
];
return
scores_data
[
i
]
>
scores_data
[
j
];
};
};
...
@@ -469,12 +470,12 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -469,12 +470,12 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
Generate Proposals OP
Generate Proposals OP
This operator proposes rois according to each box with their probability to be a foreground object and
This operator proposes rois according to each box with their probability to be a foreground object and
the box can be calculated by anchors. Bbox_de
ltai
s and scores are the output of RPN. Final proposals
the box can be calculated by anchors. Bbox_de
tail
s and scores are the output of RPN. Final proposals
could be used to train detection net.
could be used to train detection net.
Scores is the probability for each box to be an object. In format of (N, A, H, W) where N is batch size, A is number
Scores is the probability for each box to be an object. In format of (N, A, H, W) where N is batch size, A is number
of anchors, H and W are height and width of the feature map.
of anchors, H and W are height and width of the feature map.
BboxDeltas is the differece between predicted box locat
oi
n and anchor location. In format of (N, 4*A, H, W)
BboxDeltas is the differece between predicted box locat
io
n and anchor location. In format of (N, 4*A, H, W)
For generating proposals, this operator transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) and
For generating proposals, this operator transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) and
calculate box locations as proposals candidates. Then clip boxes to image and remove predicted boxes with small area.
calculate box locations as proposals candidates. Then clip boxes to image and remove predicted boxes with small area.
...
@@ -490,6 +491,5 @@ namespace ops = paddle::operators;
...
@@ -490,6 +491,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR
(
generate_proposals
,
ops
::
GenerateProposalsOp
,
REGISTER_OPERATOR
(
generate_proposals
,
ops
::
GenerateProposalsOp
,
ops
::
GenerateProposalsOpMaker
,
ops
::
GenerateProposalsOpMaker
,
paddle
::
framework
::
EmptyGradOpMaker
);
paddle
::
framework
::
EmptyGradOpMaker
);
REGISTER_OP_CPU_KERNEL
(
REGISTER_OP_CPU_KERNEL
(
generate_proposals
,
ops
::
GenerateProposalsKernel
<
float
>
,
generate_proposals
,
ops
::
GenerateProposalsKernel
<
double
>
);
ops
::
GenerateProposalsKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
);
paddle/fluid/operators/detection/generate_proposals_op.cu
浏览文件 @
a6fbf7ec
...
@@ -19,8 +19,10 @@ limitations under the License. */
...
@@ -19,8 +19,10 @@ limitations under the License. */
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/for_range.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -37,21 +39,25 @@ namespace {
...
@@ -37,21 +39,25 @@ namespace {
int
const
kThreadsPerBlock
=
sizeof
(
uint64_t
)
*
8
;
int
const
kThreadsPerBlock
=
sizeof
(
uint64_t
)
*
8
;
template
<
typename
T
>
static
const
double
kBBoxClipDefault
=
std
::
log
(
1000.0
/
16.0
);
__global__
void
RangeInitKernel
(
const
T
start
,
const
T
delta
,
const
int
size
,
T
*
out
)
{
struct
RangeInitFunctor
{
CUDA_1D_KERNEL_LOOP
(
i
,
size
)
{
out
[
i
]
=
start
+
i
*
delta
;
}
int
start_
;
}
int
delta_
;
int
*
out_
;
__device__
void
operator
()(
size_t
i
)
{
out_
[
i
]
=
start_
+
i
*
delta_
;
}
};
template
<
typename
T
>
template
<
typename
T
>
void
SortDescending
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
value
,
static
void
SortDescending
(
const
platform
::
CUDADeviceContext
&
ctx
,
Tensor
*
value_out
,
Tensor
*
index_out
)
{
const
Tensor
&
value
,
Tensor
*
value_out
,
int
num
=
value
.
numel
();
Tensor
*
index_out
)
{
int
num
=
static_cast
<
int
>
(
value
.
numel
());
Tensor
index_in_t
;
Tensor
index_in_t
;
int
*
idx_in
=
index_in_t
.
mutable_data
<
int
>
({
num
},
ctx
.
GetPlace
());
int
*
idx_in
=
index_in_t
.
mutable_data
<
int
>
({
num
},
ctx
.
GetPlace
());
int
block
=
512
;
platform
::
ForRange
<
platform
::
CUDADeviceContext
>
for_range
(
ctx
,
num
)
;
auto
stream
=
ctx
.
stream
(
);
for_range
(
RangeInitFunctor
{
0
,
1
,
idx_in
}
);
RangeInitKernel
<<<
DIVUP
(
num
,
block
),
block
,
0
,
stream
>>>
(
0
,
1
,
num
,
idx_in
);
int
*
idx_out
=
index_out
->
mutable_data
<
int
>
({
num
},
ctx
.
GetPlace
());
int
*
idx_out
=
index_out
->
mutable_data
<
int
>
({
num
},
ctx
.
GetPlace
());
const
T
*
keys_in
=
value
.
data
<
T
>
();
const
T
*
keys_in
=
value
.
data
<
T
>
();
...
@@ -73,22 +79,27 @@ void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value,
...
@@ -73,22 +79,27 @@ void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value,
}
}
template
<
typename
T
>
template
<
typename
T
>
__device__
__forceinline__
T
Min
(
T
x
,
T
y
)
{
struct
BoxDecodeAndClipFunctor
{
return
x
<
y
?
x
:
y
;
const
T
*
anchor
;
}
const
T
*
deltas
;
const
T
*
var
;
template
<
typename
T
>
const
int
*
index
;
__device__
__forceinline__
T
Max
(
T
x
,
T
y
)
{
const
T
*
im_info
;
return
x
>
y
?
x
:
y
;
}
T
*
proposals
;
template
<
typename
T
>
BoxDecodeAndClipFunctor
(
const
T
*
anchor
,
const
T
*
deltas
,
const
T
*
var
,
__global__
void
BoxDecodeAndClipKernel
(
const
T
*
anchor
,
const
T
*
deltas
,
const
int
*
index
,
const
T
*
im_info
,
T
*
proposals
)
const
T
*
var
,
const
int
*
index
,
:
anchor
(
anchor
),
const
T
*
im_info
,
const
int
num
,
deltas
(
deltas
),
T
*
proposals
)
{
var
(
var
),
T
kBBoxClipDefault
=
log
(
1000.0
/
16.0
);
index
(
index
),
CUDA_1D_KERNEL_LOOP
(
i
,
num
)
{
im_info
(
im_info
),
proposals
(
proposals
)
{}
T
bbox_clip_default
{
static_cast
<
T
>
(
kBBoxClipDefault
)};
__device__
void
operator
()(
size_t
i
)
{
int
k
=
index
[
i
]
*
4
;
int
k
=
index
[
i
]
*
4
;
T
axmin
=
anchor
[
k
];
T
axmin
=
anchor
[
k
];
T
aymin
=
anchor
[
k
+
1
];
T
aymin
=
anchor
[
k
+
1
];
...
@@ -105,17 +116,17 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
...
@@ -105,17 +116,17 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
T
dxmax
=
deltas
[
k
+
2
];
T
dxmax
=
deltas
[
k
+
2
];
T
dymax
=
deltas
[
k
+
3
];
T
dymax
=
deltas
[
k
+
3
];
T
d_cx
=
0.
,
d_cy
=
0.
,
d_w
=
0.
,
d_h
=
0.
;
T
d_cx
,
d_cy
,
d_w
,
d_h
;
if
(
var
)
{
if
(
var
)
{
d_cx
=
cx
+
dxmin
*
w
*
var
[
k
];
d_cx
=
cx
+
dxmin
*
w
*
var
[
k
];
d_cy
=
cy
+
dymin
*
h
*
var
[
k
+
1
];
d_cy
=
cy
+
dymin
*
h
*
var
[
k
+
1
];
d_w
=
exp
(
Min
<
T
>
(
dxmax
*
var
[
k
+
2
],
kBBoxClipD
efault
))
*
w
;
d_w
=
exp
(
Min
(
dxmax
*
var
[
k
+
2
],
bbox_clip_d
efault
))
*
w
;
d_h
=
exp
(
Min
<
T
>
(
dymax
*
var
[
k
+
3
],
kBBoxClipD
efault
))
*
h
;
d_h
=
exp
(
Min
(
dymax
*
var
[
k
+
3
],
bbox_clip_d
efault
))
*
h
;
}
else
{
}
else
{
d_cx
=
cx
+
dxmin
*
w
;
d_cx
=
cx
+
dxmin
*
w
;
d_cy
=
cy
+
dymin
*
h
;
d_cy
=
cy
+
dymin
*
h
;
d_w
=
exp
(
Min
<
T
>
(
dxmax
,
kBBoxClipD
efault
))
*
w
;
d_w
=
exp
(
Min
(
dxmax
,
bbox_clip_d
efault
))
*
w
;
d_h
=
exp
(
Min
<
T
>
(
dymax
,
kBBoxClipD
efault
))
*
h
;
d_h
=
exp
(
Min
(
dymax
,
bbox_clip_d
efault
))
*
h
;
}
}
T
oxmin
=
d_cx
-
d_w
*
0.5
;
T
oxmin
=
d_cx
-
d_w
*
0.5
;
...
@@ -123,17 +134,21 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
...
@@ -123,17 +134,21 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
T
oxmax
=
d_cx
+
d_w
*
0.5
-
1.
;
T
oxmax
=
d_cx
+
d_w
*
0.5
-
1.
;
T
oymax
=
d_cy
+
d_h
*
0.5
-
1.
;
T
oymax
=
d_cy
+
d_h
*
0.5
-
1.
;
proposals
[
i
*
4
]
=
Max
<
T
>
(
Min
<
T
>
(
oxmin
,
im_info
[
1
]
-
1.
),
0.
);
proposals
[
i
*
4
]
=
Max
(
Min
(
oxmin
,
im_info
[
1
]
-
1.
),
0.
);
proposals
[
i
*
4
+
1
]
=
Max
<
T
>
(
Min
<
T
>
(
oymin
,
im_info
[
0
]
-
1.
),
0.
);
proposals
[
i
*
4
+
1
]
=
Max
(
Min
(
oymin
,
im_info
[
0
]
-
1.
),
0.
);
proposals
[
i
*
4
+
2
]
=
Max
<
T
>
(
Min
<
T
>
(
oxmax
,
im_info
[
1
]
-
1.
),
0.
);
proposals
[
i
*
4
+
2
]
=
Max
(
Min
(
oxmax
,
im_info
[
1
]
-
1.
),
0.
);
proposals
[
i
*
4
+
3
]
=
Max
<
T
>
(
Min
<
T
>
(
oymax
,
im_info
[
0
]
-
1.
),
0.
);
proposals
[
i
*
4
+
3
]
=
Max
(
Min
(
oymax
,
im_info
[
0
]
-
1.
),
0.
);
}
}
}
__device__
__forceinline__
T
Min
(
T
a
,
T
b
)
const
{
return
a
>
b
?
b
:
a
;
}
__device__
__forceinline__
T
Max
(
T
a
,
T
b
)
const
{
return
a
>
b
?
a
:
b
;
}
};
template
<
typename
T
,
int
BlockSize
>
template
<
typename
T
,
int
BlockSize
>
__global__
void
FilterBBoxes
(
const
T
*
bboxes
,
const
T
*
im_info
,
static
__global__
void
FilterBBoxes
(
const
T
*
bboxes
,
const
T
*
im_info
,
const
T
min_size
,
const
int
num
,
int
*
keep_
num
,
const
T
min_size
,
const
int
num
,
int
*
keep
)
{
int
*
keep_num
,
int
*
keep
)
{
T
im_h
=
im_info
[
0
];
T
im_h
=
im_info
[
0
];
T
im_w
=
im_info
[
1
];
T
im_w
=
im_info
[
1
];
T
im_scale
=
im_info
[
2
];
T
im_scale
=
im_info
[
2
];
...
@@ -178,7 +193,7 @@ __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
...
@@ -178,7 +193,7 @@ __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
}
}
}
}
__device__
inline
float
IoU
(
const
float
*
a
,
const
float
*
b
)
{
static
__device__
inline
float
IoU
(
const
float
*
a
,
const
float
*
b
)
{
float
left
=
max
(
a
[
0
],
b
[
0
]),
right
=
min
(
a
[
2
],
b
[
2
]);
float
left
=
max
(
a
[
0
],
b
[
0
]),
right
=
min
(
a
[
2
],
b
[
2
]);
float
top
=
max
(
a
[
1
],
b
[
1
]),
bottom
=
min
(
a
[
3
],
b
[
3
]);
float
top
=
max
(
a
[
1
],
b
[
1
]),
bottom
=
min
(
a
[
3
],
b
[
3
]);
float
width
=
max
(
right
-
left
+
1
,
0.
f
),
height
=
max
(
bottom
-
top
+
1
,
0.
f
);
float
width
=
max
(
right
-
left
+
1
,
0.
f
),
height
=
max
(
bottom
-
top
+
1
,
0.
f
);
...
@@ -188,7 +203,8 @@ __device__ inline float IoU(const float *a, const float *b) {
...
@@ -188,7 +203,8 @@ __device__ inline float IoU(const float *a, const float *b) {
return
inter_s
/
(
s_a
+
s_b
-
inter_s
);
return
inter_s
/
(
s_a
+
s_b
-
inter_s
);
}
}
__global__
void
NMSKernel
(
const
int
n_boxes
,
const
float
nms_overlap_thresh
,
static
__global__
void
NMSKernel
(
const
int
n_boxes
,
const
float
nms_overlap_thresh
,
const
float
*
dev_boxes
,
uint64_t
*
dev_mask
)
{
const
float
*
dev_boxes
,
uint64_t
*
dev_mask
)
{
const
int
row_start
=
blockIdx
.
y
;
const
int
row_start
=
blockIdx
.
y
;
const
int
col_start
=
blockIdx
.
x
;
const
int
col_start
=
blockIdx
.
x
;
...
@@ -231,7 +247,7 @@ __global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
...
@@ -231,7 +247,7 @@ __global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
}
}
template
<
typename
T
>
template
<
typename
T
>
void
NMS
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
proposals
,
static
void
NMS
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
proposals
,
const
Tensor
&
sorted_indices
,
const
T
nms_threshold
,
const
Tensor
&
sorted_indices
,
const
T
nms_threshold
,
Tensor
*
keep_out
)
{
Tensor
*
keep_out
)
{
int
boxes_num
=
proposals
.
dims
()[
0
];
int
boxes_num
=
proposals
.
dims
()[
0
];
...
@@ -244,14 +260,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
...
@@ -244,14 +260,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
const
T
*
boxes
=
proposals
.
data
<
T
>
();
const
T
*
boxes
=
proposals
.
data
<
T
>
();
auto
place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx
.
GetPlace
());
auto
place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx
.
GetPlace
());
int
size_bytes
=
boxes_num
*
col_blocks
*
sizeof
(
uint64_t
);
framework
::
Vector
<
uint64_t
>
mask
(
boxes_num
*
col_blocks
);
auto
d_mask_allocation
=
memory
::
Alloc
(
place
,
size_bytes
);
NMSKernel
<<<
blocks
,
threads
>>>
(
uint64_t
*
d_mask
=
reinterpret_cast
<
uint64_t
*>
(
d_mask_allocation
->
ptr
());
boxes_num
,
nms_threshold
,
boxes
,
NMSKernel
<<<
blocks
,
threads
>>>
(
boxes_num
,
nms_threshold
,
boxes
,
d_mask
);
mask
.
CUDAMutableData
(
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx
.
GetPlace
())));
auto
h_mask_allocation
=
memory
::
Alloc
(
platform
::
CPUPlace
(),
size_bytes
);
uint64_t
*
h_mask
=
reinterpret_cast
<
uint64_t
*>
(
h_mask_allocation
->
ptr
());
memory
::
Copy
(
platform
::
CPUPlace
(),
h_mask
,
place
,
d_mask
,
size_bytes
,
0
);
std
::
vector
<
uint64_t
>
remv
(
col_blocks
);
std
::
vector
<
uint64_t
>
remv
(
col_blocks
);
memset
(
&
remv
[
0
],
0
,
sizeof
(
uint64_t
)
*
col_blocks
);
memset
(
&
remv
[
0
],
0
,
sizeof
(
uint64_t
)
*
col_blocks
);
...
@@ -265,7 +277,7 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
...
@@ -265,7 +277,7 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
if
(
!
(
remv
[
nblock
]
&
(
1ULL
<<
inblock
)))
{
if
(
!
(
remv
[
nblock
]
&
(
1ULL
<<
inblock
)))
{
++
num_to_keep
;
++
num_to_keep
;
keep_vec
.
push_back
(
i
);
keep_vec
.
push_back
(
i
);
uint64_t
*
p
=
&
h_
mask
[
0
]
+
i
*
col_blocks
;
uint64_t
*
p
=
&
mask
[
0
]
+
i
*
col_blocks
;
for
(
int
j
=
nblock
;
j
<
col_blocks
;
j
++
)
{
for
(
int
j
=
nblock
;
j
<
col_blocks
;
j
++
)
{
remv
[
j
]
|=
p
[
j
];
remv
[
j
]
|=
p
[
j
];
}
}
...
@@ -274,12 +286,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
...
@@ -274,12 +286,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
int
*
keep
=
keep_out
->
mutable_data
<
int
>
({
num_to_keep
},
ctx
.
GetPlace
());
int
*
keep
=
keep_out
->
mutable_data
<
int
>
({
num_to_keep
},
ctx
.
GetPlace
());
memory
::
Copy
(
place
,
keep
,
platform
::
CPUPlace
(),
keep_vec
.
data
(),
memory
::
Copy
(
place
,
keep
,
platform
::
CPUPlace
(),
keep_vec
.
data
(),
sizeof
(
int
)
*
num_to_keep
,
0
);
sizeof
(
int
)
*
num_to_keep
,
0
);
memory
::
Free
(
place
,
d_mask
);
memory
::
Free
(
platform
::
CPUPlace
(),
h_mask
);
}
}
template
<
typename
T
>
template
<
typename
T
>
std
::
pair
<
Tensor
,
Tensor
>
ProposalForOneImage
(
st
atic
st
d
::
pair
<
Tensor
,
Tensor
>
ProposalForOneImage
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
im_info
,
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
im_info
,
const
Tensor
&
anchors
,
const
Tensor
&
variances
,
const
Tensor
&
anchors
,
const
Tensor
&
variances
,
const
Tensor
&
bbox_deltas
,
// [M, 4]
const
Tensor
&
bbox_deltas
,
// [M, 4]
...
@@ -298,18 +308,20 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
...
@@ -298,18 +308,20 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
// 2. box decode and clipping
// 2. box decode and clipping
Tensor
proposals
;
Tensor
proposals
;
proposals
.
mutable_data
<
T
>
({
pre_nms_num
,
4
},
ctx
.
GetPlace
());
proposals
.
mutable_data
<
T
>
({
pre_nms_num
,
4
},
ctx
.
GetPlace
());
int
block
=
512
;
auto
stream
=
ctx
.
stream
();
{
BoxDecodeAndClipKernel
<
T
><<<
DIVUP
(
pre_nms_num
,
block
),
block
,
0
,
stream
>>>
(
platform
::
ForRange
<
platform
::
CUDADeviceContext
>
for_range
(
ctx
,
pre_nms_num
);
for_range
(
BoxDecodeAndClipFunctor
<
T
>
{
anchors
.
data
<
T
>
(),
bbox_deltas
.
data
<
T
>
(),
variances
.
data
<
T
>
(),
anchors
.
data
<
T
>
(),
bbox_deltas
.
data
<
T
>
(),
variances
.
data
<
T
>
(),
index_sort
.
data
<
int
>
(),
im_info
.
data
<
T
>
(),
pre_nms_num
,
index_sort
.
data
<
int
>
(),
im_info
.
data
<
T
>
(),
proposals
.
data
<
T
>
()});
proposals
.
data
<
T
>
());
}
// 3. filter
// 3. filter
Tensor
keep_index
,
keep_num_t
;
Tensor
keep_index
,
keep_num_t
;
keep_index
.
mutable_data
<
int
>
({
pre_nms_num
},
ctx
.
GetPlace
());
keep_index
.
mutable_data
<
int
>
({
pre_nms_num
},
ctx
.
GetPlace
());
keep_num_t
.
mutable_data
<
int
>
({
1
},
ctx
.
GetPlace
());
keep_num_t
.
mutable_data
<
int
>
({
1
},
ctx
.
GetPlace
());
min_size
=
std
::
max
(
min_size
,
1.0
f
);
min_size
=
std
::
max
(
min_size
,
1.0
f
);
auto
stream
=
ctx
.
stream
();
FilterBBoxes
<
T
,
512
><<<
1
,
512
,
0
,
stream
>>>
(
FilterBBoxes
<
T
,
512
><<<
1
,
512
,
0
,
stream
>>>
(
proposals
.
data
<
T
>
(),
im_info
.
data
<
T
>
(),
min_size
,
pre_nms_num
,
proposals
.
data
<
T
>
(),
im_info
.
data
<
T
>
(),
min_size
,
pre_nms_num
,
keep_num_t
.
data
<
int
>
(),
keep_index
.
data
<
int
>
());
keep_num_t
.
data
<
int
>
(),
keep_index
.
data
<
int
>
());
...
@@ -353,8 +365,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
...
@@ -353,8 +365,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
auto
*
scores
=
context
.
Input
<
Tensor
>
(
"Scores"
);
auto
*
scores
=
context
.
Input
<
Tensor
>
(
"Scores"
);
auto
*
bbox_deltas
=
context
.
Input
<
Tensor
>
(
"BboxDeltas"
);
auto
*
bbox_deltas
=
context
.
Input
<
Tensor
>
(
"BboxDeltas"
);
auto
*
im_info
=
context
.
Input
<
Tensor
>
(
"ImInfo"
);
auto
*
im_info
=
context
.
Input
<
Tensor
>
(
"ImInfo"
);
auto
*
anchors
=
context
.
Input
<
Tensor
>
(
"Anchors"
);
auto
anchors
=
detail
::
Ref
(
context
.
Input
<
Tensor
>
(
"Anchors"
),
auto
*
variances
=
context
.
Input
<
Tensor
>
(
"Variances"
);
"Cannot find input Anchors(%s) in scope"
,
context
.
Inputs
(
"Anchors"
)[
0
]);
auto
variances
=
detail
::
Ref
(
context
.
Input
<
Tensor
>
(
"Variances"
),
"Cannot find input Variances(%s) in scope"
,
context
.
Inputs
(
"Variances"
)[
0
]);
auto
*
rpn_rois
=
context
.
Output
<
LoDTensor
>
(
"RpnRois"
);
auto
*
rpn_rois
=
context
.
Output
<
LoDTensor
>
(
"RpnRois"
);
auto
*
rpn_roi_probs
=
context
.
Output
<
LoDTensor
>
(
"RpnRoiProbs"
);
auto
*
rpn_roi_probs
=
context
.
Output
<
LoDTensor
>
(
"RpnRoiProbs"
);
...
@@ -390,10 +406,8 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
...
@@ -390,10 +406,8 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
trans
(
dev_ctx
,
*
bbox_deltas
,
&
bbox_deltas_swap
,
axis
);
trans
(
dev_ctx
,
*
bbox_deltas
,
&
bbox_deltas_swap
,
axis
);
trans
(
dev_ctx
,
*
scores
,
&
scores_swap
,
axis
);
trans
(
dev_ctx
,
*
scores
,
&
scores_swap
,
axis
);
Tensor
*
anchor
=
const_cast
<
framework
::
Tensor
*>
(
anchors
);
anchors
.
Resize
({
anchors
.
numel
()
/
4
,
4
});
anchor
->
Resize
({
anchors
->
numel
()
/
4
,
4
});
variances
.
Resize
({
variances
.
numel
()
/
4
,
4
});
Tensor
*
var
=
const_cast
<
framework
::
Tensor
*>
(
variances
);
var
->
Resize
({
var
->
numel
()
/
4
,
4
});
rpn_rois
->
mutable_data
<
T
>
({
bbox_deltas
->
numel
()
/
4
,
4
},
rpn_rois
->
mutable_data
<
T
>
({
bbox_deltas
->
numel
()
/
4
,
4
},
context
.
GetPlace
());
context
.
GetPlace
());
...
@@ -402,7 +416,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
...
@@ -402,7 +416,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
T
*
rpn_rois_data
=
rpn_rois
->
data
<
T
>
();
T
*
rpn_rois_data
=
rpn_rois
->
data
<
T
>
();
T
*
rpn_roi_probs_data
=
rpn_roi_probs
->
data
<
T
>
();
T
*
rpn_roi_probs_data
=
rpn_roi_probs
->
data
<
T
>
();
auto
place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dev_ctx
.
GetPlace
());
auto
&
place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dev_ctx
.
GetPlace
());
int64_t
num_proposals
=
0
;
int64_t
num_proposals
=
0
;
std
::
vector
<
size_t
>
offset
(
1
,
0
);
std
::
vector
<
size_t
>
offset
(
1
,
0
);
...
@@ -415,12 +429,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
...
@@ -415,12 +429,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
scores_slice
.
Resize
({
h_score
*
w_score
*
c_score
,
1
});
scores_slice
.
Resize
({
h_score
*
w_score
*
c_score
,
1
});
std
::
pair
<
Tensor
,
Tensor
>
box_score_pair
=
std
::
pair
<
Tensor
,
Tensor
>
box_score_pair
=
ProposalForOneImage
<
T
>
(
dev_ctx
,
im_info_slice
,
*
anchor
,
*
var
,
ProposalForOneImage
<
T
>
(
dev_ctx
,
im_info_slice
,
anchors
,
variances
,
bbox_deltas_slice
,
scores_slice
,
pre_nms_top_n
,
bbox_deltas_slice
,
scores_slice
,
pre_nms_top_n
,
post_nms_top_n
,
nms_thresh
,
min_size
,
eta
);
post_nms_top_n
,
nms_thresh
,
min_size
,
eta
);
Tensor
proposals
=
box_score_pair
.
first
;
Tensor
&
proposals
=
box_score_pair
.
first
;
Tensor
scores
=
box_score_pair
.
second
;
Tensor
&
scores
=
box_score_pair
.
second
;
memory
::
Copy
(
place
,
rpn_rois_data
+
num_proposals
*
4
,
place
,
memory
::
Copy
(
place
,
rpn_rois_data
+
num_proposals
*
4
,
place
,
proposals
.
data
<
T
>
(),
sizeof
(
T
)
*
proposals
.
numel
(),
0
);
proposals
.
data
<
T
>
(),
sizeof
(
T
)
*
proposals
.
numel
(),
0
);
...
...
paddle/fluid/operators/gather.h
浏览文件 @
a6fbf7ec
...
@@ -39,11 +39,9 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
...
@@ -39,11 +39,9 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()));
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()));
// check index of shape 1-D
// check index of shape 1-D
PADDLE_ENFORCE
(
index
.
dims
().
size
()
==
1
);
PADDLE_ENFORCE
(
index
.
dims
().
size
()
==
1
);
int
index_size
=
index
.
dims
()[
0
];
int
64_t
index_size
=
index
.
dims
()[
0
];
auto
src_dims
=
src
.
dims
();
auto
src_dims
=
src
.
dims
();
framework
::
DDim
output_dims
(
src_dims
);
output_dims
[
0
]
=
index_size
;
const
T
*
p_src
=
src
.
data
<
T
>
();
const
T
*
p_src
=
src
.
data
<
T
>
();
const
int
*
p_index
=
index
.
data
<
int
>
();
const
int
*
p_index
=
index
.
data
<
int
>
();
...
@@ -55,7 +53,7 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
...
@@ -55,7 +53,7 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
const
size_t
slice_bytes
=
slice_size
*
sizeof
(
T
);
const
size_t
slice_bytes
=
slice_size
*
sizeof
(
T
);
for
(
int
i
=
0
;
i
<
index_size
;
++
i
)
{
for
(
int
64_t
i
=
0
;
i
<
index_size
;
++
i
)
{
int
index_
=
p_index
[
i
];
int
index_
=
p_index
[
i
];
memcpy
(
p_output
+
i
*
slice_size
,
p_src
+
index_
*
slice_size
,
slice_bytes
);
memcpy
(
p_output
+
i
*
slice_size
,
p_src
+
index_
*
slice_size
,
slice_bytes
);
}
}
...
...
paddle/fluid/string/pretty_log.h
浏览文件 @
a6fbf7ec
...
@@ -56,13 +56,13 @@ struct Style {
...
@@ -56,13 +56,13 @@ struct Style {
};
};
template
<
typename
...
Args
>
template
<
typename
...
Args
>
static
void
PrettyLogEndl
(
const
std
::
string
&
style
,
const
char
*
fmt
,
static
void
PrettyLogEndl
(
const
std
::
string
&
style
,
const
char
*
fmt
,
const
Args
&
...
args
)
{
const
Args
&
...
args
)
{
std
::
cerr
<<
style
<<
Sprintf
(
fmt
,
args
...)
<<
reset
()
<<
std
::
endl
;
std
::
cerr
<<
style
<<
Sprintf
(
fmt
,
args
...)
<<
reset
()
<<
std
::
endl
;
}
}
template
<
typename
...
Args
>
template
<
typename
...
Args
>
static
void
PrettyLog
(
const
std
::
string
&
style
,
const
char
*
fmt
,
static
void
PrettyLog
(
const
std
::
string
&
style
,
const
char
*
fmt
,
const
Args
&
...
args
)
{
const
Args
&
...
args
)
{
std
::
cerr
<<
style
<<
Sprintf
(
fmt
,
args
...)
<<
reset
();
std
::
cerr
<<
style
<<
Sprintf
(
fmt
,
args
...)
<<
reset
();
}
}
...
...
python/CMakeLists.txt
浏览文件 @
a6fbf7ec
...
@@ -87,6 +87,7 @@ if (WITH_TESTING)
...
@@ -87,6 +87,7 @@ if (WITH_TESTING)
endif
()
endif
()
endif
()
endif
()
add_subdirectory
(
paddle/fluid/tests
)
add_subdirectory
(
paddle/fluid/tests
)
add_subdirectory
(
paddle/fluid/contrib/tests
)
endif
()
endif
()
install
(
DIRECTORY
${
PADDLE_PYTHON_PACKAGE_DIR
}
install
(
DIRECTORY
${
PADDLE_PYTHON_PACKAGE_DIR
}
DESTINATION opt/paddle/share/wheels
DESTINATION opt/paddle/share/wheels
...
...
python/paddle/fluid/contrib/__init__.py
浏览文件 @
a6fbf7ec
...
@@ -20,8 +20,11 @@ from . import memory_usage_calc
...
@@ -20,8 +20,11 @@ from . import memory_usage_calc
from
.memory_usage_calc
import
*
from
.memory_usage_calc
import
*
from
.
import
op_frequence
from
.
import
op_frequence
from
.op_frequence
import
*
from
.op_frequence
import
*
from
.
import
quantize
from
.quantize
import
*
__all__
=
[]
__all__
=
[]
__all__
+=
decoder
.
__all__
__all__
+=
decoder
.
__all__
__all__
+=
memory_usage_calc
.
__all__
__all__
+=
memory_usage_calc
.
__all__
__all__
+=
op_frequence
.
__all__
__all__
+=
op_frequence
.
__all__
__all__
+=
quantize
.
__all__
python/paddle/fluid/contrib/quantize/__init__.py
0 → 100644
浏览文件 @
a6fbf7ec
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
from
.
import
quantize_transpiler
from
.quantize_transpiler
import
*
__all__
=
quantize_transpiler
.
__all__
python/paddle/fluid/contrib/quantize/quantize_transpiler.py
0 → 100644
浏览文件 @
a6fbf7ec
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
collections
import
numpy
as
np
from
paddle.fluid.framework
import
default_main_program
,
default_startup_program
,
program_guard
from
paddle.fluid.layer_helper
import
LayerHelper
from
paddle.fluid
import
unique_name
from
paddle.fluid
import
core
from
paddle.fluid.initializer
import
Constant
from
paddle.fluid.param_attr
import
ParamAttr
from
paddle.fluid.layer_helper
import
LayerHelper
from
paddle.fluid.layers.nn
import
autoincreased_step_counter
from
paddle.fluid.framework
import
Variable
from
paddle.fluid.executor
import
global_scope
from
paddle.fluid.transpiler.inference_transpiler
import
InferenceTranspiler
__all__
=
[
'QuantizeTranspiler'
]
_QUANTIZABLE_OP_TYPES
=
[
'conv2d'
,
'depthwise_conv2d'
,
'mul'
]
def
_quantized_var_name
(
var_name
):
"""
Return quantized variable name for the input `var_name`.
"""
return
"%s.quantized"
%
(
var_name
)
def
_dequantized_var_name
(
var_name
):
"""
Return dequantized variable name for the input `var_name`.
"""
return
"%s.dequantized"
%
(
var_name
)
def
_quantized_scale_name
(
var_name
):
"""
Return quantized variable name for the input `var_name`.
"""
return
"%s.scale"
%
(
var_name
)
def
_original_var_name
(
var_name
):
"""
Return the original variable name.
"""
if
var_name
.
endswith
(
'.quantized.dequantized'
):
return
var_name
[:
-
len
(
'.quantized.dequantized'
)]
if
var_name
.
endswith
(
'.quantized'
):
return
var_name
[:
-
len
(
'.quantized'
)]
if
var_name
.
endswith
(
'.dequantized'
):
return
var_name
[:
-
len
(
'.dequantized'
)]
if
var_name
.
endswith
(
'.scale'
):
return
var_name
[:
-
len
(
'.scale'
)]
else
:
return
var_name
def
_is_float
(
v
):
return
isinstance
(
v
,
float
)
or
isinstance
(
v
,
np
.
float32
)
def
quant
(
x
,
scale
,
num_bits
):
y
=
np
.
round
(
x
/
scale
*
((
1
<<
(
num_bits
-
1
))
-
1
))
return
y
class
QuantizeTranspiler
(
object
):
def
__init__
(
self
,
weight_bits
=
8
,
activation_bits
=
8
,
activation_quantize_type
=
'abs_max'
,
weight_quantize_type
=
'abs_max'
,
window_size
=
10000
):
"""
Convert and rewrite the fluid Program according to weight and
activation quantization type.
Args:
weight_bits (int): quantization bit number for weights,
the bias is not quantized.
activation_bits (int): quantization bit number for activation.
activation_quantize_type (str): quantization type for activation,
now support 'abs_max', 'range_abs_max'. If use 'abs_max' mode,
the quantization scale will be calculated dynamically each step
in both training and testing period. If use 'range_abs_max',
a static quantization scale will be calculated during training
and used in inference.
weight_quantize_type (str): quantization type for weights,
support 'abs_max'. The 'range_abs_max' usually is not used for
weight, since weights are fixed once the model is well trained.
window_size (int): the window size for 'range_abs_max' quantization.
Examples:
.. code-block:: python
# the original program will be rewrite, if you don't want to
# change it, please clone at first.
# quantize_program = program.clone()
t = fluid.QuantizeTranspiler()
t.transpile(quantize_program)
"""
self
.
weight_bits
=
weight_bits
self
.
activation_bits
=
activation_bits
quant_type
=
[
'abs_max'
,
'range_abs_max'
]
if
weight_quantize_type
not
in
quant_type
:
raise
ValueError
(
"Unknown weight_quantize_type: '%s'. It can only be "
,
"'abs_max' or 'range_abs_max'."
,
str
(
weight_quantize_type
))
if
activation_quantize_type
not
in
quant_type
:
raise
ValueError
(
"Unknown activation_quantize_type : '%s'. It can only be "
,
"'abs_max' or 'range_abs_max'."
,
str
(
activation_quantize_type
))
self
.
weight_quantize_type
=
weight_quantize_type
self
.
activation_quantize_type
=
activation_quantize_type
self
.
window_size
=
window_size
self
.
helper
=
LayerHelper
(
self
.
__class__
.
__name__
)
self
.
fake_quant_op_types
=
[
'fake_quantize_abs_max'
,
'fake_quantize_range_abs_max'
]
self
.
fake_dequant_op_types
=
[
'fake_dequantize_max_abs'
]
self
.
is_test
=
None
self
.
global_step
=
None
def
training_transpile
(
self
,
program
=
None
,
startup_program
=
None
):
"""Rewrites a training input program in place for simulated
quantization. Insert fake quantization and de-quantization ops into
program to simulate the error introduced by quantization. And change
the graident ops' input by using the faked quantization weights and
activation. Since the program is transformed in place, the graph
connection will change.
Args:
program (Program): the input program to be transpile.
"""
self
.
is_test
=
False
program
=
default_main_program
()
if
program
is
None
else
program
startup_program
=
default_startup_program
()
if
startup_program
is
\
None
else
startup_program
# marked the variable which has been quantized and dequantized.
dequanted_vars
=
[
collections
.
OrderedDict
()
for
_
in
range
(
len
(
program
.
blocks
))
]
grad_op_types
=
[
'%s_grad'
%
(
type
)
for
type
in
_QUANTIZABLE_OP_TYPES
]
params
=
[
p
.
name
for
p
in
program
.
global_block
().
iter_parameters
()]
def
_transpile_forward
(
block
,
op
):
idx
=
block
.
ops
.
index
(
op
)
block_id
=
block
.
idx
# insert quant op and dequant op
for
name
in
op
.
input_arg_names
:
if
name
in
dequanted_vars
[
block_id
]:
dequant_var
=
dequanted_vars
[
block_id
][
name
]
else
:
var
=
block
.
var
(
name
)
quant_bits
=
self
.
weight_bits
if
var
.
name
in
params
\
else
self
.
activation_bits
quant_type
=
self
.
weight_quantize_type
if
var
.
name
\
in
params
else
self
.
activation_quantize_type
quant_var
,
scale_var
=
self
.
_insert_quant_op
(
block
,
idx
,
var
,
quant_bits
,
quant_type
)
dequant_var
=
self
.
_insert_dequant_op
(
block
,
idx
+
1
,
quant_var
,
scale_var
,
quant_bits
)
dequanted_vars
[
block_id
][
name
]
=
dequant_var
# rename the forward op inputs
op
.
_rename_input
(
name
,
dequant_var
.
name
)
def
_transpile_backward
(
block
,
op
):
block_id
=
block
.
idx
no_dequanted_input_vars
=
True
for
name
in
op
.
input_arg_names
:
if
name
in
dequanted_vars
[
block_id
]:
dequant_var
=
dequanted_vars
[
block_id
][
name
]
op
.
_rename_input
(
name
,
dequant_var
.
name
)
no_dequanted_input_vars
=
False
if
no_dequanted_input_vars
:
raise
ValueError
(
"There is no dequanted inputs for op %s."
%
(
op
.
type
))
with
program_guard
(
program
,
startup_program
):
self
.
_create_global_step
()
for
block
in
program
.
blocks
:
ops
=
list
(
block
.
ops
)
block_id
=
block
.
idx
for
op
in
ops
:
# rewrite the forward ProgramDes
if
op
.
type
in
_QUANTIZABLE_OP_TYPES
:
_transpile_forward
(
block
,
op
)
# rename the backward op inputs
if
op
.
type
in
grad_op_types
:
_transpile_backward
(
block
,
op
)
def
_create_global_step
(
self
):
if
self
.
weight_quantize_type
==
'range_abs_max'
or
\
self
.
activation_quantize_type
==
'range_abs_max'
:
self
.
global_step
=
autoincreased_step_counter
()
def
freeze_program
(
self
,
program
,
place
,
fuse_bn
=
False
,
scope
=
None
):
"""Freeze input training program for inference.
Args:
program (Program): the input program to be transpile.
"""
self
.
is_test
=
True
scope
=
global_scope
()
if
scope
is
None
else
scope
program
=
default_main_program
()
if
program
is
None
else
program
if
fuse_bn
:
bn_fuse_transpiler
=
BNFuseTranspiler
()
bn_fuse_transpiler
.
transpile
(
program
,
place
)
persistable_vars
=
[
v
.
name
for
v
in
filter
(
lambda
var
:
var
.
persistable
,
program
.
list_vars
())
]
op_in_rename_map
=
[
collections
.
OrderedDict
()
for
_
in
range
(
len
(
program
.
blocks
))
]
op_out_rename_map
=
[
collections
.
OrderedDict
()
for
_
in
range
(
len
(
program
.
blocks
))
]
var_scale_map
=
[
collections
.
OrderedDict
()
for
_
in
range
(
len
(
program
.
blocks
))
]
def
_remove_fake_quant_and_dequant_op
(
block
,
op
):
idx
=
block
.
ops
.
index
(
op
)
block_id
=
block
.
idx
k
=
op
.
output
(
'Out'
)[
0
]
v
=
op
.
input
(
'X'
)[
0
]
if
v
not
in
op_in_rename_map
[
block_id
]:
op_in_rename_map
[
block_id
][
k
]
=
v
else
:
op_in_rename_map
[
block_id
][
k
]
=
op_in_rename_map
[
block_id
][
v
]
block
.
_remove_op
(
idx
)
def
_insert_post_dequant_op
(
block
,
op
):
idx
=
block
.
ops
.
index
(
op
)
block_id
=
block
.
idx
max_range
=
None
scale_var
=
None
for
name
in
op
.
input_arg_names
:
if
name
in
op_in_rename_map
[
block_id
]:
op
.
_rename_input
(
name
,
op_in_rename_map
[
block_id
][
name
])
scale_v
=
var_scale_map
[
block_id
][
_original_var_name
(
name
)]
if
_original_var_name
(
name
)
in
persistable_vars
:
param_range
=
(
1
<<
(
self
.
weight_bits
-
1
))
-
1
act_range
=
(
1
<<
(
self
.
activation_bits
-
1
))
-
1
assert
_is_float
(
scale_v
)
max_range
=
param_range
*
act_range
/
scale_v
else
:
assert
isinstance
(
scale_v
,
Variable
)
scale_var
=
var_scale_map
[
block_id
][
_original_var_name
(
name
)]
if
len
(
op
.
output_arg_names
)
!=
1
:
raise
ValueError
(
"Only support one output, but op %s has"
" more than one output."
%
(
op
.
type
))
out_var
=
block
.
var
(
op
.
output_arg_names
[
0
])
dequant_var
=
block
.
create_var
(
name
=
_dequantized_var_name
(
out_var
.
name
),
type
=
out_var
.
type
,
shape
=
out_var
.
shape
,
dtype
=
out_var
.
dtype
)
# insert fake_dequantize_op
dequant_op
=
block
.
_insert_op
(
idx
+
1
,
type
=
"fake_dequantize_max_abs"
,
attrs
=
{
'max_range'
:
float
(
max_range
)},
inputs
=
{
"X"
:
out_var
,
'Scale'
:
scale_var
},
outputs
=
{
"Out"
:
dequant_var
})
op_out_rename_map
[
block_id
][
out_var
.
name
]
=
dequant_var
.
name
return
dequant_var
def
_load_var
(
name
):
return
np
.
array
(
scope
.
find_var
(
name
).
get_tensor
())
def
_restore_var
(
name
,
arr
):
t
=
scope
.
find_var
(
name
).
get_tensor
()
t
.
set
(
arr
,
place
)
for
block
in
program
.
blocks
:
ops
=
list
(
block
.
ops
)
block_id
=
block
.
idx
for
op
in
ops
:
op_type
=
op
.
type
# insert dequant_op after fc/conv, need to rename
# input of the followed ops
for
name
in
op
.
input_arg_names
:
if
name
in
op_out_rename_map
[
block_id
]:
op
.
_rename_input
(
name
,
op_out_rename_map
[
block_id
][
name
])
if
op_type
in
self
.
fake_quant_op_types
:
in_arg_name
=
op
.
input
(
'X'
)[
0
]
if
in_arg_name
in
persistable_vars
:
if
self
.
weight_quantize_type
==
'abs_max'
:
param
=
_load_var
(
in_arg_name
)
scale_v
=
np
.
max
(
np
.
abs
(
param
))
else
:
scale_v
=
_load_var
(
op
.
output
(
'OutScale'
)[
0
])
var_scale_map
[
block_id
][
in_arg_name
]
=
scale_v
else
:
scale_v
=
block
.
var
(
op
.
output
(
'OutScale'
)[
0
])
var_scale_map
[
block_id
][
in_arg_name
]
=
scale_v
if
in_arg_name
in
persistable_vars
:
_remove_fake_quant_and_dequant_op
(
block
,
op
)
# quantize weight and restore
param_t
=
_load_var
(
in_arg_name
)
param_q_t
=
quant
(
param_t
,
scale_v
,
self
.
weight_bits
)
_restore_var
(
in_arg_name
,
param_q_t
)
if
op_type
in
self
.
fake_dequant_op_types
:
_remove_fake_quant_and_dequant_op
(
block
,
op
)
if
op_type
in
_QUANTIZABLE_OP_TYPES
:
dequant_var
=
_insert_post_dequant_op
(
block
,
op
)
# remove the unused var in ProgramDesc
self
.
_remove_unused_var
(
program
)
#program = program.clone()
def
convert_to_int8
(
self
,
program
,
place
,
scope
=
None
):
scope
=
global_scope
()
if
scope
is
None
else
scope
program
=
default_main_program
()
if
program
is
None
else
program
def
_load_var
(
name
):
return
np
.
array
(
scope
.
find_var
(
name
).
get_tensor
())
global_block
=
program
.
global_block
()
def
convert_to_int8
(
var
):
int8_var_name
=
var
.
name
+
".int8"
int8_var
=
global_block
.
create_parameter
(
name
=
int8_var_name
.
encode
(
'ascii'
),
type
=
var
.
type
,
dtype
=
core
.
VarDesc
.
VarType
.
INT8
,
shape
=
var
.
shape
)
tensor
=
_load_var
(
var
.
name
)
scope
.
var
(
int8_var_name
)
int8_tensor
=
scope
.
find_var
(
int8_var_name
).
get_tensor
()
int8_tensor
.
set
(
tensor
.
astype
(
np
.
int8
),
place
)
return
int8_var
input_map
=
{}
for
block
in
program
.
blocks
:
for
op
in
list
(
block
.
ops
):
if
op
.
type
in
_QUANTIZABLE_OP_TYPES
:
for
name
in
op
.
input_arg_names
:
var
=
block
.
var
(
name
)
if
var
.
persistable
:
if
name
not
in
input_map
:
int8_var
=
convert_to_int8
(
var
)
input_map
[
name
]
=
int8_var
.
name
op
.
_rename_input
(
name
,
input_map
[
name
])
self
.
_remove_unused_var
(
program
)
def
_remove_unused_var
(
self
,
program
):
all_remove_vars
=
[]
for
block
in
program
.
blocks
:
args
=
[]
for
op
in
block
.
ops
:
args
+=
op
.
input_arg_names
args
+=
op
.
output_arg_names
args
=
list
(
set
(
args
))
var_names
=
block
.
vars
.
keys
()
sub_block_remove_vars
=
[]
for
var
in
var_names
:
if
var
not
in
args
:
sub_block_remove_vars
.
append
(
var
)
all_remove_vars
.
append
(
sub_block_remove_vars
)
remove_vars
=
[
list
(
set
(
v
))
for
v
in
all_remove_vars
]
for
i
,
block
in
enumerate
(
program
.
blocks
):
for
v
in
remove_vars
[
i
]:
block
.
_remove_var
(
v
)
def
_insert_quant_abs_max_op
(
self
,
block
,
idx
,
var
,
quant_bits
):
"""Insert fake_quantize_abs_max op.
"""
quant_var
=
block
.
create_var
(
name
=
_quantized_var_name
(
var
.
name
),
type
=
var
.
type
,
shape
=
var
.
shape
,
dtype
=
var
.
dtype
)
scale
=
block
.
create_var
(
name
=
_quantized_scale_name
(
var
.
name
),
type
=
var
.
type
,
shape
=
var
.
shape
,
dtype
=
var
.
dtype
)
quant_op
=
block
.
_insert_op
(
idx
,
type
=
'fake_quantize_abs_max'
,
attrs
=
{
'bit_length'
:
quant_bits
},
inputs
=
{
'X'
:
var
},
outputs
=
{
'Out'
:
quant_var
,
'OutScale'
:
scale
})
return
quant_var
,
scale
def
_insert_quant_range_abs_max_op
(
self
,
block
,
idx
,
var
,
quant_bits
):
"""Insert fake_quantize_range_abs_max
"""
quant_var
=
block
.
create_var
(
name
=
_quantized_var_name
(
var
.
name
),
type
=
var
.
type
,
shape
=
var
.
shape
,
dtype
=
var
.
dtype
)
scale
=
self
.
helper
.
create_parameter
(
attr
=
ParamAttr
(
name
=
_quantized_scale_name
(
var
.
name
),
initializer
=
Constant
(
0.001
),
trainable
=
False
),
shape
=
[
1
],
dtype
=
var
.
dtype
)
scale
.
stop_gradient
=
True
ins
=
{
'X'
:
var
,
'InScale'
:
scale
}
outs
=
{
'Out'
:
quant_var
,
'OutScale'
:
scale
}
if
not
self
.
is_test
:
# A global step counter variable with type int64
scales
=
self
.
helper
.
create_global_variable
(
name
=
unique_name
.
generate
(
'scales'
),
persistable
=
True
,
dtype
=
var
.
dtype
,
shape
=
[
self
.
window_size
])
self
.
helper
.
set_variable_initializer
(
scales
,
initializer
=
Constant
(
value
=
0
))
ins
[
'Iter'
]
=
self
.
global_step
outs
[
'OutScales'
]
=
scales
attrs
=
{
'window_size'
:
self
.
window_size
,
'bit_length'
:
quant_bits
,
'is_test'
:
self
.
is_test
}
quant_op
=
block
.
_insert_op
(
idx
,
type
=
'fake_quantize_range_abs_max'
,
attrs
=
attrs
,
inputs
=
ins
,
outputs
=
outs
)
return
quant_var
,
scale
def
_insert_quant_op
(
self
,
block
,
idx
,
var
,
quant_bits
,
quant_type
):
"""
Insert fake_quantize_op
"""
if
quant_type
==
'abs_max'
:
return
self
.
_insert_quant_abs_max_op
(
block
,
idx
,
var
,
quant_bits
)
elif
quant_type
==
'range_abs_max'
:
return
self
.
_insert_quant_range_abs_max_op
(
block
,
idx
,
var
,
quant_bits
)
def
_insert_dequant_op
(
self
,
block
,
idx
,
var
,
scale
,
quant_bits
):
"""
Insert fake_quantize_op
"""
dequant_var
=
block
.
create_var
(
name
=
_dequantized_var_name
(
var
.
name
),
type
=
var
.
type
,
shape
=
var
.
shape
,
dtype
=
var
.
dtype
)
# insert fake_dequantize_op
max_range
=
(
1
<<
(
quant_bits
-
1
))
-
1
dequant_op
=
block
.
_insert_op
(
idx
,
type
=
"fake_dequantize_max_abs"
,
attrs
=
{
'max_range'
:
float
(
max_range
)},
inputs
=
{
"X"
:
var
,
'Scale'
:
scale
},
outputs
=
{
"Out"
:
dequant_var
})
return
dequant_var
class
BNFuseTranspiler
(
InferenceTranspiler
):
def
_fuse_param
(
self
,
current_op
,
bn_op
,
bias_op
,
with_bias
):
def
_update_param
(
op
,
param_name
,
new_param
):
var
=
self
.
block
.
vars
[
param_name
]
tensor
=
self
.
scope
.
find_var
(
param_name
).
get_tensor
()
tensor
.
set
(
np
.
array
(
new_param
),
self
.
place
)
def
_load_param
(
param_name
):
return
np
.
array
(
self
.
scope
.
find_var
(
param_name
).
get_tensor
())
bias_bn
=
_load_param
(
bn_op
.
input
(
"Bias"
)[
0
])
#Bias
scale_bn
=
_load_param
(
bn_op
.
input
(
"Scale"
)[
0
])
#Scale
mean_bn
=
_load_param
(
bn_op
.
input
(
"Mean"
)[
0
])
#Mean
var_bn
=
_load_param
(
bn_op
.
input
(
"Variance"
)[
0
])
#Variance
if
current_op
.
type
in
[
'conv2d'
,
'depthwise_conv2d'
]:
current_param
=
_load_param
(
_original_var_name
(
current_op
.
input
(
"Filter"
)[
0
]))
elif
current_op
.
type
==
'mul'
:
current_param
=
_load_param
(
_original_var_name
(
current_op
.
input
(
"Y"
)[
0
]))
std_bn
=
np
.
float32
(
np
.
sqrt
(
np
.
add
(
var_bn
,
1e-5
)))
tmp
=
np
.
float32
(
np
.
divide
(
scale_bn
,
std_bn
))
# add bias of batch_norm_op to conv2d
if
with_bias
:
bias
=
_load_param
(
bias_op
.
input
(
"Y"
))
else
:
bias
=
np
.
zeros
(
bias_bn
.
shape
)
bias
=
np
.
float32
(
np
.
add
(
np
.
multiply
(
np
.
subtract
(
bias
,
mean_bn
),
tmp
),
bias_bn
))
# re-compute weight of conv2d/fc
tmp
=
tmp
.
reshape
(
tmp
.
shape
[
0
],
-
1
)
dst_param
=
current_param
.
reshape
((
tmp
.
shape
[
0
],
-
1
))
dst_param
=
np
.
float32
(
np
.
multiply
(
dst_param
,
tmp
))
dst_param
=
dst_param
.
reshape
(
current_param
.
shape
)
# update parameters
if
current_op
.
type
in
[
'conv2d'
,
'depthwise_conv2d'
]:
_update_param
(
current_op
,
_original_var_name
(
current_op
.
input
(
"Filter"
)[
0
]),
dst_param
)
elif
current_op
.
type
==
'mul'
:
_update_param
(
current_op
,
_original_var_name
(
current_op
.
input
(
"Y"
)[
0
]),
dst_param
)
_update_param
(
bias_op
,
bias_op
.
input
(
"Y"
)[
0
],
bias
)
# collect the renamed input
self
.
input_map
[
bn_op
.
output
(
"Y"
)[
0
]]
=
bias_op
.
output
(
"Out"
)[
0
]
python/paddle/fluid/contrib/tests/CMakeLists.txt
0 → 100644
浏览文件 @
a6fbf7ec
file
(
GLOB TEST_OPS RELATIVE
"
${
CMAKE_CURRENT_SOURCE_DIR
}
"
"test_*.py"
)
string
(
REPLACE
".py"
""
TEST_OPS
"
${
TEST_OPS
}
"
)
foreach
(
src
${
TEST_OPS
}
)
py_test
(
${
src
}
SRCS
${
src
}
.py
)
endforeach
()
python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
0 → 100644
浏览文件 @
a6fbf7ec
# copyright (c) 2018 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import
numpy
as
np
import
six
import
unittest
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fluid.contrib.quantize.quantize_transpiler
import
_original_var_name
from
paddle.fluid.contrib.quantize.quantize_transpiler
import
QuantizeTranspiler
def
linear_fc
(
num
):
data
=
fluid
.
layers
.
data
(
name
=
'image'
,
shape
=
[
1
,
32
,
32
],
dtype
=
'float32'
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
)
hidden
=
data
for
_
in
six
.
moves
.
xrange
(
num
):
hidden
=
fluid
.
layers
.
fc
(
hidden
,
size
=
128
,
act
=
'relu'
)
loss
=
fluid
.
layers
.
cross_entropy
(
input
=
hidden
,
label
=
label
)
loss
=
fluid
.
layers
.
mean
(
loss
)
return
loss
def
residual_block
(
num
):
def
conv_bn_layer
(
input
,
ch_out
,
filter_size
,
stride
,
padding
,
act
=
'relu'
,
bias_attr
=
False
):
tmp
=
fluid
.
layers
.
conv2d
(
input
=
input
,
filter_size
=
filter_size
,
num_filters
=
ch_out
,
stride
=
stride
,
padding
=
padding
,
act
=
None
,
bias_attr
=
bias_attr
)
return
fluid
.
layers
.
batch_norm
(
input
=
tmp
,
act
=
act
)
data
=
fluid
.
layers
.
data
(
name
=
'image'
,
shape
=
[
1
,
32
,
32
],
dtype
=
'float32'
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
)
hidden
=
data
for
_
in
six
.
moves
.
xrange
(
num
):
conv
=
conv_bn_layer
(
hidden
,
16
,
3
,
1
,
1
,
act
=
None
,
bias_attr
=
True
)
short
=
conv_bn_layer
(
hidden
,
16
,
1
,
1
,
0
,
act
=
None
)
hidden
=
fluid
.
layers
.
elementwise_add
(
x
=
conv
,
y
=
short
,
act
=
'relu'
)
fc
=
fluid
.
layers
.
fc
(
input
=
hidden
,
size
=
10
)
loss
=
fluid
.
layers
.
cross_entropy
(
input
=
fc
,
label
=
label
)
loss
=
fluid
.
layers
.
mean
(
loss
)
return
loss
def
conv_net
(
img
,
label
):
conv_pool_1
=
fluid
.
nets
.
simple_img_conv_pool
(
input
=
img
,
filter_size
=
5
,
num_filters
=
20
,
pool_size
=
2
,
pool_stride
=
2
,
act
=
"relu"
)
conv_pool_1
=
fluid
.
layers
.
batch_norm
(
conv_pool_1
)
conv_pool_2
=
fluid
.
nets
.
simple_img_conv_pool
(
input
=
conv_pool_1
,
filter_size
=
5
,
num_filters
=
50
,
pool_size
=
2
,
pool_stride
=
2
,
act
=
"relu"
)
prediction
=
fluid
.
layers
.
fc
(
input
=
conv_pool_2
,
size
=
10
,
act
=
'softmax'
)
loss
=
fluid
.
layers
.
cross_entropy
(
input
=
prediction
,
label
=
label
)
avg_loss
=
fluid
.
layers
.
mean
(
loss
)
return
avg_loss
class
TestQuantizeTranspiler
(
unittest
.
TestCase
):
def
setUp
(
self
):
# since quant_op and dequant_op is not ready, use cos and sin for test
self
.
weight_quant_op_type
=
'fake_quantize_abs_max'
self
.
dequant_op_type
=
'fake_dequantize_max_abs'
self
.
quantizable_op_and_inputs
=
{
'conv2d'
:
[
'Input'
,
'Filter'
],
'depthwise_conv2d'
:
[
'Input'
,
'Filter'
],
'mul'
:
[
'X'
,
'Y'
]
}
self
.
quantizable_op_grad_and_inputs
=
{
'conv2d_grad'
:
[
'Input'
,
'Filter'
],
'depthwise_conv2d_grad'
:
[
'Input'
,
'Filter'
],
'mul_grad'
:
[
'X'
,
'Y'
]
}
def
check_program
(
self
,
program
):
quantized_ops
=
{}
persistable_vars
=
[
v
.
name
for
v
in
filter
(
lambda
var
:
var
.
persistable
,
program
.
list_vars
())
]
for
block
in
program
.
blocks
:
for
idx
,
op
in
enumerate
(
block
.
ops
):
# check forward
if
op
.
type
in
self
.
quantizable_op_and_inputs
:
for
i
,
arg_name
in
enumerate
(
op
.
input_arg_names
):
quant_op_type
=
self
.
weight_quant_op_type
if
\
_original_var_name
(
arg_name
)
\
in
persistable_vars
else
self
.
act_quant_op_type
self
.
assertTrue
(
arg_name
.
endswith
(
'.quantized.dequantized'
))
if
arg_name
not
in
quantized_ops
:
self
.
assertEqual
(
block
.
ops
[
idx
-
2
*
i
-
1
].
type
,
self
.
dequant_op_type
)
self
.
assertEqual
(
block
.
ops
[
idx
-
2
*
i
-
2
].
type
,
quant_op_type
)
quantized_ops
[
arg_name
]
=
block
.
ops
[
idx
-
2
*
i
-
2
]
else
:
op_idx
=
block
.
ops
.
index
(
quantized_ops
[
arg_name
])
self
.
assertLess
(
op_idx
,
idx
)
# check backward
if
op
.
type
in
self
.
quantizable_op_grad_and_inputs
:
for
pname
in
self
.
quantizable_op_grad_and_inputs
[
op
.
type
]:
arg_name
=
op
.
input
(
pname
)[
0
]
self
.
assertTrue
(
arg_name
.
endswith
(
'.quantized.dequantized'
))
self
.
assertTrue
(
arg_name
in
quantized_ops
)
def
linear_fc_quant
(
self
,
quant_type
):
main
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main
,
startup
):
loss
=
linear_fc
(
3
)
opt
=
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
opt
.
minimize
(
loss
)
t
=
QuantizeTranspiler
(
activation_quantize_type
=
quant_type
)
t
.
training_transpile
(
main
)
self
.
check_program
(
main
)
def
test_linear_fc_quant_abs_max
(
self
):
self
.
act_quant_op_type
=
'fake_quantize_abs_max'
self
.
linear_fc_quant
(
'abs_max'
)
def
test_linear_fc_quant_range_abs_max
(
self
):
self
.
act_quant_op_type
=
'fake_quantize_range_abs_max'
self
.
linear_fc_quant
(
'range_abs_max'
)
def
residual_block_quant
(
self
,
quant_type
):
main
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main
,
startup
):
loss
=
residual_block
(
2
)
opt
=
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
opt
.
minimize
(
loss
)
t
=
QuantizeTranspiler
(
activation_quantize_type
=
quant_type
)
t
.
training_transpile
(
main
)
self
.
check_program
(
main
)
def
test_residual_block_abs_max
(
self
):
self
.
act_quant_op_type
=
'fake_quantize_abs_max'
self
.
residual_block_quant
(
'abs_max'
)
def
test_residual_block_range_abs_max
(
self
):
self
.
act_quant_op_type
=
'fake_quantize_range_abs_max'
self
.
residual_block_quant
(
'range_abs_max'
)
def
freeze_program
(
self
,
use_cuda
):
def
build_program
(
main
,
startup
,
is_test
):
with
fluid
.
unique_name
.
guard
():
with
fluid
.
program_guard
(
main
,
startup
):
img
=
fluid
.
layers
.
data
(
name
=
'image'
,
shape
=
[
1
,
28
,
28
],
dtype
=
'float32'
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
)
loss
=
conv_net
(
img
,
label
)
if
not
is_test
:
opt
=
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
opt
.
minimize
(
loss
)
return
[
img
,
label
],
loss
main
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
test_program
=
fluid
.
Program
()
feeds
,
loss
=
build_program
(
main
,
startup
,
False
)
build_program
(
test_program
,
startup
,
True
)
test_program
=
test_program
.
clone
(
for_test
=
True
)
quant_transpiler
=
QuantizeTranspiler
()
quant_transpiler
.
training_transpile
(
main
)
quant_transpiler
.
training_transpile
(
test_program
)
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
iter
=
5
batch_size
=
8
class_num
=
10
exe
.
run
(
startup
)
train_reader
=
paddle
.
batch
(
paddle
.
reader
.
shuffle
(
paddle
.
dataset
.
mnist
.
train
(),
buf_size
=
500
),
batch_size
=
batch_size
)
test_reader
=
paddle
.
batch
(
paddle
.
dataset
.
mnist
.
test
(),
batch_size
=
batch_size
)
feeder
=
fluid
.
DataFeeder
(
feed_list
=
feeds
,
place
=
place
)
with
fluid
.
program_guard
(
main
):
for
_
in
range
(
iter
):
data
=
next
(
train_reader
())
loss_v
=
exe
.
run
(
program
=
main
,
feed
=
feeder
.
feed
(
data
),
fetch_list
=
[
loss
])
with
fluid
.
program_guard
(
test_program
):
test_data
=
next
(
test_reader
())
w_var
=
fluid
.
framework
.
_get_var
(
'conv2d_1.w_0.quantized'
,
test_program
)
# Testing during training
test_loss1
,
w_quant
=
exe
.
run
(
program
=
test_program
,
feed
=
feeder
.
feed
(
test_data
),
fetch_list
=
[
loss
,
w_var
])
# Freeze program for inference, but the weight of fc/conv is still float type.
quant_transpiler
.
freeze_program
(
test_program
,
place
)
test_loss2
,
=
exe
.
run
(
program
=
test_program
,
feed
=
feeder
.
feed
(
test_data
),
fetch_list
=
[
loss
])
self
.
assertAlmostEqual
(
test_loss1
,
test_loss2
,
delta
=
1e-3
)
w_freeze
=
np
.
array
(
fluid
.
global_scope
().
find_var
(
'conv2d_1.w_0'
)
.
get_tensor
())
self
.
assertEqual
(
np
.
sum
(
w_freeze
),
np
.
sum
(
w_quant
))
# Convert parameter to 8-bit.
quant_transpiler
.
convert_to_int8
(
test_program
,
place
)
# Save the 8-bit parameter and model file.
fluid
.
io
.
save_inference_model
(
'model_8bit'
,
[
'image'
,
'label'
],
[
loss
],
exe
,
test_program
)
# Test whether the 8-bit parameter and model file can be loaded successfully.
[
infer
,
feed
,
fetch
]
=
fluid
.
io
.
load_inference_model
(
'model_8bit'
,
exe
)
# Check the loaded 8-bit weight.
w_8bit
=
np
.
array
(
fluid
.
global_scope
().
find_var
(
'conv2d_1.w_0.int8'
)
.
get_tensor
())
self
.
assertEqual
(
w_8bit
.
dtype
,
np
.
int8
)
self
.
assertEqual
(
np
.
sum
(
w_8bit
),
np
.
sum
(
w_freeze
))
def
test_freeze_program_cuda
(
self
):
if
fluid
.
core
.
is_compiled_with_cuda
():
with
fluid
.
unique_name
.
guard
():
self
.
freeze_program
(
True
)
def
test_freeze_program_cpu
(
self
):
with
fluid
.
unique_name
.
guard
():
self
.
freeze_program
(
False
)
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
a6fbf7ec
...
@@ -28,7 +28,6 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl
...
@@ -28,7 +28,6 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl
list
(
REMOVE_ITEM TEST_OPS op_test
)
# op_test is a helper python file, not a test
list
(
REMOVE_ITEM TEST_OPS op_test
)
# op_test is a helper python file, not a test
list
(
REMOVE_ITEM TEST_OPS decorators
)
# decorators is a helper python file, not a test
list
(
REMOVE_ITEM TEST_OPS decorators
)
# decorators is a helper python file, not a test
if
(
APPLE
)
if
(
APPLE
)
if
(
NOT WITH_DISTRIBUTE
)
if
(
NOT WITH_DISTRIBUTE
)
list
(
REMOVE_ITEM TEST_OPS test_desc_clone
)
list
(
REMOVE_ITEM TEST_OPS test_desc_clone
)
...
...
python/paddle/fluid/tests/unittests/test_dist_base.py
浏览文件 @
a6fbf7ec
...
@@ -50,9 +50,7 @@ class TestDistRunnerBase(object):
...
@@ -50,9 +50,7 @@ class TestDistRunnerBase(object):
def
run_pserver
(
self
,
args
):
def
run_pserver
(
self
,
args
):
self
.
get_model
(
batch_size
=
2
)
self
.
get_model
(
batch_size
=
2
)
# NOTE: pserver should not call memory optimize
if
args
.
mem_opt
:
fluid
.
memory_optimize
(
fluid
.
default_main_program
())
t
=
self
.
get_transpiler
(
args
.
trainer_id
,
t
=
self
.
get_transpiler
(
args
.
trainer_id
,
fluid
.
default_main_program
(),
args
.
endpoints
,
fluid
.
default_main_program
(),
args
.
endpoints
,
args
.
trainers
,
args
.
sync_mode
)
args
.
trainers
,
args
.
sync_mode
)
...
@@ -70,7 +68,7 @@ class TestDistRunnerBase(object):
...
@@ -70,7 +68,7 @@ class TestDistRunnerBase(object):
self
.
get_model
(
batch_size
=
2
)
self
.
get_model
(
batch_size
=
2
)
if
args
.
mem_opt
:
if
args
.
mem_opt
:
fluid
.
memory_optimize
(
fluid
.
default_main_program
())
fluid
.
memory_optimize
(
fluid
.
default_main_program
()
,
skip_grads
=
True
)
if
args
.
is_dist
:
if
args
.
is_dist
:
t
=
self
.
get_transpiler
(
args
.
trainer_id
,
t
=
self
.
get_transpiler
(
args
.
trainer_id
,
fluid
.
default_main_program
(),
fluid
.
default_main_program
(),
...
...
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
浏览文件 @
a6fbf7ec
...
@@ -26,14 +26,13 @@ class TestDistSeResneXt2x2(TestDistBase):
...
@@ -26,14 +26,13 @@ class TestDistSeResneXt2x2(TestDistBase):
self
.
check_with_place
(
"dist_se_resnext.py"
,
delta
=
100
)
self
.
check_with_place
(
"dist_se_resnext.py"
,
delta
=
100
)
# TODO(typhoonzero): fix this test
class
TestDistseResnXt2x2WithMemopt
(
TestDistBase
):
# class TestDistseResnXt2x2WithMemopt(TestDistBase):
def
_setup_config
(
self
):
# def _setup_config(self):
self
.
_sync_mode
=
True
# self._sync_mode = True
self
.
_mem_opt
=
True
# self._mem_opt = True
def
test_dist_train
(
self
):
# def test_dist_train(self):
self
.
check_with_place
(
"dist_se_resnext.py"
,
delta
=
100
)
# self.check_with_place("dist_se_resnext.py", delta=1e-7)
class
TestDistSeResneXt2x2Async
(
TestDistBase
):
class
TestDistSeResneXt2x2Async
(
TestDistBase
):
...
...
python/paddle/fluid/transpiler/__init__.py
浏览文件 @
a6fbf7ec
...
@@ -20,6 +20,10 @@ from .memory_optimization_transpiler import memory_optimize, release_memory
...
@@ -20,6 +20,10 @@ from .memory_optimization_transpiler import memory_optimize, release_memory
from
.ps_dispatcher
import
HashName
,
RoundRobin
from
.ps_dispatcher
import
HashName
,
RoundRobin
__all__
=
[
__all__
=
[
"DistributeTranspiler"
,
"memory_optimize"
,
"release_memory"
,
"HashName"
,
"DistributeTranspiler"
,
"RoundRobin"
,
"DistributeTranspilerConfig"
"memory_optimize"
,
"release_memory"
,
"HashName"
,
"RoundRobin"
,
"DistributeTranspilerConfig"
,
]
]
python/paddle/fluid/transpiler/memory_optimization_transpiler.py
浏览文件 @
a6fbf7ec
...
@@ -14,10 +14,10 @@
...
@@ -14,10 +14,10 @@
from
__future__
import
print_function
from
__future__
import
print_function
from
collections
import
defaultdict
,
OrderedDict
,
Callable
from
collections
import
defaultdict
,
MutableSet
from
..
import
core
from
..
import
core
from
...
import
compat
as
cpt
from
...
import
compat
as
cpt
from
..framework
import
Program
,
default_main_program
,
Parameter
,
Variable
from
..framework
import
Program
,
default_main_program
,
Parameter
,
Variable
,
core
from
..backward
import
_rename_arg_
from
..backward
import
_rename_arg_
from
functools
import
reduce
from
functools
import
reduce
from
six.moves
import
range
from
six.moves
import
range
...
@@ -44,17 +44,82 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
...
@@ -44,17 +44,82 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
PRINT_LOG
=
False
PRINT_LOG
=
False
class
OrderedSet
(
MutableSet
):
def
__init__
(
self
,
iterable
=
None
):
self
.
end
=
end
=
[]
end
+=
[
None
,
end
,
end
]
# sentinel node for doubly linked list
self
.
map
=
{}
# key --> [key, prev, next]
if
iterable
is
not
None
:
self
|=
iterable
def
__len__
(
self
):
return
len
(
self
.
map
)
def
__contains__
(
self
,
key
):
return
key
in
self
.
map
def
add
(
self
,
key
):
if
key
not
in
self
.
map
:
end
=
self
.
end
curr
=
end
[
1
]
curr
[
2
]
=
end
[
1
]
=
self
.
map
[
key
]
=
[
key
,
curr
,
end
]
def
update
(
self
,
other
):
for
e
in
other
:
self
.
add
(
e
)
def
discard
(
self
,
key
):
if
key
in
self
.
map
:
key
,
prev
,
next
=
self
.
map
.
pop
(
key
)
prev
[
2
]
=
next
next
[
1
]
=
prev
def
remove
(
self
,
key
):
self
.
discard
(
key
)
def
__iter__
(
self
):
end
=
self
.
end
curr
=
end
[
2
]
while
curr
is
not
end
:
yield
curr
[
0
]
curr
=
curr
[
2
]
def
__reversed__
(
self
):
end
=
self
.
end
curr
=
end
[
1
]
while
curr
is
not
end
:
yield
curr
[
0
]
curr
=
curr
[
1
]
def
pop
(
self
,
last
=
True
):
if
not
self
:
raise
KeyError
(
'set is empty'
)
key
=
self
.
end
[
1
][
0
]
if
last
else
self
.
end
[
2
][
0
]
self
.
discard
(
key
)
return
key
def
__repr__
(
self
):
if
not
self
:
return
'%s()'
%
(
self
.
__class__
.
__name__
,
)
return
'%s(%r)'
%
(
self
.
__class__
.
__name__
,
list
(
self
))
def
__eq__
(
self
,
other
):
if
isinstance
(
other
,
OrderedSet
):
return
len
(
self
)
==
len
(
other
)
and
list
(
self
)
==
list
(
other
)
return
set
(
self
)
==
set
(
other
)
class
ControlFlowGraph
(
object
):
class
ControlFlowGraph
(
object
):
def
__init__
(
self
,
program
,
ops
,
forward_num
,
skip_opt
):
def
__init__
(
self
,
program
,
ops
,
forward_num
,
skip_opt
):
self
.
_program
=
program
self
.
_program
=
program
self
.
_ops
=
ops
self
.
_ops
=
ops
self
.
_forward_num
=
forward_num
self
.
_forward_num
=
forward_num
self
.
_successors
=
defaultdict
(
s
et
)
self
.
_successors
=
defaultdict
(
OrderedS
et
)
self
.
_presuccessors
=
defaultdict
(
s
et
)
self
.
_presuccessors
=
defaultdict
(
OrderedS
et
)
self
.
_uses
=
defaultdict
(
s
et
)
self
.
_uses
=
defaultdict
(
OrderedS
et
)
self
.
_defs
=
defaultdict
(
s
et
)
self
.
_defs
=
defaultdict
(
OrderedS
et
)
self
.
_live_in
=
defaultdict
(
s
et
)
self
.
_live_in
=
defaultdict
(
OrderedS
et
)
self
.
_live_out
=
defaultdict
(
s
et
)
self
.
_live_out
=
defaultdict
(
OrderedS
et
)
self
.
_skip_opt
=
skip_opt
self
.
_skip_opt
=
skip_opt
self
.
pool
=
[]
self
.
pool
=
[]
...
@@ -116,7 +181,7 @@ class ControlFlowGraph(object):
...
@@ -116,7 +181,7 @@ class ControlFlowGraph(object):
# NOTE: must sort the in_diff set for cases that get different cache var.
# NOTE: must sort the in_diff set for cases that get different cache var.
# FIXME(typhoonzero): maybe use a "sorted set" is better than this.
# FIXME(typhoonzero): maybe use a "sorted set" is better than this.
can_optimize
=
[
can_optimize
=
[
x
for
x
in
sorted
(
list
(
in_diff
))
x
for
x
in
in_diff
if
self
.
_check_var_validity
(
block_desc
,
x
,
is_forward
)
if
self
.
_check_var_validity
(
block_desc
,
x
,
is_forward
)
]
]
if
can_optimize
:
if
can_optimize
:
...
@@ -224,7 +289,7 @@ class ControlFlowGraph(object):
...
@@ -224,7 +289,7 @@ class ControlFlowGraph(object):
if
self
.
pool
:
if
self
.
pool
:
# NOTE: must sort the in_diff set for cases that get different cache var.
# NOTE: must sort the in_diff set for cases that get different cache var.
defs_can_optimize
=
[
defs_can_optimize
=
[
x
for
x
in
s
orted
(
list
(
self
.
_defs
[
i
]))
x
for
x
in
s
elf
.
_defs
[
i
]
if
self
.
_check_var_validity
(
block_desc
,
x
,
is_forward
)
if
self
.
_check_var_validity
(
block_desc
,
x
,
is_forward
)
]
]
out_pair
=
[
out_pair
=
[
...
@@ -381,7 +446,19 @@ def _get_cfgs(input_program):
...
@@ -381,7 +446,19 @@ def _get_cfgs(input_program):
return
cfgs
return
cfgs
def
memory_optimize
(
input_program
,
skip_opt_set
=
None
,
print_log
=
False
,
level
=
0
):
def
_is_opt_role_op
(
op
):
op_maker
=
core
.
op_proto_and_checker_maker
optimize_role
=
core
.
op_proto_and_checker_maker
.
OpRole
.
Optimize
if
op_maker
.
kOpRoleAttrName
()
in
op
.
attr_names
and
\
int
(
op
.
all_attrs
()[
op_maker
.
kOpRoleAttrName
()])
==
int
(
optimize_role
):
return
True
def
memory_optimize
(
input_program
,
skip_opt_set
=
None
,
print_log
=
False
,
level
=
0
,
skip_grads
=
False
):
"""Optimize memory by reusing var memory.
"""Optimize memory by reusing var memory.
Note: it doesn't not support subblock nested in subblock.
Note: it doesn't not support subblock nested in subblock.
...
@@ -398,6 +475,19 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
...
@@ -398,6 +475,19 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
raise
ValueError
(
"only support opt_level 0 or 1."
)
raise
ValueError
(
"only support opt_level 0 or 1."
)
global
PRINT_LOG
global
PRINT_LOG
PRINT_LOG
=
print_log
PRINT_LOG
=
print_log
if
skip_grads
:
grad_set
=
set
()
OP_ROLE_VAR
=
core
.
op_proto_and_checker_maker
.
kOpRoleVarAttrName
()
for
op
in
input_program
.
global_block
().
ops
:
if
_is_opt_role_op
(
op
):
if
op
.
attr
(
OP_ROLE_VAR
):
grad_name
=
op
.
attr
(
OP_ROLE_VAR
)[
1
]
grad_set
.
add
(
grad_name
)
if
not
skip_opt_set
:
skip_opt_set
=
grad_set
else
:
skip_opt_set
.
update
(
grad_set
)
cfgs
=
_get_cfgs
(
input_program
)
cfgs
=
_get_cfgs
(
input_program
)
for
cfg
in
cfgs
:
for
cfg
in
cfgs
:
cfg
.
memory_optimize
(
skip_opt_set
=
skip_opt_set
,
level
=
level
)
cfg
.
memory_optimize
(
skip_opt_set
=
skip_opt_set
,
level
=
level
)
...
...
python/setup.py.in
浏览文件 @
a6fbf7ec
...
@@ -106,6 +106,7 @@ packages=['paddle',
...
@@ -106,6 +106,7 @@ packages=['paddle',
'paddle.fluid.layers',
'paddle.fluid.layers',
'paddle.fluid.contrib',
'paddle.fluid.contrib',
'paddle.fluid.contrib.decoder',
'paddle.fluid.contrib.decoder',
'paddle.fluid.contrib.quantize',
'paddle.fluid.transpiler',
'paddle.fluid.transpiler',
'paddle.fluid.transpiler.details']
'paddle.fluid.transpiler.details']
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录