Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
5670e9ea
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
5670e9ea
编写于
11月 15, 2018
作者:
P
peizhilin
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'upstream/develop' into windows/build
上级
dc197967
9be99b14
变更
61
隐藏空白更改
内联
并排
Showing
61 changed file
with
1247 addition
and
493 deletion
+1247
-493
paddle/fluid/framework/ir/fc_fuse_pass.cc
paddle/fluid/framework/ir/fc_fuse_pass.cc
+1
-0
paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
+1
-0
paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
...e/fluid/inference/analysis/ir_passes/subgraph_detector.cc
+1
-1
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
...id/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+1
-1
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
...uid/inference/analysis/passes/ir_analysis_compose_pass.cc
+1
-1
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+1
-0
paddle/fluid/inference/api/analysis_predictor_tester.cc
paddle/fluid/inference/api/analysis_predictor_tester.cc
+1
-1
paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+1
-1
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+1
-1
paddle/fluid/inference/api/paddle_analysis_config.h
paddle/fluid/inference/api/paddle_analysis_config.h
+2
-0
paddle/fluid/inference/api/paddle_pass_builder.h
paddle/fluid/inference/api/paddle_pass_builder.h
+2
-2
paddle/fluid/inference/tensorrt/CMakeLists.txt
paddle/fluid/inference/tensorrt/CMakeLists.txt
+1
-0
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+6
-3
paddle/fluid/inference/tensorrt/convert/concat_op.cc
paddle/fluid/inference/tensorrt/convert/concat_op.cc
+1
-1
paddle/fluid/inference/tensorrt/convert/split_op.cc
paddle/fluid/inference/tensorrt/convert/split_op.cc
+75
-0
paddle/fluid/inference/tensorrt/convert/test_split_op.cc
paddle/fluid/inference/tensorrt/convert/test_split_op.cc
+53
-0
paddle/fluid/inference/tensorrt/engine.cc
paddle/fluid/inference/tensorrt/engine.cc
+6
-0
paddle/fluid/inference/tensorrt/engine.h
paddle/fluid/inference/tensorrt/engine.h
+5
-0
paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+1
-0
paddle/fluid/inference/tensorrt/plugin/serialize.h
paddle/fluid/inference/tensorrt/plugin/serialize.h
+111
-0
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+81
-0
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+74
-0
paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
+61
-0
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
+80
-0
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+2
-7
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+12
-14
paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+4
-2
paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+4
-2
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+4
-2
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+6
-4
paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+4
-2
paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
...le/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+4
-2
paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
...nference/tests/api/analyzer_text_classification_tester.cc
+6
-3
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+4
-5
paddle/fluid/inference/tests/api/config_printer.h
paddle/fluid/inference/tests/api/config_printer.h
+79
-0
paddle/fluid/inference/tests/api/tester_helper.h
paddle/fluid/inference/tests/api/tester_helper.h
+64
-23
paddle/fluid/inference/tests/api/trt_models_tester.cc
paddle/fluid/inference/tests/api/trt_models_tester.cc
+123
-122
paddle/fluid/operators/fc_op.cc
paddle/fluid/operators/fc_op.cc
+33
-12
paddle/fluid/operators/hash_op.cc
paddle/fluid/operators/hash_op.cc
+1
-1
paddle/fluid/operators/math/jit_code.cc
paddle/fluid/operators/math/jit_code.cc
+33
-0
paddle/fluid/operators/math/jit_code.h
paddle/fluid/operators/math/jit_code.h
+23
-0
paddle/fluid/operators/math/jit_kernel.h
paddle/fluid/operators/math/jit_kernel.h
+7
-6
paddle/fluid/operators/math/jit_kernel_blas.cc
paddle/fluid/operators/math/jit_kernel_blas.cc
+42
-99
paddle/fluid/operators/math/jit_kernel_exp.cc
paddle/fluid/operators/math/jit_kernel_exp.cc
+108
-108
paddle/fluid/operators/math/jit_kernel_rnn.cc
paddle/fluid/operators/math/jit_kernel_rnn.cc
+19
-19
paddle/fluid/operators/math/jit_kernel_test.cc
paddle/fluid/operators/math/jit_kernel_test.cc
+13
-13
paddle/fluid/operators/math/selected_rows_functor.cc
paddle/fluid/operators/math/selected_rows_functor.cc
+1
-1
paddle/fluid/operators/math/sequence_pooling_test.cc
paddle/fluid/operators/math/sequence_pooling_test.cc
+4
-4
paddle/fluid/operators/math/softmax.cc
paddle/fluid/operators/math/softmax.cc
+4
-2
paddle/fluid/operators/math/softmax.cu
paddle/fluid/operators/math/softmax.cu
+8
-3
paddle/fluid/operators/math/softmax.h
paddle/fluid/operators/math/softmax.h
+1
-1
paddle/fluid/operators/math/softmax_impl.h
paddle/fluid/operators/math/softmax_impl.h
+37
-4
paddle/fluid/operators/merge_ids_op.h
paddle/fluid/operators/merge_ids_op.h
+4
-4
paddle/fluid/operators/ref_by_trainer_id_op.h
paddle/fluid/operators/ref_by_trainer_id_op.h
+1
-1
paddle/fluid/operators/softmax_op.h
paddle/fluid/operators/softmax_op.h
+6
-1
paddle/fluid/operators/softmax_with_cross_entropy_op.h
paddle/fluid/operators/softmax_with_cross_entropy_op.h
+2
-2
paddle/fluid/operators/split_ids_op.h
paddle/fluid/operators/split_ids_op.h
+1
-1
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+5
-5
python/paddle/fluid/tests/unittests/dist_save_load.py
python/paddle/fluid/tests/unittests/dist_save_load.py
+5
-1
python/paddle/fluid/tests/unittests/test_dist_save_load.py
python/paddle/fluid/tests/unittests/test_dist_save_load.py
+4
-4
python/requirements.txt
python/requirements.txt
+1
-1
未找到文件。
paddle/fluid/framework/ir/fc_fuse_pass.cc
浏览文件 @
5670e9ea
...
@@ -57,6 +57,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
...
@@ -57,6 +57,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
desc
.
SetInput
(
"W"
,
std
::
vector
<
std
::
string
>
({
fc_Y_in
}));
desc
.
SetInput
(
"W"
,
std
::
vector
<
std
::
string
>
({
fc_Y_in
}));
desc
.
SetInput
(
"Bias"
,
std
::
vector
<
std
::
string
>
({
fc_bias_in
}));
desc
.
SetInput
(
"Bias"
,
std
::
vector
<
std
::
string
>
({
fc_bias_in
}));
desc
.
SetOutput
(
"Out"
,
std
::
vector
<
std
::
string
>
({
fc_out_out
}));
desc
.
SetOutput
(
"Out"
,
std
::
vector
<
std
::
string
>
({
fc_out_out
}));
desc
.
SetAttr
(
"in_num_col_dims"
,
mul
->
Op
()
->
GetAttr
(
"x_num_col_dims"
));
desc
.
SetType
(
"fc"
);
desc
.
SetType
(
"fc"
);
auto
fc_node
=
g
->
CreateOpNode
(
&
desc
);
// OpDesc will be copied.
auto
fc_node
=
g
->
CreateOpNode
(
&
desc
);
// OpDesc will be copied.
GraphSafeRemoveNodes
(
graph
.
get
(),
{
mul
,
elementwise_add
,
mul_out
});
GraphSafeRemoveNodes
(
graph
.
get
(),
{
mul
,
elementwise_add
,
mul_out
});
...
...
paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
浏览文件 @
5670e9ea
...
@@ -29,6 +29,7 @@ void SetOp(ProgramDesc* prog, const std::string& type,
...
@@ -29,6 +29,7 @@ void SetOp(ProgramDesc* prog, const std::string& type,
if
(
type
==
"mul"
)
{
if
(
type
==
"mul"
)
{
op
->
SetInput
(
"X"
,
{
inputs
[
0
]});
op
->
SetInput
(
"X"
,
{
inputs
[
0
]});
op
->
SetInput
(
"Y"
,
{
inputs
[
1
]});
op
->
SetInput
(
"Y"
,
{
inputs
[
1
]});
op
->
SetAttr
(
"x_num_col_dims"
,
{
1
});
}
else
if
(
type
==
"elementwise_add"
)
{
}
else
if
(
type
==
"elementwise_add"
)
{
op
->
SetInput
(
"X"
,
inputs
);
op
->
SetInput
(
"X"
,
inputs
);
}
}
...
...
paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
浏览文件 @
5670e9ea
...
@@ -412,7 +412,7 @@ void DetachDeletedNodes(framework::ir::Graph *graph) {
...
@@ -412,7 +412,7 @@ void DetachDeletedNodes(framework::ir::Graph *graph) {
void
SubGraphFuser
::
ReplaceNodesWithSubGraphs
()
{
void
SubGraphFuser
::
ReplaceNodesWithSubGraphs
()
{
auto
subgraphs
=
SubgraphDetector
(
graph_
,
node_inside_subgraph_teller_
)();
auto
subgraphs
=
SubgraphDetector
(
graph_
,
node_inside_subgraph_teller_
)();
for
(
auto
&
subgraph
:
subgraphs
)
{
for
(
auto
&
subgraph
:
subgraphs
)
{
if
(
subgraph
.
size
()
<=
min_subgraph_size_
)
continue
;
if
(
subgraph
.
size
()
<=
(
size_t
)
min_subgraph_size_
)
continue
;
LOG
(
INFO
)
<<
"detect a subgraph size "
<<
subgraph
.
size
();
LOG
(
INFO
)
<<
"detect a subgraph size "
<<
subgraph
.
size
();
std
::
unordered_set
<
Node
*>
subgraph_uniq
(
subgraph
.
begin
(),
subgraph
.
end
());
std
::
unordered_set
<
Node
*>
subgraph_uniq
(
subgraph
.
begin
(),
subgraph
.
end
());
// replace this sub-graph with the first node. Two steps: 1. Create a Block
// replace this sub-graph with the first node. Two steps: 1. Create a Block
...
...
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
浏览文件 @
5670e9ea
...
@@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
...
@@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
// it is either an OP's input or an OP's output.
// it is either an OP's input or an OP's output.
auto
&
subgraph_nodes
=
*
Agent
(
node
).
subgraph
();
auto
&
subgraph_nodes
=
*
Agent
(
node
).
subgraph
();
for
(
in
t
index
=
0
;
index
<
block_desc
.
OpSize
();
index
++
)
{
for
(
size_
t
index
=
0
;
index
<
block_desc
.
OpSize
();
index
++
)
{
framework
::
proto
::
OpDesc
*
op
=
block_desc
.
Op
(
index
)
->
Proto
();
framework
::
proto
::
OpDesc
*
op
=
block_desc
.
Op
(
index
)
->
Proto
();
auto
correspond_node
=
subgraph_nodes
[
index
];
auto
correspond_node
=
subgraph_nodes
[
index
];
PADDLE_ENFORCE_EQ
(
correspond_node
->
Name
(),
op
->
type
());
PADDLE_ENFORCE_EQ
(
correspond_node
->
Name
(),
op
->
type
());
...
...
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
浏览文件 @
5670e9ea
...
@@ -45,7 +45,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
...
@@ -45,7 +45,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
std
::
unordered_set
<
std
::
string
>
teller_set
(
std
::
unordered_set
<
std
::
string
>
teller_set
(
{
"mul"
,
"conv2d"
,
"pool2d"
,
"relu"
,
"softmax"
,
"sigmoid"
,
{
"mul"
,
"conv2d"
,
"pool2d"
,
"relu"
,
"softmax"
,
"sigmoid"
,
"depthwise_conv2d"
,
"batch_norm"
,
"concat"
,
"tanh"
,
"pad"
,
"depthwise_conv2d"
,
"batch_norm"
,
"concat"
,
"tanh"
,
"pad"
,
"elementwise_add"
,
"dropout"
});
"elementwise_add"
,
"dropout"
,
"split"
});
if
(
!
node
->
IsOp
())
return
false
;
if
(
!
node
->
IsOp
())
return
false
;
if
(
teller_set
.
count
(
node
->
Op
()
->
Type
()))
{
if
(
teller_set
.
count
(
node
->
Op
()
->
Type
()))
{
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
5670e9ea
...
@@ -548,4 +548,5 @@ USE_TRT_CONVERTER(batch_norm);
...
@@ -548,4 +548,5 @@ USE_TRT_CONVERTER(batch_norm);
USE_TRT_CONVERTER
(
concat
);
USE_TRT_CONVERTER
(
concat
);
USE_TRT_CONVERTER
(
dropout
);
USE_TRT_CONVERTER
(
dropout
);
USE_TRT_CONVERTER
(
pad
);
USE_TRT_CONVERTER
(
pad
);
USE_TRT_CONVERTER
(
split
);
#endif
#endif
paddle/fluid/inference/api/analysis_predictor_tester.cc
浏览文件 @
5670e9ea
...
@@ -15,7 +15,7 @@
...
@@ -15,7 +15,7 @@
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include <glog/logging.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <gtest/gtest.h>
#include <thread>
#include <thread>
// NOLINT
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
...
...
paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
浏览文件 @
5670e9ea
...
@@ -23,7 +23,7 @@ limitations under the License. */
...
@@ -23,7 +23,7 @@ limitations under the License. */
#include <memory>
#include <memory>
#include <thread> //NOLINT
#include <thread> //NOLINT
#include "utils.h"
#include "utils.h"
// NOLINT
DEFINE_string
(
dirname
,
""
,
"Directory of the inference model."
);
DEFINE_string
(
dirname
,
""
,
"Directory of the inference model."
);
DEFINE_bool
(
use_gpu
,
false
,
"Whether use gpu."
);
DEFINE_bool
(
use_gpu
,
false
,
"Whether use gpu."
);
...
...
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
浏览文件 @
5670e9ea
...
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
...
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
distributed under the License is distributed on an "AS IS" BASIS,
...
...
paddle/fluid/inference/api/paddle_analysis_config.h
浏览文件 @
5670e9ea
...
@@ -49,6 +49,8 @@ struct AnalysisConfig : public NativeConfig {
...
@@ -49,6 +49,8 @@ struct AnalysisConfig : public NativeConfig {
void
EnableTensorRtEngine
(
int
workspace_size
=
1
<<
20
,
void
EnableTensorRtEngine
(
int
workspace_size
=
1
<<
20
,
int
max_batch_size
=
1
);
int
max_batch_size
=
1
);
bool
use_tensorrt
()
const
{
return
use_tensorrt_
;
}
// NOTE this is just for internal development, please not use it.
// NOTE this is just for internal development, please not use it.
// NOT stable yet.
// NOT stable yet.
void
EnableMKLDNN
();
void
EnableMKLDNN
();
...
...
paddle/fluid/inference/api/paddle_pass_builder.h
浏览文件 @
5670e9ea
...
@@ -91,7 +91,7 @@ class CpuPassStrategy : public PassStrategy {
...
@@ -91,7 +91,7 @@ class CpuPassStrategy : public PassStrategy {
virtual
~
CpuPassStrategy
()
=
default
;
virtual
~
CpuPassStrategy
()
=
default
;
v
irtual
v
oid
EnableMKLDNN
()
override
{
void
EnableMKLDNN
()
override
{
// TODO(Superjomn) Consider the way to mix CPU with GPU.
// TODO(Superjomn) Consider the way to mix CPU with GPU.
#ifdef PADDLE_WITH_MKLDNN
#ifdef PADDLE_WITH_MKLDNN
passes_
.
insert
(
passes_
.
begin
(),
"mkldnn_placement_pass"
);
passes_
.
insert
(
passes_
.
begin
(),
"mkldnn_placement_pass"
);
...
@@ -123,7 +123,7 @@ class GpuPassStrategy : public PassStrategy {
...
@@ -123,7 +123,7 @@ class GpuPassStrategy : public PassStrategy {
GpuPassStrategy
(
const
GpuPassStrategy
&
other
)
GpuPassStrategy
(
const
GpuPassStrategy
&
other
)
:
PassStrategy
(
other
.
AllPasses
())
{}
:
PassStrategy
(
other
.
AllPasses
())
{}
v
irtual
v
oid
EnableMKLDNN
()
override
;
void
EnableMKLDNN
()
override
;
virtual
~
GpuPassStrategy
()
=
default
;
virtual
~
GpuPassStrategy
()
=
default
;
};
};
...
...
paddle/fluid/inference/tensorrt/CMakeLists.txt
浏览文件 @
5670e9ea
nv_library
(
tensorrt_engine SRCS engine.cc DEPS framework_proto device_context
)
nv_library
(
tensorrt_engine SRCS engine.cc DEPS framework_proto device_context
)
nv_test
(
test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader
)
nv_test
(
test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader
)
nv_test
(
test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine
)
nv_test
(
test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine
)
add_subdirectory
(
plugin
)
add_subdirectory
(
convert
)
add_subdirectory
(
convert
)
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
浏览文件 @
5670e9ea
# Add TRT tests
# Add TRT tests
nv_library
(
tensorrt_converter
nv_library
(
tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry
)
pad_op.cc split_op.cc
DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry
)
nv_test
(
test_op_converter SRCS test_op_converter.cc DEPS
nv_test
(
test_op_converter SRCS test_op_converter.cc DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine tensorrt_converter
)
${
FLUID_CORE_MODULES
}
tensorrt_engine tensorrt_converter
)
...
@@ -28,6 +29,8 @@ nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
...
@@ -28,6 +29,8 @@ nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine concat_op SERIAL
)
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine concat_op SERIAL
)
nv_test
(
test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
nv_test
(
test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine dropout_op SERIAL
)
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine dropout_op SERIAL
)
nv_test
(
test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
nv_test
(
test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine pad_op SERIAL
)
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine pad_op SERIAL
)
nv_test
(
test_trt_split_op SRCS test_split_op.cc split_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine tensorrt_plugin
split_op concat_op SERIAL
)
paddle/fluid/inference/tensorrt/convert/concat_op.cc
浏览文件 @
5670e9ea
...
@@ -19,7 +19,7 @@ namespace inference {
...
@@ -19,7 +19,7 @@ namespace inference {
namespace
tensorrt
{
namespace
tensorrt
{
/*
/*
*
MulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights.
*
ConcatOp
*/
*/
class
ConcatOpConverter
:
public
OpConverter
{
class
ConcatOpConverter
:
public
OpConverter
{
public:
public:
...
...
paddle/fluid/inference/tensorrt/convert/split_op.cc
0 → 100644
浏览文件 @
5670e9ea
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
/*
* SplitOp.
*/
// Converts a fluid `split` operator into a TensorRT plugin layer backed by
// SplitPlugin, and registers every output tensor of that layer with the
// engine under the fluid output variable name.
class SplitOpConverter : public OpConverter {
 public:
  // op:        serialized description of the fluid split operator.
  // scope:     variable scope (not read by this converter).
  // test_mode: when true, each output is also declared as an engine output.
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    VLOG(40) << "convert a fluid split op to tensorrt split layer";

    framework::OpDesc op_desc(op, nullptr);

    // Look up the single input tensor and the operator's arity.
    auto* x_tensor = engine_->GetITensor(op_desc.Input("X")[0]);
    auto x_dims = x_tensor->getDimensions();
    int num_inputs = op_desc.Input("X").size();
    const auto& out_names = op_desc.Output("Out");
    size_t num_outputs = out_names.size();

    // Read and validate the attributes.
    PADDLE_ENFORCE(num_inputs == 1);
    int axis = boost::get<int>(op_desc.GetAttr("axis"));
    std::vector<int> sections =
        boost::get<std::vector<int>>(op_desc.GetAttr("sections"));
    PADDLE_ENFORCE(axis != 0);
    // Map the fluid axis onto the dimensions reported by the TensorRT tensor:
    // negative axes count from the end, positive axes are shifted down by one.
    axis = (axis < 0) ? axis + x_dims.nbDims : axis - 1;

    PADDLE_ENFORCE(sections.size() == num_outputs);

    // The raw pointer is handed to the engine through AddPlugin.
    SplitPlugin* split_plugin = new SplitPlugin(axis, sections);
    nvinfer1::IPluginLayer* split_layer =
        engine_->AddPlugin(&x_tensor, num_inputs, split_plugin);

    // Name each output after its fluid variable and build the layer name
    // from the concatenated output names.
    std::string name = "split (Output: ";
    for (size_t idx = 0; idx < num_outputs; ++idx) {
      const std::string& out_name = out_names[idx];
      auto* out_tensor = split_layer->getOutput(idx);
      out_tensor->setName(out_name.c_str());
      engine_->SetITensor(out_name, split_layer->getOutput(idx));
      name += out_name;
      if (test_mode) {
        engine_->DeclareOutput(out_name);
      }
    }
    name += ")";
    split_layer->setName(name.c_str());
  }
};
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
REGISTER_TRT_OP_CONVERTER
(
split
,
SplitOpConverter
);
paddle/fluid/inference/tensorrt/convert/test_split_op.cc
0 → 100644
浏览文件 @
5670e9ea
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
// Runs the split converter on a 3x2x2 input that is split along axis 1 into
// a 2x2x2 and a 1x2x2 output, using the TRTConvertValidation helper.
TEST(split_op, test) {
  std::unordered_set<std::string> param_names({""});
  framework::Scope scope;
  TRTConvertValidation validator(10, param_names, scope, 1000);

  // Declare the input and the two expected outputs.
  validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2));
  validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2));
  validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2));

  // Describe the operator under test.
  framework::OpDesc split_desc;
  split_desc.SetType("split");
  split_desc.SetInput("X", {"split_input"});
  split_desc.SetOutput("Out", {"split_out1", "split_out2"});

  // Attributes: explicit section lengths along axis 1; "num" is set to 0.
  int split_axis = 1;
  int split_num = 0;
  std::vector<int> sections = {2, 1};
  split_desc.SetAttr("axis", split_axis);
  split_desc.SetAttr("num", split_num);
  split_desc.SetAttr("sections", sections);

  validator.SetOp(*split_desc.Proto());

  validator.Execute(1);
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
USE_OP
(
split
);
paddle/fluid/inference/tensorrt/engine.cc
浏览文件 @
5670e9ea
...
@@ -255,6 +255,12 @@ void TensorRTEngine::freshDeviceId() {
...
@@ -255,6 +255,12 @@ void TensorRTEngine::freshDeviceId() {
cudaSetDevice
(
device_
);
cudaSetDevice
(
device_
);
}
}
// Adds `plugin` to the network as a plugin layer fed by `inputs`.
//
// inputs:   array of `nbInputs` input tensors for the layer.
// nbInputs: number of entries in `inputs`.
// plugin:   plugin instance; it is stored in owned_plugin_, so the engine
//           holds it from this call on.
// Returns the layer produced by the network's addPluginExt.
nvinfer1::IPluginLayer* TensorRTEngine::AddPlugin(
    nvinfer1::ITensor* const* inputs, int nbInputs, PluginTensorRT* plugin) {
  // Record ownership first, then register the layer with the network.
  owned_plugin_.emplace_back(plugin);
  auto* network = infer_network_.get();
  return network->addPluginExt(inputs, nbInputs, *plugin);
}
}
// namespace tensorrt
}
// namespace tensorrt
}
// namespace inference
}
// namespace inference
}
// namespace paddle
}
// namespace paddle
paddle/fluid/inference/tensorrt/engine.h
浏览文件 @
5670e9ea
...
@@ -22,6 +22,7 @@ limitations under the License. */
...
@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/engine.h"
#include "paddle/fluid/inference/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/inference/utils/singleton.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -125,6 +126,8 @@ class TensorRTEngine : public EngineBase {
...
@@ -125,6 +126,8 @@ class TensorRTEngine : public EngineBase {
void
SetRuntimeBatch
(
size_t
batch_size
);
void
SetRuntimeBatch
(
size_t
batch_size
);
int
GetRuntimeBatch
();
int
GetRuntimeBatch
();
int
GetDevice
()
{
return
device_
;
}
int
GetDevice
()
{
return
device_
;
}
nvinfer1
::
IPluginLayer
*
AddPlugin
(
nvinfer1
::
ITensor
*
const
*
inputs
,
int
nbInputs
,
PluginTensorRT
*
);
// A pointer to CPU memory is needed of the TRT weight.
// A pointer to CPU memory is needed of the TRT weight.
// Before TRT runs, fluid loads weight into GPU storage.
// Before TRT runs, fluid loads weight into GPU storage.
...
@@ -164,8 +167,10 @@ class TensorRTEngine : public EngineBase {
...
@@ -164,8 +167,10 @@ class TensorRTEngine : public EngineBase {
std
::
unordered_map
<
std
::
string
/*name*/
,
size_t
/*max size*/
>
buffer_sizes_
;
std
::
unordered_map
<
std
::
string
/*name*/
,
size_t
/*max size*/
>
buffer_sizes_
;
std
::
unordered_map
<
std
::
string
/*name*/
,
nvinfer1
::
ITensor
*
/*ITensor*/
>
std
::
unordered_map
<
std
::
string
/*name*/
,
nvinfer1
::
ITensor
*
/*ITensor*/
>
itensor_map_
;
itensor_map_
;
// The specific GPU id that the TensorRTEngine bounded to.
// The specific GPU id that the TensorRTEngine bounded to.
int
device_
;
int
device_
;
std
::
vector
<
std
::
unique_ptr
<
PluginTensorRT
>>
owned_plugin_
;
// TensorRT related internal members
// TensorRT related internal members
template
<
typename
T
>
template
<
typename
T
>
...
...
paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
0 → 100644
浏览文件 @
5670e9ea
nv_library
(
tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu DEPS enforce
)
paddle/fluid/inference/tensorrt/plugin/serialize.h
0 → 100644
浏览文件 @
5670e9ea
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cassert>
#include <cstring>
#include <type_traits>
#include <vector>
// Flat binary (de)serialization helpers.
//
// SerializeValue writes a value at *buffer and advances *buffer past it.
// DeserializeValue reads a value from *buffer, advances *buffer past it and
// decreases *buffer_size by the number of bytes consumed.
// SerializedSize reports how many bytes SerializeValue will write.
//
// Supported types: arithmetic / enum / POD values, `const char*` C strings
// and std::vector of arithmetic / enum / POD elements.

// Forward declarations: the vector serializer below uses these to write and
// read its element count.
template <typename T>
inline void SerializeValue(void** buffer, T const& value);

template <typename T>
inline void DeserializeValue(void const** buffer, size_t* buffer_size,
                             T* value);

namespace {

// Moves a write cursor forward by `nbyte` bytes.
inline void AdvanceWriteCursor(void** buffer, size_t nbyte) {
  *buffer = static_cast<char*>(*buffer) + nbyte;
}

// Moves a read cursor forward by `nbyte` bytes and shrinks the remaining
// byte count accordingly.
inline void AdvanceReadCursor(void const** buffer, size_t* buffer_size,
                              size_t nbyte) {
  *buffer = static_cast<char const*>(*buffer) + nbyte;
  *buffer_size -= nbyte;
}

// Primary template is intentionally empty: using an unsupported type fails
// to compile because the static members do not exist.
template <typename T, class Enable = void>
struct Serializer {};

// Fixed-size values are copied byte for byte.
template <typename T>
struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
                                             std::is_enum<T>::value ||
                                             std::is_pod<T>::value>::type> {
  static size_t SerializedSize(T const& value) { return sizeof(T); }

  static void Serialize(void** buffer, T const& value) {
    std::memcpy(*buffer, &value, sizeof(T));
    AdvanceWriteCursor(buffer, sizeof(T));
  }

  static void Deserialize(void const** buffer, size_t* buffer_size,
                          T* value) {
    assert(*buffer_size >= sizeof(T));
    std::memcpy(value, *buffer, sizeof(T));
    AdvanceReadCursor(buffer, buffer_size, sizeof(T));
  }
};

// C strings are stored including their terminating NUL.
template <>
struct Serializer<const char*> {
  static size_t SerializedSize(const char* value) {
    return strlen(value) + 1;
  }

  static void Serialize(void** buffer, const char* value) {
    std::strcpy(static_cast<char*>(*buffer), value);
    AdvanceWriteCursor(buffer, strlen(value) + 1);
  }

  // The returned pointer aliases the input buffer; no copy is made.
  static void Deserialize(void const** buffer, size_t* buffer_size,
                          const char** value) {
    *value = static_cast<char const*>(*buffer);
    // strnlen never reads past the remaining bytes; the assert fires when
    // no terminator was found inside them.
    size_t data_size = strnlen(*value, *buffer_size) + 1;
    assert(*buffer_size >= data_size);
    AdvanceReadCursor(buffer, buffer_size, data_size);
  }
};

// Vectors are stored as a size_t element count followed by the raw elements.
template <typename T>
struct Serializer<std::vector<T>,
                  typename std::enable_if<std::is_arithmetic<T>::value ||
                                          std::is_enum<T>::value ||
                                          std::is_pod<T>::value>::type> {
  static size_t SerializedSize(std::vector<T> const& value) {
    return sizeof(value.size()) + value.size() * sizeof(T);
  }

  static void Serialize(void** buffer, std::vector<T> const& value) {
    SerializeValue(buffer, value.size());
    const size_t nbyte = value.size() * sizeof(T);
    // data() may be null for an empty vector, and memcpy must not be given a
    // null pointer even for a zero-byte copy.
    if (nbyte != 0) {
      std::memcpy(*buffer, value.data(), nbyte);
    }
    AdvanceWriteCursor(buffer, nbyte);
  }

  static void Deserialize(void const** buffer, size_t* buffer_size,
                          std::vector<T>* value) {
    size_t count = 0;
    DeserializeValue(buffer, buffer_size, &count);
    // Validate the count against the remaining bytes BEFORE resizing, so a
    // corrupt count cannot overflow the byte computation or request a huge
    // allocation.
    assert(count <= *buffer_size / sizeof(T));
    value->resize(count);
    const size_t nbyte = count * sizeof(T);
    if (nbyte != 0) {
      std::memcpy(value->data(), *buffer, nbyte);
    }
    AdvanceReadCursor(buffer, buffer_size, nbyte);
  }
};

}  // namespace

// Returns the number of bytes SerializeValue(buffer, value) will write.
template <typename T>
inline size_t SerializedSize(T const& value) {
  return Serializer<T>::SerializedSize(value);
}

// Writes `value` at *buffer and advances *buffer past the written bytes.
// The caller must provide at least SerializedSize(value) writable bytes.
template <typename T>
inline void SerializeValue(void** buffer, T const& value) {
  return Serializer<T>::Serialize(buffer, value);
}

// Reads a value from *buffer into *value, advances *buffer and subtracts the
// consumed byte count from *buffer_size.
template <typename T>
inline void DeserializeValue(void const** buffer, size_t* buffer_size,
                             T* value) {
  return Serializer<T>::Deserialize(buffer, buffer_size, value);
}
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
0 → 100644
浏览文件 @
5670e9ea
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <cassert>
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
// Returns the dimensions of output `index`: a copy of the single input's
// dimensions with the extent along axis_ replaced by that output's length.
//
// index:     output slot, must be below getNbOutputs().
// inputDims: dimensions of the inputs; only element 0 is read.
// nbInputs:  number of inputs, expected to be exactly 1.
nvinfer1::Dims SplitPlugin::getOutputDimensions(int index,
                                                const nvinfer1::Dims* inputDims,
                                                int nbInputs) {
  assert(nbInputs == 1);
  assert(index < this->getNbOutputs());
  // Start from the input shape and overwrite only the split axis.
  nvinfer1::Dims result = *inputDims;
  result.d[axis_] = output_length_.at(index);
  return result;
}
int
SplitPlugin
::
initialize
()
{
std
::
vector
<
int
>
segment_offsets
(
1
,
0
);
for
(
int
i
=
0
;
i
<
this
->
getNbOutputs
();
++
i
)
{
segment_offsets
.
push_back
(
segment_offsets
.
back
()
+
output_length_
[
i
]);
}
segment_offsets_
=
segment_offsets
;
nvinfer1
::
Dims
dims
=
this
->
getInputDims
(
0
);
nx_
=
1
;
for
(
int
i
=
dims
.
nbDims
-
1
;
i
>
axis_
;
--
i
)
{
nx_
*=
dims
.
d
[
i
];
}
ny_
=
dims
.
d
[
axis_
];
nz_
=
1
;
for
(
int
i
=
axis_
-
1
;
i
>=
0
;
--
i
)
{
nz_
*=
dims
.
d
[
i
];
}
return
0
;
}
// Copies each split segment of the input into its own output buffer.
//
// The input is read as `batchSize * nz_` consecutive slices of `ny_ * nx_`
// floats (nz_/ny_/nx_ are computed by initialize()). For output i, the
// elements [segment_offsets_[i], segment_offsets_[i + 1]) * nx_ of every
// slice are copied, slice after slice, into outputs[i].
//
// batchSize  number of samples in this call.
// inputs     inputs[0] is the source buffer, interpreted as float.
// outputs    one destination buffer per output, interpreted as float.
// workspace  unused.
// stream     CUDA stream the asynchronous copies are issued on.
//
// Returns 0 on success and 1 if the CUDA runtime reports an error.
int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
                         void** outputs, void* workspace,
                         cudaStream_t stream) {
  float const* idata = reinterpret_cast<float const*>(inputs[0]);
  float** odatas = reinterpret_cast<float**>(outputs);

  // All offsets below are in float elements, applied to typed pointers.
  // The previous code scaled the destination offset by sizeof(float) while
  // adding it to a float*, and did byte arithmetic on a void* source.
  const size_t slice_elems = static_cast<size_t>(ny_) * nx_;
  const int num_slices = batchSize * nz_;
  const int num_outputs = this->getNbOutputs();

  for (int i = 0; i < num_outputs; ++i) {
    const size_t seg_begin = static_cast<size_t>(segment_offsets_[i]) * nx_;
    const size_t seg_elems =
        static_cast<size_t>(segment_offsets_[i + 1] - segment_offsets_[i]) *
        nx_;
    // One contiguous chunk per (sample, outer-index) slice. Iterating over
    // nz_ as well keeps the copy correct when the split axis is not the
    // first non-batch dimension.
    for (int s = 0; s < num_slices; ++s) {
      const size_t slice = static_cast<size_t>(s);
      cudaMemcpyAsync(odatas[i] + slice * seg_elems,
                      idata + slice * slice_elems + seg_begin,
                      seg_elems * sizeof(float), cudaMemcpyDeviceToDevice,
                      stream);
    }
  }

  return cudaGetLastError() != cudaSuccess;
}
}
// tensorrt
}
// inference
}
// paddle
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
0 → 100644
浏览文件 @
5670e9ea
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
// TensorRT plugin that splits its single input along one axis into
// `output_length_.size()` outputs; output i has extent `output_length_[i]`
// along `axis_`.
class SplitPlugin : public PluginTensorRT {
  // Axis (index into nvinfer1::Dims::d) along which the input is split.
  int axis_;
  // Extent of each output along axis_.
  std::vector<int> output_length_;
  // Sizes derived from the input shape in initialize(); not serialized.
  int nx_, ny_, nz_;
  // Prefix sums of output_length_, filled in by initialize().
  std::vector<int> segment_offsets_;

 protected:
  // Number of bytes serialize() writes: axis_, output_length_ and the base
  // class state.
  virtual size_t getSerializationSize() override {
    return SerializedSize(axis_) + SerializedSize(output_length_) +
           getBaseSerializationSize();
  }

  // TRT will call this func when we need to serialize the configuration of
  // tensorrt.
  // It should not be called by users.
  // Layout: base class state, then axis_, then output_length_. This is the
  // order the deserializing constructor reads them back in.
  virtual void serialize(void* buffer) override {
    serializeBase(buffer);
    SerializeValue(&buffer, axis_);
    SerializeValue(&buffer, output_length_);
  }

 public:
  // axis            axis to split along; must be a valid index into
  //                 nvinfer1::Dims::d.
  // output_lengths  extent of each output along `axis`.
  SplitPlugin(int axis, std::vector<int> const& output_lengths)
      : axis_(axis), output_length_(output_lengths) {
    // Dims::d has MAX_DIMS entries, so the largest valid index is
    // MAX_DIMS - 1 (the previous `<=` check let MAX_DIMS through).
    assert(axis >= 0 && axis < nvinfer1::Dims::MAX_DIMS);
  }

  // It was used for tensorrt deserialization.
  // It should not be called by users.
  SplitPlugin(void const* serialData, size_t serialLength) {
    deserializeBase(serialData, serialLength);
    DeserializeValue(&serialData, &serialLength, &axis_);
    DeserializeValue(&serialData, &serialLength, &output_length_);
  }

  // Returns a new plugin built from axis_ and output_length_ only; the caller
  // receives a raw pointer to a heap-allocated object.
  SplitPlugin* clone() const override {
    return new SplitPlugin(axis_, output_length_);
  }

  virtual const char* getPluginType() const override { return "split"; }

  // One output per configured split length.
  virtual int getNbOutputs() const override { return output_length_.size(); }

  virtual nvinfer1::Dims getOutputDimensions(int index,
                                             const nvinfer1::Dims* inputs,
                                             int nbInputDims) override;

  virtual int initialize() override;

  virtual int enqueue(int batchSize, const void* const* inputs, void** outputs,
                      void* workspace, cudaStream_t stream) override;
};
}
// tensorrt
}
// inference
}
// paddle
paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
0 → 100644
浏览文件 @
5670e9ea
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
// Writes the base-class state into `buffer` in a fixed order: input_dims_,
// max_batch_size_, data_type_, data_format_. `buffer` is taken by reference
// and handed to SerializeValue by address for every field.
// deserializeBase() reads the fields back in the same order.
void PluginTensorRT::serializeBase(void*& buffer) {
  void** cursor = &buffer;
  SerializeValue(cursor, input_dims_);
  SerializeValue(cursor, max_batch_size_);
  SerializeValue(cursor, data_type_);
  SerializeValue(cursor, data_format_);
}
// Restores the base-class state from a serialized buffer, reading the fields
// in the order serializeBase() wrote them: input_dims_, max_batch_size_,
// data_type_, data_format_. Both arguments are taken by reference and passed
// on to DeserializeValue by address.
void PluginTensorRT::deserializeBase(void const*& serialData,
                                     size_t& serialLength) {
  void const** cursor = &serialData;
  size_t* remaining = &serialLength;
  DeserializeValue(cursor, remaining, &input_dims_);
  DeserializeValue(cursor, remaining, &max_batch_size_);
  DeserializeValue(cursor, remaining, &data_type_);
  DeserializeValue(cursor, remaining, &data_format_);
}
// Returns the sum of SerializedSize() over the four fields handled by
// serializeBase(): input_dims_, max_batch_size_, data_type_, data_format_.
size_t PluginTensorRT::getBaseSerializationSize() {
  size_t total = SerializedSize(input_dims_);
  total += SerializedSize(max_batch_size_);
  total += SerializedSize(data_type_);
  total += SerializedSize(data_format_);
  return total;
}
// Reports whether the plugin can run with the given data type and layout.
// Only the combination kFLOAT + kNCHW is accepted.
bool PluginTensorRT::supportsFormat(nvinfer1::DataType type,
                                    nvinfer1::PluginFormat format) const {
  if (type != nvinfer1::DataType::kFLOAT) {
    return false;
  }
  return format == nvinfer1::PluginFormat::kNCHW;
}
// Records the configuration passed in: the data type, the data format, a copy
// of the `nbInputs` input shapes, and the maximum batch size. The output
// shapes (`outputDims`, `nbOutputs`) are not stored.
void PluginTensorRT::configureWithFormat(const nvinfer1::Dims* inputDims,
                                         int nbInputs,
                                         const nvinfer1::Dims* outputDims,
                                         int nbOutputs,
                                         nvinfer1::DataType type,
                                         nvinfer1::PluginFormat format,
                                         int maxBatchSize) {
  data_type_ = type;
  data_format_ = format;
  // Replace any previously stored shapes with [inputDims, inputDims + nbInputs).
  const nvinfer1::Dims* const dims_end = inputDims + nbInputs;
  input_dims_.assign(inputDims, dims_end);
  max_batch_size_ = maxBatchSize;
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
0 → 100644
浏览文件 @
5670e9ea
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cassert>
#include <cstring>
#include <iostream>
#include <unordered_map>
#include <vector>
#include "NvInfer.h"
#include "paddle/fluid/inference/tensorrt/plugin/serialize.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
// Common base class for TensorRT plugins in this directory. It stores the
// configuration handed to configureWithFormat() (input shapes, max batch
// size, data type, data format), exposes it through getters, and provides
// helpers to (de)serialize that state. Derived classes implement the
// pure-virtual members below.
class PluginTensorRT : public nvinfer1::IPluginExt {
 public:
  PluginTensorRT() {}
  // Ignores both arguments; a derived class that wants the base state restored
  // calls deserializeBase() itself.
  PluginTensorRT(const void* serialized_data, size_t length) {}

  // Bounds-checked access to the stored shape of input `index`.
  nvinfer1::Dims const& getInputDims(int index) const {
    return input_dims_.at(index);
  }
  size_t getMaxBatchSize() const { return max_batch_size_; }
  nvinfer1::DataType getDataType() const { return data_type_; }
  nvinfer1::PluginFormat getDataFormat() const { return data_format_; }
  virtual const char* getPluginVersion() const { return "1"; }

  // No scratch workspace is requested by default.
  size_t getWorkspaceSize(int) const override { return 0; }
  void terminate() override {}
  virtual ~PluginTensorRT() {}

  // Check format support. The default is FLOAT32 and NCHW.
  bool supportsFormat(nvinfer1::DataType type,
                      nvinfer1::PluginFormat format) const override;
  void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs,
                           const nvinfer1::Dims* outputDims, int nbOutputs,
                           nvinfer1::DataType type,
                           nvinfer1::PluginFormat format,
                           int maxBatchSize) override;

  // *NOTE* The following functions need to be overridden in the subclass.
  virtual nvinfer1::IPluginExt* clone() const = 0;
  virtual const char* getPluginType() const = 0;
  // Initialize the layer for execution. This is called when the engine is
  // created.
  int initialize() override { return 0; }
  // Serialize the layer config to buffer.
  virtual void serialize(void* buffer) = 0;
  virtual size_t getSerializationSize() = 0;
  virtual int enqueue(int batchSize, const void* const* inputs, void** outputs,
                      void* workspace, cudaStream_t stream) = 0;

 protected:
  // Deserialize input_dims, max_batch_size, data_type, data_format
  void deserializeBase(void const*& serialData, size_t& serialLength);
  size_t getBaseSerializationSize();
  // Serialize input_dims, max_batch_size, data_type, data_format
  void serializeBase(void*& buffer);

  std::vector<nvinfer1::Dims> input_dims_;
  // The members below get defined defaults so that the getters and
  // serializeBase() never read indeterminate values on an object that has not
  // been configured or deserialized yet. The defaults are the only
  // type/format pair supportsFormat() is documented to accept.
  size_t max_batch_size_ = 0;
  nvinfer1::DataType data_type_ = nvinfer1::DataType::kFLOAT;
  nvinfer1::PluginFormat data_format_ = nvinfer1::PluginFormat::kNCHW;
};
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
5670e9ea
...
@@ -45,11 +45,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
...
@@ -45,11 +45,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
# DAM
# DAM
set
(
DAM_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/dam"
)
set
(
DAM_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/dam"
)
download_model_and_data
(
${
DAM_INSTALL_DIR
}
"DAM_model.tar.gz"
"DAM_data.txt.tar.gz"
)
download_model_and_data
(
${
DAM_INSTALL_DIR
}
"DAM_model.tar.gz"
"DAM_data.txt.tar.gz"
)
inference_analysis_test
(
test_analyzer_dam SRCS analyzer_dam_tester.cc
inference_analysis_api_test
(
test_analyzer_dam
${
DAM_INSTALL_DIR
}
analyzer_dam_tester.cc
)
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS
--infer_model=
${
DAM_INSTALL_DIR
}
/model
--infer_data=
${
DAM_INSTALL_DIR
}
/data.txt
--use_analysis=0
)
# chinese_ner
# chinese_ner
set
(
CHINESE_NER_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/chinese_ner"
)
set
(
CHINESE_NER_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/chinese_ner"
)
...
@@ -108,8 +104,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
...
@@ -108,8 +104,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
if
(
NOT EXISTS
${
TRT_MODEL_INSTALL_DIR
}
)
if
(
NOT EXISTS
${
TRT_MODEL_INSTALL_DIR
}
)
inference_download_and_uncompress
(
${
TRT_MODEL_INSTALL_DIR
}
${
INFERENCE_URL
}
/tensorrt_test
"trt_test_models.tar.gz"
)
inference_download_and_uncompress
(
${
TRT_MODEL_INSTALL_DIR
}
${
INFERENCE_URL
}
/tensorrt_test
"trt_test_models.tar.gz"
)
endif
()
endif
()
inference_analysis_test
(
test_trt_models SRCS trt_models_tester.cc
inference_analysis_test
(
test_trt_models SRCS trt_models_tester.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
analysis
${
analysis_deps
}
ir_pass_manager analysis_predictor
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
analysis
${
analysis_deps
}
ir_pass_manager analysis_predictor
ARGS --
dirname
=
${
TRT_MODEL_INSTALL_DIR
}
/trt_test_models SERIAL
)
ARGS --
infer_model
=
${
TRT_MODEL_INSTALL_DIR
}
/trt_test_models SERIAL
)
endif
()
endif
()
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
浏览文件 @
5670e9ea
...
@@ -69,7 +69,7 @@ struct DataRecord {
...
@@ -69,7 +69,7 @@ struct DataRecord {
num_lines
++
;
num_lines
++
;
std
::
vector
<
std
::
string
>
data
;
std
::
vector
<
std
::
string
>
data
;
split
(
line
,
','
,
&
data
);
split
(
line
,
','
,
&
data
);
CHECK_EQ
(
data
.
size
(),
2
*
MAX_TURN_NUM
+
3
);
CHECK_EQ
(
data
.
size
(),
(
size_t
)(
2
*
MAX_TURN_NUM
+
3
)
);
// load turn data
// load turn data
std
::
vector
<
int64_t
>
turns_tmp
[
MAX_TURN_NUM
];
std
::
vector
<
int64_t
>
turns_tmp
[
MAX_TURN_NUM
];
for
(
int
i
=
0
;
i
<
MAX_TURN_NUM
;
++
i
)
{
for
(
int
i
=
0
;
i
<
MAX_TURN_NUM
;
++
i
)
{
...
@@ -178,7 +178,8 @@ TEST(Analyzer_dam, profile) {
...
@@ -178,7 +178,8 @@ TEST(Analyzer_dam, profile) {
std
::
vector
<
PaddleTensor
>
outputs
;
std
::
vector
<
PaddleTensor
>
outputs
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
if
(
FLAGS_num_threads
==
1
&&
!
FLAGS_test_all_data
)
{
if
(
FLAGS_num_threads
==
1
&&
!
FLAGS_test_all_data
)
{
PADDLE_ENFORCE_GT
(
outputs
.
size
(),
0
);
PADDLE_ENFORCE_GT
(
outputs
.
size
(),
0
);
...
@@ -196,15 +197,13 @@ TEST(Analyzer_dam, fuse_statis) {
...
@@ -196,15 +197,13 @@ TEST(Analyzer_dam, fuse_statis) {
contrib
::
AnalysisConfig
cfg
;
contrib
::
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
SetConfig
(
&
cfg
);
if
(
FLAGS_use_analysis
)
{
int
num_ops
;
int
num_ops
;
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
auto
fuse_statis
=
GetFuseStatis
(
auto
fuse_statis
=
GetFuseStatis
(
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
()),
&
num_ops
);
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
()),
&
num_ops
);
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
317
);
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
317
);
EXPECT_EQ
(
num_ops
,
2020
);
EXPECT_EQ
(
num_ops
,
2020
);
}
}
}
// Compare result of NativeConfig and AnalysisConfig
// Compare result of NativeConfig and AnalysisConfig
...
@@ -215,9 +214,8 @@ TEST(Analyzer_dam, compare) {
...
@@ -215,9 +214,8 @@ TEST(Analyzer_dam, compare) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
if
(
FLAGS_use_analysis
)
{
CompareNativeAndAnalysis
(
CompareNativeAndAnalysis
(
cfg
,
input_slots_all
);
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
}
}
}
// namespace inference
}
// namespace inference
...
...
paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
浏览文件 @
5670e9ea
...
@@ -133,7 +133,8 @@ TEST(Analyzer_LAC, profile) {
...
@@ -133,7 +133,8 @@ TEST(Analyzer_LAC, profile) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
if
(
FLAGS_num_threads
==
1
&&
!
FLAGS_test_all_data
)
{
if
(
FLAGS_num_threads
==
1
&&
!
FLAGS_test_all_data
)
{
// the first inference result
// the first inference result
...
@@ -175,7 +176,8 @@ TEST(Analyzer_LAC, compare) {
...
@@ -175,7 +176,8 @@ TEST(Analyzer_LAC, compare) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
cfg
,
input_slots_all
);
CompareNativeAndAnalysis
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
}
}
// namespace analysis
}
// namespace analysis
...
...
paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
浏览文件 @
5670e9ea
...
@@ -121,7 +121,8 @@ TEST(Analyzer_Chinese_ner, profile) {
...
@@ -121,7 +121,8 @@ TEST(Analyzer_Chinese_ner, profile) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
if
(
FLAGS_num_threads
==
1
&&
!
FLAGS_test_all_data
)
{
if
(
FLAGS_num_threads
==
1
&&
!
FLAGS_test_all_data
)
{
// the first inference result
// the first inference result
...
@@ -160,7 +161,8 @@ TEST(Analyzer_Chinese_ner, compare) {
...
@@ -160,7 +161,8 @@ TEST(Analyzer_Chinese_ner, compare) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
cfg
,
input_slots_all
);
CompareNativeAndAnalysis
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
}
}
// namespace inference
}
// namespace inference
...
...
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
浏览文件 @
5670e9ea
...
@@ -45,7 +45,8 @@ void profile(bool use_mkldnn = false) {
...
@@ -45,7 +45,8 @@ void profile(bool use_mkldnn = false) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
}
}
TEST
(
Analyzer_resnet50
,
profile
)
{
profile
();
}
TEST
(
Analyzer_resnet50
,
profile
)
{
profile
();
}
...
@@ -74,7 +75,8 @@ void compare(bool use_mkldnn = false) {
...
@@ -74,7 +75,8 @@ void compare(bool use_mkldnn = false) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
cfg
,
input_slots_all
);
CompareNativeAndAnalysis
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
}
TEST
(
Analyzer_resnet50
,
compare
)
{
compare
();
}
TEST
(
Analyzer_resnet50
,
compare
)
{
compare
();
}
...
...
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
浏览文件 @
5670e9ea
...
@@ -233,8 +233,8 @@ TEST(Analyzer_rnn1, profile) {
...
@@ -233,8 +233,8 @@ TEST(Analyzer_rnn1, profile) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
LOG
(
INFO
)
<<
"to test prediction"
;
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
}
}
// Check the fuse status
// Check the fuse status
...
@@ -261,7 +261,8 @@ TEST(Analyzer_rnn1, compare) {
...
@@ -261,7 +261,8 @@ TEST(Analyzer_rnn1, compare) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
cfg
,
input_slots_all
);
CompareNativeAndAnalysis
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
}
// Test Multi-Thread.
// Test Multi-Thread.
...
@@ -272,7 +273,8 @@ TEST(Analyzer_rnn1, multi_thread) {
...
@@ -272,7 +273,8 @@ TEST(Analyzer_rnn1, multi_thread) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
4
/* multi_thread */
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
4
/* multi_thread */
);
}
}
// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
...
...
paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
浏览文件 @
5670e9ea
...
@@ -132,7 +132,8 @@ TEST(Analyzer_rnn2, profile) {
...
@@ -132,7 +132,8 @@ TEST(Analyzer_rnn2, profile) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
if
(
FLAGS_num_threads
==
1
&&
!
FLAGS_test_all_data
)
{
if
(
FLAGS_num_threads
==
1
&&
!
FLAGS_test_all_data
)
{
// the first inference result
// the first inference result
...
@@ -153,7 +154,8 @@ TEST(Analyzer_rnn2, compare) {
...
@@ -153,7 +154,8 @@ TEST(Analyzer_rnn2, compare) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
cfg
,
input_slots_all
);
CompareNativeAndAnalysis
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
}
}
// namespace inference
}
// namespace inference
...
...
paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
浏览文件 @
5670e9ea
...
@@ -161,7 +161,8 @@ TEST(Analyzer_seq_conv1, profile) {
...
@@ -161,7 +161,8 @@ TEST(Analyzer_seq_conv1, profile) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
if
(
FLAGS_num_threads
==
1
&&
!
FLAGS_test_all_data
)
{
if
(
FLAGS_num_threads
==
1
&&
!
FLAGS_test_all_data
)
{
// the first inference result
// the first inference result
...
@@ -199,7 +200,8 @@ TEST(Analyzer_seq_conv1, compare) {
...
@@ -199,7 +200,8 @@ TEST(Analyzer_seq_conv1, compare) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
cfg
,
input_slots_all
);
CompareNativeAndAnalysis
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
}
}
// namespace inference
}
// namespace inference
...
...
paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
浏览文件 @
5670e9ea
...
@@ -74,7 +74,8 @@ TEST(Analyzer_Text_Classification, profile) {
...
@@ -74,7 +74,8 @@ TEST(Analyzer_Text_Classification, profile) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
if
(
FLAGS_num_threads
==
1
)
{
if
(
FLAGS_num_threads
==
1
)
{
// Get output
// Get output
...
@@ -101,7 +102,8 @@ TEST(Analyzer_Text_Classification, compare) {
...
@@ -101,7 +102,8 @@ TEST(Analyzer_Text_Classification, compare) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
cfg
,
input_slots_all
);
CompareNativeAndAnalysis
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
}
TEST
(
Analyzer_Text_Classification
,
compare_against_embedding_fc_lstm_fused
)
{
TEST
(
Analyzer_Text_Classification
,
compare_against_embedding_fc_lstm_fused
)
{
...
@@ -112,7 +114,8 @@ TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
...
@@ -112,7 +114,8 @@ TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
cfg
,
input_slots_all
);
CompareNativeAndAnalysis
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
}
}
// namespace inference
}
// namespace inference
...
...
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
浏览文件 @
5670e9ea
...
@@ -59,9 +59,6 @@ void SetConfig(AnalysisConfig *cfg) {
...
@@ -59,9 +59,6 @@ void SetConfig(AnalysisConfig *cfg) {
cfg
->
specify_input_name
=
true
;
cfg
->
specify_input_name
=
true
;
// TODO(TJ): fix fusion gru
// TODO(TJ): fix fusion gru
cfg
->
pass_builder
()
->
DeletePass
(
"fc_gru_fuse_pass"
);
cfg
->
pass_builder
()
->
DeletePass
(
"fc_gru_fuse_pass"
);
#ifdef PADDLE_WITH_MKLDNN
cfg
->
EnableMKLDNN
();
#endif
}
}
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
...
@@ -94,7 +91,8 @@ void profile(bool use_mkldnn = false) {
...
@@ -94,7 +91,8 @@ void profile(bool use_mkldnn = false) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
if
(
FLAGS_num_threads
==
1
&&
!
FLAGS_test_all_data
)
{
if
(
FLAGS_num_threads
==
1
&&
!
FLAGS_test_all_data
)
{
const
float
ocr_result_data
[]
=
{
const
float
ocr_result_data
[]
=
{
...
@@ -136,7 +134,8 @@ void compare(bool use_mkldnn = false) {
...
@@ -136,7 +134,8 @@ void compare(bool use_mkldnn = false) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
cfg
,
input_slots_all
);
CompareNativeAndAnalysis
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
}
TEST
(
Analyzer_vis
,
compare
)
{
compare
();
}
TEST
(
Analyzer_vis
,
compare
)
{
compare
();
}
...
...
paddle/fluid/inference/tests/api/config_printer.h
0 → 100644
浏览文件 @
5670e9ea
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <ostream>
#include <string>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace
paddle
{
namespace
inference
{
// Current nesting depth used by the operator<< overloads in this header to
// indent their output; one counter per thread.
thread_local int num_spaces = 0;

// Returns the indentation string for nesting depth `num_spaces`: one " " per
// level, and an empty string when the depth is zero or negative.
//
// The string is built directly instead of through std::ostringstream, which
// this header used without including <sstream>.
static std::string GenSpaces(int num_spaces) {
  std::string indent;
  for (int level = 0; level < num_spaces; ++level) {
    indent += " ";
  }
  return indent;
}
// Prints a PaddlePredictor::Config as an indented block containing its
// model_dir. The thread-local num_spaces is raised by one while the field is
// printed and restored before the closing brace is written.
std::ostream &operator<<(std::ostream &os,
                         const PaddlePredictor::Config &config) {
  const std::string header_indent = GenSpaces(num_spaces);
  os << header_indent << "PaddlePredictor::Config {\n";

  ++num_spaces;
  const std::string field_indent = GenSpaces(num_spaces);
  os << field_indent << "model_dir: " << config.model_dir << "\n";
  --num_spaces;

  const std::string footer_indent = GenSpaces(num_spaces);
  os << footer_indent << "}\n";
  return os;
}
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
const
NativeConfig
&
config
)
{
os
<<
GenSpaces
(
num_spaces
)
<<
"NativeConfig {
\n
"
;
num_spaces
++
;
os
<<
*
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
config
);
os
<<
GenSpaces
(
num_spaces
)
<<
"use_gpu: "
<<
config
.
use_gpu
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"device: "
<<
config
.
device
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"fraction_of_gpu_memory: "
<<
config
.
fraction_of_gpu_memory
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"prog_file: "
<<
config
.
prog_file
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"param_file: "
<<
config
.
param_file
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"specify_input_name: "
<<
config
.
specify_input_name
<<
"
\n
"
;
num_spaces
--
;
os
<<
GenSpaces
(
num_spaces
)
<<
"}
\n
"
;
return
os
;
}
// Prints a contrib::AnalysisConfig as an indented block: first the object
// viewed as a NativeConfig (printed by that type's operator<< one level
// deeper), then the analysis-specific settings at the same depth.
std::ostream &operator<<(std::ostream &os,
                         const contrib::AnalysisConfig &config) {
  os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";

  ++num_spaces;
  // NOTE(review): the cast treats AnalysisConfig as a NativeConfig;
  // presumably AnalysisConfig derives from it - confirm against the API
  // header.
  const NativeConfig *as_native =
      reinterpret_cast<const NativeConfig *>(&config);
  os << *as_native;

  // The nested printer leaves num_spaces where it found it, so one indent
  // string serves every field below.
  const std::string indent = GenSpaces(num_spaces);
  os << indent << "enable_ir_optim: " << config.enable_ir_optim << "\n";
  os << indent << "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n";
  os << indent << "use_tensorrt: " << config.use_tensorrt() << "\n";
  os << indent << "use_mkldnn: " << config.use_mkldnn() << "\n";
  --num_spaces;

  os << GenSpaces(num_spaces) << "}\n";
  return os;
}
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/tester_helper.h
浏览文件 @
5670e9ea
...
@@ -19,13 +19,16 @@
...
@@ -19,13 +19,16 @@
#include <string>
#include <string>
#include <thread> // NOLINT
#include <thread> // NOLINT
#include <vector>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/tests/api/config_printer.h"
#include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
...
@@ -38,10 +41,18 @@ DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
...
@@ -38,10 +41,18 @@ DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
DEFINE_bool
(
use_analysis
,
true
,
DEFINE_bool
(
use_analysis
,
true
,
"Running the inference program in analysis mode."
);
"Running the inference program in analysis mode."
);
DECLARE_bool
(
profile
);
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
using
contrib
::
AnalysisConfig
;
// Logs `config` at INFO level. When `use_analysis` is true the pointer is
// reinterpreted as a contrib::AnalysisConfig and printed as such; otherwise
// it is printed as a plain PaddlePredictor::Config.
void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
  if (!use_analysis) {
    LOG(INFO) << *config;
  } else {
    const contrib::AnalysisConfig *analysis_config =
        reinterpret_cast<const contrib::AnalysisConfig *>(config);
    LOG(INFO) << *analysis_config;
  }
}
void
CompareResult
(
const
std
::
vector
<
PaddleTensor
>
&
outputs
,
void
CompareResult
(
const
std
::
vector
<
PaddleTensor
>
&
outputs
,
const
std
::
vector
<
PaddleTensor
>
&
ref_outputs
)
{
const
std
::
vector
<
PaddleTensor
>
&
ref_outputs
)
{
...
@@ -77,12 +88,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
...
@@ -77,12 +88,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
}
}
std
::
unique_ptr
<
PaddlePredictor
>
CreateTestPredictor
(
std
::
unique_ptr
<
PaddlePredictor
>
CreateTestPredictor
(
const
AnalysisConfig
&
config
,
bool
use_analysis
=
true
)
{
const
PaddlePredictor
::
Config
*
config
,
bool
use_analysis
=
true
)
{
if
(
use_analysis
)
{
if
(
use_analysis
)
{
return
CreatePaddlePredictor
<
contrib
::
AnalysisConfig
>
(
config
);
return
CreatePaddlePredictor
<
contrib
::
AnalysisConfig
>
(
}
else
{
*
(
reinterpret_cast
<
const
contrib
::
AnalysisConfig
*>
(
config
)));
return
CreatePaddlePredictor
<
NativeConfig
>
(
config
);
}
}
return
CreatePaddlePredictor
<
NativeConfig
>
(
*
(
reinterpret_cast
<
const
NativeConfig
*>
(
config
)));
}
}
size_t
GetSize
(
const
PaddleTensor
&
out
)
{
return
VecReduceToInt
(
out
.
shape
);
}
size_t
GetSize
(
const
PaddleTensor
&
out
)
{
return
VecReduceToInt
(
out
.
shape
);
}
...
@@ -111,11 +123,23 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
...
@@ -111,11 +123,23 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
}
}
void
SetFakeImageInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
,
void
SetFakeImageInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
,
const
std
::
string
&
dirname
)
{
const
std
::
string
&
dirname
,
bool
is_combined
=
true
,
std
::
string
model_filename
=
"model"
,
std
::
string
params_filename
=
"params"
)
{
// Set fake_image_data
// Set fake_image_data
PADDLE_ENFORCE_EQ
(
FLAGS_test_all_data
,
0
,
"Only have single batch of data."
);
PADDLE_ENFORCE_EQ
(
FLAGS_test_all_data
,
0
,
"Only have single batch of data."
);
std
::
vector
<
std
::
vector
<
int64_t
>>
feed_target_shapes
=
std
::
vector
<
std
::
vector
<
int64_t
>>
feed_target_shapes
=
GetFeedTargetShapes
(
GetFeedTargetShapes
(
dirname
,
true
,
"model"
,
"params"
);
dirname
,
is_combined
,
model_filename
,
params_filename
);
std
::
ostringstream
os
;
for
(
size_t
i
=
0
;
i
<
feed_target_shapes
.
size
();
++
i
)
{
os
<<
"feed target "
<<
i
<<
": {"
<<
feed_target_shapes
[
i
][
0
];
for
(
size_t
j
=
1
;
j
<
feed_target_shapes
[
i
].
size
();
++
j
)
{
os
<<
", "
<<
feed_target_shapes
[
i
][
j
];
}
os
<<
"}
\n
"
;
}
LOG
(
INFO
)
<<
os
.
str
();
int
dim1
=
feed_target_shapes
[
0
][
1
];
int
dim1
=
feed_target_shapes
[
0
][
1
];
int
dim2
=
feed_target_shapes
[
0
][
2
];
int
dim2
=
feed_target_shapes
[
0
][
2
];
int
dim3
=
feed_target_shapes
[
0
][
3
];
int
dim3
=
feed_target_shapes
[
0
][
3
];
...
@@ -139,25 +163,43 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
...
@@ -139,25 +163,43 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
}
}
void
TestOneThreadPrediction
(
void
TestOneThreadPrediction
(
const
AnalysisConfig
&
config
,
const
PaddlePredictor
::
Config
*
config
,
const
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
&
inputs
,
const
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
&
inputs
,
std
::
vector
<
PaddleTensor
>
*
outputs
,
bool
use_analysis
=
true
)
{
std
::
vector
<
PaddleTensor
>
*
outputs
,
bool
use_analysis
=
true
)
{
int
batch_size
=
FLAGS_batch_size
;
int
batch_size
=
FLAGS_batch_size
;
int
num_times
=
FLAGS_repeat
;
int
num_times
=
FLAGS_repeat
;
auto
predictor
=
CreateTestPredictor
(
config
,
use_analysis
);
auto
predictor
=
CreateTestPredictor
(
config
,
use_analysis
);
Timer
timer
;
timer
.
tic
();
// warmup run
for
(
int
i
=
0
;
i
<
num_times
;
i
++
)
{
LOG
(
INFO
)
<<
"Warm up run..."
;
for
(
size_t
j
=
0
;
j
<
inputs
.
size
();
j
++
)
{
{
predictor
->
Run
(
inputs
[
j
],
outputs
);
Timer
warmup_timer
;
warmup_timer
.
tic
();
predictor
->
Run
(
inputs
[
0
],
outputs
,
batch_size
);
PrintTime
(
batch_size
,
1
,
1
,
0
,
warmup_timer
.
toc
(),
1
);
#if !defined(_WIN32)
if
(
FLAGS_profile
)
{
paddle
::
platform
::
ResetProfiler
();
}
#endif
}
LOG
(
INFO
)
<<
"Run "
<<
num_times
<<
" times..."
;
{
Timer
run_timer
;
run_timer
.
tic
();
for
(
int
i
=
0
;
i
<
num_times
;
i
++
)
{
for
(
size_t
j
=
0
;
j
<
inputs
.
size
();
j
++
)
{
predictor
->
Run
(
inputs
[
j
],
outputs
,
batch_size
);
}
}
}
PrintTime
(
batch_size
,
num_times
,
1
,
0
,
run_timer
.
toc
()
/
num_times
,
inputs
.
size
());
}
}
PrintTime
(
batch_size
,
num_times
,
1
,
0
,
timer
.
toc
()
/
num_times
,
inputs
.
size
());
}
}
void
TestMultiThreadPrediction
(
void
TestMultiThreadPrediction
(
const
AnalysisConfig
&
config
,
const
PaddlePredictor
::
Config
*
config
,
const
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
&
inputs
,
const
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
&
inputs
,
std
::
vector
<
PaddleTensor
>
*
outputs
,
int
num_threads
,
std
::
vector
<
PaddleTensor
>
*
outputs
,
int
num_threads
,
bool
use_analysis
=
true
)
{
bool
use_analysis
=
true
)
{
...
@@ -200,12 +242,11 @@ void TestMultiThreadPrediction(
...
@@ -200,12 +242,11 @@ void TestMultiThreadPrediction(
}
}
}
}
void
TestPrediction
(
const
AnalysisConfig
&
config
,
void
TestPrediction
(
const
PaddlePredictor
::
Config
*
config
,
const
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
&
inputs
,
const
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
&
inputs
,
std
::
vector
<
PaddleTensor
>
*
outputs
,
int
num_threads
,
std
::
vector
<
PaddleTensor
>
*
outputs
,
int
num_threads
,
bool
use_analysis
=
FLAGS_use_analysis
)
{
bool
use_analysis
=
FLAGS_use_analysis
)
{
LOG
(
INFO
)
<<
"use_analysis: "
<<
use_analysis
PrintConfig
(
config
,
use_analysis
);
<<
", use_mkldnn: "
<<
config
.
use_mkldnn
();
if
(
num_threads
==
1
)
{
if
(
num_threads
==
1
)
{
TestOneThreadPrediction
(
config
,
inputs
,
outputs
,
use_analysis
);
TestOneThreadPrediction
(
config
,
inputs
,
outputs
,
use_analysis
);
}
else
{
}
else
{
...
@@ -215,9 +256,9 @@ void TestPrediction(const AnalysisConfig &config,
...
@@ -215,9 +256,9 @@ void TestPrediction(const AnalysisConfig &config,
}
}
void
CompareNativeAndAnalysis
(
void
CompareNativeAndAnalysis
(
const
AnalysisConfig
&
config
,
const
PaddlePredictor
::
Config
*
config
,
const
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
&
inputs
)
{
const
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
&
inputs
)
{
LOG
(
INFO
)
<<
"use_mkldnn: "
<<
config
.
use_mkldnn
(
);
PrintConfig
(
config
,
true
);
std
::
vector
<
PaddleTensor
>
native_outputs
,
analysis_outputs
;
std
::
vector
<
PaddleTensor
>
native_outputs
,
analysis_outputs
;
TestOneThreadPrediction
(
config
,
inputs
,
&
native_outputs
,
false
);
TestOneThreadPrediction
(
config
,
inputs
,
&
native_outputs
,
false
);
TestOneThreadPrediction
(
config
,
inputs
,
&
analysis_outputs
,
true
);
TestOneThreadPrediction
(
config
,
inputs
,
&
analysis_outputs
,
true
);
...
...
paddle/fluid/inference/tests/api/trt_models_tester.cc
浏览文件 @
5670e9ea
/
/
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/
*
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
//
Licensed under the Apache License, Version 2.0 (the "License");
Licensed under the Apache License, Version 2.0 (the "License");
//
you may not use this file except in compliance with the License.
you may not use this file except in compliance with the License.
//
You may obtain a copy of the License at
You may obtain a copy of the License at
//
//
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
//
//
Unless required by applicable law or agreed to in writing, software
Unless required by applicable law or agreed to in writing, software
//
distributed under the License is distributed on an "AS IS" BASIS,
distributed under the License is distributed on an "AS IS" BASIS,
//
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
// limitations under the License.
limitations under the License. */
#include <gflags/gflags.h>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace
paddle
{
namespace
paddle
{
using
paddle
::
contrib
::
AnalysisConfig
;
namespace
inference
{
DEFINE_string
(
dirname
,
""
,
"Directory of the inference model."
);
DEFINE_bool
(
use_tensorrt
,
true
,
"Test the performance of TensorRT engine."
);
DEFINE_string
(
prog_filename
,
""
,
"Name of model file."
);
NativeConfig
GetConfigNative
()
{
DEFINE_string
(
param_filename
,
""
,
"Name of parameters file."
);
NativeConfig
config
;
config
.
model_dir
=
FLAGS_dirname
;
template
<
typename
ConfigType
>
// LOG(INFO) << "dirname " << config.model_dir;
void
SetConfig
(
ConfigType
*
config
,
std
::
string
model_dir
,
bool
use_gpu
,
config
.
fraction_of_gpu_memory
=
0.15
;
bool
use_tensorrt
=
false
,
int
batch_size
=
-
1
)
{
config
.
use_gpu
=
true
;
if
(
!
FLAGS_prog_filename
.
empty
()
&&
!
FLAGS_param_filename
.
empty
())
{
config
.
device
=
0
;
config
->
prog_file
=
model_dir
+
"/"
+
FLAGS_prog_filename
;
return
config
;
config
->
param_file
=
model_dir
+
"/"
+
FLAGS_param_filename
;
}
}
else
{
config
->
model_dir
=
model_dir
;
void
PrepareTRTConfig
(
AnalysisConfig
*
config
)
{
}
config
->
model_dir
=
FLAGS_dirname
+
"/"
+
"mobilenet"
;
if
(
use_gpu
)
{
config
->
fraction_of_gpu_memory
=
0.15
;
config
->
use_gpu
=
true
;
config
->
EnableTensorRtEngine
(
1
<<
10
,
5
);
config
->
device
=
0
;
config
->
pass_builder
()
->
DeletePass
(
"conv_bn_fuse_pass"
);
config
->
fraction_of_gpu_memory
=
0.15
;
config
->
pass_builder
()
->
DeletePass
(
"fc_fuse_pass"
);
}
config
->
pass_builder
()
->
TurnOnDebug
();
}
}
void
PrepareInputs
(
std
::
vector
<
PaddleTensor
>
*
tensors
,
int
batch_size
)
{
template
<
>
PADDLE_ENFORCE_EQ
(
tensors
->
size
(),
1UL
);
void
SetConfig
<
contrib
::
AnalysisConfig
>
(
contrib
::
AnalysisConfig
*
config
,
auto
&
tensor
=
tensors
->
front
();
std
::
string
model_dir
,
bool
use_gpu
,
int
height
=
224
;
bool
use_tensorrt
,
int
batch_size
)
{
int
width
=
224
;
if
(
!
FLAGS_prog_filename
.
empty
()
&&
!
FLAGS_param_filename
.
empty
())
{
float
*
data
=
new
float
[
batch_size
*
3
*
height
*
width
];
config
->
prog_file
=
model_dir
+
"/"
+
FLAGS_prog_filename
;
memset
(
data
,
0
,
sizeof
(
float
)
*
(
batch_size
*
3
*
height
*
width
));
config
->
param_file
=
model_dir
+
"/"
+
FLAGS_param_filename
;
data
[
0
]
=
1.0
f
;
}
else
{
config
->
model_dir
=
model_dir
;
// Prepare inputs
}
tensor
.
name
=
"input_0"
;
if
(
use_gpu
)
{
tensor
.
shape
=
std
::
vector
<
int
>
({
batch_size
,
3
,
height
,
width
});
config
->
use_gpu
=
true
;
tensor
.
data
=
PaddleBuf
(
static_cast
<
void
*>
(
data
),
config
->
device
=
0
;
sizeof
(
float
)
*
(
batch_size
*
3
*
height
*
width
));
config
->
fraction_of_gpu_memory
=
0.15
;
tensor
.
dtype
=
PaddleDType
::
FLOAT32
;
if
(
use_tensorrt
)
{
config
->
EnableTensorRtEngine
(
1
<<
10
,
batch_size
);
config
->
pass_builder
()
->
DeletePass
(
"conv_bn_fuse_pass"
);
config
->
pass_builder
()
->
DeletePass
(
"fc_fuse_pass"
);
config
->
pass_builder
()
->
TurnOnDebug
();
}
else
{
config
->
enable_ir_optim
=
true
;
}
}
}
}
void
CompareTensorRTWithFluid
(
int
batch_size
,
std
::
string
model_dirname
)
{
void
profile
(
std
::
string
model_dir
,
bool
use_analysis
,
bool
use_tensorrt
)
{
auto
config0
=
GetConfigNative
();
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
inputs_all
;
config0
.
model_dir
=
model_dirname
;
if
(
!
FLAGS_prog_filename
.
empty
()
&&
!
FLAGS_param_filename
.
empty
())
{
SetFakeImageInput
(
&
inputs_all
,
model_dir
,
true
,
FLAGS_prog_filename
,
AnalysisConfig
config1
(
true
);
FLAGS_param_filename
);
PrepareTRTConfig
(
&
config1
);
}
else
{
config1
.
model_dir
=
model_dirname
;
SetFakeImageInput
(
&
inputs_all
,
model_dir
,
false
,
"__model__"
,
""
);
auto
predictor0
=
CreatePaddlePredictor
<
NativeConfig
>
(
config0
);
auto
predictor1
=
CreatePaddlePredictor
(
config1
);
// Prepare inputs
std
::
vector
<
PaddleTensor
>
paddle_tensor_feeds
(
1
);
PrepareInputs
(
&
paddle_tensor_feeds
,
batch_size
);
// Prepare outputs
std
::
vector
<
PaddleTensor
>
outputs0
;
std
::
vector
<
PaddleTensor
>
outputs1
;
CHECK
(
predictor0
->
Run
(
paddle_tensor_feeds
,
&
outputs0
));
CHECK
(
predictor1
->
Run
(
paddle_tensor_feeds
,
&
outputs1
,
batch_size
));
const
size_t
num_elements
=
outputs0
.
front
().
data
.
length
()
/
sizeof
(
float
);
const
size_t
num_elements1
=
outputs1
.
front
().
data
.
length
()
/
sizeof
(
float
);
EXPECT_EQ
(
num_elements
,
num_elements1
);
auto
*
data0
=
static_cast
<
float
*>
(
outputs0
.
front
().
data
.
data
());
auto
*
data1
=
static_cast
<
float
*>
(
outputs1
.
front
().
data
.
data
());
ASSERT_GT
(
num_elements
,
0UL
);
for
(
size_t
i
=
0
;
i
<
std
::
min
(
num_elements
,
num_elements1
);
i
++
)
{
EXPECT_NEAR
(
data0
[
i
],
data1
[
i
],
1e-3
);
}
}
}
TEST
(
trt_models_test
,
mobilenet
)
{
std
::
vector
<
PaddleTensor
>
outputs
;
CompareTensorRTWithFluid
(
1
,
FLAGS_dirname
+
"/"
+
"mobilenet"
);
if
(
use_analysis
||
use_tensorrt
)
{
}
contrib
::
AnalysisConfig
config
(
true
);
TEST
(
trt_models_test
,
resnet50
)
{
SetConfig
<
contrib
::
AnalysisConfig
>
(
&
config
,
model_dir
,
true
,
use_tensorrt
,
CompareTensorRTWithFluid
(
1
,
FLAGS_dirname
+
"/"
+
"resnet50"
);
FLAGS_batch_size
);
}
TestPrediction
(
reinterpret_cast
<
PaddlePredictor
::
Config
*>
(
&
config
),
TEST
(
trt_models_test
,
resnext50
)
{
inputs_all
,
&
outputs
,
FLAGS_num_threads
,
true
);
CompareTensorRTWithFluid
(
1
,
FLAGS_dirname
+
"/"
+
"resnext50"
);
}
else
{
NativeConfig
config
;
SetConfig
<
NativeConfig
>
(
&
config
,
model_dir
,
true
,
false
);
TestPrediction
(
reinterpret_cast
<
PaddlePredictor
::
Config
*>
(
&
config
),
inputs_all
,
&
outputs
,
FLAGS_num_threads
,
false
);
}
}
}
TEST
(
trt_models_test
,
raw_gpu
)
{
void
compare
(
std
::
string
model_dir
,
bool
use_tensorrt
)
{
std
::
string
model_dir
=
FLAGS_dirname
+
"/"
+
"mobilenet"
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
inputs_all
;
auto
config0
=
GetConfigNative
();
if
(
!
FLAGS_prog_filename
.
empty
()
&&
!
FLAGS_param_filename
.
empty
())
{
config0
.
model_dir
=
model_dir
;
SetFakeImageInput
(
&
inputs_all
,
model_dir
,
true
,
FLAGS_prog_filename
,
int
batch_size
=
2
;
FLAGS_param_filename
);
}
else
{
AnalysisConfig
config1
(
true
);
SetFakeImageInput
(
&
inputs_all
,
model_dir
,
false
,
"__model__"
,
""
);
config1
.
fraction_of_gpu_memory
=
0.1
;
}
config1
.
enable_ir_optim
=
true
;
config1
.
model_dir
=
model_dir
;
auto
predictor0
=
CreatePaddlePredictor
<
NativeConfig
>
(
config0
);
std
::
vector
<
PaddleTensor
>
native_outputs
;
auto
predictor1
=
CreatePaddlePredictor
(
config1
);
NativeConfig
native_config
;
SetConfig
<
NativeConfig
>
(
&
native_config
,
model_dir
,
true
,
false
,
FLAGS_batch_size
);
TestOneThreadPrediction
(
reinterpret_cast
<
PaddlePredictor
::
Config
*>
(
&
native_config
),
inputs_all
,
&
native_outputs
,
false
);
std
::
vector
<
PaddleTensor
>
analysis_outputs
;
contrib
::
AnalysisConfig
analysis_config
(
true
);
SetConfig
<
contrib
::
AnalysisConfig
>
(
&
analysis_config
,
model_dir
,
true
,
use_tensorrt
,
FLAGS_batch_size
);
TestOneThreadPrediction
(
reinterpret_cast
<
PaddlePredictor
::
Config
*>
(
&
analysis_config
),
inputs_all
,
&
analysis_outputs
,
true
);
CompareResult
(
native_outputs
,
analysis_outputs
);
}
// Prepare inputs
TEST
(
TensorRT_mobilenet
,
compare
)
{
std
::
vector
<
PaddleTensor
>
paddle_tensor_feeds
(
1
);
std
::
string
model_dir
=
FLAGS_infer_model
+
"/mobilenet"
;
PrepareInputs
(
&
paddle_tensor_feeds
,
batch_size
);
compare
(
model_dir
,
/* use_tensorrt */
true
);
}
// Prepare outputs
TEST
(
TensorRT_resnet50
,
compare
)
{
std
::
vector
<
PaddleTensor
>
outputs0
;
std
::
string
model_dir
=
FLAGS_infer_model
+
"/resnet50"
;
std
::
vector
<
PaddleTensor
>
outputs1
;
compare
(
model_dir
,
/* use_tensorrt */
true
);
CHECK
(
predictor0
->
Run
(
paddle_tensor_feeds
,
&
outputs0
));
}
CHECK
(
predictor1
->
Run
(
paddle_tensor_feeds
,
&
outputs1
,
batch_size
));
const
size_t
num_elements
=
outputs0
.
front
().
data
.
length
()
/
sizeof
(
float
);
TEST
(
TensorRT_resnext50
,
compare
)
{
const
size_t
num_elements1
=
outputs1
.
front
().
data
.
length
()
/
sizeof
(
float
);
std
::
string
model_dir
=
FLAGS_infer_model
+
"/resnext50"
;
EXPECT_EQ
(
num_elements
,
num_elements1
);
compare
(
model_dir
,
/* use_tensorrt */
true
);
}
auto
*
data0
=
static_cast
<
float
*>
(
outputs0
.
front
().
data
.
data
());
TEST
(
TensorRT_resnext50
,
profile
)
{
auto
*
data1
=
static_cast
<
float
*>
(
outputs1
.
front
().
data
.
data
());
std
::
string
model_dir
=
FLAGS_infer_model
+
"/resnext50"
;
profile
(
model_dir
,
/* use_analysis */
true
,
FLAGS_use_tensorrt
);
}
ASSERT_GT
(
num_elements
,
0UL
);
TEST
(
TensorRT_mobilenet
,
analysis
)
{
for
(
size_t
i
=
0
;
i
<
std
::
min
(
num_elements
,
num_elements1
);
i
++
)
{
std
::
string
model_dir
=
FLAGS_infer_model
+
"/"
+
"mobilenet"
;
EXPECT_NEAR
(
data0
[
i
],
data1
[
i
],
1e-3
);
compare
(
model_dir
,
/* use_tensorrt */
false
);
}
}
}
}
// namespace inference
}
// namespace paddle
}
// namespace paddle
USE_PASS
(
tensorrt_subgraph_pass
);
USE_PASS
(
tensorrt_subgraph_pass
);
paddle/fluid/operators/fc_op.cc
浏览文件 @
5670e9ea
...
@@ -27,11 +27,9 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
...
@@ -27,11 +27,9 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
"Out(Output) of Fully Connected should not be null."
);
"Out(Output) of Fully Connected should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"W"
),
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"W"
),
"W(Input) of Fully Connected should not be null."
);
"W(Input) of Fully Connected should not be null."
);
// NCHW
auto
in_dims
=
ctx
->
GetInputDim
(
"Input"
);
auto
in_dims
=
ctx
->
GetInputDim
(
"Input"
);
// IO, I=C*H*W
auto
w_dims
=
ctx
->
GetInputDim
(
"W"
);
auto
w_dims
=
ctx
->
GetInputDim
(
"W"
);
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
w_dims
[
1
]});
if
(
ctx
->
HasInput
(
"Bias"
))
{
if
(
ctx
->
HasInput
(
"Bias"
))
{
auto
bias_dims
=
ctx
->
GetInputDim
(
"Bias"
);
auto
bias_dims
=
ctx
->
GetInputDim
(
"Bias"
);
...
@@ -44,14 +42,32 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
...
@@ -44,14 +42,32 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
"The shape of Bias must be [1, dim]."
);
"The shape of Bias must be [1, dim]."
);
}
}
}
}
PADDLE_ENFORCE
(
in_dims
.
size
()
==
2
||
in_dims
.
size
()
==
4
,
"Fully Connected input should be 2-D or 4-D tensor."
);
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"use_mkldnn"
))
{
PADDLE_ENFORCE
(
in_dims
.
size
()
==
2
||
in_dims
.
size
()
==
4
,
"Fully Connected input should be 2-D or 4-D tensor."
);
}
PADDLE_ENFORCE_EQ
(
w_dims
.
size
(),
2UL
,
PADDLE_ENFORCE_EQ
(
w_dims
.
size
(),
2UL
,
"Fully Connected input should be 2-D tensor."
);
"Fully Connected input should be 2-D tensor."
);
PADDLE_ENFORCE_EQ
(
framework
::
product
(
in_dims
)
/
in_dims
[
0
],
w_dims
[
0
],
int
in_num_col_dims
=
ctx
->
Attrs
().
Get
<
int
>
(
"in_num_col_dims"
);
"Fully Connected input and weigth size do not match."
);
PADDLE_ENFORCE_GT
(
in_dims
.
size
(),
in_num_col_dims
,
"The input tensor Input's rank of FCOp should be larger than "
"in_num_col_dims."
);
auto
in_mat_dims
=
framework
::
flatten_to_2d
(
in_dims
,
in_num_col_dims
);
PADDLE_ENFORCE_EQ
(
in_mat_dims
[
1
],
w_dims
[
0
],
"Fully Connected input and weigth size do not match. %s, %s"
);
std
::
vector
<
int64_t
>
output_dims
;
output_dims
.
reserve
(
static_cast
<
size_t
>
(
in_num_col_dims
+
1
));
for
(
int
i
=
0
;
i
<
in_num_col_dims
;
++
i
)
{
output_dims
.
push_back
(
in_dims
[
i
]);
}
output_dims
.
push_back
(
w_dims
[
1
]);
ctx
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
output_
shape
));
ctx
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
output_
dims
));
ctx
->
ShareLoD
(
"Input"
,
"Out"
);
ctx
->
ShareLoD
(
"Input"
,
"Out"
);
}
}
...
@@ -101,12 +117,15 @@ framework::OpKernelType FCOpGrad::GetExpectedKernelType(
...
@@ -101,12 +117,15 @@ framework::OpKernelType FCOpGrad::GetExpectedKernelType(
}
}
void
FCOpMaker
::
Make
()
{
void
FCOpMaker
::
Make
()
{
AddInput
(
"Input"
,
AddInput
(
"Input"
,
"(Tensor), The input tensor of fully connected operator."
);
"(Tensor), The input tensor of fully connected operator with format "
"(NCHW). "
);
AddInput
(
"W"
,
"(Tensor), The weight fc op with shape (I, O)."
);
AddInput
(
"W"
,
"(Tensor), The weight fc op with shape (I, O)."
);
AddInput
(
"Bias"
,
"(Tensor, optional) Bias vector with shape (1 x O"
)
AddInput
(
"Bias"
,
"(Tensor, optional) Bias vector with shape (1 x O"
)
.
AsDispensable
();
.
AsDispensable
();
AddAttr
<
int
>
(
"in_num_col_dims"
,
"(int, default 1), The fc op can take tensors with more than "
"two dimensions as its inputs."
)
.
SetDefault
(
1
)
.
EqualGreaterThan
(
1
);
AddOutput
(
"Out"
,
"(Tensor) The output tensor of fully connected operator. "
);
AddOutput
(
"Out"
,
"(Tensor) The output tensor of fully connected operator. "
);
AddAttr
<
bool
>
(
"use_mkldnn"
,
AddAttr
<
bool
>
(
"use_mkldnn"
,
"(bool, default false) Only used in mkldnn kernel"
)
"(bool, default false) Only used in mkldnn kernel"
)
...
@@ -131,13 +150,15 @@ class FCOpKernel : public framework::OpKernel<T> {
...
@@ -131,13 +150,15 @@ class FCOpKernel : public framework::OpKernel<T> {
auto
output
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
auto
output
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
auto
in_dims
=
input
->
dims
();
auto
in_dims
=
input
->
dims
();
auto
w_dims
=
w
->
dims
();
auto
w_dims
=
w
->
dims
();
auto
out_dims
=
output
->
dims
();
int
M
=
framework
::
product
(
out_dims
)
/
out_dims
[
out_dims
.
size
()
-
1
];
const
T
*
input_data
=
input
->
data
<
T
>
();
const
T
*
input_data
=
input
->
data
<
T
>
();
const
T
*
w_data
=
w
->
data
<
T
>
();
const
T
*
w_data
=
w
->
data
<
T
>
();
T
*
output_data
=
output
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
output_data
=
output
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
blas
=
math
::
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
ctx
);
auto
blas
=
math
::
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
ctx
);
math
::
FCCompute
<
platform
::
CPUDeviceContext
,
T
>
(
math
::
FCCompute
<
platform
::
CPUDeviceContext
,
T
>
(
blas
,
in_dims
[
0
]
,
w_dims
[
1
],
w_dims
[
0
],
input_data
,
w_data
,
output_data
,
blas
,
M
,
w_dims
[
1
],
w_dims
[
0
],
input_data
,
w_data
,
output_data
,
bias
?
bias
->
data
<
T
>
()
:
NULL
);
bias
?
bias
->
data
<
T
>
()
:
NULL
);
// TODO(TJ): fuse act
// TODO(TJ): fuse act
...
...
paddle/fluid/operators/hash_op.cc
浏览文件 @
5670e9ea
...
@@ -38,7 +38,7 @@ class HashOp : public framework::OperatorWithKernel {
...
@@ -38,7 +38,7 @@ class HashOp : public framework::OperatorWithKernel {
std
::
vector
<
int64_t
>
out_dims
;
std
::
vector
<
int64_t
>
out_dims
;
out_dims
.
reserve
(
dims
.
size
()
+
1
);
out_dims
.
reserve
(
dims
.
size
()
+
1
);
// copy all dims except the last one
// copy all dims except the last one
for
(
size_
t
i
=
0u
;
i
!=
dims
.
size
()
-
1
;
++
i
)
{
for
(
in
t
i
=
0u
;
i
!=
dims
.
size
()
-
1
;
++
i
)
{
out_dims
.
emplace_back
(
dims
[
i
]);
out_dims
.
emplace_back
(
dims
[
i
]);
}
}
int
num_hash
=
ctx
->
Attrs
().
Get
<
int
>
(
"num_hash"
);
int
num_hash
=
ctx
->
Attrs
().
Get
<
int
>
(
"num_hash"
);
...
...
paddle/fluid/operators/math/jit_code.cc
浏览文件 @
5670e9ea
...
@@ -118,6 +118,39 @@ void VXXJitCode::generate() {
...
@@ -118,6 +118,39 @@ void VXXJitCode::generate() {
ret
();
ret
();
}
}
bool
ReluJitCode
::
init
(
int
d
)
{
return
MayIUse
(
avx
);
}
void
ReluJitCode
::
generate
()
{
int
offset
=
0
;
vxorps
(
ymm_zero
,
ymm_zero
,
ymm_zero
);
for
(
int
i
=
0
;
i
<
num_
/
AVX_FLOAT_BLOCK
;
++
i
)
{
vmovups
(
ymm_src
,
ptr
[
param1
+
offset
]);
vmaxps
(
ymm_dst
,
ymm_zero
,
ymm_src
);
vmovups
(
ptr
[
param2
+
offset
],
ymm_dst
);
offset
+=
sizeof
(
float
)
*
AVX_FLOAT_BLOCK
;
}
int
rest
=
num_
%
AVX_FLOAT_BLOCK
;
if
(
rest
>=
4
)
{
vmovups
(
xmm_src
,
ptr
[
param1
+
offset
]);
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_src
);
vmovups
(
ptr
[
param2
+
offset
],
xmm_dst
);
offset
+=
sizeof
(
float
)
*
4
;
rest
-=
4
;
}
if
(
rest
>=
2
)
{
vmovups
(
xmm_src
,
ptr
[
param1
+
offset
]);
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_src
);
vmovq
(
ptr
[
param2
+
offset
],
xmm_dst
);
offset
+=
sizeof
(
float
)
*
2
;
rest
-=
2
;
}
if
(
rest
>
0
)
{
vmovups
(
xmm_src
,
ptr
[
param1
+
offset
]);
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_src
);
vmovss
(
ptr
[
param2
+
offset
],
xmm_dst
);
}
ret
();
}
}
// namespace gen
}
// namespace gen
}
// namespace jitkernel
}
// namespace jitkernel
}
// namespace math
}
// namespace math
...
...
paddle/fluid/operators/math/jit_code.h
浏览文件 @
5670e9ea
...
@@ -85,6 +85,29 @@ class VXXJitCode : public JitCode {
...
@@ -85,6 +85,29 @@ class VXXJitCode : public JitCode {
ymm_t
ymm_zero
=
ymm_t
(
3
);
ymm_t
ymm_zero
=
ymm_t
(
3
);
};
};
class
ReluJitCode
:
public
JitCode
{
public:
DECLARE_JIT_CODE
(
ReluJitCode
);
explicit
ReluJitCode
(
int
d
,
size_t
code_size
=
256
*
1024
,
void
*
code_ptr
=
nullptr
)
:
JitCode
(
code_size
,
code_ptr
),
num_
(
d
)
{}
static
bool
init
(
int
d
);
void
generate
()
override
;
private:
int
num_
;
reg64_t
param1
{
abi_param1
};
reg64_t
param2
{
abi_param2
};
xmm_t
xmm_zero
=
xmm_t
(
0
);
xmm_t
xmm_src
=
xmm_t
(
1
);
xmm_t
xmm_dst
=
xmm_t
(
1
);
ymm_t
ymm_zero
=
ymm_t
(
0
);
ymm_t
ymm_src
=
ymm_t
(
1
);
ymm_t
ymm_dst
=
ymm_t
(
1
);
};
}
// namespace gen
}
// namespace gen
}
// namespace jitkernel
}
// namespace jitkernel
}
// namespace math
}
// namespace math
...
...
paddle/fluid/operators/math/jit_kernel.h
浏览文件 @
5670e9ea
...
@@ -97,37 +97,38 @@ class VAddBiasKernel : public Kernel {
...
@@ -97,37 +97,38 @@ class VAddBiasKernel : public Kernel {
template
<
typename
T
>
template
<
typename
T
>
class
VActKernel
:
public
Kernel
{
class
VActKernel
:
public
Kernel
{
public:
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
Deprecated
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
};
template
<
typename
T
>
template
<
typename
T
>
class
VReluKernel
:
public
VActKernel
<
T
>
{
class
VReluKernel
:
public
VActKernel
<
T
>
{
public:
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
ComputeDeprecated
(
const
T
*
x
,
T
*
y
)
const
=
0
;
void
(
*
Compute
)(
const
T
*
,
T
*
,
int
);
};
};
template
<
typename
T
>
template
<
typename
T
>
class
VIdentityKernel
:
public
VActKernel
<
T
>
{
class
VIdentityKernel
:
public
VActKernel
<
T
>
{
public:
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
Deprecated
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
};
template
<
typename
T
>
template
<
typename
T
>
class
VExpKernel
:
public
VActKernel
<
T
>
{
class
VExpKernel
:
public
VActKernel
<
T
>
{
public:
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
Deprecated
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
};
template
<
typename
T
>
template
<
typename
T
>
class
VSigmoidKernel
:
public
VActKernel
<
T
>
{
class
VSigmoidKernel
:
public
VActKernel
<
T
>
{
public:
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
Deprecated
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
};
template
<
typename
T
>
template
<
typename
T
>
class
VTanhKernel
:
public
VActKernel
<
T
>
{
class
VTanhKernel
:
public
VActKernel
<
T
>
{
public:
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
Deprecated
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
};
template
<
typename
T
>
template
<
typename
T
>
...
...
paddle/fluid/operators/math/jit_kernel_blas.cc
浏览文件 @
5670e9ea
...
@@ -71,6 +71,13 @@ void VAddBiasRefer(const T* a, const T* x, T* y, int n) {
...
@@ -71,6 +71,13 @@ void VAddBiasRefer(const T* a, const T* x, T* y, int n) {
}
}
}
}
template
<
typename
T
>
void
VReluRefer
(
const
T
*
x
,
T
*
y
,
int
n
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
x
[
i
]
>
0
?
x
[
i
]
:
0
;
}
}
#ifdef PADDLE_WITH_MKLML
#ifdef PADDLE_WITH_MKLML
template
<
typename
T
>
template
<
typename
T
>
void
VMulMKL
(
const
T
*
x
,
const
T
*
y
,
T
*
z
,
int
n
);
void
VMulMKL
(
const
T
*
x
,
const
T
*
y
,
T
*
z
,
int
n
);
...
@@ -344,124 +351,60 @@ bool VAddBiasKernelImpl<float>::useJIT(int d) {
...
@@ -344,124 +351,60 @@ bool VAddBiasKernelImpl<float>::useJIT(int d) {
}
}
#endif
#endif
#undef DECLARE_STATIC_FUNC
REGISTER_JITKERNEL
(
vmul
,
VMulKernel
);
REGISTER_JITKERNEL
(
vadd
,
VAddKernel
);
REGISTER_JITKERNEL
(
vaddrelu
,
VAddReluKernel
);
REGISTER_JITKERNEL
(
vscal
,
VScalKernel
);
REGISTER_JITKERNEL
(
vaddbias
,
VAddBiasKernel
);
/* VRelu JitKernel */
/* VRelu JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
template
<
typename
T
>
class
VReluKernelImpl
:
public
VReluKernel
<
T
>
{
class
VReluKernelImpl
:
public
VReluKernel
<
T
>
{
public:
public:
explicit
VReluKernelImpl
(
int
d
)
:
VReluKernel
<
T
>
()
{
this
->
num_
=
d
;
}
DECLARE_STATIC_FUNC
;
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
explicit
VReluKernelImpl
(
int
d
)
:
VReluKernel
<
T
>
()
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
this
->
num_
=
d
;
// TODO(TJ): remove me when ComputeDeprecated done
y
[
i
]
=
x
[
i
]
>
0
?
x
[
i
]
:
0
;
#ifdef PADDLE_WITH_XBYAK
if
(
useJIT
(
d
))
{
size_t
sz
=
96
/*init*/
+
d
/
AVX_FLOAT_BLOCK
*
4
/* instructions*/
*
8
/*everage byte for each instruction*/
;
jitcode_
.
reset
(
new
gen
::
ReluJitCode
(
d
,
sz
>
4096
?
sz
:
4096
));
this
->
Compute
=
jitcode_
->
getCode
<
void
(
*
)(
const
T
*
,
T
*
,
int
)
>
();
return
;
}
}
}
#endif
};
#define INTRI8_FLOAT(isa) \
template <> \
void VReluKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \
__m256 tmp = _mm256_loadu_ps(x); \
tmp = _mm256_max_ps(tmp, _mm256_setzero_ps()); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VReluKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#define INTRI_GT8LT16_FLOAT(isa) \
this
->
Compute
=
VReluRefer
<
T
>
;
template <> \
VReluKernelImpl<float, isa, kGT8LT16>::VReluKernelImpl(int d) \
: VReluKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - AVX_FLOAT_BLOCK; \
} \
template <> \
void VReluKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + this->rest_); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + this->rest_, tmp1); \
}
}
void
ComputeDeprecated
(
const
T
*
x
,
T
*
y
)
const
override
{
#define INTRI_GT16_FLOAT(isa) \
VReluRefer
(
x
,
y
,
this
->
num_
);
template <> \
VReluKernelImpl<float, isa, kGT16>::VReluKernelImpl(int d) \
: VReluKernel<float>() { \
this->num_ = d; \
this->end_ = d - d % AVX_FLOAT_BLOCK; \
this->rest_ = d - AVX_FLOAT_BLOCK; \
} \
template <> \
void VReluKernelImpl<float, isa, kGT16>::Compute(const float* x, float* y) \
const { \
__m256 zeros = _mm256_setzero_ps(); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + i, tmp); \
} \
__m256 tmp = _mm256_loadu_ps(x + this->rest_); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + this->rest_, tmp); \
}
}
#ifdef PADDLE_WITH_XBYAK
#ifdef __AVX__
private:
INTRI8_FLOAT
(
jit
::
avx
);
std
::
unique_ptr
<
gen
::
ReluJitCode
>
jitcode_
{
nullptr
};
INTRI16_FLOAT
(
jit
::
avx
);
INTRI_GT8LT16_FLOAT
(
jit
::
avx
);
INTRI_GT16_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
INTRI16_FLOAT
(
jit
::
avx2
);
INTRI_GT8LT16_FLOAT
(
jit
::
avx2
);
INTRI_GT16_FLOAT
(
jit
::
avx2
);
#endif
#endif
#ifdef __AVX512F__
};
// TODO(TJ): refine avx512
INTRI8_FLOAT
(
jit
::
avx512f
);
#ifdef PADDLE_WITH_XBYAK
INTRI16_FLOAT
(
jit
::
avx512f
);
template
<
>
INTRI_GT8LT16_FLOAT
(
jit
::
avx512f
);
bool
VReluKernelImpl
<
float
>::
useJIT
(
int
d
)
{
INTRI_GT16_FLOAT
(
jit
::
avx512f
);
return
gen
::
ReluJitCode
::
init
(
d
);
}
#endif
#endif
#undef INTRI8_FLOAT
#undef DECLARE_STATIC_FUNC
#undef INTRI16_FLOAT
#undef INTRI_GT8LT16_FLOAT
REGISTER_JITKERNEL
(
vmul
,
VMulKernel
);
#undef INTRI_GT16_FLOAT
REGISTER_JITKERNEL
(
vadd
,
VAddKernel
);
REGISTER_JITKERNEL
(
vaddrelu
,
VAddReluKernel
);
REGISTER_JITKERNEL
(
vscal
,
VScalKernel
);
REGISTER_JITKERNEL
(
vaddbias
,
VAddBiasKernel
);
REGISTER_JITKERNEL
(
vrelu
,
VReluKernel
);
/* An empty JitKernel */
/* An empty JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VIdentityKernelImpl
:
public
VIdentityKernel
<
T
>
{
class
VIdentityKernelImpl
:
public
VIdentityKernel
<
T
>
{
public:
public:
explicit
VIdentityKernelImpl
(
int
d
)
:
VIdentityKernel
<
T
>
()
{
this
->
num_
=
d
;
}
explicit
VIdentityKernelImpl
(
int
d
)
:
VIdentityKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{}
void
Compute
Deprecated
(
const
T
*
x
,
T
*
y
)
const
override
{}
};
};
REGISTER_JITKERNEL_DEPRECATED
(
vrelu
,
VReluKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
videntity
,
VIdentityKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
videntity
,
VIdentityKernel
);
}
// namespace jitkernel
}
// namespace jitkernel
...
...
paddle/fluid/operators/math/jit_kernel_exp.cc
浏览文件 @
5670e9ea
...
@@ -35,7 +35,7 @@ template <typename T, jit::cpu_isa_t isa, jit_block>
...
@@ -35,7 +35,7 @@ template <typename T, jit::cpu_isa_t isa, jit_block>
class
VExpKernelImpl
:
public
VExpKernel
<
T
>
{
class
VExpKernelImpl
:
public
VExpKernel
<
T
>
{
public:
public:
explicit
VExpKernelImpl
(
int
d
)
:
VExpKernel
<
T
>
()
{
this
->
num_
=
d
;
}
explicit
VExpKernelImpl
(
int
d
)
:
VExpKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
void
Compute
Deprecated
(
const
T
*
x
,
T
*
y
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
std
::
exp
(
x
[
i
]);
y
[
i
]
=
std
::
exp
(
x
[
i
]);
}
}
...
@@ -43,18 +43,18 @@ class VExpKernelImpl : public VExpKernel<T> {
...
@@ -43,18 +43,18 @@ class VExpKernelImpl : public VExpKernel<T> {
};
};
#ifdef PADDLE_WITH_MKLML
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \
#define MKL_FLOAT(isa, block)
\
template <> \
template <>
\
void VExpKernelImpl<float, isa, block>::Compute
(const float* x, float* y)
\
void VExpKernelImpl<float, isa, block>::Compute
Deprecated(const float* x,
\
const {
\
float* y) const {
\
platform::dynload::vsExp(this->num_, x, y); \
platform::dynload::vsExp(this->num_, x, y);
\
}
}
#define MKL_DOUBLE(isa, block)
\
#define MKL_DOUBLE(isa, block) \
template <>
\
template <> \
void VExpKernelImpl<double, isa, block>::Compute
(const double* x, double* y)
\
void VExpKernelImpl<double, isa, block>::Compute
Deprecated(
\
const
{
\
const
double* x, double* y) const {
\
platform::dynload::vdExp(this->num_, x, y);
\
platform::dynload::vdExp(this->num_, x, y); \
}
}
FOR_EACH_ISA
(
MKL_FLOAT
,
kLT8
);
FOR_EACH_ISA
(
MKL_FLOAT
,
kLT8
);
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT8LT16
);
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT8LT16
);
...
@@ -211,24 +211,24 @@ __m256 ExpAVX2(__m256 x) {
...
@@ -211,24 +211,24 @@ __m256 ExpAVX2(__m256 x) {
}
// namespace detail
}
// namespace detail
#define INTRI8_FLOAT(isa, expisa) \
#define INTRI8_FLOAT(isa, expisa)
\
template <> \
template <>
\
void VExpKernelImpl<float, isa, kEQ8>::Compute
(const float* x, float* y)
\
void VExpKernelImpl<float, isa, kEQ8>::Compute
Deprecated(const float* x,
\
const {
\
float* y) const {
\
__m256 tmp = _mm256_loadu_ps(x); \
__m256 tmp = _mm256_loadu_ps(x);
\
_mm256_storeu_ps(y, expisa(tmp)); \
_mm256_storeu_ps(y, expisa(tmp));
\
}
}
#define INTRI16_FLOAT(isa, expisa) \
#define INTRI16_FLOAT(isa, expisa)
\
template <> \
template <>
\
void VExpKernelImpl<float, isa, kEQ16>::Compute
(const float* x, float* y)
\
void VExpKernelImpl<float, isa, kEQ16>::Compute
Deprecated(const float* x,
\
const {
\
float* y) const {
\
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp0 = _mm256_loadu_ps(x);
\
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
__m256 tmp1 = _mm256_loadu_ps(x + 8);
\
tmp0 = expisa(tmp0); \
tmp0 = expisa(tmp0);
\
tmp1 = expisa(tmp1); \
tmp1 = expisa(tmp1);
\
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y, tmp0);
\
_mm256_storeu_ps(y + 8, tmp1); \
_mm256_storeu_ps(y + 8, tmp1);
\
}
}
#ifdef __AVX__
#ifdef __AVX__
...
@@ -260,14 +260,14 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
...
@@ -260,14 +260,14 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
this
->
num_
=
d
;
this
->
num_
=
d
;
vexp_
=
KernelPool
::
Instance
().
template
Get
<
VExpKernel
<
T
>
>
(
d
);
vexp_
=
KernelPool
::
Instance
().
template
Get
<
VExpKernel
<
T
>
>
(
d
);
}
}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
void
Compute
Deprecated
(
const
T
*
x
,
T
*
y
)
const
override
{
const
T
min
=
SIGMOID_THRESHOLD_MIN
;
const
T
min
=
SIGMOID_THRESHOLD_MIN
;
const
T
max
=
SIGMOID_THRESHOLD_MAX
;
const
T
max
=
SIGMOID_THRESHOLD_MAX
;
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
static_cast
<
T
>
(
0
)
-
y
[
i
];
y
[
i
]
=
static_cast
<
T
>
(
0
)
-
y
[
i
];
}
}
vexp_
->
Compute
(
y
,
y
);
vexp_
->
Compute
Deprecated
(
y
,
y
);
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
y
[
i
]);
y
[
i
]
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
y
[
i
]);
}
}
...
@@ -285,30 +285,30 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
...
@@ -285,30 +285,30 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \
tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp)
tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp)
#define INTRI8_FLOAT(isa, expisa)
\
#define INTRI8_FLOAT(isa, expisa) \
template <>
\
template <> \
void VSigmoidKernelImpl<float, isa, kEQ8>::Compute
(const float* x, float* y)
\
void VSigmoidKernelImpl<float, isa, kEQ8>::Compute
Deprecated(
\
const
{
\
const
float* x, float* y) const {
\
/* TODO(TJ): try to use static const*/
\
/* TODO(TJ): try to use static const*/
\
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
\
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
\
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x);
\
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max, expisa);
\
INTRI_SIGMOID(tmp, min, max, expisa); \
_mm256_storeu_ps(y, tmp);
\
_mm256_storeu_ps(y, tmp); \
}
}
#define INTRI16_FLOAT(isa, expisa)
\
#define INTRI16_FLOAT(isa, expisa) \
template <>
\
template <> \
void VSigmoidKernelImpl<float, isa, kEQ16>::Compute
(const float* x,
\
void VSigmoidKernelImpl<float, isa, kEQ16>::Compute
Deprecated(
\
float* y) const {
\
const float* x, float* y) const {
\
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
\
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
\
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp0 = _mm256_loadu_ps(x);
\
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8);
\
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_SIGMOID(tmp0, min, max, expisa);
\
INTRI_SIGMOID(tmp0, min, max, expisa); \
INTRI_SIGMOID(tmp1, min, max, expisa);
\
INTRI_SIGMOID(tmp1, min, max, expisa); \
_mm256_storeu_ps(y, tmp0);
\
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1);
\
_mm256_storeu_ps(y + 8, tmp1); \
}
}
#define INTRI_GT8LT16_FLOAT(isa, expisa) \
#define INTRI_GT8LT16_FLOAT(isa, expisa) \
...
@@ -322,8 +322,8 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
...
@@ -322,8 +322,8 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
} \
} \
template <> \
template <> \
void VSigmoidKernelImpl<float, isa, kGT8LT16>::Compute
(const float* x,
\
void VSigmoidKernelImpl<float, isa, kGT8LT16>::Compute
Deprecated(
\
float* y) const {
\
const float* x, float* y) const {
\
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
__m256 tmp = _mm256_loadu_ps(x); \
...
@@ -335,7 +335,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
...
@@ -335,7 +335,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
y[i] = 0.f - y[i]; \
} \
} \
vexp_->Compute
(y + this->end_, y + this->end_);
\
vexp_->Compute
Deprecated(y + this->end_, y + this->end_);
\
for (int i = this->end_; i < this->num_; ++i) { \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
y[i] = 1.f / (1.f + y[i]); \
} \
} \
...
@@ -352,8 +352,8 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
...
@@ -352,8 +352,8 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
} \
} \
template <> \
template <> \
void VSigmoidKernelImpl<float, isa, kGT16>::Compute
(const float* x,
\
void VSigmoidKernelImpl<float, isa, kGT16>::Compute
Deprecated(
\
float* y) const {
\
const float* x, float* y) const {
\
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
...
@@ -367,7 +367,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
...
@@ -367,7 +367,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
y[i] = 0.f - y[i]; \
} \
} \
vexp_->Compute
(y + this->end_, y + this->end_);
\
vexp_->Compute
Deprecated(y + this->end_, y + this->end_);
\
for (int i = this->end_; i < this->num_; ++i) { \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
y[i] = 1.f / (1.f + y[i]); \
} \
} \
...
@@ -408,10 +408,10 @@ class VTanhKernelImpl : public VTanhKernel<T> {
...
@@ -408,10 +408,10 @@ class VTanhKernelImpl : public VTanhKernel<T> {
vsigmoid_
=
KernelPool
::
Instance
().
template
Get
<
VSigmoidKernel
<
T
>
>
(
d
);
vsigmoid_
=
KernelPool
::
Instance
().
template
Get
<
VSigmoidKernel
<
T
>
>
(
d
);
vaddbias_
=
KernelPool
::
Instance
().
template
Get
<
VAddBiasKernel
<
T
>
>
(
d
);
vaddbias_
=
KernelPool
::
Instance
().
template
Get
<
VAddBiasKernel
<
T
>
>
(
d
);
}
}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
void
Compute
Deprecated
(
const
T
*
x
,
T
*
y
)
const
override
{
const
T
a
=
static_cast
<
T
>
(
2
),
b
=
static_cast
<
T
>
(
-
1
);
const
T
a
=
static_cast
<
T
>
(
2
),
b
=
static_cast
<
T
>
(
-
1
);
vscal_
->
Compute
(
&
a
,
x
,
y
,
this
->
num_
);
vscal_
->
Compute
(
&
a
,
x
,
y
,
this
->
num_
);
vsigmoid_
->
Compute
(
y
,
y
);
vsigmoid_
->
Compute
Deprecated
(
y
,
y
);
vscal_
->
Compute
(
&
a
,
y
,
y
,
this
->
num_
);
vscal_
->
Compute
(
&
a
,
y
,
y
,
this
->
num_
);
vaddbias_
->
Compute
(
&
b
,
y
,
y
,
this
->
num_
);
vaddbias_
->
Compute
(
&
b
,
y
,
y
,
this
->
num_
);
}
}
...
@@ -430,25 +430,25 @@ class VTanhKernelImpl : public VTanhKernel<T> {
...
@@ -430,25 +430,25 @@ class VTanhKernelImpl : public VTanhKernel<T> {
tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \
tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \
tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f))
tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f))
#define INTRI8_FLOAT(isa, expisa) \
#define INTRI8_FLOAT(isa, expisa)
\
template <> \
template <>
\
void VTanhKernelImpl<float, isa, kEQ8>::Compute
(const float* x, float* y)
\
void VTanhKernelImpl<float, isa, kEQ8>::Compute
Deprecated(const float* x,
\
const {
\
float* y) const {
\
__m256 tmp = _mm256_loadu_ps(x); \
__m256 tmp = _mm256_loadu_ps(x);
\
INTRI_VTANH(tmp, expisa); \
INTRI_VTANH(tmp, expisa);
\
_mm256_storeu_ps(y, tmp); \
_mm256_storeu_ps(y, tmp);
\
}
}
#define INTRI16_FLOAT(isa, expisa) \
#define INTRI16_FLOAT(isa, expisa)
\
template <> \
template <>
\
void VTanhKernelImpl<float, isa, kEQ16>::Compute
(const float* x, float* y)
\
void VTanhKernelImpl<float, isa, kEQ16>::Compute
Deprecated(const float* x,
\
const {
\
float* y) const {
\
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp0 = _mm256_loadu_ps(x);
\
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
__m256 tmp1 = _mm256_loadu_ps(x + 8);
\
INTRI_VTANH(tmp0, expisa); \
INTRI_VTANH(tmp0, expisa);
\
INTRI_VTANH(tmp1, expisa); \
INTRI_VTANH(tmp1, expisa);
\
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y, tmp0);
\
_mm256_storeu_ps(y + 8, tmp1); \
_mm256_storeu_ps(y + 8, tmp1);
\
}
}
#define INTRI_GT8LT16_FLOAT(isa, expisa) \
#define INTRI_GT8LT16_FLOAT(isa, expisa) \
...
@@ -466,8 +466,8 @@ class VTanhKernelImpl : public VTanhKernel<T> {
...
@@ -466,8 +466,8 @@ class VTanhKernelImpl : public VTanhKernel<T> {
this->rest_); \
this->rest_); \
} \
} \
template <> \
template <> \
void VTanhKernelImpl<float, isa, kGT8LT16>::Compute
(const float* x,
\
void VTanhKernelImpl<float, isa, kGT8LT16>::Compute
Deprecated(
\
float* y) const {
\
const float* x, float* y) const {
\
__m256 tmp = _mm256_loadu_ps(x); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp, expisa); \
INTRI_VTANH(tmp, expisa); \
_mm256_storeu_ps(y, tmp); \
_mm256_storeu_ps(y, tmp); \
...
@@ -475,40 +475,40 @@ class VTanhKernelImpl : public VTanhKernel<T> {
...
@@ -475,40 +475,40 @@ class VTanhKernelImpl : public VTanhKernel<T> {
y += AVX_FLOAT_BLOCK; \
y += AVX_FLOAT_BLOCK; \
const float a = 2.f, b = -1.f; \
const float a = 2.f, b = -1.f; \
vscal_->Compute(&a, x, y, this->num_); \
vscal_->Compute(&a, x, y, this->num_); \
vsigmoid_->Compute
(y, y);
\
vsigmoid_->Compute
Deprecated(y, y);
\
vscal_->Compute(&a, y, y, this->num_); \
vscal_->Compute(&a, y, y, this->num_); \
vaddbias_->Compute(&b, y, y, this->num_); \
vaddbias_->Compute(&b, y, y, this->num_); \
}
}
#define INTRI_GT16_FLOAT(isa, expisa) \
#define INTRI_GT16_FLOAT(isa, expisa)
\
template <> \
template <>
\
VTanhKernelImpl<float, isa, kGT16>::VTanhKernelImpl(int d) \
VTanhKernelImpl<float, isa, kGT16>::VTanhKernelImpl(int d)
\
: VTanhKernel<float>() { \
: VTanhKernel<float>() {
\
this->num_ = d; \
this->num_ = d;
\
this->rest_ = d % AVX_FLOAT_BLOCK; \
this->rest_ = d % AVX_FLOAT_BLOCK;
\
this->end_ = d - this->rest_; \
this->end_ = d - this->rest_;
\
vscal_ = \
vscal_ =
\
KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \
KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_);
\
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>( \
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>(
\
this->rest_); \
this->rest_);
\
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>( \
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>(
\
this->rest_); \
this->rest_);
\
} \
}
\
template <> \
template <>
\
void VTanhKernelImpl<float, isa, kGT16>::Compute
(const float* x, float* y)
\
void VTanhKernelImpl<float, isa, kGT16>::Compute
Deprecated(const float* x,
\
const {
\
float* y) const {
\
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) {
\
__m256 tmp = _mm256_loadu_ps(x + i); \
__m256 tmp = _mm256_loadu_ps(x + i);
\
INTRI_VTANH(tmp, expisa); \
INTRI_VTANH(tmp, expisa);
\
_mm256_storeu_ps(y + i, tmp); \
_mm256_storeu_ps(y + i, tmp);
\
} \
}
\
x += this->end_; \
x += this->end_;
\
y += this->end_; \
y += this->end_;
\
const float a = 2.f, b = -1.f; \
const float a = 2.f, b = -1.f;
\
vscal_->Compute(&a, x, y, this->num_); \
vscal_->Compute(&a, x, y, this->num_);
\
vsigmoid_->Compute
(y, y);
\
vsigmoid_->Compute
Deprecated(y, y);
\
vscal_->Compute(&a, y, y, this->num_); \
vscal_->Compute(&a, y, y, this->num_);
\
vaddbias_->Compute(&b, y, y, this->num_); \
vaddbias_->Compute(&b, y, y, this->num_);
\
}
}
#ifdef __AVX__
#ifdef __AVX__
...
...
paddle/fluid/operators/math/jit_kernel_rnn.cc
浏览文件 @
5670e9ea
...
@@ -175,26 +175,26 @@ class LSTMKernelImpl : public LSTMKernel<T> {
...
@@ -175,26 +175,26 @@ class LSTMKernelImpl : public LSTMKernel<T> {
void
ComputeCtHt
(
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
,
void
ComputeCtHt
(
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
,
T
*
checked
)
const
override
{
T
*
checked
)
const
override
{
// gates: W_ch, W_ih, W_fh, W_oh
// gates: W_ch, W_ih, W_fh, W_oh
act_gate_d3_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
act_gate_d3_
->
Compute
Deprecated
(
gates
+
d_
,
gates
+
d_
);
/* C_t = C_t-1 * fgated + cand_gated * igated */
/* C_t = C_t-1 * fgated + cand_gated * igated */
act_cand_d_
->
Compute
(
gates
,
gates
);
act_cand_d_
->
Compute
Deprecated
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
,
d_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
,
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
,
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d2_
,
ct
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d2_
,
ct
,
d_
);
/* H_t = act_cell(C_t) * ogated */
/* H_t = act_cell(C_t) * ogated */
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
act_cell_d_
->
Compute
Deprecated
(
ct
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
,
d_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
,
d_
);
}
}
void
ComputeC1H1
(
T
*
gates
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
)
const
override
{
void
ComputeC1H1
(
T
*
gates
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
)
const
override
{
/* C_t = igated * cgated*/
/* C_t = igated * cgated*/
act_gate_d_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
act_gate_d_
->
Compute
Deprecated
(
gates
+
d_
,
gates
+
d_
);
act_cand_d_
->
Compute
(
gates
,
gates
);
act_cand_d_
->
Compute
Deprecated
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
ct
,
d_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
ct
,
d_
);
/* H_t = act_cell(C_t) * ogated */
/* H_t = act_cell(C_t) * ogated */
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
);
act_gate_d_
->
Compute
Deprecated
(
gates
+
d3_
,
gates
+
d3_
);
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
act_cell_d_
->
Compute
Deprecated
(
ct
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
,
d_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
,
d_
);
}
}
...
@@ -292,32 +292,32 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
...
@@ -292,32 +292,32 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
vmul_d_
->
Compute
(
wp_data
,
ct_1
,
checked
,
d_
);
vmul_d_
->
Compute
(
wp_data
,
ct_1
,
checked
,
d_
);
vmul_d_
->
Compute
(
wp_data
+
d_
,
ct_1
,
checked
+
d_
,
d_
);
vmul_d_
->
Compute
(
wp_data
+
d_
,
ct_1
,
checked
+
d_
,
d_
);
vadd_d2_
->
Compute
(
checked
,
gates
+
d_
,
gates
+
d_
,
d2_
);
vadd_d2_
->
Compute
(
checked
,
gates
+
d_
,
gates
+
d_
,
d2_
);
act_gate_d2_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
act_gate_d2_
->
Compute
Deprecated
(
gates
+
d_
,
gates
+
d_
);
/* C_t = C_t-1 * fgated + cand_gated * igated*/
/* C_t = C_t-1 * fgated + cand_gated * igated*/
act_cand_d_
->
Compute
(
gates
,
gates
);
act_cand_d_
->
Compute
Deprecated
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
,
d_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
,
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
,
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d2_
,
ct
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d2_
,
ct
,
d_
);
/* get ogated*/
/* get ogated*/
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
,
d_
);
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d3_
,
gates
+
d3_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d3_
,
gates
+
d3_
,
d_
);
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
);
act_gate_d_
->
Compute
Deprecated
(
gates
+
d3_
,
gates
+
d3_
);
/* H_t = act_cell(C_t) * ogated */
/* H_t = act_cell(C_t) * ogated */
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
act_cell_d_
->
Compute
Deprecated
(
ct
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
,
d_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
,
d_
);
}
}
void
ComputeC1H1
(
T
*
gates
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
)
const
override
{
void
ComputeC1H1
(
T
*
gates
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
)
const
override
{
/* C_t = igated * cgated*/
/* C_t = igated * cgated*/
act_gate_d_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
act_gate_d_
->
Compute
Deprecated
(
gates
+
d_
,
gates
+
d_
);
act_cand_d_
->
Compute
(
gates
,
gates
);
act_cand_d_
->
Compute
Deprecated
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
ct
,
d_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
ct
,
d_
);
/* get outgated, put W_oc * C_t on igated */
/* get outgated, put W_oc * C_t on igated */
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
,
d_
);
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d3_
,
gates
+
d3_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d3_
,
gates
+
d3_
,
d_
);
/* H_t = act_cell(C_t) * ogated */
/* H_t = act_cell(C_t) * ogated */
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
);
act_gate_d_
->
Compute
Deprecated
(
gates
+
d3_
,
gates
+
d3_
);
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
act_cell_d_
->
Compute
Deprecated
(
ct
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
,
d_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
,
d_
);
}
}
...
@@ -376,20 +376,20 @@ class GRUKernelImpl : public GRUKernel<T> {
...
@@ -376,20 +376,20 @@ class GRUKernelImpl : public GRUKernel<T> {
}
}
void
ComputeH1
(
T
*
gates
,
T
*
ht
)
const
override
{
void
ComputeH1
(
T
*
gates
,
T
*
ht
)
const
override
{
act_gate_d_
->
Compute
(
gates
,
gates
);
act_gate_d_
->
Compute
Deprecated
(
gates
,
gates
);
act_state_d_
->
Compute
(
gates
+
d2_
,
gates
+
d2_
);
act_state_d_
->
Compute
Deprecated
(
gates
+
d2_
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d2_
,
ht
,
d_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d2_
,
ht
,
d_
);
}
}
void
ComputeHtPart1
(
T
*
gates
,
const
T
*
ht_1
,
T
*
ht
)
const
override
{
void
ComputeHtPart1
(
T
*
gates
,
const
T
*
ht_1
,
T
*
ht
)
const
override
{
// W: {W_update, W_reset; W_state}
// W: {W_update, W_reset; W_state}
act_gate_d2_
->
Compute
(
gates
,
gates
);
act_gate_d2_
->
Compute
Deprecated
(
gates
,
gates
);
vmul_d_
->
Compute
(
ht_1
,
gates
+
d_
,
ht
,
d_
);
vmul_d_
->
Compute
(
ht_1
,
gates
+
d_
,
ht
,
d_
);
}
}
void
ComputeHtPart2
(
T
*
gates
,
const
T
*
ht_1
,
T
*
ht
)
const
override
{
void
ComputeHtPart2
(
T
*
gates
,
const
T
*
ht_1
,
T
*
ht
)
const
override
{
T
*
y
=
gates
+
d2_
;
T
*
y
=
gates
+
d2_
;
act_state_d_
->
Compute
(
y
,
y
);
act_state_d_
->
Compute
Deprecated
(
y
,
y
);
// out = zt*ht~ + (1-zt)*ht_1
// out = zt*ht~ + (1-zt)*ht_1
for
(
int
i
=
0
;
i
<
d_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
d_
;
++
i
)
{
ht
[
i
]
=
gates
[
i
]
*
y
[
i
]
+
(
static_cast
<
T
>
(
1
)
-
gates
[
i
])
*
ht_1
[
i
];
ht
[
i
]
=
gates
[
i
]
*
y
[
i
]
+
(
static_cast
<
T
>
(
1
)
-
gates
[
i
])
*
ht_1
[
i
];
...
...
paddle/fluid/operators/math/jit_kernel_test.cc
浏览文件 @
5670e9ea
...
@@ -92,7 +92,7 @@ TEST(JitKernel, vrelu) {
...
@@ -92,7 +92,7 @@ TEST(JitKernel, vrelu) {
#endif
#endif
auto
ttgts
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
ztgt_data
);
ker
->
Compute
(
x_data
,
ztgt_data
,
d
);
}
}
auto
ttgte
=
GetCurrentUS
();
auto
ttgte
=
GetCurrentUS
();
VLOG
(
30
)
<<
"Vec size "
<<
d
VLOG
(
30
)
<<
"Vec size "
<<
d
...
@@ -181,7 +181,7 @@ TEST(JitKernel, vexp) {
...
@@ -181,7 +181,7 @@ TEST(JitKernel, vexp) {
auto
ttgts
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
ztgt_data
);
ker
->
Compute
Deprecated
(
x_data
,
ztgt_data
);
}
}
auto
ttgte
=
GetCurrentUS
();
auto
ttgte
=
GetCurrentUS
();
...
@@ -222,7 +222,7 @@ void vsigmoid_better(
...
@@ -222,7 +222,7 @@ void vsigmoid_better(
y
[
i
]
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
0.
f
-
y
[
i
];
y
[
i
]
=
0.
f
-
y
[
i
];
}
}
vexp
->
Compute
(
y
,
y
);
vexp
->
Compute
Deprecated
(
y
,
y
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
1.
f
/
(
1.
f
+
y
[
i
]);
y
[
i
]
=
1.
f
/
(
1.
f
+
y
[
i
]);
}
}
...
@@ -253,7 +253,7 @@ TEST(JitKernel, vsigmoid) {
...
@@ -253,7 +253,7 @@ TEST(JitKernel, vsigmoid) {
auto
trefe
=
GetCurrentUS
();
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
ztgt_data
);
ker
->
Compute
Deprecated
(
x_data
,
ztgt_data
);
}
}
auto
ttgte
=
GetCurrentUS
();
auto
ttgte
=
GetCurrentUS
();
...
@@ -287,7 +287,7 @@ void vtanh_better(
...
@@ -287,7 +287,7 @@ void vtanh_better(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
const
int
n
,
const
float
*
x
,
float
*
y
)
{
const
float
a
=
2.
f
,
b
=
-
1.
f
;
const
float
a
=
2.
f
,
b
=
-
1.
f
;
vscal
->
Compute
(
&
a
,
x
,
y
,
n
);
vscal
->
Compute
(
&
a
,
x
,
y
,
n
);
vsigmoid
->
Compute
(
y
,
y
);
vsigmoid
->
Compute
Deprecated
(
y
,
y
);
vscal
->
Compute
(
&
a
,
y
,
y
,
n
);
vscal
->
Compute
(
&
a
,
y
,
y
,
n
);
vaddbias
->
Compute
(
&
b
,
y
,
y
,
n
);
vaddbias
->
Compute
(
&
b
,
y
,
y
,
n
);
}
}
...
@@ -321,7 +321,7 @@ TEST(JitKernel, vtanh) {
...
@@ -321,7 +321,7 @@ TEST(JitKernel, vtanh) {
auto
trefe
=
GetCurrentUS
();
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
ztgt_data
);
ker
->
Compute
Deprecated
(
x_data
,
ztgt_data
);
}
}
auto
ttgte
=
GetCurrentUS
();
auto
ttgte
=
GetCurrentUS
();
...
@@ -344,8 +344,8 @@ void lstm_ctht_ref(
...
@@ -344,8 +344,8 @@ void lstm_ctht_ref(
const
std
::
shared_ptr
<
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VExpKernel
<
float
>>&
vexp_1
,
const
paddle
::
operators
::
math
::
jitkernel
::
VExpKernel
<
float
>>&
vexp_1
,
const
int
d
,
float
*
gates
,
const
float
*
ct_1
,
float
*
ct
,
float
*
ht
)
{
const
int
d
,
float
*
gates
,
const
float
*
ct_1
,
float
*
ct
,
float
*
ht
)
{
vsigmoid_3d
->
Compute
(
gates
+
d
,
gates
+
d
);
vsigmoid_3d
->
Compute
Deprecated
(
gates
+
d
,
gates
+
d
);
vtanh_d
->
Compute
(
gates
,
gates
);
vtanh_d
->
Compute
Deprecated
(
gates
,
gates
);
const
float
*
i
=
gates
+
d
,
*
f
=
gates
+
d
*
2
,
*
o
=
gates
+
d
*
3
;
const
float
*
i
=
gates
+
d
,
*
f
=
gates
+
d
*
2
,
*
o
=
gates
+
d
*
3
;
const
float
min
=
SIGMOID_THRESHOLD_MIN
;
const
float
min
=
SIGMOID_THRESHOLD_MIN
;
const
float
max
=
SIGMOID_THRESHOLD_MAX
;
const
float
max
=
SIGMOID_THRESHOLD_MAX
;
...
@@ -355,7 +355,7 @@ void lstm_ctht_ref(
...
@@ -355,7 +355,7 @@ void lstm_ctht_ref(
// H_t = act_cell(C_t) * ogated
// H_t = act_cell(C_t) * ogated
float
tmp
=
ct
[
k
]
*
2
;
float
tmp
=
ct
[
k
]
*
2
;
tmp
=
0.
f
-
((
tmp
<
min
)
?
min
:
((
tmp
>
max
)
?
max
:
tmp
));
tmp
=
0.
f
-
((
tmp
<
min
)
?
min
:
((
tmp
>
max
)
?
max
:
tmp
));
vexp_1
->
Compute
(
&
tmp
,
&
tmp
);
vexp_1
->
Compute
Deprecated
(
&
tmp
,
&
tmp
);
tmp
=
2.
f
/
(
1.
f
+
tmp
)
-
1.
f
;
tmp
=
2.
f
/
(
1.
f
+
tmp
)
-
1.
f
;
ht
[
k
]
=
tmp
*
o
[
k
];
ht
[
k
]
=
tmp
*
o
[
k
];
}
}
...
@@ -373,13 +373,13 @@ void lstm_ctht_better(
...
@@ -373,13 +373,13 @@ void lstm_ctht_better(
const
paddle
::
operators
::
math
::
jitkernel
::
VAddKernel
<
float
>>&
vadd_d
,
const
paddle
::
operators
::
math
::
jitkernel
::
VAddKernel
<
float
>>&
vadd_d
,
const
int
d
,
float
*
gates
,
const
float
*
ct_1
,
float
*
ct
,
float
*
ht
)
{
const
int
d
,
float
*
gates
,
const
float
*
ct_1
,
float
*
ct
,
float
*
ht
)
{
int
d2
=
d
*
2
;
int
d2
=
d
*
2
;
vsigmoid_3d
->
Compute
(
gates
+
d
,
gates
+
d
);
vsigmoid_3d
->
Compute
Deprecated
(
gates
+
d
,
gates
+
d
);
vtanh_d
->
Compute
(
gates
,
gates
);
vtanh_d
->
Compute
Deprecated
(
gates
,
gates
);
vmul_d
->
Compute
(
gates
,
gates
+
d
,
gates
+
d
,
d
);
vmul_d
->
Compute
(
gates
,
gates
+
d
,
gates
+
d
,
d
);
vmul_d
->
Compute
(
ct_1
,
gates
+
d2
,
gates
+
d2
,
d
);
vmul_d
->
Compute
(
ct_1
,
gates
+
d2
,
gates
+
d2
,
d
);
vadd_d
->
Compute
(
gates
+
d
,
gates
+
d2
,
ct
,
d
);
vadd_d
->
Compute
(
gates
+
d
,
gates
+
d2
,
ct
,
d
);
/* H_t = act_cell(C_t) * ogated */
/* H_t = act_cell(C_t) * ogated */
vtanh_d
->
Compute
(
ct
,
gates
+
d2
);
vtanh_d
->
Compute
Deprecated
(
ct
,
gates
+
d2
);
vmul_d
->
Compute
(
gates
+
d2
,
gates
+
d
*
3
,
ht
,
d
);
vmul_d
->
Compute
(
gates
+
d2
,
gates
+
d
*
3
,
ht
,
d
);
}
}
...
@@ -736,7 +736,7 @@ void vaddrelu_better(
...
@@ -736,7 +736,7 @@ void vaddrelu_better(
const
paddle
::
operators
::
math
::
jitkernel
::
VReluKernel
<
float
>>&
vrelu
,
const
paddle
::
operators
::
math
::
jitkernel
::
VReluKernel
<
float
>>&
vrelu
,
const
float
*
x
,
const
float
*
y
,
float
*
z
,
int
d
)
{
const
float
*
x
,
const
float
*
y
,
float
*
z
,
int
d
)
{
vadd
->
Compute
(
x
,
y
,
z
,
d
);
vadd
->
Compute
(
x
,
y
,
z
,
d
);
vrelu
->
Compute
(
z
,
z
);
vrelu
->
Compute
Deprecated
(
z
,
z
);
}
}
TEST
(
JitKernel
,
vaddrelu
)
{
TEST
(
JitKernel
,
vaddrelu
)
{
...
...
paddle/fluid/operators/math/selected_rows_functor.cc
浏览文件 @
5670e9ea
...
@@ -244,7 +244,7 @@ typename std::enable_if<
...
@@ -244,7 +244,7 @@ typename std::enable_if<
std
::
is_same
<
DeviceContext
,
platform
::
CPUDeviceContext
>::
value
>::
type
std
::
is_same
<
DeviceContext
,
platform
::
CPUDeviceContext
>::
value
>::
type
elementwise_add_to
(
const
DeviceContext
&
ctx
,
BlasT
<
DeviceContext
,
T
>*
blas
,
elementwise_add_to
(
const
DeviceContext
&
ctx
,
BlasT
<
DeviceContext
,
T
>*
blas
,
size_t
data_len
,
const
T
*
in
,
T
*
out
)
{
size_t
data_len
,
const
T
*
in
,
T
*
out
)
{
for
(
int64
_t
i
=
0
;
i
<
data_len
;
i
++
)
{
for
(
size
_t
i
=
0
;
i
<
data_len
;
i
++
)
{
out
[
i
]
+=
in
[
i
];
out
[
i
]
+=
in
[
i
];
}
}
}
}
...
...
paddle/fluid/operators/math/sequence_pooling_test.cc
浏览文件 @
5670e9ea
...
@@ -70,11 +70,11 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
...
@@ -70,11 +70,11 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
EXPECT_EQ
(
in_grad
.
lod
(),
lod
);
EXPECT_EQ
(
in_grad
.
lod
(),
lod
);
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
for
(
int64
_t
i
=
0
;
i
<
in_grad
.
lod
()[
0
].
size
()
-
1
;
++
i
)
{
for
(
size
_t
i
=
0
;
i
<
in_grad
.
lod
()[
0
].
size
()
-
1
;
++
i
)
{
int64_t
begin
=
in_grad
.
lod
()[
0
][
i
];
int64_t
begin
=
in_grad
.
lod
()[
0
][
i
];
int64_t
end
=
in_grad
.
lod
()[
0
][
i
+
1
];
int64_t
end
=
in_grad
.
lod
()[
0
][
i
+
1
];
paddle
::
framework
::
Tensor
tmp
=
in_grad
.
Slice
(
begin
,
end
);
paddle
::
framework
::
Tensor
tmp
=
in_grad
.
Slice
(
begin
,
end
);
for
(
int64
_t
j
=
0
;
j
!=
tmp
.
numel
()
/
second_dim
;
++
j
)
{
for
(
size
_t
j
=
0
;
j
!=
tmp
.
numel
()
/
second_dim
;
++
j
)
{
for
(
int64_t
m
=
0
;
m
!=
second_dim
;
++
m
)
{
for
(
int64_t
m
=
0
;
m
!=
second_dim
;
++
m
)
{
EXPECT_EQ
(
tmp
.
data
<
T
>
()[
m
+
j
*
second_dim
],
EXPECT_EQ
(
tmp
.
data
<
T
>
()[
m
+
j
*
second_dim
],
out_grad
.
data
<
T
>
()[
m
+
i
*
second_dim
]);
out_grad
.
data
<
T
>
()[
m
+
i
*
second_dim
]);
...
@@ -82,11 +82,11 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
...
@@ -82,11 +82,11 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
}
}
}
}
}
else
{
}
else
{
for
(
int64
_t
i
=
0
;
i
<
cpu_in_grad
.
lod
()[
0
].
size
()
-
1
;
++
i
)
{
for
(
size
_t
i
=
0
;
i
<
cpu_in_grad
.
lod
()[
0
].
size
()
-
1
;
++
i
)
{
int64_t
begin
=
cpu_in_grad
.
lod
()[
0
][
i
];
int64_t
begin
=
cpu_in_grad
.
lod
()[
0
][
i
];
int64_t
end
=
cpu_in_grad
.
lod
()[
0
][
i
+
1
];
int64_t
end
=
cpu_in_grad
.
lod
()[
0
][
i
+
1
];
paddle
::
framework
::
Tensor
tmp
=
cpu_in_grad
.
Slice
(
begin
,
end
);
paddle
::
framework
::
Tensor
tmp
=
cpu_in_grad
.
Slice
(
begin
,
end
);
for
(
int64
_t
j
=
0
;
j
!=
tmp
.
numel
()
/
second_dim
;
++
j
)
{
for
(
size
_t
j
=
0
;
j
!=
tmp
.
numel
()
/
second_dim
;
++
j
)
{
for
(
int64_t
m
=
0
;
m
!=
second_dim
;
++
m
)
{
for
(
int64_t
m
=
0
;
m
!=
second_dim
;
++
m
)
{
EXPECT_EQ
(
tmp
.
data
<
T
>
()[
m
+
j
*
second_dim
],
EXPECT_EQ
(
tmp
.
data
<
T
>
()[
m
+
j
*
second_dim
],
cpu_out_grad
.
data
<
T
>
()[
m
+
i
*
second_dim
]);
cpu_out_grad
.
data
<
T
>
()[
m
+
i
*
second_dim
]);
...
...
paddle/fluid/operators/math/softmax.cc
浏览文件 @
5670e9ea
...
@@ -19,8 +19,10 @@ namespace paddle {
...
@@ -19,8 +19,10 @@ namespace paddle {
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
template
class
SoftmaxFunctor
<
platform
::
CPUDeviceContext
,
float
>;
template
class
SoftmaxFunctor
<
platform
::
CPUDeviceContext
,
float
,
true
>;
template
class
SoftmaxFunctor
<
platform
::
CPUDeviceContext
,
double
>;
template
class
SoftmaxFunctor
<
platform
::
CPUDeviceContext
,
float
,
false
>;
template
class
SoftmaxFunctor
<
platform
::
CPUDeviceContext
,
double
,
true
>;
template
class
SoftmaxFunctor
<
platform
::
CPUDeviceContext
,
double
,
false
>;
template
class
SoftmaxGradFunctor
<
platform
::
CPUDeviceContext
,
float
>;
template
class
SoftmaxGradFunctor
<
platform
::
CPUDeviceContext
,
float
>;
template
class
SoftmaxGradFunctor
<
platform
::
CPUDeviceContext
,
double
>;
template
class
SoftmaxGradFunctor
<
platform
::
CPUDeviceContext
,
double
>;
...
...
paddle/fluid/operators/math/softmax.cu
浏览文件 @
5670e9ea
...
@@ -98,9 +98,14 @@ template class SoftmaxGradCUDNNFunctor<float>;
...
@@ -98,9 +98,14 @@ template class SoftmaxGradCUDNNFunctor<float>;
template
class
SoftmaxGradCUDNNFunctor
<
double
>;
template
class
SoftmaxGradCUDNNFunctor
<
double
>;
template
class
SoftmaxGradCUDNNFunctor
<
platform
::
float16
>;
template
class
SoftmaxGradCUDNNFunctor
<
platform
::
float16
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
,
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
float
>;
false
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
,
true
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
float
,
false
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
double
,
false
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
float
,
true
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
double
,
true
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
...
...
paddle/fluid/operators/math/softmax.h
浏览文件 @
5670e9ea
...
@@ -19,7 +19,7 @@ namespace paddle {
...
@@ -19,7 +19,7 @@ namespace paddle {
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
,
bool
is_test
>
class
SoftmaxFunctor
{
class
SoftmaxFunctor
{
public:
public:
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
*
X
,
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
*
X
,
...
...
paddle/fluid/operators/math/softmax_impl.h
浏览文件 @
5670e9ea
...
@@ -32,10 +32,10 @@ struct ValueClip {
...
@@ -32,10 +32,10 @@ struct ValueClip {
}
}
};
};
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
,
bool
is_test
>
void
SoftmaxFunctor
<
DeviceContext
,
T
>::
operator
()(
const
DeviceContext
&
context
,
void
SoftmaxFunctor
<
DeviceContext
,
T
,
is_test
>::
operator
()(
const
framework
::
Tensor
*
X
,
const
DeviceContext
&
context
,
const
framework
::
Tensor
*
X
,
framework
::
Tensor
*
Y
)
{
framework
::
Tensor
*
Y
)
{
auto
logits
=
EigenMatrix
<
T
>::
From
(
*
X
);
auto
logits
=
EigenMatrix
<
T
>::
From
(
*
X
);
auto
softmax
=
EigenMatrix
<
T
>::
From
(
*
Y
);
auto
softmax
=
EigenMatrix
<
T
>::
From
(
*
Y
);
...
@@ -65,6 +65,39 @@ void SoftmaxFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
...
@@ -65,6 +65,39 @@ void SoftmaxFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
.
broadcast
(
one_by_class
));
.
broadcast
(
one_by_class
));
}
}
template
<
typename
DeviceContext
,
typename
T
>
class
SoftmaxFunctor
<
DeviceContext
,
T
,
true
>
{
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
*
X
,
framework
::
Tensor
*
Y
)
{
auto
logits
=
EigenMatrix
<
T
>::
From
(
*
X
);
auto
softmax
=
EigenMatrix
<
T
>::
From
(
*
Y
);
const
int
kBatchDim
=
0
;
const
int
kClassDim
=
1
;
const
int
batch_size
=
logits
.
dimension
(
kBatchDim
);
const
int
num_classes
=
logits
.
dimension
(
kClassDim
);
Eigen
::
DSizes
<
int
,
1
>
along_class
(
kClassDim
);
Eigen
::
DSizes
<
int
,
2
>
batch_by_one
(
batch_size
,
1
);
Eigen
::
DSizes
<
int
,
2
>
one_by_class
(
1
,
num_classes
);
auto
shifted_logits
=
(
logits
-
logits
.
maximum
(
along_class
)
.
eval
()
.
reshape
(
batch_by_one
)
.
broadcast
(
one_by_class
));
softmax
.
device
(
*
context
.
eigen_device
())
=
shifted_logits
.
exp
();
softmax
.
device
(
*
context
.
eigen_device
())
=
(
softmax
*
softmax
.
sum
(
along_class
)
.
inverse
()
.
eval
()
.
reshape
(
batch_by_one
)
.
broadcast
(
one_by_class
));
}
};
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
void
SoftmaxGradFunctor
<
DeviceContext
,
T
>::
operator
()(
void
SoftmaxGradFunctor
<
DeviceContext
,
T
>::
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
*
y
,
const
DeviceContext
&
context
,
const
framework
::
Tensor
*
y
,
...
...
paddle/fluid/operators/merge_ids_op.h
浏览文件 @
5670e9ea
...
@@ -43,11 +43,11 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
...
@@ -43,11 +43,11 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ
(
ids
.
size
(),
outs
.
size
(),
PADDLE_ENFORCE_EQ
(
ids
.
size
(),
outs
.
size
(),
"the number of Ids and Out should be the same"
);
"the number of Ids and Out should be the same"
);
in
t
row_ids_size
=
0
;
size_
t
row_ids_size
=
0
;
int
row_size
=
0
;
int
row_size
=
0
;
int
embedding_size
=
0
;
int
embedding_size
=
0
;
for
(
in
t
i
=
0
;
i
<
x_tensors
.
size
();
++
i
)
{
for
(
size_
t
i
=
0
;
i
<
x_tensors
.
size
();
++
i
)
{
const
auto
*
x_tensor
=
x_tensors
[
i
];
const
auto
*
x_tensor
=
x_tensors
[
i
];
const
auto
*
row_id
=
row_ids
[
i
];
const
auto
*
row_id
=
row_ids
[
i
];
...
@@ -66,7 +66,7 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
...
@@ -66,7 +66,7 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
std
::
unordered_map
<
int64_t
,
std
::
tuple
<
int64_t
,
int64_t
>>
std
::
unordered_map
<
int64_t
,
std
::
tuple
<
int64_t
,
int64_t
>>
selected_rows_idx_map
;
selected_rows_idx_map
;
for
(
in
t
i
=
0
;
i
<
x_tensors
.
size
();
++
i
)
{
for
(
size_
t
i
=
0
;
i
<
x_tensors
.
size
();
++
i
)
{
const
auto
*
row_id
=
row_ids
[
i
];
const
auto
*
row_id
=
row_ids
[
i
];
for
(
int
j
=
0
;
j
<
row_id
->
numel
();
++
j
)
{
for
(
int
j
=
0
;
j
<
row_id
->
numel
();
++
j
)
{
...
@@ -78,7 +78,7 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
...
@@ -78,7 +78,7 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ
(
row_ids_size
,
selected_rows_idx_map
.
size
(),
PADDLE_ENFORCE_EQ
(
row_ids_size
,
selected_rows_idx_map
.
size
(),
"the rows and tensor map size should be the same"
);
"the rows and tensor map size should be the same"
);
for
(
in
t
i
=
0
;
i
<
outs
.
size
();
++
i
)
{
for
(
size_
t
i
=
0
;
i
<
outs
.
size
();
++
i
)
{
auto
*
out_ids
=
ids
[
i
];
auto
*
out_ids
=
ids
[
i
];
auto
*
out
=
outs
[
i
];
auto
*
out
=
outs
[
i
];
...
...
paddle/fluid/operators/ref_by_trainer_id_op.h
浏览文件 @
5670e9ea
...
@@ -38,7 +38,7 @@ class RefByTrainerIdKernel : public framework::OpKernel<T> {
...
@@ -38,7 +38,7 @@ class RefByTrainerIdKernel : public framework::OpKernel<T> {
}
else
{
}
else
{
trainer_id
=
*
trainer_id_data
;
trainer_id
=
*
trainer_id_data
;
}
}
PADDLE_ENFORCE_LT
(
trainer_id
,
in_list
.
size
());
PADDLE_ENFORCE_LT
(
(
size_t
)
trainer_id
,
in_list
.
size
());
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
out
->
ShareDataWith
(
*
(
in_list
[
trainer_id
]));
out
->
ShareDataWith
(
*
(
in_list
[
trainer_id
]));
}
}
...
...
paddle/fluid/operators/softmax_op.h
浏览文件 @
5670e9ea
...
@@ -35,8 +35,13 @@ class SoftmaxKernel : public framework::OpKernel<T> {
...
@@ -35,8 +35,13 @@ class SoftmaxKernel : public framework::OpKernel<T> {
Tensor
X_2d
=
framework
::
ReshapeToMatrix
(
*
X
,
rank
-
1
);
Tensor
X_2d
=
framework
::
ReshapeToMatrix
(
*
X
,
rank
-
1
);
Tensor
Out_2d
=
framework
::
ReshapeToMatrix
(
*
Out
,
rank
-
1
);
Tensor
Out_2d
=
framework
::
ReshapeToMatrix
(
*
Out
,
rank
-
1
);
math
::
SoftmaxFunctor
<
DeviceContext
,
T
>
()(
#ifdef ON_INFER
math
::
SoftmaxFunctor
<
DeviceContext
,
T
,
true
>
()(
context
.
template
device_context
<
DeviceContext
>(),
&
X_2d
,
&
Out_2d
);
context
.
template
device_context
<
DeviceContext
>(),
&
X_2d
,
&
Out_2d
);
#else
math
::
SoftmaxFunctor
<
DeviceContext
,
T
,
false
>
()(
context
.
template
device_context
<
DeviceContext
>(),
&
X_2d
,
&
Out_2d
);
#endif
}
}
};
};
...
...
paddle/fluid/operators/softmax_with_cross_entropy_op.h
浏览文件 @
5670e9ea
...
@@ -42,8 +42,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
...
@@ -42,8 +42,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
auto
&
dev_ctx
=
auto
&
dev_ctx
=
context
.
template
device_context
<
platform
::
CPUDeviceContext
>();
context
.
template
device_context
<
platform
::
CPUDeviceContext
>();
math
::
SoftmaxFunctor
<
platform
::
CPUDeviceContext
,
T
>
()(
dev_ctx
,
logits
,
math
::
SoftmaxFunctor
<
platform
::
CPUDeviceContext
,
T
,
false
>
()(
softmax
);
dev_ctx
,
logits
,
softmax
);
math
::
CrossEntropyFunctor
<
platform
::
CPUDeviceContext
,
T
>
()(
math
::
CrossEntropyFunctor
<
platform
::
CPUDeviceContext
,
T
>
()(
dev_ctx
,
loss
,
softmax
,
labels
,
context
.
Attr
<
bool
>
(
"soft_label"
),
dev_ctx
,
loss
,
softmax
,
labels
,
context
.
Attr
<
bool
>
(
"soft_label"
),
context
.
Attr
<
int
>
(
"ignore_index"
));
context
.
Attr
<
int
>
(
"ignore_index"
));
...
...
paddle/fluid/operators/split_ids_op.h
浏览文件 @
5670e9ea
...
@@ -64,7 +64,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
...
@@ -64,7 +64,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
out_ids
.
resize
(
outs
.
size
());
out_ids
.
resize
(
outs
.
size
());
// split id by their shard_num.
// split id by their shard_num.
for
(
in
t
i
=
0
;
i
<
all_ids
.
size
();
++
i
)
{
for
(
size_
t
i
=
0
;
i
<
all_ids
.
size
();
++
i
)
{
T
id
=
all_ids
[
i
];
T
id
=
all_ids
[
i
];
size_t
shard_id
=
static_cast
<
size_t
>
(
id
)
%
shard_num
;
size_t
shard_id
=
static_cast
<
size_t
>
(
id
)
%
shard_num
;
out_ids
[
shard_id
].
push_back
(
id
);
out_ids
[
shard_id
].
push_back
(
id
);
...
...
python/paddle/fluid/__init__.py
浏览文件 @
5670e9ea
...
@@ -112,11 +112,11 @@ def __bootstrap__():
...
@@ -112,11 +112,11 @@ def __bootstrap__():
os
.
environ
[
'OMP_NUM_THREADS'
]
=
str
(
num_threads
)
os
.
environ
[
'OMP_NUM_THREADS'
]
=
str
(
num_threads
)
read_env_flags
=
[
read_env_flags
=
[
'use_pinned_memory'
,
'check_nan_inf'
,
'benchmark'
,
'use_pinned_memory'
,
'check_nan_inf'
,
'benchmark'
,
'eager_delete_scope'
,
'
eager_delete_scope'
,
'use_mkldnn'
,
'use_ngraph
'
,
'
use_mkldnn'
,
'use_ngraph'
,
'initial_cpu_memory_in_mb
'
,
'init
ial_cpu_memory_in_mb'
,
'init_allocated_mem'
,
'free_idle_memory
'
,
'init
_allocated_mem'
,
'free_idle_memory'
,
'paddle_num_threads
'
,
'
paddle_num_threads'
,
'dist_threadpool_size
'
,
'
dist_threadpool_size'
,
'eager_delete_tensor_gb
'
,
'
eager_delete_tensor_gb'
,
'
reader_queue_speed_test_mode'
'reader_queue_speed_test_mode'
]
]
if
os
.
name
!=
'nt'
:
if
os
.
name
!=
'nt'
:
read_env_flags
.
append
(
'warpctc_dir'
)
read_env_flags
.
append
(
'warpctc_dir'
)
...
...
python/paddle/fluid/tests/unittests/dist_save_load.py
浏览文件 @
5670e9ea
...
@@ -26,6 +26,7 @@ from multiprocessing import Process
...
@@ -26,6 +26,7 @@ from multiprocessing import Process
from
functools
import
reduce
from
functools
import
reduce
import
numpy
as
np
import
numpy
as
np
import
pickle
import
unittest
import
unittest
import
six
import
six
...
@@ -166,7 +167,10 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
...
@@ -166,7 +167,10 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
io
.
save_persistables
(
startup_exe
,
model_dir
,
trainer_prog
)
io
.
save_persistables
(
startup_exe
,
model_dir
,
trainer_prog
)
var
=
np
.
array
(
fluid
.
global_scope
().
find_var
(
'__fc_b__'
).
get_tensor
())
var
=
np
.
array
(
fluid
.
global_scope
().
find_var
(
'__fc_b__'
).
get_tensor
())
print
(
np
.
ravel
(
var
).
tolist
())
if
six
.
PY2
:
print
(
pickle
.
dumps
(
np
.
ravel
(
var
).
tolist
()))
else
:
sys
.
stdout
.
buffer
.
write
(
pickle
.
dumps
(
np
.
ravel
(
var
).
tolist
()))
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_dist_save_load.py
浏览文件 @
5670e9ea
...
@@ -65,14 +65,14 @@ class TestDistSaveLoadDense2x2(TestDistBase):
...
@@ -65,14 +65,14 @@ class TestDistSaveLoadDense2x2(TestDistBase):
shutil
.
rmtree
(
model_dir
)
shutil
.
rmtree
(
model_dir
)
local_np
=
np
.
array
(
eval
(
local_var
[
0
]))
local_np
=
np
.
array
(
local_var
)
train0_np
=
np
.
array
(
eval
(
tr0_var
[
0
]))
train0_np
=
np
.
array
(
tr0_var
)
train1_np
=
np
.
array
(
eval
(
tr1_var
[
0
]))
train1_np
=
np
.
array
(
tr1_var
)
self
.
assertAlmostEqual
(
local_np
.
all
(),
train0_np
.
all
(),
delta
=
delta
)
self
.
assertAlmostEqual
(
local_np
.
all
(),
train0_np
.
all
(),
delta
=
delta
)
self
.
assertAlmostEqual
(
local_np
.
all
(),
train1_np
.
all
(),
delta
=
delta
)
self
.
assertAlmostEqual
(
local_np
.
all
(),
train1_np
.
all
(),
delta
=
delta
)
self
.
assertAlmostEqual
(
train0_np
.
all
(),
train1_np
.
all
(),
delta
=
delta
)
self
.
assertAlmostEqual
(
train0_np
.
all
(),
train1_np
.
all
(),
delta
=
delta
)
@
unittest
.
skip
(
reason
=
"CI fail"
)
def
test_dist
(
self
):
def
test_dist
(
self
):
need_envs
=
{
need_envs
=
{
"IS_DISTRIBUTED"
:
'0'
,
"IS_DISTRIBUTED"
:
'0'
,
...
...
python/requirements.txt
浏览文件 @
5670e9ea
requests==2.9.2
requests==2.9.2
numpy>=1.12,<=1.14 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version
numpy>=1.12,<=1.14 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version
protobuf==3.1
protobuf==3.1
recordio>=0.1.0
; sys_platform != 'win32'
recordio>=0.1.0
matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib
matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib
rarfile
rarfile
scipy>=0.19.0
scipy>=0.19.0
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录