Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
25adf970
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
25adf970
编写于
11月 20, 2018
作者:
P
peizhilin
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'upstream/develop' into windows/build
上级
301ed153
f17b05d4
变更
36
隐藏空白更改
内联
并排
Showing
36 changed file
with
911 addition
and
507 deletion
+911
-507
CMakeLists.txt
CMakeLists.txt
+8
-7
paddle/fluid/framework/ir/graph_helper.cc
paddle/fluid/framework/ir/graph_helper.cc
+21
-7
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+11
-2
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
...id/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+1
-1
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
...uid/inference/analysis/passes/ir_analysis_compose_pass.cc
+2
-1
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+7
-6
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+46
-24
paddle/fluid/inference/tensorrt/convert/op_converter.h
paddle/fluid/inference/tensorrt/convert/op_converter.h
+1
-1
paddle/fluid/inference/tensorrt/convert/prelu_op.cc
paddle/fluid/inference/tensorrt/convert/prelu_op.cc
+1
-1
paddle/fluid/inference/tensorrt/convert/split_op.cc
paddle/fluid/inference/tensorrt/convert/split_op.cc
+1
-1
paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
...e/fluid/inference/tensorrt/convert/test_elementwise_op.cc
+56
-22
paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+9
-9
paddle/fluid/inference/tensorrt/convert/ut_helper.h
paddle/fluid/inference/tensorrt/convert/ut_helper.h
+1
-1
paddle/fluid/inference/tensorrt/engine.cc
paddle/fluid/inference/tensorrt/engine.cc
+3
-2
paddle/fluid/inference/tensorrt/engine.h
paddle/fluid/inference/tensorrt/engine.h
+2
-2
paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+3
-1
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
.../fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
+138
-0
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
...e/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
+87
-0
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
+2
-0
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
+2
-0
paddle/fluid/inference/tensorrt/plugin/serialize.h
paddle/fluid/inference/tensorrt/plugin/serialize.h
+24
-8
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+13
-12
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+35
-38
paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
+14
-14
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
+51
-21
paddle/fluid/inference/tests/api/tester_helper.h
paddle/fluid/inference/tests/api/tester_helper.h
+1
-1
paddle/fluid/operators/math/jit_code.cc
paddle/fluid/operators/math/jit_code.cc
+107
-268
paddle/fluid/operators/math/jit_code.h
paddle/fluid/operators/math/jit_code.h
+178
-12
paddle/fluid/operators/math/jit_kernel.h
paddle/fluid/operators/math/jit_kernel.h
+1
-0
paddle/fluid/operators/math/jit_kernel_test.cc
paddle/fluid/operators/math/jit_kernel_test.cc
+9
-6
paddle/fluid/operators/math/softmax.h
paddle/fluid/operators/math/softmax.h
+2
-1
paddle/fluid/operators/math/softmax_impl.h
paddle/fluid/operators/math/softmax_impl.h
+39
-28
paddle/fluid/operators/softmax_op.h
paddle/fluid/operators/softmax_op.h
+4
-2
paddle/fluid/operators/stack_op.h
paddle/fluid/operators/stack_op.h
+18
-6
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+2
-1
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+11
-1
未找到文件。
CMakeLists.txt
浏览文件 @
25adf970
...
...
@@ -311,6 +311,14 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
set
(
CMAKE_CXX_FLAGS_RELWITHDEBINFO
"-O3 -g -DNDEBUG"
)
set
(
CMAKE_C_FLAGS_RELWITHDEBINFO
"-O3 -g -DNDEBUG"
)
if
(
ON_INFER
)
message
(
STATUS
"On inference mode, will take place some specific optimization."
)
add_definitions
(
-DPADDLE_ON_INFERENCE
)
else
()
#TODO(luotao), combine this warning with `make inference_lib_dist` command.
message
(
WARNING
"On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only."
)
endif
()
add_subdirectory
(
paddle
)
if
(
WITH_PYTHON
)
add_subdirectory
(
python
)
...
...
@@ -321,10 +329,3 @@ if(WITH_DOC)
find_python_module
(
recommonmark REQUIRED
)
add_subdirectory
(
doc
)
endif
()
if
(
ON_INFER
)
message
(
STATUS
"On inference mode, will take place some specific optimization."
)
else
()
#TODO(luotao), combine this warning with `make inference_lib_dist` command.
message
(
WARNING
"On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only."
)
endif
()
paddle/fluid/framework/ir/graph_helper.cc
浏览文件 @
25adf970
...
...
@@ -15,8 +15,15 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph_helper.h"
#include <algorithm>
#include <deque>
#include <fstream>
#include <iosfwd>
#include <ostream>
#include <unordered_set>
DEFINE_string
(
print_sub_graph_dir
,
""
,
"FLAGS_print_sub_graph_dir is used "
"to print the nodes of sub_graphs."
);
namespace
paddle
{
namespace
framework
{
namespace
ir
{
...
...
@@ -164,12 +171,15 @@ size_t GraphNum(const Graph &graph) {
graph_nodes
.
emplace_back
(
g_nodes
);
}
if
(
VLOG_IS_ON
(
100
))
{
VLOG
(
100
)
<<
"graph_num: "
<<
graph_nodes
.
size
();
for
(
auto
&
g_n
:
graph_nodes
)
{
VLOG
(
100
)
<<
"graph_nodes: "
<<
g_n
.
size
();
if
(
g_n
.
size
()
<
10
)
{
std
::
stringstream
out
;
if
(
FLAGS_print_sub_graph_dir
.
size
())
{
if
(
graph_nodes
.
size
()
>
1
)
{
std
::
stringstream
out
;
for
(
auto
&
g_n
:
graph_nodes
)
{
out
<<
"graph_nodes: "
<<
g_n
.
size
()
<<
"
\n
"
;
}
out
<<
"
\n\n
"
;
for
(
auto
&
g_n
:
graph_nodes
)
{
out
<<
"graph_nodes: "
<<
g_n
.
size
();
for
(
auto
&
node
:
g_n
)
{
out
<<
"
\n
Node: "
<<
node
->
Name
()
<<
" in ["
;
for
(
auto
&
n
:
node
->
inputs
)
{
...
...
@@ -181,8 +191,12 @@ size_t GraphNum(const Graph &graph) {
}
out
<<
"]"
;
}
VLOG
(
100
)
<<
out
.
str
()
;
out
<<
"
\n\n\n
"
;
}
std
::
unique_ptr
<
std
::
ostream
>
fout
(
new
std
::
ofstream
(
FLAGS_print_sub_graph_dir
));
PADDLE_ENFORCE
(
fout
->
good
());
*
fout
<<
out
.
str
();
}
}
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
25adf970
...
...
@@ -171,8 +171,17 @@ ParallelExecutor::ParallelExecutor(
}
// If the loss_var_name is given, the number of graph should be only one.
if
(
loss_var_name
.
size
())
{
PADDLE_ENFORCE_EQ
(
ir
::
GraphNum
(
*
graph
),
1
,
"The number of graph should be only one"
);
size_t
graph_num
=
ir
::
GraphNum
(
*
graph
);
if
(
graph_num
>
1
)
{
LOG
(
WARNING
)
<<
"The number of graph should be only one, "
"but the current graph has "
<<
ir
::
GraphNum
(
*
graph
)
<<
" sub_graphs. If you want to see the nodes of the "
"sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
"to specify the output dir. NOTES: if you not do training, "
"please don't pass loss_var_name."
;
}
}
if
(
exec_strategy
.
type_
==
ExecutionStrategy
::
kDefault
)
{
...
...
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
浏览文件 @
25adf970
...
...
@@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
// it is either an OP's input or an OP's output.
auto
&
subgraph_nodes
=
*
Agent
(
node
).
subgraph
();
for
(
size_t
index
=
0
;
index
<
block_desc
.
OpSize
();
index
++
)
{
for
(
size_t
index
=
0
;
index
<
block_desc
.
OpSize
();
++
index
)
{
framework
::
proto
::
OpDesc
*
op
=
block_desc
.
Op
(
index
)
->
Proto
();
auto
correspond_node
=
subgraph_nodes
[
index
];
PADDLE_ENFORCE_EQ
(
correspond_node
->
Name
(),
op
->
type
());
...
...
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
浏览文件 @
25adf970
...
...
@@ -45,7 +45,8 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
std
::
unordered_set
<
std
::
string
>
teller_set
(
{
"mul"
,
"conv2d"
,
"pool2d"
,
"relu"
,
"softmax"
,
"sigmoid"
,
"depthwise_conv2d"
,
"batch_norm"
,
"concat"
,
"tanh"
,
"pad"
,
"elementwise_add"
,
"dropout"
,
"split"
,
"prelu"
,
"conv2d_transpose"
});
"elementwise_add"
,
"elementwise_mul"
,
"dropout"
,
"split"
,
"prelu"
,
"conv2d_transpose"
});
if
(
!
node
->
IsOp
())
return
false
;
if
(
teller_set
.
count
(
node
->
Op
()
->
Type
()))
{
...
...
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
浏览文件 @
25adf970
# Add TRT tests
nv_library
(
tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
pad_op.cc split_op.cc prelu_op.cc
DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry
)
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
pad_op.cc split_op.cc prelu_op.cc
DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry
)
nv_test
(
test_op_converter SRCS test_op_converter.cc DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OPERATOR_DEPS
}
tensorrt_engine tensorrt_converter
)
...
...
@@ -20,7 +20,8 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
nv_test
(
test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OPERATOR_DEPS
}
tensorrt_engine pool_op SERIAL
)
nv_test
(
test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OPERATOR_DEPS
}
tensorrt_engine elementwise_add_op SERIAL
)
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OPERATOR_DEPS
}
tensorrt_engine tensorrt_plugin
elementwise_add_op elementwise_mul_op SERIAL
)
nv_test
(
test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OPERATOR_DEPS
}
tensorrt_engine softmax_op SERIAL
)
nv_test
(
test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
...
...
@@ -33,7 +34,7 @@ nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OPERATOR_DEPS
}
tensorrt_engine pad_op SERIAL
)
nv_test
(
test_trt_split_op SRCS test_split_op.cc split_op.cc
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OPERATOR_DEPS
}
tensorrt_engine tensorrt_plugin
split_op concat_op SERIAL
)
split_op concat_op SERIAL
)
nv_test
(
test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OPERATOR_DEPS
}
tensorrt_engine tensorrt_plugin
prelu_op SERIAL
)
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
浏览文件 @
25adf970
...
...
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
...
...
@@ -13,11 +13,25 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
static
bool
CheckDims
(
const
nvinfer1
::
Dims
&
dims_x
,
const
nvinfer1
::
Dims
&
dims_y
)
{
if
(
dims_x
.
nbDims
!=
dims_y
.
nbDims
)
{
return
false
;
}
for
(
int
i
=
0
;
i
<
dims_x
.
nbDims
;
i
++
)
{
if
(
dims_x
.
d
[
i
]
!=
dims_y
.
d
[
i
])
{
return
false
;
}
}
return
true
;
}
class
ElementwiseWeightOpConverter
:
public
OpConverter
{
public:
ElementwiseWeightOpConverter
()
{}
...
...
@@ -26,7 +40,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
// Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange.
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
VLOG
(
3
)
<<
"
convert a fluid elementwise op to tensorrt
IScaleLayer"
;
VLOG
(
3
)
<<
"
Convert a fluid elementwise op to TensorRT
IScaleLayer"
;
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"Y"
).
size
(),
1
);
// Y is a weight
...
...
@@ -106,10 +120,12 @@ class ElementwiseTensorOpConverter : public OpConverter {
ElementwiseTensorOpConverter
()
{}
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
auto
op_pair
=
ops
.
find
(
op_type_
);
PADDLE_ENFORCE
(
op_pair
!=
ops
.
end
(),
"Wrong elementwise op type!"
);
// Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange.
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
VLOG
(
3
)
<<
"convert a fluid elementwise op to tensorrt IScaleLayer"
;
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"Y"
).
size
(),
1
);
// Y is a weight
...
...
@@ -120,29 +136,35 @@ class ElementwiseTensorOpConverter : public OpConverter {
nvinfer1
::
Dims
dims_x
=
X
->
getDimensions
();
nvinfer1
::
Dims
dims_y
=
Y
->
getDimensions
();
// The two input tensor should have the same dims
PADDLE_ENFORCE
(
dims_x
.
nbDims
>=
3
);
if
(
dims_x
.
nbDims
==
dims_y
.
nbDims
)
{
for
(
int
i
=
0
;
i
<
dims_x
.
nbDims
;
i
++
)
{
if
(
dims_x
.
d
[
i
]
!=
dims_y
.
d
[
i
])
PADDLE_THROW
(
"TensorRT unsupported tensor shape for Elementwise op!"
);
}
}
else
{
PADDLE_THROW
(
"TensorRT unsupported tensor shape for Elementwise op!"
);
}
int
axis
=
boost
::
get
<
int
>
(
op_desc
.
GetAttr
(
"axis"
));
auto
output_name
=
op_desc
.
Output
(
"Out"
)[
0
];
if
(
CheckDims
(
dims_x
,
dims_y
))
{
// The two input tensor should have the same dims
VLOG
(
3
)
<<
"Convert a fluid elementwise op to TensorRT IElementWiseLayer"
;
auto
op_pair
=
ops
.
find
(
op_type_
);
if
(
op_pair
==
ops
.
end
())
{
PADDLE_THROW
(
"Wrong elementwise op type!"
);
}
nvinfer1
::
IElementWiseLayer
*
layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
ElementWise
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
X
),
*
const_cast
<
nvinfer1
::
ITensor
*>
(
Y
),
op_pair
->
second
);
nvinfer1
::
IElementWiseLayer
*
layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
ElementWise
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
X
),
*
const_cast
<
nvinfer1
::
ITensor
*>
(
Y
),
op_pair
->
second
);
auto
output_name
=
op_desc
.
Output
(
"Out"
)[
0
];
layer
->
setName
((
"elementwise (Output: "
+
output_name
+
")"
).
c_str
());
layer
->
getOutput
(
0
)
->
setName
(
output_name
.
c_str
());
engine_
->
SetITensor
(
output_name
,
layer
->
getOutput
(
0
));
layer
->
setName
((
"elementwise (Output: "
+
output_name
+
")"
).
c_str
());
layer
->
getOutput
(
0
)
->
setName
(
output_name
.
c_str
());
engine_
->
SetITensor
(
output_name
,
layer
->
getOutput
(
0
));
}
else
{
VLOG
(
3
)
<<
"Convert a fluid elementwise op to TensorRT "
"ElementWisePluginLayer"
;
plugin
::
ElementWisePlugin
*
plugin
=
new
plugin
::
ElementWisePlugin
(
op_pair
->
second
,
dims_x
,
dims_y
,
axis
);
plugin
->
AddInput
(
X
);
plugin
->
AddInput
(
Y
);
nvinfer1
::
IPluginLayer
*
layer
=
engine_
->
AddPlugin
(
const_cast
<
nvinfer1
::
ITensor
*
const
*>
(
plugin
->
GetInputs
().
data
()),
2
,
reinterpret_cast
<
plugin
::
PluginTensorRT
*>
(
plugin
));
layer
->
setName
((
"elementwise (Output: "
+
output_name
+
")"
).
c_str
());
layer
->
getOutput
(
0
)
->
setName
(
output_name
.
c_str
());
engine_
->
SetITensor
(
output_name
,
layer
->
getOutput
(
0
));
}
if
(
test_mode
)
{
// the test framework can not determine which is the
// output, so place the declaration inside.
engine_
->
DeclareOutput
(
output_name
);
...
...
paddle/fluid/inference/tensorrt/convert/op_converter.h
浏览文件 @
25adf970
...
...
@@ -61,7 +61,7 @@ class OpConverter {
// TODO(xingzhaolong): all mul, sub, div
// static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
// "sub", "div"};
static
std
::
unordered_set
<
std
::
string
>
add_weight_op_set
{
"add"
};
static
std
::
unordered_set
<
std
::
string
>
add_weight_op_set
{
"add"
,
"mul"
};
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"Y"
).
size
(),
1UL
);
int
op_type_len
=
op_desc
.
Type
().
size
();
std
::
string
op_type
=
op_desc
.
Type
().
substr
(
op_type_len
-
3
,
op_type_len
);
...
...
paddle/fluid/inference/tensorrt/convert/prelu_op.cc
浏览文件 @
25adf970
...
...
@@ -54,7 +54,7 @@ class PReluOpConverter : public OpConverter {
TensorRTEngine
::
Weight
alpha_rt
(
nvinfer1
::
DataType
::
kFLOAT
,
static_cast
<
void
*>
(
alpha_data
),
alpha_tensor_device
->
numel
());
PReluPlugin
*
plugin
=
new
PReluPlugin
(
alpha_rt
,
mode
);
plugin
::
PReluPlugin
*
plugin
=
new
plugin
::
PReluPlugin
(
alpha_rt
,
mode
);
nvinfer1
::
IPluginLayer
*
layer
=
engine_
->
AddPlugin
(
&
input
,
input_num
,
plugin
);
// keep alpha tensor to avoid release it's memory
...
...
paddle/fluid/inference/tensorrt/convert/split_op.cc
浏览文件 @
25adf970
...
...
@@ -50,7 +50,7 @@ class SplitOpConverter : public OpConverter {
PADDLE_ENFORCE
(
output_lengths
.
size
()
==
output_num
);
//
SplitPlugin
*
plugin
=
new
SplitPlugin
(
axis
,
output_lengths
);
plugin
::
SplitPlugin
*
plugin
=
new
plugin
::
SplitPlugin
(
axis
,
output_lengths
);
nvinfer1
::
IPluginLayer
*
layer
=
engine_
->
AddPlugin
(
&
input
,
input_num
,
plugin
);
...
...
paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
浏览文件 @
25adf970
...
...
@@ -20,13 +20,12 @@ namespace paddle {
namespace
inference
{
namespace
tensorrt
{
TEST
(
elementwise_op
,
add_weight
_test
)
{
TEST
(
elementwise_op
,
add_weight
)
{
std
::
unordered_set
<
std
::
string
>
parameters
({
"elementwise_add-Y"
});
framework
::
Scope
scope
;
TRTConvertValidation
validator
(
10
,
parameters
,
scope
,
1
<<
15
);
validator
.
DeclInputVar
(
"elementwise_add-X"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
validator
.
DeclParamVar
(
"elementwise_add-Y"
,
nvinfer1
::
Dims3
(
10
,
1
,
1
));
// validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
validator
.
DeclOutputVar
(
"elementwise_add-Out"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
// Prepare Op description
...
...
@@ -44,30 +43,65 @@ TEST(elementwise_op, add_weight_test) {
validator
.
Execute
(
8
);
}
TEST
(
elementwise_op
,
add_tensor_test
)
{
std
::
unordered_set
<
std
::
string
>
parameters
;
framework
::
Scope
scope
;
TRTConvertValidation
validator
(
8
,
parameters
,
scope
,
1
<<
15
);
validator
.
DeclInputVar
(
"elementwise_add-X"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
validator
.
DeclInputVar
(
"elementwise_add-Y"
,
nvinfer1
::
Dims3
(
10
,
3
,
3
));
// validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
validator
.
DeclOutputVar
(
"elementwise_add-Out"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
// Prepare Op description
framework
::
OpDesc
desc
;
desc
.
SetType
(
"elementwise_add"
);
desc
.
SetInput
(
"X"
,
{
"elementwise_add-X"
});
desc
.
SetInput
(
"Y"
,
{
"elementwise_add-Y"
});
desc
.
SetOutput
(
"Out"
,
{
"elementwise_add-Out"
});
// the defalut axis of elementwise op is -1
validator
.
SetOp
(
*
desc
.
Proto
());
TEST
(
elementwise_op
,
native
)
{
for
(
std
::
string
type
:
{
"add"
,
"mul"
})
{
int
batch_size
=
8
;
std
::
unordered_set
<
std
::
string
>
parameters
;
framework
::
Scope
scope
;
TRTConvertValidation
validator
(
batch_size
,
parameters
,
scope
,
1
<<
15
);
validator
.
DeclInputVar
(
"elementwise_"
+
type
+
"-X"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
validator
.
DeclInputVar
(
"elementwise_"
+
type
+
"-Y"
,
nvinfer1
::
Dims3
(
10
,
3
,
3
));
validator
.
DeclOutputVar
(
"elementwise_"
+
type
+
"-Out"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
// Prepare Op description
framework
::
OpDesc
desc
;
desc
.
SetType
(
"elementwise_"
+
type
);
desc
.
SetInput
(
"X"
,
{
"elementwise_"
+
type
+
"-X"
});
desc
.
SetInput
(
"Y"
,
{
"elementwise_"
+
type
+
"-Y"
});
desc
.
SetOutput
(
"Out"
,
{
"elementwise_"
+
type
+
"-Out"
});
int
axis
=
-
1
;
desc
.
SetAttr
(
"axis"
,
axis
);
validator
.
SetOp
(
*
desc
.
Proto
());
validator
.
Execute
(
batch_size
);
}
}
validator
.
Execute
(
8
);
TEST
(
elementwise_op
,
plugin
)
{
for
(
std
::
string
type
:
{
"add"
,
"mul"
})
{
int
batch_size
=
8
;
std
::
unordered_set
<
std
::
string
>
parameters
;
framework
::
Scope
scope
;
TRTConvertValidation
validator
(
batch_size
,
parameters
,
scope
,
1
<<
15
);
validator
.
DeclInputVar
(
"elementwise_"
+
type
+
"-X"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
validator
.
DeclInputVar
(
"elementwise_"
+
type
+
"-Y"
,
nvinfer1
::
Dims3
(
10
,
1
,
1
));
validator
.
DeclOutputVar
(
"elementwise_"
+
type
+
"-Out"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
// Prepare Op description
framework
::
OpDesc
desc
;
desc
.
SetType
(
"elementwise_"
+
type
);
desc
.
SetInput
(
"X"
,
{
"elementwise_"
+
type
+
"-X"
});
desc
.
SetInput
(
"Y"
,
{
"elementwise_"
+
type
+
"-Y"
});
desc
.
SetOutput
(
"Out"
,
{
"elementwise_"
+
type
+
"-Out"
});
int
axis
=
-
1
;
desc
.
SetAttr
(
"axis"
,
axis
);
validator
.
SetOp
(
*
desc
.
Proto
());
validator
.
Execute
(
batch_size
);
}
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
USE_OP
(
elementwise_add
);
USE_OP
(
elementwise_mul
);
paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
浏览文件 @
25adf970
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_registry.h"
...
...
paddle/fluid/inference/tensorrt/convert/ut_helper.h
浏览文件 @
25adf970
...
...
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
...
...
paddle/fluid/inference/tensorrt/engine.cc
浏览文件 @
25adf970
...
...
@@ -257,9 +257,10 @@ void TensorRTEngine::freshDeviceId() {
}
nvinfer1
::
IPluginLayer
*
TensorRTEngine
::
AddPlugin
(
nvinfer1
::
ITensor
*
const
*
inputs
,
int
nbInputs
,
PluginTensorRT
*
plugin
)
{
nvinfer1
::
ITensor
*
const
*
inputs
,
int
num_inputs
,
plugin
::
PluginTensorRT
*
plugin
)
{
owned_plugin_
.
emplace_back
(
plugin
);
return
infer_network_
.
get
()
->
addPluginExt
(
inputs
,
n
bI
nputs
,
*
plugin
);
return
infer_network_
.
get
()
->
addPluginExt
(
inputs
,
n
um_i
nputs
,
*
plugin
);
}
}
// namespace tensorrt
...
...
paddle/fluid/inference/tensorrt/engine.h
浏览文件 @
25adf970
...
...
@@ -128,7 +128,7 @@ class TensorRTEngine : public EngineBase {
int
GetRuntimeBatch
();
int
GetDevice
()
{
return
device_
;
}
nvinfer1
::
IPluginLayer
*
AddPlugin
(
nvinfer1
::
ITensor
*
const
*
inputs
,
int
n
bInputs
,
PluginTensorRT
*
);
int
n
um_inputs
,
plugin
::
PluginTensorRT
*
);
// A pointer to CPU memory is needed of the TRT weight.
// Before TRT runs, fluid loads weight into GPU storage.
...
...
@@ -171,7 +171,7 @@ class TensorRTEngine : public EngineBase {
// The specific GPU id that the TensorRTEngine bounded to.
int
device_
;
std
::
vector
<
std
::
unique_ptr
<
PluginTensorRT
>>
owned_plugin_
;
std
::
vector
<
std
::
unique_ptr
<
plugin
::
PluginTensorRT
>>
owned_plugin_
;
// TensorRT related internal members
template
<
typename
T
>
...
...
paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
浏览文件 @
25adf970
nv_library
(
tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce device_context
)
nv_library
(
tensorrt_plugin
SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
DEPS enforce device_context
)
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
0 → 100644
浏览文件 @
25adf970
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
namespace
details
{
template
<
typename
T
>
struct
Add
{
__device__
T
operator
()(
const
T
&
a
,
const
T
&
b
)
const
{
return
a
+
b
;
}
};
template
<
typename
T
>
struct
Mul
{
__device__
T
operator
()(
const
T
&
a
,
const
T
&
b
)
const
{
return
a
*
b
;
}
};
template
<
typename
T
,
typename
Operator
>
__global__
void
ColumnWiseKernel
(
Operator
op
,
const
T
*
x
,
const
T
*
y
,
T
*
out
,
int
batch_size
,
int
num_rows
,
int
num_cols
)
{
for
(
int
batch_id
=
0
;
batch_id
<
batch_size
;
++
batch_id
)
{
int
row
=
blockIdx
.
x
;
for
(;
row
<
num_rows
;
row
+=
gridDim
.
x
)
{
T
value_y
=
y
[
batch_id
*
num_rows
+
row
];
int
col
=
threadIdx
.
x
;
int
offset
=
(
batch_id
*
num_rows
+
row
)
*
num_cols
;
for
(;
col
<
num_cols
;
col
+=
blockDim
.
x
)
{
T
value_x
=
x
[
offset
+
col
];
out
[
offset
+
col
]
=
op
(
value_x
,
value_y
);
}
}
}
}
template
<
typename
T
,
typename
Operator
>
static
void
ElementWise
(
Operator
op
,
const
T
*
x
,
const
T
*
y
,
T
*
out
,
int
batch_size
,
int
prev
,
int
midd
,
int
post
,
cudaStream_t
stream
)
{
const
int
kThreadsPerBlock
=
1024
;
const
int
kMaximumBlocks
=
65535
;
if
(
prev
==
1
)
{
int
num_threads
=
(
post
>
kThreadsPerBlock
)
?
kThreadsPerBlock
:
(((
post
+
31
)
>>
5
)
<<
5
);
int
num_blocks
=
(
midd
<
kMaximumBlocks
)
?
midd
:
kMaximumBlocks
;
ColumnWiseKernel
<<<
num_blocks
,
num_threads
,
0
,
stream
>>>
(
op
,
x
,
y
,
out
,
batch_size
,
midd
,
post
);
}
else
if
(
post
==
1
)
{
PADDLE_THROW
(
"Not implemented."
);
}
else
{
PADDLE_THROW
(
"Not implemented."
);
}
}
}
// namespace details
nvinfer1
::
Dims
ElementWisePlugin
::
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
input_dims
,
int
num_inputs
)
{
PADDLE_ENFORCE_EQ
(
index
,
0
);
PADDLE_ENFORCE_EQ
(
num_inputs
,
2
);
PADDLE_ENFORCE_NOT_NULL
(
input_dims
);
return
input_dims
[
0
];
}
int
ElementWisePlugin
::
initialize
()
{
PADDLE_ENFORCE_GT
(
dims_y_
.
nbDims
,
0
);
axis_
=
(
axis_
==
-
1
)
?
dims_x_
.
nbDims
-
dims_y_
.
nbDims
:
axis_
;
int
trimed_nb_dims
=
dims_y_
.
nbDims
;
for
(;
trimed_nb_dims
>
0
;
--
trimed_nb_dims
)
{
if
(
dims_y_
.
d
[
trimed_nb_dims
-
1
]
!=
1
)
{
break
;
}
}
dims_y_
.
nbDims
=
trimed_nb_dims
;
PADDLE_ENFORCE_GE
(
dims_x_
.
nbDims
,
dims_y_
.
nbDims
+
axis_
);
PADDLE_ENFORCE_LT
(
axis_
,
dims_x_
.
nbDims
);
prev_size_
=
1
;
midd_size_
=
1
;
post_size_
=
1
;
for
(
int
i
=
0
;
i
<
axis_
;
++
i
)
{
prev_size_
*=
dims_x_
.
d
[
i
];
}
for
(
int
i
=
0
;
i
<
dims_y_
.
nbDims
;
++
i
)
{
PADDLE_ENFORCE_EQ
(
dims_x_
.
d
[
i
+
axis_
],
dims_y_
.
d
[
i
],
"Broadcast dimension mismatch."
);
midd_size_
*=
dims_y_
.
d
[
i
];
}
for
(
int
i
=
axis_
+
dims_y_
.
nbDims
;
i
<
dims_x_
.
nbDims
;
++
i
)
{
post_size_
*=
dims_x_
.
d
[
i
];
}
return
0
;
}
int
ElementWisePlugin
::
enqueue
(
int
batch_size
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
{
const
float
*
x
=
reinterpret_cast
<
const
float
*>
(
inputs
[
0
]);
const
float
*
y
=
reinterpret_cast
<
const
float
*>
(
inputs
[
1
]);
float
*
out
=
reinterpret_cast
<
float
*>
(
outputs
[
0
]);
if
(
type_
==
nvinfer1
::
ElementWiseOperation
::
kSUM
)
{
details
::
ElementWise
(
details
::
Add
<
float
>
(),
x
,
y
,
out
,
batch_size
,
prev_size_
,
midd_size_
,
post_size_
,
stream
);
}
else
if
(
type_
==
nvinfer1
::
ElementWiseOperation
::
kPROD
)
{
details
::
ElementWise
(
details
::
Mul
<
float
>
(),
x
,
y
,
out
,
batch_size
,
prev_size_
,
midd_size_
,
post_size_
,
stream
);
}
else
{
PADDLE_THROW
(
"Not implemented."
);
}
return
cudaGetLastError
()
!=
cudaSuccess
;
}
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
0 → 100644
浏览文件 @
25adf970
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
class
ElementWisePlugin
:
public
PluginTensorRT
{
public:
ElementWisePlugin
(
nvinfer1
::
ElementWiseOperation
type
,
nvinfer1
::
Dims
const
&
dims_x
,
nvinfer1
::
Dims
const
&
dims_y
,
int
axis
)
:
type_
(
type
),
dims_x_
(
dims_x
),
dims_y_
(
dims_y
),
axis_
(
axis
),
prev_size_
(
1
),
midd_size_
(
1
),
post_size_
(
1
)
{}
ElementWisePlugin
(
void
const
*
serial_data
,
size_t
serial_length
)
{
deserializeBase
(
serial_data
,
serial_length
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
axis_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
dims_x_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
dims_y_
);
}
ElementWisePlugin
*
clone
()
const
override
{
// return new ElementWisePlugin(dims_x_, dims_y_, axis_);
return
nullptr
;
}
const
char
*
getPluginType
()
const
override
{
return
"elementwise"
;
}
nvinfer1
::
Dims
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
input_dims
,
int
num_inputs
)
override
;
int
initialize
()
override
;
// execute the layer
int
enqueue
(
int
batch_size
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
);
protected:
size_t
getSerializationSize
()
override
{
return
SerializedSize
(
axis_
)
+
SerializedSize
(
dims_x_
)
+
SerializedSize
(
dims_y_
)
+
getBaseSerializationSize
();
}
void
serialize
(
void
*
buffer
)
override
{
serializeBase
(
buffer
);
SerializeValue
(
&
buffer
,
axis_
);
SerializeValue
(
&
buffer
,
dims_x_
);
SerializeValue
(
&
buffer
,
dims_y_
);
}
nvinfer1
::
ElementWiseOperation
type_
;
nvinfer1
::
Dims
dims_x_
;
nvinfer1
::
Dims
dims_y_
;
int
axis_
;
int
prev_size_
;
int
midd_size_
;
int
post_size_
;
};
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
浏览文件 @
25adf970
...
...
@@ -20,6 +20,7 @@
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
static
const
int
CUDA_NUM_THREADS
=
1024
;
static
const
int
CUDA_MAX_NUM_BLOCKS
=
65535
;
...
...
@@ -126,6 +127,7 @@ int PReluPlugin::enqueue(int batchSize, const void *const *inputs,
return
cudaGetLastError
()
!=
cudaSuccess
;
}
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
浏览文件 @
25adf970
...
...
@@ -21,6 +21,7 @@
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
class
PReluPlugin
:
public
PluginTensorRT
{
TensorRTEngine
::
Weight
alpha_
;
...
...
@@ -63,6 +64,7 @@ class PReluPlugin : public PluginTensorRT {
void
*
workspace
,
cudaStream_t
stream
)
override
;
};
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/serialize.h
浏览文件 @
25adf970
...
...
@@ -14,10 +14,15 @@
#pragma once
#include <cassert>
#include <cstring>
#include <type_traits>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
template
<
typename
T
>
inline
void
SerializeValue
(
void
**
buffer
,
T
const
&
value
);
...
...
@@ -26,7 +31,7 @@ template <typename T>
inline
void
DeserializeValue
(
void
const
**
buffer
,
size_t
*
buffer_size
,
T
*
value
);
namespace
{
namespace
details
{
template
<
typename
T
,
class
Enable
=
void
>
struct
Serializer
{};
...
...
@@ -36,10 +41,12 @@ struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
std
::
is_enum
<
T
>::
value
||
std
::
is_pod
<
T
>::
value
>::
type
>
{
static
size_t
SerializedSize
(
T
const
&
value
)
{
return
sizeof
(
T
);
}
static
void
Serialize
(
void
**
buffer
,
T
const
&
value
)
{
std
::
memcpy
(
*
buffer
,
&
value
,
sizeof
(
T
));
reinterpret_cast
<
char
*&>
(
*
buffer
)
+=
sizeof
(
T
);
}
static
void
Deserialize
(
void
const
**
buffer
,
size_t
*
buffer_size
,
T
*
value
)
{
assert
(
*
buffer_size
>=
sizeof
(
T
));
std
::
memcpy
(
value
,
*
buffer
,
sizeof
(
T
));
...
...
@@ -51,10 +58,12 @@ struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
template
<
>
struct
Serializer
<
const
char
*>
{
static
size_t
SerializedSize
(
const
char
*
value
)
{
return
strlen
(
value
)
+
1
;
}
static
void
Serialize
(
void
**
buffer
,
const
char
*
value
)
{
std
::
strcpy
(
static_cast
<
char
*>
(
*
buffer
),
value
);
std
::
strcpy
(
static_cast
<
char
*>
(
*
buffer
),
value
);
// NOLINT
reinterpret_cast
<
char
*&>
(
*
buffer
)
+=
strlen
(
value
)
+
1
;
}
static
void
Deserialize
(
void
const
**
buffer
,
size_t
*
buffer_size
,
const
char
**
value
)
{
*
value
=
static_cast
<
char
const
*>
(
*
buffer
);
...
...
@@ -73,39 +82,46 @@ struct Serializer<std::vector<T>,
static
size_t
SerializedSize
(
std
::
vector
<
T
>
const
&
value
)
{
return
sizeof
(
value
.
size
())
+
value
.
size
()
*
sizeof
(
T
);
}
static
void
Serialize
(
void
**
buffer
,
std
::
vector
<
T
>
const
&
value
)
{
SerializeValue
(
buffer
,
value
.
size
());
size_t
nbyte
=
value
.
size
()
*
sizeof
(
T
);
std
::
memcpy
(
*
buffer
,
value
.
data
(),
nbyte
);
reinterpret_cast
<
char
*&>
(
*
buffer
)
+=
nbyte
;
}
static
void
Deserialize
(
void
const
**
buffer
,
size_t
*
buffer_size
,
std
::
vector
<
T
>*
value
)
{
size_t
size
;
DeserializeValue
(
buffer
,
buffer_size
,
&
size
);
value
->
resize
(
size
);
size_t
nbyte
=
value
->
size
()
*
sizeof
(
T
);
assert
(
*
buffer_size
>=
nbyte
);
PADDLE_ENFORCE_GE
(
*
buffer_size
,
nbyte
);
std
::
memcpy
(
value
->
data
(),
*
buffer
,
nbyte
);
reinterpret_cast
<
char
const
*&>
(
*
buffer
)
+=
nbyte
;
*
buffer_size
-=
nbyte
;
}
};
}
// namespace
}
// namespace
details
template
<
typename
T
>
inline
size_t
SerializedSize
(
T
const
&
value
)
{
return
Serializer
<
T
>::
SerializedSize
(
value
);
return
details
::
Serializer
<
T
>::
SerializedSize
(
value
);
}
template
<
typename
T
>
inline
void
SerializeValue
(
void
**
buffer
,
T
const
&
value
)
{
return
Serializer
<
T
>::
Serialize
(
buffer
,
value
);
return
details
::
Serializer
<
T
>::
Serialize
(
buffer
,
value
);
}
template
<
typename
T
>
inline
void
DeserializeValue
(
void
const
**
buffer
,
size_t
*
buffer_size
,
T
*
value
)
{
return
Serializer
<
T
>::
Deserialize
(
buffer
,
buffer_size
,
value
);
return
details
::
Serializer
<
T
>::
Deserialize
(
buffer
,
buffer_size
,
value
);
}
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
浏览文件 @
25adf970
...
...
@@ -12,26 +12,26 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <cassert>
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
nvinfer1
::
Dims
SplitPlugin
::
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
inputDims
,
int
nbInputs
)
{
assert
(
nbInputs
==
1
);
assert
(
index
<
this
->
getNbOutputs
());
nvinfer1
::
Dims
const
&
input_dims
=
inputDims
[
0
];
nvinfer1
::
Dims
output_dims
=
input_dims
;
nvinfer1
::
Dims
SplitPlugin
::
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
input_dims
,
int
num_inputs
)
{
PADDLE_ENFORCE_EQ
(
num_inputs
,
1
);
PADDLE_ENFORCE_LT
(
index
,
this
->
getNbOutputs
());
nvinfer1
::
Dims
output_dims
=
input_dims
[
0
];
output_dims
.
d
[
axis_
]
=
output_length_
.
at
(
index
);
return
output_dims
;
}
int
SplitPlugin
::
initialize
()
{
PADDLE_ENFORCE_LE
(
axis_
,
nvinfer1
::
Dims
::
MAX_DIMS
);
std
::
vector
<
int
>
segment_offsets
(
1
,
0
);
for
(
int
i
=
0
;
i
<
this
->
getNbOutputs
();
++
i
)
{
segment_offsets
.
push_back
(
segment_offsets
.
back
()
+
output_length_
[
i
]);
...
...
@@ -76,6 +76,7 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
return
cudaGetLastError
()
!=
cudaSuccess
;
}
}
// tensorrt
}
// inference
}
// paddle
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
浏览文件 @
25adf970
...
...
@@ -14,61 +14,58 @@
#pragma once
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
class
SplitPlugin
:
public
PluginTensorRT
{
int
axis_
;
std
::
vector
<
int
>
output_length_
;
int
nx_
,
ny_
,
nz_
;
std
::
vector
<
int
>
segment_offsets_
;
public:
SplitPlugin
(
int
axis
,
std
::
vector
<
int
>
const
&
output_lengths
)
:
axis_
(
axis
),
output_length_
(
output_lengths
)
{}
SplitPlugin
(
void
const
*
serial_data
,
size_t
serial_length
)
{
deserializeBase
(
serial_data
,
serial_length
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
axis_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
output_length_
);
}
SplitPlugin
*
clone
()
const
override
{
return
new
SplitPlugin
(
axis_
,
output_length_
);
}
const
char
*
getPluginType
()
const
override
{
return
"split"
;
}
int
getNbOutputs
()
const
override
{
return
output_length_
.
size
();
}
nvinfer1
::
Dims
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
input_dims
,
int
num_inputs
)
override
;
int
initialize
()
override
;
int
enqueue
(
int
batchSize
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
override
;
protected:
virtual
size_t
getSerializationSize
()
override
{
size_t
getSerializationSize
()
override
{
return
SerializedSize
(
axis_
)
+
SerializedSize
(
output_length_
)
+
getBaseSerializationSize
();
}
// TRT will call this func when we need to serialize the configuration of
// tensorrt.
// It should not be called by users.
virtual
void
serialize
(
void
*
buffer
)
override
{
void
serialize
(
void
*
buffer
)
override
{
serializeBase
(
buffer
);
SerializeValue
(
&
buffer
,
axis_
);
SerializeValue
(
&
buffer
,
output_length_
);
}
public:
SplitPlugin
(
int
axis
,
std
::
vector
<
int
>
const
&
output_lengths
)
:
axis_
(
axis
),
output_length_
(
output_lengths
)
{
assert
(
axis
<=
nvinfer1
::
Dims
::
MAX_DIMS
);
}
// It was used for tensorrt deserialization.
// It should not be called by users.
SplitPlugin
(
void
const
*
serialData
,
size_t
serialLength
)
{
deserializeBase
(
serialData
,
serialLength
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
axis_
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
output_length_
);
}
SplitPlugin
*
clone
()
const
override
{
return
new
SplitPlugin
(
axis_
,
output_length_
);
}
virtual
const
char
*
getPluginType
()
const
override
{
return
"split"
;
}
virtual
int
getNbOutputs
()
const
override
{
return
output_length_
.
size
();
}
virtual
nvinfer1
::
Dims
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
inputs
,
int
nbInputDims
)
override
;
virtual
int
initialize
()
override
;
virtual
int
enqueue
(
int
batchSize
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
override
;
int
axis_
;
std
::
vector
<
int
>
output_length_
;
int
nx_
,
ny_
,
nz_
;
std
::
vector
<
int
>
segment_offsets_
;
};
}
// tensorrt
}
// inference
}
// paddle
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
浏览文件 @
25adf970
...
...
@@ -17,6 +17,7 @@
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
void
PluginTensorRT
::
serializeBase
(
void
*&
buffer
)
{
SerializeValue
(
&
buffer
,
input_dims_
);
...
...
@@ -25,12 +26,12 @@ void PluginTensorRT::serializeBase(void*& buffer) {
SerializeValue
(
&
buffer
,
data_format_
);
}
void
PluginTensorRT
::
deserializeBase
(
void
const
*&
serial
D
ata
,
size_t
&
serial
L
ength
)
{
DeserializeValue
(
&
serial
Data
,
&
serialL
ength
,
&
input_dims_
);
DeserializeValue
(
&
serial
Data
,
&
serialL
ength
,
&
max_batch_size_
);
DeserializeValue
(
&
serial
Data
,
&
serialL
ength
,
&
data_type_
);
DeserializeValue
(
&
serial
Data
,
&
serialL
ength
,
&
data_format_
);
void
PluginTensorRT
::
deserializeBase
(
void
const
*&
serial
_d
ata
,
size_t
&
serial
_l
ength
)
{
DeserializeValue
(
&
serial
_data
,
&
serial_l
ength
,
&
input_dims_
);
DeserializeValue
(
&
serial
_data
,
&
serial_l
ength
,
&
max_batch_size_
);
DeserializeValue
(
&
serial
_data
,
&
serial_l
ength
,
&
data_type_
);
DeserializeValue
(
&
serial
_data
,
&
serial_l
ength
,
&
data_format_
);
}
size_t
PluginTensorRT
::
getBaseSerializationSize
()
{
...
...
@@ -44,18 +45,17 @@ bool PluginTensorRT::supportsFormat(nvinfer1::DataType type,
(
format
==
nvinfer1
::
PluginFormat
::
kNCHW
));
}
void
PluginTensorRT
::
configureWithFormat
(
const
nvinfer1
::
Dims
*
inputDims
,
int
nbInputs
,
const
nvinfer1
::
Dims
*
outputDims
,
int
nbOutputs
,
nvinfer1
::
DataType
type
,
nvinfer1
::
PluginFormat
format
,
int
maxBatchSize
)
{
void
PluginTensorRT
::
configureWithFormat
(
const
nvinfer1
::
Dims
*
input_dims
,
int
num_inputs
,
const
nvinfer1
::
Dims
*
output_dims
,
int
num_outputs
,
nvinfer1
::
DataType
type
,
nvinfer1
::
PluginFormat
format
,
int
max_batch_size
)
{
data_type_
=
type
;
data_format_
=
format
;
input_dims_
.
assign
(
input
Dims
,
inputDims
+
nbI
nputs
);
max_batch_size_
=
max
BatchS
ize
;
input_dims_
.
assign
(
input
_dims
,
input_dims
+
num_i
nputs
);
max_batch_size_
=
max
_batch_s
ize
;
}
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
浏览文件 @
25adf970
...
...
@@ -14,23 +14,30 @@
#pragma once
#include <
cassert
>
#include <
NvInfer.h
>
#include <cstring>
#include <iostream>
#include <unordered_map>
#include <vector>
#include "NvInfer.h"
#include "paddle/fluid/inference/tensorrt/plugin/serialize.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool
(
profile
);
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
class
PluginTensorRT
:
public
nvinfer1
::
IPluginExt
{
public:
PluginTensorRT
()
{}
// It was used for TensorRT deserialization.
// It should not be called by users.
PluginTensorRT
(
const
void
*
serialized_data
,
size_t
length
)
{}
virtual
~
PluginTensorRT
()
{}
nvinfer1
::
Dims
const
&
getInputDims
(
int
index
)
const
{
return
input_dims_
.
at
(
index
);
}
...
...
@@ -38,43 +45,66 @@ class PluginTensorRT : public nvinfer1::IPluginExt {
nvinfer1
::
DataType
getDataType
()
const
{
return
data_type_
;
}
nvinfer1
::
PluginFormat
getDataFormat
()
const
{
return
data_format_
;
}
virtual
const
char
*
getPluginVersion
()
const
{
return
"1"
;
}
void
AddInput
(
nvinfer1
::
ITensor
*
input
)
{
inputs_
.
push_back
(
input
);
}
std
::
vector
<
nvinfer1
::
ITensor
*>&
GetInputs
()
{
return
inputs_
;
}
virtual
nvinfer1
::
IPluginExt
*
clone
()
const
=
0
;
virtual
const
char
*
getPluginType
()
const
=
0
;
// Following functions are inherit from nvinfer1::IPluginExt
// Get the number of outputs from the layer
int
getNbOutputs
()
const
{
return
1
;
}
// Get the dimension of an output tensor
virtual
nvinfer1
::
Dims
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
input_dims
,
int
num_inputs
)
=
0
;
// Find the workspace size required by the layer
size_t
getWorkspaceSize
(
int
)
const
override
{
return
0
;
}
// Initialize the layer for execution.
// This is called when the engine is created.
int
initialize
()
override
{
return
0
;
}
// Shutdown the layer. This is called when the engine is destroyed
void
terminate
()
override
{}
virtual
~
PluginTensorRT
()
{}
// Execute the layer
virtual
int
enqueue
(
int
batch_size
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
=
0
;
// Find the size of the serialization buffer required
virtual
size_t
getSerializationSize
()
=
0
;
// Serialize the layer config to buffer.
// TensorRT will call this func to serialize the configuration of TensorRT
// engine. It should not be called by users.
virtual
void
serialize
(
void
*
buffer
)
=
0
;
// Check format support. The default is FLOAT32 and NCHW.
bool
supportsFormat
(
nvinfer1
::
DataType
type
,
nvinfer1
::
PluginFormat
format
)
const
override
;
void
configureWithFormat
(
const
nvinfer1
::
Dims
*
inputDims
,
int
nbInputs
,
const
nvinfer1
::
Dims
*
outputDims
,
int
nbOutputs
,
// Configure the layer
void
configureWithFormat
(
const
nvinfer1
::
Dims
*
input_dims
,
int
num_inputs
,
const
nvinfer1
::
Dims
*
output_dims
,
int
num_outputs
,
nvinfer1
::
DataType
type
,
nvinfer1
::
PluginFormat
format
,
int
maxBatchSize
)
override
;
// *NOTE* The following functions need to be overrided in the subclass.
virtual
nvinfer1
::
IPluginExt
*
clone
()
const
=
0
;
virtual
const
char
*
getPluginType
()
const
=
0
;
// Initialize the layer for execution. This is called when the engine is
// created.
int
initialize
()
override
{
return
0
;
}
// Serialize the layer config to buffer.
virtual
void
serialize
(
void
*
buffer
)
=
0
;
virtual
size_t
getSerializationSize
()
=
0
;
virtual
int
enqueue
(
int
batchSize
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
=
0
;
int
max_batch_size
)
override
;
protected:
// Deserialize input_dims, max_batch_size, data_type, data_format
void
deserializeBase
(
void
const
*&
serialData
,
size_t
&
serialLength
);
void
deserializeBase
(
void
const
*&
serial_data
,
// NOLINT
size_t
&
serial_length
);
// NOLINT
size_t
getBaseSerializationSize
();
// Serialize input_dims, max_batch_size, data_type, data_format
void
serializeBase
(
void
*&
buffer
);
void
serializeBase
(
void
*&
buffer
);
// NOLINT
std
::
vector
<
nvinfer1
::
Dims
>
input_dims_
;
size_t
max_batch_size_
;
nvinfer1
::
DataType
data_type_
;
nvinfer1
::
PluginFormat
data_format_
;
std
::
vector
<
nvinfer1
::
ITensor
*>
inputs_
;
};
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/tester_helper.h
浏览文件 @
25adf970
...
...
@@ -51,7 +51,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
LOG
(
INFO
)
<<
*
reinterpret_cast
<
const
contrib
::
AnalysisConfig
*>
(
config
);
return
;
}
LOG
(
INFO
)
<<
*
config
;
LOG
(
INFO
)
<<
*
reinterpret_cast
<
const
NativeConfig
*>
(
config
)
;
}
void
CompareResult
(
const
std
::
vector
<
PaddleTensor
>
&
outputs
,
...
...
paddle/fluid/operators/math/jit_code.cc
浏览文件 @
25adf970
...
...
@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_code.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/operators/math/jit_kernel.h" // TODO(TJ): remove me
namespace
paddle
{
namespace
operators
{
...
...
@@ -60,257 +59,83 @@ void VXXJitCode::generate() {
offset
+=
sizeof
(
float
)
*
YMM_FLOAT_BLOCK
;
}
int
rest
=
num_
%
YMM_FLOAT_BLOCK
;
if
(
rest
>=
4
)
{
if
(
scalar_index_
!=
1
)
{
vmovups
(
xmm_src1
,
ptr
[
param1
+
offset
]);
}
if
(
scalar_index_
!=
2
)
{
vmovups
(
xmm_src2
,
ptr
[
param2
+
offset
]);
}
if
(
type_
==
operand_type
::
mul
)
{
vmulps
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
}
else
if
(
type_
==
operand_type
::
add
)
{
vaddps
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
}
if
(
with_relu_
)
{
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_dst
);
}
vmovups
(
ptr
[
param3
+
offset
],
xmm_dst
);
offset
+=
sizeof
(
float
)
*
4
;
rest
-=
4
;
}
if
(
rest
>=
2
)
{
if
(
scalar_index_
!=
1
)
{
vmovups
(
xmm_src1
,
ptr
[
param1
+
offset
]);
while
(
rest
>
0
)
{
int
block
=
XMM_FLOAT_BLOCK
;
if
(
rest
>=
4
)
{
block
=
4
;
if
(
scalar_index_
!=
1
)
{
vmovups
(
xmm_src1
,
ptr
[
param1
+
offset
]);
}
if
(
scalar_index_
!=
2
)
{
vmovups
(
xmm_src2
,
ptr
[
param2
+
offset
]);
}
}
else
if
(
rest
>=
2
)
{
block
=
2
;
if
(
scalar_index_
!=
1
)
{
vmovq
(
xmm_src1
,
ptr
[
param1
+
offset
]);
}
if
(
scalar_index_
!=
2
)
{
vmovq
(
xmm_src2
,
ptr
[
param2
+
offset
]);
}
}
else
{
block
=
1
;
if
(
scalar_index_
!=
1
)
{
vmovss
(
xmm_src1
,
ptr
[
param1
+
offset
]);
}
if
(
scalar_index_
!=
2
)
{
vmovss
(
xmm_src2
,
ptr
[
param2
+
offset
]);
}
}
if
(
scalar_index_
!=
2
)
{
vmovups
(
xmm_src2
,
ptr
[
param2
+
offset
]);
}
if
(
type_
==
operand_type
::
mul
)
{
vmulps
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
}
else
if
(
type_
==
operand_type
::
add
)
{
vaddps
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
switch
(
type_
)
{
case
operand_type
::
mul
:
vmulps
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
break
;
case
operand_type
::
add
:
vaddps
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
break
;
default:
break
;
}
if
(
with_relu_
)
{
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_dst
);
}
vmovq
(
ptr
[
param3
+
offset
],
xmm_dst
);
offset
+=
sizeof
(
float
)
*
2
;
rest
-=
2
;
}
if
(
rest
>
0
)
{
if
(
scalar_index_
!=
1
)
{
vmovups
(
xmm_src1
,
ptr
[
param1
+
offset
]);
}
if
(
scalar_index_
!=
2
)
{
vmovups
(
xmm_src2
,
ptr
[
param2
+
offset
]);
}
if
(
type_
==
operand_type
::
mul
)
{
vmulss
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
}
else
if
(
type_
==
operand_type
::
add
)
{
vaddss
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
if
(
rest
>=
4
)
{
vmovups
(
ptr
[
param3
+
offset
],
xmm_dst
);
}
else
if
(
rest
>=
2
)
{
vmovq
(
ptr
[
param3
+
offset
],
xmm_dst
);
}
else
{
vmovss
(
ptr
[
param3
+
offset
],
xmm_dst
);
}
if
(
with_relu_
)
{
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_dst
);
}
vmovss
(
ptr
[
param3
+
offset
],
xmm_dst
);
offset
+=
sizeof
(
float
)
*
block
;
rest
-=
block
;
}
ret
();
}
#define ALIGN32 __attribute__((aligned(32)))
#define EXP_HIG 88.3762626647949f
#define EXP_LOW -88.3762626647949f
#define CEPHES_LOG2EF 1.44269504088896341
#define CEPHES_EXP_C1 0.693359375
#define CEPHES_EXP_C2 -2.12194440e-4
#define CEPHES_EXP_P0 1.9875691500E-4
#define CEPHES_EXP_P1 1.3981999507E-3
#define CEPHES_EXP_P2 8.3334519073E-3
#define CEPHES_EXP_P3 4.1665795894E-2
#define CEPHES_EXP_P4 1.6666665459E-1
#define CEPHES_EXP_P5 5.0000001201E-1
#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val
#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float)
static
const
float
exp_float_consts
[]
ALIGN32
=
{
REPEAT_8TIMES
(
1.
f
),
REPEAT_8TIMES
(
2.
f
),
REPEAT_8TIMES
(
0.5
f
),
REPEAT_8TIMES
(
EXP_HIG
),
REPEAT_8TIMES
(
EXP_LOW
),
REPEAT_8TIMES
(
CEPHES_LOG2EF
),
REPEAT_8TIMES
(
CEPHES_EXP_C1
),
REPEAT_8TIMES
(
CEPHES_EXP_C2
),
REPEAT_8TIMES
(
CEPHES_EXP_P0
),
REPEAT_8TIMES
(
CEPHES_EXP_P1
),
REPEAT_8TIMES
(
CEPHES_EXP_P2
),
REPEAT_8TIMES
(
CEPHES_EXP_P3
),
REPEAT_8TIMES
(
CEPHES_EXP_P4
),
REPEAT_8TIMES
(
CEPHES_EXP_P5
),
REPEAT_8TIMES
(
EXP_MAX_INPUT
),
REPEAT_8TIMES
(
SIGMOID_THRESHOLD_MAX
),
REPEAT_8TIMES
(
SIGMOID_THRESHOLD_MIN
)};
static
const
int
exp_int_0x7f
[]
ALIGN32
=
{
REPEAT_8TIMES
(
0x7f
)};
static
int
g_tmp_mem
[
16
]
ALIGN32
=
{
0
};
const
float
exp_float_consts
[]
ALIGN32
=
{
REPEAT_8TIMES
(
1.
f
),
REPEAT_8TIMES
(
2.
f
),
REPEAT_8TIMES
(
0.5
f
),
REPEAT_8TIMES
(
EXP_HIG
),
REPEAT_8TIMES
(
EXP_LOW
),
REPEAT_8TIMES
(
CEPHES_LOG2EF
),
REPEAT_8TIMES
(
CEPHES_EXP_C1
),
REPEAT_8TIMES
(
CEPHES_EXP_C2
),
REPEAT_8TIMES
(
CEPHES_EXP_P0
),
REPEAT_8TIMES
(
CEPHES_EXP_P1
),
REPEAT_8TIMES
(
CEPHES_EXP_P2
),
REPEAT_8TIMES
(
CEPHES_EXP_P3
),
REPEAT_8TIMES
(
CEPHES_EXP_P4
),
REPEAT_8TIMES
(
CEPHES_EXP_P5
),
REPEAT_8TIMES
(
EXP_MAX_INPUT
),
REPEAT_8TIMES
(
SIGMOID_THRESHOLD_MAX
),
REPEAT_8TIMES
(
SIGMOID_THRESHOLD_MIN
)};
const
int
exp_int_0x7f
[]
ALIGN32
=
{
REPEAT_8TIMES
(
0x7f
)};
int
g_tmp_mem
[
16
]
ALIGN32
=
{
0
};
bool
VActJitCode
::
init
(
int
d
,
operand_type
type
)
{
bool
ok
=
MayIUse
(
avx
);
if
(
type
==
operand_type
::
relu
)
{
return
ok
;
}
else
if
(
type
==
operand_type
::
exp
)
{
// exp is slower than mkl when d >= 256
return
ok
&&
d
%
8
==
0
&&
d
<
256
;
}
else
{
// TODO(TJ): support more
return
ok
&&
d
%
8
==
0
;
}
}
void
VActJitCode
::
relu_ymm
(
ymm_t
&
ymm_dst
,
ymm_t
&
ymm_src
,
ymm_t
&
ymm_zero
)
{
vmaxps
(
ymm_dst
,
ymm_zero
,
ymm_src
);
}
void
VActJitCode
::
exp_ymm
(
ymm_t
&
ymm_dst
,
ymm_t
&
ymm_src
,
int
fx_idx
,
int
fy_idx
,
int
mask_idx
,
int
tmp_idx
)
{
assert
(
ymm_src
.
getIdx
()
!=
ymm_dst
.
getIdx
());
// TODO(TJ): use enfore
// check all idx can not equal
ymm_t
ymm_fx
=
ymm_t
(
fx_idx
);
ymm_t
ymm_fy
=
ymm_t
(
fy_idx
);
ymm_t
ymm_mask
=
ymm_t
(
mask_idx
);
ymm_t
ymm_tmp
=
ymm_t
(
tmp_idx
);
reg64_t
reg_ptr_global
=
rax
;
push
(
reg_ptr_global
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_HIG
]);
vminps
(
ymm_src
,
ymm_src
,
ymm_tmp
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_LOW
]);
vmaxps
(
ymm_src
,
ymm_src
,
ymm_tmp
);
// express exp(x) as exp(g + n*log(2))
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_LOG2EF
]);
vmulps
(
ymm_fx
,
ymm_src
,
ymm_tmp
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_0P5
]);
vaddps
(
ymm_fx
,
ymm_fx
,
ymm_tmp
);
vroundps
(
ymm_fy
,
ymm_fx
,
0x01
);
// if greater, substract 1
vcmpgtps
(
ymm_mask
,
ymm_fy
,
ymm_fx
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
]);
vandps
(
ymm_mask
,
ymm_mask
,
ymm_tmp
);
vsubps
(
ymm_fx
,
ymm_fy
,
ymm_mask
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_C1
]);
vmulps
(
ymm_fy
,
ymm_fx
,
ymm_tmp
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_C2
]);
ymm_t
ymm_z
=
ymm_t
(
ymm_mask
.
getIdx
());
vmulps
(
ymm_z
,
ymm_fx
,
ymm_tmp
);
vsubps
(
ymm_src
,
ymm_src
,
ymm_fy
);
vsubps
(
ymm_src
,
ymm_src
,
ymm_z
);
vmulps
(
ymm_z
,
ymm_src
,
ymm_src
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_P0
]);
vmulps
(
ymm_dst
,
ymm_src
,
ymm_tmp
);
for
(
size_t
i
=
OFFSET_EXP_P1
;
i
<
OFFSET_EXP_P5
;
i
+=
(
YMM_FLOAT_BLOCK
*
sizeof
(
float
)))
{
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
i
]);
// P1~P4
vaddps
(
ymm_dst
,
ymm_dst
,
ymm_tmp
);
vmulps
(
ymm_dst
,
ymm_dst
,
ymm_src
);
}
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_P5
]);
vaddps
(
ymm_dst
,
ymm_dst
,
ymm_tmp
);
vmulps
(
ymm_dst
,
ymm_dst
,
ymm_z
);
vaddps
(
ymm_dst
,
ymm_dst
,
ymm_src
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
]);
vaddps
(
ymm_dst
,
ymm_dst
,
ymm_tmp
);
// build 2^n
ymm_t
ymm_int
=
ymm_fx
;
vcvttps2dq
(
ymm_int
,
ymm_fx
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_int_0x7f
));
vmovdqa
(
ymm_tmp
,
ptr
[
reg_ptr_global
]);
if
(
MayIUse
(
avx2
))
{
vpaddd
(
ymm_int
,
ymm_int
,
ymm_tmp
);
vpslld
(
ymm_int
,
ymm_int
,
23
);
}
else
if
(
MayIUse
(
avx
))
{
xmm_t
xtmp1
=
xmm_t
(
ymm_int
.
getIdx
());
xmm_t
xtmp2
=
xmm_t
(
ymm_tmp
.
getIdx
());
reg64_t
reg_ptr_tmp
=
reg_ptr_global
;
mov
(
reg_ptr_tmp
,
reinterpret_cast
<
size_t
>
(
g_tmp_mem
));
vmovdqa
(
ptr
[
reg_ptr_tmp
],
ymm_int
);
vmovdqa
(
ptr
[
reg_ptr_tmp
+
YMM_FLOAT_BLOCK
*
sizeof
(
float
)],
ymm_tmp
);
vpaddd
(
xtmp1
,
xtmp1
,
xtmp2
);
vpslld
(
xtmp1
,
xtmp1
,
23
);
vmovdqa
(
ptr
[
reg_ptr_tmp
],
xtmp1
);
// next 128bits
vmovdqa
(
xtmp1
,
ptr
[
reg_ptr_tmp
+
4
/*xmm float block*/
*
sizeof
(
float
)]);
vmovdqa
(
xtmp2
,
ptr
[
reg_ptr_tmp
+
(
YMM_FLOAT_BLOCK
+
4
/*xmm float block*/
)
*
sizeof
(
float
)]);
vpaddd
(
xtmp1
,
xtmp1
,
xtmp2
);
vpslld
(
xtmp1
,
xtmp1
,
23
);
vmovdqa
(
ptr
[
reg_ptr_tmp
+
4
/*xmm float block*/
*
sizeof
(
float
)],
xtmp1
);
// load out
vmovdqa
(
ymm_int
,
ptr
[
reg_ptr_tmp
]);
}
vmulps
(
ymm_dst
,
ymm_dst
,
ymm_int
);
pop
(
reg_ptr_global
);
}
void
VActJitCode
::
sigmoid_ymm
(
ymm_t
&
ymm_dst
,
ymm_t
&
ymm_src
,
int
fx_idx
,
int
fy_idx
,
int
mask_idx
,
int
tmp_idx
)
{
// y = 1 / (1 + e^-x)
ymm_t
ymm_tmp
=
ymm_t
(
tmp_idx
);
reg64_t
reg_ptr_global
=
rax
;
push
(
reg_ptr_global
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_SIGMOID_MAX
]);
vminps
(
ymm_src
,
ymm_src
,
ymm_tmp
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_SIGMOID_MIN
]);
vmaxps
(
ymm_src
,
ymm_src
,
ymm_tmp
);
vxorps
(
ymm_tmp
,
ymm_tmp
,
ymm_tmp
);
vsubps
(
ymm_src
,
ymm_tmp
,
ymm_src
);
exp_ymm
(
ymm_dst
,
ymm_src
,
fx_idx
,
fy_idx
,
mask_idx
,
tmp_idx
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_ONE
]);
vaddps
(
ymm_dst
,
ymm_dst
,
ymm_tmp
);
vdivps
(
ymm_dst
,
ymm_tmp
,
ymm_dst
);
pop
(
reg_ptr_global
);
}
void
VActJitCode
::
tanh_ymm
(
ymm_t
&
ymm_dst
,
ymm_t
&
ymm_src
,
int
fx_idx
,
int
fy_idx
,
int
mask_idx
,
int
tmp_idx
)
{
// y = 2 / (1 + e^(-2x)) - 1
ymm_t
ymm_tmp
=
ymm_t
(
tmp_idx
);
ymm_t
ymm_zero
=
ymm_t
(
mask_idx
);
reg64_t
reg_ptr_global
=
rax
;
push
(
reg_ptr_global
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_TWO
]);
vxorps
(
ymm_zero
,
ymm_zero
,
ymm_zero
);
vsubps
(
ymm_tmp
,
ymm_zero
,
ymm_tmp
);
vmulps
(
ymm_src
,
ymm_src
,
ymm_tmp
);
exp_ymm
(
ymm_dst
,
ymm_src
,
fx_idx
,
fy_idx
,
mask_idx
,
tmp_idx
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_ONE
]);
vaddps
(
ymm_dst
,
ymm_dst
,
ymm_tmp
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_TWO
]);
vdivps
(
ymm_dst
,
ymm_tmp
,
ymm_dst
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_ONE
]);
vsubps
(
ymm_dst
,
ymm_dst
,
ymm_tmp
);
pop
(
reg_ptr_global
);
// TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256
return
MayIUse
(
avx
);
}
void
VActJitCode
::
generate
()
{
...
...
@@ -324,16 +149,16 @@ void VActJitCode::generate() {
vmovups
(
ymm_src
,
ptr
[
param1
+
offset
]);
switch
(
type_
)
{
case
operand_type
::
relu
:
relu_
ymm
(
ymm_dst
,
ymm_src
,
ymm_zero
);
relu_
jmm
<
ymm_t
>
(
ymm_dst
,
ymm_src
,
ymm_zero
);
break
;
case
operand_type
::
exp
:
exp_
ymm
(
ymm_dst
,
ymm_src
,
2
,
3
,
4
,
5
);
exp_
jmm
<
ymm_t
>
(
ymm_dst
,
ymm_src
,
2
,
3
,
4
,
5
);
break
;
case
operand_type
::
sigmoid
:
sigmoid_
ymm
(
ymm_dst
,
ymm_src
,
2
,
3
,
4
,
5
);
sigmoid_
jmm
<
ymm_t
>
(
ymm_dst
,
ymm_src
,
2
,
3
,
4
,
5
);
break
;
case
operand_type
::
tanh
:
tanh_
ymm
(
ymm_dst
,
ymm_src
,
2
,
3
,
4
,
5
);
tanh_
jmm
<
ymm_t
>
(
ymm_dst
,
ymm_src
,
2
,
3
,
4
,
5
);
break
;
case
operand_type
::
identity
:
break
;
...
...
@@ -343,30 +168,44 @@ void VActJitCode::generate() {
vmovups
(
ptr
[
param2
+
offset
],
ymm_dst
);
offset
+=
sizeof
(
float
)
*
YMM_FLOAT_BLOCK
;
}
if
(
type_
!=
operand_type
::
relu
)
{
// TODO(TJ): remove me
ret
();
return
;
}
int
rest
=
num_
%
YMM_FLOAT_BLOCK
;
if
(
rest
>=
4
)
{
vmovups
(
xmm_src
,
ptr
[
param1
+
offset
]);
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_src
);
vmovups
(
ptr
[
param2
+
offset
],
xmm_dst
);
offset
+=
sizeof
(
float
)
*
4
;
rest
-=
4
;
}
if
(
rest
>=
2
)
{
vmovups
(
xmm_src
,
ptr
[
param1
+
offset
]);
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_src
);
vmovq
(
ptr
[
param2
+
offset
],
xmm_dst
);
offset
+=
sizeof
(
float
)
*
2
;
rest
-=
2
;
}
if
(
rest
>
0
)
{
vmovups
(
xmm_src
,
ptr
[
param1
+
offset
]);
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_src
);
vmovss
(
ptr
[
param2
+
offset
],
xmm_dst
);
while
(
rest
>
0
)
{
int
block
=
XMM_FLOAT_BLOCK
;
if
(
rest
>=
4
)
{
block
=
4
;
vmovups
(
xmm_src
,
ptr
[
param1
+
offset
]);
}
else
if
(
rest
>=
2
)
{
block
=
2
;
vmovq
(
xmm_src
,
ptr
[
param1
+
offset
]);
}
else
{
block
=
1
;
vmovss
(
xmm_src
,
ptr
[
param1
+
offset
]);
}
switch
(
type_
)
{
case
operand_type
::
relu
:
relu_jmm
<
xmm_t
>
(
xmm_dst
,
xmm_src
,
xmm_zero
);
break
;
case
operand_type
::
exp
:
exp_jmm
<
xmm_t
>
(
xmm_dst
,
xmm_src
,
2
,
3
,
4
,
5
);
break
;
case
operand_type
::
sigmoid
:
sigmoid_jmm
<
xmm_t
>
(
xmm_dst
,
xmm_src
,
2
,
3
,
4
,
5
);
break
;
case
operand_type
::
tanh
:
tanh_jmm
<
xmm_t
>
(
xmm_dst
,
xmm_src
,
2
,
3
,
4
,
5
);
break
;
default:
break
;
}
if
(
rest
>=
4
)
{
vmovups
(
ptr
[
param2
+
offset
],
xmm_dst
);
}
else
if
(
rest
>=
2
)
{
vmovq
(
ptr
[
param2
+
offset
],
xmm_dst
);
}
else
{
vmovss
(
ptr
[
param2
+
offset
],
xmm_dst
);
}
offset
+=
sizeof
(
float
)
*
block
;
rest
-=
block
;
}
ret
();
}
...
...
paddle/fluid/operators/math/jit_code.h
浏览文件 @
25adf970
...
...
@@ -16,6 +16,8 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/math/jit_gen.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
...
...
@@ -40,6 +42,51 @@ typedef enum {
identity
}
operand_type
;
extern
const
float
exp_float_consts
[];
extern
const
int
exp_int_0x7f
[];
extern
int
g_tmp_mem
[];
// TODO(TJ): move these to some proper place
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
#define XMM_FLOAT_BLOCK 4
#define YMM_FLOAT_BLOCK 8
#define ZMM_FLOAT_BLOCK 16
#define ALIGN32 __attribute__((aligned(32)))
#define EXP_HIG 88.3762626647949f
#define EXP_LOW -88.3762626647949f
#define CEPHES_LOG2EF 1.44269504088896341
#define CEPHES_EXP_C1 0.693359375
#define CEPHES_EXP_C2 -2.12194440e-4
#define CEPHES_EXP_P0 1.9875691500E-4
#define CEPHES_EXP_P1 1.3981999507E-3
#define CEPHES_EXP_P2 8.3334519073E-3
#define CEPHES_EXP_P3 4.1665795894E-2
#define CEPHES_EXP_P4 1.6666665459E-1
#define CEPHES_EXP_P5 5.0000001201E-1
#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val
#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float)
// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu)
class
VXXJitCode
:
public
JitCode
{
public:
...
...
@@ -127,21 +174,140 @@ class VActJitCode : public JitCode {
void
generate
()
override
;
protected:
// compute relu with ymm
void
relu_ymm
(
const
Xbyak
::
Ymm
&
dst
,
const
Xbyak
::
Ymm
&
src
,
const
Xbyak
::
Ymm
&
zero
);
// compute relu with ymm, xmm
template
<
typename
JMM
>
void
relu_jmm
(
JMM
&
dst
,
JMM
&
src
,
JMM
&
zero
)
{
// NOLINT
vmaxps
(
dst
,
src
,
zero
);
}
// compute exp with ymm
void
exp_ymm
(
const
Xbyak
::
Ymm
&
dst
,
const
Xbyak
::
Ymm
&
src
,
int
fx_idx
=
2
,
int
fy_idx
=
3
,
int
mask_idx
=
4
,
int
tmp_idx
=
5
);
// compute exp with ymm, xmm
template
<
typename
JMM
>
void
exp_jmm
(
JMM
&
dst
,
JMM
&
src
,
int
fx_idx
=
2
,
int
fy_idx
=
3
,
// NOLINT
int
mask_idx
=
4
,
int
tmp_idx
=
5
)
{
using
namespace
platform
::
jit
;
// NOLINT
assert
(
src
.
getIdx
()
!=
dst
.
getIdx
());
// TODO(TJ): use enfore
// check all idx can not equal
JMM
jmm_fx
=
JMM
(
fx_idx
);
JMM
jmm_fy
=
JMM
(
fy_idx
);
JMM
jmm_mask
=
JMM
(
mask_idx
);
JMM
jmm_tmp
=
JMM
(
tmp_idx
);
reg64_t
reg_ptr_global
=
rax
;
push
(
reg_ptr_global
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_HIG
]);
vminps
(
src
,
src
,
jmm_tmp
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_LOW
]);
vmaxps
(
src
,
src
,
jmm_tmp
);
// express exp(x) as exp(g + n*log(2))
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_LOG2EF
]);
vmulps
(
jmm_fx
,
src
,
jmm_tmp
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_0P5
]);
vaddps
(
jmm_fx
,
jmm_fx
,
jmm_tmp
);
vroundps
(
jmm_fy
,
jmm_fx
,
0x01
);
// if greater, substract 1
vcmpgtps
(
jmm_mask
,
jmm_fy
,
jmm_fx
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
]);
vandps
(
jmm_mask
,
jmm_mask
,
jmm_tmp
);
vsubps
(
jmm_fx
,
jmm_fy
,
jmm_mask
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_C1
]);
vmulps
(
jmm_fy
,
jmm_fx
,
jmm_tmp
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_C2
]);
JMM
ymm_z
=
JMM
(
jmm_mask
.
getIdx
());
vmulps
(
ymm_z
,
jmm_fx
,
jmm_tmp
);
vsubps
(
src
,
src
,
jmm_fy
);
vsubps
(
src
,
src
,
ymm_z
);
vmulps
(
ymm_z
,
src
,
src
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_P0
]);
vmulps
(
dst
,
src
,
jmm_tmp
);
for
(
size_t
i
=
OFFSET_EXP_P1
;
i
<
OFFSET_EXP_P5
;
i
+=
(
YMM_FLOAT_BLOCK
*
sizeof
(
float
)))
{
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
i
]);
// P1~P4
vaddps
(
dst
,
dst
,
jmm_tmp
);
vmulps
(
dst
,
dst
,
src
);
}
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_P5
]);
vaddps
(
dst
,
dst
,
jmm_tmp
);
vmulps
(
dst
,
dst
,
ymm_z
);
vaddps
(
dst
,
dst
,
src
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
]);
vaddps
(
dst
,
dst
,
jmm_tmp
);
// build 2^n
JMM
ymm_int
=
jmm_fx
;
vcvttps2dq
(
ymm_int
,
jmm_fx
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_int_0x7f
));
vmovdqa
(
jmm_tmp
,
ptr
[
reg_ptr_global
]);
if
(
MayIUse
(
avx2
)
||
std
::
is_same
<
JMM
,
xmm_t
>::
value
)
{
vpaddd
(
ymm_int
,
ymm_int
,
jmm_tmp
);
vpslld
(
ymm_int
,
ymm_int
,
23
);
}
else
if
(
MayIUse
(
avx
))
{
xmm_t
xtmp1
=
xmm_t
(
ymm_int
.
getIdx
());
xmm_t
xtmp2
=
xmm_t
(
jmm_tmp
.
getIdx
());
reg64_t
reg_ptr_tmp
=
reg_ptr_global
;
mov
(
reg_ptr_tmp
,
reinterpret_cast
<
size_t
>
(
g_tmp_mem
));
vmovdqa
(
ptr
[
reg_ptr_tmp
],
ymm_int
);
vmovdqa
(
ptr
[
reg_ptr_tmp
+
YMM_FLOAT_BLOCK
*
sizeof
(
float
)],
jmm_tmp
);
vpaddd
(
xtmp1
,
xtmp1
,
xtmp2
);
vpslld
(
xtmp1
,
xtmp1
,
23
);
vmovdqa
(
ptr
[
reg_ptr_tmp
],
xtmp1
);
// next 128bits
vmovdqa
(
xtmp1
,
ptr
[
reg_ptr_tmp
+
XMM_FLOAT_BLOCK
*
sizeof
(
float
)]);
vmovdqa
(
xtmp2
,
ptr
[
reg_ptr_tmp
+
(
YMM_FLOAT_BLOCK
+
XMM_FLOAT_BLOCK
)
*
sizeof
(
float
)]);
vpaddd
(
xtmp1
,
xtmp1
,
xtmp2
);
vpslld
(
xtmp1
,
xtmp1
,
23
);
vmovdqa
(
ptr
[
reg_ptr_tmp
+
XMM_FLOAT_BLOCK
*
sizeof
(
float
)],
xtmp1
);
// load out
vmovdqa
(
ymm_int
,
ptr
[
reg_ptr_tmp
]);
}
vmulps
(
dst
,
dst
,
ymm_int
);
pop
(
reg_ptr_global
);
}
// compute sigmoid with ymm
void
sigmoid_ymm
(
const
Xbyak
::
Ymm
&
dst
,
const
Xbyak
::
Ymm
&
src
,
int
fx_idx
=
2
,
int
fy_idx
=
3
,
int
mask_idx
=
4
,
int
tmp_idx
=
5
);
// compute sigmoid with ymm, xmm
template
<
typename
JMM
>
void
sigmoid_jmm
(
JMM
&
dst
,
JMM
&
src
,
int
fx_idx
=
2
,
// NOLINT
int
fy_idx
=
3
,
int
mask_idx
=
4
,
int
tmp_idx
=
5
)
{
// y = 1 / (1 + e^-x)
JMM
jmm_tmp
=
JMM
(
tmp_idx
);
reg64_t
reg_ptr_global
=
rax
;
push
(
reg_ptr_global
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_SIGMOID_MAX
]);
vminps
(
src
,
src
,
jmm_tmp
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_SIGMOID_MIN
]);
vmaxps
(
src
,
src
,
jmm_tmp
);
vxorps
(
jmm_tmp
,
jmm_tmp
,
jmm_tmp
);
vsubps
(
src
,
jmm_tmp
,
src
);
exp_jmm
<
JMM
>
(
dst
,
src
,
fx_idx
,
fy_idx
,
mask_idx
,
tmp_idx
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_ONE
]);
vaddps
(
dst
,
dst
,
jmm_tmp
);
vdivps
(
dst
,
jmm_tmp
,
dst
);
pop
(
reg_ptr_global
);
}
// compute tanh with ymm
void
tanh_ymm
(
const
Xbyak
::
Ymm
&
dst
,
const
Xbyak
::
Ymm
&
src
,
int
fx_idx
=
2
,
int
fy_idx
=
3
,
int
mask_idx
=
4
,
int
tmp_idx
=
5
);
// compute tanh with ymm, xmm
template
<
typename
JMM
>
void
tanh_jmm
(
JMM
&
dst
,
JMM
&
src
,
int
fx_idx
=
2
,
int
fy_idx
=
3
,
// NOLINT
int
mask_idx
=
4
,
int
tmp_idx
=
5
)
{
// y = 2 / (1 + e^(-2x)) - 1
JMM
jmm_tmp
=
JMM
(
tmp_idx
);
JMM
jmm_zero
=
JMM
(
mask_idx
);
reg64_t
reg_ptr_global
=
rax
;
push
(
reg_ptr_global
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_TWO
]);
vxorps
(
jmm_zero
,
jmm_zero
,
jmm_zero
);
vsubps
(
jmm_tmp
,
jmm_zero
,
jmm_tmp
);
vmulps
(
src
,
src
,
jmm_tmp
);
exp_jmm
<
JMM
>
(
dst
,
src
,
fx_idx
,
fy_idx
,
mask_idx
,
tmp_idx
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_ONE
]);
vaddps
(
dst
,
dst
,
jmm_tmp
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_TWO
]);
vdivps
(
dst
,
jmm_tmp
,
dst
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_ONE
]);
vsubps
(
dst
,
dst
,
jmm_tmp
);
pop
(
reg_ptr_global
);
}
protected:
int
num_
;
...
...
paddle/fluid/operators/math/jit_kernel.h
浏览文件 @
25adf970
...
...
@@ -26,6 +26,7 @@ namespace operators {
namespace
math
{
namespace
jitkernel
{
// TODO(TJ): move these to some proper place
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
...
...
paddle/fluid/operators/math/jit_kernel_test.cc
浏览文件 @
25adf970
...
...
@@ -33,6 +33,9 @@ limitations under the License. */
constexpr
int
repeat
=
20000
;
// TODO(TJ): benchmark and test should be seperated,
// benchmark should verify more sizes
inline
double
GetCurrentUS
()
{
struct
timeval
time
;
gettimeofday
(
&
time
,
NULL
);
...
...
@@ -66,7 +69,7 @@ void vrelu_intri8(const int n, const float* x, float* y) {
TEST
(
JitKernel
,
vrelu
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
for
(
int
d
:
{
3
,
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
10.
f
,
1.
f
);
...
...
@@ -156,7 +159,7 @@ void vexp_mkl(const int n, const float* x, float* y) {
TEST
(
JitKernel
,
vexp
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
128
,
256
})
{
for
(
int
d
:
{
1
,
3
,
4
,
6
,
7
,
8
,
12
,
15
,
16
,
20
,
30
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
2.
f
,
2.
f
);
...
...
@@ -231,7 +234,7 @@ void vsigmoid_better(
TEST
(
JitKernel
,
vsigmoid
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
,
128
,
256
})
{
for
(
int
d
:
{
1
,
3
,
4
,
6
,
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
2.
f
,
2.
f
);
...
...
@@ -295,7 +298,7 @@ void vtanh_better(
TEST
(
JitKernel
,
vtanh
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
,
128
,
256
})
{
for
(
int
d
:
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
2.
f
,
2.
f
);
...
...
@@ -386,7 +389,7 @@ void lstm_ctht_better(
TEST
(
JitKernel
,
lstm
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
})
{
for
(
int
d
:
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
})
{
int
d4
=
d
*
4
;
int
d3
=
d
*
3
;
std
::
vector
<
float
>
x
(
d4
),
xref
(
d4
);
...
...
@@ -759,7 +762,7 @@ TEST(JitKernel, vaddrelu) {
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vadd_ref
(
d
,
x_data
,
y_data
,
zref_data
);
vadd
relu
_ref
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
auto
tmkls
=
GetCurrentUS
();
...
...
paddle/fluid/operators/math/softmax.h
浏览文件 @
25adf970
...
...
@@ -19,7 +19,8 @@ namespace paddle {
namespace
operators
{
namespace
math
{
template
<
typename
DeviceContext
,
typename
T
,
bool
is_test
>
template
<
typename
DeviceContext
,
typename
T
,
bool
is_test
,
typename
Enable
=
void
>
class
SoftmaxFunctor
{
public:
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
*
X
,
...
...
paddle/fluid/operators/math/softmax_impl.h
浏览文件 @
25adf970
...
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/blas.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
...
...
@@ -32,8 +33,8 @@ struct ValueClip {
}
};
template
<
typename
DeviceContext
,
typename
T
,
bool
is_test
>
void
SoftmaxFunctor
<
DeviceContext
,
T
,
is_test
>::
operator
()(
template
<
typename
DeviceContext
,
typename
T
,
bool
is_test
,
typename
Enable
>
void
SoftmaxFunctor
<
DeviceContext
,
T
,
is_test
,
Enable
>::
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
*
X
,
framework
::
Tensor
*
Y
)
{
auto
logits
=
EigenMatrix
<
T
>::
From
(
*
X
);
...
...
@@ -65,36 +66,46 @@ void SoftmaxFunctor<DeviceContext, T, is_test>::operator()(
.
broadcast
(
one_by_class
));
}
template
<
typename
DeviceContext
,
typename
T
>
class
SoftmaxFunctor
<
DeviceContext
,
T
,
true
>
{
template
<
class
DeviceContext
>
using
enable_if_CPU
=
typename
std
::
enable_if
<
std
::
is_same
<
DeviceContext
,
platform
::
CPUDeviceContext
>::
value
>::
type
;
template
<
typename
DeviceContext
>
class
SoftmaxFunctor
<
DeviceContext
,
float
,
true
,
enable_if_CPU
<
DeviceContext
>>
{
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
*
X
,
framework
::
Tensor
*
Y
)
{
auto
logits
=
EigenMatrix
<
T
>::
From
(
*
X
);
auto
softmax
=
EigenMatrix
<
T
>::
From
(
*
Y
);
auto
in_dims
=
X
->
dims
();
auto
out_dims
=
Y
->
dims
();
const
float
*
in_data
=
X
->
data
<
float
>
();
float
*
out_data
=
Y
->
data
<
float
>
();
const
int
kBatchDim
=
0
;
const
int
kClassDim
=
1
;
const
int
batch_size
=
logits
.
dimension
(
kBatchDim
);
const
int
num_classes
=
logits
.
dimension
(
kClassDim
);
Eigen
::
DSizes
<
int
,
1
>
along_class
(
kClassDim
);
Eigen
::
DSizes
<
int
,
2
>
batch_by_one
(
batch_size
,
1
);
Eigen
::
DSizes
<
int
,
2
>
one_by_class
(
1
,
num_classes
);
auto
shifted_logits
=
(
logits
-
logits
.
maximum
(
along_class
)
.
eval
()
.
reshape
(
batch_by_one
)
.
broadcast
(
one_by_class
));
softmax
.
device
(
*
context
.
eigen_device
())
=
shifted_logits
.
exp
();
softmax
.
device
(
*
context
.
eigen_device
())
=
(
softmax
*
softmax
.
sum
(
along_class
)
.
inverse
()
.
eval
()
.
reshape
(
batch_by_one
)
.
broadcast
(
one_by_class
));
// 2D data. Batch x C
const
int
batch_size
=
in_dims
[
kBatchDim
];
const
int
num_classes
=
in_dims
[
kClassDim
];
std
::
vector
<
float
>
entities
(
batch_size
);
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
float
>
(
context
);
for
(
int
n
=
0
;
n
<
batch_size
;
++
n
)
{
entities
[
n
]
=
in_data
[
n
*
num_classes
];
for
(
int
c
=
1
;
c
<
num_classes
;
++
c
)
{
entities
[
n
]
=
in_data
[
n
*
num_classes
+
c
]
>
entities
[
n
]
?
in_data
[
n
*
num_classes
+
c
]
:
entities
[
n
];
}
for
(
int
c
=
0
;
c
<
num_classes
;
++
c
)
{
out_data
[
n
*
num_classes
+
c
]
=
in_data
[
n
*
num_classes
+
c
]
-
entities
[
n
];
}
}
blas
.
VEXP
(
num_classes
*
batch_size
,
out_data
,
out_data
);
for
(
int
n
=
0
;
n
<
batch_size
;
++
n
)
{
entities
[
n
]
=
out_data
[
n
*
num_classes
];
for
(
int
c
=
1
;
c
<
num_classes
;
++
c
)
{
entities
[
n
]
+=
out_data
[
n
*
num_classes
+
c
];
}
blas
.
SCAL
(
num_classes
,
1.0
f
/
entities
[
n
],
&
out_data
[
n
*
num_classes
]);
}
}
};
...
...
paddle/fluid/operators/softmax_op.h
浏览文件 @
25adf970
...
...
@@ -35,8 +35,10 @@ class SoftmaxKernel : public framework::OpKernel<T> {
Tensor
X_2d
=
framework
::
ReshapeToMatrix
(
*
X
,
rank
-
1
);
Tensor
Out_2d
=
framework
::
ReshapeToMatrix
(
*
Out
,
rank
-
1
);
#ifdef ON_INFER
math
::
SoftmaxFunctor
<
DeviceContext
,
T
,
true
>
()(
#ifdef PADDLE_ON_INFERENCE
math
::
SoftmaxFunctor
<
DeviceContext
,
T
,
std
::
is_same
<
DeviceContext
,
platform
::
CPUDeviceContext
>::
value
>
()(
context
.
template
device_context
<
DeviceContext
>(),
&
X_2d
,
&
Out_2d
);
#else
math
::
SoftmaxFunctor
<
DeviceContext
,
T
,
false
>
()(
...
...
paddle/fluid/operators/stack_op.h
浏览文件 @
25adf970
...
...
@@ -147,20 +147,32 @@ class StackKernel : public framework::OpKernel<T> {
auto
&
dim
=
x
[
0
]
->
dims
();
for
(
auto
i
=
0
;
i
<
axis
;
++
i
)
pre
*=
dim
[
i
];
for
(
auto
i
=
axis
;
i
<
dim
.
size
();
++
i
)
post
*=
dim
[
i
];
int
total_num
=
pre
*
n
*
post
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
#ifdef __NVCC__
int
total_num
=
pre
*
n
*
post
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
thrust
::
device_vector
<
const
T
*>
device_x_vec
(
x_datas
);
auto
x_data_arr
=
device_x_vec
.
data
().
get
();
#else
auto
x_data_arr
=
x_datas
.
data
();
#endif
StackFunctorForRange
(
dev_ctx
,
x_data_arr
,
y_data
,
total_num
,
n
,
post
);
#ifdef __NVCC__
// Wait() must be called because device_x_vec may be destructed before
// kernel ends
dev_ctx
.
Wait
();
#else
auto
x_data_arr
=
x_datas
.
data
();
size_t
x_offset
=
0
;
size_t
y_offset
=
0
;
for
(
int
i
=
0
;
i
<
pre
;
i
++
)
{
for
(
int
j
=
0
;
j
<
n
;
j
++
)
{
std
::
memcpy
(
y_data
+
y_offset
,
x_data_arr
[
j
]
+
x_offset
,
post
*
sizeof
(
T
));
y_offset
+=
post
;
}
x_offset
+=
post
;
}
#endif
}
};
...
...
python/paddle/fluid/__init__.py
浏览文件 @
25adf970
...
...
@@ -117,7 +117,8 @@ def __bootstrap__():
'use_mkldnn'
,
'use_ngraph'
,
'initial_cpu_memory_in_mb'
,
'init_allocated_mem'
,
'free_idle_memory'
,
'paddle_num_threads'
,
"dist_threadpool_size"
,
'eager_delete_tensor_gb'
,
'allocator_strategy'
,
'reader_queue_speed_test_mode'
'allocator_strategy'
,
'reader_queue_speed_test_mode'
,
'print_sub_graph_dir'
]
if
os
.
name
!=
'nt'
:
read_env_flags
.
append
(
'warpctc_dir'
)
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
25adf970
...
...
@@ -5776,7 +5776,7 @@ def image_resize(input,
Examples:
.. code-block:: python
out = fluid.layers.image_resize(input, out_shape=[12, 12])
out = fluid.layers.image_resize(input, out_shape=[12, 12]
, resample="NEAREST"
)
"""
resample_methods
=
{
'BILINEAR'
:
'bilinear'
,
...
...
@@ -5879,6 +5879,11 @@ def resize_bilinear(input,
Returns:
${out_comment}.
Examples:
.. code-block:: python
out = fluid.layers.resize_bilinear(input, out_shape=[12, 12])
"""
return
image_resize
(
input
,
out_shape
,
scale
,
name
,
'BILINEAR'
,
actual_shape
)
...
...
@@ -5925,6 +5930,11 @@ def resize_nearest(input,
Returns:
${out_comment}.
Examples:
.. code-block:: python
out = fluid.layers.resize_nearest(input, out_shape=[12, 12])
"""
return
image_resize
(
input
,
out_shape
,
scale
,
name
,
'NEAREST'
,
actual_shape
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录