Paddle commit e52d90a3 (unverified)

Merge pull request #14527 from hjchen2/develop

Refine split TensorRT plugin

Authored by Zhaolong Xing on Nov 23, 2018; committed via GitHub on Nov 23, 2018.
Parents: 45312813, 1adda8e0

Showing 4 changed files with 211 additions and 56 deletions (+211, -56).
- paddle/fluid/inference/tensorrt/convert/split_op.cc (+2, -10)
- paddle/fluid/inference/tensorrt/convert/test_split_op.cc (+75, -13)
- paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu (+127, -31)
- paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h (+7, -2)
paddle/fluid/inference/tensorrt/convert/split_op.cc

```diff
@@ -19,9 +19,6 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-/*
- * SplitOp.
- */
 class SplitOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
@@ -40,16 +37,11 @@ class SplitOpConverter : public OpConverter {
     int axis = boost::get<int>(op_desc.GetAttr("axis"));
     std::vector<int> output_lengths =
         boost::get<std::vector<int>>(op_desc.GetAttr("sections"));
     // split on batch is not supported in TensorRT
     PADDLE_ENFORCE(axis != 0);
-    if (axis < 0) {
-      axis += input_dims.nbDims;
-    } else {
-      axis -= 1;
-    }
+    axis += (axis < 0) ? input_dims.nbDims : -1;
 
     PADDLE_ENFORCE(output_lengths.size() == output_num);
 
-    //
     plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths);
     nvinfer1::IPluginLayer* layer =
         engine_->AddPlugin(&input, input_num, plugin);
```
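The removed if/else and the new one-liner compute the same remapping: a Fluid split axis counts the batch dimension, while TensorRT dims exclude it. A minimal standalone sketch of that behavior (RemapAxis is a hypothetical name, not part of the commit):

```cpp
// Sketch only: RemapAxis mirrors the converter's
// `axis += (axis < 0) ? input_dims.nbDims : -1;` line.
#include <cassert>

int RemapAxis(int axis, int trt_nb_dims) {
  // Negative axes wrap within the TensorRT dims; positive axes shift down by
  // one because TensorRT shapes exclude the batch dimension.
  return axis + ((axis < 0) ? trt_nb_dims : -1);
}

int main() {
  assert(RemapAxis(1, 3) == 0);   // NCHW axis 1 (C) -> CHW axis 0
  assert(RemapAxis(3, 3) == 2);   // NCHW axis 3 (W) -> CHW axis 2
  assert(RemapAxis(-1, 3) == 2);  // -1 wraps to the last CHW axis
  return 0;
}
```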
paddle/fluid/inference/tensorrt/convert/test_split_op.cc

```diff
@@ -20,30 +20,92 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-TEST(split_op, test) {
+template <int BatchSize, int Axis>
+void TensorRTSplitTest(const std::vector<int> &in_shape,
+                       const std::vector<int> &sections) {
   std::unordered_set<std::string> parameters({""});
   framework::Scope scope;
-  TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2));
-  validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2));
-  validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2));
+  TRTConvertValidation validator(BatchSize + 1, parameters, scope, 10000);
+  auto make_dim = [](const std::vector<int> &shape) {
+    nvinfer1::DimsCHW dim;
+    dim.c() = shape[0];
+    dim.h() = shape[1];
+    dim.w() = shape[2];
+    return dim;
+  };
+  validator.DeclInputVar("split_input", make_dim(in_shape));
+  std::vector<std::string> output_vars;
+  for (size_t i = 0; i < sections.size(); ++i) {
+    auto out_shape = in_shape;
+    out_shape[Axis - 1] = sections[i];
+    std::string output_name = "split_out" + std::to_string(i);
+    validator.DeclOutputVar(output_name, make_dim(out_shape));
+    output_vars.push_back(output_name);
+  }
 
   // Prepare Op description
   framework::OpDesc desc;
   desc.SetType("split");
   desc.SetInput("X", {"split_input"});
-  desc.SetOutput("Out", {"split_out1", "split_out2"});
+  desc.SetOutput("Out", output_vars);
 
-  int num = 0;
-  int axis = 1;
-  std::vector<int> output_lengths = {2, 1};
-  desc.SetAttr("axis", axis);
-  desc.SetAttr("num", num);
-  desc.SetAttr("sections", output_lengths);
+  desc.SetAttr("axis", Axis);
+  desc.SetAttr("num", 0);
+  desc.SetAttr("sections", sections);
 
   validator.SetOp(*desc.Proto());
 
-  validator.Execute(1);
+  validator.Execute(BatchSize);
+}
+
+// batch = 0, axis = 1, same shape
+TEST(split_op, test_same_shape_axis1_batch1) {
+  TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2});
+}
+// batch = 0, axis = 1, different shape
+TEST(split_op, test_different_shape_axis1_batch1) {
+  TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1});
+}
+// batch = 10, axis = 1, same shape
+TEST(split_op, test_same_shape_axis1_batch10) {
+  TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2});
+}
+// batch = 10, axis = 1, different shape
+TEST(split_op, test_different_shape_axis1_batch10) {
+  TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1});
+}
+// batch = 0, axis = 2, same shape
+TEST(split_op, test_same_shape_axis2_batch1) {
+  TensorRTSplitTest<1, 2>({3, 4, 2}, {2, 2});
+}
+// batch = 0, axis = 2, different shape
+TEST(split_op, test_different_shape_axis2_batch1) {
+  TensorRTSplitTest<1, 2>({3, 3, 2}, {2, 1});
+}
+// batch = 10, axis = 2, same shape
+TEST(split_op, test_same_shape_axis2_batch10) {
+  TensorRTSplitTest<10, 2>({3, 4, 2}, {2, 2});
+}
+// batch = 10, axis = 2, different shape
+TEST(split_op, test_different_shape_axis2_batch10) {
+  TensorRTSplitTest<10, 2>({3, 3, 2}, {2, 1});
+}
+// batch = 0, axis = 3, same shape
+TEST(split_op, test_same_shape_axis3_batch1) {
+  TensorRTSplitTest<1, 3>({3, 2, 4}, {2, 2});
+}
+// batch = 0, axis = 3, different shape
+TEST(split_op, test_different_shape_axis3_batch1) {
+  TensorRTSplitTest<1, 3>({3, 2, 3}, {2, 1});
+}
+// batch = 10, axis = 3, same shape
+TEST(split_op, test_same_shape_axis3_batch10) {
+  TensorRTSplitTest<10, 3>({3, 2, 4}, {2, 2});
+}
+// batch = 10, axis = 3, different shape
+TEST(split_op, test_different_shape_axis3_batch10) {
+  TensorRTSplitTest<10, 3>({3, 2, 3}, {2, 1});
+}
 
 }  // namespace tensorrt
```
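For a concrete reading of the parameterization, here is a small standalone sketch (illustrative values only, not part of the test file) that derives the output shapes the template declares for TensorRTSplitTest<10, 2>({3, 4, 2}, {2, 2}):

```cpp
#include <cstdio>
#include <vector>

int main() {
  const int Axis = 2;                           // Fluid split axis (batch is axis 0)
  const std::vector<int> in_shape = {3, 4, 2};  // input CHW shape
  const std::vector<int> sections = {2, 2};     // lengths along the split axis
  for (size_t i = 0; i < sections.size(); ++i) {
    std::vector<int> out_shape = in_shape;
    out_shape[Axis - 1] = sections[i];          // Axis - 1 indexes into CHW
    std::printf("split_out%zu: (%d, %d, %d)\n", i, out_shape[0], out_shape[1],
                out_shape[2]);
  }
  // Prints split_out0: (3, 2, 2) and split_out1: (3, 2, 2), matching the
  // DimsCHW values the test declares via DeclOutputVar.
  return 0;
}
```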
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu

```diff
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <cuda_fp16.h>
+#include <algorithm>
 #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
 
 namespace paddle {
```
```diff
@@ -19,6 +21,52 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {
 
+// copied from operators::math::SplitFunctor
+template <typename T>
+__global__ void SplitKernel(const T* input_data, const int in_row,
+                            const int in_col, const int* out_cols,
+                            int out_cols_size, T** outputs_data) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  int curr_segment = 0;
+  int curr_offset = out_cols[0];
+  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
+    int curr_col_offset = out_cols[curr_segment + 1];
+    while (curr_col_offset <= tid_x) {
+      curr_offset = curr_col_offset;
+      ++curr_segment;
+      curr_col_offset = out_cols[curr_segment + 1];
+    }
+    int local_col = tid_x - curr_offset;
+    int segment_width = curr_col_offset - curr_offset;
+    T* output_ptr = outputs_data[curr_segment];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * segment_width + local_col] =
+            input_data[tid_y * in_col + tid_x];
+    }
+  }
+}
+
+template <typename T>
+__global__ void SplitKernel(const T* input_data, const int in_row,
+                            const int in_col, const int fixed_out_col,
+                            T** outputs_data) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
+    int split = tid_x / fixed_out_col;
+    int in_offset = tid_x - split * fixed_out_col;
+    T* output_ptr = outputs_data[split];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * fixed_out_col + in_offset] =
+            input_data[tid_y * in_col + tid_x];
+    }
+  }
+}
+
 nvinfer1::Dims SplitPlugin::getOutputDimensions(
     int index, const nvinfer1::Dims* input_dims, int num_inputs) {
   PADDLE_ENFORCE_EQ(num_inputs, 1);
```
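Both kernels view the input as an in_row x in_col matrix and hand each output a contiguous band of columns; the first handles variable section widths, the second the equal-width fast case. A CPU reference sketch of the variable-width case (not part of the commit; names and the tiny example are assumptions):

```cpp
#include <vector>

// Host-side mirror of the first SplitKernel: out_cols holds n+1 column
// offsets (0, c0, c0+c1, ...) and output `seg` receives columns
// [out_cols[seg], out_cols[seg+1]) of every row.
template <typename T>
void SplitReference(const T* input, int in_row, int in_col,
                    const std::vector<int>& out_cols,
                    std::vector<std::vector<T>>* outputs) {
  for (size_t seg = 0; seg + 1 < out_cols.size(); ++seg) {
    const int width = out_cols[seg + 1] - out_cols[seg];
    (*outputs)[seg].assign(static_cast<size_t>(in_row) * width, T());
    for (int r = 0; r < in_row; ++r)
      for (int c = 0; c < width; ++c)
        (*outputs)[seg][r * width + c] =
            input[r * in_col + out_cols[seg] + c];
  }
}

int main() {
  // A 2 x 5 matrix split into column bands of widths {3, 2}.
  std::vector<float> in = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
  std::vector<std::vector<float>> outs(2);
  SplitReference(in.data(), 2, 5, {0, 3, 5}, &outs);
  // outs[0] == {0,1,2,5,6,7}; outs[1] == {3,4,8,9}.
  return 0;
}
```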
```diff
@@ -31,48 +79,96 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions(
 int SplitPlugin::initialize() {
+  PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS);
+  // notice input dims is [C, H, W]
+  nvinfer1::Dims dims = this->getInputDims(0);
+  outer_rows_ = 1;
+  inner_cols_ = 1;
+  for (int i = 0; i < axis_; ++i) {
+    outer_rows_ *= dims.d[i];
+  }
+  for (int i = axis_ + 1; i < dims.nbDims; ++i) {
+    inner_cols_ *= dims.d[i];
+  }
+  same_shape_ = true;
   std::vector<int> segment_offsets(1, 0);
   for (int i = 0; i < this->getNbOutputs(); ++i) {
-    segment_offsets.push_back(segment_offsets.back() + output_length_[i]);
+    if (output_length_[i] != output_length_[0]) {
+      same_shape_ = false;
+    }
+    segment_offsets.push_back(segment_offsets.back() +
+                              output_length_[i] * inner_cols_);
   }
-  segment_offsets_ = segment_offsets;
-  nvinfer1::Dims dims = this->getInputDims(0);
-  nx_ = 1;
-  for (int i = dims.nbDims - 1; i > axis_; --i) {
-    nx_ *= dims.d[i];
-  }
-  ny_ = dims.d[axis_];
-  nz_ = 1;
-  for (int i = axis_ - 1; i >= 0; --i) {
-    nz_ *= dims.d[i];
-  }
+  inner_cols_ *= dims.d[axis_];
+  d_segment_offsets_ = segment_offsets;
+  segment_offsets_ = std::move(segment_offsets);
+  d_output_ptrs_.resize(this->getNbOutputs(), nullptr);
   return 0;
 }
 
+template <typename T>
+inline void Split(cudaStream_t stream, const bool same_shape,
+                  const int outer_rows, const int inner_cols,
+                  const std::vector<int>& segment_offsets,
+                  const int* d_segment_offsets, const T* input, T** outputs) {
+  const int kThreadsPerBlock = 1024;
+  const int kMaxBlocks = 65535;
+  int block_cols = kThreadsPerBlock;
+  if (inner_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
+    block_cols = ((inner_cols + 31) >> 5) << 5;
+  }
+  int block_rows = kThreadsPerBlock / block_cols;
+  dim3 block_size = dim3(block_cols, block_rows, 1);
+
+  int grid_cols =
+      std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks);
+  int grid_rows =
+      std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1));
+  dim3 grid_size = dim3(grid_cols, grid_rows, 1);
+
+  if (same_shape) {
+    SplitKernel<<<grid_size, block_size, 0, stream>>>(
+        input, outer_rows, inner_cols, segment_offsets[1], outputs);
+  } else {
+    SplitKernel<<<grid_size, block_size, 0, stream>>>(
+        input, outer_rows, inner_cols, d_segment_offsets,
+        static_cast<int>(segment_offsets.size()), outputs);
+  }
+}
+
 int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
                          void** outputs, void* workspace, cudaStream_t stream) {
-  auto const& input_dims = this->getInputDims(0);
-  int input_size = 0;
-  float const* idata = reinterpret_cast<float const*>(inputs[0]);
-  float** odatas = reinterpret_cast<float**>(outputs);
-
-  // kernel impl here.
-  int inputBatchOffset = nx_ * ny_ * nz_;
-  for (size_t i = 0; i < this->getNbOutputs(); i++) {
-    for (size_t j = 0; j < batchSize; j++) {
-      cudaMemcpyAsync(
-          odatas[i] +
-              j * (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ *
-                  sizeof(float),
-          inputs[0] +
-              (inputBatchOffset * j + segment_offsets_[i] * nx_) *
-                  sizeof(float),
-          (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * sizeof(float),
-          cudaMemcpyDeviceToDevice, stream);
-    }
-  }
-
+  float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
+  if (((batchSize == 1 && axis_ == 0) || axis_ == -1) &&
+      this->getNbOutputs() < 10) {
+    float** output_ptrs = reinterpret_cast<float**>(outputs);
+    int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT)
+                             ? sizeof(float)
+                             : sizeof(__half);
+    for (int i = 0; i < this->getNbOutputs(); ++i) {
+      PADDLE_ENFORCE(
+          cudaMemcpyAsync(
+              output_ptrs[i], input_ptr + segment_offsets_[i],
+              (segment_offsets_[i + 1] - segment_offsets_[i]) * data_type_size,
+              cudaMemcpyDeviceToDevice, stream) == cudaSuccess);
+    }
+  } else {
+    outer_rows_ *= batchSize;
+    const int* d_segment_offsets_ptr =
+        thrust::raw_pointer_cast(&d_segment_offsets_[0]);
+    float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
+    PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, outputs,
+                                   this->getNbOutputs() * sizeof(float*),
+                                   cudaMemcpyHostToDevice,
+                                   stream) == cudaSuccess);
+    if (this->getDataType() == nvinfer1::DataType::kFLOAT) {
+      Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_,
+            d_segment_offsets_ptr, input_ptr, output_ptrs);
+    } else {
+      Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_,
+            d_segment_offsets_ptr, (__half*)input_ptr,  // NOLINT
+            (__half**)output_ptrs);                     // NOLINT
+    }
+  }
   return cudaGetLastError() != cudaSuccess;
 }
```
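When the split is contiguous in memory, enqueue stays on the cudaMemcpyAsync fast path; otherwise it launches SplitKernel with the geometry computed in the new Split helper. That geometry is pure integer arithmetic and can be checked on the host; a standalone sketch of the same computation (the sizes are assumed example values, not from the commit):

```cpp
#include <algorithm>
#include <cstdio>

int main() {
  const int kThreadsPerBlock = 1024;
  const int kMaxBlocks = 65535;
  const int inner_cols = 48, outer_rows = 30;  // assumed example sizes

  // Round the x-dimension up to a multiple of 32 (one warp) when the column
  // count is small, then give the leftover threads to the y-dimension.
  int block_cols = kThreadsPerBlock;
  if (inner_cols < kThreadsPerBlock) {
    block_cols = ((inner_cols + 31) >> 5) << 5;
  }
  int block_rows = kThreadsPerBlock / block_cols;

  int grid_cols =
      std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks);
  int grid_rows =
      std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1));

  std::printf("block = (%d, %d), grid = (%d, %d)\n", block_cols, block_rows,
              grid_cols, grid_rows);  // block = (64, 16), grid = (1, 1)
  return 0;
}
```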
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h

```diff
@@ -14,6 +14,7 @@
 #pragma once
+#include <thrust/device_vector.h>
 #include <vector>
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
@@ -25,7 +26,7 @@ namespace plugin {
 class SplitPlugin : public PluginTensorRT {
  public:
   SplitPlugin(int axis, std::vector<int> const& output_lengths)
-      : axis_(axis), output_length_(output_lengths) {}
+      : axis_(axis), same_shape_(true), output_length_(output_lengths) {}
 
   SplitPlugin(void const* serial_data, size_t serial_length) {
     deserializeBase(serial_data, serial_length);
@@ -60,9 +61,13 @@ class SplitPlugin : public PluginTensorRT {
   }
 
   int axis_;
+  int outer_rows_;
+  int inner_cols_;
+  bool same_shape_;
   std::vector<int> output_length_;
-  int nx_, ny_, nz_;
   std::vector<int> segment_offsets_;
+  thrust::device_vector<int> d_segment_offsets_;
+  thrust::device_vector<float*> d_output_ptrs_;
 };
 
 }  // namespace plugin
```
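The two new thrust::device_vector members exist because a __global__ kernel cannot dereference host memory: both the segment offsets and the table of output pointers must be device-resident before SplitKernel can index them. A minimal sketch (assumed buffer sizes, error handling omitted; compile with nvcc) of staging such a pointer table the way enqueue fills d_output_ptrs_:

```cpp
#include <thrust/device_vector.h>
#include <cstdio>

int main() {
  // Two hypothetical output buffers, as TensorRT would hand to enqueue().
  float *out0 = nullptr, *out1 = nullptr;
  cudaMalloc(&out0, 16 * sizeof(float));
  cudaMalloc(&out1, 16 * sizeof(float));
  void* outputs[2] = {out0, out1};

  // Device-resident pointer table; the plugin copies into it asynchronously.
  thrust::device_vector<float*> d_output_ptrs(2, nullptr);
  float** table = thrust::raw_pointer_cast(&d_output_ptrs[0]);
  cudaMemcpy(table, outputs, 2 * sizeof(float*), cudaMemcpyHostToDevice);

  std::printf("staged %d output pointers on the device\n",
              static_cast<int>(d_output_ptrs.size()));
  cudaFree(out0);
  cudaFree(out1);
  return 0;
}
```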