Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
2372daff
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
2372daff
编写于
7月 23, 2018
作者:
N
nhzlx
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
there is no batchsize concept in tensorrt's tensor
上级
4a076178
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
74 addition
and
17 deletion
+74
-17
paddle/fluid/inference/tensorrt/engine.cc
paddle/fluid/inference/tensorrt/engine.cc
+32
-12
paddle/fluid/inference/tensorrt/engine.h
paddle/fluid/inference/tensorrt/engine.h
+6
-2
paddle/fluid/inference/tensorrt/test_engine.cc
paddle/fluid/inference/tensorrt/test_engine.cc
+36
-3
未找到文件。
paddle/fluid/inference/tensorrt/engine.cc
浏览文件 @
2372daff
...
@@ -26,6 +26,8 @@ namespace paddle {
...
@@ -26,6 +26,8 @@ namespace paddle {
namespace
inference
{
namespace
inference
{
namespace
tensorrt
{
namespace
tensorrt
{
int
TensorRTEngine
::
runtime_batch_
=
1
;
void
TensorRTEngine
::
Build
(
const
DescType
&
paddle_model
)
{
void
TensorRTEngine
::
Build
(
const
DescType
&
paddle_model
)
{
PADDLE_ENFORCE
(
false
,
"not implemented"
);
PADDLE_ENFORCE
(
false
,
"not implemented"
);
}
}
...
@@ -40,6 +42,7 @@ void TensorRTEngine::Execute(int batch_size) {
...
@@ -40,6 +42,7 @@ void TensorRTEngine::Execute(int batch_size) {
}
}
infer_context_
->
enqueue
(
batch_size
,
buffers
.
data
(),
*
stream_
,
nullptr
);
infer_context_
->
enqueue
(
batch_size
,
buffers
.
data
(),
*
stream_
,
nullptr
);
cudaStreamSynchronize
(
*
stream_
);
cudaStreamSynchronize
(
*
stream_
);
SetRuntimeBatch
(
batch_size
);
}
}
TensorRTEngine
::~
TensorRTEngine
()
{
TensorRTEngine
::~
TensorRTEngine
()
{
...
@@ -76,14 +79,15 @@ void TensorRTEngine::FreezeNetwork() {
...
@@ -76,14 +79,15 @@ void TensorRTEngine::FreezeNetwork() {
auto
dims
=
infer_engine_
->
getBindingDimensions
(
slot_offset
);
auto
dims
=
infer_engine_
->
getBindingDimensions
(
slot_offset
);
item
.
second
=
kDataTypeSize
[
static_cast
<
int
>
(
item
.
second
=
kDataTypeSize
[
static_cast
<
int
>
(
infer_engine_
->
getBindingDataType
(
slot_offset
))]
*
infer_engine_
->
getBindingDataType
(
slot_offset
))]
*
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
);
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
)
*
max_batch_
;
}
}
auto
&
buf
=
buffer
(
item
.
first
);
auto
&
buf
=
buffer
(
item
.
first
);
CHECK
(
buf
.
buffer
==
nullptr
);
// buffer should be allocated only once.
CHECK
(
buf
.
buffer
==
nullptr
);
// buffer should be allocated only once.
PADDLE_ENFORCE_EQ
(
0
,
cudaMalloc
(
&
buf
.
buffer
,
item
.
second
));
PADDLE_ENFORCE_EQ
(
0
,
cudaMalloc
(
&
buf
.
buffer
,
item
.
second
*
max_batch_
));
VLOG
(
4
)
<<
"buffer malloc "
<<
item
.
first
<<
" "
<<
item
.
second
<<
" "
VLOG
(
4
)
<<
"buffer malloc "
<<
item
.
first
<<
" "
<<
item
.
second
<<
" "
<<
buf
.
buffer
;
<<
buf
.
buffer
;
buf
.
size
=
buf
.
max_size
=
item
.
second
;
buf
.
size
=
item
.
second
;
buf
.
max_size
=
item
.
second
*
max_batch_
;
buf
.
device
=
DeviceType
::
GPU
;
buf
.
device
=
DeviceType
::
GPU
;
}
}
}
}
...
@@ -98,7 +102,7 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
...
@@ -98,7 +102,7 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
auto
*
input
=
infer_network_
->
addInput
(
name
.
c_str
(),
dtype
,
dims
);
auto
*
input
=
infer_network_
->
addInput
(
name
.
c_str
(),
dtype
,
dims
);
PADDLE_ENFORCE
(
input
,
"infer network add input %s failed"
,
name
);
PADDLE_ENFORCE
(
input
,
"infer network add input %s failed"
,
name
);
buffer_sizes_
[
name
]
=
kDataTypeSize
[
static_cast
<
int
>
(
dtype
)]
*
buffer_sizes_
[
name
]
=
kDataTypeSize
[
static_cast
<
int
>
(
dtype
)]
*
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
);
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
)
*
max_batch_
;
PADDLE_ENFORCE
(
input
->
isNetworkInput
());
PADDLE_ENFORCE
(
input
->
isNetworkInput
());
TensorRTEngine
::
SetITensor
(
name
,
input
);
TensorRTEngine
::
SetITensor
(
name
,
input
);
return
input
;
return
input
;
...
@@ -139,30 +143,40 @@ void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
...
@@ -139,30 +143,40 @@ void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
return
buffer
(
name
).
buffer
;
return
buffer
(
name
).
buffer
;
}
}
void
TensorRTEngine
::
GetOutputInGPU
(
const
std
::
string
&
name
,
void
*
dst
,
void
TensorRTEngine
::
GetOutputInGPU
(
const
std
::
string
&
name
,
void
*
dst
)
{
size_t
max_size
)
{
// determine data size
// determine data size
auto
*
output
=
TensorRTEngine
::
GetITensor
(
name
);
nvinfer1
::
Dims
dims
=
output
->
getDimensions
();
auto
dim_size
=
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
);
size_t
dst_size
=
dim_size
*
runtime_batch_
*
kDataTypeSize
[
static_cast
<
int
>
(
output
->
getType
())];
auto
it
=
buffer_sizes_
.
find
(
name
);
auto
it
=
buffer_sizes_
.
find
(
name
);
PADDLE_ENFORCE
(
it
!=
buffer_sizes_
.
end
());
PADDLE_ENFORCE
(
it
!=
buffer_sizes_
.
end
());
PADDLE_ENFORCE_GT
(
it
->
second
,
0
);
PADDLE_ENFORCE_GT
(
it
->
second
,
0
);
PADDLE_ENFORCE_
GE
(
max
_size
,
it
->
second
);
PADDLE_ENFORCE_
LE
(
dst
_size
,
it
->
second
);
auto
&
buf
=
buffer
(
name
);
auto
&
buf
=
buffer
(
name
);
PADDLE_ENFORCE_NOT_NULL
(
buf
.
buffer
,
"buffer should be allocated before"
);
PADDLE_ENFORCE_NOT_NULL
(
buf
.
buffer
,
"buffer should be allocated before"
);
PADDLE_ENFORCE_EQ
(
cudaMemcpyAsync
(
dst
,
buf
.
buffer
,
it
->
second
,
PADDLE_ENFORCE_EQ
(
cudaMemcpyAsync
(
dst
,
buf
.
buffer
,
dst_size
,
cudaMemcpyDeviceToDevice
,
*
stream_
),
cudaMemcpyDeviceToDevice
,
*
stream_
),
0
);
0
);
}
}
void
TensorRTEngine
::
GetOutputInCPU
(
const
std
::
string
&
name
,
void
*
dst
,
void
TensorRTEngine
::
GetOutputInCPU
(
const
std
::
string
&
name
,
void
*
dst
)
{
size_t
max_size
)
{
// determine data size
// determine data size
auto
*
output
=
TensorRTEngine
::
GetITensor
(
name
);
nvinfer1
::
Dims
dims
=
output
->
getDimensions
();
auto
dim_size
=
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
);
size_t
dst_size
=
dim_size
*
runtime_batch_
*
kDataTypeSize
[
static_cast
<
int
>
(
output
->
getType
())];
auto
it
=
buffer_sizes_
.
find
(
name
);
auto
it
=
buffer_sizes_
.
find
(
name
);
PADDLE_ENFORCE
(
it
!=
buffer_sizes_
.
end
());
PADDLE_ENFORCE
(
it
!=
buffer_sizes_
.
end
());
PADDLE_ENFORCE_GT
(
it
->
second
,
0
);
PADDLE_ENFORCE_GT
(
it
->
second
,
0
);
PADDLE_ENFORCE_
GE
(
max
_size
,
it
->
second
);
PADDLE_ENFORCE_
LE
(
dst
_size
,
it
->
second
);
auto
&
buf
=
buffer
(
name
);
auto
&
buf
=
buffer
(
name
);
PADDLE_ENFORCE_NOT_NULL
(
buf
.
buffer
,
"buffer should be allocated before"
);
PADDLE_ENFORCE_NOT_NULL
(
buf
.
buffer
,
"buffer should be allocated before"
);
PADDLE_ENFORCE_EQ
(
0
,
cudaMemcpyAsync
(
dst
,
buf
.
buffer
,
it
->
second
,
PADDLE_ENFORCE_EQ
(
0
,
cudaMemcpyAsync
(
dst
,
buf
.
buffer
,
dst_size
,
cudaMemcpyDeviceToHost
,
*
stream_
));
cudaMemcpyDeviceToHost
,
*
stream_
));
}
}
...
@@ -207,6 +221,12 @@ nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
...
@@ -207,6 +221,12 @@ nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
return
itensor_map_
[
name
];
return
itensor_map_
[
name
];
}
}
void
TensorRTEngine
::
SetRuntimeBatch
(
size_t
batch_size
)
{
runtime_batch_
=
batch_size
;
}
int
TensorRTEngine
::
GetRuntimeBatch
()
{
return
runtime_batch_
;
}
}
// namespace tensorrt
}
// namespace tensorrt
}
// namespace inference
}
// namespace inference
}
// namespace paddle
}
// namespace paddle
paddle/fluid/inference/tensorrt/engine.h
浏览文件 @
2372daff
...
@@ -104,10 +104,10 @@ class TensorRTEngine : public EngineBase {
...
@@ -104,10 +104,10 @@ class TensorRTEngine : public EngineBase {
// Return the output's GPU memory address without copy.
// Return the output's GPU memory address without copy.
void
*
GetOutputInGPU
(
const
std
::
string
&
name
);
void
*
GetOutputInGPU
(
const
std
::
string
&
name
);
// Copy data into dst inside the GPU device.
// Copy data into dst inside the GPU device.
void
GetOutputInGPU
(
const
std
::
string
&
name
,
void
*
dst
,
size_t
max_size
);
void
GetOutputInGPU
(
const
std
::
string
&
name
,
void
*
dst
);
// LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU
// LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU
// to CPU.
// to CPU.
void
GetOutputInCPU
(
const
std
::
string
&
name
,
void
*
dst
,
size_t
max_size
);
void
GetOutputInCPU
(
const
std
::
string
&
name
,
void
*
dst
);
// Fill an ITensor into map itensor_map_.
// Fill an ITensor into map itensor_map_.
void
SetITensor
(
const
std
::
string
&
name
,
nvinfer1
::
ITensor
*
tensor
);
void
SetITensor
(
const
std
::
string
&
name
,
nvinfer1
::
ITensor
*
tensor
);
// Get an ITensor called name.
// Get an ITensor called name.
...
@@ -115,10 +115,14 @@ class TensorRTEngine : public EngineBase {
...
@@ -115,10 +115,14 @@ class TensorRTEngine : public EngineBase {
nvinfer1
::
ICudaEngine
*
engine
()
{
return
infer_engine_
.
get
();
}
nvinfer1
::
ICudaEngine
*
engine
()
{
return
infer_engine_
.
get
();
}
nvinfer1
::
INetworkDefinition
*
network
()
{
return
infer_network_
.
get
();
}
nvinfer1
::
INetworkDefinition
*
network
()
{
return
infer_network_
.
get
();
}
void
SetRuntimeBatch
(
size_t
batch_size
);
int
GetRuntimeBatch
();
private:
private:
// the max batch size
// the max batch size
int
max_batch_
;
int
max_batch_
;
// the runtime batch size
static
int
runtime_batch_
;
// the max memory size the engine uses
// the max memory size the engine uses
int
max_workspace_
;
int
max_workspace_
;
cudaStream_t
*
stream_
;
cudaStream_t
*
stream_
;
...
...
paddle/fluid/inference/tensorrt/test_engine.cc
浏览文件 @
2372daff
...
@@ -28,7 +28,7 @@ class TensorRTEngineTest : public ::testing::Test {
...
@@ -28,7 +28,7 @@ class TensorRTEngineTest : public ::testing::Test {
protected:
protected:
void
SetUp
()
override
{
void
SetUp
()
override
{
ASSERT_EQ
(
0
,
cudaStreamCreate
(
&
stream_
));
ASSERT_EQ
(
0
,
cudaStreamCreate
(
&
stream_
));
engine_
=
new
TensorRTEngine
(
1
,
1
<<
10
,
&
stream_
);
engine_
=
new
TensorRTEngine
(
1
0
,
1
<<
10
,
&
stream_
);
engine_
->
InitNetwork
();
engine_
->
InitNetwork
();
}
}
...
@@ -71,7 +71,7 @@ TEST_F(TensorRTEngineTest, add_layer) {
...
@@ -71,7 +71,7 @@ TEST_F(TensorRTEngineTest, add_layer) {
LOG
(
INFO
)
<<
"to get output"
;
LOG
(
INFO
)
<<
"to get output"
;
float
y_cpu
;
float
y_cpu
;
engine_
->
GetOutputInCPU
(
"y"
,
&
y_cpu
,
sizeof
(
float
)
);
engine_
->
GetOutputInCPU
(
"y"
,
&
y_cpu
);
LOG
(
INFO
)
<<
"to checkout output"
;
LOG
(
INFO
)
<<
"to checkout output"
;
ASSERT_EQ
(
y_cpu
,
x_v
*
2
+
3
);
ASSERT_EQ
(
y_cpu
,
x_v
*
2
+
3
);
...
@@ -103,11 +103,44 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
...
@@ -103,11 +103,44 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
LOG
(
INFO
)
<<
"to get output"
;
LOG
(
INFO
)
<<
"to get output"
;
float
y_cpu
[
2
]
=
{
-
1.
,
-
1.
};
float
y_cpu
[
2
]
=
{
-
1.
,
-
1.
};
engine_
->
GetOutputInCPU
(
"y"
,
&
y_cpu
[
0
]
,
sizeof
(
float
)
*
2
);
engine_
->
GetOutputInCPU
(
"y"
,
&
y_cpu
[
0
]);
ASSERT_EQ
(
y_cpu
[
0
],
4.5
);
ASSERT_EQ
(
y_cpu
[
0
],
4.5
);
ASSERT_EQ
(
y_cpu
[
1
],
14.5
);
ASSERT_EQ
(
y_cpu
[
1
],
14.5
);
}
}
TEST_F
(
TensorRTEngineTest
,
test_conv2d_temp
)
{
// Weight in CPU memory.
float
raw_weight
[
9
]
=
{
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
};
float
raw_bias
[
1
]
=
{
0
};
TensorRTEngine
::
Weight
weight
(
nvinfer1
::
DataType
::
kFLOAT
,
raw_weight
,
9
);
TensorRTEngine
::
Weight
bias
(
nvinfer1
::
DataType
::
kFLOAT
,
raw_bias
,
1
);
auto
*
x
=
engine_
->
DeclareInput
(
"x"
,
nvinfer1
::
DataType
::
kFLOAT
,
nvinfer1
::
Dims3
{
1
,
3
,
3
});
auto
*
conv_layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Convolution
,
*
x
,
1
,
nvinfer1
::
DimsHW
{
3
,
3
},
weight
.
get
(),
bias
.
get
());
PADDLE_ENFORCE
(
conv_layer
!=
nullptr
);
conv_layer
->
setStride
(
nvinfer1
::
DimsHW
{
1
,
1
});
conv_layer
->
setPadding
(
nvinfer1
::
DimsHW
{
1
,
1
});
engine_
->
DeclareOutput
(
conv_layer
,
0
,
"y"
);
engine_
->
FreezeNetwork
();
ASSERT_EQ
(
engine_
->
engine
()
->
getNbBindings
(),
2
);
float
x_v
[
18
]
=
{
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
};
engine_
->
SetInputFromCPU
(
"x"
,
reinterpret_cast
<
void
*>
(
&
x_v
),
18
*
sizeof
(
float
));
engine_
->
Execute
(
2
);
LOG
(
INFO
)
<<
"to get output"
;
float
*
y_cpu
=
new
float
[
18
];
engine_
->
GetOutputInCPU
(
"y"
,
&
y_cpu
[
0
]);
ASSERT_EQ
(
y_cpu
[
0
],
4.0
);
ASSERT_EQ
(
y_cpu
[
1
],
6.0
);
}
}
// namespace tensorrt
}
// namespace tensorrt
}
// namespace inference
}
// namespace inference
}
// namespace paddle
}
// namespace paddle
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录