Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
6cbe597a
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
6cbe597a
编写于
5月 15, 2018
作者:
T
Tao Luo
提交者:
GitHub
5月 15, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #10495 from luotao1/refine_relu_test
refine EngineIOConverter, and use io_convert in test_trt_activation_op
上级
dfdcb7ea
1992f709
变更
8
显示空白变更内容
内联
并排
Showing
8 changed file
with
137 addition
and
71 deletion
+137
-71
paddle/fluid/inference/analysis/dot.h
paddle/fluid/inference/analysis/dot.h
+1
-0
paddle/fluid/inference/engine.h
paddle/fluid/inference/engine.h
+3
-2
paddle/fluid/inference/tensorrt/CMakeLists.txt
paddle/fluid/inference/tensorrt/CMakeLists.txt
+0
-1
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+2
-2
paddle/fluid/inference/tensorrt/convert/io_converter.cc
paddle/fluid/inference/tensorrt/convert/io_converter.cc
+30
-13
paddle/fluid/inference/tensorrt/convert/io_converter.h
paddle/fluid/inference/tensorrt/convert/io_converter.h
+34
-19
paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
...le/fluid/inference/tensorrt/convert/test_activation_op.cc
+24
-14
paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
+43
-20
未找到文件。
paddle/fluid/inference/analysis/dot.h
浏览文件 @
6cbe597a
...
...
@@ -21,6 +21,7 @@
#include <glog/logging.h>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
...
...
paddle/fluid/inference/engine.h
浏览文件 @
6cbe597a
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/framework.pb.h"
namespace
paddle
{
...
...
@@ -58,8 +59,8 @@ class EngineBase {
struct
Buffer
{
void
*
buffer
{
nullptr
};
// buffer should be allocated only once.
int
max_size
;
// buffer allocated space.
int
size
;
// data size.
size_t
max_size
;
// buffer allocated space.
size_t
size
;
// data size.
DeviceType
device
{
DeviceType
::
UNK
};
// tells which device this buffer is on.
};
...
...
paddle/fluid/inference/tensorrt/CMakeLists.txt
浏览文件 @
6cbe597a
nv_library
(
tensorrt_engine SRCS engine.cc DEPS framework_proto
)
nv_test
(
test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader
)
nv_test
(
test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine
)
add_subdirectory
(
convert
)
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
浏览文件 @
6cbe597a
nv_test
(
test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc
op_converter.h
DEPS
${
FLUID_CORE_MODULES
}
)
nv_test
(
test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
nv_test
(
test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS
${
FLUID_CORE_MODULES
}
)
nv_test
(
test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
io_converter.cc
DEPS
${
FLUID_CORE_MODULES
}
activation_op tensorrt_engine
)
nv_test
(
test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor
)
paddle/fluid/inference/tensorrt/convert/io_converter.cc
浏览文件 @
6cbe597a
...
...
@@ -23,26 +23,42 @@ namespace tensorrt {
using
platform
::
is_gpu_place
;
using
platform
::
is_cpu_place
;
class
DefaultI
nputConverter
:
public
EngineInput
Converter
{
class
DefaultI
OConverter
:
public
EngineIO
Converter
{
public:
DefaultI
nput
Converter
()
{}
DefaultI
O
Converter
()
{}
// NOTE out is GPU memory.
virtual
void
operator
()(
const
LoDTensor
&
in
,
void
*
out
,
size_t
max_size
)
override
{
PADDLE_ENFORCE
(
out
!=
nullptr
);
PADDLE_ENFORCE
_LE
(
in
.
memory_size
(),
max_size
);
PADDLE_ENFORCE
(
stream_
!=
nullptr
);
const
auto
&
place
=
in
.
place
();
size_t
size
=
in
.
memory_size
();
PADDLE_ENFORCE_LE
(
size
,
max_size
);
if
(
is_cpu_place
(
place
))
{
PADDLE_ENFORCE
(
stream_
!=
nullptr
);
PADDLE_ENFORCE_EQ
(
0
,
cudaMemcpyAsync
(
out
,
in
.
data
<
float
>
(),
in
.
memory_size
(),
PADDLE_ENFORCE_EQ
(
0
,
cudaMemcpyAsync
(
out
,
in
.
data
<
float
>
(),
size
,
cudaMemcpyHostToDevice
,
*
stream_
));
}
else
if
(
is_gpu_place
(
place
))
{
PADDLE_ENFORCE_EQ
(
0
,
cudaMemcpyAsync
(
out
,
in
.
data
<
float
>
(),
in
.
memory_size
(),
cudaMemcpyHostToHost
,
*
stream_
));
PADDLE_ENFORCE_EQ
(
0
,
cudaMemcpyAsync
(
out
,
in
.
data
<
float
>
(),
size
,
cudaMemcpyDeviceToDevice
,
*
stream_
));
}
else
{
PADDLE_THROW
(
"Unknown device for converter"
);
}
cudaStreamSynchronize
(
*
stream_
);
}
// NOTE in is GPU memory.
virtual
void
operator
()(
const
void
*
in
,
LoDTensor
*
out
,
size_t
max_size
)
override
{
PADDLE_ENFORCE
(
in
!=
nullptr
);
PADDLE_ENFORCE
(
stream_
!=
nullptr
);
const
auto
&
place
=
out
->
place
();
size_t
size
=
out
->
memory_size
();
PADDLE_ENFORCE_LE
(
size
,
max_size
);
if
(
is_cpu_place
(
place
))
{
PADDLE_ENFORCE_EQ
(
0
,
cudaMemcpyAsync
(
out
->
data
<
float
>
(),
in
,
size
,
cudaMemcpyDeviceToHost
,
*
stream_
));
}
else
if
(
is_gpu_place
(
place
))
{
PADDLE_ENFORCE_EQ
(
0
,
cudaMemcpyAsync
(
out
->
data
<
float
>
(),
in
,
size
,
cudaMemcpyDeviceToDevice
,
*
stream_
));
}
else
{
PADDLE_THROW
(
"Unknown device for converter"
);
}
...
...
@@ -50,7 +66,8 @@ class DefaultInputConverter : public EngineInputConverter {
}
};
REGISTER_TENSORRT_INPUT_CONVERTER
(
default
,
DefaultInputConverter
);
// fluid LodTensor <-> tensorrt ITensor
REGISTER_TENSORRT_IO_CONVERTER
(
default
,
DefaultIOConverter
);
}
// namespace tensorrt
}
// namespace inference
...
...
paddle/fluid/inference/tensorrt/convert/io_converter.h
浏览文件 @
6cbe597a
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/utils/singleton.h"
...
...
@@ -25,43 +26,57 @@ namespace tensorrt {
using
framework
::
LoDTensor
;
/*
* Convert Input from Fluid to an Engine.
* TensorRT's ITensor follows row major, NCHW. Fluid is also row major, so in
* most cases just need to copy the data.
* Convert Input from Fluid to TensorRT Engine.
* Convert Output from TensorRT Engine to Fluid.
*
* Note that TensorRT's ITensor follows row major, NCHW. Fluid is also row
* major,
* so in the default case just need to copy the data.
*/
class
EngineI
nput
Converter
{
class
EngineI
O
Converter
{
public:
EngineI
nput
Converter
()
{}
EngineI
O
Converter
()
{}
virtual
void
operator
()(
const
LoDTensor
&
in
,
void
*
out
,
size_t
max_size
)
{}
virtual
void
operator
()(
const
void
*
in
,
LoDTensor
*
out
,
size_t
max_size
)
{}
void
SetStream
(
cudaStream_t
*
stream
)
{
stream_
=
stream
;
}
static
void
Run
(
const
std
::
string
&
in_op_type
,
const
LoDTensor
&
in
,
void
*
out
,
size_t
max_size
,
cudaStream_t
*
stream
)
{
static
void
ConvertInput
(
const
std
::
string
&
op_type
,
const
LoDTensor
&
in
,
void
*
out
,
size_t
max_size
,
cudaStream_t
*
stream
)
{
PADDLE_ENFORCE
(
stream
!=
nullptr
);
auto
*
converter
=
Registry
<
EngineI
nput
Converter
>::
Lookup
(
in_
op_type
,
"default"
/* default_type */
);
auto
*
converter
=
Registry
<
EngineI
O
Converter
>::
Lookup
(
op_type
,
"default"
/* default_type */
);
PADDLE_ENFORCE_NOT_NULL
(
converter
);
converter
->
SetStream
(
stream
);
(
*
converter
)(
in
,
out
,
max_size
);
}
virtual
~
EngineInputConverter
()
{}
static
void
ConvertOutput
(
const
std
::
string
&
op_type
,
const
void
*
in
,
LoDTensor
*
out
,
size_t
max_size
,
cudaStream_t
*
stream
)
{
PADDLE_ENFORCE
(
stream
!=
nullptr
);
auto
*
converter
=
Registry
<
EngineIOConverter
>::
Lookup
(
op_type
,
"default"
/* default_type */
);
PADDLE_ENFORCE_NOT_NULL
(
converter
);
converter
->
SetStream
(
stream
);
(
*
converter
)(
in
,
out
,
max_size
);
}
virtual
~
EngineIOConverter
()
{}
protected:
cudaStream_t
*
stream_
{
nullptr
};
};
#define REGISTER_TENSORRT_IO_CONVERTER(op_type__, Converter__) \
struct trt_io_##op_type__##_converter { \
trt_io_##op_type__##_converter() { \
Registry<EngineIOConverter>::Register<Converter__>(#op_type__); \
} \
}; \
trt_io_##op_type__##_converter trt_io_##op_type__##_converter__;
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
#define REGISTER_TENSORRT_INPUT_CONVERTER(in_op_type__, Converter__) \
struct trt_input_##in_op_type__##_converter { \
trt_input_##in_op_type__##_converter() { \
::paddle::inference::Registry<EngineInputConverter>::Register< \
Converter__>(#in_op_type__); \
} \
}; \
trt_input_##in_op_type__##_converter trt_input_##in_op_type__##_converter__;
paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
浏览文件 @
6cbe597a
...
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
...
...
@@ -26,7 +27,7 @@ namespace paddle {
namespace
inference
{
namespace
tensorrt
{
void
Compare
(
float
input
,
float
expect
)
{
void
Compare
(
const
std
::
string
op_type
,
float
input
,
float
expect
)
{
framework
::
Scope
scope
;
platform
::
CUDAPlace
place
;
platform
::
CUDADeviceContext
ctx
(
place
);
...
...
@@ -35,6 +36,7 @@ void Compare(float input, float expect) {
auto
x_var
=
scope
.
Var
(
"X"
);
auto
x_tensor
=
x_var
->
GetMutable
<
framework
::
LoDTensor
>
();
x_tensor
->
Resize
({
1
,
1
});
x_tensor
->
mutable_data
<
float
>
(
place
);
std
::
vector
<
float
>
init
;
init
.
push_back
(
input
);
framework
::
TensorFromVector
(
init
,
ctx
,
x_tensor
);
...
...
@@ -45,14 +47,15 @@ void Compare(float input, float expect) {
out_tensor
->
mutable_data
<
float
>
(
place
);
framework
::
OpDesc
op_desc
;
op_desc
.
SetType
(
"relu"
);
op_desc
.
SetType
(
op_type
);
op_desc
.
SetInput
(
"X"
,
{
"X"
});
op_desc
.
SetOutput
(
"Out"
,
{
"Out"
});
auto
relu_
op
=
framework
::
OpRegistry
::
CreateOp
(
*
op_desc
.
Proto
());
auto
op
=
framework
::
OpRegistry
::
CreateOp
(
*
op_desc
.
Proto
());
// run fluid op
relu_op
->
Run
(
scope
,
place
);
op
->
Run
(
scope
,
place
);
// get fluid output
std
::
vector
<
float
>
out1
;
framework
::
TensorToVector
(
*
out_tensor
,
ctx
,
&
out1
);
...
...
@@ -63,21 +66,28 @@ void Compare(float input, float expect) {
engine
->
InitNetwork
();
engine
->
DeclareInput
(
"X"
,
nvinfer1
::
DataType
::
kFLOAT
,
nvinfer1
::
DimsCHW
{
1
,
1
,
1
});
// convert op
OpConverter
op_converter
;
op_converter
.
ConvertOp
(
*
op_desc
.
Proto
(),
engine
);
engine
->
DeclareOutput
(
"Out"
);
engine
->
FreezeNetwork
();
engine
->
SetInputFromCPU
(
"X"
,
&
input
,
1
*
sizeof
(
float
));
// run tensorrt op
// convert LoDTensor to ITensor
size_t
size
=
x_tensor
->
memory_size
();
EngineIOConverter
::
ConvertInput
(
op_type
,
*
x_tensor
,
engine
->
buffer
(
"X"
).
buffer
,
size
,
&
stream
);
// run tensorrt Outp
engine
->
Execute
(
1
);
float
out2
;
engine
->
GetOutputInCPU
(
"Out"
,
&
out2
,
1
*
sizeof
(
float
));
ASSERT_EQ
(
out1
[
0
],
out2
);
// convert ITensor to LoDTensor
EngineIOConverter
::
ConvertOutput
(
op_type
,
engine
->
buffer
(
"Out"
).
buffer
,
out_tensor
,
size
,
&
stream
);
// get tensorrt output
std
::
vector
<
float
>
out2
;
framework
::
TensorToVector
(
*
out_tensor
,
ctx
,
&
out2
);
// compare
ASSERT_EQ
(
out1
[
0
],
out2
[
0
]);
ASSERT_EQ
(
out1
[
0
],
expect
);
delete
engine
;
...
...
@@ -85,8 +95,8 @@ void Compare(float input, float expect) {
}
TEST
(
OpConverter
,
ConvertRelu
)
{
Compare
(
1
,
1
);
// relu(1) = 1
Compare
(
-
5
,
0
);
// relu(-5) = 0
Compare
(
"relu"
,
1
,
1
);
// relu(1) = 1
Compare
(
"relu"
,
-
5
,
0
);
// relu(-5) = 0
}
}
// namespace tensorrt
...
...
paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
浏览文件 @
6cbe597a
...
...
@@ -12,40 +12,63 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
#include <gtest/gtest.h>
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
class
EngineInputConverterTester
:
public
::
testing
::
Test
{
public:
void
SetUp
()
override
{
tensor
.
Resize
({
10
,
10
});
}
void
IOConverterTester
(
const
platform
::
DeviceContext
&
ctx
)
{
cudaStream_t
stream
;
ASSERT_EQ
(
0
,
cudaStreamCreate
(
&
stream
));
framework
::
LoDTensor
tensor
;
};
// init fluid in_tensor
framework
::
LoDTensor
in_tensor
;
in_tensor
.
Resize
({
10
,
10
});
auto
place
=
ctx
.
GetPlace
();
in_tensor
.
mutable_data
<
float
>
(
place
);
std
::
vector
<
float
>
init
;
for
(
int64_t
i
=
0
;
i
<
10
*
10
;
++
i
)
{
init
.
push_back
(
i
);
}
framework
::
TensorFromVector
(
init
,
ctx
,
&
in_tensor
);
TEST_F
(
EngineInputConverterTester
,
DefaultCPU
)
{
// init tensorrt buffer
void
*
buffer
;
tensor
.
mutable_data
<
float
>
(
platform
::
CPUPlace
()
);
ASSERT_EQ
(
cudaMalloc
(
&
buffer
,
tensor
.
memory_size
()
),
0
);
size_t
size
=
in_tensor
.
memory_size
(
);
ASSERT_EQ
(
cudaMalloc
(
&
buffer
,
size
),
0
);
cudaStream_t
stream
;
EngineInputConverter
::
Run
(
"test"
,
tensor
,
buffer
,
tensor
.
memory_size
(),
&
stream
);
// convert fluid in_tensor to tensorrt buffer
EngineIOConverter
::
ConvertInput
(
"test"
,
in_tensor
,
buffer
,
size
,
&
stream
);
// convert tensorrt buffer to fluid out_tensor
framework
::
LoDTensor
out_tensor
;
out_tensor
.
Resize
({
10
,
10
});
out_tensor
.
mutable_data
<
float
>
(
place
);
EngineIOConverter
::
ConvertOutput
(
"test"
,
buffer
,
&
out_tensor
,
size
,
&
stream
);
// compare in_tensor and out_tensor
std
::
vector
<
float
>
result
;
framework
::
TensorToVector
(
out_tensor
,
ctx
,
&
result
);
EXPECT_EQ
(
init
.
size
(),
result
.
size
());
for
(
size_t
i
=
0
;
i
<
init
.
size
();
i
++
)
{
EXPECT_EQ
(
init
[
i
],
result
[
i
]);
}
cudaStreamDestroy
(
stream
);
}
TEST_F
(
EngineInputConverterTester
,
DefaultGPU
)
{
void
*
buffer
;
tensor
.
mutable_data
<
float
>
(
platform
::
CUDAPlace
());
ASSERT_EQ
(
cudaMalloc
(
&
buffer
,
tensor
.
memory_size
()),
0
);
TEST
(
EngineIOConverterTester
,
DefaultCPU
)
{
platform
::
CPUPlace
place
;
platform
::
CPUDeviceContext
ctx
(
place
);
IOConverterTester
(
ctx
);
}
cudaStream_t
stream
;
EngineInputConverter
::
Run
(
"test"
,
tensor
,
buffer
,
tensor
.
memory_size
(),
&
stream
);
TEST
(
EngineIOConverterTester
,
DefaultGPU
)
{
platform
::
CUDAPlace
place
;
platform
::
CUDADeviceContext
ctx
(
place
);
IOConverterTester
(
ctx
);
}
}
// namespace tensorrt
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录