Commit 975687d5
Authored Feb 15, 2019 by hjchen2

Merge conflicts caused by FPGA

Parents: ccd30dda, 3a8d22a4
Showing 54 changed files with 1524 additions and 197 deletions (+1524 -197).

CMakeLists.txt                                                +1    -1
src/fpga/V1/api.cpp                                           +44   -24
src/fpga/V1/api.h                                             +1    -0
src/fpga/V1/deconv_filter.cpp                                 +1    -0
src/fpga/V1/image.cpp                                         +16   -15
src/fpga/V1/pe.cpp                                            +0    -3
src/fpga/common/fpga_common.cpp                               +1    -1
src/fpga/common/fpga_common.h                                 +10   -9
src/framework/executor.cpp                                    +35   -2
src/framework/executor.h                                      +2    -0
src/framework/operator.cpp                                    +22   -0
src/framework/operator.h                                      +3    -1
src/framework/program/program_desc.cpp                        +2    -1
src/framework/scope.cpp                                       +24   -0
src/framework/scope.h                                         +6    -0
src/framework/tensor.h                                        +7    -1
src/io/api_paddle_mobile.cc                                   +85   -0
src/io/api_paddle_mobile.h                                    +8    -1
src/io/paddle_inference_api.h                                 +16   -0
src/io/paddle_mobile.cpp                                      +10   -0
src/io/paddle_mobile.h                                        +2    -0
src/operators/detection_ops.cpp                               +13   -0
src/operators/kernel/detection_kernel.h                       +8    -0
src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp      +70   -0
src/operators/kernel/fpga/V1/concat_kernel.cpp                +2    -2
src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp           +5    -4
src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp      +6    -4
src/operators/kernel/fpga/V1/conv_add_kernel.cpp              +2    -2
src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp         +2    -2
src/operators/kernel/fpga/V1/conv_bn_kernel.cpp               +4    -4
src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp          +5    -4
src/operators/kernel/fpga/V1/deconv_add_kernel.cpp            +2    -2
src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp       +2    -2
src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp       +3    -3
src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp  +3    -3
src/operators/kernel/fpga/V1/feed_kernel.cpp                  +26   -6
src/operators/kernel/fpga/V1/fetch_kernel.cpp                 +36   -21
src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp             +1    -1
src/operators/kernel/fpga/V1/pool_kernel.cpp                  +3    -3
src/operators/kernel/fpga/V1/proposal_kernel.cpp              +440  -0
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp            +204  -0
src/operators/kernel/fpga/V1/reshape2_kernel.cpp              +136  -0
src/operators/kernel/fpga/V1/sigmoid_kernel.cpp               +2    -2
src/operators/kernel/fpga/V1/slice_kernel.cpp                 +57   -0
src/operators/kernel/fpga/V1/softmax_kernel.cpp               +57   -34
src/operators/kernel/fpga/V1/split_kernel.cpp                 +13   -3
src/operators/kernel/fpga/V1/tanh_kernel.cpp                  +5    -3
src/operators/kernel/fpga/V1/transpose2_kernel.cpp            +15   -1
src/operators/op_param.h                                      +3    -13
src/operators/reshape2_op.cpp                                 +3    -0
test/CMakeLists.txt                                           +3    -0
test/fpga/test_resnet50.cpp                                   +30   -19
test/fpga/test_rfcn.cpp                                       +62   -0
tools/op.cmake                                                +5    -0
CMakeLists.txt
@@ -4,7 +4,7 @@ option(USE_OPENMP "build with openmp support" ON)
 option(USE_EXCEPTION "build with exception" ON)
 option(WITH_LOGGING "print logging for debug" ON)
 option(WITH_SYMBOL "build with all symbols" ON)  # turn off if use jni or ios io
-option(WITH_PROFILE "print op profile for debug" ON)
+option(WITH_PROFILE "print op profile for debug" OFF)
 option(WITH_TEST "build with unit tests" ON)
 # select the platform to build

src/fpga/V1/api.cpp
@@ -28,13 +28,22 @@ void format_image(framework::Tensor *image_tensor) {
   auto dims = image_tensor->dims();
   auto channel = dims[1], height = dims[2], width = dims[3];
   auto data_ptr = image_tensor->data<float>();
-  size_t memory_size = channel * height * width * sizeof(float);
-  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
-  fpga_copy(new_data, data_ptr, memory_size);
-  image::format_image(&new_data, channel, height, width);
-  image_tensor->reset_data_ptr(new_data);
+  auto external_ptr = reinterpret_cast<float *>(image_tensor->external_data);
+  float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
+  float *old_p = p_data;
+  image::format_image(&p_data, channel, height, width);
+  if (old_p != p_data) {
+    image_tensor->reset_data_ptr(p_data);
+  }
+}
+
+void format_ofm(framework::Tensor *ofm_tensor) {
+  if (ofm_tensor->type() == typeid(float)) {
+    format_fp32_ofm(ofm_tensor);
+  } else {
+    format_fp16_ofm(ofm_tensor);
+  }
 }

 void format_fp16_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
   size_t memory_size = 0;
@@ -50,6 +59,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
   auto p = fpga_malloc(memory_size);
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(typeid(half));
 }

 void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
@@ -67,6 +77,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
   auto p = fpga_malloc(memory_size);
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(typeid(half));
 }

 void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
@@ -83,6 +94,7 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   auto p = fpga_malloc(memory_size);
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(typeid(float));
 }

 float filter_find_max(framework::Tensor *filter_tensor) {
@@ -139,6 +151,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
   filter::format_filter(&new_data, num, channel, height, width, group_num,
                         max_value);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }

 void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
   auto dims = filter_tensor->dims();
@@ -149,6 +162,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
   fpga_copy(new_data, data_ptr, memory_size);
   filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }

 void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
@@ -173,6 +187,7 @@ void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
   //  framework::make_ddim({num, 1, height, width});
   // filter_tensor->Resize(dims_new);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }

 void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
@@ -187,6 +202,7 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
   filter::format_fc_filter(&new_data, num, channel, height, width, 1,
                            max_value);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }

 void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
                           int group_num, int stride) {
@@ -213,6 +229,7 @@ void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
       framework::make_ddim({num, channel, height, width});
   filter_tensor->Resize(dims_new);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }

 void format_bias_scale_array(float **bias_scale_array,
@@ -236,6 +253,7 @@ void format_concat_output(framework::Tensor *out, int height, int width,
   auto ddim = framework::make_ddim({1, sum_channel, height, width});
   out->Resize(ddim);
   out->reset_data_ptr(data_ptr);
+  out->set_type(typeid(half));
 }

 void format_conv_data(framework::Tensor *filter_tensor,
                       framework::Tensor *ofm_tensor, float **bs_ptr,
@@ -447,9 +465,9 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
                     int16_t leaky_relu_negative_slope, int group_num,
                     int stride_h, int stride_w, int padding_h, int padding_w,
                     float *bs_ptr) {
-  auto input_ptr = input->data<float>();
-  auto filter_ptr = filter->data<float>();
-  auto out_ptr = out->data<float>();
+  auto input_ptr = input->data<half>();
+  auto filter_ptr = filter->data<int8_t>();
+  auto out_ptr = out->data<half>();
   auto deleter = [](void *p) { fpga_free(p); };

   arg->group_num = (uint32_t)group_num;
@@ -571,8 +589,8 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
                      int16_t leaky_relu_negative_slope, int group_num,
                      int stride_h, int stride_w, int padding_h, int padding_w,
                      float *bs_ptr) {
-  auto input_ptr = input->data<float>();
-  auto filter_ptr = filter->data<float>();
+  auto input_ptr = input->data<half>();
+  auto filter_ptr = filter->data<int8_t>();
   auto deleter = [](void *p) { fpga_free(p); };

   arg->group_num = (uint32_t)group_num;
@@ -603,9 +621,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
   framework::DDim dims_out_new = framework::make_ddim(
       {1, arg->filter_num, sub_output_height * sub_conv_num, real_out_width});
   fpga::format_fp16_ofm(out, dims_out_new);
-  auto out_ptr = out->data<float>();
+  auto out_ptr = out->data<half>();
   arg->output.address =
-      (half *)out_ptr +  // NOLINT
+      out_ptr +
       omit_size * sizeof(half) *
           (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
   arg->output.scale_address = out->scale;
@@ -695,7 +713,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
     }

     for (int j = 0; j < split_num; ++j) {
-      // arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled;
       arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type =
           activation_enable;
       arg->split_conv_args[i]
@@ -741,9 +758,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
           align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num,
                      FILTER_NUM_ALIGNMENT) *
           sizeof(int8_t);
-      auto filter_head = &((
-          int8_t *)filter_ptr)[j * element_num * filter_num_per_div +  // NOLINT
-                               i * filter_sub_conv_offset];
+      auto filter_head =
+          &filter_ptr[j * element_num * filter_num_per_div +  // NOLINT
+                      i * filter_sub_conv_offset];
       arg->split_conv_args[i]->conv_arg[j].filter_address =
           fpga_malloc(filter_size);
       arg->split_conv_args[i]->vector_conv_space.push_back(
@@ -793,7 +810,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
               arg->split_conv_args[i]->conv_arg[j].output.scale_address),
           deleter));
     }
-    arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<int16_t *>(
+    arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<half *>(
         arg->split_conv_args[i]->conv_arg[j].output.address);
     arg->split_conv_args[i]->concat_arg.scales_in[j] =
         arg->split_conv_args[i]->conv_arg[j].output.scale_address;
@@ -818,9 +835,13 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
                      int16_t leaky_relu_negative_slope, int stride_h,
                      int stride_w, int padding_h, int padding_w,
                      float *bias_ptr) {
-  auto filter_ptr = filter->data<float>();
-  auto input_ptr = input->data<float>();
-  auto output_ptr = out->mutable_data<float>();
+  auto deleter = [](void *p) { fpga_free(p); };
+  arg->vector_dwconv_space.push_back(
+      std::shared_ptr<char>(reinterpret_cast<char *>(bias_ptr), deleter));
+
+  auto filter_ptr = filter->data<uint8_t>();
+  auto input_ptr = input->data<half>();
+  auto output_ptr = out->mutable_data<half>();
   arg->sub_conv_num = 1;
   // arg->relu_enabled = relu_enabled;
   arg->output.activation.activation_type = activation_enable;
@@ -848,9 +869,8 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
                        int16_t leaky_relu_negative_slope, int stride_h,
                        int stride_w, int padding_h, int padding_w,
                        float *bias_ptr) {
-  auto filter_ptr = filter->data<float>();
-  auto input_ptr = input->data<float>();
-  auto output_ptr = out->mutable_data<float>();
+  auto filter_ptr = filter->data<int8_t>();
+  auto input_ptr = input->data<half>();
   auto deleter = [](void *p) { fpga_free(p); };
@@ -885,7 +905,7 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
   framework::DDim dims_out_new = framework::make_ddim(
       {1, arg->filter_num, real_out_height, real_out_width});
   fpga::format_fp16_ofm(out, dims_out_new);
-  auto out_ptr = out->data<float>();
+  auto out_ptr = out->data<half>();

   /*====For Addition
   arg->output.address =

src/fpga/V1/api.h
@@ -23,6 +23,7 @@ namespace paddle_mobile {
 namespace fpga {

 void format_image(framework::Tensor *image_tensor);
+void format_ofm(framework::Tensor *ofm_tensor);
 void format_fp16_ofm(framework::Tensor *ofm_tensor);  // only allocate memory
 void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims);
 void format_fp32_ofm(framework::Tensor *ofm_tensor);

src/fpga/V1/deconv_filter.cpp
@@ -247,6 +247,7 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
     fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset);
     fpga_free(ptr_tmp);
   }
+  fpga_free(ptr_ptr_data);
   *data_in = reinterpret_cast<float *>(ptr_space);

   /* {

src/fpga/V1/image.cpp
@@ -22,7 +22,6 @@ namespace fpga {
 namespace image {

 void convert_to_hwc(float **data_in, int channel, int height, int width) {
-  float *tmp = *data_in;
   float *data_tmp =
       (float *)fpga_malloc(channel * height * width * sizeof(float));  // NOLINT
   int64_t amount_per_row = width * channel;
@@ -35,33 +34,35 @@ void convert_to_hwc(float **data_in, int channel, int height, int width) {
     }
   }
   *data_in = data_tmp;
-  fpga_free(tmp);
 }

 void align_element_conv(float **data_in, int height, int cw) {
   int h = 0;
   int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
-  if (align_cw != cw) {
-    float *tmp = *data_in;
-    float *data_tmp =
-        (float *)fpga_malloc(height * align_cw * sizeof(float));  // NOLINT
-    memset(data_tmp, 0, height * align_cw * sizeof(float));
-    for (h = 0; h < height; h++) {
-      memcpy((void *)(data_tmp + h * align_cw),  // NOLINT
-             (void *)(*data_in + h * cw),        // NOLINT
-             cw * sizeof(float));
-    }
-    *data_in = data_tmp;
-    fpga_free(tmp);
-  }
+
+  float *data_tmp =
+      (float *)fpga_malloc(height * align_cw * sizeof(float));  // NOLINT
+
+  memset(data_tmp, 0, height * align_cw * sizeof(float));
+
+  for (h = 0; h < height; h++) {
+    memcpy((void *)(data_tmp + h * align_cw),  // NOLINT
+           (void *)(*data_in + h * cw),        // NOLINT
+           cw * sizeof(float));
+  }
+
+  *data_in = data_tmp;
 }

 void format_image(float **data_in, int channel, int height, int width) {
   convert_to_hwc(data_in, channel, height, width);
-  align_element_conv(data_in, height, channel * width);
+  int cw = channel * width;
+  int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
+  if (align_cw != cw) {
+    float *hwc_temp = *data_in;
+    align_element_conv(data_in, height, channel * width);
+    fpga_free(hwc_temp);
+  }
   fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height *
                            sizeof(float));
 }

src/fpga/V1/pe.cpp
@@ -290,14 +290,11 @@ int ComputeBasicConv(const struct ConvArgs &args) {
   reg_writeq(args.driver.deconv_param, 0xd18);
   reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
   reg_writeq(args.driver.cmd, REG_CONV_CMD);
-  DLOG << "before reg poll";

   if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
     g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
     ret = -EIO;
     DLOG << "Conv Wait Irq Timeout!";
   }
-  DLOG << "after reg poll";
-
   output_scale = reg_readq(REG_SCALE_PARAMETER);
   output_scale = (output_scale << 32) | (output_scale >> 32);
   fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);

src/fpga/common/fpga_common.cpp
@@ -164,7 +164,7 @@ void fpga_free(void *ptr) {
     //  DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
     //       << counter << " bytes";
   } else {
-    DLOG << "Invalid pointer";
+    DLOG << "Address: " << ptr << " Invalid pointer";
   }
 }

 void fpga_copy(void *dest, const void *src, size_t num) {

src/fpga/common/fpga_common.h
@@ -19,17 +19,16 @@ limitations under the License. */
 #include <memory>
 #include <vector>

-namespace paddle_mobile {
-namespace fpga {
-
 #ifdef PADDLE_MOBILE_FPGA_V1
-#define IMAGE_ALIGNMENT 16           // Aligned to 16
-#define FILTER_NUM_ALIGNMENT 32      // Filter number aligned to 32
-#define FILTER_ELEMENT_ALIGNMENT 16  // Filter element number aligned to 16
-#define BS_NUM_ALIGNMENT 8
-#define BIAS_NUM_ALIGNMENT 16
+#define IMAGE_ALIGNMENT (16)           // Aligned to 16
+#define FILTER_NUM_ALIGNMENT (32)      // Filter number aligned to 32
+#define FILTER_ELEMENT_ALIGNMENT (16)  // Filter element number aligned to 16
+#define BS_NUM_ALIGNMENT (8)
+#define BIAS_NUM_ALIGNMENT (16)
 #endif

+namespace paddle_mobile {
+namespace fpga {
+
 enum DataType {
   DATA_TYPE_FP32 = 1,
   DATA_TYPE_FP16 = 0,
@@ -49,7 +48,7 @@ enum ActivationType {
 };

 struct ActivationArgs {
-  enum ActivationType activation_type;
+  enum ActivationType activation_type = NONE;
   int16_t leaky_relu_negative_slope;
 };
@@ -188,6 +187,7 @@ struct SplitArgs {
   uint32_t *out_channel_nums;
   uint32_t height;
   uint32_t width;
+  std::vector<std::shared_ptr<char>> vector_split_space;
 };

 struct PoolingArgs {
@@ -237,6 +237,7 @@ struct DWconvArgs {
   struct KernelArgs kernel;
   struct ImageInputArgs image;
   struct ImageOutputArgs output;
+  std::vector<std::shared_ptr<char>> vector_dwconv_space;
 };

 struct DWDeconvArgs {

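The parenthesized alignment macros above are what the V1 code feeds into its align_to_x(...) call sites, for example align_to_x(channel * width, IMAGE_ALIGNMENT) in image.cpp and align_to_x(..., FILTER_NUM_ALIGNMENT) in api.cpp. As a minimal sketch of the round-up semantics those call sites assume — a standalone snippet for illustration, not the library's actual definition:

#include <cstdio>

// Assumed semantics only: round n up to the next multiple of x, as the
// align_to_x(cw, IMAGE_ALIGNMENT) call sites in this diff rely on.
static int align_to_x(int n, int x) { return (n + x - 1) / x * x; }

int main() {
  const int kImageAlignment = 16;  // mirrors #define IMAGE_ALIGNMENT (16)
  // A 3-channel, 7-pixel-wide row: cw = 21 floats is padded up to 32.
  printf("%d\n", align_to_x(3 * 7, kImageAlignment));  // prints 32
  return 0;
}
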
src/framework/executor.cpp
@@ -83,6 +83,11 @@ Executor<Device, T>::Executor(const Program<Device> &program,
   // resize feed and fetch list
   InitFeedFetchList();
+
+#ifdef PADDLE_MOBILE_FPGA
+  program_.scope->EraseVars({"feed", "fetch"});
+  program_.scope->print_vars();
+#endif

   int count = 0;
   for (auto &op_handler : ops_of_block0_) {
     DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
@@ -291,6 +296,7 @@ template <typename Device, typename T>
 bool Executor<Device, T>::varInputMemory(
     const std::shared_ptr<VarDesc> &var_desc, Variable *var) const {
 #ifdef PADDLE_MOBILE_FPGA
   framework::LoDTensor *tensor = var->template GetMutable<LoDTensor>();
+  tensor->init(typeid(float));
   return true;
 #endif
@@ -506,14 +512,41 @@ template <typename Device, typename T>
 void Executor<Device, T>::InjectVariable(const Tensor &t,
                                          std::string var_name) {
   Variable *g_feed_value = program_.scope->Var(var_name);
-  Tensor *feed_tensor = g_feed_value->GetMutable<LoDTensor>();
+  Tensor *feed_tensor = g_feed_value->template GetMutable<LoDTensor>();
   feed_tensor->Resize(t.dims());
   feed_tensor->ShareDataWith(t);
 }

 template <typename Device, typename T>
 void Executor<Device, T>::FeedData(const Tensor &t) {
-  InjectVariable(t, "feed");
+  InjectVariable(t, "feed0");
 }

+template <typename Device, typename T>
+void Executor<Device, T>::FeedData(const std::vector<void *> &v) {
+  auto input_size = v.size();
+  auto vars = program_.scope->VarContain("feed");
+  PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
+                        "input data number not correct");
+  for (int i = 0; i < input_size; i++) {
+    auto var = program_.scope->Var("feed", i);
+    auto feed_tensor = var->template GetMutable<LoDTensor>();
+    feed_tensor->external_data = v[i];
+  }
+}
+
+template <typename Device, typename T>
+void Executor<Device, T>::GetResults(std::vector<void *> *v) {
+  auto output_size = v->size();
+  PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output");
+  auto vars = program_.scope->VarContain("fetch");
+  PADDLE_MOBILE_ENFORCE(output_size == vars.size(),
+                        "output data number not correct");
+  for (int i = 0; i < output_size; i++) {
+    auto var = program_.scope->Var("fetch", i);
+    auto fetch_tensor = var->template GetMutable<LoDTensor>();
+    (*v)[i] = fetch_tensor->template data<float>();
+  }
+}
+
 template <typename Device, typename T>

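The new FeedData(const std::vector<void *> &) / GetResults(std::vector<void *> *) pair above relies on the renamed "feed0", "feed1", ... / "fetch0", ... variables created by OperatorBase::InsertTensors() in src/framework/operator.cpp below. A hypothetical caller-side sketch; the executor type, the two-input/two-output shape, and buffer ownership are illustrative assumptions, not part of this commit:

#include <vector>

// ExecutorT stands in for Executor<FPGA, float>.
template <typename ExecutorT>
void RunWithRawBuffers(ExecutorT *executor, void *in0, void *in1) {
  std::vector<void *> inputs = {in0, in1};  // one entry per "feed%d" variable
  executor->FeedData(inputs);               // stored into feed_tensor->external_data

  executor->Predict_From_To(0, -1);         // run every op in the block

  std::vector<void *> outputs(2);           // one slot per "fetch%d" variable
  executor->GetResults(&outputs);           // filled with pointers to float results
}
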
src/framework/executor.h
@@ -52,6 +52,8 @@ class Executor {
 #ifdef PADDLE_MOBILE_FPGA
   void InjectVariable(const Tensor &t, std::string var_name);
   void FeedData(const Tensor &t);
+  void FeedData(const std::vector<void *> &v);
+  void GetResults(std::vector<void *> *v);
   std::shared_ptr<Tensor> FetchResult(int id = -1);
   void Predict_From_To(int start = 0, int end = -1);
   void Predict_From(int start);

src/framework/operator.cpp
@@ -50,6 +50,9 @@ OperatorBase<Dtype>::OperatorBase(const std::string &type,
       attrs_(attrs),
       scope_(scope) {
   CheckAllInputOutputSet();
+#ifdef PADDLE_MOBILE_FPGA
+  InsertTensors();
+#endif
 }

 template <typename Dtype>
@@ -133,6 +136,25 @@ void OperatorBase<GPU_CL>::Run() {
 }
 #endif

+#ifdef PADDLE_MOBILE_FPGA
+template <typename Dtype>
+void OperatorBase<Dtype>::InsertTensors() {
+  static int feed_num = 0;
+  static int fetch_num = 0;
+  if (type_ == "feed") {
+    auto new_name = string("feed") + std::to_string(feed_num++);
+    auto var = scope_->Var(new_name);
+    var->template GetMutable<framework::LoDTensor>();
+    inputs_.at("X") = {string(new_name)};
+  } else if (type_ == "fetch") {
+    auto new_name = string("fetch") + std::to_string(fetch_num++);
+    auto var = scope_->Var(new_name);
+    var->template GetMutable<framework::LoDTensor>();
+    outputs_.at("Out") = {string(new_name)};
+  }
+}
+#endif
+
 template class OperatorBase<CPU>;
 template class OperatorBase<FPGA>;
 template class OperatorBase<GPU_MALI>;

src/framework/operator.h
@@ -78,6 +78,9 @@ class OperatorBase {
       this->scope_->EraseVars(var_names);
     }
   }
+#ifdef PADDLE_MOBILE_FPGA
+  void InsertTensors();
+#endif

  protected:
   framework::Scope *scope_;
@@ -102,7 +105,6 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
     kernel_.InitCLHelper(scope->GetCLScpoe());
 #endif
   }
-
   virtual void RunImpl() { this->kernel_.Compute(this->param_); }

   virtual void InferShape() const = 0;

src/framework/program/program_desc.cpp
@@ -72,7 +72,8 @@ void ProgramDesc::Description(std::string header) {
       }
     }
     for (auto &attr : op->GetAttrMap()) {
-      LOG(kLOG_DEBUG2) << "attr name:: " << attr.first;
+      if (attr.first == "op_callstack") continue;
+      LOG(kLOG_DEBUG2) << "attr name: " << attr.first;
       LOG(kLOG_DEBUG3) << "argument - " << attr.second;
     }
   }

src/framework/scope.cpp
@@ -111,5 +111,29 @@ Variable *Scope::FindVarLocally(const std::string &name) const {
   return nullptr;
 }

+#ifdef PADDLE_MOBILE_FPGA
+Variable *Scope::Var(const std::string &name, const int id) {
+  return Var(name + std::to_string(id));
+}
+
+std::vector<Variable *> Scope::VarContain(const std::string substring) {
+  std::vector<Variable *> v;
+  for (auto pair : vars_) {
+    if (pair.first.find(substring) == 0) {
+      v.push_back(pair.second);
+    }
+  }
+  return v;
+}
+
+void Scope::print_vars() {
+  DLOG << "====================start to print variables=================";
+  for (auto pair : vars_) {
+    DLOG << pair.first;
+  }
+  DLOG << "==================complete printing variables================";
+}
+#endif
+
 }  // namespace framework
 }  // namespace paddle_mobile
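The FPGA helpers added above encode a simple naming convention: Scope::Var(name, id) appends the numeric id ("feed" + 0 becomes "feed0"), and VarContain(substring) returns every variable whose name starts with that substring (pair.first.find(substring) == 0). A small standalone illustration of that prefix match, using a plain map instead of the real Scope class:

#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // Stand-in for Scope::vars_ after InsertTensors() has renamed feed/fetch vars.
  std::map<std::string, int> vars = {{"feed0", 0}, {"feed1", 1}, {"fetch0", 2}};
  std::vector<std::string> matched;
  for (const auto &pair : vars) {
    if (pair.first.find("feed") == 0) {  // same test VarContain() uses
      matched.push_back(pair.first);
    }
  }
  for (const auto &name : matched) {
    std::cout << name << "\n";  // prints feed0 and feed1
  }
  return 0;
}
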
src/framework/scope.h
@@ -75,6 +75,12 @@ class Scope {
   Variable *FindVarLocally(const std::string &name) const;

+#ifdef PADDLE_MOBILE_FPGA
+  Variable *Var(const std::string &name, const int id);
+  std::vector<Variable *> VarContain(const std::string substring);
+  void print_vars();
+#endif
+
 #ifdef PADDLE_MOBILE_CL
   CLScope *GetCLScpoe() { return cl_scope_; }
 #endif

src/framework/tensor.h
@@ -202,6 +202,11 @@ class Tensor : public TensorBase {
   inline void reset_data_ptr(void *p) {
     ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p);  // NOLINT
   }
+  inline void set_type(std::type_index type) { holder_->set_type(type); }
+  inline void *get_data() {
+    return (
+        void *)(((PlaceholderImpl *)(holder_.get()))->ptr_.get());  // NOLINT
+  }
   inline void *init(std::type_index type) {
     if (holder_ != nullptr) {
@@ -217,7 +222,8 @@ class Tensor : public TensorBase {
         reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
   }

-  float scale[2];  // scale[0]= MAX/127.0, scale[1]= 127.0/MAX
+  float scale[2];                  // scale[0]= MAX/127.0, scale[1]= 127.0/MAX
+  void *external_data = nullptr;   // only used for Feed
 #endif
 };

src/io/api_paddle_mobile.cc
@@ -110,6 +110,91 @@ bool PaddleMobilePredictor<Device, T>::Run(
   return true;
 }

+#ifdef PADDLE_MOBILE_FPGA
+template <typename Device, typename T>
+bool PaddleMobilePredictor<Device, T>::Run(
+    const std::vector<PaddleTensor> &inputs,
+    std::vector<PaddleTensor> *output_data, std::vector<int> *index_data,
+    int batch_size) {
+  if (inputs.empty()) {
+    LOG(kLOG_ERROR) << "At least one output should be set with tensors' names.";
+    return false;
+  }
+  auto input = inputs[0];
+
+  if (input.shape.size() != 4) {
+    LOG(kLOG_ERROR) << "input shape not equal to 4!";
+    return false;
+  }
+  std::vector<int64_t> dims;
+  for (auto d : input.shape) {
+    dims.push_back(static_cast<int64_t>(d));
+  }
+
+  // use tensor
+  framework::DDim ddim =
+      framework::make_ddim({dims[0], dims[1], dims[2], dims[3]});
+
+  framework::Tensor input_tensor;
+  input_tensor.Resize(ddim);
+  int input_length = framework::product(ddim);
+  auto input_ptr = input_tensor.mutable_data<T>();
+
+  memcpy(input_ptr, static_cast<T *>(input.data.data()),
+         input_length * sizeof(T));
+  paddle_mobile_->Predict(input_tensor);
+
+  auto num_result = index_data->size();
+  if (output_data->size() != num_result) {
+    LOG(kLOG_ERROR) << "index and output number don't match";
+    return false;
+  }
+
+  for (int i = 0; i < num_result; i++) {
+    auto output_tensor = paddle_mobile_->FetchResult((*index_data)[i]);
+
+    if (output_data->empty()) {
+      LOG(kLOG_ERROR)
+          << "At least one output should be set with tensors' names.";
+      return false;
+    }
+
+    auto &output = (*output_data)[i];
+    int output_length = output_tensor->numel();
+    std::vector<int64_t> tensor_shape =
+        framework::vectorize(output_tensor->dims());
+
+    for (auto d : tensor_shape) {
+      output.shape.push_back(static_cast<int>(d));
+    }
+
+    if (output.data.length() < output_length * sizeof(T)) {
+      output.data.Resize(output_length * sizeof(T));
+    }
+
+    memcpy(output.data.data(), output_tensor->template data<T>(),
+           output_length * sizeof(T));
+  }
+
+  return true;
+}
+
+template <typename Device, typename T>
+void PaddleMobilePredictor<Device, T>::FeedData(
+    const std::vector<void *> &inputs) {
+  paddle_mobile_->FeedData(inputs);
+}
+
+template <typename Device, typename T>
+void PaddleMobilePredictor<Device, T>::GetResults(std::vector<void *> *outputs) {
+  paddle_mobile_->GetResults(outputs);
+}
+
+template <typename Device, typename T>
+void PaddleMobilePredictor<Device, T>::Predict_From_To(int start, int end) {
+  paddle_mobile_->Predict_From_To(start, end);
+}
+#endif
 template <typename Device, typename T>
 PaddleMobilePredictor<Device, T>::~PaddleMobilePredictor() {
   paddle_mobile_->Clear();
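The FPGA-only Run overload above takes an extra index_data vector naming which operators' results to fetch via FetchResult(), and requires the single input tensor to be 4-D. A hypothetical sketch of how a caller could drive it; PredictorT and PaddleTensorT stand in for PaddleMobilePredictor<FPGA, float> and PaddleTensor, and the fetched indices are illustrative assumptions:

#include <vector>

template <typename PredictorT, typename PaddleTensorT>
bool RunAndFetch(PredictorT *predictor, const PaddleTensorT &input_nchw) {
  std::vector<PaddleTensorT> inputs = {input_nchw};  // input.shape must have 4 dims
  std::vector<int> index_data = {10, 25};            // op indices to fetch (example values)
  std::vector<PaddleTensorT> output_data(index_data.size());  // one output per index
  // Internally: copy the input into a framework::Tensor, run Predict(), then
  // copy each FetchResult(index) back into output_data[i].data.
  return predictor->Run(inputs, &output_data, &index_data, /*batch_size=*/1);
}
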
src/io/api_paddle_mobile.h
@@ -31,7 +31,14 @@ class PaddleMobilePredictor : public PaddlePredictor {
   bool Run(const std::vector<PaddleTensor>& inputs,
            std::vector<PaddleTensor>* output_data,
            int batch_size = -1) override;
-
+#ifdef PADDLE_MOBILE_FPGA
+  bool Run(const std::vector<PaddleTensor>& inputs,
+           std::vector<PaddleTensor>* output_data,
+           std::vector<int>* index_data, int batch_size = -1) override;
+  void FeedData(const std::vector<void*>& inputs) override;
+  void GetResults(std::vector<void*>* outputs) override;
+  void Predict_From_To(int start = 0, int end = -1) override;
+#endif
   ~PaddleMobilePredictor() override;

  private:

src/io/paddle_inference_api.h
@@ -26,8 +26,16 @@ limitations under the License. */
 #include <string>
 #include <vector>

+// #define PADDLE_MOBILE_FPGA
+
 namespace paddle_mobile {

+#ifdef PADDLE_MOBILE_FPGA
+namespace fpga {
+int open_device();
+}
+#endif
+
 enum PaddleDType {
   FLOAT32,
   INT64,
@@ -107,6 +115,14 @@ class PaddlePredictor {
     std::string prog_file;
     std::string param_file;
   };
+#ifdef PADDLE_MOBILE_FPGA
+  virtual bool Run(const std::vector<PaddleTensor>& inputs,
+                   std::vector<PaddleTensor>* output_data,
+                   std::vector<int>* index_data, int batch_size = -1) = 0;
+  virtual void FeedData(const std::vector<void*>& inputs) = 0;
+  virtual void GetResults(std::vector<void*>* outputs) = 0;
+  virtual void Predict_From_To(int start = 0, int end = -1) = 0;
+#endif

  protected:
   PaddlePredictor() = default;

src/io/paddle_mobile.cpp
@@ -228,6 +228,16 @@ void PaddleMobile<Device, T>::FeedData(const framework::Tensor &t) {
   executor_->FeedData(t);
 }

+template <typename Device, typename T>
+void PaddleMobile<Device, T>::FeedData(const std::vector<void *> &v) {
+  executor_->FeedData(v);
+}
+
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::GetResults(std::vector<void *> *v) {
+  executor_->GetResults(v);
+}
+
 template <typename Device, typename T>
 std::shared_ptr<framework::Tensor> PaddleMobile<Device, T>::FetchResult(int id) {

src/io/paddle_mobile.h
@@ -90,6 +90,8 @@ class PaddleMobile {
 #ifdef PADDLE_MOBILE_FPGA
   void InjectVariable(const framework::Tensor &t, std::string var_name);
   void FeedData(const framework::Tensor &t);
+  void FeedData(const std::vector<void *> &v);
+  void GetResults(std::vector<void *> *v);
   std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
   void Predict_From_To(int start = 0, int end = -1);
   void Predict_From(int start);

src/operators/detection_ops.cpp
@@ -22,6 +22,7 @@ namespace operators {
 template <typename DeviceType, typename T>
 void AnchorGeneratorOp<DeviceType, T>::InferShape() const {
   const auto &input_dims = this->param_.input_->dims();
+  // DLOG << "AnchorGenerator input dim =" << input_dims.size();
   PADDLE_MOBILE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
   const auto &anchor_sizes = this->param_.anchor_sizes_;
   const auto &aspect_ratios = this->param_.aspect_ratios_;
@@ -98,3 +99,15 @@ REGISTER_OPERATOR_CPU(psroi_pool, ops::PSRoiPoolOp);
 REGISTER_OPERATOR_CPU(roi_perspective_transform, ops::RoiPerspectiveOp);
 #endif
 #endif
+
+#ifdef PADDLE_MOBILE_FPGA
+#ifdef ANCHOR_GENERATOR_OP
+REGISTER_OPERATOR_FPGA(anchor_generator, ops::AnchorGeneratorOp);
+#endif
+#ifdef PROPOSAL_OP
+REGISTER_OPERATOR_FPGA(generate_proposals, ops::ProposalOp);
+#endif
+#ifdef PSROI_POOL_OP
+REGISTER_OPERATOR_FPGA(psroi_pool, ops::PSRoiPoolOp);
+#endif
+#endif

src/operators/kernel/detection_kernel.h
@@ -103,6 +103,10 @@ class ProposalParam : public OpParam {
   float nms_thresh_;
   float min_size_;
   float eta_;
+#ifdef PADDLE_MOBILE_FPGA
+  std::shared_ptr<Tensor> float_score, float_bbox;
+  fpga::BypassArgs score_arg, bbox_arg;
+#endif
 };

 DECLARE_KERNEL(Proposal, ProposalParam);
@@ -133,6 +137,10 @@ class PSRoiPoolParam : public OpParam {
   int pooled_height_;
   int pooled_width_;
   float spatial_scale_;
+#ifdef PADDLE_MOBILE_FPGA
+  std::shared_ptr<Tensor> float_input, float_output;
+  fpga::BypassArgs input_arg, output_arg;
+#endif
 };

 DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam);

src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp  (new file, mode 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef ANCHOR_GENERATOR_OP

#include <vector>
#include "operators/kernel/detection_kernel.h"

namespace paddle_mobile {
namespace operators {

template <>
bool AnchorGeneratorKernel<FPGA, float>::Init(
    AnchorGeneratorParam<FPGA> *param) {
  auto input = param->input_;
  auto anchors = param->output_anchors_;
  auto anchor_ptr = anchors->mutable_data<float>();
  auto stride = param->stride_;
  auto feature_width = input->dims()[3], feature_height = input->dims()[2];
  auto stride_width = stride[0], stride_height = stride[1];

  int anchors_offset[] = {-2,   -2,   18,  18,  -10,  -9,   26,  25,  -23,
                          -20,  39,   36,  -43, -34,  59,   49,  -63, -54,
                          79,   69,   -96, -77, 112,  93,   -137, -118, 153,
                          134,  -204, -188, 220, 204, -281, -395, 296, 441};
  int num_anchors = sizeof(anchors_offset) / (sizeof(int) * 4);

  // DLOG << "feature_height: " << feature_height;
  // DLOG << "feature_width: " << feature_width;
  // DLOG << "num_anchors: " << num_anchors;
  // DLOG << "stride_width: " << stride_width;
  // DLOG << "stride_height: " << stride_height;

  for (int h_idx = 0; h_idx < feature_height; ++h_idx) {
    for (int w_idx = 0; w_idx < feature_width; ++w_idx) {
      int offset = h_idx * w_idx * num_anchors * 4;
      for (int idx = 0; idx < num_anchors; idx++) {
        anchor_ptr[offset + 0] =
            anchors_offset[idx * 4 + 0] + w_idx * stride_width;
        anchor_ptr[offset + 1] =
            anchors_offset[idx * 4 + 1] + h_idx * stride_height;
        anchor_ptr[offset + 2] =
            anchors_offset[idx * 4 + 2] + w_idx * stride_width;
        anchor_ptr[offset + 3] =
            anchors_offset[idx * 4 + 3] + h_idx * stride_height;
      }
    }
  }
  return true;
}

template <>
void AnchorGeneratorKernel<FPGA, float>::Compute(
    const AnchorGeneratorParam<FPGA> &param) {}

}  // namespace operators
}  // namespace paddle_mobile

#endif  // ANCHOR_GENERATOR_OP

src/operators/kernel/fpga/V1/concat_kernel.cpp
@@ -38,7 +38,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
     PADDLE_MOBILE_ENFORCE(
         input->dims()[2] == height && input->dims()[3] == width,
         "Image height & width should be unified");
-    images_in[i] = (half *)input->data<float>();  // NOLINT
+    images_in[i] = input->data<half>();
     channel_num[i] = (uint32_t)inputs[i]->dims()[1];  // NOLINT
     scales_in[i] = input->scale;
   }
@@ -48,7 +48,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
   concatArgs.image_num = image_num;
   concatArgs.images_in = images_in;
   concatArgs.scales_in = scales_in;
-  concatArgs.image_out = (half *)out->data<float>();  // NOLINT
+  concatArgs.image_out = out->data<half>();
   concatArgs.scale_out = out->scale;
   concatArgs.channel_num = channel_num;
   concatArgs.height = height;

src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp
@@ -26,11 +26,11 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::NONE;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
   auto bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
@@ -59,8 +59,6 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
     bs_ptr[i + channel] = new_scale_ptr[i];
     bs_ptr[i] = new_bias_ptr[i];
   }
-  param->SetNewScale(new_scale);
-  param->SetNewBias(new_bias);

   fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
   fpga::SplitConvArgs conv_arg = {0};
@@ -70,6 +68,9 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
                       param->Paddings()[0], param->Paddings()[1], bs_ptr);
   param->SetFpgaArgs(conv_arg);
+
+  delete new_scale;
+  delete new_bias;
   return true;
 }

src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp
@@ -27,10 +27,10 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::LEAKYRELU;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
   auto bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
   vector<int> paddings = param->Paddings();
@@ -60,8 +60,6 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
     bs_ptr[i + channel] = new_scale_ptr[i];
     bs_ptr[i] = new_bias_ptr[i];
   }
-  param->SetNewScale(new_scale);
-  param->SetNewBias(new_bias);

   const int groups = param->Groups();
   if (groups == channel) {
@@ -71,6 +69,8 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
                           leaky_relu_negative_slope, strides[0], strides[1],
                           paddings[0], paddings[1], new_bias_ptr);
     param->SetFpgaArgs(dwconv_arg);
+    fpga::fpga_free(new_scale_ptr);
+    fpga::fpga_free(bs_ptr);
   } else {
     fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
     fpga::SplitConvArgs conv_arg = {0};
@@ -78,6 +78,8 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
                          leaky_relu_negative_slope, param->Groups(), strides[0],
                          strides[1], paddings[0], paddings[1], bs_ptr);
     param->SetFpgaArgs(conv_arg);
+    delete new_scale;
+    delete new_bias;
   }
   return true;
 }

src/operators/kernel/fpga/V1/conv_add_kernel.cpp
@@ -25,10 +25,10 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::NONE;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();

   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],

src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp
@@ -25,10 +25,10 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::LEAKYRELU;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();

   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],

src/operators/kernel/fpga/V1/conv_bn_kernel.cpp
@@ -26,8 +26,8 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::NONE;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto input = const_cast<LoDTensor *>(param->Input());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
@@ -51,8 +51,6 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
     bs_ptr[i + channel] = new_scale_ptr[i];
     bs_ptr[i] = new_bias_ptr[i];
   }
-  param->SetNewScale(new_scale);
-  param->SetNewBias(new_bias);

   fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
   fpga::SplitConvArgs conv_arg = {0};
@@ -61,6 +59,8 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
                       param->Strides()[0], param->Strides()[1],
                       param->Paddings()[0], param->Paddings()[1], bs_ptr);
   param->SetFpgaArgs(conv_arg);
+  delete new_scale;
+  delete new_bias;
   return true;
 }

src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp
@@ -26,8 +26,8 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::LEAKYRELU;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto input = const_cast<LoDTensor *>(param->Input());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
@@ -51,8 +51,6 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
     bs_ptr[i + channel] = new_scale_ptr[i];
     bs_ptr[i] = new_bias_ptr[i];
   }
-  param->SetNewScale(new_scale);
-  param->SetNewBias(new_bias);

   fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
   fpga::SplitConvArgs conv_arg = {0};
@@ -61,6 +59,9 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
                       param->Strides()[0], param->Strides()[1],
                       param->Paddings()[0], param->Paddings()[1], bs_ptr);
   param->SetFpgaArgs(conv_arg);
+
+  delete new_scale;
+  delete new_bias;
   return true;
 }

src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
@@ -27,10 +27,10 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::NONE;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();

   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],

src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
@@ -28,10 +28,10 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::LEAKYRELU;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();

   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],

src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp
@@ -27,10 +27,10 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
   auto *input_x = const_cast<LoDTensor *>(param->InputX());
   auto *input_y = const_cast<LoDTensor *>(param->InputY());
   auto *out = param->Out();
-  auto input_x_ptr = input_x->data<float>();
-  auto input_y_ptr = input_y->data<float>();
+  auto input_x_ptr = input_x->data<half>();
+  auto input_y_ptr = input_y->data<half>();
   fpga::format_fp16_ofm(out);
-  auto out_ptr = out->mutable_data<float>();
+  auto out_ptr = out->mutable_data<half>();
   fpga::EWAddArgs ewaddArgs = {0};
   // ewaddArgs.relu_enabled = relu_enabled;

src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp
@@ -28,10 +28,10 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
   auto *input_x = const_cast<LoDTensor *>(param->InputX());
   auto *input_y = const_cast<LoDTensor *>(param->InputY());
   auto *out = param->Out();
-  auto input_x_ptr = input_x->data<float>();
-  auto input_y_ptr = input_y->data<float>();
+  auto input_x_ptr = input_x->data<half>();
+  auto input_y_ptr = input_y->data<half>();
   fpga::format_fp16_ofm(out);
-  auto out_ptr = out->mutable_data<float>();
+  auto out_ptr = out->mutable_data<half>();
   fpga::EWAddArgs ewaddArgs = {0};
   // ewaddArgs.relu_enabled = relu_enabled;

src/operators/kernel/fpga/V1/feed_kernel.cpp
@@ -19,19 +19,37 @@ namespace operators {
 template <>
 bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
-  Tensor *output = param->Out();
+  auto output = param->Out();
+  int col = param->Col();
+  auto input = const_cast<LoDTensor *>(&param->InputX()->at(col));
+  input->init(typeid(float));
+  input->Resize(output->dims());
+
+  if (output->dims().size() != 4) {
+    auto input_ptr = input->mutable_data<float>();
+    size_t size = output->numel() * sizeof(float);
+    auto p = fpga::fpga_malloc(size);
+    memcpy(p, input_ptr, size);
+    output->reset_data_ptr(p);
+    return true;
+  }
   fpga::format_fp16_ofm(output);
   return true;
 }

 template <>
 void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
-  auto input =
-      reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX()));
+  auto output = param.Out();
+  int col = param.Col();
+  auto input = const_cast<LoDTensor *>(&param.InputX()->at(col));
+  if (input->dims().size() != 4) {
+    return;
+  }
   fpga::format_image(input);
   auto input_ptr = input->data<float>();
-  Tensor *output = param.Out();
-  auto output_ptr = output->data<float>();
+  auto output_ptr = output->data<half>();

   fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
@@ -39,7 +57,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
   args.output_data_type = fpga::DATA_TYPE_FP16;
   args.input_layout_type = fpga::LAYOUT_CHW;
   args.output_layout_type = fpga::LAYOUT_HWC;
-  args.image.address = reinterpret_cast<void *>(input_ptr);
+  args.image.address = input_ptr;
   args.image.channels = (uint32_t)input->dims()[1];
   args.image.height = (uint32_t)input->dims()[2];
   args.image.width = (uint32_t)input->dims()[3];
@@ -48,6 +66,8 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
   args.output.address = output_ptr;
   args.output.scale_address = output->scale;
   fpga::PerformBypass(args);
+
+  input->external_data = nullptr;
 }

 template class FeedKernel<FPGA, float>;

src/operators/kernel/fpga/V1/fetch_kernel.cpp
@@ -19,20 +19,15 @@ namespace operators {
 template <>
 bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
-  Tensor *output = param->Out();
-  // fpga::format_fp16_ofm(output);
-  return true;
-}
-
-template <>
-void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
-  param.Out()->ShareDataWith(*(param.InputX()));
-  /*auto input =
-      reinterpret_cast<Tensor *>(const_cast<Tensor *>(param.InputX()));
-  fpga::format_image(input);
-  auto input_ptr = input->data<float>();
-  Tensor *output = param.Out();
-  auto output_ptr = output->data<float>();
+  auto input = const_cast<LoDTensor *>(param->InputX());
+  int col = param->Col();
+  auto output = &(param->Out()->at(col));
+  if (input->type() == typeid(float)) {
+    return true;
+  }
+  output->init(typeid(float));
+  output->Resize(input->dims());
+  fpga::format_fp32_ofm(output);

   fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
@@ -40,13 +35,33 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
   args.output_data_type = fpga::DATA_TYPE_FP32;
   args.input_layout_type = fpga::LAYOUT_CHW;
   args.output_layout_type = fpga::LAYOUT_HWC;
-  args.image.address = reinterpret_cast<void *>(input_ptr);
-  args.image.channels = (uint32_t)input->dims()[1];
-  args.image.height = (input->dims().size() == 4) ? (uint32_t)input->dims()[2] :
-  1; args.image.width = (input->dims().size() == 4) ? (uint32_t)input->dims()[3]
-  : 1; args.image.pad_height = 0; args.image.pad_width = 0; args.output.address
-  = output_ptr; args.output.scale_address = output->scale;
-  fpga::PerformBypass(args);*/
+  args.image.address = input->data<half>();
+  args.image.channels = (uint32_t)product(input->dims());
+  args.image.height = 1;
+  args.image.width = 1;
+  args.image.pad_height = 0;
+  args.image.pad_width = 0;
+  args.output.address = output->data<float>();
+  args.output.scale_address = output->scale;
+  param->fpga_bypass_args = args;
+  return true;
+}
+
+template <>
+void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
+  auto input = param.InputX();
+  if (input->type() == typeid(float)) {
+    int col = param.Col();
+    auto output = &(param.Out()->at(col));
+    output->ShareDataWith(*input);
+    return;
+  }
+
+  fpga::PerformBypass(param.fpga_bypass_args);
+  fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
+                        param.fpga_bypass_args.image.channels * sizeof(float));
+
+  // TODO: DEalign: get rid of extra 0
 }

 template class FetchKernel<FPGA, float>;

src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp
@@ -25,7 +25,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
       paddle_mobile::fpga::NONE;
   int16_t leaky_relu_negative_slope = 0;
   auto input_x = const_cast<LoDTensor *>(param->InputX());
-  auto filter = const_cast<Tensor *>(param->InputY());
+  auto filter = const_cast<LoDTensor *>(param->InputY());
   const Tensor *input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
   auto out = param->Out();

src/operators/kernel/fpga/V1/pool_kernel.cpp
@@ -21,11 +21,11 @@ namespace operators {
 template <>
 bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
-  auto *input = const_cast<Tensor *>(param->Input());
-  auto input_ptr = input->data<float>();
+  auto *input = const_cast<LoDTensor *>(param->Input());
+  auto input_ptr = input->data<half>();
   Tensor *output = param->Output();
   fpga::format_fp16_ofm(output);
-  auto output_ptr = output->mutable_data<float>();
+  auto output_ptr = output->mutable_data<half>();
   vector<int> ksize = param->Ksize();
   vector<int> strides = param->Strides();
   vector<int> paddings = param->Paddings();

src/operators/kernel/fpga/V1/proposal_kernel.cpp
0 → 100644
浏览文件 @
975687d5
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PROPOSAL_OP
#include <algorithm>
#include <cmath>
#include <vector>
#include "operators/kernel/detection_kernel.h"
namespace paddle_mobile {
namespace operators {

static const double kBBoxClipDefault = std::log(1000.0 / 16.0);

template <>
bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
  int post_nms_top_n = param->post_nms_topn_;
  int64_t batch = param->scores_->dims()[0];
  auto total = post_nms_top_n * batch;
  param->rpn_rois_->mutable_data<float>({total, 4});
  param->rpn_probs_->mutable_data<float>({total, 1});
  // DLOG << *param->rpn_rois_;
  // DLOG << *param->rpn_probs_;

  param->float_bbox = std::make_shared<Tensor>();
  param->float_bbox->Resize(param->bbox_deltas_->dims());
  param->float_bbox->init(typeid(float));
  fpga::format_fp32_ofm(param->float_bbox.get());
  param->float_score = std::make_shared<Tensor>();
  param->float_score->Resize(param->scores_->dims());
  param->float_score->init(typeid(float));
  fpga::format_fp32_ofm(param->float_score.get());

  auto input = param->bbox_deltas_;
  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
  args.input_layout_type = fpga::LAYOUT_HWC;
  args.output_layout_type = fpga::LAYOUT_HWC;
  args.input_data_type = fpga::DATA_TYPE_FP16;
  args.output_data_type = fpga::DATA_TYPE_FP32;
  args.image.address = input->data<half>();
  args.image.height = (uint32_t)input->dims()[2];
  args.image.width = (uint32_t)input->dims()[3];
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = param->float_bbox->mutable_data<float>();
  args.output.scale_address = param->float_bbox->scale;
  param->bbox_arg = args;

  input = param->scores_;
  args.image.address = input->data<half>();
  args.image.height = (uint32_t)input->dims()[2];
  args.image.width = (uint32_t)input->dims()[3];
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = param->float_score->mutable_data<float>();
  args.output.scale_address = param->float_score->scale;
  param->score_arg = args;

  return true;
}
void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
  auto *out_data = dst->data<void>();
  auto *to_add_data = src.data<void>();
  size_t size_of_t = framework::SizeOfType(src.type());
  offset *= size_of_t;
  std::memcpy(
      reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(out_data) + offset),
      to_add_data, src.numel() * size_of_t);
}
template <class T>
static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
                            Tensor *variances, Tensor *proposals) {
  T *proposals_data = proposals->mutable_data<T>();
  int64_t row = all_anchors->dims()[0];
  int64_t len = all_anchors->dims()[1];
  auto *bbox_deltas_data = bbox_deltas->data<T>();
  auto *anchor_data = all_anchors->data<T>();
  const T *variances_data = nullptr;
  if (variances) {
    variances_data = variances->data<T>();
  }
  for (int64_t i = 0; i < row; ++i) {
    T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0;
    T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0;
    T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width;
    T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height;
    T bbox_center_x = 0, bbox_center_y = 0;
    T bbox_width = 0, bbox_height = 0;
    if (variances) {
      bbox_center_x =
          variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width +
          anchor_center_x;
      bbox_center_y = variances_data[i * len + 1] *
                          bbox_deltas_data[i * len + 1] * anchor_height +
                      anchor_center_y;
      bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
                                            bbox_deltas_data[i * len + 2],
                                        kBBoxClipDefault)) *
                   anchor_width;
      bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
                                             bbox_deltas_data[i * len + 3],
                                         kBBoxClipDefault)) *
                    anchor_height;
    } else {
      bbox_center_x =
          bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
      bbox_center_y =
          bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
      bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
                                        kBBoxClipDefault)) *
                   anchor_width;
      bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
                                         kBBoxClipDefault)) *
                    anchor_height;
    }
    proposals_data[i * len] = bbox_center_x - bbox_width / 2;
    proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
    proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
    proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
  }
  // return proposals;
}
template <class T>
static inline void ClipTiledBoxes(const Tensor &im_info, Tensor *boxes) {
  T *boxes_data = boxes->mutable_data<T>();
  const T *im_info_data = im_info.data<T>();
  T zero(0);
  for (int64_t i = 0; i < boxes->numel(); ++i) {
    if (i % 4 == 0) {
      boxes_data[i] =
          std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
    } else if (i % 4 == 1) {
      boxes_data[i] =
          std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
    } else if (i % 4 == 2) {
      boxes_data[i] =
          std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
    } else {
      boxes_data[i] =
          std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
    }
  }
}
template <class T>
static inline void FilterBoxes(Tensor *boxes, float min_size,
                               const Tensor &im_info, Tensor *keep) {
  const T *im_info_data = im_info.data<T>();
  T *boxes_data = boxes->mutable_data<T>();
  T im_scale = im_info_data[2];
  keep->Resize({boxes->dims()[0]});
  min_size = std::max(min_size, 1.0f);
  int *keep_data = keep->mutable_data<int>();

  int keep_len = 0;
  for (int i = 0; i < boxes->dims()[0]; ++i) {
    T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1;
    T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1;
    T ws_origin_scale =
        (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1;
    T hs_origin_scale =
        (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1;
    T x_ctr = boxes_data[4 * i] + ws / 2;
    T y_ctr = boxes_data[4 * i + 1] + hs / 2;
    if (ws_origin_scale >= min_size && hs_origin_scale >= min_size &&
        x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) {
      keep_data[keep_len++] = i;
    }
  }
  keep->Resize({keep_len});
}
template <class T>
static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
    const std::vector<T> &scores) {
  std::vector<std::pair<T, int>> sorted_indices;
  sorted_indices.reserve(scores.size());
  for (size_t i = 0; i < scores.size(); ++i) {
    sorted_indices.emplace_back(scores[i], i);
  }
  // Sort the score pair according to the scores in descending order
  std::stable_sort(sorted_indices.begin(), sorted_indices.end(),
                   [](const std::pair<T, int> &a, const std::pair<T, int> &b) {
                     return a.first < b.first;
                   });
  return sorted_indices;
}
template <class T>
static inline T BBoxArea(const T *box, bool normalized) {
  if (box[2] < box[0] || box[3] < box[1]) {
    // If coordinate values are invalid
    // (e.g. xmax < xmin or ymax < ymin), return 0.
    return static_cast<T>(0.);
  } else {
    const T w = box[2] - box[0];
    const T h = box[3] - box[1];
    if (normalized) {
      return w * h;
    } else {
      // If coordinate values are not within range [0, 1].
      return (w + 1) * (h + 1);
    }
  }
}
template <typename T>
static inline Tensor VectorToTensor(const std::vector<T> &selected_indices,
                                    int selected_num) {
  Tensor keep_nms;
  keep_nms.Resize({selected_num});
  auto *keep_data = keep_nms.mutable_data<T>();
  for (int i = 0; i < selected_num; ++i) {
    keep_data[i] = selected_indices[i];
  }
  return keep_nms;
}
template <class T>
static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) {
  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
      box2[3] < box1[1]) {
    return static_cast<T>(0.);
  } else {
    const T inter_xmin = std::max(box1[0], box2[0]);
    const T inter_ymin = std::max(box1[1], box2[1]);
    const T inter_xmax = std::min(box1[2], box2[2]);
    const T inter_ymax = std::min(box1[3], box2[3]);
    const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1);
    const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1);
    const T inter_area = inter_w * inter_h;
    const T bbox1_area = BBoxArea<T>(box1, normalized);
    const T bbox2_area = BBoxArea<T>(box2, normalized);
    return inter_area / (bbox1_area + bbox2_area - inter_area);
  }
}
template <class T>
static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold,
                         float eta) {
  int64_t num_boxes = bbox->dims()[0];
  // 4: [xmin ymin xmax ymax]
  int64_t box_size = bbox->dims()[1];

  std::vector<T> scores_data(num_boxes);
  std::copy_n(scores->data<T>(), num_boxes, scores_data.begin());
  std::vector<std::pair<T, int>> sorted_indices =
      GetSortedScoreIndex<T>(scores_data);

  std::vector<int> selected_indices;
  int selected_num = 0;
  T adaptive_threshold = nms_threshold;
  const T *bbox_data = bbox->data<T>();
  while (sorted_indices.size() != 0) {
    int idx = sorted_indices.back().second;
    bool flag = true;
    for (int kept_idx : selected_indices) {
      if (flag) {
        T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
                                      bbox_data + kept_idx * box_size, false);
        flag = (overlap <= adaptive_threshold);
      } else {
        break;
      }
    }
    if (flag) {
      selected_indices.push_back(idx);
      ++selected_num;
    }
    sorted_indices.erase(sorted_indices.end() - 1);
    if (flag && eta < 1 && adaptive_threshold > 0.5) {
      adaptive_threshold *= eta;
    }
  }
  return VectorToTensor(selected_indices, selected_num);
}
template <typename T>
std::pair<Tensor, Tensor> ProposalForOneImage(
    const Tensor &im_info_slice, const Tensor &anchors,
    const Tensor &variances,
    const Tensor &bbox_deltas_slice,  // [M, 4]
    const Tensor &scores_slice,       // [N, 1]
    int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
    float eta) {
  auto *scores_data = scores_slice.data<T>();

  // Sort index
  Tensor index_t;
  index_t.Resize({scores_slice.numel()});
  int *index = index_t.mutable_data<int>();
  for (int i = 0; i < scores_slice.numel(); ++i) {
    index[i] = i;
  }
  auto compare = [scores_data](const int64_t &i, const int64_t &j) {
    return scores_data[i] > scores_data[j];
  };
  if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) {
    std::sort(index, index + scores_slice.numel(), compare);
  } else {
    std::nth_element(index, index + pre_nms_top_n,
                     index + scores_slice.numel(), compare);
    index_t.Resize({pre_nms_top_n});
  }

  Tensor scores_sel, bbox_sel, anchor_sel, var_sel;
  scores_sel.mutable_data<T>({index_t.numel(), 1});
  bbox_sel.mutable_data<T>({index_t.numel(), 4});
  anchor_sel.mutable_data<T>({index_t.numel(), 4});
  var_sel.mutable_data<T>({index_t.numel(), 4});

  Tensor proposals;
  proposals.mutable_data<T>({index_t.numel(), 4});
  BoxCoder<T>(&anchor_sel, &bbox_sel, &var_sel, &proposals);

  ClipTiledBoxes<T>(im_info_slice, &proposals);

  Tensor keep;
  FilterBoxes<T>(&proposals, min_size, im_info_slice, &keep);

  Tensor scores_filter;
  bbox_sel.mutable_data<T>({keep.numel(), 4});
  scores_filter.mutable_data<T>({keep.numel(), 1});
  if (nms_thresh <= 0) {
    return std::make_pair(bbox_sel, scores_filter);
  }

  Tensor keep_nms = NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta);

  if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
    keep_nms.Resize({post_nms_top_n});
  }

  proposals.mutable_data<T>({keep_nms.numel(), 4});
  scores_sel.mutable_data<T>({keep_nms.numel(), 1});

  return std::make_pair(proposals, scores_sel);
}
template <>
void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
  auto score_tensor = param.float_score.get();
  fpga::PerformBypass(param.score_arg);
  fpga::fpga_invalidate(score_tensor->data<float>(),
                        score_tensor->numel() * sizeof(float));

  auto bbox_tensor = param.float_bbox.get();
  fpga::PerformBypass(param.bbox_arg);
  fpga::fpga_invalidate(bbox_tensor->data<float>(),
                        bbox_tensor->numel() * sizeof(float));

  auto *scores = param.float_score.get();
  auto *bbox_deltas = param.float_bbox.get();
  auto *im_info = param.im_info_;
  auto anchors = *param.anchors_;
  auto variances = *param.variances_;

  auto *rpn_rois = param.rpn_rois_;
  auto *rpn_roi_probs = param.rpn_probs_;

  int pre_nms_top_n = param.pre_nms_topn_;
  int post_nms_top_n = param.post_nms_topn_;
  float nms_thresh = param.nms_thresh_;
  float min_size = param.min_size_;
  float eta = param.eta_;

  auto &scores_dim = scores->dims();
  int64_t num = scores_dim[0];
  int64_t c_score = scores_dim[1];
  int64_t h_score = scores_dim[2];
  int64_t w_score = scores_dim[3];

  auto &bbox_dim = bbox_deltas->dims();
  int64_t c_bbox = bbox_dim[1];
  int64_t h_bbox = bbox_dim[2];
  int64_t w_bbox = bbox_dim[3];

  //
  Tensor bbox_deltas_swap, scores_swap;
  bbox_deltas_swap.mutable_data<float>({num, h_bbox, w_bbox, c_bbox});
  scores_swap.mutable_data<float>({num, h_score, w_score, c_score});

  framework::LoD lod;
  lod.resize(1);
  auto &lod0 = lod[0];
  lod0.push_back(0);
  anchors.Resize({anchors.numel() / 4, 4});

  int64_t num_proposals = 0;
  for (int64_t i = 0; i < num; ++i) {
    Tensor im_info_slice = im_info->Slice(i, i + 1);
    Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
    Tensor scores_slice = scores_swap.Slice(i, i + 1);

    bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
    scores_slice.Resize({h_score * w_score * c_score, 1});

    std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage<float>(
        im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice,
        pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
    Tensor &proposals = tensor_pair.first;
    Tensor &scores = tensor_pair.second;

    AppendProposals(rpn_rois, 4 * num_proposals, proposals);
    AppendProposals(rpn_roi_probs, num_proposals, scores);
    num_proposals += proposals.dims()[0];
    lod0.push_back(num_proposals);
  }
  rpn_rois->set_lod(lod);
  rpn_roi_probs->set_lod(lod);
  rpn_rois->Resize({num_proposals, 4});
  rpn_roi_probs->Resize({num_proposals, 1});
}
}  // namespace operators
}  // namespace paddle_mobile

#endif  // PROPOSAL_OP
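A quick way to sanity-check the box decoding that BoxCoder applies to each anchor/delta pair is to run the same formulas on one box. The standalone sketch below is illustrative only and is not part of this commit; the anchor and delta values are made up, and it follows the no-variance branch of the kernel above.

#include <algorithm>
#include <cmath>
#include <cstdio>

// Decode a single box the same way BoxCoder does (no variances).
int main() {
  const double kBBoxClipDefault = std::log(1000.0 / 16.0);
  // anchor: [xmin, ymin, xmax, ymax], delta: [dx, dy, dw, dh]
  float anchor[4] = {0.f, 0.f, 15.f, 15.f};
  float delta[4] = {0.1f, 0.2f, 0.3f, -0.1f};

  float w = anchor[2] - anchor[0] + 1.0f;  // anchor width  = 16
  float h = anchor[3] - anchor[1] + 1.0f;  // anchor height = 16
  float cx = anchor[0] + 0.5f * w;         // anchor center x
  float cy = anchor[1] + 0.5f * h;         // anchor center y

  float pred_cx = delta[0] * w + cx;
  float pred_cy = delta[1] * h + cy;
  float pred_w = std::exp(std::min<double>(delta[2], kBBoxClipDefault)) * w;
  float pred_h = std::exp(std::min<double>(delta[3], kBBoxClipDefault)) * h;

  // Same corner convention as the kernel:
  // [cx - w/2, cy - h/2, cx + w/2 - 1, cy + h/2 - 1]
  std::printf("proposal: %.3f %.3f %.3f %.3f\n", pred_cx - pred_w / 2,
              pred_cy - pred_h / 2, pred_cx + pred_w / 2 - 1,
              pred_cy + pred_h / 2 - 1);
  return 0;
}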
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PSROI_POOL_OP
#include <cmath>
#include <vector>
#include "operators/kernel/detection_kernel.h"
namespace paddle_mobile {
namespace operators {

template <>
bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
  auto dims = param->input_x_->dims();
  PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0,
                        "data not aligned");

  param->float_input = std::make_shared<Tensor>();
  param->float_input->mutable_data<float>(param->input_x_->dims());
  param->float_output = std::make_shared<Tensor>();
  param->float_output->mutable_data<float>(param->output_->dims());

  auto input = param->input_x_;
  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
  args.input_layout_type = fpga::LAYOUT_HWC;
  args.output_layout_type = fpga::LAYOUT_HWC;
  args.input_data_type = fpga::DATA_TYPE_FP16;
  args.output_data_type = fpga::DATA_TYPE_FP32;
  args.image.address = input->data<half>();
  args.image.height = (uint32_t)input->dims()[2];
  args.image.width = (uint32_t)input->dims()[3];
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = param->float_input->mutable_data<float>();
  args.output.scale_address = param->float_input->scale;
  param->input_arg = args;

  fpga::format_fp16_ofm(param->output_);

  input = param->float_output.get();
  args.input_data_type = fpga::DATA_TYPE_FP32;
  args.output_data_type = fpga::DATA_TYPE_FP16;
  args.image.address = input->data<float>();
  args.image.height = (uint32_t)input->dims()[2];
  args.image.width = (uint32_t)input->dims()[3];
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = param->output_->mutable_data<half>();
  args.output.scale_address = param->output_->scale;
  param->output_arg = args;

  return true;
}
template <>
void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
  auto input_tensor = param.float_input.get();
  fpga::PerformBypass(param.input_arg);
  fpga::fpga_invalidate(input_tensor->data<float>(),
                        input_tensor->numel() * sizeof(float));

  auto* in = input_tensor;
  auto* rois = param.input_rois_;
  auto* out = param.float_output.get();

  auto pooled_height = param.pooled_height_;
  auto pooled_width = param.pooled_width_;
  auto spatial_scale = param.spatial_scale_;
  auto output_channels = param.output_channels_;

  auto in_dims = in->dims();
  int batch_size = in_dims[0];
  int input_channels = in_dims[1];
  int height = in_dims[2];
  int width = in_dims[3];
  int rois_num = rois->dims()[0];

  // TODO auto in_stride = framework::stride(in_dims);
  // TODO auto out_stride = framework::stride(out->dims());
  auto in_stride =
      framework::stride({batch_size, height, width, input_channels});
  auto out_stride = framework::stride(
      {out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]});

  const float* input_data = in->data<float>();
  framework::Tensor rois_batch_id_list;
  rois_batch_id_list.Resize({rois_num});
  auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();

  return;

  PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty");

  auto rois_lod = rois->lod().back();
  int rois_batch_size = rois_lod.size() - 1;
  PADDLE_MOBILE_ENFORCE(
      rois_batch_size == batch_size,
      "the rois_batch_size and input(X) batch_size should be the same.");
  int rois_num_with_lod = rois_lod[rois_batch_size];
  PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num,
                        "the rois_num from input and lod must be the same");
  PADDLE_MOBILE_ENFORCE(
      input_channels == output_channels * pooled_height * pooled_width,
      "the channels of input X should equal the product of "
      "output_channels x pooled_height x pooled_width");

  // calculate batch id index for each roi according to LoD
  for (int n = 0; n < rois_batch_size; ++n) {
    for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
      rois_batch_id_data[i] = n;
    }
  }
  auto output_data = out->mutable_data<float>();
  auto input_rois = rois->data<float>();

  // calculate psroipooling, parallel processing can be implemented per ROI
  for (int n = 0; n < rois_num; ++n) {
    // set roi batch id
    int roi_batch_id = rois_batch_id_data[n];

    // [start, end) interval for spatial sampling
    auto offset_input_rois = input_rois + n * 4;
    auto roi_start_w =
        static_cast<float>(round(offset_input_rois[0])) * spatial_scale;
    auto roi_start_h =
        static_cast<float>(round(offset_input_rois[1])) * spatial_scale;
    auto roi_end_w =
        static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale;
    auto roi_end_h =
        static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale;

    // Force too small rois to be 1 x 1
    auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f);  // avoid 0
    auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f);

    // Compute bin size w and h at input feature map
    auto bin_size_h = roi_height / static_cast<float>(pooled_height);
    auto bin_size_w = roi_width / static_cast<float>(pooled_width);
    DLOG << 3;

    // calculate each pixel of the output feature map.
    int out_roi_offset = n * out_stride[0];
    for (int c = 0; c < output_channels; ++c) {
      // per category
      // int out_plane_offset = out_roi_offset + c * out_stride[1];
      int out_plane_offset = out_roi_offset + c;
      for (int ph = 0; ph < pooled_height; ++ph) {
        // TODO int out_row_offset = out_plane_offset + ph * out_stride[2];
        int out_row_offset = out_plane_offset + ph * out_stride[1];
        for (int pw = 0; pw < pooled_width; ++pw) {
          // calculate w and h at input feature map
          int hstart =
              floor(static_cast<float>(ph) * bin_size_h + roi_start_h);
          int wstart =
              floor(static_cast<float>(pw) * bin_size_w + roi_start_w);
          int hend =
              ceil(static_cast<float>(ph + 1) * bin_size_h + roi_start_h);
          int wend =
              ceil(static_cast<float>(pw + 1) * bin_size_w + roi_start_w);
          // Add roi offsets and clip to input boundaries
          hstart = std::min(std::max(hstart, 0), height);
          wstart = std::min(std::max(wstart, 0), width);
          hend = std::min(std::max(hend, 0), height);
          wend = std::min(std::max(wend, 0), width);

          // TODO int output_index = out_row_offset + pw;
          int output_index = out_row_offset + pw * output_channels;
          int input_channel = (c * pooled_height + ph) * pooled_width + pw;
          // TODO int input_plane_offset =
          // TODO roi_batch_id * in_stride[0] + input_channel * in_stride[1];
          int input_plane_offset =
              roi_batch_id * in_stride[0] + input_channel;
          auto offset_input_data = input_data + input_plane_offset;
          float out_sum = 0.;
          bool is_empty = (hend <= hstart) || (wend <= wstart);
          for (int ih = hstart; ih < hend; ++ih) {
            for (int iw = wstart; iw < wend; ++iw) {
              int input_index = ih * in_stride[1] + iw * input_channel;
              out_sum += offset_input_data[input_index];
            }
          }
          float bin_area = (hend - hstart) * (wend - wstart);
          output_data[output_index] = is_empty ? 0. : out_sum / bin_area;
        }
      }
    }
  }
  fpga::format_image(out);
  fpga::PerformBypass(param.output_arg);
}
}  // namespace operators
}  // namespace paddle_mobile

#endif  // PSROI_POOL_OP
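For reference, the bin boundaries that the PSRoI pooling loop computes for one ROI can be reproduced in isolation. The standalone sketch below is not part of the commit; the ROI coordinates, spatial_scale, pooled sizes, and feature-map size are made-up example values.

#include <algorithm>
#include <cmath>
#include <cstdio>

// Reproduce the bin arithmetic for a single output bin of one ROI.
int main() {
  float spatial_scale = 1.0f / 16.0f;
  int pooled_height = 7, pooled_width = 7;
  int height = 38, width = 50;  // input feature-map size

  // ROI corners in image coordinates: [xmin, ymin, xmax, ymax].
  float roi[4] = {64.f, 32.f, 479.f, 287.f};
  float roi_start_w = std::round(roi[0]) * spatial_scale;
  float roi_start_h = std::round(roi[1]) * spatial_scale;
  float roi_end_w = (std::round(roi[2]) + 1.f) * spatial_scale;
  float roi_end_h = (std::round(roi[3]) + 1.f) * spatial_scale;

  // Force too-small ROIs to at least 0.1 on each side, then split into bins.
  float bin_h = std::max(roi_end_h - roi_start_h, 0.1f) / pooled_height;
  float bin_w = std::max(roi_end_w - roi_start_w, 0.1f) / pooled_width;

  // Boundaries of output bin (ph, pw), clipped to the feature map.
  int ph = 3, pw = 2;
  int hstart = std::min(
      std::max(static_cast<int>(std::floor(ph * bin_h + roi_start_h)), 0),
      height);
  int wstart = std::min(
      std::max(static_cast<int>(std::floor(pw * bin_w + roi_start_w)), 0),
      width);
  int hend = std::min(
      std::max(static_cast<int>(std::ceil((ph + 1) * bin_h + roi_start_h)), 0),
      height);
  int wend = std::min(
      std::max(static_cast<int>(std::ceil((pw + 1) * bin_w + roi_start_w)), 0),
      width);
  std::printf("bin (%d,%d) covers rows [%d,%d) cols [%d,%d)\n", ph, pw, hstart,
              hend, wstart, wend);
  return 0;
}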
src/operators/kernel/fpga/V1/reshape2_kernel.cpp
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESHAPE2_OP
#include "operators/kernel/reshape2_kernel.h"
#include "framework/ddim.h"
namespace paddle_mobile {
namespace operators {

template <>
bool Reshape2Kernel<FPGA, float>::Init(Reshape2Param<FPGA> *param) {
  auto input = const_cast<LoDTensor *>(param->InputX());
  auto output = param->Out();
  auto shape = param->Shape();

  auto num_in = framework::product(input->dims());
  auto num_shape = framework::product(framework::make_ddim(shape));
  PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported");

  for (int i = 0; i < shape.size(); i++) {
    if (shape[i] == -1) {
      shape[i] = static_cast<int>(-num_in / num_shape);
      break;
    }
  }
  output->Resize(framework::make_ddim(shape));
  output->set_type(input->type());
  fpga::format_ofm(output);
  DLOG << "input: " << input;
  DLOG << "output: " << output;

  return true;
}
void reshape(LoDTensor *input, LoDTensor *output) {
  // Subscript r means after reshape
  // TODO zhangyang verify this function

  float *input_ptr_f, *output_ptr_f;
  half *input_ptr_h, *output_ptr_h;
  bool is_float = false;

  if (input->type() == typeid(float)) {
    input_ptr_f = input->data<float>();
    output_ptr_f = output->data<float>();
    is_float = true;
  } else {
    input_ptr_h = input->data<half>();
    output_ptr_h = output->data<half>();
  }

  auto C = static_cast<int>(input->dims()[1]);
  auto H = static_cast<int>(input->dims()[2]);
  auto W = static_cast<int>(input->dims()[3]);
  auto Cr = static_cast<int>(output->dims()[1]);
  auto Hr = static_cast<int>(output->dims()[2]);
  auto Wr = static_cast<int>(output->dims()[3]);
  PADDLE_MOBILE_ENFORCE(C * H * W == Cr * Hr * Wr, "Dims don't match");
  auto WC = W * C;
  auto WC_align = fpga::align_to_x(WC, IMAGE_ALIGNMENT);
  auto HW = H * W;
  auto WCr = Wr * Cr;
  auto WCr_align = fpga::align_to_x(WCr, IMAGE_ALIGNMENT);
  auto HWr = Hr * Wr;

  int offset_align = 0;
  int offset_r = 0, offset_align_r = 0;
  int cr = 0, hr = 0, wr = 0;

  for (int h = 0; h < H; h++) {
    int offset0 = h * WC_align;
    for (int w = 0; w < W; w++) {
      int offset1 = w * C + offset0;
      for (int c = 0; c < C; c++) {
        offset_align = offset1 + c;
        offset_r = c * HW + h * W + c;
        cr = offset_r / HWr;
        hr = offset_r % HWr / Wr;
        wr = offset_r % Wr;
        offset_align_r = hr * WCr_align + wr * Cr + cr;
        // DLOG << "hwc" << h << " " << w << " " << c;
        // DLOG << "hrwrcr" << hr << " " << wr << " " << cr;
        if (is_float) {
          output_ptr_f[offset_align_r] = input_ptr_f[offset_align];
        } else {
          output_ptr_h[offset_align_r] = input_ptr_h[offset_align];
        }
      }
    }
  }
}
template <>
void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
  auto input = const_cast<LoDTensor *>(param.InputX());
  auto output = param.Out();
  auto shape = param.Shape();

  auto num_in = framework::product(input->dims());
  auto num_shape = framework::product(framework::make_ddim(shape));
  PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported");

  for (int i = 0; i < shape.size(); i++) {
    if (shape[i] == -1) {
      shape[i] = static_cast<int>(-num_in / num_shape);
      break;
    }
  }
  output->Resize(framework::make_ddim(shape));
  if (output->dims() == input->dims()) {
    DLOG << "No need to reshape";
    return;
  }

  reshape(input, output);
  //
}
}  // namespace operators
}  // namespace paddle_mobile

#endif
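The offset arithmetic in reshape() relies on each HWC row being padded up to the FPGA alignment boundary via fpga::align_to_x. The minimal standalone sketch below is not part of the commit and assumes IMAGE_ALIGNMENT is 16, as defined in the FPGA V1 headers; it only illustrates how the padded row stride changes element offsets.

#include <cstdio>

// Round num up to the next multiple of x (same idea as fpga::align_to_x).
static int align_to_x(int num, int x) { return (num + x - 1) / x * x; }

int main() {
  const int IMAGE_ALIGNMENT = 16;  // assumption for this sketch
  int W = 7, C = 9;                // one feature-map row holds W * C = 63 values
  int WC_align = align_to_x(W * C, IMAGE_ALIGNMENT);  // padded to 64
  std::printf("row stride without padding: %d, with padding: %d\n", W * C,
              WC_align);

  // Element (h, w, c) in aligned HWC layout lives at h * WC_align + w * C + c.
  int h = 2, w = 3, c = 4;
  std::printf("offset of (h=2, w=3, c=4): %d\n", h * WC_align + w * C + c);
  return 0;
}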
src/operators/kernel/fpga/V1/sigmoid_kernel.cpp
...
...
@@ -25,7 +25,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
       paddle_mobile::fpga::SIGMOID;
   int16_t leaky_relu_negative_slope = 0;
   auto input = const_cast<Tensor *>(param->InputX());
-  auto input_ptr = input->data<float>();
+  auto input_ptr = input->data<half>();
   auto out = param->Out();
   fpga::format_fp16_ofm(out);
...
...
@@ -38,7 +38,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
   args.image.width =
       (input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
   args.image.channels = (uint32_t)input->dims()[1];
-  args.output.address = out->data<float>();
+  args.output.address = out->data<half>();
   args.output.scale_address = out->scale;
   args.output.activation.activation_type = activation_enable;
   args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope;
...
...
src/operators/kernel/fpga/V1/slice_kernel.cpp
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SLICE_OP
#include "operators/kernel/slice_kernel.h"
namespace paddle_mobile {
namespace operators {

template <>
bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
  auto output = param->output_;
  fpga::format_fp16_ofm(output);
  DLOG << "input: " << param->input_;
  DLOG << "output: " << param->output_;
  if (param->input_->type() != typeid(half)) {
    DLOG << "wrong type";
  }
  return true;
}
template <>
void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
  // Only support slicing in channel dimension
  auto input = param.input_;
  DLOG << input;
  int HW = input->dims()[2] * input->dims()[3];
  int channel = input->dims()[1];
  auto input_ptr = input->data<half>();
  auto output_ptr = param.output_->data<half>();

  int start = param.starts_[0], end = param.ends_[0];
  start = start < 0 ? start + channel : start;
  end = end < 0 ? end + channel : end;
  start = start > channel ? channel : start;
  end = end > channel ? channel : end;
  int len = end - start;
  for (int i = 0; i < HW; i++) {
    memcpy(output_ptr + len * i, input_ptr + i * channel + start,
           len * sizeof(half));
  }
}
}  // namespace operators
}  // namespace paddle_mobile

#endif
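The slice kernel above copies the contiguous channel range [start, end) out of every HWC pixel. The standalone sketch below is not part of the commit; it illustrates the same copy pattern on a tiny 2x2x4 example, using uint16_t to stand in for the FPGA half type, and shows why the memcpy length must be expressed in bytes.

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const int H = 2, W = 2, C = 4;
  const int start = 1, end = 3, len = end - start;

  // Fill an HWC image with 0..15 so the sliced values are easy to check.
  uint16_t input[H * W * C];
  for (int i = 0; i < H * W * C; i++) input[i] = static_cast<uint16_t>(i);

  uint16_t output[H * W * len];
  for (int i = 0; i < H * W; i++) {
    // Each pixel keeps only channels [start, end); memcpy counts bytes.
    std::memcpy(output + len * i, input + i * C + start,
                len * sizeof(uint16_t));
  }
  for (int i = 0; i < H * W * len; i++) std::printf("%u ", output[i]);
  std::printf("\n");
  return 0;
}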
src/operators/kernel/fpga/V1/softmax_kernel.cpp
...
...
@@ -23,49 +23,72 @@ namespace operators {
 template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   auto input = const_cast<LoDTensor *>(param->InputX());
-  auto input_ptr = input->data<float>();
+  auto input_ptr = input->data<half>();
   auto out = param->Out();
-  fpga::format_fp32_ofm(out);
-  auto float_input = new Tensor;
-  if (input->dims().size() == 2) {
-    float_input->mutable_data<float>({1, input->dims()[1]});
-  } else if (input->dims().size() == 4) {
-    float_input->mutable_data<float>(
-        {1, input->dims()[2], input->dims()[3], input->dims()[1]});
-  } else {
-    DLOG << "wrong dimension of softmax input";
+  auto float_input = new LoDTensor;
+  PADDLE_MOBILE_ENFORCE(input->dims().size() == 4,
+                        "Softmax should have 4-order input");
+  auto dims = framework::vectorize(input->dims());
+  auto channel = dims[3];
+  if (channel == 1) {  // This input is generated by FC op, dims = [N C 1 1]
+    PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op");
+    dims[3] = dims[1];
+    dims[1] = 1;
+  }
+  input->Resize(framework::make_ddim(dims));
+  float_input->Resize(framework::make_ddim(dims));
+  if (channel != 2) {  // Use CPU
+    float_input->init(typeid(float));
+    fpga::format_fp32_ofm(float_input);
+    fpga::format_fp32_ofm(out);
+    fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+    args.input_layout_type = fpga::LAYOUT_HWC;
+    args.output_layout_type = fpga::LAYOUT_CHW;
+    args.input_data_type = fpga::DATA_TYPE_FP16;
+    args.output_data_type = fpga::DATA_TYPE_FP32;
+    args.image.address = input_ptr;
+    args.image.height = (uint32_t)dims[1];
+    args.image.width = (uint32_t)dims[2];
+    args.image.channels = (uint32_t)dims[3];
+    args.output.address = float_input->data<float>();
+    args.output.scale_address = float_input->scale;
+    param->SetFloatInput(float_input);
+    param->SetFpgaArgs(args);
+  } else {  // Use FPGA
+    fpga::format_fp16_ofm(out);
+    fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+    args.input_layout_type = fpga::LAYOUT_HWC;
+    args.output_layout_type = fpga::LAYOUT_CHW;
+    args.input_data_type = fpga::DATA_TYPE_FP16;
+    args.output_data_type = fpga::DATA_TYPE_FP16;
+    args.image.address = input_ptr;
+    args.image.height = (uint32_t)input->dims()[1];
+    args.image.width = (uint32_t)input->dims()[2];
+    args.image.channels = (uint32_t)input->dims()[3];
+    args.output.address = out->data<half>();
+    args.output.scale_address = out->scale;
+    args.output.activation.activation_type = fpga::SOFTMAX;
+    param->SetFpgaArgs(args);
   }
-  fpga::format_fp32_ofm(float_input);
-  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
-  args.input_layout_type = fpga::LAYOUT_HWC;
-  args.output_layout_type = fpga::LAYOUT_CHW;
-  args.input_data_type = fpga::DATA_TYPE_FP16;
-  args.output_data_type = fpga::DATA_TYPE_FP32;
-  args.image.address = input_ptr;
-  args.image.height =
-      (input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1;
-  args.image.width =
-      (input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
-  args.image.channels = (uint32_t)input->dims()[1];
-  args.output.address = float_input->data<float>();
-  args.output.scale_address = float_input->scale;
-  param->SetFloatInput(float_input);
-  param->SetFpgaArgs(args);
   return true;
 }

 template <>
 void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
-  Tensor *in_x = param.FloatInput();
-  Tensor *out = param.Out();
   fpga::PerformBypass(param.FpgaArgs());
-  fpga::fpga_invalidate((void *)in_x->data<float>(),  // NOLINT
-                        in_x->numel() * sizeof(float));
-  // TODO: In general case, 0 should be squeezed before softmax input // NOLINT
-  math::SoftmaxFuntor<CPU, float>()(in_x, out);
-  fpga::fpga_flush(out->data<float>(), out->memory_size());
+  if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
+    Tensor *out = param.Out();
+    Tensor *in_x = param.FloatInput();
+    fpga::fpga_invalidate(in_x->data<float>(), in_x->numel() * sizeof(float));
+    math::SoftmaxFuntor<CPU, float>()(in_x, out);
+    fpga::fpga_flush(out->data<float>(), out->memory_size());
+  }
 }
 }  // namespace operators
...
...
src/operators/kernel/fpga/V1/split_kernel.cpp
...
...
@@ -20,7 +20,7 @@ namespace paddle_mobile {
 namespace operators {
 template <>
 bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
-  auto *in = const_cast<Tensor *>(param->InputX());
+  auto *in = const_cast<LoDTensor *>(param->InputX());
   auto outs = param->Outs();
   auto sections = param->Sections();
   int axis = param->Axis();
...
...
@@ -34,22 +34,32 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
       fpga::fpga_malloc(image_num * sizeof(float *)));
   auto out_channels = reinterpret_cast<uint32_t *>(
       fpga::fpga_malloc(image_num * sizeof(uint32_t)));
+  DLOG << "input: " << in;
   for (int i = 0; i < image_num; i++) {
     fpga::format_fp16_ofm(outs[i]);
-    images_out[i] = outs[i]->mutable_data<float>();
+    DLOG << "output: " << outs[i];
+    images_out[i] = outs[i]->mutable_data<half>();
     scales_out[i] = outs[i]->scale;
     out_channels[i] = (uint32_t)sections[i];
   }
+  auto deleter = [](void *p) { fpga::fpga_free(p); };
   fpga::SplitArgs arg = {0};
   arg.image_num = image_num;
-  arg.image_in = (half *)in->data<float>();
+  arg.image_in = in->data<half>();
   arg.scale_in = in->scale;
   arg.images_out = images_out;
   arg.scales_out = scales_out;
   arg.out_channel_nums = out_channels;
   arg.height = (uint32_t)in->dims()[2];
   arg.width = (uint32_t)in->dims()[3];
+  arg.vector_split_space.push_back(std::shared_ptr<char>(
+      reinterpret_cast<char *>(images_out), deleter));
+  arg.vector_split_space.push_back(std::shared_ptr<char>(
+      reinterpret_cast<char *>(scales_out), deleter));
+  arg.vector_split_space.push_back(std::shared_ptr<char>(
+      reinterpret_cast<char *>(out_channels), deleter));
   param->SetFpgaArgs(arg);
   return true;
...
...
src/operators/kernel/fpga/V1/tanh_kernel.cpp
...
...
@@ -21,9 +21,11 @@ namespace operators {
 template <>
 bool TanhKernel<FPGA, float>::Init(TanhParam<FPGA> *param) {
-  auto input = const_cast<Tensor *>(param->InputX());
-  auto input_ptr = input->data<float>();
-  auto float_input = new Tensor;
+  auto input = const_cast<LoDTensor *>(param->InputX());
+  DLOG << "input: " << input;
+  auto input_ptr = input->data<half>();
+  auto float_input = new LoDTensor;
   float_input->mutable_data<float>(
       {1, input->dims()[1], input->dims()[2], input->dims()[3]});
   fpga::format_fp32_ofm(float_input);
...
...
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
...
...
@@ -20,7 +20,21 @@ namespace operators {
 template <>
 bool Transpose2Kernel<FPGA, float>::Init(Transpose2Param<FPGA> *param) {
-  param->Out()->ShareDataWith(*param->InputX());
+  auto input = param->InputX();
+  auto output = param->Out();
+  auto axis = param->Axis();
+  auto dim = input->dims();
+  output->ShareDataWith(*input);
+  auto dim_v = vectorize(dim);
+  for (int i = 0; i < axis.size(); i++) {
+    dim_v[i] = dim[axis[i]];
+  }
+  output->Resize(framework::make_ddim(dim_v));
+  DLOG << "input: " << input;
+  DLOG << "output: " << output;
   return true;
 }
...
...
src/operators/op_param.h
...
...
@@ -1053,7 +1053,7 @@ class SoftmaxParam : public OpParam {
   GType *FloatInput() const {
     return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
   }
-  void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
+  void SetFloatInput(LoDTensor *input) { float_input_x_.reset(input); }
   const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
   void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
 #endif
...
...
@@ -1212,18 +1212,8 @@ class FetchParam : public OpParam {
   framework::LoDTensorArray *out_;
   int col_;
 #ifdef PADDLE_MOBILE_FPGA
- private:
-  std::shared_ptr<GType> float_input_x_;
-  fpga::BypassArgs fpga_bypass_args;
-
- public:
-  GType *FloatInput() const {
-    return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
-  }
-  void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
   const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
   void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
+  fpga::BypassArgs fpga_bypass_args;
 #endif
 };
...
...
@@ -1660,7 +1650,7 @@ class TanhParam : public OpParam {
   GType *FloatInput() const {
     return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
   }
-  void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
+  void SetFloatInput(LoDTensor *input) { float_input_x_.reset(input); }
   const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
   void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
 #endif
...
...
src/operators/reshape2_op.cpp
...
...
@@ -43,5 +43,8 @@ REGISTER_OPERATOR_CPU(reshape2, ops::Reshape2Op);
 #ifdef PADDLE_MOBILE_MALI_GPU
 REGISTER_OPERATOR_MALI_GPU(reshape2, ops::Reshape2Op);
 #endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(reshape2, ops::Reshape2Op);
+#endif
 #endif
test/CMakeLists.txt
...
...
@@ -74,6 +74,9 @@ if (CON GREATER -1)
     ADD_EXECUTABLE(test-densebox fpga/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h)
     target_link_libraries(test-densebox paddle-mobile)
+    ADD_EXECUTABLE(test-rfcn fpga/test_rfcn.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-rfcn paddle-mobile)
     set(FOUND_MATCH ON)
 endif ()
...
...
test/fpga/test_resnet50.cpp
...
...
@@ -51,8 +51,8 @@ void convert_to_chw(int16_t **data_in, int channel, int height, int width,
   }
 }
-void dump(std::string filename, const Tensor input_tensor) {
-  auto dataptr = input_tensor.data<float>();
+void dump(std::string filename, Tensor input_tensor) {
+  auto dataptr = reinterpret_cast<half *>(input_tensor.get_data());
   std::ofstream out(filename.c_str());
   float result = 0;
   for (int i = 0; i < input_tensor.numel(); ++i) {
...
...
@@ -61,16 +61,16 @@ void dump(std::string filename, const Tensor input_tensor) {
   }
   out.close();
 }
-void dump_stride(std::string filename, const Tensor input_tensor,
-                 const int dumpnum) {
+void dump_stride_half(std::string filename, Tensor input_tensor,
+                      const int dumpnum) {
   int c = (input_tensor.dims())[1];
   int h = (input_tensor.dims())[2];
   int w = (input_tensor.dims())[3];
-  auto data_ptr = input_tensor.data<float>();
-  int16_t *data_tmp = (int16_t *)malloc(c * h * w * sizeof(int16_t));
-  int16_t *data_ptr_16 = (int16_t *)data_ptr;
+  auto data_ptr = input_tensor.get_data();
+  auto *data_tmp =
+      reinterpret_cast<half *>(malloc(c * h * w * sizeof(int16_t)));
+  auto *data_ptr_16 = reinterpret_cast<half *>(data_ptr);
   convert_to_chw(&data_ptr_16, c, h, w, data_tmp);
   // const int16_t *dataptr = input_tensor.data<int16_t>();
   std::ofstream out(filename.c_str());
   float result = 0;
   int stride = input_tensor.numel() / dumpnum;
...
...
@@ -82,6 +82,20 @@ void dump_stride(std::string filename, const Tensor input_tensor,
   out.close();
   free(data_tmp);
 }
+
+void dump_stride_float(std::string filename, Tensor input_tensor,
+                       const int dumpnum) {
+  auto data_ptr = reinterpret_cast<float *>(input_tensor.get_data());
+  std::ofstream out(filename.c_str());
+  float result = 0;
+  int stride = input_tensor.numel() / dumpnum;
+  stride = stride > 0 ? stride : 1;
+  for (int i = 0; i < input_tensor.numel(); i += stride) {
+    result = data_ptr[i];
+    out << result << std::endl;
+  }
+  out.close();
+}
 static const char *g_resnet50 = "../models/resnet50";
 const std::string g_image_src_float = "../images/image_src_float";
 int main() {
...
...
@@ -98,24 +112,21 @@ int main() {
   for (int i = 0; i < 73; i++) {
     auto tensor_ptr = paddle_mobile.FetchResult(i);
     std::string saveName = "resnet50_result_" + std::to_string(i);
-    paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).data<float>(),
+    paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(),
                                          tensor_ptr->numel() * sizeof(half));
-    dump_stride(saveName, (*tensor_ptr), 20);
+    dump_stride_half(saveName, (*tensor_ptr), 20);
     // dump(saveName, (*tensor_ptr));
   }
-  std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(73);
-  //(*output_tensor).dump<float>("resnet50_result_73");
-  output_tensor = paddle_mobile.FetchResult(74);
-  //(*output_tensor).dump<float>("resnet50_result_74");
+  // std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(74);
+  // output_tensor = paddle_mobile.FetchResult(74);
+  auto tensor_ptr = paddle_mobile.FetchResult(73);
+  dump_stride_float("resnet50_result_73", (*tensor_ptr), 20);
+  tensor_ptr = paddle_mobile.FetchResult(74);
+  dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999);
   float max = 0;
-  auto data_ptr = output_tensor->data<float>();
+  auto data_ptr = tensor_ptr->data<float>();
   int maximumIdx = 0;
-  for (int i = 0; i < (*output_tensor).numel(); i++) {
+  for (int i = 0; i < (*tensor_ptr).numel(); i++) {
     if (data_ptr[i] > max) {
       maximumIdx = i;
       max = data_ptr[i];
...
...
test/fpga/test_rfcn.cpp
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
#ifdef PADDLE_MOBILE_FPGA_V1
#include "fpga/V1/api.h"
#endif
#ifdef PADDLE_MOBILE_FPGA_V2
#include "fpga/V2/api.h"
#endif
void readStream(std::string filename, uint8_t *buf) {
  std::ifstream in;
  in.open(filename, std::ios::in);
  if (!in.is_open()) {
    std::cout << "open File Failed." << std::endl;
    return;
  }

  int i = 0;
  while (!in.eof()) {
    in >> buf[i];
    i++;
  }
  in.close();
}

static const char *g_rfcn_combine = "../models/rfcn";
static const char *g_image_src_float = "../models/rfcn/data.bin";
int main() {
  paddle_mobile::fpga::open_device();
  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;

  if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model",
                         std::string(g_rfcn_combine) + "/params", true, false,
                         1, true)) {
    float img_info[3] = {768, 1536, 768.0f / 960.0f};
    auto img = fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float));
    readStream(g_image_src_float, reinterpret_cast<uint8_t *>(img));

    std::vector<void *> v(3, nullptr);
    paddle_mobile.FeedData({img_info, img});
    paddle_mobile.Predict_To(-1);
    paddle_mobile.GetResults(&v);
    DLOG << "Computation done";
    fpga::fpga_free(img);
  }

  return 0;
}
tools/op.cmake
...
...
@@ -126,6 +126,11 @@ if (CON GREATER -1)
     set(RESHAPE_OP ON)
     set(FUSION_CONVADDBNRELU_OP ON)
     set(FUSION_CONVADDBN_OP ON)
+    set(RESHAPE2_OP ON)
+    set(PSROI_POOL_OP ON)
+    set(PROPOSAL_OP ON)
+    set(ANCHOR_GENERATOR_OP ON)
+    set(SLICE_OP ON)
     set(FOUND_MATCH ON)
 endif ()
...
...