Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
16927084
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
16927084
编写于
2月 10, 2019
作者:
Z
zhangyang0701
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
reconstruct code to support RFCN for FPGA track
上级
458183af
变更
34
隐藏空白更改
内联
并排
Showing
34 changed file
with
1036 addition
and
244 deletion
+1036
-244
src/fpga/V1/api.cpp
src/fpga/V1/api.cpp
+29
-19
src/fpga/V1/image.cpp
src/fpga/V1/image.cpp
+16
-15
src/fpga/common/fpga_common.cpp
src/fpga/common/fpga_common.cpp
+1
-1
src/fpga/common/fpga_common.h
src/fpga/common/fpga_common.h
+8
-9
src/framework/executor.cpp
src/framework/executor.cpp
+26
-61
src/framework/executor.h
src/framework/executor.h
+2
-3
src/framework/operator.cpp
src/framework/operator.cpp
+16
-9
src/framework/operator.h
src/framework/operator.h
+21
-1
src/framework/tensor.h
src/framework/tensor.h
+6
-1
src/io/api_paddle_mobile.cc
src/io/api_paddle_mobile.cc
+17
-0
src/io/api_paddle_mobile.h
src/io/api_paddle_mobile.h
+3
-0
src/io/paddle_inference_api.h
src/io/paddle_inference_api.h
+3
-0
src/io/paddle_mobile.cpp
src/io/paddle_mobile.cpp
+6
-2
src/io/paddle_mobile.h
src/io/paddle_mobile.h
+2
-1
src/operators/kernel/detection_kernel.h
src/operators/kernel/detection_kernel.h
+8
-0
src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp
src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp
+35
-4
src/operators/kernel/fpga/V1/concat_kernel.cpp
src/operators/kernel/fpga/V1/concat_kernel.cpp
+2
-2
src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp
src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp
+3
-3
src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp
src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp
+3
-3
src/operators/kernel/fpga/V1/feed_kernel.cpp
src/operators/kernel/fpga/V1/feed_kernel.cpp
+24
-6
src/operators/kernel/fpga/V1/fetch_kernel.cpp
src/operators/kernel/fpga/V1/fetch_kernel.cpp
+31
-21
src/operators/kernel/fpga/V1/pool_kernel.cpp
src/operators/kernel/fpga/V1/pool_kernel.cpp
+2
-2
src/operators/kernel/fpga/V1/proposal_kernel.cpp
src/operators/kernel/fpga/V1/proposal_kernel.cpp
+403
-1
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
+171
-3
src/operators/kernel/fpga/V1/reshape2_kernel.cpp
src/operators/kernel/fpga/V1/reshape2_kernel.cpp
+44
-1
src/operators/kernel/fpga/V1/sigmoid_kernel.cpp
src/operators/kernel/fpga/V1/sigmoid_kernel.cpp
+2
-2
src/operators/kernel/fpga/V1/slice_kernel.cpp
src/operators/kernel/fpga/V1/slice_kernel.cpp
+28
-1
src/operators/kernel/fpga/V1/softmax_kernel.cpp
src/operators/kernel/fpga/V1/softmax_kernel.cpp
+56
-33
src/operators/kernel/fpga/V1/split_kernel.cpp
src/operators/kernel/fpga/V1/split_kernel.cpp
+4
-2
src/operators/kernel/fpga/V1/tanh_kernel.cpp
src/operators/kernel/fpga/V1/tanh_kernel.cpp
+3
-1
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
+15
-1
src/operators/op_param.h
src/operators/op_param.h
+12
-10
test/fpga/test_resnet50.cpp
test/fpga/test_resnet50.cpp
+6
-7
test/fpga/test_rfcn.cpp
test/fpga/test_rfcn.cpp
+28
-19
未找到文件。
src/fpga/V1/api.cpp
浏览文件 @
16927084
...
@@ -28,11 +28,13 @@ void format_image(framework::Tensor *image_tensor) {
...
@@ -28,11 +28,13 @@ void format_image(framework::Tensor *image_tensor) {
auto
dims
=
image_tensor
->
dims
();
auto
dims
=
image_tensor
->
dims
();
auto
channel
=
dims
[
1
],
height
=
dims
[
2
],
width
=
dims
[
3
];
auto
channel
=
dims
[
1
],
height
=
dims
[
2
],
width
=
dims
[
3
];
auto
data_ptr
=
image_tensor
->
data
<
float
>
();
auto
data_ptr
=
image_tensor
->
data
<
float
>
();
size_t
memory_size
=
channel
*
height
*
width
*
sizeof
(
float
);
auto
external_ptr
=
reinterpret_cast
<
float
*>
(
image_tensor
->
external_data
);
auto
new_data
=
(
float
*
)
fpga_malloc
(
memory_size
);
// NOLINT
float
*
p_data
=
external_ptr
==
nullptr
?
data_ptr
:
external_ptr
;
fpga_copy
(
new_data
,
data_ptr
,
memory_size
);
float
*
old_p
=
p_data
;
image
::
format_image
(
&
new_data
,
channel
,
height
,
width
);
image
::
format_image
(
&
p_data
,
channel
,
height
,
width
);
image_tensor
->
reset_data_ptr
(
new_data
);
if
(
old_p
!=
p_data
)
{
image_tensor
->
reset_data_ptr
(
p_data
);
}
}
}
void
format_fp16_ofm
(
framework
::
Tensor
*
ofm_tensor
)
{
void
format_fp16_ofm
(
framework
::
Tensor
*
ofm_tensor
)
{
...
@@ -50,6 +52,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
...
@@ -50,6 +52,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
auto
p
=
fpga_malloc
(
memory_size
);
auto
p
=
fpga_malloc
(
memory_size
);
memset
(
p
,
0
,
memory_size
);
memset
(
p
,
0
,
memory_size
);
ofm_tensor
->
reset_data_ptr
(
p
);
ofm_tensor
->
reset_data_ptr
(
p
);
ofm_tensor
->
set_type
(
typeid
(
half
));
}
}
void
format_fp16_ofm
(
framework
::
Tensor
*
ofm_tensor
,
framework
::
DDim
dims
)
{
void
format_fp16_ofm
(
framework
::
Tensor
*
ofm_tensor
,
framework
::
DDim
dims
)
{
...
@@ -67,6 +70,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
...
@@ -67,6 +70,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
auto
p
=
fpga_malloc
(
memory_size
);
auto
p
=
fpga_malloc
(
memory_size
);
memset
(
p
,
0
,
memory_size
);
memset
(
p
,
0
,
memory_size
);
ofm_tensor
->
reset_data_ptr
(
p
);
ofm_tensor
->
reset_data_ptr
(
p
);
ofm_tensor
->
set_type
(
typeid
(
half
));
}
}
void
format_fp32_ofm
(
framework
::
Tensor
*
ofm_tensor
)
{
void
format_fp32_ofm
(
framework
::
Tensor
*
ofm_tensor
)
{
auto
dims
=
ofm_tensor
->
dims
();
auto
dims
=
ofm_tensor
->
dims
();
...
@@ -83,6 +87,7 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
...
@@ -83,6 +87,7 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
auto
p
=
fpga_malloc
(
memory_size
);
auto
p
=
fpga_malloc
(
memory_size
);
memset
(
p
,
0
,
memory_size
);
memset
(
p
,
0
,
memory_size
);
ofm_tensor
->
reset_data_ptr
(
p
);
ofm_tensor
->
reset_data_ptr
(
p
);
ofm_tensor
->
set_type
(
typeid
(
float
));
}
}
float
filter_find_max
(
framework
::
Tensor
*
filter_tensor
)
{
float
filter_find_max
(
framework
::
Tensor
*
filter_tensor
)
{
...
@@ -139,6 +144,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
...
@@ -139,6 +144,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
filter
::
format_filter
(
&
new_data
,
num
,
channel
,
height
,
width
,
group_num
,
filter
::
format_filter
(
&
new_data
,
num
,
channel
,
height
,
width
,
group_num
,
max_value
);
max_value
);
filter_tensor
->
reset_data_ptr
(
new_data
);
filter_tensor
->
reset_data_ptr
(
new_data
);
filter_tensor
->
set_type
(
typeid
(
int8_t
));
}
}
void
format_dwconv_filter
(
framework
::
Tensor
*
filter_tensor
,
float
*
scale_ptr
)
{
void
format_dwconv_filter
(
framework
::
Tensor
*
filter_tensor
,
float
*
scale_ptr
)
{
auto
dims
=
filter_tensor
->
dims
();
auto
dims
=
filter_tensor
->
dims
();
...
@@ -149,6 +155,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
...
@@ -149,6 +155,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
fpga_copy
(
new_data
,
data_ptr
,
memory_size
);
fpga_copy
(
new_data
,
data_ptr
,
memory_size
);
filter
::
format_dwconv_filter
(
&
new_data
,
num
,
height
,
width
,
scale_ptr
);
filter
::
format_dwconv_filter
(
&
new_data
,
num
,
height
,
width
,
scale_ptr
);
filter_tensor
->
reset_data_ptr
(
new_data
);
filter_tensor
->
reset_data_ptr
(
new_data
);
filter_tensor
->
set_type
(
typeid
(
int8_t
));
}
}
void
format_DWDconv_filter
(
framework
::
Tensor
*
filter_tensor
,
float
*
scale_ptr
,
void
format_DWDconv_filter
(
framework
::
Tensor
*
filter_tensor
,
float
*
scale_ptr
,
...
@@ -173,6 +180,7 @@ void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
...
@@ -173,6 +180,7 @@ void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
// framework::make_ddim({num, 1, height, width});
// framework::make_ddim({num, 1, height, width});
// filter_tensor->Resize(dims_new);
// filter_tensor->Resize(dims_new);
filter_tensor
->
reset_data_ptr
(
new_data
);
filter_tensor
->
reset_data_ptr
(
new_data
);
filter_tensor
->
set_type
(
typeid
(
int8_t
));
}
}
void
format_fc_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
)
{
void
format_fc_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
)
{
...
@@ -187,6 +195,7 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
...
@@ -187,6 +195,7 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
filter
::
format_fc_filter
(
&
new_data
,
num
,
channel
,
height
,
width
,
1
,
filter
::
format_fc_filter
(
&
new_data
,
num
,
channel
,
height
,
width
,
1
,
max_value
);
max_value
);
filter_tensor
->
reset_data_ptr
(
new_data
);
filter_tensor
->
reset_data_ptr
(
new_data
);
filter_tensor
->
set_type
(
typeid
(
int8_t
));
}
}
void
format_deconv_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
,
void
format_deconv_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
,
int
group_num
,
int
stride
)
{
int
group_num
,
int
stride
)
{
...
@@ -213,6 +222,7 @@ void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
...
@@ -213,6 +222,7 @@ void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
framework
::
make_ddim
({
num
,
channel
,
height
,
width
});
framework
::
make_ddim
({
num
,
channel
,
height
,
width
});
filter_tensor
->
Resize
(
dims_new
);
filter_tensor
->
Resize
(
dims_new
);
filter_tensor
->
reset_data_ptr
(
new_data
);
filter_tensor
->
reset_data_ptr
(
new_data
);
filter_tensor
->
set_type
(
typeid
(
int8_t
));
}
}
void
format_bias_scale_array
(
float
**
bias_scale_array
,
void
format_bias_scale_array
(
float
**
bias_scale_array
,
...
@@ -236,6 +246,7 @@ void format_concat_output(framework::Tensor *out, int height, int width,
...
@@ -236,6 +246,7 @@ void format_concat_output(framework::Tensor *out, int height, int width,
auto
ddim
=
framework
::
make_ddim
({
1
,
sum_channel
,
height
,
width
});
auto
ddim
=
framework
::
make_ddim
({
1
,
sum_channel
,
height
,
width
});
out
->
Resize
(
ddim
);
out
->
Resize
(
ddim
);
out
->
reset_data_ptr
(
data_ptr
);
out
->
reset_data_ptr
(
data_ptr
);
out
->
set_type
(
typeid
(
half
));
}
}
void
format_conv_data
(
framework
::
Tensor
*
filter_tensor
,
void
format_conv_data
(
framework
::
Tensor
*
filter_tensor
,
framework
::
Tensor
*
ofm_tensor
,
float
**
bs_ptr
,
framework
::
Tensor
*
ofm_tensor
,
float
**
bs_ptr
,
...
@@ -447,9 +458,9 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
...
@@ -447,9 +458,9 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
int16_t
leaky_relu_negative_slope
,
int
group_num
,
int16_t
leaky_relu_negative_slope
,
int
group_num
,
int
stride_h
,
int
stride_w
,
int
padding_h
,
int
padding_w
,
int
stride_h
,
int
stride_w
,
int
padding_h
,
int
padding_w
,
float
*
bs_ptr
)
{
float
*
bs_ptr
)
{
auto
input_ptr
=
input
->
data
<
float
>
();
auto
input_ptr
=
input
->
data
<
half
>
();
auto
filter_ptr
=
filter
->
data
<
floa
t
>
();
auto
filter_ptr
=
filter
->
data
<
int8_
t
>
();
auto
out_ptr
=
out
->
data
<
float
>
();
auto
out_ptr
=
out
->
data
<
half
>
();
auto
deleter
=
[](
void
*
p
)
{
fpga_free
(
p
);
};
auto
deleter
=
[](
void
*
p
)
{
fpga_free
(
p
);
};
arg
->
group_num
=
(
uint32_t
)
group_num
;
arg
->
group_num
=
(
uint32_t
)
group_num
;
...
@@ -571,8 +582,8 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
...
@@ -571,8 +582,8 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
int16_t
leaky_relu_negative_slope
,
int
group_num
,
int16_t
leaky_relu_negative_slope
,
int
group_num
,
int
stride_h
,
int
stride_w
,
int
padding_h
,
int
padding_w
,
int
stride_h
,
int
stride_w
,
int
padding_h
,
int
padding_w
,
float
*
bs_ptr
)
{
float
*
bs_ptr
)
{
auto
input_ptr
=
input
->
data
<
float
>
();
auto
input_ptr
=
input
->
data
<
half
>
();
auto
filter_ptr
=
filter
->
data
<
floa
t
>
();
auto
filter_ptr
=
filter
->
data
<
int8_
t
>
();
auto
deleter
=
[](
void
*
p
)
{
fpga_free
(
p
);
};
auto
deleter
=
[](
void
*
p
)
{
fpga_free
(
p
);
};
arg
->
group_num
=
(
uint32_t
)
group_num
;
arg
->
group_num
=
(
uint32_t
)
group_num
;
...
@@ -603,7 +614,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
...
@@ -603,7 +614,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
framework
::
DDim
dims_out_new
=
framework
::
make_ddim
(
framework
::
DDim
dims_out_new
=
framework
::
make_ddim
(
{
1
,
arg
->
filter_num
,
sub_output_height
*
sub_conv_num
,
real_out_width
});
{
1
,
arg
->
filter_num
,
sub_output_height
*
sub_conv_num
,
real_out_width
});
fpga
::
format_fp16_ofm
(
out
,
dims_out_new
);
fpga
::
format_fp16_ofm
(
out
,
dims_out_new
);
auto
out_ptr
=
out
->
data
<
float
>
();
auto
out_ptr
=
out
->
data
<
half
>
();
arg
->
output
.
address
=
arg
->
output
.
address
=
(
half
*
)
out_ptr
+
// NOLINT
(
half
*
)
out_ptr
+
// NOLINT
omit_size
*
sizeof
(
half
)
*
omit_size
*
sizeof
(
half
)
*
...
@@ -793,7 +804,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
...
@@ -793,7 +804,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
output
.
scale_address
),
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
output
.
scale_address
),
deleter
));
deleter
));
}
}
arg
->
split_conv_args
[
i
]
->
concat_arg
.
images_in
[
j
]
=
static_cast
<
int16_t
*>
(
arg
->
split_conv_args
[
i
]
->
concat_arg
.
images_in
[
j
]
=
static_cast
<
half
*>
(
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
output
.
address
);
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
output
.
address
);
arg
->
split_conv_args
[
i
]
->
concat_arg
.
scales_in
[
j
]
=
arg
->
split_conv_args
[
i
]
->
concat_arg
.
scales_in
[
j
]
=
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
output
.
scale_address
;
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
output
.
scale_address
;
...
@@ -818,9 +829,9 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
...
@@ -818,9 +829,9 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
int16_t
leaky_relu_negative_slope
,
int
stride_h
,
int16_t
leaky_relu_negative_slope
,
int
stride_h
,
int
stride_w
,
int
padding_h
,
int
padding_w
,
int
stride_w
,
int
padding_h
,
int
padding_w
,
float
*
bias_ptr
)
{
float
*
bias_ptr
)
{
auto
filter_ptr
=
filter
->
data
<
floa
t
>
();
auto
filter_ptr
=
filter
->
data
<
uint8_
t
>
();
auto
input_ptr
=
input
->
data
<
float
>
();
auto
input_ptr
=
input
->
data
<
half
>
();
auto
output_ptr
=
out
->
mutable_data
<
float
>
();
auto
output_ptr
=
out
->
mutable_data
<
half
>
();
arg
->
sub_conv_num
=
1
;
arg
->
sub_conv_num
=
1
;
// arg->relu_enabled = relu_enabled;
// arg->relu_enabled = relu_enabled;
arg
->
output
.
activation
.
activation_type
=
activation_enable
;
arg
->
output
.
activation
.
activation_type
=
activation_enable
;
...
@@ -848,9 +859,8 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
...
@@ -848,9 +859,8 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
int16_t
leaky_relu_negative_slope
,
int
stride_h
,
int16_t
leaky_relu_negative_slope
,
int
stride_h
,
int
stride_w
,
int
padding_h
,
int
padding_w
,
int
stride_w
,
int
padding_h
,
int
padding_w
,
float
*
bias_ptr
)
{
float
*
bias_ptr
)
{
auto
filter_ptr
=
filter
->
data
<
float
>
();
auto
filter_ptr
=
filter
->
data
<
int8_t
>
();
auto
input_ptr
=
input
->
data
<
float
>
();
auto
input_ptr
=
input
->
data
<
half
>
();
auto
output_ptr
=
out
->
mutable_data
<
float
>
();
auto
deleter
=
[](
void
*
p
)
{
fpga_free
(
p
);
};
auto
deleter
=
[](
void
*
p
)
{
fpga_free
(
p
);
};
...
@@ -885,7 +895,7 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
...
@@ -885,7 +895,7 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
framework
::
DDim
dims_out_new
=
framework
::
make_ddim
(
framework
::
DDim
dims_out_new
=
framework
::
make_ddim
(
{
1
,
arg
->
filter_num
,
real_out_height
,
real_out_width
});
{
1
,
arg
->
filter_num
,
real_out_height
,
real_out_width
});
fpga
::
format_fp16_ofm
(
out
,
dims_out_new
);
fpga
::
format_fp16_ofm
(
out
,
dims_out_new
);
auto
out_ptr
=
out
->
data
<
float
>
();
auto
out_ptr
=
out
->
data
<
half
>
();
/*====For Addition
/*====For Addition
arg->output.address =
arg->output.address =
...
...
src/fpga/V1/image.cpp
浏览文件 @
16927084
...
@@ -22,7 +22,6 @@ namespace fpga {
...
@@ -22,7 +22,6 @@ namespace fpga {
namespace
image
{
namespace
image
{
void
convert_to_hwc
(
float
**
data_in
,
int
channel
,
int
height
,
int
width
)
{
void
convert_to_hwc
(
float
**
data_in
,
int
channel
,
int
height
,
int
width
)
{
float
*
tmp
=
*
data_in
;
float
*
data_tmp
=
float
*
data_tmp
=
(
float
*
)
fpga_malloc
(
channel
*
height
*
width
*
sizeof
(
float
));
// NOLINT
(
float
*
)
fpga_malloc
(
channel
*
height
*
width
*
sizeof
(
float
));
// NOLINT
int64_t
amount_per_row
=
width
*
channel
;
int64_t
amount_per_row
=
width
*
channel
;
...
@@ -35,33 +34,35 @@ void convert_to_hwc(float **data_in, int channel, int height, int width) {
...
@@ -35,33 +34,35 @@ void convert_to_hwc(float **data_in, int channel, int height, int width) {
}
}
}
}
*
data_in
=
data_tmp
;
*
data_in
=
data_tmp
;
fpga_free
(
tmp
);
}
}
void
align_element_conv
(
float
**
data_in
,
int
height
,
int
cw
)
{
void
align_element_conv
(
float
**
data_in
,
int
height
,
int
cw
)
{
int
h
=
0
;
int
h
=
0
;
int
align_cw
=
align_to_x
(
cw
,
IMAGE_ALIGNMENT
);
int
align_cw
=
align_to_x
(
cw
,
IMAGE_ALIGNMENT
);
if
(
align_cw
!=
cw
)
{
float
*
tmp
=
*
data_in
;
float
*
data_tmp
=
(
float
*
)
fpga_malloc
(
height
*
align_cw
*
sizeof
(
float
));
// NOLINT
memset
(
data_tmp
,
0
,
height
*
align_cw
*
sizeof
(
float
));
float
*
data_tmp
=
(
float
*
)
fpga_malloc
(
height
*
align_cw
*
sizeof
(
float
));
// NOLINT
for
(
h
=
0
;
h
<
height
;
h
++
)
{
memset
(
data_tmp
,
0
,
height
*
align_cw
*
sizeof
(
float
));
memcpy
((
void
*
)(
data_tmp
+
h
*
align_cw
),
// NOLINT
(
void
*
)(
*
data_in
+
h
*
cw
),
// NOLINT
cw
*
sizeof
(
float
));
}
*
data_in
=
data_tmp
;
for
(
h
=
0
;
h
<
height
;
h
++
)
{
fpga_free
(
tmp
);
memcpy
((
void
*
)(
data_tmp
+
h
*
align_cw
),
// NOLINT
(
void
*
)(
*
data_in
+
h
*
cw
),
// NOLINT
cw
*
sizeof
(
float
));
}
}
*
data_in
=
data_tmp
;
}
}
void
format_image
(
float
**
data_in
,
int
channel
,
int
height
,
int
width
)
{
void
format_image
(
float
**
data_in
,
int
channel
,
int
height
,
int
width
)
{
convert_to_hwc
(
data_in
,
channel
,
height
,
width
);
convert_to_hwc
(
data_in
,
channel
,
height
,
width
);
align_element_conv
(
data_in
,
height
,
channel
*
width
);
int
cw
=
channel
*
width
;
int
align_cw
=
align_to_x
(
cw
,
IMAGE_ALIGNMENT
);
if
(
align_cw
!=
cw
)
{
float
*
hwc_temp
=
*
data_in
;
align_element_conv
(
data_in
,
height
,
channel
*
width
);
fpga_free
(
hwc_temp
);
}
fpga_flush
(
*
data_in
,
align_to_x
(
channel
*
width
,
IMAGE_ALIGNMENT
)
*
height
*
fpga_flush
(
*
data_in
,
align_to_x
(
channel
*
width
,
IMAGE_ALIGNMENT
)
*
height
*
sizeof
(
float
));
sizeof
(
float
));
}
}
...
...
src/fpga/common/fpga_common.cpp
浏览文件 @
16927084
...
@@ -164,7 +164,7 @@ void fpga_free(void *ptr) {
...
@@ -164,7 +164,7 @@ void fpga_free(void *ptr) {
// DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
// DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
// << counter << " bytes";
// << counter << " bytes";
}
else
{
}
else
{
DLOG
<<
"Invalid pointer"
;
DLOG
<<
"
Address: "
<<
ptr
<<
"
Invalid pointer"
;
}
}
}
}
void
fpga_copy
(
void
*
dest
,
const
void
*
src
,
size_t
num
)
{
void
fpga_copy
(
void
*
dest
,
const
void
*
src
,
size_t
num
)
{
...
...
src/fpga/common/fpga_common.h
浏览文件 @
16927084
...
@@ -19,17 +19,16 @@ limitations under the License. */
...
@@ -19,17 +19,16 @@ limitations under the License. */
#include <memory>
#include <memory>
#include <vector>
#include <vector>
namespace
paddle_mobile
{
namespace
fpga
{
#ifdef PADDLE_MOBILE_FPGA_V1
#ifdef PADDLE_MOBILE_FPGA_V1
#define IMAGE_ALIGNMENT
16
// Aligned to 16
#define IMAGE_ALIGNMENT
(16)
// Aligned to 16
#define FILTER_NUM_ALIGNMENT
32
// Filter number aligned to 32
#define FILTER_NUM_ALIGNMENT
(32)
// Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT
16
// Filter element number aligned to 16
#define FILTER_ELEMENT_ALIGNMENT
(16)
// Filter element number aligned to 16
#define BS_NUM_ALIGNMENT
8
#define BS_NUM_ALIGNMENT
(8)
#define BIAS_NUM_ALIGNMENT
16
#define BIAS_NUM_ALIGNMENT
(16)
#endif
#endif
namespace
paddle_mobile
{
namespace
fpga
{
enum
DataType
{
enum
DataType
{
DATA_TYPE_FP32
=
1
,
DATA_TYPE_FP32
=
1
,
DATA_TYPE_FP16
=
0
,
DATA_TYPE_FP16
=
0
,
...
@@ -49,7 +48,7 @@ enum ActivationType {
...
@@ -49,7 +48,7 @@ enum ActivationType {
};
};
struct
ActivationArgs
{
struct
ActivationArgs
{
enum
ActivationType
activation_type
;
enum
ActivationType
activation_type
=
NONE
;
int16_t
leaky_relu_negative_slope
;
int16_t
leaky_relu_negative_slope
;
};
};
...
...
src/framework/executor.cpp
浏览文件 @
16927084
...
@@ -84,6 +84,11 @@ Executor<Device, T>::Executor(const Program<Device> &program,
...
@@ -84,6 +84,11 @@ Executor<Device, T>::Executor(const Program<Device> &program,
InitMemory
();
InitMemory
();
}
}
#ifdef PADDLE_MOBILE_FPGA
program_
.
scope
->
EraseVars
({
"feed"
,
"fetch"
});
program_
.
scope
->
print_vars
();
#endif
int
count
=
0
;
int
count
=
0
;
for
(
int
block_id
=
0
;
block_id
<
ops_of_block_
.
size
();
++
block_id
)
{
for
(
int
block_id
=
0
;
block_id
<
ops_of_block_
.
size
();
++
block_id
)
{
for
(
auto
&
op_handler
:
ops_of_block_
[
block_id
])
{
for
(
auto
&
op_handler
:
ops_of_block_
[
block_id
])
{
...
@@ -92,14 +97,6 @@ Executor<Device, T>::Executor(const Program<Device> &program,
...
@@ -92,14 +97,6 @@ Executor<Device, T>::Executor(const Program<Device> &program,
ops_list_
.
push_back
(
op_handler
);
ops_list_
.
push_back
(
op_handler
);
}
}
}
}
#ifdef PADDLE_MOBILE_FPGA
TalorFeedOp
();
DLOG
<<
"TalorFeed finished"
;
TalorFetchdOp
();
DLOG
<<
"TalorFetch finished"
;
program_
.
scope
->
print_vars
();
#endif
}
}
template
<
typename
T
>
template
<
typename
T
>
...
@@ -451,49 +448,6 @@ std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
...
@@ -451,49 +448,6 @@ std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
}
}
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
template
<
typename
Device
,
typename
T
>
void
Executor
<
Device
,
T
>::
TalorFeedOp
()
{
auto
&
ops
=
ops_of_block_
[
0
];
int
num
=
0
;
program_
.
scope
->
EraseVars
(
std
::
vector
<
string
>
{
string
(
"feed"
)});
for
(
auto
op
:
ops
)
{
if
(
op
->
Type
()
==
"feed"
)
{
auto
new_name
=
string
(
"feed"
)
+
std
::
to_string
(
num
++
);
auto
var
=
program_
.
scope
->
Var
(
new_name
);
auto
tensor
=
var
->
template
GetMutable
<
LoDTensor
>();
auto
output_map
=
op
->
Outputs
();
std
::
vector
<
std
::
string
>
out_keys
=
op
->
GetOutKeys
();
PADDLE_MOBILE_ENFORCE
(
!
out_keys
.
empty
(),
"this op contains no output"
);
auto
output_tensor
=
GetVarValue
<
LoDTensor
>
(
out_keys
[
0
],
output_map
,
*
(
program_
.
scope
));
tensor
->
Resize
(
output_tensor
->
dims
());
tensor
->
init
(
typeid
(
float
));
op
->
ChangeNameMap
(
"X"
,
std
::
vector
<
string
>
{
new_name
});
}
}
}
template
<
typename
Device
,
typename
T
>
void
Executor
<
Device
,
T
>::
TalorFetchdOp
()
{
auto
&
ops
=
ops_of_block_
[
0
];
int
num
=
0
;
program_
.
scope
->
EraseVars
(
std
::
vector
<
string
>
{
string
(
"fetch"
)});
for
(
auto
op
:
ops
)
{
if
(
op
->
Type
()
==
"fetch"
)
{
auto
new_name
=
string
(
"fetch"
)
+
std
::
to_string
(
num
++
);
auto
var
=
program_
.
scope
->
Var
(
new_name
);
auto
tensor
=
var
->
template
GetMutable
<
LoDTensor
>();
auto
input_map
=
op
->
Inputs
();
std
::
vector
<
std
::
string
>
in_keys
=
op
->
GetInputKeys
();
PADDLE_MOBILE_ENFORCE
(
!
in_keys
.
empty
(),
"this op contains no input"
);
auto
input_tensor
=
GetVarValue
<
LoDTensor
>
(
in_keys
[
0
],
input_map
,
*
(
program_
.
scope
));
tensor
->
Resize
(
input_tensor
->
dims
());
tensor
->
init
(
typeid
(
float
));
op
->
ChangeNameMap
(
"Out"
,
std
::
vector
<
string
>
{
new_name
});
}
}
}
template
<
typename
Device
,
typename
T
>
template
<
typename
Device
,
typename
T
>
void
Executor
<
Device
,
T
>::
InjectVariable
(
const
Tensor
&
t
,
void
Executor
<
Device
,
T
>::
InjectVariable
(
const
Tensor
&
t
,
std
::
string
var_name
)
{
std
::
string
var_name
)
{
...
@@ -509,18 +463,29 @@ void Executor<Device, T>::FeedData(const Tensor &t) {
...
@@ -509,18 +463,29 @@ void Executor<Device, T>::FeedData(const Tensor &t) {
}
}
template
<
typename
Device
,
typename
T
>
template
<
typename
Device
,
typename
T
>
void
Executor
<
Device
,
T
>::
FeedData
(
const
std
::
vector
<
Tensor
>
&
v
)
{
void
Executor
<
Device
,
T
>::
FeedData
(
const
std
::
vector
<
void
*
>
&
v
)
{
auto
input_size
=
v
.
size
();
auto
input_size
=
v
.
size
();
PADDLE_MOBILE_ENFORCE
(
input_size
>
0
,
"Empty input"
);
int
counter
=
0
;
auto
vars
=
program_
.
scope
->
VarContain
(
"feed"
);
auto
vars
=
program_
.
scope
->
VarContain
(
"feed"
);
for
(
auto
var
:
vars
)
{
PADDLE_MOBILE_ENFORCE
(
input_size
==
vars
.
size
(),
Tensor
*
feed_tensor
=
var
->
template
GetMutable
<
LoDTensor
>();
"input data number not correct"
);
feed_tensor
->
Resize
(
v
[
counter
].
dims
());
for
(
int
i
=
0
;
i
<
input_size
;
i
++
)
{
feed_tensor
->
ShareDataWith
(
v
[
counter
]);
auto
var
=
program_
.
scope
->
Var
(
"feed"
,
i
);
if
(
++
counter
>
v
.
size
())
{
auto
feed_tensor
=
var
->
template
GetMutable
<
LoDTensor
>();
return
;
feed_tensor
->
external_data
=
v
[
i
];
}
}
}
template
<
typename
Device
,
typename
T
>
void
Executor
<
Device
,
T
>::
GetResults
(
std
::
vector
<
void
*>
*
v
)
{
auto
output_size
=
v
->
size
();
PADDLE_MOBILE_ENFORCE
(
output_size
>
0
,
"Empty output"
);
auto
vars
=
program_
.
scope
->
VarContain
(
"fetch"
);
PADDLE_MOBILE_ENFORCE
(
output_size
==
vars
.
size
(),
"output data number not correct"
);
for
(
int
i
=
0
;
i
<
output_size
;
i
++
)
{
auto
var
=
program_
.
scope
->
Var
(
"fetch"
,
i
);
auto
fetch_tensor
=
var
->
template
GetMutable
<
LoDTensor
>();
(
*
v
)[
i
]
=
fetch_tensor
->
template
data
<
float
>();
}
}
}
}
...
...
src/framework/executor.h
浏览文件 @
16927084
...
@@ -50,11 +50,10 @@ class Executor {
...
@@ -50,11 +50,10 @@ class Executor {
std
::
shared_ptr
<
LoDTensor
>
GetOutput
(
const
std
::
string
&
var_name
);
std
::
shared_ptr
<
LoDTensor
>
GetOutput
(
const
std
::
string
&
var_name
);
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
void
TalorFeedOp
();
void
TalorFetchdOp
();
void
InjectVariable
(
const
Tensor
&
t
,
std
::
string
var_name
);
void
InjectVariable
(
const
Tensor
&
t
,
std
::
string
var_name
);
void
FeedData
(
const
Tensor
&
t
);
void
FeedData
(
const
Tensor
&
t
);
void
FeedData
(
const
std
::
vector
<
Tensor
>
&
v
);
void
FeedData
(
const
std
::
vector
<
void
*>
&
v
);
void
GetResults
(
std
::
vector
<
void
*>
*
v
);
std
::
shared_ptr
<
Tensor
>
FetchResult
(
int
id
=
-
1
);
std
::
shared_ptr
<
Tensor
>
FetchResult
(
int
id
=
-
1
);
void
Predict_From_To
(
int
start
=
0
,
int
end
=
-
1
);
void
Predict_From_To
(
int
start
=
0
,
int
end
=
-
1
);
void
Predict_From
(
int
start
);
void
Predict_From
(
int
start
);
...
...
src/framework/operator.cpp
浏览文件 @
16927084
...
@@ -50,6 +50,9 @@ OperatorBase<Dtype>::OperatorBase(const std::string &type,
...
@@ -50,6 +50,9 @@ OperatorBase<Dtype>::OperatorBase(const std::string &type,
attrs_
(
attrs
),
attrs_
(
attrs
),
scope_
(
scope
)
{
scope_
(
scope
)
{
CheckAllInputOutputSet
();
CheckAllInputOutputSet
();
#ifdef PADDLE_MOBILE_FPGA
InsertTensors
();
#endif
}
}
template
<
typename
Dtype
>
template
<
typename
Dtype
>
...
@@ -133,15 +136,19 @@ void OperatorBase<GPU_CL>::Run() {
...
@@ -133,15 +136,19 @@ void OperatorBase<GPU_CL>::Run() {
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
template
<
typename
Dtype
>
template
<
typename
Dtype
>
void
OperatorBase
<
Dtype
>::
ChangeNameMap
(
string
key
,
std
::
vector
<
string
>
value
)
{
void
OperatorBase
<
Dtype
>::
InsertTensors
()
{
auto
it
=
inputs_
.
find
(
key
);
static
int
feed_num
=
0
;
if
(
it
!=
inputs_
.
end
())
{
static
int
fetch_num
=
0
;
inputs_
[
key
]
=
value
;
if
(
type_
==
"feed"
)
{
return
;
auto
new_name
=
string
(
"feed"
)
+
std
::
to_string
(
feed_num
++
);
}
auto
var
=
scope_
->
Var
(
new_name
);
it
=
outputs_
.
find
(
key
);
var
->
template
GetMutable
<
framework
::
LoDTensor
>();
if
(
it
!=
outputs_
.
end
())
{
inputs_
.
at
(
"X"
)
=
{
string
(
new_name
)};
inputs_
[
key
]
=
value
;
}
else
if
(
type_
==
"fetch"
)
{
auto
new_name
=
string
(
"fetch"
)
+
std
::
to_string
(
fetch_num
++
);
auto
var
=
scope_
->
Var
(
new_name
);
var
->
template
GetMutable
<
framework
::
LoDTensor
>();
outputs_
.
at
(
"Out"
)
=
{
string
(
new_name
)};
}
}
}
}
#endif
#endif
...
...
src/framework/operator.h
浏览文件 @
16927084
...
@@ -79,6 +79,7 @@ class OperatorBase {
...
@@ -79,6 +79,7 @@ class OperatorBase {
}
}
}
}
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
void
InsertTensors
();
void
ChangeNameMap
(
string
key
,
std
::
vector
<
string
>
value
);
void
ChangeNameMap
(
string
key
,
std
::
vector
<
string
>
value
);
#endif
#endif
protected:
protected:
...
@@ -95,6 +96,7 @@ class OperatorBase {
...
@@ -95,6 +96,7 @@ class OperatorBase {
template
<
typename
Dtype
,
typename
ParamType
,
typename
KernelType
>
template
<
typename
Dtype
,
typename
ParamType
,
typename
KernelType
>
class
OperatorWithKernel
:
public
OperatorBase
<
Dtype
>
{
class
OperatorWithKernel
:
public
OperatorBase
<
Dtype
>
{
public:
public:
#ifndef PADDLE_MOBILE_FPGA1
OperatorWithKernel
(
const
std
::
string
&
type
,
const
VariableNameMap
&
inputs
,
OperatorWithKernel
(
const
std
::
string
&
type
,
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
const
AttributeMap
&
attrs
,
const
VariableNameMap
&
outputs
,
const
AttributeMap
&
attrs
,
std
::
shared_ptr
<
Scope
>
scope
)
std
::
shared_ptr
<
Scope
>
scope
)
...
@@ -104,7 +106,25 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
...
@@ -104,7 +106,25 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
kernel_
.
InitCLHelper
(
scope
->
GetCLScpoe
());
kernel_
.
InitCLHelper
(
scope
->
GetCLScpoe
());
#endif
#endif
}
}
#else
OperatorWithKernel
(
const
std
::
string
&
type
,
const
VariableNameMap
inputs
,
const
VariableNameMap
&
outputs
,
const
AttributeMap
&
attrs
,
std
::
shared_ptr
<
Scope
>
scope
)
:
OperatorBase
<
Dtype
>
(
type
,
inputs
,
outputs
,
attrs
,
scope
)
{
static
int
feed_num
=
0
;
static
int
fetch_num
=
0
;
if
(
type
==
"feed"
)
{
auto
new_name
=
string
(
"feed"
)
+
std
::
to_string
(
feed_num
++
);
auto
var
=
scope
->
Var
(
new_name
);
(
const_cast
<
VariableNameMap
&>
(
inputs
)).
at
(
"X"
)
=
{
string
(
new_name
)};
}
else
if
(
type
==
"fetch"
)
{
auto
new_name
=
string
(
"fetch"
)
+
std
::
to_string
(
fetch_num
++
);
auto
var
=
scope
->
Var
(
new_name
);
(
const_cast
<
VariableNameMap
&>
(
outputs
)).
at
(
"Out"
)
=
{
string
(
new_name
)};
}
param_
=
ParamType
(
inputs
,
outputs
,
attrs
,
*
scope
);
}
#endif
virtual
void
RunImpl
()
{
this
->
kernel_
.
Compute
(
this
->
param_
);
}
virtual
void
RunImpl
()
{
this
->
kernel_
.
Compute
(
this
->
param_
);
}
virtual
void
InferShape
()
const
=
0
;
virtual
void
InferShape
()
const
=
0
;
...
...
src/framework/tensor.h
浏览文件 @
16927084
...
@@ -202,6 +202,10 @@ class Tensor : public TensorBase {
...
@@ -202,6 +202,10 @@ class Tensor : public TensorBase {
inline
void
reset_data_ptr
(
void
*
p
)
{
inline
void
reset_data_ptr
(
void
*
p
)
{
((
PlaceholderImpl
*
)(
holder_
.
get
()))
->
ptr_
.
reset
((
uint8_t
*
)
p
);
// NOLINT
((
PlaceholderImpl
*
)(
holder_
.
get
()))
->
ptr_
.
reset
((
uint8_t
*
)
p
);
// NOLINT
}
}
inline
void
set_type
(
std
::
type_index
type
)
{
holder_
->
set_type
(
type
);
}
// Returns the raw pointer currently stored in the holder's ptr_, without
// applying offset_. The holder is dereferenced unconditionally.
inline void *get_data() {
  auto *impl = (PlaceholderImpl *)(holder_.get());  // NOLINT
  return (void *)(impl->ptr_.get());                // NOLINT
}
inline
void
*
init
(
std
::
type_index
type
)
{
inline
void
*
init
(
std
::
type_index
type
)
{
if
(
holder_
!=
nullptr
)
{
if
(
holder_
!=
nullptr
)
{
...
@@ -217,7 +221,8 @@ class Tensor : public TensorBase {
...
@@ -217,7 +221,8 @@ class Tensor : public TensorBase {
reinterpret_cast
<
uintptr_t
>
(
holder_
->
ptr
())
+
offset_
);
reinterpret_cast
<
uintptr_t
>
(
holder_
->
ptr
())
+
offset_
);
}
}
float
scale
[
2
];
// scale[0]= MAX/127.0, scale[1]= 127.0/MAX
float
scale
[
2
];
// scale[0]= MAX/127.0, scale[1]= 127.0/MAX
void
*
external_data
=
nullptr
;
// only used for Feed
#endif
#endif
};
};
...
...
src/io/api_paddle_mobile.cc
浏览文件 @
16927084
...
@@ -177,6 +177,23 @@ bool PaddleMobilePredictor<Device, T>::Run(
...
@@ -177,6 +177,23 @@ bool PaddleMobilePredictor<Device, T>::Run(
return
true
;
return
true
;
}
}
// Hands the raw input pointers straight to the wrapped PaddleMobile engine.
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::FeedData(
    const std::vector<void *> &inputs) {
  this->paddle_mobile_->FeedData(inputs);
}
// Asks the wrapped PaddleMobile engine to fill `outputs` with its result
// pointers.
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::GetResults(
    std::vector<void *> *outputs) {
  this->paddle_mobile_->GetResults(outputs);
}
// Forwards the [start, end] operator range to the wrapped PaddleMobile engine.
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::Predict_From_To(int start, int end) {
  this->paddle_mobile_->Predict_From_To(start, end);
}
#endif
#endif
template
<
typename
Device
,
typename
T
>
template
<
typename
Device
,
typename
T
>
PaddleMobilePredictor
<
Device
,
T
>::~
PaddleMobilePredictor
()
{
PaddleMobilePredictor
<
Device
,
T
>::~
PaddleMobilePredictor
()
{
...
...
src/io/api_paddle_mobile.h
浏览文件 @
16927084
...
@@ -35,6 +35,9 @@ class PaddleMobilePredictor : public PaddlePredictor {
...
@@ -35,6 +35,9 @@ class PaddleMobilePredictor : public PaddlePredictor {
bool
Run
(
const
std
::
vector
<
PaddleTensor
>&
inputs
,
bool
Run
(
const
std
::
vector
<
PaddleTensor
>&
inputs
,
std
::
vector
<
PaddleTensor
>*
output_data
,
std
::
vector
<
int
>*
index_data
,
std
::
vector
<
PaddleTensor
>*
output_data
,
std
::
vector
<
int
>*
index_data
,
int
batch_size
=
-
1
)
override
;
int
batch_size
=
-
1
)
override
;
void
FeedData
(
const
std
::
vector
<
void
*>&
inputs
)
override
;
void
GetResults
(
std
::
vector
<
void
*>*
outputs
)
override
;
void
Predict_From_To
(
int
start
=
0
,
int
end
=
-
1
)
override
;
#endif
#endif
~
PaddleMobilePredictor
()
override
;
~
PaddleMobilePredictor
()
override
;
...
...
src/io/paddle_inference_api.h
浏览文件 @
16927084
...
@@ -119,6 +119,9 @@ class PaddlePredictor {
...
@@ -119,6 +119,9 @@ class PaddlePredictor {
virtual
bool
Run
(
const
std
::
vector
<
PaddleTensor
>&
inputs
,
virtual
bool
Run
(
const
std
::
vector
<
PaddleTensor
>&
inputs
,
std
::
vector
<
PaddleTensor
>*
output_data
,
std
::
vector
<
PaddleTensor
>*
output_data
,
std
::
vector
<
int
>*
index_data
,
int
batch_size
=
-
1
)
=
0
;
std
::
vector
<
int
>*
index_data
,
int
batch_size
=
-
1
)
=
0
;
virtual
void
FeedData
(
const
std
::
vector
<
void
*>&
inputs
)
=
0
;
virtual
void
GetResults
(
std
::
vector
<
void
*>*
outputs
)
=
0
;
virtual
void
Predict_From_To
(
int
start
=
0
,
int
end
=
-
1
)
=
0
;
#endif
#endif
protected:
protected:
...
...
src/io/paddle_mobile.cpp
浏览文件 @
16927084
...
@@ -228,10 +228,14 @@ void PaddleMobile<Device, T>::FeedData(const framework::Tensor &t) {
...
@@ -228,10 +228,14 @@ void PaddleMobile<Device, T>::FeedData(const framework::Tensor &t) {
executor_
->
FeedData
(
t
);
executor_
->
FeedData
(
t
);
}
}
template
<
typename
Device
,
typename
T
>
template
<
typename
Device
,
typename
T
>
void
PaddleMobile
<
Device
,
T
>::
FeedData
(
void
PaddleMobile
<
Device
,
T
>::
FeedData
(
const
std
::
vector
<
void
*>
&
v
)
{
const
std
::
vector
<
framework
::
Tensor
>
&
v
)
{
executor_
->
FeedData
(
v
);
executor_
->
FeedData
(
v
);
};
};
// Delegates to the executor, which fills `v` with the result pointers.
template <typename Device, typename T>
void PaddleMobile<Device, T>::GetResults(std::vector<void *> *v) {
  this->executor_->GetResults(v);
}
template
<
typename
Device
,
typename
T
>
template
<
typename
Device
,
typename
T
>
std
::
shared_ptr
<
framework
::
Tensor
>
PaddleMobile
<
Device
,
T
>::
FetchResult
(
std
::
shared_ptr
<
framework
::
Tensor
>
PaddleMobile
<
Device
,
T
>::
FetchResult
(
int
id
)
{
int
id
)
{
...
...
src/io/paddle_mobile.h
浏览文件 @
16927084
...
@@ -90,7 +90,8 @@ class PaddleMobile {
...
@@ -90,7 +90,8 @@ class PaddleMobile {
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
void
InjectVariable
(
const
framework
::
Tensor
&
t
,
std
::
string
var_name
);
void
InjectVariable
(
const
framework
::
Tensor
&
t
,
std
::
string
var_name
);
void
FeedData
(
const
framework
::
Tensor
&
t
);
void
FeedData
(
const
framework
::
Tensor
&
t
);
void
FeedData
(
const
std
::
vector
<
framework
::
Tensor
>
&
v
);
void
FeedData
(
const
std
::
vector
<
void
*>
&
v
);
void
GetResults
(
std
::
vector
<
void
*>
*
v
);
std
::
shared_ptr
<
framework
::
Tensor
>
FetchResult
(
int
id
=
-
1
);
std
::
shared_ptr
<
framework
::
Tensor
>
FetchResult
(
int
id
=
-
1
);
void
Predict_From_To
(
int
start
=
0
,
int
end
=
-
1
);
void
Predict_From_To
(
int
start
=
0
,
int
end
=
-
1
);
void
Predict_From
(
int
start
);
void
Predict_From
(
int
start
);
...
...
src/operators/kernel/detection_kernel.h
浏览文件 @
16927084
...
@@ -103,6 +103,10 @@ class ProposalParam : public OpParam {
...
@@ -103,6 +103,10 @@ class ProposalParam : public OpParam {
float
nms_thresh_
;
float
nms_thresh_
;
float
min_size_
;
float
min_size_
;
float
eta_
;
float
eta_
;
#ifdef PADDLE_MOBILE_FPGA
std
::
shared_ptr
<
Tensor
>
float_score
,
float_bbox
;
fpga
::
BypassArgs
score_arg
,
bbox_arg
;
#endif
};
};
DECLARE_KERNEL
(
Proposal
,
ProposalParam
);
DECLARE_KERNEL
(
Proposal
,
ProposalParam
);
...
@@ -133,6 +137,10 @@ class PSRoiPoolParam : public OpParam {
...
@@ -133,6 +137,10 @@ class PSRoiPoolParam : public OpParam {
int
pooled_height_
;
int
pooled_height_
;
int
pooled_width_
;
int
pooled_width_
;
float
spatial_scale_
;
float
spatial_scale_
;
#ifdef PADDLE_MOBILE_FPGA
std
::
shared_ptr
<
Tensor
>
float_input
,
float_output
;
fpga
::
BypassArgs
input_arg
,
output_arg
;
#endif
};
};
DECLARE_KERNEL
(
PSRoiPool
,
PSRoiPoolParam
);
DECLARE_KERNEL
(
PSRoiPool
,
PSRoiPoolParam
);
...
...
src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp
浏览文件 @
16927084
...
@@ -23,15 +23,46 @@ namespace operators {
...
@@ -23,15 +23,46 @@ namespace operators {
template
<
>
template
<
>
bool
AnchorGeneratorKernel
<
FPGA
,
float
>::
Init
(
bool
AnchorGeneratorKernel
<
FPGA
,
float
>::
Init
(
AnchorGeneratorParam
<
FPGA
>
*
param
)
{
AnchorGeneratorParam
<
FPGA
>
*
param
)
{
// TODO zhangyang
// Pre-computes every anchor box once, at init time.
auto input = param->input_;
auto anchors = param->output_anchors_;
auto anchor_ptr = anchors->mutable_data<float>();
auto stride = param->stride_;
auto feature_width = input->dims()[3], feature_height = input->dims()[2];
auto stride_width = stride[0], stride_height = stride[1];

// Hard-coded base anchors, four values each: entries 0/2 are shifted by the
// horizontal stride and entries 1/3 by the vertical stride below.
int anchors_offset[] = {-2,   -2,   18,  18,  -10,  -9,   26,  25,  -23,
                        -20,  39,   36,  -43, -34,  59,   49,  -63, -54,
                        79,   69,   -96, -77, 112,  93,   -137, -118, 153,
                        134,  -204, -188, 220, 204, -281, -395, 296, 441};
int num_anchors = sizeof(anchors_offset) / (sizeof(int) * 4);

//  DLOG << "feature_height: " << feature_height;
//  DLOG << "feature_width: " << feature_width;
//  DLOG << "num_anchors: " << num_anchors;
//  DLOG << "stride_width: " << stride_width;
//  DLOG << "stride_height: " << stride_height;

// Output is written as [feature_height][feature_width][num_anchors][4].
// NOTE(review): assumes output_anchors_ holds
// feature_height * feature_width * num_anchors * 4 floats - confirm.
for (int h_idx = 0; h_idx < feature_height; ++h_idx) {
  for (int w_idx = 0; w_idx < feature_width; ++w_idx) {
    // Linear index of the first anchor of cell (h_idx, w_idx). The product
    // h_idx * w_idx is not a valid linear index: it is 0 for the whole first
    // row and first column.
    auto offset = (h_idx * feature_width + w_idx) * num_anchors * 4;
    // Step by 4 per anchor so that anchors of one cell do not overwrite
    // each other.
    for (int idx = 0; idx < num_anchors; ++idx, offset += 4) {
      anchor_ptr[offset + 0] =
          anchors_offset[idx * 4 + 0] + w_idx * stride_width;
      anchor_ptr[offset + 1] =
          anchors_offset[idx * 4 + 1] + h_idx * stride_height;
      anchor_ptr[offset + 2] =
          anchors_offset[idx * 4 + 2] + w_idx * stride_width;
      anchor_ptr[offset + 3] =
          anchors_offset[idx * 4 + 3] + h_idx * stride_height;
    }
  }
}
return
true
;
return
true
;
}
}
template
<
>
template
<
>
void
AnchorGeneratorKernel
<
FPGA
,
float
>::
Compute
(
void
AnchorGeneratorKernel
<
FPGA
,
float
>::
Compute
(
const
AnchorGeneratorParam
<
FPGA
>
&
param
)
{
const
AnchorGeneratorParam
<
FPGA
>
&
param
)
{}
// TODO(hjchen2)
}
}
// namespace operators
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
...
...
src/operators/kernel/fpga/V1/concat_kernel.cpp
浏览文件 @
16927084
...
@@ -38,7 +38,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
...
@@ -38,7 +38,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE
(
PADDLE_MOBILE_ENFORCE
(
input
->
dims
()[
2
]
==
height
&&
input
->
dims
()[
3
]
==
width
,
input
->
dims
()[
2
]
==
height
&&
input
->
dims
()[
3
]
==
width
,
"Image height & width should be unified"
);
"Image height & width should be unified"
);
images_in
[
i
]
=
(
half
*
)
input
->
data
<
float
>
();
// NOLINT
images_in
[
i
]
=
input
->
data
<
half
>
();
channel_num
[
i
]
=
(
uint32_t
)
inputs
[
i
]
->
dims
()[
1
];
// NOLINT
channel_num
[
i
]
=
(
uint32_t
)
inputs
[
i
]
->
dims
()[
1
];
// NOLINT
scales_in
[
i
]
=
input
->
scale
;
scales_in
[
i
]
=
input
->
scale
;
}
}
...
@@ -48,7 +48,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
...
@@ -48,7 +48,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
concatArgs
.
image_num
=
image_num
;
concatArgs
.
image_num
=
image_num
;
concatArgs
.
images_in
=
images_in
;
concatArgs
.
images_in
=
images_in
;
concatArgs
.
scales_in
=
scales_in
;
concatArgs
.
scales_in
=
scales_in
;
concatArgs
.
image_out
=
(
half
*
)
out
->
data
<
float
>
();
// NOLINT
concatArgs
.
image_out
=
out
->
data
<
half
>
();
concatArgs
.
scale_out
=
out
->
scale
;
concatArgs
.
scale_out
=
out
->
scale
;
concatArgs
.
channel_num
=
channel_num
;
concatArgs
.
channel_num
=
channel_num
;
concatArgs
.
height
=
height
;
concatArgs
.
height
=
height
;
...
...
src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp
浏览文件 @
16927084
...
@@ -27,10 +27,10 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
...
@@ -27,10 +27,10 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
auto
*
input_x
=
const_cast
<
LoDTensor
*>
(
param
->
InputX
());
auto
*
input_x
=
const_cast
<
LoDTensor
*>
(
param
->
InputX
());
auto
*
input_y
=
const_cast
<
LoDTensor
*>
(
param
->
InputY
());
auto
*
input_y
=
const_cast
<
LoDTensor
*>
(
param
->
InputY
());
auto
*
out
=
param
->
Out
();
auto
*
out
=
param
->
Out
();
auto
input_x_ptr
=
input_x
->
data
<
float
>
();
auto
input_x_ptr
=
input_x
->
data
<
half
>
();
auto
input_y_ptr
=
input_y
->
data
<
float
>
();
auto
input_y_ptr
=
input_y
->
data
<
half
>
();
fpga
::
format_fp16_ofm
(
out
);
fpga
::
format_fp16_ofm
(
out
);
auto
out_ptr
=
out
->
mutable_data
<
float
>
();
auto
out_ptr
=
out
->
mutable_data
<
half
>
();
fpga
::
EWAddArgs
ewaddArgs
=
{
0
};
fpga
::
EWAddArgs
ewaddArgs
=
{
0
};
// ewaddArgs.relu_enabled = relu_enabled;
// ewaddArgs.relu_enabled = relu_enabled;
...
...
src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp
浏览文件 @
16927084
...
@@ -28,10 +28,10 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
...
@@ -28,10 +28,10 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
auto
*
input_x
=
const_cast
<
LoDTensor
*>
(
param
->
InputX
());
auto
*
input_x
=
const_cast
<
LoDTensor
*>
(
param
->
InputX
());
auto
*
input_y
=
const_cast
<
LoDTensor
*>
(
param
->
InputY
());
auto
*
input_y
=
const_cast
<
LoDTensor
*>
(
param
->
InputY
());
auto
*
out
=
param
->
Out
();
auto
*
out
=
param
->
Out
();
auto
input_x_ptr
=
input_x
->
data
<
float
>
();
auto
input_x_ptr
=
input_x
->
data
<
half
>
();
auto
input_y_ptr
=
input_y
->
data
<
float
>
();
auto
input_y_ptr
=
input_y
->
data
<
half
>
();
fpga
::
format_fp16_ofm
(
out
);
fpga
::
format_fp16_ofm
(
out
);
auto
out_ptr
=
out
->
mutable_data
<
float
>
();
auto
out_ptr
=
out
->
mutable_data
<
half
>
();
fpga
::
EWAddArgs
ewaddArgs
=
{
0
};
fpga
::
EWAddArgs
ewaddArgs
=
{
0
};
// ewaddArgs.relu_enabled = relu_enabled;
// ewaddArgs.relu_enabled = relu_enabled;
...
...
src/operators/kernel/fpga/V1/feed_kernel.cpp
浏览文件 @
16927084
...
@@ -19,19 +19,35 @@ namespace operators {
...
@@ -19,19 +19,35 @@ namespace operators {
template
<
>
template
<
>
bool
FeedKernel
<
FPGA
,
float
>::
Init
(
FeedParam
<
FPGA
>
*
param
)
{
bool
FeedKernel
<
FPGA
,
float
>::
Init
(
FeedParam
<
FPGA
>
*
param
)
{
Tensor
*
output
=
param
->
Out
();
auto
output
=
param
->
Out
();
auto
input
=
const_cast
<
LoDTensor
*>
(
param
->
InputX
());
input
->
init
(
typeid
(
float
));
input
->
Resize
(
output
->
dims
());
if
(
output
->
dims
().
size
()
!=
4
)
{
auto
input_ptr
=
input
->
mutable_data
<
float
>
();
size_t
size
=
output
->
numel
()
*
sizeof
(
float
);
auto
p
=
fpga
::
fpga_malloc
(
size
);
memcpy
(
p
,
input_ptr
,
size
);
output
->
reset_data_ptr
(
p
);
return
true
;
}
fpga
::
format_fp16_ofm
(
output
);
fpga
::
format_fp16_ofm
(
output
);
return
true
;
return
true
;
}
}
template
<
>
template
<
>
void
FeedKernel
<
FPGA
,
float
>::
Compute
(
const
FeedParam
<
FPGA
>
&
param
)
{
void
FeedKernel
<
FPGA
,
float
>::
Compute
(
const
FeedParam
<
FPGA
>
&
param
)
{
auto
input
=
auto
output
=
param
.
Out
();
reinterpret_cast
<
Tensor
*>
(
const_cast
<
LoDTensor
*>
(
param
.
InputX
()));
auto
input
=
const_cast
<
LoDTensor
*>
(
param
.
InputX
());
if
(
input
->
dims
().
size
()
!=
4
)
{
return
;
}
fpga
::
format_image
(
input
);
fpga
::
format_image
(
input
);
auto
input_ptr
=
input
->
data
<
float
>
();
auto
input_ptr
=
input
->
data
<
float
>
();
Tensor
*
output
=
param
.
Out
();
auto
output_ptr
=
output
->
data
<
half
>
();
auto
output_ptr
=
output
->
data
<
float
>
();
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP32
};
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP32
};
...
@@ -39,7 +55,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> ¶m) {
...
@@ -39,7 +55,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> ¶m) {
args
.
output_data_type
=
fpga
::
DATA_TYPE_FP16
;
args
.
output_data_type
=
fpga
::
DATA_TYPE_FP16
;
args
.
input_layout_type
=
fpga
::
LAYOUT_CHW
;
args
.
input_layout_type
=
fpga
::
LAYOUT_CHW
;
args
.
output_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
output_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
image
.
address
=
reinterpret_cast
<
void
*>
(
input_ptr
)
;
args
.
image
.
address
=
input_ptr
;
args
.
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
args
.
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
args
.
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
args
.
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
args
.
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
args
.
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
...
@@ -48,6 +64,8 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> ¶m) {
...
@@ -48,6 +64,8 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> ¶m) {
args
.
output
.
address
=
output_ptr
;
args
.
output
.
address
=
output_ptr
;
args
.
output
.
scale_address
=
output
->
scale
;
args
.
output
.
scale_address
=
output
->
scale
;
fpga
::
PerformBypass
(
args
);
fpga
::
PerformBypass
(
args
);
input
->
external_data
=
nullptr
;
}
}
template
class
FeedKernel
<
FPGA
,
float
>;
template
class
FeedKernel
<
FPGA
,
float
>;
...
...
src/operators/kernel/fpga/V1/fetch_kernel.cpp
浏览文件 @
16927084
...
@@ -19,20 +19,15 @@ namespace operators {
...
@@ -19,20 +19,15 @@ namespace operators {
template
<
>
template
<
>
bool
FetchKernel
<
FPGA
,
float
>::
Init
(
FetchParam
<
FPGA
>
*
param
)
{
bool
FetchKernel
<
FPGA
,
float
>::
Init
(
FetchParam
<
FPGA
>
*
param
)
{
Tensor
*
output
=
param
->
Out
();
auto
input
=
const_cast
<
Tensor
*>
(
param
->
InputX
());
// fpga::format_fp16_ofm(output);
auto
output
=
param
->
Out
();
return
true
;
if
(
input
->
type
()
==
typeid
(
float
))
{
}
output
->
ShareDataWith
(
*
input
);
return
true
;
template
<
>
}
void
FetchKernel
<
FPGA
,
float
>::
Compute
(
const
FetchParam
<
FPGA
>
&
param
)
{
output
->
init
(
typeid
(
float
));
param
.
Out
()
->
ShareDataWith
(
*
(
param
.
InputX
()));
output
->
Resize
(
input
->
dims
());
/*auto input =
fpga
::
format_fp32_ofm
(
output
);
reinterpret_cast<Tensor *>(const_cast<Tensor *>(param.InputX()));
fpga::format_image(input);
auto input_ptr = input->data<float>();
Tensor *output = param.Out();
auto output_ptr = output->data<float>();
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP16
};
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP16
};
...
@@ -40,13 +35,28 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> ¶m) {
...
@@ -40,13 +35,28 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> ¶m) {
args
.
output_data_type
=
fpga
::
DATA_TYPE_FP32
;
args
.
output_data_type
=
fpga
::
DATA_TYPE_FP32
;
args
.
input_layout_type
=
fpga
::
LAYOUT_CHW
;
args
.
input_layout_type
=
fpga
::
LAYOUT_CHW
;
args
.
output_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
output_layout_type
=
fpga
::
LAYOUT_HWC
;
args.image.address = reinterpret_cast<void *>(input_ptr);
args
.
image
.
address
=
input
->
data
<
half
>
();
args.image.channels = (uint32_t)input->dims()[1];
args
.
image
.
channels
=
(
uint32_t
)
product
(
input
->
dims
());
args.image.height = (input->dims().size() == 4) ? (uint32_t)input->dims()[2] :
args
.
image
.
height
=
1
;
1; args.image.width = (input->dims().size() == 4) ? (uint32_t)input->dims()[3]
args
.
image
.
width
=
1
;
: 1; args.image.pad_height = 0; args.image.pad_width = 0; args.output.address
args
.
image
.
pad_height
=
0
;
= output_ptr; args.output.scale_address = output->scale;
args
.
image
.
pad_width
=
0
;
fpga::PerformBypass(args);*/
args
.
output
.
address
=
output
->
data
<
float
>
();
args
.
output
.
scale_address
=
output
->
scale
;
param
->
fpga_bypass_args
=
args
;
return
true
;
}
// Runs the bypass stored in param.fpga_bypass_args, unless the input tensor
// already holds float data, in which case nothing is done.
template <>
void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
  const bool already_float = (param.InputX()->type() == typeid(float));
  if (!already_float) {
    fpga::PerformBypass(param.fpga_bypass_args);
    // TODO: de-align the result, i.e. get rid of the extra zeros
  }
}
}
template
class
FetchKernel
<
FPGA
,
float
>;
template
class
FetchKernel
<
FPGA
,
float
>;
...
...
src/operators/kernel/fpga/V1/pool_kernel.cpp
浏览文件 @
16927084
...
@@ -22,10 +22,10 @@ namespace operators {
...
@@ -22,10 +22,10 @@ namespace operators {
template
<
>
template
<
>
bool
PoolKernel
<
FPGA
,
float
>::
Init
(
PoolParam
<
FPGA
>
*
param
)
{
bool
PoolKernel
<
FPGA
,
float
>::
Init
(
PoolParam
<
FPGA
>
*
param
)
{
auto
*
input
=
const_cast
<
Tensor
*>
(
param
->
Input
());
auto
*
input
=
const_cast
<
Tensor
*>
(
param
->
Input
());
auto
input_ptr
=
input
->
data
<
float
>
();
auto
input_ptr
=
input
->
data
<
half
>
();
Tensor
*
output
=
param
->
Output
();
Tensor
*
output
=
param
->
Output
();
fpga
::
format_fp16_ofm
(
output
);
fpga
::
format_fp16_ofm
(
output
);
auto
output_ptr
=
output
->
mutable_data
<
float
>
();
auto
output_ptr
=
output
->
mutable_data
<
half
>
();
vector
<
int
>
ksize
=
param
->
Ksize
();
vector
<
int
>
ksize
=
param
->
Ksize
();
vector
<
int
>
strides
=
param
->
Strides
();
vector
<
int
>
strides
=
param
->
Strides
();
vector
<
int
>
paddings
=
param
->
Paddings
();
vector
<
int
>
paddings
=
param
->
Paddings
();
...
...
src/operators/kernel/fpga/V1/proposal_kernel.cpp
浏览文件 @
16927084
...
@@ -14,20 +14,422 @@ limitations under the License. */
...
@@ -14,20 +14,422 @@ limitations under the License. */
#ifdef PROPOSAL_OP
#ifdef PROPOSAL_OP
#include <algorithm>
#include <vector>
#include <vector>
#include "operators/kernel/detection_kernel.h"
#include "operators/kernel/detection_kernel.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
static
const
double
kBBoxClipDefault
=
std
::
log
(
1000.0
/
16.0
);
template
<
>
template
<
>
bool
ProposalKernel
<
FPGA
,
float
>::
Init
(
ProposalParam
<
FPGA
>
*
param
)
{
bool
ProposalKernel
<
FPGA
,
float
>::
Init
(
ProposalParam
<
FPGA
>
*
param
)
{
int
post_nms_top_n
=
param
->
post_nms_topn_
;
int64_t
batch
=
param
->
scores_
->
dims
()[
0
];
auto
total
=
post_nms_top_n
*
batch
;
param
->
rpn_rois_
->
mutable_data
<
float
>
({
total
,
4
});
param
->
rpn_probs_
->
mutable_data
<
float
>
({
total
,
1
});
// DLOG << *param->rpn_rois_;
// DLOG << *param->rpn_probs_;
param
->
float_bbox
=
std
::
make_shared
<
Tensor
>
();
param
->
float_bbox
->
Resize
(
param
->
bbox_deltas_
->
dims
());
param
->
float_bbox
->
init
(
typeid
(
float
));
fpga
::
format_fp32_ofm
(
param
->
float_bbox
.
get
());
param
->
float_score
=
std
::
make_shared
<
Tensor
>
();
param
->
float_score
->
Resize
(
param
->
scores_
->
dims
());
param
->
float_score
->
init
(
typeid
(
float
));
fpga
::
format_fp32_ofm
(
param
->
float_score
.
get
());
auto
input
=
param
->
bbox_deltas_
;
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP16
};
args
.
input_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
output_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
input_data_type
=
fpga
::
DATA_TYPE_FP16
;
args
.
output_data_type
=
fpga
::
DATA_TYPE_FP32
;
args
.
image
.
address
=
input
->
data
<
half
>
();
args
.
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
args
.
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
args
.
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
args
.
output
.
address
=
param
->
float_bbox
->
mutable_data
<
float
>
();
args
.
output
.
scale_address
=
param
->
float_bbox
->
scale
;
param
->
bbox_arg
=
args
;
input
=
param
->
scores_
;
args
.
image
.
address
=
input
->
data
<
half
>
();
args
.
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
args
.
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
args
.
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
args
.
output
.
address
=
param
->
float_score
->
mutable_data
<
float
>
();
args
.
output
.
scale_address
=
param
->
float_score
->
scale
;
param
->
score_arg
=
args
;
return
true
;
return
true
;
}
}
// Copies all of `src` into `dst`, starting `offset` elements into dst's
// buffer. The element size is taken from src's type; no bounds check is
// performed on dst.
void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
  auto *dst_base = dst->data<void>();
  auto *src_base = src.data<void>();
  const size_t elem_bytes = framework::SizeOfType(src.type());
  // Convert the element offset into a byte offset.
  offset *= elem_bytes;
  const uintptr_t dst_addr = reinterpret_cast<uintptr_t>(dst_base) + offset;
  std::memcpy(reinterpret_cast<void *>(dst_addr), src_base,
              src.numel() * elem_bytes);
}
// Decodes box deltas against anchors and writes corner boxes to `proposals`.
//
// all_anchors - anchor boxes; dims()[0] rows of dims()[1] values, of which
//               the first four are read as (x0, y0, x1, y1).
// bbox_deltas - deltas (dx, dy, dw, dh), addressed with the same row stride.
// variances   - optional per-row scale factors for the deltas; may be null.
// proposals   - output, addressed with the same row stride.
//
// The dw/dh terms are capped at kBBoxClipDefault before exponentiation.
template <class T>
static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
                            Tensor *variances, Tensor *proposals) {
  T *out = proposals->mutable_data<T>();

  const int64_t num_rows = all_anchors->dims()[0];
  const int64_t row_stride = all_anchors->dims()[1];

  auto *deltas = bbox_deltas->data<T>();
  auto *anchors = all_anchors->data<T>();
  const bool has_variances = (variances != nullptr);
  const T *var = has_variances ? variances->data<T>() : nullptr;

  for (int64_t row = 0; row < num_rows; ++row) {
    const int64_t base = row * row_stride;
    const T *a = anchors + base;
    const T *d = deltas + base;
    T *p = out + base;

    // Anchor size and centre (pixel-inclusive, hence the +1).
    T anchor_w = a[2] - a[0] + 1.0;
    T anchor_h = a[3] - a[1] + 1.0;
    T anchor_cx = a[0] + 0.5 * anchor_w;
    T anchor_cy = a[1] + 0.5 * anchor_h;

    // Deltas, scaled by the variances when they are supplied.
    T dx = d[0], dy = d[1], dw = d[2], dh = d[3];
    if (has_variances) {
      const T *v = var + base;
      dx = v[0] * d[0];
      dy = v[1] * d[1];
      dw = v[2] * d[2];
      dh = v[3] * d[3];
    }

    T box_cx = dx * anchor_w + anchor_cx;
    T box_cy = dy * anchor_h + anchor_cy;
    T box_w = std::exp(std::min<T>(dw, kBBoxClipDefault)) * anchor_w;
    T box_h = std::exp(std::min<T>(dh, kBBoxClipDefault)) * anchor_h;

    // Centre/size back to corner form.
    p[0] = box_cx - box_w / 2;
    p[1] = box_cy - box_h / 2;
    p[2] = box_cx + box_w / 2 - 1;
    p[3] = box_cy + box_h / 2 - 1;
  }
}
// Clamps every box coordinate in place. Within each group of four values,
// entries 0 and 2 are limited to [0, im_info[1] - 1] and entries 1 and 3 to
// [0, im_info[0] - 1].
// NOTE(review): presumably im_info is (height, width, scale) - confirm.
template <class T>
static inline void ClipTiledBoxes(const Tensor &im_info, Tensor *boxes) {
  T *coords = boxes->mutable_data<T>();
  const T *info = im_info.data<T>();
  const T zero(0);
  for (int64_t i = 0; i < boxes->numel(); ++i) {
    // Even positions use im_info[1], odd positions use im_info[0].
    const T upper = (i % 2 == 0) ? info[1] - 1 : info[0] - 1;
    coords[i] = std::max(std::min(coords[i], upper), zero);
  }
}
// Collects into `keep` the row indices of the boxes that pass the size and
// centre tests, then shrinks `keep` to the number of survivors.
//
// boxes    - rows of four values (x0, y0, x1, y1).
// min_size - minimum side length, measured after dividing by im_info[2];
//            values below 1 are raised to 1.
// im_info  - entries 0 and 1 bound the box centre (y and x respectively),
//            entry 2 is the scale divisor.
// keep     - output int tensor of kept row indices.
template <class T>
static inline void FilterBoxes(Tensor *boxes, float min_size,
                               const Tensor &im_info, Tensor *keep) {
  const T *info = im_info.data<T>();
  T *box_data = boxes->mutable_data<T>();
  const T scale = info[2];

  keep->Resize({boxes->dims()[0]});
  min_size = std::max(min_size, 1.0f);
  int *kept = keep->mutable_data<int>();

  int kept_count = 0;
  for (int i = 0; i < boxes->dims()[0]; ++i) {
    const T *b = box_data + 4 * i;
    T w = b[2] - b[0] + 1;
    T h = b[3] - b[1] + 1;
    // Side lengths measured in the unscaled image.
    T w_unscaled = (b[2] - b[0]) / scale + 1;
    T h_unscaled = (b[3] - b[1]) / scale + 1;
    T centre_x = b[0] + w / 2;
    T centre_y = b[1] + h / 2;

    const bool big_enough = w_unscaled >= min_size && h_unscaled >= min_size;
    const bool centre_ok = centre_x <= info[1] && centre_y <= info[0];
    if (big_enough && centre_ok) {
      kept[kept_count++] = i;
    }
  }
  keep->Resize({kept_count});
}
// Pairs every score with its position and returns the pairs sorted by score
// in ASCENDING order, so the highest score ends up at back(). The sort is
// stable: equal scores keep their original relative order.
//
// scores  - input scores; may be empty.
// returns - vector of (score, original index) pairs, lowest score first.
template <class T>
static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
    const std::vector<T> &scores) {
  std::vector<std::pair<T, int>> sorted_indices;
  sorted_indices.reserve(scores.size());
  for (size_t i = 0; i < scores.size(); ++i) {
    sorted_indices.emplace_back(scores[i], static_cast<int>(i));
  }
  // Ascending on purpose: consumers take the best candidate from the back.
  std::stable_sort(sorted_indices.begin(), sorted_indices.end(),
                   [](const std::pair<T, int> &a, const std::pair<T, int> &b) {
                     return a.first < b.first;
                   });
  return sorted_indices;
}
// Area of a box given as (xmin, ymin, xmax, ymax).
//
// Returns 0 for an inverted box (xmax < xmin or ymax < ymin). When
// `normalized` is true the area is w * h; otherwise each side is counted
// pixel-inclusively, giving (w + 1) * (h + 1).
template <class T>
static inline T BBoxArea(const T *box, bool normalized) {
  const bool inverted = box[2] < box[0] || box[3] < box[1];
  if (inverted) {
    return static_cast<T>(0.);
  }
  const T width = box[2] - box[0];
  const T height = box[3] - box[1];
  return normalized ? width * height : (width + 1) * (height + 1);
}
// Builds a 1-D tensor of length `selected_num` from the first `selected_num`
// entries of `selected_indices`.
template <typename T>
static inline Tensor VectorToTensor(const std::vector<T> &selected_indices,
                                    int selected_num) {
  Tensor result;
  result.Resize({selected_num});
  auto *dst = result.mutable_data<T>();
  int pos = 0;
  while (pos < selected_num) {
    dst[pos] = selected_indices[pos];
    ++pos;
  }
  return result;
}
template
<
class
T
>
static
inline
T
JaccardOverlap
(
const
T
*
box1
,
const
T
*
box2
,
bool
normalized
)
{
if
(
box2
[
0
]
>
box1
[
2
]
||
box2
[
2
]
<
box1
[
0
]
||
box2
[
1
]
>
box1
[
3
]
||
box2
[
3
]
<
box1
[
1
])
{
return
static_cast
<
T
>
(
0.
);
}
else
{
const
T
inter_xmin
=
std
::
max
(
box1
[
0
],
box2
[
0
]);
const
T
inter_ymin
=
std
::
max
(
box1
[
1
],
box2
[
1
]);
const
T
inter_xmax
=
std
::
min
(
box1
[
2
],
box2
[
2
]);
const
T
inter_ymax
=
std
::
min
(
box1
[
3
],
box2
[
3
]);
const
T
inter_w
=
std
::
max
(
T
(
0
),
inter_xmax
-
inter_xmin
+
1
);
const
T
inter_h
=
std
::
max
(
T
(
0
),
inter_ymax
-
inter_ymin
+
1
);
const
T
inter_area
=
inter_w
*
inter_h
;
const
T
bbox1_area
=
BBoxArea
<
T
>
(
box1
,
normalized
);
const
T
bbox2_area
=
BBoxArea
<
T
>
(
box2
,
normalized
);
return
inter_area
/
(
bbox1_area
+
bbox2_area
-
inter_area
);
}
}
// Greedy non-maximum suppression.
//
// bbox          - boxes; dims()[0] rows of dims()[1] values
//                 ([xmin ymin xmax ymax]).
// scores        - one score per box.
// nms_threshold - a candidate is dropped once its overlap with an already
//                 kept box exceeds the (adaptive) threshold.
// eta           - when < 1, the threshold is multiplied by eta after each
//                 kept box while it is still above 0.5.
// returns       - int tensor of kept box indices, best score first.
template <class T>
static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold,
                         float eta) {
  const int64_t box_count = bbox->dims()[0];
  const int64_t box_stride = bbox->dims()[1];

  std::vector<T> score_copy(box_count);
  std::copy_n(scores->data<T>(), box_count, score_copy.begin());
  std::vector<std::pair<T, int>> candidates =
      GetSortedScoreIndex<T>(score_copy);

  std::vector<int> kept;
  T threshold = nms_threshold;
  const T *boxes = bbox->data<T>();

  // Candidates are consumed from the back, i.e. highest score first.
  for (auto it = candidates.rbegin(); it != candidates.rend(); ++it) {
    const int candidate = it->second;

    bool keep_candidate = true;
    for (int other : kept) {
      const T overlap =
          JaccardOverlap<T>(boxes + candidate * box_stride,
                            boxes + other * box_stride, false);
      keep_candidate = (overlap <= threshold);
      if (!keep_candidate) {
        break;
      }
    }

    if (keep_candidate) {
      kept.push_back(candidate);
      if (eta < 1 && threshold > 0.5) {
        threshold *= eta;
      }
    }
  }
  return VectorToTensor(kept, static_cast<int>(kept.size()));
}
template
<
typename
T
>
std
::
pair
<
Tensor
,
Tensor
>
ProposalForOneImage
(
const
Tensor
&
im_info_slice
,
const
Tensor
&
anchors
,
const
Tensor
&
variances
,
const
Tensor
&
bbox_deltas_slice
,
// [M, 4]
const
Tensor
&
scores_slice
,
// [N, 1]
int
pre_nms_top_n
,
int
post_nms_top_n
,
float
nms_thresh
,
float
min_size
,
float
eta
)
{
auto
*
scores_data
=
scores_slice
.
data
<
T
>
();
// Sort index
Tensor
index_t
;
index_t
.
Resize
({
scores_slice
.
numel
()});
int
*
index
=
index_t
.
mutable_data
<
int
>
();
for
(
int
i
=
0
;
i
<
scores_slice
.
numel
();
++
i
)
{
index
[
i
]
=
i
;
}
auto
compare
=
[
scores_data
](
const
int64_t
&
i
,
const
int64_t
&
j
)
{
return
scores_data
[
i
]
>
scores_data
[
j
];
};
if
(
pre_nms_top_n
<=
0
||
pre_nms_top_n
>=
scores_slice
.
numel
())
{
std
::
sort
(
index
,
index
+
scores_slice
.
numel
(),
compare
);
}
else
{
std
::
nth_element
(
index
,
index
+
pre_nms_top_n
,
index
+
scores_slice
.
numel
(),
compare
);
index_t
.
Resize
({
pre_nms_top_n
});
}
Tensor
scores_sel
,
bbox_sel
,
anchor_sel
,
var_sel
;
scores_sel
.
mutable_data
<
T
>
({
index_t
.
numel
(),
1
});
bbox_sel
.
mutable_data
<
T
>
({
index_t
.
numel
(),
4
});
anchor_sel
.
mutable_data
<
T
>
({
index_t
.
numel
(),
4
});
var_sel
.
mutable_data
<
T
>
({
index_t
.
numel
(),
4
});
Tensor
proposals
;
proposals
.
mutable_data
<
T
>
({
index_t
.
numel
(),
4
});
BoxCoder
<
T
>
(
&
anchor_sel
,
&
bbox_sel
,
&
var_sel
,
&
proposals
);
ClipTiledBoxes
<
T
>
(
im_info_slice
,
&
proposals
);
Tensor
keep
;
FilterBoxes
<
T
>
(
&
proposals
,
min_size
,
im_info_slice
,
&
keep
);
Tensor
scores_filter
;
bbox_sel
.
mutable_data
<
T
>
({
keep
.
numel
(),
4
});
scores_filter
.
mutable_data
<
T
>
({
keep
.
numel
(),
1
});
if
(
nms_thresh
<=
0
)
{
return
std
::
make_pair
(
bbox_sel
,
scores_filter
);
}
Tensor
keep_nms
=
NMS
<
T
>
(
&
bbox_sel
,
&
scores_filter
,
nms_thresh
,
eta
);
if
(
post_nms_top_n
>
0
&&
post_nms_top_n
<
keep_nms
.
numel
())
{
keep_nms
.
Resize
({
post_nms_top_n
});
}
proposals
.
mutable_data
<
T
>
({
keep_nms
.
numel
(),
4
});
scores_sel
.
mutable_data
<
T
>
({
keep_nms
.
numel
(),
1
});
return
std
::
make_pair
(
proposals
,
scores_sel
);
}
template
<
>
template
<
>
void
ProposalKernel
<
FPGA
,
float
>::
Compute
(
const
ProposalParam
<
FPGA
>
&
param
)
{
void
ProposalKernel
<
FPGA
,
float
>::
Compute
(
const
ProposalParam
<
FPGA
>
&
param
)
{
// TODO(hjchen2)
auto
score_tensor
=
param
.
float_score
.
get
();
fpga
::
PerformBypass
(
param
.
score_arg
);
fpga
::
fpga_invalidate
(
score_tensor
->
data
<
float
>
(),
score_tensor
->
numel
()
*
sizeof
(
float
));
auto
bbox_tensor
=
param
.
float_bbox
.
get
();
fpga
::
PerformBypass
(
param
.
bbox_arg
);
fpga
::
fpga_invalidate
(
bbox_tensor
->
data
<
float
>
(),
bbox_tensor
->
numel
()
*
sizeof
(
float
));
auto
*
scores
=
param
.
float_score
.
get
();
auto
*
bbox_deltas
=
param
.
float_bbox
.
get
();
auto
*
im_info
=
param
.
im_info_
;
auto
anchors
=
*
param
.
anchors_
;
auto
variances
=
*
param
.
variances_
;
auto
*
rpn_rois
=
param
.
rpn_rois_
;
auto
*
rpn_roi_probs
=
param
.
rpn_probs_
;
int
pre_nms_top_n
=
param
.
pre_nms_topn_
;
int
post_nms_top_n
=
param
.
post_nms_topn_
;
float
nms_thresh
=
param
.
nms_thresh_
;
float
min_size
=
param
.
min_size_
;
float
eta
=
param
.
eta_
;
auto
&
scores_dim
=
scores
->
dims
();
int64_t
num
=
scores_dim
[
0
];
int64_t
c_score
=
scores_dim
[
1
];
int64_t
h_score
=
scores_dim
[
2
];
int64_t
w_score
=
scores_dim
[
3
];
auto
&
bbox_dim
=
bbox_deltas
->
dims
();
int64_t
c_bbox
=
bbox_dim
[
1
];
int64_t
h_bbox
=
bbox_dim
[
2
];
int64_t
w_bbox
=
bbox_dim
[
3
];
//
Tensor
bbox_deltas_swap
,
scores_swap
;
bbox_deltas_swap
.
mutable_data
<
float
>
({
num
,
h_bbox
,
w_bbox
,
c_bbox
});
scores_swap
.
mutable_data
<
float
>
({
num
,
h_score
,
w_score
,
c_score
});
framework
::
LoD
lod
;
lod
.
resize
(
1
);
auto
&
lod0
=
lod
[
0
];
lod0
.
push_back
(
0
);
anchors
.
Resize
({
anchors
.
numel
()
/
4
,
4
});
int64_t
num_proposals
=
0
;
for
(
int64_t
i
=
0
;
i
<
num
;
++
i
)
{
Tensor
im_info_slice
=
im_info
->
Slice
(
i
,
i
+
1
);
Tensor
bbox_deltas_slice
=
bbox_deltas_swap
.
Slice
(
i
,
i
+
1
);
Tensor
scores_slice
=
scores_swap
.
Slice
(
i
,
i
+
1
);
bbox_deltas_slice
.
Resize
({
h_bbox
*
w_bbox
*
c_bbox
/
4
,
4
});
scores_slice
.
Resize
({
h_score
*
w_score
*
c_score
,
1
});
std
::
pair
<
Tensor
,
Tensor
>
tensor_pair
=
ProposalForOneImage
<
float
>
(
im_info_slice
,
anchors
,
variances
,
bbox_deltas_slice
,
scores_slice
,
pre_nms_top_n
,
post_nms_top_n
,
nms_thresh
,
min_size
,
eta
);
Tensor
&
proposals
=
tensor_pair
.
first
;
Tensor
&
scores
=
tensor_pair
.
second
;
AppendProposals
(
rpn_rois
,
4
*
num_proposals
,
proposals
);
AppendProposals
(
rpn_roi_probs
,
num_proposals
,
scores
);
num_proposals
+=
proposals
.
dims
()[
0
];
lod0
.
push_back
(
num_proposals
);
}
rpn_rois
->
set_lod
(
lod
);
rpn_roi_probs
->
set_lod
(
lod
);
rpn_rois
->
Resize
({
num_proposals
,
4
});
rpn_roi_probs
->
Resize
({
num_proposals
,
1
});
}
}
}
// namespace operators
}
// namespace operators
...
...
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
浏览文件 @
16927084
...
@@ -14,6 +14,7 @@ limitations under the License. */
...
@@ -14,6 +14,7 @@ limitations under the License. */
#ifdef PSROI_POOL_OP
#ifdef PSROI_POOL_OP
#include <cmath>
#include <vector>
#include <vector>
#include "operators/kernel/detection_kernel.h"
#include "operators/kernel/detection_kernel.h"
...
@@ -21,13 +22,180 @@ namespace paddle_mobile {
...
@@ -21,13 +22,180 @@ namespace paddle_mobile {
namespace
operators
{
namespace
operators
{
template
<
>
template
<
>
bool
PSRoiPoolKernel
<
FPGA
,
float
>::
Init
(
PSRoiPoolParam
<
FPGA
>
*
param
)
{
bool
PSRoiPoolKernel
<
FPGA
,
float
>::
Init
(
PSRoiPoolParam
<
FPGA
>*
param
)
{
auto
dims
=
param
->
input_x_
->
dims
();
PADDLE_MOBILE_ENFORCE
(
dims
[
1
]
*
dims
[
3
]
%
IMAGE_ALIGNMENT
==
0
,
"data not aligned"
);
param
->
float_input
=
std
::
make_shared
<
Tensor
>
();
param
->
float_input
->
mutable_data
<
float
>
(
param
->
input_x_
->
dims
());
param
->
float_output
=
std
::
make_shared
<
Tensor
>
();
param
->
float_output
->
mutable_data
<
float
>
(
param
->
output_
->
dims
());
auto
input
=
param
->
input_x_
;
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP16
};
args
.
input_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
output_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
input_data_type
=
fpga
::
DATA_TYPE_FP16
;
args
.
output_data_type
=
fpga
::
DATA_TYPE_FP32
;
args
.
image
.
address
=
input
->
data
<
half
>
();
args
.
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
args
.
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
args
.
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
args
.
output
.
address
=
param
->
float_input
->
mutable_data
<
float
>
();
args
.
output
.
scale_address
=
param
->
float_input
->
scale
;
param
->
input_arg
=
args
;
fpga
::
format_fp16_ofm
(
param
->
output_
);
input
=
param
->
float_output
.
get
();
args
.
input_data_type
=
fpga
::
DATA_TYPE_FP32
;
args
.
output_data_type
=
fpga
::
DATA_TYPE_FP16
;
args
.
image
.
address
=
input
->
data
<
float
>
();
args
.
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
args
.
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
args
.
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
args
.
output
.
address
=
param
->
output_
->
mutable_data
<
half
>
();
args
.
output
.
scale_address
=
param
->
output_
->
scale
;
param
->
input_arg
=
args
;
return
true
;
return
true
;
}
}
template
<
>
template
<
>
void
PSRoiPoolKernel
<
FPGA
,
float
>::
Compute
(
const
PSRoiPoolParam
<
FPGA
>
&
param
)
{
void
PSRoiPoolKernel
<
FPGA
,
float
>::
Compute
(
const
PSRoiPoolParam
<
FPGA
>&
param
)
{
// TODO(hjchen2)
auto
input_tensor
=
param
.
float_input
.
get
();
fpga
::
PerformBypass
(
param
.
input_arg
);
fpga
::
fpga_invalidate
(
input_tensor
->
data
<
float
>
(),
input_tensor
->
numel
()
*
sizeof
(
float
));
auto
*
in
=
input_tensor
;
auto
*
rois
=
param
.
input_rois_
;
auto
*
out
=
param
.
float_output
.
get
();
auto
pooled_height
=
param
.
pooled_height_
;
auto
pooled_width
=
param
.
pooled_width_
;
auto
spatial_scale
=
param
.
spatial_scale_
;
auto
output_channels
=
param
.
output_channels_
;
auto
in_dims
=
in
->
dims
();
int
batch_size
=
in_dims
[
0
];
int
input_channels
=
in_dims
[
1
];
int
height
=
in_dims
[
2
];
int
width
=
in_dims
[
3
];
int
rois_num
=
rois
->
dims
()[
0
];
// TODO auto in_stride = framework::stride(in_dims);
// TODO auto out_stride = framework::stride(out->dims());
auto
in_stride
=
framework
::
stride
({
batch_size
,
height
,
width
,
input_channels
});
auto
out_stride
=
framework
::
stride
(
{
out
->
dims
()[
0
],
out
->
dims
()[
2
],
out
->
dims
()[
3
],
out
->
dims
()[
1
]});
const
float
*
input_data
=
in
->
data
<
float
>
();
framework
::
Tensor
rois_batch_id_list
;
rois_batch_id_list
.
Resize
({
rois_num
});
auto
rois_batch_id_data
=
rois_batch_id_list
.
mutable_data
<
int
>
();
return
;
PADDLE_MOBILE_ENFORCE
(
rois
->
NumLevels
()
>
0
,
"ROIS should not be empty"
);
auto
rois_lod
=
rois
->
lod
().
back
();
int
rois_batch_size
=
rois_lod
.
size
()
-
1
;
PADDLE_MOBILE_ENFORCE
(
rois_batch_size
==
batch_size
,
"the rois_batch_size and input(X) batch_size should be the same."
);
int
rois_num_with_lod
=
rois_lod
[
rois_batch_size
];
PADDLE_MOBILE_ENFORCE
(
rois_num_with_lod
==
rois_num
,
"the rois_num from input and lod must be the same"
);
PADDLE_MOBILE_ENFORCE
(
input_channels
==
output_channels
*
pooled_height
*
pooled_width
,
"the channels of input X should equal the product of "
"output_channels x pooled_height x pooled_width"
);
// calculate batch id index for each roi according to LoD
for
(
int
n
=
0
;
n
<
rois_batch_size
;
++
n
)
{
for
(
size_t
i
=
rois_lod
[
n
];
i
<
rois_lod
[
n
+
1
];
++
i
)
{
rois_batch_id_data
[
i
]
=
n
;
}
}
auto
output_data
=
out
->
mutable_data
<
float
>
();
auto
input_rois
=
rois
->
data
<
float
>
();
// calculate psroipooling, parallel processing can be implemented per ROI
for
(
int
n
=
0
;
n
<
rois_num
;
++
n
)
{
// set roi batch id
int
roi_batch_id
=
rois_batch_id_data
[
n
];
// [start, end) interval for spatial sampling
auto
offset_input_rois
=
input_rois
+
n
*
4
;
auto
roi_start_w
=
static_cast
<
float
>
(
round
(
offset_input_rois
[
0
]))
*
spatial_scale
;
auto
roi_start_h
=
static_cast
<
float
>
(
round
(
offset_input_rois
[
1
]))
*
spatial_scale
;
auto
roi_end_w
=
static_cast
<
float
>
(
round
(
offset_input_rois
[
2
])
+
1.
)
*
spatial_scale
;
auto
roi_end_h
=
static_cast
<
float
>
(
round
(
offset_input_rois
[
3
])
+
1.
)
*
spatial_scale
;
// Force too small rois to be 1 x 1
auto
roi_height
=
std
::
max
(
roi_end_h
-
roi_start_h
,
0.1
f
);
// avoid 0
auto
roi_width
=
std
::
max
(
roi_end_w
-
roi_start_w
,
0.1
f
);
// Compute bin size w and h at input feature map
auto
bin_size_h
=
roi_height
/
static_cast
<
float
>
(
pooled_height
);
auto
bin_size_w
=
roi_width
/
static_cast
<
float
>
(
pooled_width
);
DLOG
<<
3
;
// calculate each pixel of the output feature map.
int
out_roi_offset
=
n
*
out_stride
[
0
];
for
(
int
c
=
0
;
c
<
output_channels
;
++
c
)
{
// per category
// int out_plane_offset = out_roi_offset + c * out_stride[1];
int
out_plane_offset
=
out_roi_offset
+
c
;
for
(
int
ph
=
0
;
ph
<
pooled_height
;
++
ph
)
{
// TODO int out_row_offset = out_plane_offset + ph *
// out_stride[2];
int
out_row_offset
=
out_plane_offset
+
ph
*
out_stride
[
1
];
for
(
int
pw
=
0
;
pw
<
pooled_width
;
++
pw
)
{
// calculate w and h at input feature map
int
hstart
=
floor
(
static_cast
<
float
>
(
ph
)
*
bin_size_h
+
roi_start_h
);
int
wstart
=
floor
(
static_cast
<
float
>
(
pw
)
*
bin_size_w
+
roi_start_w
);
int
hend
=
ceil
(
static_cast
<
float
>
(
ph
+
1
)
*
bin_size_h
+
roi_start_h
);
int
wend
=
ceil
(
static_cast
<
float
>
(
pw
+
1
)
*
bin_size_w
+
roi_start_w
);
// Add roi offsets and clip to input boundaries
hstart
=
std
::
min
(
std
::
max
(
hstart
,
0
),
height
);
wstart
=
std
::
min
(
std
::
max
(
wstart
,
0
),
width
);
hend
=
std
::
min
(
std
::
max
(
hend
,
0
),
height
);
wend
=
std
::
min
(
std
::
max
(
wend
,
0
),
width
);
// TODO int output_index = out_row_offset + pw;
int
output_index
=
out_row_offset
+
pw
*
output_channels
;
int
input_channel
=
(
c
*
pooled_height
+
ph
)
*
pooled_width
+
pw
;
// TODO int input_plane_offset =
// TODO roi_batch_id * in_stride[0] + input_channel *
// in_stride[1];
int
input_plane_offset
=
roi_batch_id
*
in_stride
[
0
]
+
input_channel
;
auto
offset_input_data
=
input_data
+
input_plane_offset
;
float
out_sum
=
0.
;
bool
is_empty
=
(
hend
<=
hstart
)
||
(
wend
<=
wstart
);
for
(
int
ih
=
hstart
;
ih
<
hend
;
++
ih
)
{
for
(
int
iw
=
wstart
;
iw
<
wend
;
++
iw
)
{
int
input_index
=
ih
*
in_stride
[
1
]
+
iw
*
input_channel
;
out_sum
+=
offset_input_data
[
input_index
];
}
}
float
bin_area
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
output_data
[
output_index
]
=
is_empty
?
0.
:
out_sum
/
bin_area
;
}
}
}
}
fpga
::
format_image
(
out
);
fpga
::
PerformBypass
(
param
.
output_arg
);
}
}
}
// namespace operators
}
// namespace operators
...
...
src/operators/kernel/fpga/V1/reshape2_kernel.cpp
浏览文件 @
16927084
...
@@ -15,18 +15,61 @@ limitations under the License. */
...
@@ -15,18 +15,61 @@ limitations under the License. */
#ifdef RESHAPE2_OP
#ifdef RESHAPE2_OP
#include "operators/kernel/reshape2_kernel.h"
#include "operators/kernel/reshape2_kernel.h"
#include "framework/ddim.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
template
<
>
template
<
>
bool
Reshape2Kernel
<
FPGA
,
float
>::
Init
(
Reshape2Param
<
FPGA
>
*
param
)
{
bool
Reshape2Kernel
<
FPGA
,
float
>::
Init
(
Reshape2Param
<
FPGA
>
*
param
)
{
auto
input
=
const_cast
<
LoDTensor
*>
(
param
->
InputX
());
auto
output
=
param
->
Out
();
auto
shape
=
param
->
Shape
();
output
->
ShareDataWith
(
*
input
);
auto
num_in
=
framework
::
product
(
input
->
dims
());
auto
num_shape
=
framework
::
product
(
framework
::
make_ddim
(
shape
));
PADDLE_MOBILE_ENFORCE
(
num_shape
!=
0
,
"0 index is not supported"
);
for
(
int
i
=
0
;
i
<
shape
.
size
();
i
++
)
{
if
(
shape
[
i
]
==
-
1
)
{
shape
[
i
]
=
static_cast
<
int
>
(
-
num_in
/
num_shape
);
break
;
}
}
output
->
Resize
(
framework
::
make_ddim
(
shape
));
DLOG
<<
"input: "
<<
input
;
DLOG
<<
"output: "
<<
output
;
return
true
;
return
true
;
}
}
template
<
>
template
<
>
void
Reshape2Kernel
<
FPGA
,
float
>::
Compute
(
const
Reshape2Param
<
FPGA
>
&
param
)
{
void
Reshape2Kernel
<
FPGA
,
float
>::
Compute
(
const
Reshape2Param
<
FPGA
>
&
param
)
{
return
;
auto
input
=
const_cast
<
LoDTensor
*>
(
param
.
InputX
());
auto
output
=
param
.
Out
();
auto
shape
=
param
.
Shape
();
if
(
output
->
type
()
!=
typeid
(
half
))
{
DLOG
<<
"wrong type"
;
}
auto
num_in
=
framework
::
product
(
input
->
dims
());
auto
num_shape
=
framework
::
product
(
framework
::
make_ddim
(
shape
));
PADDLE_MOBILE_ENFORCE
(
num_shape
!=
0
,
"0 index is not supported"
);
for
(
int
i
=
0
;
i
<
shape
.
size
();
i
++
)
{
if
(
shape
[
i
]
==
-
1
)
{
shape
[
i
]
=
static_cast
<
int
>
(
-
num_in
/
num_shape
);
break
;
}
}
output
->
Resize
(
framework
::
make_ddim
(
shape
));
if
(
output
->
type
()
!=
typeid
(
half
))
{
DLOG
<<
"wrong type"
;
DLOG
<<
output
;
}
//
}
}
}
// namespace operators
}
// namespace operators
...
...
src/operators/kernel/fpga/V1/sigmoid_kernel.cpp
浏览文件 @
16927084
...
@@ -25,7 +25,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
...
@@ -25,7 +25,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
paddle_mobile
::
fpga
::
SIGMOID
;
paddle_mobile
::
fpga
::
SIGMOID
;
int16_t
leaky_relu_negative_slope
=
0
;
int16_t
leaky_relu_negative_slope
=
0
;
auto
input
=
const_cast
<
Tensor
*>
(
param
->
InputX
());
auto
input
=
const_cast
<
Tensor
*>
(
param
->
InputX
());
auto
input_ptr
=
input
->
data
<
float
>
();
auto
input_ptr
=
input
->
data
<
half
>
();
auto
out
=
param
->
Out
();
auto
out
=
param
->
Out
();
fpga
::
format_fp16_ofm
(
out
);
fpga
::
format_fp16_ofm
(
out
);
...
@@ -38,7 +38,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
...
@@ -38,7 +38,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
args
.
image
.
width
=
args
.
image
.
width
=
(
input
->
dims
().
size
()
==
4
)
?
(
uint32_t
)
input
->
dims
()[
3
]
:
1
;
(
input
->
dims
().
size
()
==
4
)
?
(
uint32_t
)
input
->
dims
()[
3
]
:
1
;
args
.
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
args
.
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
args
.
output
.
address
=
out
->
data
<
float
>
();
args
.
output
.
address
=
out
->
data
<
half
>
();
args
.
output
.
scale_address
=
out
->
scale
;
args
.
output
.
scale_address
=
out
->
scale
;
args
.
output
.
activation
.
activation_type
=
activation_enable
;
args
.
output
.
activation
.
activation_type
=
activation_enable
;
args
.
output
.
activation
.
leaky_relu_negative_slope
=
leaky_relu_negative_slope
;
args
.
output
.
activation
.
leaky_relu_negative_slope
=
leaky_relu_negative_slope
;
...
...
src/operators/kernel/fpga/V1/slice_kernel.cpp
浏览文件 @
16927084
...
@@ -21,10 +21,37 @@ namespace operators {
...
@@ -21,10 +21,37 @@ namespace operators {
template
<
>
template
<
>
bool
SliceKernel
<
FPGA
,
float
>::
Init
(
SliceParam
<
FPGA
>*
param
)
{
bool
SliceKernel
<
FPGA
,
float
>::
Init
(
SliceParam
<
FPGA
>*
param
)
{
auto
output
=
param
->
output_
;
fpga
::
format_fp16_ofm
(
output
);
DLOG
<<
"input: "
<<
param
->
input_
;
DLOG
<<
"output: "
<<
param
->
output_
;
if
(
param
->
input_
->
type
()
!=
typeid
(
half
))
{
DLOG
<<
"wrong type"
;
}
return
true
;
return
true
;
}
}
// Copies channels [start, end) of every pixel from the input to the output.
// Only slicing along the channel dimension is supported: starts_[0] and
// ends_[0] are interpreted as channel indices, and negative values count
// from the end. Pixels are addressed as HW-major with channels contiguous
// (index = pixel * channel + c).
template <>
void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
  auto input = param.input_;
  DLOG << input;

  int HW = input->dims()[2] * input->dims()[3];
  int channel = input->dims()[1];
  auto input_ptr = input->data<half>();
  auto output_ptr = param.output_->data<half>();

  int start = param.starts_[0], end = param.ends_[0];
  // Wrap negative indices, then clamp both bounds into [0, channel].
  start = start < 0 ? start + channel : start;
  end = end < 0 ? end + channel : end;
  start = start < 0 ? 0 : start;
  end = end < 0 ? 0 : end;
  start = start > channel ? channel : start;
  end = end > channel ? channel : end;

  int len = end - start;
  if (len <= 0) {
    // Empty range: nothing to copy. A negative len would otherwise be
    // converted to a huge size_t by memcpy.
    return;
  }

  // memcpy takes a byte count, while the pointer arithmetic is in half
  // elements, so the length must be scaled by sizeof(half).
  const size_t bytes_per_pixel = static_cast<size_t>(len) * sizeof(half);
  for (int i = 0; i < HW; i++) {
    memcpy(output_ptr + len * i, input_ptr + i * channel + start,
           bytes_per_pixel);
  }
}
}
// namespace operators
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/fpga/V1/softmax_kernel.cpp
浏览文件 @
16927084
...
@@ -23,49 +23,72 @@ namespace operators {
...
@@ -23,49 +23,72 @@ namespace operators {
template
<
>
template
<
>
bool
SoftmaxKernel
<
FPGA
,
float
>::
Init
(
SoftmaxParam
<
FPGA
>
*
param
)
{
bool
SoftmaxKernel
<
FPGA
,
float
>::
Init
(
SoftmaxParam
<
FPGA
>
*
param
)
{
auto
input
=
const_cast
<
LoDTensor
*>
(
param
->
InputX
());
auto
input
=
const_cast
<
LoDTensor
*>
(
param
->
InputX
());
auto
input_ptr
=
input
->
data
<
float
>
();
auto
input_ptr
=
input
->
data
<
half
>
();
auto
out
=
param
->
Out
();
auto
out
=
param
->
Out
();
fpga
::
format_fp32_ofm
(
out
);
auto
float_input
=
new
Tensor
;
auto
float_input
=
new
Tensor
;
if
(
input
->
dims
().
size
()
==
2
)
{
float_input
->
mutable_data
<
float
>
({
1
,
input
->
dims
()[
1
]});
PADDLE_MOBILE_ENFORCE
(
input
->
dims
().
size
()
==
4
,
}
else
if
(
input
->
dims
().
size
()
==
4
)
{
"Softmax should have 4-order input"
);
float_input
->
mutable_data
<
float
>
(
auto
dims
=
framework
::
vectorize
(
input
->
dims
());
{
1
,
input
->
dims
()[
2
],
input
->
dims
()[
3
],
input
->
dims
()[
1
]});
auto
channel
=
dims
[
3
];
}
else
{
if
(
channel
==
1
)
{
// This input is generated by FC op, dims = [N C 1 1]
DLOG
<<
"wrong dimension of softmax input"
;
PADDLE_MOBILE_ENFORCE
(
dims
[
2
]
==
1
,
"Softmax input must come from FC op"
);
dims
[
3
]
=
dims
[
1
];
dims
[
1
]
=
1
;
}
input
->
Resize
(
framework
::
make_ddim
(
dims
));
float_input
->
Resize
(
framework
::
make_ddim
(
dims
));
if
(
channel
!=
2
)
{
// Use CPU
float_input
->
init
(
typeid
(
float
));
fpga
::
format_fp32_ofm
(
float_input
);
fpga
::
format_fp32_ofm
(
out
);
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP16
};
args
.
input_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
output_layout_type
=
fpga
::
LAYOUT_CHW
;
args
.
input_data_type
=
fpga
::
DATA_TYPE_FP16
;
args
.
output_data_type
=
fpga
::
DATA_TYPE_FP32
;
args
.
image
.
address
=
input_ptr
;
args
.
image
.
height
=
(
uint32_t
)
dims
[
1
];
args
.
image
.
width
=
(
uint32_t
)
dims
[
2
];
args
.
image
.
channels
=
(
uint32_t
)
dims
[
3
];
args
.
output
.
address
=
float_input
->
data
<
float
>
();
args
.
output
.
scale_address
=
float_input
->
scale
;
param
->
SetFloatInput
(
float_input
);
param
->
SetFpgaArgs
(
args
);
}
else
{
// Use FPGA
fpga
::
format_fp16_ofm
(
out
);
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP16
};
args
.
input_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
output_layout_type
=
fpga
::
LAYOUT_CHW
;
args
.
input_data_type
=
fpga
::
DATA_TYPE_FP16
;
args
.
output_data_type
=
fpga
::
DATA_TYPE_FP16
;
args
.
image
.
address
=
input_ptr
;
args
.
image
.
height
=
(
uint32_t
)
input
->
dims
()[
1
];
args
.
image
.
width
=
(
uint32_t
)
input
->
dims
()[
2
];
args
.
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
3
];
args
.
output
.
address
=
out
->
data
<
half
>
();
args
.
output
.
scale_address
=
out
->
scale
;
args
.
output
.
activation
.
activation_type
=
fpga
::
SOFTMAX
;
param
->
SetFpgaArgs
(
args
);
}
}
fpga
::
format_fp32_ofm
(
float_input
);
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP16
};
args
.
input_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
output_layout_type
=
fpga
::
LAYOUT_CHW
;
args
.
input_data_type
=
fpga
::
DATA_TYPE_FP16
;
args
.
output_data_type
=
fpga
::
DATA_TYPE_FP32
;
args
.
image
.
address
=
input_ptr
;
args
.
image
.
height
=
(
input
->
dims
().
size
()
==
4
)
?
(
uint32_t
)
input
->
dims
()[
2
]
:
1
;
args
.
image
.
width
=
(
input
->
dims
().
size
()
==
4
)
?
(
uint32_t
)
input
->
dims
()[
3
]
:
1
;
args
.
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
args
.
output
.
address
=
float_input
->
data
<
float
>
();
args
.
output
.
scale_address
=
float_input
->
scale
;
param
->
SetFloatInput
(
float_input
);
param
->
SetFpgaArgs
(
args
);
return
true
;
return
true
;
}
}
template
<
>
template
<
>
void
SoftmaxKernel
<
FPGA
,
float
>::
Compute
(
const
SoftmaxParam
<
FPGA
>
&
param
)
{
void
SoftmaxKernel
<
FPGA
,
float
>::
Compute
(
const
SoftmaxParam
<
FPGA
>
&
param
)
{
Tensor
*
in_x
=
param
.
FloatInput
();
Tensor
*
out
=
param
.
Out
();
fpga
::
PerformBypass
(
param
.
FpgaArgs
());
fpga
::
PerformBypass
(
param
.
FpgaArgs
());
fpga
::
fpga_invalidate
((
void
*
)
in_x
->
data
<
float
>
(),
// NOLINT
in_x
->
numel
()
*
sizeof
(
float
));
if
(
param
.
FpgaArgs
().
output
.
activation
.
activation_type
!=
fpga
::
SOFTMAX
)
{
// TODO: In general case, 0 should be squeezed before softmax input // NOLINT
Tensor
*
out
=
param
.
Out
();
math
::
SoftmaxFuntor
<
CPU
,
float
>
()(
in_x
,
out
);
Tensor
*
in_x
=
param
.
FloatInput
();
fpga
::
fpga_flush
(
out
->
data
<
float
>
(),
out
->
memory_size
());
fpga
::
fpga_invalidate
(
in_x
->
data
<
float
>
(),
in_x
->
numel
()
*
sizeof
(
float
));
math
::
SoftmaxFuntor
<
CPU
,
float
>
()(
in_x
,
out
);
fpga
::
fpga_flush
(
out
->
data
<
float
>
(),
out
->
memory_size
());
}
}
}
}
// namespace operators
}
// namespace operators
...
...
src/operators/kernel/fpga/V1/split_kernel.cpp
浏览文件 @
16927084
...
@@ -34,16 +34,18 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
...
@@ -34,16 +34,18 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
fpga
::
fpga_malloc
(
image_num
*
sizeof
(
float
*
)));
fpga
::
fpga_malloc
(
image_num
*
sizeof
(
float
*
)));
auto
out_channels
=
reinterpret_cast
<
uint32_t
*>
(
auto
out_channels
=
reinterpret_cast
<
uint32_t
*>
(
fpga
::
fpga_malloc
(
image_num
*
sizeof
(
uint32_t
)));
fpga
::
fpga_malloc
(
image_num
*
sizeof
(
uint32_t
)));
DLOG
<<
"input: "
<<
in
;
for
(
int
i
=
0
;
i
<
image_num
;
i
++
)
{
for
(
int
i
=
0
;
i
<
image_num
;
i
++
)
{
fpga
::
format_fp16_ofm
(
outs
[
i
]);
fpga
::
format_fp16_ofm
(
outs
[
i
]);
images_out
[
i
]
=
outs
[
i
]
->
mutable_data
<
float
>
();
DLOG
<<
"output: "
<<
outs
[
i
];
images_out
[
i
]
=
outs
[
i
]
->
mutable_data
<
half
>
();
scales_out
[
i
]
=
outs
[
i
]
->
scale
;
scales_out
[
i
]
=
outs
[
i
]
->
scale
;
out_channels
[
i
]
=
(
uint32_t
)
sections
[
i
];
out_channels
[
i
]
=
(
uint32_t
)
sections
[
i
];
}
}
fpga
::
SplitArgs
arg
=
{
0
};
fpga
::
SplitArgs
arg
=
{
0
};
arg
.
image_num
=
image_num
;
arg
.
image_num
=
image_num
;
arg
.
image_in
=
(
half
*
)
in
->
data
<
float
>
();
arg
.
image_in
=
in
->
data
<
half
>
();
arg
.
scale_in
=
in
->
scale
;
arg
.
scale_in
=
in
->
scale
;
arg
.
images_out
=
images_out
;
arg
.
images_out
=
images_out
;
arg
.
scales_out
=
scales_out
;
arg
.
scales_out
=
scales_out
;
...
...
src/operators/kernel/fpga/V1/tanh_kernel.cpp
浏览文件 @
16927084
...
@@ -22,8 +22,10 @@ namespace operators {
...
@@ -22,8 +22,10 @@ namespace operators {
template
<
>
template
<
>
bool
TanhKernel
<
FPGA
,
float
>::
Init
(
TanhParam
<
FPGA
>
*
param
)
{
bool
TanhKernel
<
FPGA
,
float
>::
Init
(
TanhParam
<
FPGA
>
*
param
)
{
auto
input
=
const_cast
<
Tensor
*>
(
param
->
InputX
());
auto
input
=
const_cast
<
Tensor
*>
(
param
->
InputX
());
auto
input_ptr
=
input
->
data
<
float
>
();
DLOG
<<
"input: "
<<
input
;
auto
input_ptr
=
input
->
data
<
half
>
();
auto
float_input
=
new
Tensor
;
auto
float_input
=
new
Tensor
;
float_input
->
mutable_data
<
float
>
(
float_input
->
mutable_data
<
float
>
(
{
1
,
input
->
dims
()[
1
],
input
->
dims
()[
2
],
input
->
dims
()[
3
]});
{
1
,
input
->
dims
()[
1
],
input
->
dims
()[
2
],
input
->
dims
()[
3
]});
fpga
::
format_fp32_ofm
(
float_input
);
fpga
::
format_fp32_ofm
(
float_input
);
...
...
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
浏览文件 @
16927084
...
@@ -20,7 +20,21 @@ namespace operators {
...
@@ -20,7 +20,21 @@ namespace operators {
template
<
>
template
<
>
bool
Transpose2Kernel
<
FPGA
,
float
>::
Init
(
Transpose2Param
<
FPGA
>
*
param
)
{
bool
Transpose2Kernel
<
FPGA
,
float
>::
Init
(
Transpose2Param
<
FPGA
>
*
param
)
{
param
->
Out
()
->
ShareDataWith
(
*
param
->
InputX
());
auto
input
=
param
->
InputX
();
auto
output
=
param
->
Out
();
auto
axis
=
param
->
Axis
();
auto
dim
=
input
->
dims
();
output
->
ShareDataWith
(
*
input
);
auto
dim_v
=
vectorize
(
dim
);
for
(
int
i
=
0
;
i
<
axis
.
size
();
i
++
)
{
dim_v
[
i
]
=
dim
[
axis
[
i
]];
}
output
->
Resize
(
framework
::
make_ddim
(
dim_v
));
DLOG
<<
"input: "
<<
input
;
DLOG
<<
"output: "
<<
output
;
return
true
;
return
true
;
}
}
...
...
src/operators/op_param.h
浏览文件 @
16927084
...
@@ -1172,6 +1172,12 @@ class FeedParam : public OpParam {
...
@@ -1172,6 +1172,12 @@ class FeedParam : public OpParam {
public:
public:
FeedParam
(
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
FeedParam
(
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
const
AttributeMap
&
attrs
,
const
Scope
&
scope
)
{
const
AttributeMap
&
attrs
,
const
Scope
&
scope
)
{
#ifdef PADDLE_MOBILE_FPGA
static
int
feed_num
=
0
;
auto
new_name
=
std
::
string
(
"feed"
)
+
std
::
to_string
(
feed_num
++
);
const_cast
<
VariableNameMap
&>
(
inputs
).
at
(
"X"
)
=
{
string
(
new_name
)};
#endif
input_x_
=
InputXFrom
<
LoDTensor
>
(
inputs
,
scope
);
input_x_
=
InputXFrom
<
LoDTensor
>
(
inputs
,
scope
);
out_
=
OutFrom
<
GType
>
(
outputs
,
scope
);
out_
=
OutFrom
<
GType
>
(
outputs
,
scope
);
auto
var
=
scope
.
FindVar
(
"batch_size"
);
auto
var
=
scope
.
FindVar
(
"batch_size"
);
...
@@ -1195,6 +1201,11 @@ class FetchParam : public OpParam {
...
@@ -1195,6 +1201,11 @@ class FetchParam : public OpParam {
public:
public:
FetchParam
(
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
FetchParam
(
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
const
AttributeMap
&
attrs
,
const
Scope
&
scope
)
{
const
AttributeMap
&
attrs
,
const
Scope
&
scope
)
{
#ifdef PADDLE_MOBILE_FPGA
static
int
fetch_num
=
0
;
auto
new_name
=
std
::
string
(
"fetch"
)
+
std
::
to_string
(
fetch_num
++
);
const_cast
<
VariableNameMap
&>
(
outputs
).
at
(
"Out"
)
=
{
string
(
new_name
)};
#endif
input_x_
=
InputXFrom
<
GType
>
(
inputs
,
scope
);
input_x_
=
InputXFrom
<
GType
>
(
inputs
,
scope
);
out_
=
OutFrom
(
outputs
,
scope
);
out_
=
OutFrom
(
outputs
,
scope
);
}
}
...
@@ -1210,18 +1221,9 @@ class FetchParam : public OpParam {
...
@@ -1210,18 +1221,9 @@ class FetchParam : public OpParam {
RType
*
input_x_
;
RType
*
input_x_
;
Tensor
*
out_
;
Tensor
*
out_
;
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
public:
private:
std
::
shared_ptr
<
RType
>
float_input_x_
;
fpga
::
BypassArgs
fpga_bypass_args
;
fpga
::
BypassArgs
fpga_bypass_args
;
public:
RType
*
FloatInput
()
const
{
return
float_input_x_
==
nullptr
?
input_x_
:
float_input_x_
.
get
();
}
void
SetFloatInput
(
Tensor
*
input
)
{
float_input_x_
.
reset
(
input
);
}
const
fpga
::
BypassArgs
&
FpgaArgs
()
const
{
return
fpga_bypass_args
;
}
void
SetFpgaArgs
(
const
fpga
::
BypassArgs
&
args
)
{
fpga_bypass_args
=
args
;
}
#endif
#endif
};
};
...
...
test/fpga/test_resnet50.cpp
浏览文件 @
16927084
...
@@ -51,8 +51,8 @@ void convert_to_chw(int16_t **data_in, int channel, int height, int width,
...
@@ -51,8 +51,8 @@ void convert_to_chw(int16_t **data_in, int channel, int height, int width,
}
}
}
}
void
dump
(
std
::
string
filename
,
const
Tensor
input_tensor
)
{
void
dump
(
std
::
string
filename
,
Tensor
input_tensor
)
{
auto
dataptr
=
input_tensor
.
data
<
float
>
(
);
auto
dataptr
=
reinterpret_cast
<
half
*>
(
input_tensor
.
get_data
()
);
std
::
ofstream
out
(
filename
.
c_str
());
std
::
ofstream
out
(
filename
.
c_str
());
float
result
=
0
;
float
result
=
0
;
for
(
int
i
=
0
;
i
<
input_tensor
.
numel
();
++
i
)
{
for
(
int
i
=
0
;
i
<
input_tensor
.
numel
();
++
i
)
{
...
@@ -61,12 +61,11 @@ void dump(std::string filename, const Tensor input_tensor) {
...
@@ -61,12 +61,11 @@ void dump(std::string filename, const Tensor input_tensor) {
}
}
out
.
close
();
out
.
close
();
}
}
void
dump_stride
(
std
::
string
filename
,
const
Tensor
input_tensor
,
void
dump_stride
(
std
::
string
filename
,
Tensor
input_tensor
,
const
int
dumpnum
)
{
const
int
dumpnum
)
{
int
c
=
(
input_tensor
.
dims
())[
1
];
int
c
=
(
input_tensor
.
dims
())[
1
];
int
h
=
(
input_tensor
.
dims
())[
2
];
int
h
=
(
input_tensor
.
dims
())[
2
];
int
w
=
(
input_tensor
.
dims
())[
3
];
int
w
=
(
input_tensor
.
dims
())[
3
];
auto
data_ptr
=
input_tensor
.
data
<
float
>
();
auto
data_ptr
=
input_tensor
.
get_data
();
int16_t
*
data_tmp
=
(
int16_t
*
)
malloc
(
c
*
h
*
w
*
sizeof
(
int16_t
));
int16_t
*
data_tmp
=
(
int16_t
*
)
malloc
(
c
*
h
*
w
*
sizeof
(
int16_t
));
int16_t
*
data_ptr_16
=
(
int16_t
*
)
data_ptr
;
int16_t
*
data_ptr_16
=
(
int16_t
*
)
data_ptr
;
convert_to_chw
(
&
data_ptr_16
,
c
,
h
,
w
,
data_tmp
);
convert_to_chw
(
&
data_ptr_16
,
c
,
h
,
w
,
data_tmp
);
...
@@ -98,9 +97,9 @@ int main() {
...
@@ -98,9 +97,9 @@ int main() {
for
(
int
i
=
0
;
i
<
73
;
i
++
)
{
for
(
int
i
=
0
;
i
<
73
;
i
++
)
{
auto
tensor_ptr
=
paddle_mobile
.
FetchResult
(
i
);
auto
tensor_ptr
=
paddle_mobile
.
FetchResult
(
i
);
std
::
string
saveName
=
"resnet50_result_"
+
std
::
to_string
(
i
);
std
::
string
saveName
=
"resnet50_result_"
+
std
::
to_string
(
i
);
paddle_mobile
::
fpga
::
fpga_invalidate
((
*
tensor_ptr
).
data
<
float
>
(),
paddle_mobile
::
fpga
::
fpga_invalidate
((
*
tensor_ptr
).
get_data
(),
tensor_ptr
->
numel
()
*
sizeof
(
half
));
tensor_ptr
->
numel
()
*
sizeof
(
half
));
dump_stride
(
saveName
,
(
*
tensor_ptr
),
20
);
//
dump_stride(saveName, (*tensor_ptr), 20);
// dump(saveName, (*tensor_ptr));
// dump(saveName, (*tensor_ptr));
}
}
...
...
test/fpga/test_rfcn.cpp
浏览文件 @
16927084
...
@@ -23,29 +23,38 @@ limitations under the License. */
...
@@ -23,29 +23,38 @@ limitations under the License. */
#include "fpga/V2/api.h"
#include "fpga/V2/api.h"
#endif
#endif
// static const char *g_densebox_combine = "../models/densebox";
// Reads the whole file `filename` as raw bytes into `buf`.
//
// The caller must supply a buffer at least as large as the file; no bound is
// checked here. On open failure a message is printed to stdout and `buf` is
// left untouched.
//
// The file is opened in binary mode and read with unformatted I/O: formatted
// extraction (operator>>) would skip every byte that happens to be
// whitespace, and looping on eof() runs one extra iteration after the last
// successful read.
void readStream(std::string filename, uint8_t *buf) {
  std::ifstream in(filename, std::ios::in | std::ios::binary);
  if (!in.is_open()) {
    std::cout << "open File Failed." << std::endl;
    return;
  }

  // Determine the file size, then read it in one unformatted call.
  in.seekg(0, std::ios::end);
  const std::streamoff size = in.tellg();
  in.seekg(0, std::ios::beg);
  if (size > 0) {
    in.read(reinterpret_cast<char *>(buf),
            static_cast<std::streamsize>(size));
  }
  in.close();
}
static
const
char
*
g_rfcn_combine
=
"../models/rfcn"
;
const
std
::
string
g_image_src_float
=
"../models/rfcn/data.bin"
;
int
main
()
{
int
main
()
{
paddle_mobile
::
fpga
::
open_device
();
paddle_mobile
::
fpga
::
open_device
();
paddle_mobile
::
PaddleMobile
<
paddle_mobile
::
FPGA
>
paddle_mobile
;
paddle_mobile
::
PaddleMobile
<
paddle_mobile
::
FPGA
>
paddle_mobile
;
// paddle_mobile.SetThreadNum(4);
if
(
paddle_mobile
.
Load
(
std
::
string
(
g_densebox_combine
)
+
"/model"
,
if
(
paddle_mobile
.
Load
(
std
::
string
(
g_rfcn_combine
)
+
"/model"
,
std
::
string
(
g_densebox_combine
)
+
"/params"
,
true
,
std
::
string
(
g_rfcn_combine
)
+
"/params"
,
true
,
false
,
false
,
1
,
true
))
{
1
,
true
))
{
// std::vector<float> input;
float
img_info
[
3
]
=
{
768
,
1536
,
768.0
f
/
960.0
f
};
// std::vector<int64_t> dims{1, 3, 512, 1024};
auto
img
=
fpga
::
fpga_malloc
(
768
*
1536
*
3
*
sizeof
(
float
));
// GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
readStream
(
g_image_src_float
,
reinterpret_cast
<
uint8_t
*>
(
img
));
std
::
vector
<
void
*>
v
(
3
,
nullptr
);
// auto vec_result = paddle_mobile.Predict(input, dims);
paddle_mobile
.
FeedData
({
img_info
,
img
});
return
0
;
Tensor
input_tensor
;
SetupTensor
<
float
>
(
&
input_tensor
,
{
1
,
3
,
512
,
1024
},
static_cast
<
float
>
(
0
),
static_cast
<
float
>
(
1
));
// readStream(g_image_src_float,
// input_tensor.mutable_data<float>({1, 3, 224, 224}));
paddle_mobile
.
FeedData
(
input_tensor
);
paddle_mobile
.
Predict_To
(
-
1
);
paddle_mobile
.
Predict_To
(
-
1
);
paddle_mobile
.
GetResults
(
&
v
);
DLOG
<<
"Computation done"
;
}
}
return
0
;
return
0
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录