Commit 16927084, PaddlePaddle / Paddle-Lite
Authored on Feb 10, 2019 by zhangyang0701
reconstruct code to support RFCN for FPGA track
Parent: 458183af

Showing 34 changed files with 1036 additions and 244 deletions (+1036 -244)
src/fpga/V1/api.cpp  +29 -19
src/fpga/V1/image.cpp  +16 -15
src/fpga/common/fpga_common.cpp  +1 -1
src/fpga/common/fpga_common.h  +8 -9
src/framework/executor.cpp  +26 -61
src/framework/executor.h  +2 -3
src/framework/operator.cpp  +16 -9
src/framework/operator.h  +21 -1
src/framework/tensor.h  +6 -1
src/io/api_paddle_mobile.cc  +17 -0
src/io/api_paddle_mobile.h  +3 -0
src/io/paddle_inference_api.h  +3 -0
src/io/paddle_mobile.cpp  +6 -2
src/io/paddle_mobile.h  +2 -1
src/operators/kernel/detection_kernel.h  +8 -0
src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp  +35 -4
src/operators/kernel/fpga/V1/concat_kernel.cpp  +2 -2
src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp  +3 -3
src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp  +3 -3
src/operators/kernel/fpga/V1/feed_kernel.cpp  +24 -6
src/operators/kernel/fpga/V1/fetch_kernel.cpp  +31 -21
src/operators/kernel/fpga/V1/pool_kernel.cpp  +2 -2
src/operators/kernel/fpga/V1/proposal_kernel.cpp  +403 -1
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp  +171 -3
src/operators/kernel/fpga/V1/reshape2_kernel.cpp  +44 -1
src/operators/kernel/fpga/V1/sigmoid_kernel.cpp  +2 -2
src/operators/kernel/fpga/V1/slice_kernel.cpp  +28 -1
src/operators/kernel/fpga/V1/softmax_kernel.cpp  +56 -33
src/operators/kernel/fpga/V1/split_kernel.cpp  +4 -2
src/operators/kernel/fpga/V1/tanh_kernel.cpp  +3 -1
src/operators/kernel/fpga/V1/transpose2_kernel.cpp  +15 -1
src/operators/op_param.h  +12 -10
test/fpga/test_resnet50.cpp  +6 -7
test/fpga/test_rfcn.cpp  +28 -19
src/fpga/V1/api.cpp
@@ -28,11 +28,13 @@ void format_image(framework::Tensor *image_tensor) {
   auto dims = image_tensor->dims();
   auto channel = dims[1], height = dims[2], width = dims[3];
   auto data_ptr = image_tensor->data<float>();
-  size_t memory_size = channel * height * width * sizeof(float);
-  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
-  fpga_copy(new_data, data_ptr, memory_size);
-  image::format_image(&new_data, channel, height, width);
-  image_tensor->reset_data_ptr(new_data);
+  auto external_ptr = reinterpret_cast<float *>(image_tensor->external_data);
+  float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
+  float *old_p = p_data;
+  image::format_image(&p_data, channel, height, width);
+  if (old_p != p_data) {
+    image_tensor->reset_data_ptr(p_data);
+  }
 }

 void format_fp16_ofm(framework::Tensor *ofm_tensor) {

@@ -50,6 +52,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
   auto p = fpga_malloc(memory_size);
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(typeid(half));
 }

 void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {

@@ -67,6 +70,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
   auto p = fpga_malloc(memory_size);
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(typeid(half));
 }

 void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();

@@ -83,6 +87,7 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   auto p = fpga_malloc(memory_size);
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(typeid(float));
 }

 float filter_find_max(framework::Tensor *filter_tensor) {

@@ -139,6 +144,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
   filter::format_filter(&new_data, num, channel, height, width, group_num,
                         max_value);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }

 void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
   auto dims = filter_tensor->dims();

@@ -149,6 +155,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
   fpga_copy(new_data, data_ptr, memory_size);
   filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }

 void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,

@@ -173,6 +180,7 @@ void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
   //  framework::make_ddim({num, 1, height, width});
   // filter_tensor->Resize(dims_new);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }

 void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {

@@ -187,6 +195,7 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
   filter::format_fc_filter(&new_data, num, channel, height, width, 1,
                            max_value);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }

 void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
                           int group_num, int stride) {

@@ -213,6 +222,7 @@ void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
       framework::make_ddim({num, channel, height, width});
   filter_tensor->Resize(dims_new);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }

 void format_bias_scale_array(float **bias_scale_array,

@@ -236,6 +246,7 @@ void format_concat_output(framework::Tensor *out, int height, int width,
   auto ddim = framework::make_ddim({1, sum_channel, height, width});
   out->Resize(ddim);
   out->reset_data_ptr(data_ptr);
+  out->set_type(typeid(half));
 }

 void format_conv_data(framework::Tensor *filter_tensor,
                       framework::Tensor *ofm_tensor, float **bs_ptr,

@@ -447,9 +458,9 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
                     int16_t leaky_relu_negative_slope, int group_num,
                     int stride_h, int stride_w, int padding_h, int padding_w,
                     float *bs_ptr) {
-  auto input_ptr = input->data<float>();
-  auto filter_ptr = filter->data<float>();
-  auto out_ptr = out->data<float>();
+  auto input_ptr = input->data<half>();
+  auto filter_ptr = filter->data<int8_t>();
+  auto out_ptr = out->data<half>();
   auto deleter = [](void *p) { fpga_free(p); };

   arg->group_num = (uint32_t)group_num;

@@ -571,8 +582,8 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
                      int16_t leaky_relu_negative_slope, int group_num,
                      int stride_h, int stride_w, int padding_h, int padding_w,
                      float *bs_ptr) {
-  auto input_ptr = input->data<float>();
-  auto filter_ptr = filter->data<float>();
+  auto input_ptr = input->data<half>();
+  auto filter_ptr = filter->data<int8_t>();
   auto deleter = [](void *p) { fpga_free(p); };

   arg->group_num = (uint32_t)group_num;

@@ -603,7 +614,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
   framework::DDim dims_out_new = framework::make_ddim(
       {1, arg->filter_num, sub_output_height * sub_conv_num, real_out_width});
   fpga::format_fp16_ofm(out, dims_out_new);
-  auto out_ptr = out->data<float>();
+  auto out_ptr = out->data<half>();
   arg->output.address =
       (half *)out_ptr +  // NOLINT
       omit_size * sizeof(half) *

@@ -793,7 +804,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
             arg->split_conv_args[i]->conv_arg[j].output.scale_address),
             deleter));
       }
-      arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<int16_t *>(
+      arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<half *>(
           arg->split_conv_args[i]->conv_arg[j].output.address);
       arg->split_conv_args[i]->concat_arg.scales_in[j] =
           arg->split_conv_args[i]->conv_arg[j].output.scale_address;

@@ -818,9 +829,9 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
                      int16_t leaky_relu_negative_slope, int stride_h,
                      int stride_w, int padding_h, int padding_w,
                      float *bias_ptr) {
-  auto filter_ptr = filter->data<float>();
-  auto input_ptr = input->data<float>();
-  auto output_ptr = out->mutable_data<float>();
+  auto filter_ptr = filter->data<uint8_t>();
+  auto input_ptr = input->data<half>();
+  auto output_ptr = out->mutable_data<half>();
   arg->sub_conv_num = 1;
   // arg->relu_enabled = relu_enabled;
   arg->output.activation.activation_type = activation_enable;

@@ -848,9 +859,8 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
                        int16_t leaky_relu_negative_slope, int stride_h,
                        int stride_w, int padding_h, int padding_w,
                        float *bias_ptr) {
-  auto filter_ptr = filter->data<float>();
-  auto input_ptr = input->data<float>();
-  auto output_ptr = out->mutable_data<float>();
+  auto filter_ptr = filter->data<int8_t>();
+  auto input_ptr = input->data<half>();
   auto deleter = [](void *p) { fpga_free(p); };

@@ -885,7 +895,7 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
   framework::DDim dims_out_new = framework::make_ddim(
       {1, arg->filter_num, real_out_height, real_out_width});
   fpga::format_fp16_ofm(out, dims_out_new);
-  auto out_ptr = out->data<float>();
+  auto out_ptr = out->data<half>();

   /*====For Addition
   arg->output.address =
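A recurring pattern in the hunks above is that every FPGA-formatted buffer now records its real element type via set_type(typeid(half)) or set_type(typeid(int8_t)), and the matching data<half>() / data<int8_t>() accessors replace the old data<float>() calls. A minimal, self-contained illustration of why the tag matters, using hypothetical names that are not code from this commit; later kernels such as fetch_kernel.cpp below branch on exactly this kind of type comparison:

#include <typeindex>
#include <typeinfo>

// Hypothetical stand-in for the tensor's run-time type tag.
struct TaggedBuffer {
  std::type_index type = typeid(float);
};

// Mirrors the check added in FetchKernel: only run the FP16 -> FP32 bypass
// when the data is not already float.
static bool needs_fp16_to_fp32_bypass(const TaggedBuffer &t) {
  return t.type != std::type_index(typeid(float));
}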
src/fpga/V1/image.cpp
@@ -22,7 +22,6 @@ namespace fpga {
 namespace image {

 void convert_to_hwc(float **data_in, int channel, int height, int width) {
-  float *tmp = *data_in;
   float *data_tmp =
       (float *)fpga_malloc(channel * height * width * sizeof(float));  // NOLINT
   int64_t amount_per_row = width * channel;

@@ -35,33 +34,35 @@ void convert_to_hwc(float **data_in, int channel, int height, int width) {
     }
   }
   *data_in = data_tmp;
-  fpga_free(tmp);
 }

 void align_element_conv(float **data_in, int height, int cw) {
   int h = 0;
   int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
-  if (align_cw != cw) {
-    float *tmp = *data_in;
-    float *data_tmp =
-        (float *)fpga_malloc(height * align_cw * sizeof(float));  // NOLINT
-
-    memset(data_tmp, 0, height * align_cw * sizeof(float));
+  float *data_tmp =
+      (float *)fpga_malloc(height * align_cw * sizeof(float));  // NOLINT

-    for (h = 0; h < height; h++) {
-      memcpy((void *)(data_tmp + h * align_cw),  // NOLINT
-             (void *)(*data_in + h * cw),        // NOLINT
-             cw * sizeof(float));
-    }
+  memset(data_tmp, 0, height * align_cw * sizeof(float));

-    *data_in = data_tmp;
-    fpga_free(tmp);
+  for (h = 0; h < height; h++) {
+    memcpy((void *)(data_tmp + h * align_cw),  // NOLINT
+           (void *)(*data_in + h * cw),        // NOLINT
+           cw * sizeof(float));
   }
+  *data_in = data_tmp;
 }

 void format_image(float **data_in, int channel, int height, int width) {
   convert_to_hwc(data_in, channel, height, width);
-  align_element_conv(data_in, height, channel * width);
+  int cw = channel * width;
+  int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
+  if (align_cw != cw) {
+    float *hwc_temp = *data_in;
+    align_element_conv(data_in, height, channel * width);
+    fpga_free(hwc_temp);
+  }
   fpga_flush(*data_in,
              align_to_x(channel * width, IMAGE_ALIGNMENT) * height *
                  sizeof(float));
 }
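After this change, format_image only re-packs rows when channel * width is not already a multiple of IMAGE_ALIGNMENT, and it frees the intermediate HWC buffer itself rather than leaving that to align_element_conv. A small sketch of the row padding involved, assuming align_to_x rounds its argument up to the next multiple (the helper below is an illustration, not the library's definition):

#include <cstddef>

// Assumed behaviour of align_to_x(x, align): round x up to a multiple of align.
static size_t align_to_x_sketch(size_t x, size_t align) {
  return (x + align - 1) / align * align;
}

// Example: channel = 5, width = 4 gives rows of 20 floats; with
// IMAGE_ALIGNMENT = 16 each of the `height` rows is copied into a
// 32-float slot (align_to_x_sketch(20, 16) == 32) and zero-padded at the tail.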
src/fpga/common/fpga_common.cpp
@@ -164,7 +164,7 @@ void fpga_free(void *ptr) {
     //  DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
     //       << counter << " bytes";
   } else {
-    DLOG << "Invalid pointer";
+    DLOG << "Address: " << ptr << "  Invalid pointer";
   }
 }

 void fpga_copy(void *dest, const void *src, size_t num) {
src/fpga/common/fpga_common.h
@@ -19,17 +19,16 @@ limitations under the License. */
 #include <memory>
 #include <vector>

-namespace paddle_mobile {
-namespace fpga {
-
 #ifdef PADDLE_MOBILE_FPGA_V1
-#define IMAGE_ALIGNMENT 16           // Aligned to 16
-#define FILTER_NUM_ALIGNMENT 32      // Filter number aligned to 32
-#define FILTER_ELEMENT_ALIGNMENT 16  // Filter element number aligned to 16
-#define BS_NUM_ALIGNMENT 8
-#define BIAS_NUM_ALIGNMENT 16
+#define IMAGE_ALIGNMENT (16)           // Aligned to 16
+#define FILTER_NUM_ALIGNMENT (32)      // Filter number aligned to 32
+#define FILTER_ELEMENT_ALIGNMENT (16)  // Filter element number aligned to 16
+#define BS_NUM_ALIGNMENT (8)
+#define BIAS_NUM_ALIGNMENT (16)
 #endif

+namespace paddle_mobile {
+namespace fpga {
+
 enum DataType {
   DATA_TYPE_FP32 = 1,
   DATA_TYPE_FP16 = 0,

@@ -49,7 +48,7 @@ enum ActivationType {
 };

 struct ActivationArgs {
-  enum ActivationType activation_type;
+  enum ActivationType activation_type = NONE;
   int16_t leaky_relu_negative_slope;
 };
src/framework/executor.cpp
@@ -84,6 +84,11 @@ Executor<Device, T>::Executor(const Program<Device> &program,
     InitMemory();
   }
+#ifdef PADDLE_MOBILE_FPGA
+  program_.scope->EraseVars({"feed", "fetch"});
+  program_.scope->print_vars();
+#endif
+
   int count = 0;
   for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) {
     for (auto &op_handler : ops_of_block_[block_id]) {

@@ -92,14 +97,6 @@ Executor<Device, T>::Executor(const Program<Device> &program,
       ops_list_.push_back(op_handler);
     }
   }
-#ifdef PADDLE_MOBILE_FPGA
-  TalorFeedOp();
-  DLOG << "TalorFeed finished";
-  TalorFetchdOp();
-  DLOG << "TalorFetch finished";
-  program_.scope->print_vars();
-#endif
 }

 template <typename T>

@@ -451,49 +448,6 @@ std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
 }

 #ifdef PADDLE_MOBILE_FPGA
-template <typename Device, typename T>
-void Executor<Device, T>::TalorFeedOp() {
-  auto &ops = ops_of_block_[0];
-  int num = 0;
-  program_.scope->EraseVars(std::vector<string>{string("feed")});
-
-  for (auto op : ops) {
-    if (op->Type() == "feed") {
-      auto new_name = string("feed") + std::to_string(num++);
-      auto var = program_.scope->Var(new_name);
-      auto tensor = var->template GetMutable<LoDTensor>();
-      auto output_map = op->Outputs();
-      std::vector<std::string> out_keys = op->GetOutKeys();
-      PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
-      auto output_tensor =
-          GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
-      tensor->Resize(output_tensor->dims());
-      tensor->init(typeid(float));
-      op->ChangeNameMap("X", std::vector<string>{new_name});
-    }
-  }
-}
-
-template <typename Device, typename T>
-void Executor<Device, T>::TalorFetchdOp() {
-  auto &ops = ops_of_block_[0];
-  int num = 0;
-  program_.scope->EraseVars(std::vector<string>{string("fetch")});
-
-  for (auto op : ops) {
-    if (op->Type() == "fetch") {
-      auto new_name = string("fetch") + std::to_string(num++);
-      auto var = program_.scope->Var(new_name);
-      auto tensor = var->template GetMutable<LoDTensor>();
-      auto input_map = op->Inputs();
-      std::vector<std::string> in_keys = op->GetInputKeys();
-      PADDLE_MOBILE_ENFORCE(!in_keys.empty(), "this op contains no input");
-      auto input_tensor =
-          GetVarValue<LoDTensor>(in_keys[0], input_map, *(program_.scope));
-      tensor->Resize(input_tensor->dims());
-      tensor->init(typeid(float));
-      op->ChangeNameMap("Out", std::vector<string>{new_name});
-    }
-  }
-}
-
 template <typename Device, typename T>
 void Executor<Device, T>::InjectVariable(const Tensor &t,
                                          std::string var_name) {

@@ -509,18 +463,29 @@ void Executor<Device, T>::FeedData(const Tensor &t) {
 }

 template <typename Device, typename T>
-void Executor<Device, T>::FeedData(const std::vector<Tensor> &v) {
+void Executor<Device, T>::FeedData(const std::vector<void *> &v) {
   auto input_size = v.size();
   PADDLE_MOBILE_ENFORCE(input_size > 0, "Empty input");
-  int counter = 0;
   auto vars = program_.scope->VarContain("feed");
-  for (auto var : vars) {
-    Tensor *feed_tensor = var->template GetMutable<LoDTensor>();
-    feed_tensor->Resize(v[counter].dims());
-    feed_tensor->ShareDataWith(v[counter]);
-    if (++counter > v.size()) {
-      return;
-    }
+  PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
+                        "input data number not correct");
+  for (int i = 0; i < input_size; i++) {
+    auto var = program_.scope->Var("feed", i);
+    auto feed_tensor = var->template GetMutable<LoDTensor>();
+    feed_tensor->external_data = v[i];
   }
 }

+template <typename Device, typename T>
+void Executor<Device, T>::GetResults(std::vector<void *> *v) {
+  auto output_size = v->size();
+  PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output");
+  auto vars = program_.scope->VarContain("fetch");
+  PADDLE_MOBILE_ENFORCE(output_size == vars.size(),
+                        "output data number not correct");
+  for (int i = 0; i < output_size; i++) {
+    auto var = program_.scope->Var("fetch", i);
+    auto fetch_tensor = var->template GetMutable<LoDTensor>();
+    (*v)[i] = fetch_tensor->template data<float>();
+  }
+}
src/framework/executor.h
@@ -50,11 +50,10 @@ class Executor {
   std::shared_ptr<LoDTensor> GetOutput(const std::string &var_name);

 #ifdef PADDLE_MOBILE_FPGA
-  void TalorFeedOp();
-  void TalorFetchdOp();
   void InjectVariable(const Tensor &t, std::string var_name);
   void FeedData(const Tensor &t);
-  void FeedData(const std::vector<Tensor> &v);
+  void FeedData(const std::vector<void *> &v);
+  void GetResults(std::vector<void *> *v);
   std::shared_ptr<Tensor> FetchResult(int id = -1);
   void Predict_From_To(int start = 0, int end = -1);
   void Predict_From(int start);
src/framework/operator.cpp
@@ -50,6 +50,9 @@ OperatorBase<Dtype>::OperatorBase(const std::string &type,
       attrs_(attrs),
       scope_(scope) {
   CheckAllInputOutputSet();
+#ifdef PADDLE_MOBILE_FPGA
+  InsertTensors();
+#endif
 }

 template <typename Dtype>

@@ -133,15 +136,19 @@ void OperatorBase<GPU_CL>::Run() {
 #ifdef PADDLE_MOBILE_FPGA
 template <typename Dtype>
-void OperatorBase<Dtype>::ChangeNameMap(string key, std::vector<string> value) {
-  auto it = inputs_.find(key);
-  if (it != inputs_.end()) {
-    inputs_[key] = value;
-    return;
-  }
-  it = outputs_.find(key);
-  if (it != outputs_.end()) {
-    inputs_[key] = value;
+void OperatorBase<Dtype>::InsertTensors() {
+  static int feed_num = 0;
+  static int fetch_num = 0;
+  if (type_ == "feed") {
+    auto new_name = string("feed") + std::to_string(feed_num++);
+    auto var = scope_->Var(new_name);
+    var->template GetMutable<framework::LoDTensor>();
+    inputs_.at("X") = {string(new_name)};
+  } else if (type_ == "fetch") {
+    auto new_name = string("fetch") + std::to_string(fetch_num++);
+    auto var = scope_->Var(new_name);
+    var->template GetMutable<framework::LoDTensor>();
+    outputs_.at("Out") = {string(new_name)};
   }
 }
 #endif
src/framework/operator.h
@@ -79,6 +79,7 @@ class OperatorBase {
     }
   }
 #ifdef PADDLE_MOBILE_FPGA
+  void InsertTensors();
   void ChangeNameMap(string key, std::vector<string> value);
 #endif

 protected:

@@ -95,6 +96,7 @@ class OperatorBase {
 template <typename Dtype, typename ParamType, typename KernelType>
 class OperatorWithKernel : public OperatorBase<Dtype> {
  public:
+#ifndef PADDLE_MOBILE_FPGA1
   OperatorWithKernel(const std::string &type, const VariableNameMap &inputs,
                      const VariableNameMap &outputs, const AttributeMap &attrs,
                      std::shared_ptr<Scope> scope)

@@ -104,7 +106,25 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
     kernel_.InitCLHelper(scope->GetCLScpoe());
 #endif
   }
+#else
+  OperatorWithKernel(const std::string &type, const VariableNameMap inputs,
+                     const VariableNameMap &outputs, const AttributeMap &attrs,
+                     std::shared_ptr<Scope> scope)
+      : OperatorBase<Dtype>(type, inputs, outputs, attrs, scope) {
+    static int feed_num = 0;
+    static int fetch_num = 0;
+    if (type == "feed") {
+      auto new_name = string("feed") + std::to_string(feed_num++);
+      auto var = scope->Var(new_name);
+      (const_cast<VariableNameMap &>(inputs)).at("X") = {string(new_name)};
+    } else if (type == "fetch") {
+      auto new_name = string("fetch") + std::to_string(fetch_num++);
+      auto var = scope->Var(new_name);
+      (const_cast<VariableNameMap &>(outputs)).at("Out") = {string(new_name)};
+    }
+    param_ = ParamType(inputs, outputs, attrs, *scope);
+  }
+#endif

   virtual void RunImpl() { this->kernel_.Compute(this->param_); }

   virtual void InferShape() const = 0;
src/framework/tensor.h
@@ -202,6 +202,10 @@ class Tensor : public TensorBase {
   inline void reset_data_ptr(void *p) {
     ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p);  // NOLINT
   }
+
+  inline void set_type(std::type_index type) { holder_->set_type(type); }
+  inline void *get_data() {
+    return (void *)(((PlaceholderImpl *)(holder_.get()))->ptr_.get());
+  }  // NOLINT
+
   inline void *init(std::type_index type) {
     if (holder_ != nullptr) {

@@ -217,7 +221,8 @@ class Tensor : public TensorBase {
         reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
   }

-  float scale[2];  // scale[0]= MAX/127.0, scale[1]= 127.0/MAX
+  float scale[2];                 // scale[0]= MAX/127.0, scale[1]= 127.0/MAX
+  void *external_data = nullptr;  // only used for Feed
 #endif
 };
src/io/api_paddle_mobile.cc
@@ -177,6 +177,23 @@ bool PaddleMobilePredictor<Device, T>::Run(
   return true;
 }

+template <typename Device, typename T>
+void PaddleMobilePredictor<Device, T>::FeedData(
+    const std::vector<void *> &inputs) {
+  paddle_mobile_->FeedData(inputs);
+}
+
+template <typename Device, typename T>
+void PaddleMobilePredictor<Device, T>::GetResults(
+    std::vector<void *> *outputs) {
+  paddle_mobile_->GetResults(outputs);
+}
+
+template <typename Device, typename T>
+void PaddleMobilePredictor<Device, T>::Predict_From_To(int start, int end) {
+  paddle_mobile_->Predict_From_To(start, end);
+}
+
 #endif
 template <typename Device, typename T>
 PaddleMobilePredictor<Device, T>::~PaddleMobilePredictor() {
src/io/api_paddle_mobile.h
@@ -35,6 +35,9 @@ class PaddleMobilePredictor : public PaddlePredictor {
   bool Run(const std::vector<PaddleTensor>& inputs,
            std::vector<PaddleTensor>* output_data,
            std::vector<int>* index_data, int batch_size = -1) override;
+  void FeedData(const std::vector<void *>& inputs) override;
+  void GetResults(std::vector<void *>* outputs) override;
+  void Predict_From_To(int start = 0, int end = -1) override;
 #endif
   ~PaddleMobilePredictor() override;
src/io/paddle_inference_api.h
@@ -119,6 +119,9 @@ class PaddlePredictor {
   virtual bool Run(const std::vector<PaddleTensor>& inputs,
                    std::vector<PaddleTensor>* output_data,
                    std::vector<int>* index_data, int batch_size = -1) = 0;
+  virtual void FeedData(const std::vector<void *>& inputs) = 0;
+  virtual void GetResults(std::vector<void *>* outputs) = 0;
+  virtual void Predict_From_To(int start = 0, int end = -1) = 0;
 #endif

  protected:
src/io/paddle_mobile.cpp
@@ -228,10 +228,14 @@ void PaddleMobile<Device, T>::FeedData(const framework::Tensor &t) {
   executor_->FeedData(t);
 }

 template <typename Device, typename T>
-void PaddleMobile<Device, T>::FeedData(const std::vector<framework::Tensor> &v) {
+void PaddleMobile<Device, T>::FeedData(const std::vector<void *> &v) {
   executor_->FeedData(v);
 };

+template <typename Device, typename T>
+void PaddleMobile<Device, T>::GetResults(std::vector<void *> *v) {
+  executor_->GetResults(v);
+}
+
 template <typename Device, typename T>
 std::shared_ptr<framework::Tensor> PaddleMobile<Device, T>::FetchResult(int id) {
src/io/paddle_mobile.h
@@ -90,7 +90,8 @@ class PaddleMobile {
 #ifdef PADDLE_MOBILE_FPGA
   void InjectVariable(const framework::Tensor &t, std::string var_name);
   void FeedData(const framework::Tensor &t);
-  void FeedData(const std::vector<framework::Tensor> &v);
+  void FeedData(const std::vector<void *> &v);
+  void GetResults(std::vector<void *> *v);
   std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
   void Predict_From_To(int start = 0, int end = -1);
   void Predict_From(int start);
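Taken together with the executor and predictor changes above, the FPGA path now feeds and fetches raw buffers. A hedged usage sketch of the new interface (the variable names and the single-input/single-output shape are assumptions for illustration, not part of this commit):

// paddle_mobile is a PaddleMobile<FPGA, float> with a program already loaded.
std::vector<void *> inputs = {image_data};  // one pointer per feed op
paddle_mobile.FeedData(inputs);             // stored as Tensor::external_data
paddle_mobile.Predict_From_To(0, -1);       // run the whole block
std::vector<void *> outputs(1);             // one slot per fetch op
paddle_mobile.GetResults(&outputs);         // filled with float* result buffers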
src/operators/kernel/detection_kernel.h
@@ -103,6 +103,10 @@ class ProposalParam : public OpParam {
   float nms_thresh_;
   float min_size_;
   float eta_;
+#ifdef PADDLE_MOBILE_FPGA
+  std::shared_ptr<Tensor> float_score, float_bbox;
+  fpga::BypassArgs score_arg, bbox_arg;
+#endif
 };

 DECLARE_KERNEL(Proposal, ProposalParam);

@@ -133,6 +137,10 @@ class PSRoiPoolParam : public OpParam {
   int pooled_height_;
   int pooled_width_;
   float spatial_scale_;
+#ifdef PADDLE_MOBILE_FPGA
+  std::shared_ptr<Tensor> float_input, float_output;
+  fpga::BypassArgs input_arg, output_arg;
+#endif
 };

 DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam);
src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp
@@ -23,15 +23,46 @@ namespace operators {
 template <>
 bool AnchorGeneratorKernel<FPGA, float>::Init(
     AnchorGeneratorParam<FPGA> *param) {
-  // TODO zhangyang
+  auto input = param->input_;
+  auto anchors = param->output_anchors_;
+  auto anchor_ptr = anchors->mutable_data<float>();
+  auto stride = param->stride_;
+  auto feature_width = input->dims()[3], feature_height = input->dims()[2];
+  auto stride_width = stride[0], stride_height = stride[1];
+
+  int anchors_offset[] = {-2,   -2,   18,   18,  -10,  -9,   26,   25,  -23,
+                          -20,  39,   36,   -43, -34,  59,   49,   -63, -54,
+                          79,   69,   -96,  -77, 112,  93,   -137, -118, 153,
+                          134,  -204, -188, 220, 204,  -281, -395, 296, 441};
+  int num_anchors = sizeof(anchors_offset) / (sizeof(int) * 4);
+
+  // DLOG << "feature_height: " << feature_height;
+  // DLOG << "feature_width: " << feature_width;
+  // DLOG << "num_anchors: " << num_anchors;
+  // DLOG << "stride_width: " << stride_width;
+  // DLOG << "stride_height: " << stride_height;
+
+  for (int h_idx = 0; h_idx < feature_height; ++h_idx) {
+    for (int w_idx = 0; w_idx < feature_width; ++w_idx) {
+      int offset = h_idx * w_idx * num_anchors * 4;
+      for (int idx = 0; idx < num_anchors; idx++) {
+        anchor_ptr[offset + 0] =
+            anchors_offset[idx * 4 + 0] + w_idx * stride_width;
+        anchor_ptr[offset + 1] =
+            anchors_offset[idx * 4 + 1] + h_idx * stride_height;
+        anchor_ptr[offset + 2] =
+            anchors_offset[idx * 4 + 2] + w_idx * stride_width;
+        anchor_ptr[offset + 3] =
+            anchors_offset[idx * 4 + 3] + h_idx * stride_height;
+      }
+    }
+  }
   return true;
 }

 template <>
 void AnchorGeneratorKernel<FPGA, float>::Compute(
-    const AnchorGeneratorParam<FPGA> &param) {
-  // TODO(hjchen2)
-}
+    const AnchorGeneratorParam<FPGA> &param) {}

 }  // namespace operators
 }  // namespace paddle_mobile
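As a concrete check of the generation loop above (a 16-pixel stride in both directions is an assumed example value; the real values come from param->stride_): for the first offset quadruple (-2, -2, 18, 18) at cell w_idx = 2, h_idx = 1, the emitted anchor is (-2 + 2*16, -2 + 1*16, 18 + 2*16, 18 + 1*16) = (30, 14, 50, 34).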
src/operators/kernel/fpga/V1/concat_kernel.cpp
@@ -38,7 +38,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
     PADDLE_MOBILE_ENFORCE(
         input->dims()[2] == height && input->dims()[3] == width,
         "Image height & width should be unified");
-    images_in[i] = (half *)input->data<float>();  // NOLINT
+    images_in[i] = input->data<half>();
     channel_num[i] = (uint32_t)inputs[i]->dims()[1];  // NOLINT
     scales_in[i] = input->scale;
   }

@@ -48,7 +48,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
   concatArgs.image_num = image_num;
   concatArgs.images_in = images_in;
   concatArgs.scales_in = scales_in;
-  concatArgs.image_out = (half *)out->data<float>();  // NOLINT
+  concatArgs.image_out = out->data<half>();
   concatArgs.scale_out = out->scale;
   concatArgs.channel_num = channel_num;
   concatArgs.height = height;
src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp
@@ -27,10 +27,10 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
   auto *input_x = const_cast<LoDTensor *>(param->InputX());
   auto *input_y = const_cast<LoDTensor *>(param->InputY());
   auto *out = param->Out();
-  auto input_x_ptr = input_x->data<float>();
-  auto input_y_ptr = input_y->data<float>();
+  auto input_x_ptr = input_x->data<half>();
+  auto input_y_ptr = input_y->data<half>();
   fpga::format_fp16_ofm(out);
-  auto out_ptr = out->mutable_data<float>();
+  auto out_ptr = out->mutable_data<half>();

   fpga::EWAddArgs ewaddArgs = {0};
   // ewaddArgs.relu_enabled = relu_enabled;
src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp
@@ -28,10 +28,10 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
   auto *input_x = const_cast<LoDTensor *>(param->InputX());
   auto *input_y = const_cast<LoDTensor *>(param->InputY());
   auto *out = param->Out();
-  auto input_x_ptr = input_x->data<float>();
-  auto input_y_ptr = input_y->data<float>();
+  auto input_x_ptr = input_x->data<half>();
+  auto input_y_ptr = input_y->data<half>();
   fpga::format_fp16_ofm(out);
-  auto out_ptr = out->mutable_data<float>();
+  auto out_ptr = out->mutable_data<half>();

   fpga::EWAddArgs ewaddArgs = {0};
   // ewaddArgs.relu_enabled = relu_enabled;
src/operators/kernel/fpga/V1/feed_kernel.cpp
@@ -19,19 +19,35 @@ namespace operators {
 template <>
 bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
-  Tensor *output = param->Out();
+  auto output = param->Out();
   auto input = const_cast<LoDTensor *>(param->InputX());
   input->init(typeid(float));
   input->Resize(output->dims());
+
+  if (output->dims().size() != 4) {
+    auto input_ptr = input->mutable_data<float>();
+    size_t size = output->numel() * sizeof(float);
+    auto p = fpga::fpga_malloc(size);
+    memcpy(p, input_ptr, size);
+    output->reset_data_ptr(p);
+    return true;
+  }
   fpga::format_fp16_ofm(output);
   return true;
 }

 template <>
 void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
-  auto input =
-      reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX()));
+  auto output = param.Out();
+  auto input = const_cast<LoDTensor *>(param.InputX());
+  if (input->dims().size() != 4) {
+    return;
+  }
   fpga::format_image(input);
   auto input_ptr = input->data<float>();
-  Tensor *output = param.Out();
-  auto output_ptr = output->data<float>();
+  auto output_ptr = output->data<half>();

   fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};

@@ -39,7 +55,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
   args.output_data_type = fpga::DATA_TYPE_FP16;
   args.input_layout_type = fpga::LAYOUT_CHW;
   args.output_layout_type = fpga::LAYOUT_HWC;
-  args.image.address = reinterpret_cast<void *>(input_ptr);
+  args.image.address = input_ptr;
   args.image.channels = (uint32_t)input->dims()[1];
   args.image.height = (uint32_t)input->dims()[2];
   args.image.width = (uint32_t)input->dims()[3];

@@ -48,6 +64,8 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
   args.output.address = output_ptr;
   args.output.scale_address = output->scale;
   fpga::PerformBypass(args);
+
+  input->external_data = nullptr;
 }

 template class FeedKernel<FPGA, float>;
src/operators/kernel/fpga/V1/fetch_kernel.cpp
@@ -19,20 +19,15 @@ namespace operators {
 template <>
 bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
-  Tensor *output = param->Out();
-  // fpga::format_fp16_ofm(output);
-  return true;
-}
-
-template <>
-void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
-  param.Out()->ShareDataWith(*(param.InputX()));
-  /*auto input =
-      reinterpret_cast<Tensor *>(const_cast<Tensor *>(param.InputX()));
-  fpga::format_image(input);
-  auto input_ptr = input->data<float>();
-  Tensor *output = param.Out();
-  auto output_ptr = output->data<float>();
+  auto input = const_cast<Tensor *>(param->InputX());
+  auto output = param->Out();
+  if (input->type() == typeid(float)) {
+    output->ShareDataWith(*input);
+    return true;
+  }
+  output->init(typeid(float));
+  output->Resize(input->dims());
+  fpga::format_fp32_ofm(output);

   fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};

@@ -40,13 +35,28 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
   args.output_data_type = fpga::DATA_TYPE_FP32;
   args.input_layout_type = fpga::LAYOUT_CHW;
   args.output_layout_type = fpga::LAYOUT_HWC;
-  args.image.address = reinterpret_cast<void *>(input_ptr);
-  args.image.channels = (uint32_t)input->dims()[1];
-  args.image.height = (input->dims().size() == 4) ? (uint32_t)input->dims()[2] :
-  1; args.image.width = (input->dims().size() == 4) ? (uint32_t)input->dims()[3]
-  : 1; args.image.pad_height = 0; args.image.pad_width = 0; args.output.address
-  = output_ptr; args.output.scale_address = output->scale;
-  fpga::PerformBypass(args);*/
+  args.image.address = input->data<half>();
+  args.image.channels = (uint32_t)product(input->dims());
+  args.image.height = 1;
+  args.image.width = 1;
+  args.image.pad_height = 0;
+  args.image.pad_width = 0;
+  args.output.address = output->data<float>();
+  args.output.scale_address = output->scale;
+  param->fpga_bypass_args = args;
+  return true;
+}
+
+template <>
+void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
+  auto input = param.InputX();
+  if (input->type() == typeid(float)) {
+    return;
+  }
+  fpga::PerformBypass(param.fpga_bypass_args);
+  // TODO: DEalign: get rid of extra 0
 }

 template class FetchKernel<FPGA, float>;
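The feed and fetch kernels above are mirror images of the same BypassArgs setup; summarising the two configurations from the code (field names as in the diff):

// feed : CHW float32 host input  -> HWC float16 FPGA buffer
//        input_data_type = DATA_TYPE_FP32, output_data_type = DATA_TYPE_FP16
// fetch: flattened float16 result -> float32 host output, with
//        height = width = 1 and channels = product(input->dims())
//        input_data_type = DATA_TYPE_FP16, output_data_type = DATA_TYPE_FP32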
src/operators/kernel/fpga/V1/pool_kernel.cpp
@@ -22,10 +22,10 @@ namespace operators {
 template <>
 bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
   auto *input = const_cast<Tensor *>(param->Input());
-  auto input_ptr = input->data<float>();
+  auto input_ptr = input->data<half>();
   Tensor *output = param->Output();
   fpga::format_fp16_ofm(output);
-  auto output_ptr = output->mutable_data<float>();
+  auto output_ptr = output->mutable_data<half>();
   vector<int> ksize = param->Ksize();
   vector<int> strides = param->Strides();
   vector<int> paddings = param->Paddings();
src/operators/kernel/fpga/V1/proposal_kernel.cpp
(Per the summary above this file is +403 -1: the only removed line is the old "// TODO(hjchen2)" stub in Compute; apart from the pre-existing includes, namespaces and signatures shown for context, everything below is added by this commit.)

@@ -14,20 +14,422 @@ limitations under the License. */
#ifdef PROPOSAL_OP

#include <algorithm>
#include <vector>

#include "operators/kernel/detection_kernel.h"

namespace paddle_mobile {
namespace operators {

static const double kBBoxClipDefault = std::log(1000.0 / 16.0);

template <>
bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
  int post_nms_top_n = param->post_nms_topn_;
  int64_t batch = param->scores_->dims()[0];
  auto total = post_nms_top_n * batch;
  param->rpn_rois_->mutable_data<float>({total, 4});
  param->rpn_probs_->mutable_data<float>({total, 1});

  //  DLOG << *param->rpn_rois_;
  //  DLOG << *param->rpn_probs_;

  param->float_bbox = std::make_shared<Tensor>();
  param->float_bbox->Resize(param->bbox_deltas_->dims());
  param->float_bbox->init(typeid(float));
  fpga::format_fp32_ofm(param->float_bbox.get());
  param->float_score = std::make_shared<Tensor>();
  param->float_score->Resize(param->scores_->dims());
  param->float_score->init(typeid(float));
  fpga::format_fp32_ofm(param->float_score.get());

  auto input = param->bbox_deltas_;
  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
  args.input_layout_type = fpga::LAYOUT_HWC;
  args.output_layout_type = fpga::LAYOUT_HWC;
  args.input_data_type = fpga::DATA_TYPE_FP16;
  args.output_data_type = fpga::DATA_TYPE_FP32;
  args.image.address = input->data<half>();
  args.image.height = (uint32_t)input->dims()[2];
  args.image.width = (uint32_t)input->dims()[3];
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = param->float_bbox->mutable_data<float>();
  args.output.scale_address = param->float_bbox->scale;
  param->bbox_arg = args;

  input = param->scores_;
  args.image.address = input->data<half>();
  args.image.height = (uint32_t)input->dims()[2];
  args.image.width = (uint32_t)input->dims()[3];
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = param->float_score->mutable_data<float>();
  args.output.scale_address = param->float_score->scale;
  param->score_arg = args;

  return true;
}

void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
  auto *out_data = dst->data<void>();
  auto *to_add_data = src.data<void>();
  size_t size_of_t = framework::SizeOfType(src.type());
  offset *= size_of_t;
  std::memcpy(
      reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(out_data) + offset),
      to_add_data, src.numel() * size_of_t);
}

template <class T>
static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
                            Tensor *variances, Tensor *proposals) {
  T *proposals_data = proposals->mutable_data<T>();

  int64_t row = all_anchors->dims()[0];
  int64_t len = all_anchors->dims()[1];

  auto *bbox_deltas_data = bbox_deltas->data<T>();
  auto *anchor_data = all_anchors->data<T>();
  const T *variances_data = nullptr;
  if (variances) {
    variances_data = variances->data<T>();
  }

  for (int64_t i = 0; i < row; ++i) {
    T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0;
    T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0;

    T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width;
    T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height;

    T bbox_center_x = 0, bbox_center_y = 0;
    T bbox_width = 0, bbox_height = 0;

    if (variances) {
      bbox_center_x =
          variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width +
          anchor_center_x;
      bbox_center_y = variances_data[i * len + 1] *
                          bbox_deltas_data[i * len + 1] * anchor_height +
                      anchor_center_y;
      bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
                                            bbox_deltas_data[i * len + 2],
                                        kBBoxClipDefault)) *
                   anchor_width;
      bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
                                             bbox_deltas_data[i * len + 3],
                                         kBBoxClipDefault)) *
                    anchor_height;
    } else {
      bbox_center_x =
          bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
      bbox_center_y =
          bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
      bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
                                        kBBoxClipDefault)) *
                   anchor_width;
      bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
                                         kBBoxClipDefault)) *
                    anchor_height;
    }

    proposals_data[i * len] = bbox_center_x - bbox_width / 2;
    proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
    proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
    proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
  }
  // return proposals;
}

template <class T>
static inline void ClipTiledBoxes(const Tensor &im_info, Tensor *boxes) {
  T *boxes_data = boxes->mutable_data<T>();
  const T *im_info_data = im_info.data<T>();
  T zero(0);
  for (int64_t i = 0; i < boxes->numel(); ++i) {
    if (i % 4 == 0) {
      boxes_data[i] =
          std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
    } else if (i % 4 == 1) {
      boxes_data[i] =
          std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
    } else if (i % 4 == 2) {
      boxes_data[i] =
          std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
    } else {
      boxes_data[i] =
          std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
    }
  }
}

template <class T>
static inline void FilterBoxes(Tensor *boxes, float min_size,
                               const Tensor &im_info, Tensor *keep) {
  const T *im_info_data = im_info.data<T>();
  T *boxes_data = boxes->mutable_data<T>();
  T im_scale = im_info_data[2];
  keep->Resize({boxes->dims()[0]});
  min_size = std::max(min_size, 1.0f);
  int *keep_data = keep->mutable_data<int>();

  int keep_len = 0;
  for (int i = 0; i < boxes->dims()[0]; ++i) {
    T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1;
    T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1;
    T ws_origin_scale =
        (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1;
    T hs_origin_scale =
        (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1;
    T x_ctr = boxes_data[4 * i] + ws / 2;
    T y_ctr = boxes_data[4 * i + 1] + hs / 2;
    if (ws_origin_scale >= min_size && hs_origin_scale >= min_size &&
        x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) {
      keep_data[keep_len++] = i;
    }
  }
  keep->Resize({keep_len});
}

template <class T>
static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
    const std::vector<T> &scores) {
  std::vector<std::pair<T, int>> sorted_indices;
  sorted_indices.reserve(scores.size());
  for (size_t i = 0; i < scores.size(); ++i) {
    sorted_indices.emplace_back(scores[i], i);
  }
  // Sort the score pair according to the scores in descending order
  std::stable_sort(sorted_indices.begin(), sorted_indices.end(),
                   [](const std::pair<T, int> &a, const std::pair<T, int> &b) {
                     return a.first < b.first;
                   });
  return sorted_indices;
}

template <class T>
static inline T BBoxArea(const T *box, bool normalized) {
  if (box[2] < box[0] || box[3] < box[1]) {
    // If coordinate values are is invalid
    // (e.g. xmax < xmin or ymax < ymin), return 0.
    return static_cast<T>(0.);
  } else {
    const T w = box[2] - box[0];
    const T h = box[3] - box[1];
    if (normalized) {
      return w * h;
    } else {
      // If coordinate values are not within range [0, 1].
      return (w + 1) * (h + 1);
    }
  }
}

template <typename T>
static inline Tensor VectorToTensor(const std::vector<T> &selected_indices,
                                    int selected_num) {
  Tensor keep_nms;
  keep_nms.Resize({selected_num});
  auto *keep_data = keep_nms.mutable_data<T>();
  for (int i = 0; i < selected_num; ++i) {
    keep_data[i] = selected_indices[i];
  }
  return keep_nms;
}

template <class T>
static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) {
  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
      box2[3] < box1[1]) {
    return static_cast<T>(0.);
  } else {
    const T inter_xmin = std::max(box1[0], box2[0]);
    const T inter_ymin = std::max(box1[1], box2[1]);
    const T inter_xmax = std::min(box1[2], box2[2]);
    const T inter_ymax = std::min(box1[3], box2[3]);
    const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1);
    const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1);
    const T inter_area = inter_w * inter_h;
    const T bbox1_area = BBoxArea<T>(box1, normalized);
    const T bbox2_area = BBoxArea<T>(box2, normalized);
    return inter_area / (bbox1_area + bbox2_area - inter_area);
  }
}

template <class T>
static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold,
                         float eta) {
  int64_t num_boxes = bbox->dims()[0];
  // 4: [xmin ymin xmax ymax]
  int64_t box_size = bbox->dims()[1];

  std::vector<T> scores_data(num_boxes);
  std::copy_n(scores->data<T>(), num_boxes, scores_data.begin());
  std::vector<std::pair<T, int>> sorted_indices =
      GetSortedScoreIndex<T>(scores_data);

  std::vector<int> selected_indices;
  int selected_num = 0;
  T adaptive_threshold = nms_threshold;
  const T *bbox_data = bbox->data<T>();
  while (sorted_indices.size() != 0) {
    int idx = sorted_indices.back().second;
    bool flag = true;
    for (int kept_idx : selected_indices) {
      if (flag) {
        T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
                                      bbox_data + kept_idx * box_size, false);
        flag = (overlap <= adaptive_threshold);
      } else {
        break;
      }
    }
    if (flag) {
      selected_indices.push_back(idx);
      ++selected_num;
    }
    sorted_indices.erase(sorted_indices.end() - 1);
    if (flag && eta < 1 && adaptive_threshold > 0.5) {
      adaptive_threshold *= eta;
    }
  }
  return VectorToTensor(selected_indices, selected_num);
}

template <typename T>
std::pair<Tensor, Tensor> ProposalForOneImage(
    const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances,
    const Tensor &bbox_deltas_slice,  // [M, 4]
    const Tensor &scores_slice,       // [N, 1]
    int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
    float eta) {
  auto *scores_data = scores_slice.data<T>();

  // Sort index
  Tensor index_t;
  index_t.Resize({scores_slice.numel()});
  int *index = index_t.mutable_data<int>();
  for (int i = 0; i < scores_slice.numel(); ++i) {
    index[i] = i;
  }
  auto compare = [scores_data](const int64_t &i, const int64_t &j) {
    return scores_data[i] > scores_data[j];
  };

  if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) {
    std::sort(index, index + scores_slice.numel(), compare);
  } else {
    std::nth_element(index, index + pre_nms_top_n,
                     index + scores_slice.numel(), compare);
    index_t.Resize({pre_nms_top_n});
  }

  Tensor scores_sel, bbox_sel, anchor_sel, var_sel;
  scores_sel.mutable_data<T>({index_t.numel(), 1});
  bbox_sel.mutable_data<T>({index_t.numel(), 4});
  anchor_sel.mutable_data<T>({index_t.numel(), 4});
  var_sel.mutable_data<T>({index_t.numel(), 4});

  Tensor proposals;
  proposals.mutable_data<T>({index_t.numel(), 4});
  BoxCoder<T>(&anchor_sel, &bbox_sel, &var_sel, &proposals);

  ClipTiledBoxes<T>(im_info_slice, &proposals);

  Tensor keep;
  FilterBoxes<T>(&proposals, min_size, im_info_slice, &keep);

  Tensor scores_filter;
  bbox_sel.mutable_data<T>({keep.numel(), 4});
  scores_filter.mutable_data<T>({keep.numel(), 1});
  if (nms_thresh <= 0) {
    return std::make_pair(bbox_sel, scores_filter);
  }

  Tensor keep_nms = NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta);

  if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
    keep_nms.Resize({post_nms_top_n});
  }

  proposals.mutable_data<T>({keep_nms.numel(), 4});
  scores_sel.mutable_data<T>({keep_nms.numel(), 1});

  return std::make_pair(proposals, scores_sel);
}

template <>
void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
-  // TODO(hjchen2)
  auto score_tensor = param.float_score.get();
  fpga::PerformBypass(param.score_arg);
  fpga::fpga_invalidate(score_tensor->data<float>(),
                        score_tensor->numel() * sizeof(float));

  auto bbox_tensor = param.float_bbox.get();
  fpga::PerformBypass(param.bbox_arg);
  fpga::fpga_invalidate(bbox_tensor->data<float>(),
                        bbox_tensor->numel() * sizeof(float));

  auto *scores = param.float_score.get();
  auto *bbox_deltas = param.float_bbox.get();
  auto *im_info = param.im_info_;
  auto anchors = *param.anchors_;
  auto variances = *param.variances_;

  auto *rpn_rois = param.rpn_rois_;
  auto *rpn_roi_probs = param.rpn_probs_;

  int pre_nms_top_n = param.pre_nms_topn_;
  int post_nms_top_n = param.post_nms_topn_;
  float nms_thresh = param.nms_thresh_;
  float min_size = param.min_size_;
  float eta = param.eta_;

  auto &scores_dim = scores->dims();
  int64_t num = scores_dim[0];
  int64_t c_score = scores_dim[1];
  int64_t h_score = scores_dim[2];
  int64_t w_score = scores_dim[3];

  auto &bbox_dim = bbox_deltas->dims();
  int64_t c_bbox = bbox_dim[1];
  int64_t h_bbox = bbox_dim[2];
  int64_t w_bbox = bbox_dim[3];

  //
  Tensor bbox_deltas_swap, scores_swap;
  bbox_deltas_swap.mutable_data<float>({num, h_bbox, w_bbox, c_bbox});
  scores_swap.mutable_data<float>({num, h_score, w_score, c_score});

  framework::LoD lod;
  lod.resize(1);
  auto &lod0 = lod[0];
  lod0.push_back(0);
  anchors.Resize({anchors.numel() / 4, 4});

  int64_t num_proposals = 0;
  for (int64_t i = 0; i < num; ++i) {
    Tensor im_info_slice = im_info->Slice(i, i + 1);
    Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
    Tensor scores_slice = scores_swap.Slice(i, i + 1);

    bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
    scores_slice.Resize({h_score * w_score * c_score, 1});

    std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage<float>(
        im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice,
        pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
    Tensor &proposals = tensor_pair.first;
    Tensor &scores = tensor_pair.second;

    AppendProposals(rpn_rois, 4 * num_proposals, proposals);
    AppendProposals(rpn_roi_probs, num_proposals, scores);
    num_proposals += proposals.dims()[0];
    lod0.push_back(num_proposals);
  }

  rpn_rois->set_lod(lod);
  rpn_roi_probs->set_lod(lod);
  rpn_rois->Resize({num_proposals, 4});
  rpn_roi_probs->Resize({num_proposals, 1});
}

}  // namespace operators
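For reference, the decoding performed by BoxCoder above (no-variance branch) is the usual R-CNN box parameterisation. Writing the anchor as centre/size \((x_a, y_a, w_a, h_a)\) and the predicted deltas as \((d_x, d_y, d_w, d_h)\):

\[ x_c = d_x w_a + x_a, \quad y_c = d_y h_a + y_a, \quad w = w_a \exp(\min(d_w, \ln(1000/16))), \quad h = h_a \exp(\min(d_h, \ln(1000/16))), \]

and the proposal corners are \(x_1 = x_c - w/2\), \(y_1 = y_c - h/2\), \(x_2 = x_c + w/2 - 1\), \(y_2 = y_c + h/2 - 1\), matching the proposals_data assignments; kBBoxClipDefault is the \(\ln(1000/16)\) clamp applied to \(d_w\) and \(d_h\) before exponentiation. NMS then keeps a candidate only while its JaccardOverlap (IoU) with every already-kept box stays at or below the adaptive threshold.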
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
浏览文件 @
16927084
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#ifdef PSROI_POOL_OP
#include <cmath>
#include <vector>
#include "operators/kernel/detection_kernel.h"
...
...
@@ -21,13 +22,180 @@ namespace paddle_mobile {
namespace
operators
{
template
<
>
bool
PSRoiPoolKernel
<
FPGA
,
float
>::
Init
(
PSRoiPoolParam
<
FPGA
>
*
param
)
{
bool
PSRoiPoolKernel
<
FPGA
,
float
>::
Init
(
PSRoiPoolParam
<
FPGA
>*
param
)
{
auto
dims
=
param
->
input_x_
->
dims
();
PADDLE_MOBILE_ENFORCE
(
dims
[
1
]
*
dims
[
3
]
%
IMAGE_ALIGNMENT
==
0
,
"data not aligned"
);
param
->
float_input
=
std
::
make_shared
<
Tensor
>
();
param
->
float_input
->
mutable_data
<
float
>
(
param
->
input_x_
->
dims
());
param
->
float_output
=
std
::
make_shared
<
Tensor
>
();
param
->
float_output
->
mutable_data
<
float
>
(
param
->
output_
->
dims
());
auto
input
=
param
->
input_x_
;
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP16
};
args
.
input_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
output_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
input_data_type
=
fpga
::
DATA_TYPE_FP16
;
args
.
output_data_type
=
fpga
::
DATA_TYPE_FP32
;
args
.
image
.
address
=
input
->
data
<
half
>
();
args
.
image
.
height
=
      (uint32_t)input->dims()[2];
  args.image.width = (uint32_t)input->dims()[3];
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = param->float_input->mutable_data<float>();
  args.output.scale_address = param->float_input->scale;
  param->input_arg = args;

  fpga::format_fp16_ofm(param->output_);

  input = param->float_output.get();
  args.input_data_type = fpga::DATA_TYPE_FP32;
  args.output_data_type = fpga::DATA_TYPE_FP16;
  args.image.address = input->data<float>();
  args.image.height = (uint32_t)input->dims()[2];
  args.image.width = (uint32_t)input->dims()[3];
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = param->output_->mutable_data<half>();
  args.output.scale_address = param->output_->scale;
  param->input_arg = args;

  return true;
}

template <>
void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA> &param) {
  // TODO(hjchen2)
void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
  auto input_tensor = param.float_input.get();
  fpga::PerformBypass(param.input_arg);
  fpga::fpga_invalidate(input_tensor->data<float>(),
                        input_tensor->numel() * sizeof(float));

  auto* in = input_tensor;
  auto* rois = param.input_rois_;
  auto* out = param.float_output.get();

  auto pooled_height = param.pooled_height_;
  auto pooled_width = param.pooled_width_;
  auto spatial_scale = param.spatial_scale_;
  auto output_channels = param.output_channels_;

  auto in_dims = in->dims();
  int batch_size = in_dims[0];
  int input_channels = in_dims[1];
  int height = in_dims[2];
  int width = in_dims[3];
  int rois_num = rois->dims()[0];

  // TODO auto in_stride = framework::stride(in_dims);
  // TODO auto out_stride = framework::stride(out->dims());
  auto in_stride =
      framework::stride({batch_size, height, width, input_channels});
  auto out_stride = framework::stride(
      {out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]});

  const float* input_data = in->data<float>();
  framework::Tensor rois_batch_id_list;
  rois_batch_id_list.Resize({rois_num});
  auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
  return;

  PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty");

  auto rois_lod = rois->lod().back();
  int rois_batch_size = rois_lod.size() - 1;
  PADDLE_MOBILE_ENFORCE(
      rois_batch_size == batch_size,
      "the rois_batch_size and input(X) batch_size should be the same.");
  int rois_num_with_lod = rois_lod[rois_batch_size];
  PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num,
                        "the rois_num from input and lod must be the same");

  PADDLE_MOBILE_ENFORCE(
      input_channels == output_channels * pooled_height * pooled_width,
      "the channels of input X should equal the product of "
      "output_channels x pooled_height x pooled_width");

  // calculate batch id index for each roi according to LoD
  for (int n = 0; n < rois_batch_size; ++n) {
    for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
      rois_batch_id_data[i] = n;
    }
  }
  auto output_data = out->mutable_data<float>();
  auto input_rois = rois->data<float>();

  // calculate psroipooling, parallel processing can be implemented per ROI
  for (int n = 0; n < rois_num; ++n) {
    // set roi batch id
    int roi_batch_id = rois_batch_id_data[n];

    // [start, end) interval for spatial sampling
    auto offset_input_rois = input_rois + n * 4;
    auto roi_start_w =
        static_cast<float>(round(offset_input_rois[0])) * spatial_scale;
    auto roi_start_h =
        static_cast<float>(round(offset_input_rois[1])) * spatial_scale;
    auto roi_end_w =
        static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale;
    auto roi_end_h =
        static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale;

    // Force too small rois to be 1 x 1
    auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f);  // avoid 0
    auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f);

    // Compute bin size w and h at input feature map
    auto bin_size_h = roi_height / static_cast<float>(pooled_height);
    auto bin_size_w = roi_width / static_cast<float>(pooled_width);
    DLOG << 3;

    // calculate each pixel of the output feature map.
    int out_roi_offset = n * out_stride[0];
    for (int c = 0; c < output_channels; ++c) {
      // per category
      // int out_plane_offset = out_roi_offset + c * out_stride[1];
      int out_plane_offset = out_roi_offset + c;
      for (int ph = 0; ph < pooled_height; ++ph) {
        // TODO int out_row_offset = out_plane_offset + ph * out_stride[2];
        int out_row_offset = out_plane_offset + ph * out_stride[1];
        for (int pw = 0; pw < pooled_width; ++pw) {
          // calculate w and h at input feature map
          int hstart = floor(static_cast<float>(ph) * bin_size_h + roi_start_h);
          int wstart = floor(static_cast<float>(pw) * bin_size_w + roi_start_w);
          int hend = ceil(static_cast<float>(ph + 1) * bin_size_h + roi_start_h);
          int wend = ceil(static_cast<float>(pw + 1) * bin_size_w + roi_start_w);
          // Add roi offsets and clip to input boundaries
          hstart = std::min(std::max(hstart, 0), height);
          wstart = std::min(std::max(wstart, 0), width);
          hend = std::min(std::max(hend, 0), height);
          wend = std::min(std::max(wend, 0), width);

          // TODO int output_index = out_row_offset + pw;
          int output_index = out_row_offset + pw * output_channels;
          int input_channel = (c * pooled_height + ph) * pooled_width + pw;
          // TODO int input_plane_offset =
          // TODO     roi_batch_id * in_stride[0] + input_channel * in_stride[1];
          int input_plane_offset = roi_batch_id * in_stride[0] + input_channel;
          auto offset_input_data = input_data + input_plane_offset;
          float out_sum = 0.;
          bool is_empty = (hend <= hstart) || (wend <= wstart);
          for (int ih = hstart; ih < hend; ++ih) {
            for (int iw = wstart; iw < wend; ++iw) {
              int input_index = ih * in_stride[1] + iw * input_channel;
              out_sum += offset_input_data[input_index];
            }
          }
          float bin_area = (hend - hstart) * (wend - wstart);
          output_data[output_index] = is_empty ? 0. : out_sum / bin_area;
        }
      }
    }
  }
  fpga::format_image(out);
  fpga::PerformBypass(param.output_arg);
}

}  // namespace operators
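The kernel above walks the feature map in NHWC order, with in_stride built from {batch, height, width, channels}. A minimal standalone sketch of averaging one pooling bin over an NHWC float buffer follows; the helper and its names are illustrative only, not part of the commit, and it shows the layout the strides encode rather than the kernel's exact index expressions.

// Illustrative sketch (not part of the commit): average one PSROI bin over an
// NHWC float buffer with N == 1. All names here are hypothetical.
#include <algorithm>
#include <vector>

float AverageBinNHWC(const std::vector<float> &feature,  // H * W * C elements
                     int height, int width, int channels,
                     int hstart, int hend, int wstart, int wend, int channel) {
  // Clip the bin to the feature map, as the kernel does with std::min/std::max.
  hstart = std::min(std::max(hstart, 0), height);
  hend = std::min(std::max(hend, 0), height);
  wstart = std::min(std::max(wstart, 0), width);
  wend = std::min(std::max(wend, 0), width);
  if (hend <= hstart || wend <= wstart) return 0.f;  // empty bin

  float sum = 0.f;
  for (int h = hstart; h < hend; ++h) {
    for (int w = wstart; w < wend; ++w) {
      // NHWC layout: index = (h * width + w) * channels + channel.
      sum += feature[(h * width + w) * channels + channel];
    }
  }
  return sum / ((hend - hstart) * (wend - wstart));
}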
src/operators/kernel/fpga/V1/reshape2_kernel.cpp
@@ -15,18 +15,61 @@ limitations under the License. */

#ifdef RESHAPE2_OP

#include "operators/kernel/reshape2_kernel.h"
#include "framework/ddim.h"

namespace paddle_mobile {
namespace operators {

template <>
bool Reshape2Kernel<FPGA, float>::Init(Reshape2Param<FPGA> *param) {
  auto input = const_cast<LoDTensor *>(param->InputX());
  auto output = param->Out();
  auto shape = param->Shape();

  output->ShareDataWith(*input);

  auto num_in = framework::product(input->dims());
  auto num_shape = framework::product(framework::make_ddim(shape));
  PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported");

  for (int i = 0; i < shape.size(); i++) {
    if (shape[i] == -1) {
      shape[i] = static_cast<int>(-num_in / num_shape);
      break;
    }
  }
  output->Resize(framework::make_ddim(shape));
  DLOG << "input: " << input;
  DLOG << "output: " << output;

  return true;
}

template <>
void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
  return;
  auto input = const_cast<LoDTensor *>(param.InputX());
  auto output = param.Out();
  auto shape = param.Shape();

  if (output->type() != typeid(half)) {
    DLOG << "wrong type";
  }

  auto num_in = framework::product(input->dims());
  auto num_shape = framework::product(framework::make_ddim(shape));
  PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported");

  for (int i = 0; i < shape.size(); i++) {
    if (shape[i] == -1) {
      shape[i] = static_cast<int>(-num_in / num_shape);
      break;
    }
  }
  output->Resize(framework::make_ddim(shape));
  if (output->type() != typeid(half)) {
    DLOG << "wrong type";
    DLOG << output;
  }
  //
}

}  // namespace operators
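Both Init and Compute above resolve a single -1 entry in the target shape from the total element count: the product of the requested dims is negative because of the -1, so -num_in / num_shape gives the missing extent. A minimal sketch under that assumption (the helper name is hypothetical, not part of the commit):

#include <cstdint>
#include <vector>

std::vector<int> InferShape(std::vector<int> shape, int64_t num_in) {
  int64_t num_shape = 1;
  for (int d : shape) num_shape *= d;  // the single -1 makes this negative
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == -1) {
      shape[i] = static_cast<int>(-num_in / num_shape);
      break;
    }
  }
  return shape;
}
// Example: InferShape({-1, 7}, 21) resolves the first entry to 3.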
src/operators/kernel/fpga/V1/sigmoid_kernel.cpp
@@ -25,7 +25,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
      paddle_mobile::fpga::SIGMOID;
  int16_t leaky_relu_negative_slope = 0;
  auto input = const_cast<Tensor *>(param->InputX());
  auto input_ptr = input->data<float>();
  auto input_ptr = input->data<half>();
  auto out = param->Out();
  fpga::format_fp16_ofm(out);
...
@@ -38,7 +38,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
  args.image.width =
      (input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = out->data<float>();
  args.output.address = out->data<half>();
  args.output.scale_address = out->scale;
  args.output.activation.activation_type = activation_enable;
  args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope;
...
src/operators/kernel/fpga/V1/slice_kernel.cpp
@@ -21,10 +21,37 @@ namespace operators {

template <>
bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
  auto output = param->output_;
  fpga::format_fp16_ofm(output);
  DLOG << "input: " << param->input_;
  DLOG << "output: " << param->output_;
  if (param->input_->type() != typeid(half)) {
    DLOG << "wrong type";
  }
  return true;
}

template <>
void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {}
void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
  // Only support slicing in channel dimension
  auto input = param.input_;
  DLOG << input;
  int HW = input->dims()[2] * input->dims()[3];
  int channel = input->dims()[1];
  auto input_ptr = input->data<half>();
  auto output_ptr = param.output_->data<half>();
  int start = param.starts_[0], end = param.ends_[0];
  start = start < 0 ? start + channel : start;
  end = end < 0 ? end + channel : end;
  start = start > channel ? channel : start;
  end = end > channel ? channel : end;
  int len = end - start;
  for (int i = 0; i < HW; i++) {
    memcpy(output_ptr + len * i, input_ptr + i * channel + start, len);
  }
}

}  // namespace operators
}  // namespace paddle_mobile
#endif
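The Compute above slices along the channel axis of fp16 data stored channel-last (NHWC), copying one contiguous run of channels per spatial position. A minimal sketch of that access pattern, with the copy length written out in bytes; the type alias and names are placeholders, not the project's half type:

#include <cstdint>
#include <cstring>

using half = uint16_t;  // placeholder for a 2-byte fp16 storage type

void SliceChannels(const half *in, half *out, int hw, int channel,
                   int start, int end) {
  const int len = end - start;  // number of channels kept
  for (int i = 0; i < hw; ++i) {
    // Copy len fp16 elements, i.e. len * sizeof(half) bytes, per (h, w) spot.
    std::memcpy(out + i * len, in + i * channel + start, len * sizeof(half));
  }
}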
src/operators/kernel/fpga/V1/softmax_kernel.cpp
@@ -23,49 +23,72 @@ namespace operators {

template <>
bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
  auto input = const_cast<LoDTensor *>(param->InputX());
  auto input_ptr = input->data<float>();
  auto input_ptr = input->data<half>();
  auto out = param->Out();
  fpga::format_fp32_ofm(out);

  auto float_input = new Tensor;
  if (input->dims().size() == 2) {
    float_input->mutable_data<float>({1, input->dims()[1]});
  } else if (input->dims().size() == 4) {
    float_input->mutable_data<float>(
        {1, input->dims()[2], input->dims()[3], input->dims()[1]});
  } else {
    DLOG << "wrong dimension of softmax input";
  PADDLE_MOBILE_ENFORCE(input->dims().size() == 4,
                        "Softmax should have 4-order input");
  auto dims = framework::vectorize(input->dims());
  auto channel = dims[3];
  if (channel == 1) {  // This input is generated by FC op, dims = [N C 1 1]
    PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op");
    dims[3] = dims[1];
    dims[1] = 1;
  }
  input->Resize(framework::make_ddim(dims));
  float_input->Resize(framework::make_ddim(dims));

  if (channel != 2) {  // Use CPU
    float_input->init(typeid(float));
    fpga::format_fp32_ofm(float_input);
    fpga::format_fp32_ofm(out);
    fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
    args.input_layout_type = fpga::LAYOUT_HWC;
    args.output_layout_type = fpga::LAYOUT_CHW;
    args.input_data_type = fpga::DATA_TYPE_FP16;
    args.output_data_type = fpga::DATA_TYPE_FP32;
    args.image.address = input_ptr;
    args.image.height = (uint32_t)dims[1];
    args.image.width = (uint32_t)dims[2];
    args.image.channels = (uint32_t)dims[3];
    args.output.address = float_input->data<float>();
    args.output.scale_address = float_input->scale;
    param->SetFloatInput(float_input);
    param->SetFpgaArgs(args);
  } else {  // Use FPGA
    fpga::format_fp16_ofm(out);
    fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
    args.input_layout_type = fpga::LAYOUT_HWC;
    args.output_layout_type = fpga::LAYOUT_CHW;
    args.input_data_type = fpga::DATA_TYPE_FP16;
    args.output_data_type = fpga::DATA_TYPE_FP16;
    args.image.address = input_ptr;
    args.image.height = (uint32_t)input->dims()[1];
    args.image.width = (uint32_t)input->dims()[2];
    args.image.channels = (uint32_t)input->dims()[3];
    args.output.address = out->data<half>();
    args.output.scale_address = out->scale;
    args.output.activation.activation_type = fpga::SOFTMAX;
    param->SetFpgaArgs(args);
  }
  fpga::format_fp32_ofm(float_input);
  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
  args.input_layout_type = fpga::LAYOUT_HWC;
  args.output_layout_type = fpga::LAYOUT_CHW;
  args.input_data_type = fpga::DATA_TYPE_FP16;
  args.output_data_type = fpga::DATA_TYPE_FP32;
  args.image.address = input_ptr;
  args.image.height =
      (input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1;
  args.image.width =
      (input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = float_input->data<float>();
  args.output.scale_address = float_input->scale;
  param->SetFloatInput(float_input);
  param->SetFpgaArgs(args);

  return true;
}

template <>
void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
  Tensor *in_x = param.FloatInput();
  Tensor *out = param.Out();

  fpga::PerformBypass(param.FpgaArgs());
  fpga::fpga_invalidate((void *)in_x->data<float>(),  // NOLINT
                        in_x->numel() * sizeof(float));
  // TODO: In general case, 0 should be squeezed before softmax input // NOLINT
  math::SoftmaxFuntor<CPU, float>()(in_x, out);
  fpga::fpga_flush(out->data<float>(), out->memory_size());
  if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
    Tensor *out = param.Out();
    Tensor *in_x = param.FloatInput();
    fpga::fpga_invalidate(in_x->data<float>(), in_x->numel() * sizeof(float));
    math::SoftmaxFuntor<CPU, float>()(in_x, out);
    fpga::fpga_flush(out->data<float>(), out->memory_size());
  }
}

}  // namespace operators
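When the softmax input comes from an FC op its dims arrive as [N, C, 1, 1], and Init above swaps C into the channel-last slot before choosing between the CPU and FPGA paths. A minimal sketch of that dims shuffle (the helper is hypothetical, not part of the commit):

#include <vector>

std::vector<int> FcDimsToChannelLast(std::vector<int> dims) {  // {N, C, H, W}
  if (dims[3] == 1 && dims[2] == 1) {
    dims[3] = dims[1];  // move C to the last (channel) axis
    dims[1] = 1;
  }
  return dims;
}
// Example: {1, 1000, 1, 1} becomes {1, 1, 1, 1000}.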
src/operators/kernel/fpga/V1/split_kernel.cpp
@@ -34,16 +34,18 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
      fpga::fpga_malloc(image_num * sizeof(float *)));
  auto out_channels = reinterpret_cast<uint32_t *>(
      fpga::fpga_malloc(image_num * sizeof(uint32_t)));
  DLOG << "input: " << in;
  for (int i = 0; i < image_num; i++) {
    fpga::format_fp16_ofm(outs[i]);
    images_out[i] = outs[i]->mutable_data<float>();
    DLOG << "output: " << outs[i];
    images_out[i] = outs[i]->mutable_data<half>();
    scales_out[i] = outs[i]->scale;
    out_channels[i] = (uint32_t)sections[i];
  }

  fpga::SplitArgs arg = {0};
  arg.image_num = image_num;
  arg.image_in = (half *)in->data<float>();
  arg.image_in = in->data<half>();
  arg.scale_in = in->scale;
  arg.images_out = images_out;
  arg.scales_out = scales_out;
...
src/operators/kernel/fpga/V1/tanh_kernel.cpp
@@ -22,8 +22,10 @@ namespace operators {

template <>
bool TanhKernel<FPGA, float>::Init(TanhParam<FPGA> *param) {
  auto input = const_cast<Tensor *>(param->InputX());
  auto input_ptr = input->data<float>();
  DLOG << "input: " << input;
  auto input_ptr = input->data<half>();
  auto float_input = new Tensor;
  float_input->mutable_data<float>(
      {1, input->dims()[1], input->dims()[2], input->dims()[3]});
  fpga::format_fp32_ofm(float_input);
...
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
@@ -20,7 +20,21 @@ namespace operators {

template <>
bool Transpose2Kernel<FPGA, float>::Init(Transpose2Param<FPGA> *param) {
  param->Out()->ShareDataWith(*param->InputX());
  auto input = param->InputX();
  auto output = param->Out();
  auto axis = param->Axis();
  auto dim = input->dims();
  output->ShareDataWith(*input);

  auto dim_v = vectorize(dim);
  for (int i = 0; i < axis.size(); i++) {
    dim_v[i] = dim[axis[i]];
  }
  output->Resize(framework::make_ddim(dim_v));

  DLOG << "input: " << input;
  DLOG << "output: " << output;
  return true;
}
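Init above only fixes up the output dims: each output dimension i takes the extent of input dimension axis[i]. A minimal sketch of that permutation (the helper is hypothetical, not part of the commit):

#include <vector>

std::vector<int> PermuteDims(const std::vector<int> &dims,
                             const std::vector<int> &axis) {
  std::vector<int> out(dims);
  for (size_t i = 0; i < axis.size(); ++i) {
    out[i] = dims[axis[i]];  // output dim i takes the size of input dim axis[i]
  }
  return out;
}
// Example: PermuteDims({1, 8, 4, 6}, {0, 2, 3, 1}) yields {1, 4, 6, 8}.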
src/operators/op_param.h
@@ -1172,6 +1172,12 @@ class FeedParam : public OpParam {
 public:
  FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
            const AttributeMap &attrs, const Scope &scope) {
#ifdef PADDLE_MOBILE_FPGA
    static int feed_num = 0;
    auto new_name = std::string("feed") + std::to_string(feed_num++);
    const_cast<VariableNameMap &>(inputs).at("X") = {string(new_name)};
#endif
    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
    out_ = OutFrom<GType>(outputs, scope);
    auto var = scope.FindVar("batch_size");
...
@@ -1195,6 +1201,11 @@ class FetchParam : public OpParam {
 public:
  FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
             const AttributeMap &attrs, const Scope &scope) {
#ifdef PADDLE_MOBILE_FPGA
    static int fetch_num = 0;
    auto new_name = std::string("fetch") + std::to_string(fetch_num++);
    const_cast<VariableNameMap &>(outputs).at("Out") = {string(new_name)};
#endif
    input_x_ = InputXFrom<GType>(inputs, scope);
    out_ = OutFrom(outputs, scope);
  }
...
@@ -1210,18 +1221,9 @@ class FetchParam : public OpParam {
  RType *input_x_;
  Tensor *out_;
#ifdef PADDLE_MOBILE_FPGA

 private:
  std::shared_ptr<RType> float_input_x_;

 public:
  fpga::BypassArgs fpga_bypass_args;

 public:
  RType *FloatInput() const {
    return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
  }
  void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
  const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
  void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
#endif
};
test/fpga/test_resnet50.cpp
@@ -51,8 +51,8 @@ void convert_to_chw(int16_t **data_in, int channel, int height, int width,
  }
}

void dump(std::string filename, const Tensor input_tensor) {
  auto dataptr = input_tensor.data<float>();
void dump(std::string filename, Tensor input_tensor) {
  auto dataptr = reinterpret_cast<half *>(input_tensor.get_data());
  std::ofstream out(filename.c_str());
  float result = 0;
  for (int i = 0; i < input_tensor.numel(); ++i) {
...
@@ -61,12 +61,11 @@ void dump(std::string filename, const Tensor input_tensor) {
  }
  out.close();
}

void dump_stride(std::string filename, const Tensor input_tensor,
                 const int dumpnum) {
void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum) {
  int c = (input_tensor.dims())[1];
  int h = (input_tensor.dims())[2];
  int w = (input_tensor.dims())[3];
  auto data_ptr = input_tensor.data<float>();
  auto data_ptr = input_tensor.get_data();
  int16_t *data_tmp = (int16_t *)malloc(c * h * w * sizeof(int16_t));
  int16_t *data_ptr_16 = (int16_t *)data_ptr;
  convert_to_chw(&data_ptr_16, c, h, w, data_tmp);
...
@@ -98,9 +97,9 @@ int main() {
  for (int i = 0; i < 73; i++) {
    auto tensor_ptr = paddle_mobile.FetchResult(i);
    std::string saveName = "resnet50_result_" + std::to_string(i);
    paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).data<float>(),
    paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(),
                                         tensor_ptr->numel() * sizeof(half));
    dump_stride(saveName, (*tensor_ptr), 20);
    //    dump_stride(saveName, (*tensor_ptr), 20);
    //    dump(saveName, (*tensor_ptr));
  }
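dump_stride above hands the fetched int16 data to convert_to_chw before writing it out, i.e. it reorders the FPGA's HWC layout into CHW. A generic sketch of such a reordering over a plain int16 buffer follows; it is not the project's convert_to_chw, whose int16_t ** signature is only partially visible in the hunk, and all names are hypothetical:

#include <cstdint>

void HwcToChw(const int16_t *src, int16_t *dst, int channel, int height,
              int width) {
  for (int c = 0; c < channel; ++c) {
    for (int h = 0; h < height; ++h) {
      for (int w = 0; w < width; ++w) {
        // source is HWC: (h * width + w) * channel + c
        // destination is CHW: (c * height + h) * width + w
        dst[(c * height + h) * width + w] = src[(h * width + w) * channel + c];
      }
    }
  }
}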
test/fpga/test_rfcn.cpp
@@ -23,29 +23,38 @@ limitations under the License. */
#include "fpga/V2/api.h"
#endif

// static const char *g_densebox_combine = "../models/densebox";
static const char *g_densebox_combine = "../models/rfcn";

void readStream(std::string filename, uint8_t *buf) {
  std::ifstream in;
  in.open(filename, std::ios::in);
  if (!in.is_open()) {
    std::cout << "open File Failed." << std::endl;
    return;
  }
  int i = 0;
  while (!in.eof()) {
    in >> buf[i];
    i++;
  }
  in.close();
}

static const char *g_rfcn_combine = "../models/rfcn";
const std::string g_image_src_float = "../models/rfcn/data.bin";

int main() {
  paddle_mobile::fpga::open_device();
  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
  //  paddle_mobile.SetThreadNum(4);
  if (paddle_mobile.Load(std::string(g_densebox_combine) + "/model",
                         std::string(g_densebox_combine) + "/params", true,
                         false, 1, true)) {
    //    std::vector<float> input;
    //    std::vector<int64_t> dims{1, 3, 512, 1024};
    //    GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
    //    auto vec_result = paddle_mobile.Predict(input, dims);
    return 0;
    Tensor input_tensor;
    SetupTensor<float>(&input_tensor, {1, 3, 512, 1024}, static_cast<float>(0),
                       static_cast<float>(1));
    //    readStream(g_image_src_float,
    //               input_tensor.mutable_data<float>({1, 3, 224, 224}));
    paddle_mobile.FeedData(input_tensor);
  if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model",
                         std::string(g_rfcn_combine) + "/params", true, false,
                         1, true)) {
    float img_info[3] = {768, 1536, 768.0f / 960.0f};
    auto img = fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float));
    readStream(g_image_src_float, reinterpret_cast<uint8_t *>(img));
    std::vector<void *> v(3, nullptr);
    paddle_mobile.FeedData({img_info, img});
    paddle_mobile.Predict_To(-1);
    paddle_mobile.GetResults(&v);
    DLOG << "Computation done";
  }
  return 0;
...