Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
14b944f3
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
332
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
14b944f3
编写于
12月 10, 2018
作者:
H
Houjiang Chen
提交者:
GitHub
12月 10, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into dev-latest
上级
f7eb7352
344c1df7
变更
17
隐藏空白更改
内联
并排
Showing
17 changed file
with
504 addition
and
266 deletion
+504
-266
src/fpga/V1/api.cpp
src/fpga/V1/api.cpp
+208
-40
src/fpga/V1/bias_scale.h
src/fpga/V1/bias_scale.h
+0
-2
src/fpga/V1/deconv_bias_scale.h
src/fpga/V1/deconv_bias_scale.h
+0
-2
src/fpga/V1/filter.h
src/fpga/V1/filter.h
+0
-3
src/fpga/V1/image.cpp
src/fpga/V1/image.cpp
+21
-9
src/fpga/V1/image.h
src/fpga/V1/image.h
+9
-7
src/fpga/V1/pe.cpp
src/fpga/V1/pe.cpp
+57
-110
src/fpga/common/driver.cpp
src/fpga/common/driver.cpp
+0
-4
src/fpga/common/driver.h
src/fpga/common/driver.h
+0
-7
src/fpga/common/fpga_common.h
src/fpga/common/fpga_common.h
+66
-1
src/operators/kernel/arm/concat_kernel.cpp
src/operators/kernel/arm/concat_kernel.cpp
+5
-1
src/operators/kernel/central-arm-func/concat_arm_func.h
src/operators/kernel/central-arm-func/concat_arm_func.h
+7
-7
src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp
src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp
+1
-0
src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp
src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp
+1
-0
src/operators/kernel/fpga/V1/softmax_kernel.cpp
src/operators/kernel/fpga/V1/softmax_kernel.cpp
+10
-6
test/operators/test_concat_op.cpp
test/operators/test_concat_op.cpp
+116
-67
test/operators/test_fusion_fc_op.cpp
test/operators/test_fusion_fc_op.cpp
+3
-0
未找到文件。
src/fpga/V1/api.cpp
浏览文件 @
14b944f3
...
...
@@ -21,6 +21,9 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
fpga
{
#define USE_RELU 1
#define USE_BIAS 2
int
get_align_image_cw
(
int
cw
)
{
return
align_to_x
(
cw
,
IMAGE_ALIGNMENT
);
}
void
format_image
(
framework
::
Tensor
*
image_tensor
)
{
...
...
@@ -172,6 +175,170 @@ void format_concat_output(framework::Tensor *out, int height, int width,
out
->
reset_data_ptr
(
data_ptr
);
}
void
expand_conv_arg
(
ConvArgs
*
arg
)
{
ConvArgs
args
=
*
arg
;
uint64_t
filterlen
=
(
uint64_t
)
args
.
kernel
.
width
*
(
uint64_t
)
args
.
kernel
.
height
*
(
uint64_t
)
args
.
image
.
channels
;
filterlen
=
align_to_x
(
filterlen
,
FILTER_ELEMENT_ALIGNMENT
);
filterlen
*=
align_to_x
((
uint64_t
)
args
.
filter_num
,
FILTER_NUM_ALIGNMENT
);
uint64_t
fpga_bias_scale_len
=
align_to_x
(
args
.
filter_num
/
args
.
group_num
,
8
)
*
args
.
group_num
;
uint64_t
output_height
=
(
args
.
image
.
height
+
args
.
image
.
pad_height
*
2
-
args
.
kernel
.
height
)
/
args
.
kernel
.
stride_h
+
1
;
uint64_t
output_width
=
(
args
.
image
.
width
+
args
.
image
.
pad_width
*
2
-
args
.
kernel
.
width
)
/
args
.
kernel
.
stride_w
+
1
;
uint64_t
output_size
=
output_height
*
output_width
*
(
uint64_t
)
args
.
filter_num
;
auto
filter_per_group
=
(
uint64_t
)(
args
.
filter_num
/
args
.
group_num
);
auto
channel_per_group
=
(
uint64_t
)(
args
.
image
.
channels
/
args
.
group_num
);
uint64_t
image_row_count
=
((
uint64_t
)
args
.
image
.
width
)
*
((
uint64_t
)
args
.
image
.
channels
);
// without align
uint64_t
image_amount_per_row
=
align_to_x
(
image_row_count
,
IMAGE_ALIGNMENT
);
uint64_t
image_one_pad_per_row
=
align_to_x
(
image_row_count
,
IMAGE_ALIGNMENT
)
+
((
uint64_t
)
args
.
image
.
pad_width
)
*
((
uint64_t
)
args
.
image
.
channels
);
uint64_t
filter_amount_all
=
align_to_x
(((
uint64_t
)
args
.
kernel
.
height
)
*
((
uint64_t
)
args
.
kernel
.
width
)
*
channel_per_group
,
FILTER_ELEMENT_ALIGNMENT
);
uint64_t
output_amount_per_row
=
align_to_x
(
output_width
*
((
uint64_t
)
args
.
filter_num
),
IMAGE_ALIGNMENT
);
// find the opt partition strategy
uint64_t
res_win
;
uint64_t
res_fit
=
0
;
for
(
res_win
=
1
;
res_win
<=
output_width
;
res_win
=
res_win
+
1
)
{
if
((
align_to_x
(
(
args
.
image
.
channels
*
(
args
.
kernel
.
width
+
(
res_win
-
1
)
*
args
.
kernel
.
stride_w
)),
IMAGE_ALIGNMENT
)
/
16
+
1
)
*
args
.
kernel
.
height
>
2048
)
{
break
;
}
}
if
(
res_win
!=
output_width
)
{
res_win
-=
1
;
}
if
(((
res_win
%
2
)
!=
0
)
&&
(
res_win
!=
1
))
{
res_win
=
res_win
-
1
;
}
res_fit
=
res_win
;
uint64_t
block_num
=
(
output_width
+
res_fit
-
1
)
/
res_fit
;
uint64_t
block_len
=
res_fit
;
uint64_t
block_last
=
output_width
-
res_fit
*
(
block_num
-
1
);
uint64_t
res_amount_per_row
=
output_width
*
args
.
filter_num
;
uint64_t
res_amount_per_row_pad
=
output_amount_per_row
-
res_amount_per_row
;
uint64_t
image_block_amount_per_row
=
args
.
kernel
.
stride_w
*
(
res_fit
)
*
args
.
image
.
channels
;
uint64_t
filter_pad_width_mul_channel
=
args
.
image
.
pad_width
*
args
.
image
.
channels
;
uint64_t
image_amount_per_row_multi_win_first
=
image_amount_per_row
*
(
4
*
args
.
kernel
.
stride_h
-
args
.
image
.
pad_height
);
uint64_t
image_amount_per_row_multi_win
=
image_amount_per_row
*
(
4
*
args
.
kernel
.
stride_h
);
uint64_t
image_block_num
=
block_num
;
uint64_t
image_block_len
=
align_to_x
((
args
.
image
.
channels
*
(
args
.
kernel
.
width
+
(
block_len
-
1
)
*
args
.
kernel
.
stride_w
)),
IMAGE_ALIGNMENT
)
/
16
+
1
;
uint64_t
image_block_len_last
=
align_to_x
(
(
args
.
image
.
channels
*
(
args
.
kernel
.
width
+
(
block_last
-
1
)
*
args
.
kernel
.
stride_w
)),
IMAGE_ALIGNMENT
)
/
16
+
1
;
uint64_t
image_win_cnt
=
block_len
;
uint64_t
image_win_cnt_last
=
block_last
;
uint64_t
res_row_data_align4_pad
=
res_amount_per_row_pad
/
8
;
uint64_t
prog_full_cnt
=
2048
/
(
filter_amount_all
/
16
*
2
)
-
1
;
if
(
prog_full_cnt
==
1023
)
{
prog_full_cnt
--
;
}
uint64_t
post_prog_full_cnt
=
(
512
/
(
align_to_x
(
args
.
filter_num
,
4
)
/
4
*
2
)
>
2
)
?
(
512
/
(
align_to_x
(
args
.
filter_num
,
4
)
/
4
*
2
)
-
2
)
:
0
;
uint64_t
cmd
=
0UL
|
(
args
.
relu_enabled
?
USE_RELU
:
0
)
|
USE_BIAS
;
(
*
arg
).
driver
.
image_address_phy
=
vaddr_to_paddr
(
args
.
image
.
address
);
(
*
arg
).
driver
.
sb_address_phy
=
vaddr_to_paddr
(
args
.
sb_address
);
(
*
arg
).
driver
.
filter_address_phy
=
vaddr_to_paddr
(
args
.
filter_address
);
(
*
arg
).
driver
.
output_address_phy
=
vaddr_to_paddr
(
args
.
output
.
address
);
(
*
arg
).
driver
.
output_height
=
output_height
;
(
*
arg
).
driver
.
output_width
=
output_width
;
(
*
arg
).
driver
.
filter_per_group
=
filter_per_group
;
(
*
arg
).
driver
.
channel_per_group
=
channel_per_group
;
(
*
arg
).
driver
.
image_amount_per_row
=
image_amount_per_row
;
(
*
arg
).
driver
.
image_one_pad_per_row
=
image_one_pad_per_row
;
(
*
arg
).
driver
.
filter_amount_all
=
filter_amount_all
;
(
*
arg
).
driver
.
output_amount_per_row
=
output_amount_per_row
;
(
*
arg
).
driver
.
image_block_amount_per_row
=
image_block_amount_per_row
;
(
*
arg
).
driver
.
filter_pad_width_mul_channel
=
filter_pad_width_mul_channel
;
(
*
arg
).
driver
.
image_amount_per_row_multi_win_first
=
image_amount_per_row_multi_win_first
;
(
*
arg
).
driver
.
image_amount_per_row_multi_win
=
image_amount_per_row_multi_win
;
(
*
arg
).
driver
.
image_block_num
=
image_block_num
;
(
*
arg
).
driver
.
image_block_len
=
image_block_len
;
(
*
arg
).
driver
.
image_block_len_last
=
image_block_len_last
;
(
*
arg
).
driver
.
image_win_cnt
=
image_win_cnt
;
(
*
arg
).
driver
.
image_win_cnt_last
=
image_win_cnt_last
;
(
*
arg
).
driver
.
res_row_data_align4_pad
=
res_row_data_align4_pad
;
(
*
arg
).
driver
.
prog_full_cnt
=
prog_full_cnt
;
(
*
arg
).
driver
.
post_prog_full_cnt
=
post_prog_full_cnt
;
(
*
arg
).
driver
.
fpga_bias_scale_len
=
fpga_bias_scale_len
;
(
*
arg
).
driver
.
cmd
=
cmd
;
}
// expand_conv_arg()
void
expand_EW_arg
(
EWAddArgs
*
arg
)
{
EWAddArgs
args
=
*
arg
;
uint64_t
cmd
=
args
.
relu_enabled
?
USE_RELU
:
0
;
uint64_t
datalen
=
(
uint64_t
)
args
.
image0
.
width
*
(
uint64_t
)
args
.
image0
.
height
*
(
uint64_t
)
args
.
image0
.
channels
;
uint64_t
coefficient
=
(
uint64_t
)
args
.
const0
<<
32
|
(
uint64_t
)
args
.
const1
;
uint64_t
image0_address_phy
=
vaddr_to_paddr
(
args
.
image0
.
address
);
uint64_t
image1_address_phy
=
vaddr_to_paddr
(
args
.
image1
.
address
);
uint64_t
output_address_phy
=
vaddr_to_paddr
(
args
.
output
.
address
);
uint64_t
image_amount_per_row
=
align_to_x
((
uint64_t
)
args
.
image0
.
width
*
(
uint64_t
)
args
.
image0
.
channels
,
IMAGE_ALIGNMENT
);
uint64_t
image_image_pixel
=
((
uint64_t
)
args
.
image0
.
channels
<<
32
)
|
((
uint64_t
)
args
.
image0
.
width
<<
16
)
|
(
uint64_t
)
args
.
image0
.
height
;
(
*
arg
).
driver
.
image0_address_phy
=
image0_address_phy
;
(
*
arg
).
driver
.
image1_address_phy
=
image1_address_phy
;
(
*
arg
).
driver
.
datalen
=
datalen
;
(
*
arg
).
driver
.
image_image_pixel
=
image_image_pixel
;
(
*
arg
).
driver
.
image_amount_per_row
=
image_amount_per_row
;
(
*
arg
).
driver
.
output_address_phy
=
output_address_phy
;
(
*
arg
).
driver
.
coefficient
=
coefficient
;
(
*
arg
).
driver
.
cmd
=
cmd
;
}
// expand_EW_arg
void
fill_split_arg
(
struct
SplitConvArgs
*
arg
,
framework
::
Tensor
*
input
,
framework
::
Tensor
*
out
,
framework
::
Tensor
*
filter
,
bool
relu_enabled
,
int
group_num
,
int
stride_h
,
...
...
@@ -206,7 +373,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
auto
channel
=
(
int
)
out
->
dims
()[
1
];
// NOLINT
int
filter_num_per_div
=
get_filter_num_per_div
(
filter
,
group_num
);
int
element_num
=
get_aligned_filter_element_num
(
filter
->
dims
()[
1
]
*
filter
->
dims
()[
2
]
*
filter
->
dims
()[
3
]
);
(
int
)(
filter
->
dims
()[
1
]
*
filter
->
dims
()[
2
]
*
filter
->
dims
()[
3
])
);
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
arg
->
conv_arg
[
i
].
relu_enabled
=
relu_enabled
;
...
...
@@ -223,24 +390,23 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg
->
conv_arg
[
i
].
image
.
pad_height
=
(
uint32_t
)
padding_h
;
arg
->
conv_arg
[
i
].
image
.
pad_width
=
(
uint32_t
)
padding_w
;
arg
->
conv_arg
[
i
].
filter_scale_address
=
filter
->
scale
;
// arg->conv_arg[i].filter_address = &(
// (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; //
// NOLINT
// arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
arg
->
conv_arg
[
i
].
filter_num
=
(
uint32_t
)(
i
==
n
-
1
?
channel
-
(
n
-
1
)
*
filter_num_per_div
// NOLINT
:
filter_num_per_div
);
size_t
filter_size
=
element_num
*
arg
->
conv_arg
[
i
].
filter_num
*
sizeof
(
int8_t
);
element_num
*
align_to_x
(
arg
->
conv_arg
[
i
].
filter_num
,
FILTER_NUM_ALIGNMENT
)
*
sizeof
(
int8_t
);
auto
filter_head
=
&
((
int8_t
*
)
filter_ptr
)[
i
*
element_num
*
filter_num_per_div
];
arg
->
conv_arg
[
i
].
filter_address
=
fpga_malloc
(
filter_size
);
memcpy
(
arg
->
conv_arg
[
i
].
filter_address
,
filter_head
,
filter_size
);
fpga_flush
(
arg
->
conv_arg
[
i
].
filter_address
,
filter_size
);
size_t
bs_size
=
2
*
arg
->
conv_arg
[
i
].
filter_num
*
sizeof
(
float
);
size_t
bs_size
=
2
*
align_to_x
(
arg
->
conv_arg
[
i
].
filter_num
,
BS_NUM_ALIGNMENT
)
*
sizeof
(
float
);
auto
bs_head
=
&
bs_ptr
[
i
*
filter_num_per_div
*
2
];
arg
->
conv_arg
[
i
].
sb_address
=
fpga_malloc
(
bs_size
);
memcpy
(
arg
->
conv_arg
[
i
].
sb_address
,
bs_head
,
bs_size
);
...
...
@@ -249,11 +415,11 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
if
(
n
>
1
)
{
arg
->
conv_arg
[
i
].
output
.
scale_address
=
(
float
*
)
fpga_malloc
(
2
*
sizeof
(
float
));
// NOLINT
arg
->
conv_arg
[
i
].
output
.
address
=
fpga_malloc
(
out
->
dims
()[
2
]
*
align_to_x
(
out
->
dims
()[
3
]
*
arg
->
conv_arg
[
i
].
filter_num
,
IMAGE_ALIGNMENT
)
*
sizeof
(
half
));
arg
->
conv_arg
[
i
].
output
.
address
=
fpga_malloc
(
out
->
dims
()[
2
]
*
align_to_x
((
int
)(
out
->
dims
()[
3
]
*
arg
->
conv_arg
[
i
].
filter_num
)
,
IMAGE_ALIGNMENT
)
*
sizeof
(
half
));
}
else
{
arg
->
conv_arg
[
i
].
output
.
scale_address
=
out
->
scale
;
arg
->
conv_arg
[
i
].
output
.
address
=
out_ptr
;
...
...
@@ -263,10 +429,13 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
(
half
*
)
arg
->
conv_arg
[
i
].
output
.
address
;
// NOLINT
arg
->
concat_arg
.
scales_in
[
i
]
=
arg
->
conv_arg
[
i
].
output
.
scale_address
;
arg
->
concat_arg
.
channel_num
[
i
]
=
arg
->
conv_arg
[
i
].
filter_num
;
expand_conv_arg
(
&
arg
->
conv_arg
[
i
]);
}
filter
->
reset_data_ptr
(
nullptr
);
fpga_free
(
bs_ptr
);
}
}
// fill_split_arg
void
fill_deconv_arg
(
struct
DeconvArgs
*
arg
,
framework
::
Tensor
*
input
,
framework
::
Tensor
*
out
,
framework
::
Tensor
*
filter
,
bool
relu_enabled
,
int
group_num
,
int
stride_h
,
...
...
@@ -277,28 +446,27 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
auto
out_ptr
=
out
->
data
<
float
>
();
arg
->
group_num
=
(
uint32_t
)
group_num
;
arg
->
sub_conv_num
=
stride_h
;
arg
->
sub_conv_num
=
(
uint32_t
)
stride_h
;
arg
->
filter_num
=
(
uint32_t
)
filter
->
dims
()[
0
];
int
sub_conv_num
=
arg
->
sub_conv_num
;
int
sub_stride
=
1
;
int
sub_pad
=
deconv_filter
::
deconv_calc_sub_pad
(
filter
->
dims
()[
3
],
padding_w
,
stride_w
);
int
sub_filter_width
=
deconv_filter
::
deconv_get_sub_filter_axis
(
filter
->
dims
()[
3
],
stride_w
);
int
sub_pad
=
deconv_filter
::
deconv_calc_sub_pad
(
(
int
)
filter
->
dims
()[
3
]
,
padding_w
,
stride_w
);
int
sub_filter_width
=
deconv_filter
::
deconv_get_sub_filter_axis
(
(
int
)
filter
->
dims
()[
3
],
stride_w
);
int
sub_output_width
=
deconv_filter
::
deconv_get_sub_out_axis
(
input
->
dims
()[
3
],
sub_pad
,
sub_filter_width
);
(
int
)
input
->
dims
()[
3
],
sub_pad
,
sub_filter_width
);
int
sub_output_height
=
deconv_filter
::
deconv_get_sub_out_axis
(
input
->
dims
()[
2
],
sub_pad
,
sub_filter_width
);
(
int
)
input
->
dims
()[
2
],
sub_pad
,
sub_filter_width
);
arg
->
sub_output_width
=
sub_output_width
;
arg
->
sub_output_height
=
sub_output_height
;
arg
->
omit_size
=
deconv_filter
::
deconv_get_omit
(
stride_w
,
filter
->
dims
()[
3
],
padding_w
);
arg
->
sub_output_width
=
(
uint32_t
)
sub_output_width
;
arg
->
sub_output_height
=
(
uint32_t
)
sub_output_height
;
arg
->
omit_size
=
(
uint32_t
)
deconv_filter
::
deconv_get_omit
(
stride_w
,
(
int
)
filter
->
dims
()[
3
],
padding_w
);
arg
->
conv_args
=
(
ConvArgs
*
)
fpga_malloc
(
sub_conv_num
*
sizeof
(
ConvArgs
));
int
sub_channels
=
(
int
32_t
)
input
->
dims
()[
1
];
int
sub_channels
=
(
int
)
input
->
dims
()[
1
];
int
omit_size
=
arg
->
omit_size
;
int
real_out_width
=
sub_output_width
*
sub_conv_num
-
2
*
omit_size
;
int
real_out_height
=
sub_output_height
*
sub_conv_num
-
2
*
omit_size
;
...
...
@@ -318,42 +486,41 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
for
(
int
i
=
0
;
i
<
sub_conv_num
;
++
i
)
{
arg
->
conv_args
[
i
].
filter_num
=
(
arg
->
sub_conv_num
)
*
(
arg
->
filter_num
);
arg
->
conv_args
[
i
].
group_num
=
group_num
;
arg
->
conv_args
[
i
].
group_num
=
(
uint32_t
)
group_num
;
arg
->
conv_args
[
i
].
filter_scale_address
=
filter
->
scale
;
arg
->
conv_args
[
i
].
relu_enabled
=
relu_enabled
;
arg
->
conv_args
[
i
].
kernel
.
width
=
sub_filter_width
;
arg
->
conv_args
[
i
].
kernel
.
height
=
sub_filter_width
;
arg
->
conv_args
[
i
].
kernel
.
width
=
(
uint32_t
)
sub_filter_width
;
arg
->
conv_args
[
i
].
kernel
.
height
=
(
uint32_t
)
sub_filter_width
;
arg
->
conv_args
[
i
].
kernel
.
stride_w
=
1
;
arg
->
conv_args
[
i
].
kernel
.
stride_h
=
1
;
// DeconvParam.conv_args[i].image.address = (void*)ptr_image;
arg
->
conv_args
[
i
].
image
.
scale_address
=
input
->
scale
;
arg
->
conv_args
[
i
].
image
.
channels
=
sub_channels
;
arg
->
conv_args
[
i
].
image
.
channels
=
(
uint32_t
)
sub_channels
;
arg
->
conv_args
[
i
].
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
arg
->
conv_args
[
i
].
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
arg
->
conv_args
[
i
].
image
.
pad_width
=
sub_pad
;
arg
->
conv_args
[
i
].
image
.
pad_height
=
sub_pad
;
arg
->
conv_args
[
i
].
image
.
pad_width
=
(
uint32_t
)
sub_pad
;
arg
->
conv_args
[
i
].
image
.
pad_height
=
(
uint32_t
)
sub_pad
;
arg
->
conv_args
[
i
].
image
.
address
=
input_ptr
;
arg
->
conv_args
[
i
].
sb_address
=
(
void
*
)
bs_ptr
;
char
*
filter_sub_space
=
auto
filter_sub_space
=
(
char
*
)
fpga_malloc
(
align_conv_sub_filter_count
*
sizeof
(
char
));
fpga_copy
(
filter_sub_space
,
(
char
*
)
filter_ptr
+
i
*
align_conv_sub_filter_count
,
align_conv_sub_filter_count
);
(
size_t
)
align_conv_sub_filter_count
);
arg
->
conv_args
[
i
].
filter_address
=
(
void
*
)(
filter_sub_space
);
fpga_flush
(
filter_sub_space
,
align_conv_sub_filter_count
);
fpga_flush
(
filter_sub_space
,
(
size_t
)
align_conv_sub_filter_count
);
if
(
sub_conv_num
==
1
)
{
arg
->
conv_args
[
i
].
output
.
address
=
out_ptr
;
arg
->
conv_args
[
i
].
output
.
scale_address
=
out
->
scale
;
}
else
{
half
*
ptr_output
=
(
half
*
)
fpga_malloc
(
conv_output_size
*
sizeof
(
half
));
auto
ptr_output
=
(
half
*
)
fpga_malloc
(
conv_output_size
*
sizeof
(
half
));
arg
->
conv_args
[
i
].
output
.
address
=
(
void
*
)((
half
*
)
ptr_output
);
float
*
ptr_output_scale
=
(
float
*
)
fpga_malloc
(
2
*
sizeof
(
float
));
auto
ptr_output_scale
=
(
float
*
)
fpga_malloc
(
2
*
sizeof
(
float
));
arg
->
conv_args
[
i
].
output
.
scale_address
=
ptr_output_scale
;
}
}
...
...
@@ -361,6 +528,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg
->
output
.
address
=
out_ptr
;
arg
->
output
.
scale_address
=
out
->
scale
;
// fpga_free(filter_ptr);
}
}
// fill_deconv_arg
}
// namespace fpga
}
// namespace paddle_mobile
src/fpga/V1/bias_scale.h
浏览文件 @
14b944f3
...
...
@@ -14,8 +14,6 @@ limitations under the License. */
#pragma once
#define BS_NUM_ALIGNMENT 8
namespace
paddle_mobile
{
namespace
fpga
{
namespace
bias_scale
{
...
...
src/fpga/V1/deconv_bias_scale.h
浏览文件 @
14b944f3
...
...
@@ -14,8 +14,6 @@ limitations under the License. */
#pragma once
#define BS_NUM_ALIGNMENT 8
namespace
paddle_mobile
{
namespace
fpga
{
namespace
deconv_bias_scale
{
...
...
src/fpga/V1/filter.h
浏览文件 @
14b944f3
...
...
@@ -14,9 +14,6 @@ limitations under the License. */
#pragma once
#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
namespace
paddle_mobile
{
namespace
fpga
{
namespace
filter
{
...
...
src/fpga/V1/image.cpp
浏览文件 @
14b944f3
...
...
@@ -111,25 +111,37 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
fpga_flush
(
image_out
,
height
*
align_each_out_area_cw
*
sizeof
(
int16_t
));
}
void
split_image
(
int16_t
*
image_in
,
float
*
scale_in
,
void
**
images_out
,
float
**
scales_out
,
int
image_num
,
uint32_t
*
channel_nums
,
int
height
,
int
width
)
{
void
split_image
(
int16_t
*
image_in
,
const
float
*
scale_in
,
void
**
images_out
,
float
**
scales_out
,
int
image_num
,
const
uint32_t
*
channel_nums
,
int
height
,
int
width
)
{
int
total_channel
=
0
;
for
(
int
i
=
0
;
i
<
image_num
;
i
++
)
{
scales_out
[
i
][
0
]
=
scale_in
[
0
];
scales_out
[
i
][
1
]
=
scale_in
[
1
];
total_channel
+=
channel_nums
[
i
];
}
int
element_num
=
height
*
align_to_x
(
width
*
total_channel
,
IMAGE_ALIGNMENT
);
fpga_invalidate
(
image_in
,
element_num
*
sizeof
(
int16_t
));
int
src_offset
=
0
,
des_offset
=
0
;
for
(
int
h
=
0
;
h
<
height
;
h
++
)
{
int
src_offset
=
h
*
align_to_x
(
total_channel
*
width
,
IMAGE_ALIGNMENT
);
for
(
int
i
=
0
;
i
<
image_num
;
i
++
)
{
int
des_offset
=
h
*
align_to_x
(
channel_nums
[
i
]
*
width
,
IMAGE_ALIGNMENT
);
memcpy
((
int16_t
*
)
images_out
[
i
]
+
des_offset
,
image_in
+
src_offset
,
channel_nums
[
i
]
*
sizeof
(
int16_t
));
src_offset
+=
channel_nums
[
i
];
for
(
int
w
=
0
;
w
<
width
;
w
++
)
{
src_offset
=
h
*
align_to_x
(
total_channel
*
width
,
IMAGE_ALIGNMENT
)
+
w
*
total_channel
;
for
(
int
i
=
0
;
i
<
image_num
;
i
++
)
{
des_offset
=
h
*
align_to_x
(
channel_nums
[
i
]
*
width
,
IMAGE_ALIGNMENT
)
+
w
*
channel_nums
[
i
];
memcpy
((
int16_t
*
)
images_out
[
i
]
+
des_offset
,
image_in
+
src_offset
,
channel_nums
[
i
]
*
sizeof
(
int16_t
));
src_offset
+=
channel_nums
[
i
];
}
}
}
for
(
int
i
=
0
;
i
<
image_num
;
i
++
)
{
element_num
=
height
*
align_to_x
(
width
*
channel_nums
[
i
],
IMAGE_ALIGNMENT
);
fpga_flush
(
images_out
[
i
],
element_num
*
sizeof
(
int16_t
));
}
}
}
// namespace image
...
...
src/fpga/V1/image.h
浏览文件 @
14b944f3
...
...
@@ -14,9 +14,8 @@ limitations under the License. */
#pragma once
#include <
stdint.h
>
#include <
cstdint
>
#define IMAGE_ALIGNMENT 16 // Aligned to 16
namespace
paddle_mobile
{
namespace
fpga
{
namespace
image
{
...
...
@@ -24,13 +23,16 @@ namespace image {
void
convert_to_hwc
(
float
**
data_in
,
int
channel
,
int
height
,
int
width
);
void
align_element_conv
(
float
**
data_in
,
int
height
,
int
cw
);
void
format_image
(
float
**
data_in
,
int
channel
,
int
height
,
int
width
);
// Concat featuremaps along channel direction
void
concat_images
(
int16_t
**
images_in
,
float
**
scales_in
,
void
*
image_out
,
float
*
scale_out
,
int
image_num
,
uint32_t
*
channel_num
,
int
height
,
int
width
);
// Concat featuremaps along channel direction
void
split_image
(
int16_t
*
image_in
,
float
*
scale_in
,
void
**
images_out
,
float
**
scales_out
,
int
image_num
,
uint32_t
*
channel_nums
,
int
height
,
int
width
);
int
height
,
int
width
);
// Split featuremap along channel direction
void
split_image
(
int16_t
*
image_in
,
const
float
*
scale_in
,
void
**
images_out
,
float
**
scales_out
,
int
image_num
,
const
uint32_t
*
channel_nums
,
int
height
,
int
width
);
}
// namespace image
}
// namespace fpga
}
// namespace paddle_mobile
src/fpga/V1/pe.cpp
浏览文件 @
14b944f3
...
...
@@ -203,29 +203,11 @@ int ComputeBasicConv(const struct ConvArgs &args) {
DLOG
<<
" out_address:"
<<
args
.
output
.
address
<<
" out_scale_address:"
<<
args
.
output
.
scale_address
;
#endif
cout
<<
" relu_enabled:"
<<
args
.
relu_enabled
<<
" sb_address:"
<<
args
.
sb_address
<<
" filter_address:"
<<
args
.
filter_address
<<
" filter_num:"
<<
args
.
filter_num
<<
" group_num:"
<<
args
.
group_num
;
cout
<<
" image_address:"
<<
args
.
image
.
address
<<
" image_scale_address:"
<<
args
.
image
.
scale_address
<<
" image_channels:"
<<
args
.
image
.
channels
<<
" image_height:"
<<
args
.
image
.
height
<<
" image_width:"
<<
args
.
image
.
width
<<
" pad_height:"
<<
args
.
image
.
pad_height
<<
" pad_width:"
<<
args
.
image
.
pad_width
;
cout
<<
" kernel_height:"
<<
args
.
kernel
.
height
<<
" kernel_width:"
<<
args
.
kernel
.
width
<<
" stride_h:"
<<
args
.
kernel
.
stride_h
<<
" stride_w:"
<<
args
.
kernel
.
stride_w
;
cout
<<
" out_address:"
<<
args
.
output
.
address
<<
" out_scale_address:"
<<
args
.
output
.
scale_address
;
#ifdef PADDLE_MOBILE_ZU5
DLOG
<<
"Conv"
;
// return
0;
uint64_t
timer_cnt
;
int
ret
=
0
;
uint64_t
output_scale
=
0
;
/*
uint64_t output_scale;
uint64_t image_scale;
uint64_t filter_scale;
...
...
@@ -233,14 +215,10 @@ int ComputeBasicConv(const struct ConvArgs &args) {
uint64_t sb_address_phy = 0;
uint64_t filter_address_phy = 0;
uint64_t output_address_phy = 0;
int
ret
=
0
;
fpga_copy(&image_scale, args.image.scale_address, 2 * sizeof(float));
fpga_copy(&filter_scale, args.filter_scale_address, 2 * sizeof(float));
cout
<<
"image_scale :"
<<
hex
<<
(
image_scale
)
<<
endl
;
cout
<<
"filter_scale :"
<<
hex
<<
(
filter_scale
)
<<
endl
;
uint64_t filterlen = (uint64_t)args.kernel.width *
(uint64_t)args.kernel.height *
(uint64_t)args.image.channels;
...
...
@@ -349,8 +327,8 @@ int ComputeBasicConv(const struct ConvArgs &args) {
filter_address_phy = vaddr_to_paddr(args.filter_address);
output_address_phy = vaddr_to_paddr(args.output.address);
/*SDK刷Cache保证数据一致性*/
uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
*/
pthread_mutex_lock
(
&
g_fpgainfo
.
pe_data
->
mutex
);
if
(
ERROR
==
g_fpgainfo
.
pe_data
->
pes
[
PE_IDX_CONV
]
->
status
)
{
...
...
@@ -359,78 +337,63 @@ int ComputeBasicConv(const struct ConvArgs &args) {
pthread_mutex_unlock
(
&
g_fpgainfo
.
pe_data
->
mutex
);
return
ret
;
}
/*restart scale*/
reg_writeq
(
output_scale
,
REG_SCALE_PARAMETER
);
reg_writeq
(
image_address_phy
,
REG_CONV_IMAGE_BASE_ADDR
);
reg_writeq
(
filter_address_phy
,
REG_CONV_FILTER_BASE_ADDR
);
reg_writeq
(
sb_address_phy
,
REG_CONV_SB_BASE_ADDR
);
reg_writeq
(
output_address_phy
,
REG_CONV_RESULT_BASE_ADDR
);
reg_writeq
(
((
uint64_t
)
args
.
image
.
height
)
|
(((
uint64_t
)
args
.
image
.
width
)
<<
32
),
REG_CONV_IMAGE_PIXEL
);
reg_writeq
(
((
uint64_t
)
args
.
kernel
.
height
)
|
(((
uint64_t
)
args
.
kernel
.
width
)
<<
32
),
REG_CONV_FILTER_PIXEL
);
reg_writeq
(
output_height
|
(
output_width
<<
32
),
REG_CONV_RESULT_PIXEL
);
reg_writeq
(
args
.
driver
.
output_height
|
(
args
.
driver
.
output_width
<<
32
),
REG_CONV_RESULT_PIXEL
);
reg_writeq
(((
uint64_t
)
args
.
image
.
pad_height
)
|
(((
uint64_t
)
args
.
image
.
pad_width
)
<<
32
),
REG_CONV_PAD_PIXEL
);
reg_writeq
(((
uint64_t
)
args
.
kernel
.
stride_h
)
|
(((
uint64_t
)
args
.
kernel
.
stride_w
)
<<
32
),
REG_CONV_STEP_PIXEL
);
reg_writeq
((
uint64_t
)
args
.
group_num
,
REG_CONV_GROUP_NUMBER
);
reg_writeq
((
uint64_t
)
args
.
filter_num
,
REG_CONV_FILTER_NUMBER
);
reg_writeq
((
uint64_t
)
args
.
image
.
channels
,
REG_CONV_CHANNEL_NUMBER
);
reg_writeq
(
*
(
uint64_t
*
)
args
.
image
.
scale_address
,
REG_CONV_IMAGE_SCALE
);
reg_writeq
(
*
(
uint64_t
*
)
args
.
filter_scale_address
,
REG_CONV_FILTER_SCALE
);
reg_writeq
(
args
.
driver
.
image_address_phy
,
REG_CONV_IMAGE_BASE_ADDR
);
reg_writeq
(
args
.
driver
.
filter_address_phy
,
REG_CONV_FILTER_BASE_ADDR
);
reg_writeq
(
args
.
driver
.
sb_address_phy
,
REG_CONV_SB_BASE_ADDR
);
reg_writeq
(
args
.
driver
.
output_address_phy
,
REG_CONV_RESULT_BASE_ADDR
);
reg_writeq
(
args
.
driver
.
filter_per_group
,
REG_CONV_FILTER_PER_GROUP
);
reg_writeq
(
args
.
driver
.
channel_per_group
,
REG_CONV_CHANNEL_PER_GROUP
);
reg_writeq
(
args
.
driver
.
image_amount_per_row
,
REG_CONV_IMAGE_AMOUNT_PER_ROW
);
reg_writeq
(
args
.
driver
.
image_one_pad_per_row
,
REG_CONV_IMAGE_ONE_PAD_PER_ROW
);
reg_writeq
(
args
.
driver
.
filter_amount_all
,
REG_CONV_FILTER_AMOUNT_ALL
);
reg_writeq
(
args
.
driver
.
output_amount_per_row
,
REG_CONV_RESULT_AMOUNT_PER_ROW
);
reg_writeq
(
args
.
driver
.
image_block_amount_per_row
,
0xca8
);
reg_writeq
(
args
.
driver
.
filter_pad_width_mul_channel
,
0xcb0
);
reg_writeq
(
args
.
driver
.
image_amount_per_row_multi_win_first
,
0xcb8
);
reg_writeq
(
args
.
driver
.
image_amount_per_row_multi_win
,
0xcc0
);
reg_writeq
(
args
.
driver
.
image_block_num
,
0xcc8
);
reg_writeq
(
args
.
driver
.
image_block_len
,
0xcd0
);
reg_writeq
(
args
.
driver
.
image_block_len_last
,
0xcd8
);
reg_writeq
(
args
.
driver
.
image_win_cnt
,
0xce0
);
reg_writeq
(
args
.
driver
.
image_win_cnt_last
,
0xce8
);
reg_writeq
(
args
.
driver
.
res_row_data_align4_pad
,
0xcf8
);
reg_writeq
(
args
.
driver
.
prog_full_cnt
,
0xd08
);
reg_writeq
(
args
.
driver
.
post_prog_full_cnt
,
0xd10
);
reg_writeq
(
args
.
driver
.
fpga_bias_scale_len
/
4
,
0xd20
);
reg_writeq
(
args
.
driver
.
cmd
,
REG_CONV_CMD
);
reg_writeq
(
filter_per_group
,
REG_CONV_FILTER_PER_GROUP
);
reg_writeq
(
channel_per_group
,
REG_CONV_CHANNEL_PER_GROUP
);
reg_writeq
(
image_amount_per_row
,
REG_CONV_IMAGE_AMOUNT_PER_ROW
);
reg_writeq
(
image_one_pad_per_row
,
REG_CONV_IMAGE_ONE_PAD_PER_ROW
);
reg_writeq
(
filter_amount_all
,
REG_CONV_FILTER_AMOUNT_ALL
);
reg_writeq
(
output_amount_per_row
,
REG_CONV_RESULT_AMOUNT_PER_ROW
);
reg_writeq
(
image_block_amount_per_row
,
0xca8
);
reg_writeq
(
filter_pad_width_mul_channel
,
0xcb0
);
reg_writeq
(
image_amount_per_row_multi_win_first
,
0xcb8
);
reg_writeq
(
image_amount_per_row_multi_win
,
0xcc0
);
reg_writeq
(
image_block_num
,
0xcc8
);
reg_writeq
(
image_block_len
,
0xcd0
);
reg_writeq
(
image_block_len_last
,
0xcd8
);
reg_writeq
(
image_win_cnt
,
0xce0
);
reg_writeq
(
image_win_cnt_last
,
0xce8
);
reg_writeq
(
res_row_data_align4_pad
,
0xcf8
);
reg_writeq
(
prog_full_cnt
,
0xd08
);
reg_writeq
(
post_prog_full_cnt
,
0xd10
);
reg_writeq
(
fpga_bias_scale_len
/
4
,
0xd20
);
/*write scale*/
reg_writeq
(
image_scale
,
REG_CONV_IMAGE_SCALE
);
reg_writeq
(
filter_scale
,
REG_CONV_FILTER_SCALE
);
reg_writeq
(
cmd
,
REG_CONV_CMD
);
DLOG
<<
"before reg poll"
;
if
(
0
!=
fpga_regpoll
(
REG_INTERRUPT
,
INTERRUPT_CONV
,
PE_IRQ_TIMEOUT
))
{
g_fpgainfo
.
pe_data
->
pes
[
PE_IDX_CONV
]
->
status
=
ERROR
;
ret
=
-
EIO
;
DLOG
<<
"Conv Wait Irq Timeout!"
;
}
DLOG
<<
"after reg poll"
;
usleep
(
40
);
/*SDK 无效 Cache保证数据一致性*/
output_scale
=
reg_readq
(
REG_SCALE_PARAMETER
);
output_scale
=
(
output_scale
<<
32
)
|
(
output_scale
>>
32
);
fpga_copy
(
args
.
output
.
scale_address
,
&
output_scale
,
sizeof
(
float
)
*
2
);
cout
<<
"output_scale :"
<<
hex
<<
(
output_scale
)
<<
endl
;
//*(args.output.scale_address) = output_scale;
pthread_mutex_unlock
(
&
g_fpgainfo
.
pe_data
->
mutex
);
return
ret
;
...
...
@@ -575,9 +538,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
DLOG
<<
"Pooling Wait Irq Timeout!"
;
}
DLOG
<<
"after reg poll"
;
usleep
(
40
);
/*SDK 无效 Cache保证数据一致性*/
// *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
output_scale
=
reg_readq
(
REG_SCALE_PARAMETER
);
...
...
@@ -615,11 +575,9 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
<<
" out_scale_address:"
<<
args
.
output
.
scale_address
;
#endif
#ifdef PADDLE_MOBILE_ZU5
DLOG
<<
"Conv"
;
// return 0;
int
ret
=
0
;
uint64_t
output_scale
=
0
;
uint64_t
timer_cnt
=
0
;
/*
uint64_t timer_cnt = 0;
uint64_t image0_address_phy = 0;
uint64_t image1_address_phy = 0;
uint64_t output_address_phy = 0;
...
...
@@ -629,54 +587,44 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
(uint64_t)args.image0.height *
(uint64_t)args.image0.channels;
uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1;
pthread_mutex_lock
(
&
g_fpgainfo
.
pe_data
->
mutex
);
if
(
ERROR
==
g_fpgainfo
.
pe_data
->
pes
[
PE_IDX_POOLING
]
->
status
)
{
ret
=
-
EIO
;
DLOG
<<
"Conv Status Error!"
;
pthread_mutex_unlock
(
&
g_fpgainfo
.
pe_data
->
mutex
);
return
ret
;
}
image0_address_phy = vaddr_to_paddr(args.image0.address);
image1_address_phy = vaddr_to_paddr(args.image1.address);
output_address_phy = vaddr_to_paddr(args.output.address);
uint64_t image_amount_per_row =
align_to_x
((
uint64_t
)
args
.
image0
.
width
*
(
uint64_t
)
args
.
image0
.
channels
,
IMAGE_ALIGN
);
align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
IMAGE_ALIGN);
uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
((uint64_t)args.image0.width << 16) |
(
uint64_t
)
args
.
image0
.
height
;
(uint64_t)args.image0.height;
*/
/*SDK刷Cache保证数据一致性*/
pthread_mutex_lock
(
&
g_fpgainfo
.
pe_data
->
mutex
);
if
(
ERROR
==
g_fpgainfo
.
pe_data
->
pes
[
PE_IDX_EW
]
->
status
)
{
ret
=
-
EIO
;
DLOG
<<
"EW Status Error!"
;
pthread_mutex_unlock
(
&
g_fpgainfo
.
pe_data
->
mutex
);
return
ret
;
}
/*restart scale*/
reg_writeq
(
output_scale
,
REG_SCALE_PARAMETER
);
reg_writeq
(
image0_address_phy
,
REG_EW_IMAGE0_BASE_ADDR
);
reg_writeq
(
image1_address_phy
,
REG_EW_IMAGE1_BASE_ADDR
);
reg_writeq
(
datalen
,
REG_EW_DATA_LEN
);
reg_writeq
(
image_image_pixel
,
REG_EW_IMAGE_PIXEL
);
reg_writeq
(
image_amount_per_row
,
REG_EW_IMAGE_AMOUNT_PER_ROW
);
reg_writeq
(
output_address_phy
,
REG_EW_RESULT_BASE_ADDR
);
reg_writeq
(
coefficient
,
REG_EW_COEFFICIENT
);
reg_writeq
(
cmd
,
REG_EW_CMD
);
reg_writeq
(
args
.
driver
.
image0_address_phy
,
REG_EW_IMAGE0_BASE_ADDR
);
reg_writeq
(
args
.
driver
.
image1_address_phy
,
REG_EW_IMAGE1_BASE_ADDR
);
reg_writeq
(
args
.
driver
.
datalen
,
REG_EW_DATA_LEN
);
reg_writeq
(
args
.
driver
.
image_image_pixel
,
REG_EW_IMAGE_PIXEL
);
reg_writeq
(
args
.
driver
.
image_amount_per_row
,
REG_EW_IMAGE_AMOUNT_PER_ROW
);
reg_writeq
(
args
.
driver
.
output_address_phy
,
REG_EW_RESULT_BASE_ADDR
);
reg_writeq
(
args
.
driver
.
coefficient
,
REG_EW_COEFFICIENT
);
reg_writeq
(
args
.
driver
.
cmd
,
REG_EW_CMD
);
if
(
0
!=
fpga_regpoll
(
REG_INTERRUPT
,
INTERRUPT_POOLING
,
PE_IRQ_TIMEOUT
))
{
g_fpgainfo
.
pe_data
->
pes
[
PE_IDX_
POOLING
]
->
status
=
ERROR
;
g_fpgainfo
.
pe_data
->
pes
[
PE_IDX_
EW
]
->
status
=
ERROR
;
ret
=
-
EIO
;
DLOG
<<
"EW Wait Irq Timeout!"
;
}
usleep
(
40
);
/*SDK 无效 Cache保证数据一致性*/
output_scale
=
reg_readq
(
REG_SCALE_PARAMETER
);
output_scale
=
(
output_scale
<<
32
)
|
(
output_scale
>>
32
);
fpga_copy
(
args
.
output
.
scale_address
,
&
output_scale
,
sizeof
(
float
)
*
2
);
//*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
//*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
pthread_mutex_unlock
(
&
g_fpgainfo
.
pe_data
->
mutex
);
...
...
@@ -802,9 +750,7 @@ int PerformBypass(const struct BypassArgs &args) {
DLOG
<<
"BYPASS Wait Irq Timeout!"
;
}
DLOG
<<
"after reg poll"
;
usleep
(
40
);
/*SDK 无效 Cache保证数据一致性*/
output_scale
=
reg_readq
(
REG_SCALE_PARAMETER
);
output_scale
=
(
output_scale
<<
32
)
|
(
output_scale
>>
32
);
fpga_copy
(
args
.
output
.
scale_address
,
&
output_scale
,
sizeof
(
float
)
*
2
);
...
...
@@ -883,8 +829,9 @@ void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel,
*
data_in
=
ptr_deconv
;
fpga_free
(
ptr_tmp
);
}
int
ComputeFpgaDeconv
(
const
struct
DeconvArgs
&
args
)
{
#ifdef FPGA_
TES
T_MODE
#ifdef FPGA_
PRIN
T_MODE
DLOG
<<
"=============ComputeFPGADeConv==========="
;
DLOG
<<
" filter_num:"
<<
args
.
filter_num
<<
" group_num:"
<<
args
.
group_num
...
...
src/fpga/common/driver.cpp
浏览文件 @
14b944f3
...
...
@@ -137,8 +137,6 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
for
(
i
=
0
;
i
<
timeout
;
i
++
)
{
if
(
val
==
reg_readq
(
reg
))
{
std
::
cout
<<
"fpga_regpoll:"
<<
i
<<
"val:"
<<
val
<<
"reg:"
<<
reg
<<
std
::
endl
;
break
;
}
}
...
...
@@ -401,8 +399,6 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) {
DLOG
<<
"dest:"
<<
dest
<<
" src:"
<<
src
<<
" size:"
<<
num
;
for
(
i
=
0
;
i
<
num
;
i
++
)
{
// DLOG << "i:" << i << " val:" << *((int8_t *)src + i);
// usleep(1);
*
((
int8_t
*
)
dest
+
i
)
=
*
((
int8_t
*
)
src
+
i
);
// NOLINT
}
...
...
src/fpga/common/driver.h
浏览文件 @
14b944f3
...
...
@@ -103,22 +103,15 @@ struct FPGA_INFO {
extern
struct
FPGA_INFO
g_fpgainfo
;
inline
uint64_t
reg_readq
(
uint32_t
offset
)
{
// DLOG << "offset : " << offset;
uint64_t
value
=
*
(
volatile
uint64_t
*
)((
uint8_t
*
)
g_fpgainfo
.
FpgaRegVirAddr
+
// NOLINT
offset
);
// NOLINT
// DLOG << "read end";
usleep
(
10
);
return
value
;
}
inline
void
reg_writeq
(
uint64_t
value
,
uint32_t
offset
)
{
// DLOG << "offset : " << offset << ", value : " << value;
*
(
volatile
uint64_t
*
)((
uint8_t
*
)
g_fpgainfo
.
FpgaRegVirAddr
+
// NOLINT
offset
)
=
value
;
// DLOG << "write end";
usleep
(
10
);
}
int
open_device_driver
();
...
...
src/fpga/common/fpga_common.h
浏览文件 @
14b944f3
...
...
@@ -20,6 +20,13 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
fpga
{
#ifdef PADDLE_MOBILE_FPGA_V1
#define IMAGE_ALIGNMENT 16 // Aligned to 16
#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT 8
#endif
enum
DataType
{
DATA_TYPE_FP32
=
1
,
DATA_TYPE_FP16
=
0
,
...
...
@@ -52,19 +59,70 @@ struct ImageOutputArgs {
float
*
scale_address
;
// output scale address;
uint64_t
timer_cnt
;
// time counter for FPGA computation
};
#ifdef PADDLE_MOBILE_FPGA_V1
struct
ConvDriverParam
{
uint64_t
image_address_phy
;
uint64_t
filter_address_phy
;
uint64_t
sb_address_phy
;
uint64_t
output_address_phy
;
uint64_t
output_height
;
uint64_t
output_width
;
uint64_t
filter_per_group
;
uint64_t
channel_per_group
;
uint64_t
image_amount_per_row
;
uint64_t
image_one_pad_per_row
;
uint64_t
filter_amount_all
;
uint64_t
output_amount_per_row
;
uint64_t
image_block_amount_per_row
;
uint64_t
filter_pad_width_mul_channel
;
uint64_t
image_amount_per_row_multi_win_first
;
uint64_t
image_amount_per_row_multi_win
;
uint64_t
image_block_num
;
uint64_t
image_block_len
;
uint64_t
image_block_len_last
;
uint64_t
image_win_cnt
;
uint64_t
image_win_cnt_last
;
uint64_t
res_row_data_align4_pad
;
uint64_t
prog_full_cnt
;
uint64_t
post_prog_full_cnt
;
uint64_t
fpga_bias_scale_len
;
uint64_t
cmd
;
};
struct
EWAddDriverParam
{
uint64_t
image0_address_phy
;
uint64_t
image1_address_phy
;
uint64_t
datalen
;
uint64_t
image_image_pixel
;
uint64_t
image_amount_per_row
;
uint64_t
output_address_phy
;
uint64_t
coefficient
;
uint64_t
cmd
;
};
#endif
struct
ConvArgs
{
bool
relu_enabled
;
void
*
sb_address
;
// scale and bias
void
*
filter_address
;
float
*
filter_scale_address
;
void
*
free_space
;
// used by FPGA logic
uint32_t
filter_num
;
uint32_t
group_num
;
struct
KernelArgs
kernel
;
struct
ImageInputArgs
image
;
// input image;
struct
ImageOutputArgs
output
;
#ifdef PADDLE_MOBILE_FPGA_V2
void
*
free_space
;
// used by FPGA logic
#endif
#ifdef PADDLE_MOBILE_FPGA_V1
struct
ConvDriverParam
driver
;
#endif
};
struct
ConcatArgs
{
...
...
@@ -115,6 +173,9 @@ struct EWAddArgs {
struct
ImageInputArgs
image0
;
struct
ImageInputArgs
image1
;
struct
ImageOutputArgs
output
;
#ifdef PADDLE_MOBILE_FPGA_V1
struct
EWAddDriverParam
driver
;
#endif
};
struct
BypassArgs
{
...
...
@@ -150,5 +211,9 @@ void fpga_copy(void* dest, const void* src, size_t num);
int
fpga_flush
(
void
*
address
,
size_t
size
);
int
fpga_invalidate
(
void
*
address
,
size_t
size
);
uint64_t
vaddr_to_paddr
(
void
*
address
);
void
expand_conv_arg
(
ConvArgs
*
arg
);
void
expand_EW_arg
(
EWAddArgs
*
arg
);
}
// namespace fpga
}
// namespace paddle_mobile
src/operators/kernel/arm/concat_kernel.cpp
浏览文件 @
14b944f3
...
...
@@ -27,7 +27,11 @@ bool ConcatKernel<CPU, float>::Init(ConcatParam<CPU> *param) {
template
<
>
void
ConcatKernel
<
CPU
,
float
>::
Compute
(
const
ConcatParam
<
CPU
>
&
param
)
{
ConcatCompute
<
float
>
(
param
);
if
(
param
.
Inputs
()[
0
]
->
type
()
==
typeid
(
int8_t
))
{
ConcatCompute
<
int8_t
>
(
param
);
}
else
{
ConcatCompute
<
float
>
(
param
);
}
param
.
Out
()
->
set_lod
(
param
.
Inputs
()[
0
]
->
lod
());
}
...
...
src/operators/kernel/central-arm-func/concat_arm_func.h
浏览文件 @
14b944f3
...
...
@@ -57,8 +57,8 @@ template <typename P>
void
ConcatCompute
(
const
ConcatParam
<
CPU
>
&
param
)
{
auto
inputs
=
param
.
Inputs
();
auto
*
out
=
param
.
Out
();
int
64_t
axis
=
param
.
Axis
();
out
->
mutable_data
<
float
>
();
int
axis
=
param
.
Axis
();
out
->
mutable_data
<
P
>
();
/// Sometimes direct copies will be faster, this maybe need deeply analysis.
if
(
axis
==
0
&&
inputs
.
size
()
<
10
)
{
...
...
@@ -66,12 +66,12 @@ void ConcatCompute(const ConcatParam<CPU> ¶m) {
for
(
auto
*
in
:
inputs
)
{
auto
in_stride
=
framework
::
stride_numel
(
in
->
dims
());
auto
out_stride
=
framework
::
stride_numel
(
out
->
dims
());
auto
dst
=
out
->
data
<
float
>
()
+
output_offset
;
auto
src
=
in
->
data
<
float
>
();
auto
dst
=
out
->
data
<
P
>
()
+
output_offset
;
auto
src
=
in
->
data
<
P
>
();
PADDLE_MOBILE_ENFORCE
(
in_stride
.
size
()
==
out_stride
.
size
(),
"src and dst tensor should have the same dims size."
);
memory
::
Copy
(
dst
,
src
,
sizeof
(
float
)
*
in_stride
[
0
]);
memory
::
Copy
(
dst
,
src
,
sizeof
(
P
)
*
in_stride
[
0
]);
output_offset
+=
in_stride
[
0
];
}
}
else
{
...
...
@@ -79,8 +79,8 @@ void ConcatCompute(const ConcatParam<CPU> ¶m) {
for
(
int
j
=
0
;
j
<
inputs
.
size
();
++
j
)
{
inputs_concat
[
j
]
=
*
inputs
[
j
];
}
ConcatFunctor
<
float
>
concat_functor
;
concat_functor
(
inputs_concat
,
static_cast
<
int
>
(
axis
)
,
out
);
ConcatFunctor
<
P
>
concat_functor
;
concat_functor
(
inputs_concat
,
axis
,
out
);
}
}
...
...
src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp
浏览文件 @
14b944f3
...
...
@@ -49,6 +49,7 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
ewaddArgs
.
image1
.
pad_width
=
0
;
ewaddArgs
.
output
.
scale_address
=
out
->
scale
;
ewaddArgs
.
output
.
address
=
out_ptr
;
fpga
::
expand_EW_arg
(
&
ewaddArgs
);
param
->
SetFpgaArgs
(
ewaddArgs
);
return
true
;
}
...
...
src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp
浏览文件 @
14b944f3
...
...
@@ -50,6 +50,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
ewaddArgs
.
image1
.
pad_width
=
0
;
ewaddArgs
.
output
.
scale_address
=
out
->
scale
;
ewaddArgs
.
output
.
address
=
out_ptr
;
fpga
::
expand_EW_arg
(
&
ewaddArgs
);
param
->
SetFpgaArgs
(
ewaddArgs
);
return
true
;
}
...
...
src/operators/kernel/fpga/V1/softmax_kernel.cpp
浏览文件 @
14b944f3
...
...
@@ -24,8 +24,12 @@ template <>
bool
SoftmaxKernel
<
FPGA
,
float
>::
Init
(
SoftmaxParam
<
FPGA
>
*
param
)
{
auto
input
=
const_cast
<
Tensor
*>
(
param
->
InputX
());
auto
input_ptr
=
input
->
data
<
float
>
();
auto
out
=
param
->
Out
();
fpga
::
format_fp32_ofm
(
out
);
auto
float_input
=
new
Tensor
;
float_input
->
mutable_data
<
float
>
({
1
,
input
->
dims
()[
1
]});
float_input
->
mutable_data
<
float
>
(
{
1
,
input
->
dims
()[
2
],
input
->
dims
()[
3
],
input
->
dims
()[
1
]});
fpga
::
format_fp32_ofm
(
float_input
);
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP16
};
...
...
@@ -34,8 +38,8 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
args
.
input_data_type
=
fpga
::
DATA_TYPE_FP16
;
args
.
output_data_type
=
fpga
::
DATA_TYPE_FP32
;
args
.
image
.
address
=
input_ptr
;
args
.
image
.
height
=
1
;
args
.
image
.
width
=
1
;
args
.
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
]
;
args
.
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
]
;
args
.
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
args
.
output
.
address
=
float_input
->
data
<
float
>
();
args
.
output
.
scale_address
=
float_input
->
scale
;
...
...
@@ -50,9 +54,9 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> ¶m) {
Tensor
*
out
=
param
.
Out
();
fpga
::
PerformBypass
(
param
.
FpgaArgs
());
fpga
::
fpga_invalidate
(
(
void
*
)
in_x
->
data
<
float
>
(),
// NOLINT
fpga
::
get_align_image_cw
(
in_x
->
dims
()[
1
])
*
sizeof
(
float
));
fpga
::
fpga_invalidate
(
(
void
*
)
in_x
->
data
<
float
>
(),
// NOLINT
in_x
->
numel
()
*
sizeof
(
float
));
// TODO: In general case, 0 should be squeezed before softmax input
math
::
SoftmaxFuntor
<
CPU
,
float
>
()(
in_x
,
out
);
fpga
::
fpga_flush
(
out
->
data
<
float
>
(),
out
->
memory_size
());
}
...
...
test/operators/test_concat_op.cpp
浏览文件 @
14b944f3
...
...
@@ -12,76 +12,125 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstring>
#include <iostream>
#include <vector>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/concat_op.h"
namespace
paddle_mobile
{
using
framework
::
AttributeMap
;
using
framework
::
DDim
;
using
framework
::
LoDTensor
;
using
framework
::
Scope
;
using
framework
::
make_ddim
;
template
<
typename
T
>
void
concat
(
const
std
::
vector
<
LoDTensor
>
&
input
,
LoDTensor
&
output
,
int
axis
)
{
int
num
=
input
.
size
();
int
rows
=
1
;
auto
dim_0
=
input
[
0
].
dims
();
for
(
int
i
=
0
;
i
<
axis
;
++
i
)
{
rows
*=
dim_0
[
i
];
}
int
out_rows
=
rows
,
out_cols
=
0
;
std
::
vector
<
int
>
input_cols
(
input
.
size
());
for
(
int
i
=
0
;
i
<
num
;
++
i
)
{
int
t_cols
=
input
[
i
].
numel
()
/
rows
;
out_cols
+=
t_cols
;
input_cols
[
i
]
=
t_cols
;
}
// computation
auto
output_data
=
output
.
data
<
T
>
();
int
col_idx
=
0
;
for
(
int
j
=
0
;
j
<
num
;
++
j
)
{
int
col_len
=
input_cols
[
j
];
auto
input_data
=
input
[
j
].
data
<
T
>
();
for
(
int
k
=
0
;
k
<
out_rows
;
++
k
)
{
memcpy
(
output_data
+
k
*
out_cols
+
col_idx
,
input_data
+
k
*
col_len
,
sizeof
(
T
)
*
col_len
);
}
col_idx
+=
col_len
;
}
}
template
<
typename
T
>
int
TestConcatOP
()
{
DDim
inputA_shape
=
make_ddim
({
10
,
4
,
2
,
2
});
DDim
inputB_shape
=
make_ddim
({
20
,
4
,
2
,
2
});
DDim
inputC_shape
=
make_ddim
({
30
,
4
,
2
,
2
});
DDim
inputD_shape
=
make_ddim
({
40
,
4
,
2
,
2
});
DDim
output_shape
=
make_ddim
({
100
,
4
,
2
,
2
});
int
axis_v
=
0
;
VariableNameMap
inputs
;
VariableNameMap
outputs
;
std
::
vector
<
LoDTensor
>
input_tensors
;
auto
scope
=
std
::
make_shared
<
Scope
>
();
inputs
[
"X"
]
=
std
::
vector
<
std
::
string
>
({
"inputA"
,
"inputB"
,
"inputC"
,
"inputD"
});
outputs
[
"Out"
]
=
std
::
vector
<
std
::
string
>
({
"output"
});
auto
inputA_var
=
scope
.
get
()
->
Var
(
"inputA"
);
auto
inputA
=
inputA_var
->
template
GetMutable
<
framework
::
LoDTensor
>();
SetupTensor
<
T
>
(
inputA
,
inputA_shape
,
-
127
,
127
);
input_tensors
.
push_back
(
std
::
move
(
*
inputA
));
auto
inputB_var
=
scope
.
get
()
->
Var
(
"inputB"
);
auto
inputB
=
inputB_var
->
template
GetMutable
<
framework
::
LoDTensor
>();
SetupTensor
<
T
>
(
inputB
,
inputB_shape
,
-
127
,
127
);
input_tensors
.
push_back
(
std
::
move
(
*
inputB
));
auto
inputC_var
=
scope
.
get
()
->
Var
(
"inputC"
);
auto
inputC
=
inputC_var
->
template
GetMutable
<
framework
::
LoDTensor
>();
SetupTensor
<
T
>
(
inputC
,
inputC_shape
,
-
127
,
127
);
input_tensors
.
push_back
(
std
::
move
(
*
inputC
));
auto
inputD_var
=
scope
.
get
()
->
Var
(
"inputD"
);
auto
inputD
=
inputD_var
->
template
GetMutable
<
framework
::
LoDTensor
>();
SetupTensor
<
T
>
(
inputD
,
inputD_shape
,
-
127
,
127
);
input_tensors
.
push_back
(
std
::
move
(
*
inputD
));
auto
output_var
=
scope
.
get
()
->
Var
(
"output"
);
AttributeMap
attrs
;
attrs
[
"axis"
].
Set
<
int
>
(
axis_v
);
auto
*
op
=
new
operators
::
ConcatOp
<
CPU
,
float
>
(
"concat"
,
inputs
,
outputs
,
attrs
,
scope
);
op
->
InferShape
();
op
->
Run
();
auto
output
=
output_var
->
template
Get
<
framework
::
LoDTensor
>();
const
T
*
output_data
=
output
->
data
<
T
>
();
LoDTensor
output_cmp
;
output_cmp
.
mutable_data
<
T
>
(
output_shape
);
concat
<
T
>
(
input_tensors
,
output_cmp
,
axis_v
);
const
T
*
output_cmp_data
=
output_cmp
.
data
<
T
>
();
// compare
int
eq
=
0
;
int
neq
=
0
;
for
(
int
i
=
0
;
i
<
output
->
numel
();
++
i
)
{
PADDLE_MOBILE_ENFORCE
(
output_data
[
i
]
==
output_cmp_data
[
i
],
"The execution of test_concat_op is failed!"
);
if
(
output_data
[
i
]
==
output_cmp_data
[
i
])
{
++
eq
;
}
else
{
++
neq
;
}
}
std
::
cout
<<
"eq = "
<<
eq
<<
", neq = "
<<
neq
<<
std
::
endl
;
delete
op
;
return
0
;
}
}
// namespace paddle_mobile
int
main
()
{
paddle_mobile
::
framework
::
Loader
<
paddle_mobile
::
CPU
>
loader
;
auto
program
=
loader
.
Load
(
g_googlenet
);
PADDLE_MOBILE_ENFORCE
(
program
.
originProgram
!=
nullptr
,
"program file read fail"
);
Executor4Test
<
paddle_mobile
::
CPU
,
paddle_mobile
::
operators
::
ConcatOp
<
paddle_mobile
::
CPU
,
float
>>
executor
(
program
,
"concat"
);
// 1. input_tensors;
vector
<
Tensor
>
input_tensors
;
Tensor
input1
;
auto
input1_data
=
CreateInput
<
float
>
(
&
input1
,
{
4
,
10
,
2
,
2
},
0
,
1
);
input_tensors
.
push_back
(
input1
);
Tensor
input2
;
auto
input2_data
=
CreateInput
<
float
>
(
&
input2
,
{
4
,
20
,
2
,
2
},
0
,
1
);
input_tensors
.
push_back
(
input2
);
Tensor
input3
;
auto
input3_data
=
CreateInput
<
float
>
(
&
input3
,
{
4
,
30
,
2
,
2
},
0
,
1
);
input_tensors
.
push_back
(
input3
);
Tensor
input4
;
auto
input4_data
=
CreateInput
<
float
>
(
&
input4
,
{
4
,
40
,
2
,
2
},
0
,
1
);
input_tensors
.
push_back
(
input4
);
// 2. input_names
vector
<
string
>
input_names
({
"conv2d_3.tmp_1"
,
"conv2d_5.tmp_1"
,
"conv2d_7.tmp_1"
,
"conv2d_8.tmp_1"
,
});
// 3. output_names
vector
<
string
>
output_names
({
"concat_0.tmp_0"
});
// 4. out_dims;
vector
<
DDim
>
out_ddims
;
auto
out_ddim
=
paddle_mobile
::
framework
::
make_ddim
({
3
,
100
,
2
,
2
});
out_ddims
.
push_back
(
out_ddim
);
auto
output
=
executor
.
Predict
<
LoDTensor
>
(
input_tensors
,
input_names
,
output_names
,
out_ddims
);
auto
output0_data
=
output
[
0
]
->
data
<
float
>
();
// 5. test one example.
int
input_n
=
1
;
int
input_c
=
2
;
int
input_h
=
0
;
int
input_w
=
1
;
int
stride0
=
input3
.
numel
()
/
input3
.
dims
()[
0
];
int
stride1
=
input3
.
numel
()
/
input3
.
dims
()[
0
]
/
input3
.
dims
()[
1
];
int
stride2
=
input3
.
dims
()[
3
];
/// inputx1 (4,10,2,2),
/// inputx2 (4,20,2,2),
/// inputx3 (4,30,2,2),
/// inputx4 (4,40,2,2),
/// axis = 1
/// output (4,100,2,2)
int
input_index
=
input_n
*
stride0
+
input_c
*
stride1
+
input_h
*
stride2
+
input_w
;
int
output_index
=
input_n
*
100
*
2
*
2
+
(
input_c
+
input1
.
dims
()[
1
]
+
input2
.
dims
()[
1
])
*
2
*
2
+
input_h
*
2
+
input_w
;
DLOG
<<
" input3 [1, 2,0,1] = "
<<
input3_data
[
input_index
];
DLOG
<<
" output [1,32,0,1] = "
<<
output0_data
[
output_index
];
paddle_mobile
::
PaddleMobile
<
paddle_mobile
::
CPU
>
paddle_mobile
;
paddle_mobile
.
SetThreadNum
(
4
);
paddle_mobile
::
TestConcatOP
<
float
>
();
paddle_mobile
::
TestConcatOP
<
int8_t
>
();
return
0
;
}
test/operators/test_fusion_fc_op.cpp
浏览文件 @
14b944f3
...
...
@@ -18,6 +18,9 @@ limitations under the License. */
#include "../test_include.h"
#include "framework/operator.h"
#include "operators/fusion_fc_op.h"
#ifdef FUSION_FC_INT8_OP
#include "operators/fusion_fc_int8_op.h"
#endif
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录