PaddlePaddle / Paddle-Lite — commit 965fce05 (unverified)

Merge branch 'develop' into add_concat_int8

Authored by xiebaiyuan on Dec 10, 2018; committed via GitHub on Dec 10, 2018.
Parents: 88baf9ca, 9437e287
Showing 22 changed files with 519 additions and 198 deletions (+519 -198).
src/fpga/V1/api.cpp                                              +208  -40
src/fpga/V1/bias_scale.h                                           +0   -2
src/fpga/V1/deconv_bias_scale.h                                    +0   -2
src/fpga/V1/filter.h                                               +0   -3
src/fpga/V1/image.cpp                                             +21   -9
src/fpga/V1/image.h                                                +9   -7
src/fpga/V1/pe.cpp                                                +57 -110
src/fpga/V2/api.cpp                                                +3   -3
src/fpga/V2/api.h                                                  +1   -1
src/fpga/common/driver.cpp                                         +0   -4
src/fpga/common/driver.h                                           +0   -7
src/fpga/common/fpga_common.cpp                                    +2   -1
src/fpga/common/fpga_common.h                                     +66   -1
src/fpga/common/pe.h                                               +1   -0
src/operators/kernel/arm/quantize_kernel.cpp                       +2   -0
src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp            +1   -0
src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp       +1   -0
src/operators/kernel/fpga/V1/softmax_kernel.cpp                   +10   -6
src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp       +1   -1
src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp                  +1   -1
src/operators/math/depthwise_conv3x3_int8_arm64.cpp               +56   -0
src/operators/math/winograd/winograd_transform_f6k3_arm64.cpp     +79   -0
src/fpga/V1/api.cpp

@@ -21,6 +21,9 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace fpga {
 
+#define USE_RELU 1
+#define USE_BIAS 2
+
 int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); }
 
 void format_image(framework::Tensor *image_tensor) {

@@ -172,6 +175,170 @@ void format_concat_output(framework::Tensor *out, int height, int width,
   out->reset_data_ptr(data_ptr);
 }
 
+void expand_conv_arg(ConvArgs *arg) {
+  ConvArgs args = *arg;
+  uint64_t filterlen = (uint64_t)args.kernel.width *
+                       (uint64_t)args.kernel.height *
+                       (uint64_t)args.image.channels;
+  filterlen = align_to_x(filterlen, FILTER_ELEMENT_ALIGNMENT);
+  filterlen *= align_to_x((uint64_t)args.filter_num, FILTER_NUM_ALIGNMENT);
+  uint64_t fpga_bias_scale_len =
+      align_to_x(args.filter_num / args.group_num, 8) * args.group_num;
+
+  uint64_t output_height =
+      (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
+          args.kernel.stride_h +
+      1;
+  uint64_t output_width =
+      (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
+          args.kernel.stride_w +
+      1;
+  uint64_t output_size =
+      output_height * output_width * (uint64_t)args.filter_num;
+
+  auto filter_per_group = (uint64_t)(args.filter_num / args.group_num);
+  auto channel_per_group = (uint64_t)(args.image.channels / args.group_num);
+
+  uint64_t image_row_count = ((uint64_t)args.image.width) *
+                             ((uint64_t)args.image.channels);  // without align
+  uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT);
+  uint64_t image_one_pad_per_row =
+      align_to_x(image_row_count, IMAGE_ALIGNMENT) +
+      ((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels);
+  uint64_t filter_amount_all =
+      align_to_x(((uint64_t)args.kernel.height) *
+                     ((uint64_t)args.kernel.width) * channel_per_group,
+                 FILTER_ELEMENT_ALIGNMENT);
+
+  uint64_t output_amount_per_row =
+      align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGNMENT);
+
+  // find the opt partition strategy
+  uint64_t res_win;
+  uint64_t res_fit = 0;
+  for (res_win = 1; res_win <= output_width; res_win = res_win + 1) {
+    if ((align_to_x(
+             (args.image.channels *
+              (args.kernel.width + (res_win - 1) * args.kernel.stride_w)),
+             IMAGE_ALIGNMENT) /
+             16 +
+         1) *
+            args.kernel.height >
+        2048) {
+      break;
+    }
+  }
+
+  if (res_win != output_width) {
+    res_win -= 1;
+  }
+
+  if (((res_win % 2) != 0) && (res_win != 1)) {
+    res_win = res_win - 1;
+  }
+  res_fit = res_win;
+
+  uint64_t block_num = (output_width + res_fit - 1) / res_fit;
+  uint64_t block_len = res_fit;
+  uint64_t block_last = output_width - res_fit * (block_num - 1);
+
+  uint64_t res_amount_per_row = output_width * args.filter_num;
+  uint64_t res_amount_per_row_pad = output_amount_per_row - res_amount_per_row;
+
+  uint64_t image_block_amount_per_row =
+      args.kernel.stride_w * (res_fit) * args.image.channels;
+  uint64_t filter_pad_width_mul_channel =
+      args.image.pad_width * args.image.channels;
+  uint64_t image_amount_per_row_multi_win_first =
+      image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height);
+  uint64_t image_amount_per_row_multi_win =
+      image_amount_per_row * (4 * args.kernel.stride_h);
+
+  uint64_t image_block_num = block_num;
+  uint64_t image_block_len =
+      align_to_x((args.image.channels *
+                  (args.kernel.width + (block_len - 1) * args.kernel.stride_w)),
+                 IMAGE_ALIGNMENT) /
+          16 +
+      1;
+  uint64_t image_block_len_last =
+      align_to_x(
+          (args.image.channels *
+           (args.kernel.width + (block_last - 1) * args.kernel.stride_w)),
+          IMAGE_ALIGNMENT) /
+          16 +
+      1;
+  uint64_t image_win_cnt = block_len;
+  uint64_t image_win_cnt_last = block_last;
+  uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8;
+  uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1;
+  if (prog_full_cnt == 1023) {
+    prog_full_cnt--;
+  }
+  uint64_t post_prog_full_cnt =
+      (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2)
+          ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2)
+          : 0;
+
+  uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
+
+  (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address);
+  (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
+  (*arg).driver.filter_address_phy = vaddr_to_paddr(args.filter_address);
+  (*arg).driver.output_address_phy = vaddr_to_paddr(args.output.address);
+  (*arg).driver.output_height = output_height;
+  (*arg).driver.output_width = output_width;
+  (*arg).driver.filter_per_group = filter_per_group;
+  (*arg).driver.channel_per_group = channel_per_group;
+  (*arg).driver.image_amount_per_row = image_amount_per_row;
+  (*arg).driver.image_one_pad_per_row = image_one_pad_per_row;
+  (*arg).driver.filter_amount_all = filter_amount_all;
+  (*arg).driver.output_amount_per_row = output_amount_per_row;
+  (*arg).driver.image_block_amount_per_row = image_block_amount_per_row;
+  (*arg).driver.filter_pad_width_mul_channel = filter_pad_width_mul_channel;
+  (*arg).driver.image_amount_per_row_multi_win_first =
+      image_amount_per_row_multi_win_first;
+  (*arg).driver.image_amount_per_row_multi_win = image_amount_per_row_multi_win;
+  (*arg).driver.image_block_num = image_block_num;
+  (*arg).driver.image_block_len = image_block_len;
+  (*arg).driver.image_block_len_last = image_block_len_last;
+  (*arg).driver.image_win_cnt = image_win_cnt;
+  (*arg).driver.image_win_cnt_last = image_win_cnt_last;
+  (*arg).driver.res_row_data_align4_pad = res_row_data_align4_pad;
+  (*arg).driver.prog_full_cnt = prog_full_cnt;
+  (*arg).driver.post_prog_full_cnt = post_prog_full_cnt;
+  (*arg).driver.fpga_bias_scale_len = fpga_bias_scale_len;
+  (*arg).driver.cmd = cmd;
+}  // expand_conv_arg()
+
+void expand_EW_arg(EWAddArgs *arg) {
+  EWAddArgs args = *arg;
+  uint64_t cmd = args.relu_enabled ? USE_RELU : 0;
+  uint64_t datalen = (uint64_t)args.image0.width *
+                     (uint64_t)args.image0.height *
+                     (uint64_t)args.image0.channels;
+  uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1;
+  uint64_t image0_address_phy = vaddr_to_paddr(args.image0.address);
+  uint64_t image1_address_phy = vaddr_to_paddr(args.image1.address);
+  uint64_t output_address_phy = vaddr_to_paddr(args.output.address);
+
+  uint64_t image_amount_per_row = align_to_x(
+      (uint64_t)args.image0.width * (uint64_t)args.image0.channels,
+      IMAGE_ALIGNMENT);
+  uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
+                               ((uint64_t)args.image0.width << 16) |
+                               (uint64_t)args.image0.height;
+
+  (*arg).driver.image0_address_phy = image0_address_phy;
+  (*arg).driver.image1_address_phy = image1_address_phy;
+  (*arg).driver.datalen = datalen;
+  (*arg).driver.image_image_pixel = image_image_pixel;
+  (*arg).driver.image_amount_per_row = image_amount_per_row;
+  (*arg).driver.output_address_phy = output_address_phy;
+  (*arg).driver.coefficient = coefficient;
+  (*arg).driver.cmd = cmd;
+}  // expand_EW_arg
+
 void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
                     framework::Tensor *out, framework::Tensor *filter,
                     bool relu_enabled, int group_num, int stride_h,

@@ -206,7 +373,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
   auto channel = (int)out->dims()[1];  // NOLINT
   int filter_num_per_div = get_filter_num_per_div(filter, group_num);
   int element_num = get_aligned_filter_element_num(
-      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
+      (int)(filter->dims()[1] * filter->dims()[2] * filter->dims()[3]));
 
   for (int i = 0; i < n; i++) {
     arg->conv_arg[i].relu_enabled = relu_enabled;

@@ -223,24 +390,23 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
     arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
     arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
     arg->conv_arg[i].filter_scale_address = filter->scale;
     // arg->conv_arg[i].filter_address = &(
     //    (int8_t *)filter_ptr)[i * element_num * filter_num_per_div];  // NOLINT
     // arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
     arg->conv_arg[i].filter_num = (uint32_t)(
         i == n - 1 ? channel - (n - 1) * filter_num_per_div  // NOLINT
                    : filter_num_per_div);
 
-    size_t filter_size =
-        element_num * arg->conv_arg[i].filter_num * sizeof(int8_t);
+    size_t filter_size =
+        element_num *
+        align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) *
+        sizeof(int8_t);
     auto filter_head =
         &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
     arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
     memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
     fpga_flush(arg->conv_arg[i].filter_address, filter_size);
 
-    size_t bs_size = 2 * arg->conv_arg[i].filter_num * sizeof(float);
+    size_t bs_size =
+        2 * align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) *
+        sizeof(float);
     auto bs_head = &bs_ptr[i * filter_num_per_div * 2];
     arg->conv_arg[i].sb_address = fpga_malloc(bs_size);
     memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);

@@ -249,9 +415,9 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
     if (n > 1) {
       arg->conv_arg[i].output.scale_address =
           (float *)fpga_malloc(2 * sizeof(float));  // NOLINT
-      arg->conv_arg[i].output.address =
-          fpga_malloc(out->dims()[2] *
-                      align_to_x(out->dims()[3] * arg->conv_arg[i].filter_num,
-                                 IMAGE_ALIGNMENT) *
-                      sizeof(half));
+      arg->conv_arg[i].output.address = fpga_malloc(
+          out->dims()[2] *
+          align_to_x((int)(out->dims()[3] * arg->conv_arg[i].filter_num),
+                     IMAGE_ALIGNMENT) *
+          sizeof(half));
     } else {

@@ -263,10 +429,13 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
         (half *)arg->conv_arg[i].output.address;  // NOLINT
     arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
     arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
+
+    expand_conv_arg(&arg->conv_arg[i]);
   }
 
   filter->reset_data_ptr(nullptr);
   fpga_free(bs_ptr);
-}
+}  // fill_split_arg
 
 void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
                      framework::Tensor *out, framework::Tensor *filter,
                      bool relu_enabled, int group_num, int stride_h,

@@ -277,28 +446,27 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
   auto out_ptr = out->data<float>();
 
   arg->group_num = (uint32_t)group_num;
-  arg->sub_conv_num = stride_h;
+  arg->sub_conv_num = (uint32_t)stride_h;
   arg->filter_num = (uint32_t)filter->dims()[0];
   int sub_conv_num = arg->sub_conv_num;
   int sub_stride = 1;
-  int sub_pad = deconv_filter::deconv_calc_sub_pad(filter->dims()[3],
-                                                   padding_w, stride_w);
-  int sub_filter_width =
-      deconv_filter::deconv_get_sub_filter_axis(filter->dims()[3], stride_w);
-
-  int sub_output_width = deconv_filter::deconv_get_sub_out_axis(
-      input->dims()[3], sub_pad, sub_filter_width);
-  int sub_output_height = deconv_filter::deconv_get_sub_out_axis(
-      input->dims()[2], sub_pad, sub_filter_width);
-
-  arg->sub_output_width = sub_output_width;
-  arg->sub_output_height = sub_output_height;
-  arg->omit_size =
-      deconv_filter::deconv_get_omit(stride_w, filter->dims()[3], padding_w);
+  int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3],
+                                                   padding_w, stride_w);
+  int sub_filter_width = deconv_filter::deconv_get_sub_filter_axis(
+      (int)filter->dims()[3], stride_w);
+
+  int sub_output_width = deconv_filter::deconv_get_sub_out_axis(
+      (int)input->dims()[3], sub_pad, sub_filter_width);
+  int sub_output_height = deconv_filter::deconv_get_sub_out_axis(
+      (int)input->dims()[2], sub_pad, sub_filter_width);
+
+  arg->sub_output_width = (uint32_t)sub_output_width;
+  arg->sub_output_height = (uint32_t)sub_output_height;
+  arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit(
+      stride_w, (int)filter->dims()[3], padding_w);
 
   arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs));
-  int sub_channels = (int32_t)input->dims()[1];
+  int sub_channels = (int)input->dims()[1];
   int omit_size = arg->omit_size;
   int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
   int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size;

@@ -318,42 +486,41 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
   for (int i = 0; i < sub_conv_num; ++i) {
     arg->conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num);
-    arg->conv_args[i].group_num = group_num;
+    arg->conv_args[i].group_num = (uint32_t)group_num;
     arg->conv_args[i].filter_scale_address = filter->scale;
     arg->conv_args[i].relu_enabled = relu_enabled;
-    arg->conv_args[i].kernel.width = sub_filter_width;
-    arg->conv_args[i].kernel.height = sub_filter_width;
+    arg->conv_args[i].kernel.width = (uint32_t)sub_filter_width;
+    arg->conv_args[i].kernel.height = (uint32_t)sub_filter_width;
     arg->conv_args[i].kernel.stride_w = 1;
     arg->conv_args[i].kernel.stride_h = 1;
 
     // DeconvParam.conv_args[i].image.address = (void*)ptr_image;
     arg->conv_args[i].image.scale_address = input->scale;
-    arg->conv_args[i].image.channels = sub_channels;
+    arg->conv_args[i].image.channels = (uint32_t)sub_channels;
     arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
     arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
-    arg->conv_args[i].image.pad_width = sub_pad;
-    arg->conv_args[i].image.pad_height = sub_pad;
+    arg->conv_args[i].image.pad_width = (uint32_t)sub_pad;
+    arg->conv_args[i].image.pad_height = (uint32_t)sub_pad;
     arg->conv_args[i].image.address = input_ptr;
     arg->conv_args[i].sb_address = (void *)bs_ptr;
 
-    char *filter_sub_space =
+    auto filter_sub_space =
         (char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char));
     fpga_copy(filter_sub_space,
               (char *)filter_ptr + i * align_conv_sub_filter_count,
-              align_conv_sub_filter_count);
+              (size_t)align_conv_sub_filter_count);
     arg->conv_args[i].filter_address = (void *)(filter_sub_space);
-    fpga_flush(filter_sub_space, align_conv_sub_filter_count);
+    fpga_flush(filter_sub_space, (size_t)align_conv_sub_filter_count);
 
     if (sub_conv_num == 1) {
       arg->conv_args[i].output.address = out_ptr;
       arg->conv_args[i].output.scale_address = out->scale;
     } else {
-      half *ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
+      auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
       arg->conv_args[i].output.address = (void *)((half *)ptr_output);
-      float *ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
+      auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
       arg->conv_args[i].output.scale_address = ptr_output_scale;
     }
   }

@@ -361,6 +528,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
   arg->output.address = out_ptr;
   arg->output.scale_address = out->scale;
-}
+  // fpga_free(filter_ptr);
+}  // fill_deconv_arg
 
 }  // namespace fpga
 }  // namespace paddle_mobile
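Editor's note: align_to_x() is called throughout the hunks above but is not defined in this diff. A minimal sketch of the round-up-to-multiple behavior its call sites assume is shown below; the name is real, but the exact signature and placement of the helper are assumptions, not part of this commit.

// Hypothetical stand-in for the fpga align_to_x helper used above:
// round n up to the nearest multiple of x, e.g. align_to_x(20, 16) == 32.
#include <cstdint>
inline uint64_t align_to_x(uint64_t n, uint64_t x) { return (n + x - 1) / x * x; }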
src/fpga/V1/bias_scale.h

@@ -14,8 +14,6 @@ limitations under the License. */
 #pragma once
 
-#define BS_NUM_ALIGNMENT 8
-
 namespace paddle_mobile {
 namespace fpga {
 namespace bias_scale {
src/fpga/V1/deconv_bias_scale.h

@@ -14,8 +14,6 @@ limitations under the License. */
 #pragma once
 
-#define BS_NUM_ALIGNMENT 8
-
 namespace paddle_mobile {
 namespace fpga {
 namespace deconv_bias_scale {
src/fpga/V1/filter.h

@@ -14,9 +14,6 @@ limitations under the License. */
 #pragma once
 
-#define FILTER_NUM_ALIGNMENT 32      // Filter number aligned to 32
-#define FILTER_ELEMENT_ALIGNMENT 16  // Filter element number aligned to 16
-
 namespace paddle_mobile {
 namespace fpga {
 namespace filter {
src/fpga/V1/image.cpp

@@ -111,25 +111,37 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
   fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t));
 }
 
-void split_image(int16_t *image_in, float *scale_in, void **images_out,
-                 float **scales_out, int image_num, uint32_t *channel_nums,
-                 int height, int width) {
+void split_image(int16_t *image_in, const float *scale_in, void **images_out,
+                 float **scales_out, int image_num,
+                 const uint32_t *channel_nums, int height, int width) {
   int total_channel = 0;
   for (int i = 0; i < image_num; i++) {
     scales_out[i][0] = scale_in[0];
     scales_out[i][1] = scale_in[1];
     total_channel += channel_nums[i];
   }
+  int element_num = height * align_to_x(width * total_channel, IMAGE_ALIGNMENT);
+  fpga_invalidate(image_in, element_num * sizeof(int16_t));
 
+  int src_offset = 0, des_offset = 0;
   for (int h = 0; h < height; h++) {
-    int src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT);
     for (int w = 0; w < width; w++) {
+      src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT) +
+                   w * total_channel;
       for (int i = 0; i < image_num; i++) {
-        int des_offset =
-            h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT);
+        des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) +
+                     w * channel_nums[i];
         memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset,
                channel_nums[i] * sizeof(int16_t));
         src_offset += channel_nums[i];
       }
     }
   }
+
+  for (int i = 0; i < image_num; i++) {
+    element_num = height * align_to_x(width * channel_nums[i], IMAGE_ALIGNMENT);
+    fpga_flush(images_out[i], element_num * sizeof(int16_t));
+  }
 }
 
 }  // namespace image
src/fpga/V1/image.h

@@ -14,9 +14,8 @@ limitations under the License. */
 #pragma once
 
-#include <stdint.h>
+#include <cstdint>
 
 #define IMAGE_ALIGNMENT 16  // Aligned to 16
 namespace paddle_mobile {
 namespace fpga {
 namespace image {

@@ -24,13 +23,16 @@ namespace image {
 void convert_to_hwc(float **data_in, int channel, int height, int width);
 void align_element_conv(float **data_in, int height, int cw);
 void format_image(float **data_in, int channel, int height, int width);
 // Concat featuremaps along channel direction
 void concat_images(int16_t **images_in, float **scales_in, void *image_out,
                    float *scale_out, int image_num, uint32_t *channel_num,
                    int height, int width);
-// Concat featuremaps along channel direction
-void split_image(int16_t *image_in, float *scale_in, void **images_out,
-                 float **scales_out, int image_num, uint32_t *channel_nums,
-                 int height, int width);
+// Split featuremap along channel direction
+void split_image(int16_t *image_in, const float *scale_in, void **images_out,
+                 float **scales_out, int image_num,
+                 const uint32_t *channel_nums, int height, int width);
 
 }  // namespace image
 }  // namespace fpga
 }  // namespace paddle_mobile
src/fpga/V1/pe.cpp

@@ -203,29 +203,11 @@ int ComputeBasicConv(const struct ConvArgs &args) {
   DLOG << "   out_address:" << args.output.address
        << "   out_scale_address:" << args.output.scale_address;
 #endif
-  cout << "   relu_enabled:" << args.relu_enabled
-       << "   sb_address:" << args.sb_address
-       << "   filter_address:" << args.filter_address
-       << "   filter_num:" << args.filter_num
-       << "   group_num:" << args.group_num;
-  cout << "   image_address:" << args.image.address
-       << "   image_scale_address:" << args.image.scale_address
-       << "   image_channels:" << args.image.channels
-       << "   image_height:" << args.image.height
-       << "   image_width:" << args.image.width
-       << "   pad_height:" << args.image.pad_height
-       << "   pad_width:" << args.image.pad_width;
-  cout << "   kernel_height:" << args.kernel.height
-       << "   kernel_width:" << args.kernel.width
-       << "   stride_h:" << args.kernel.stride_h
-       << "   stride_w:" << args.kernel.stride_w;
-  cout << "   out_address:" << args.output.address
-       << "   out_scale_address:" << args.output.scale_address;
 #ifdef PADDLE_MOBILE_ZU5
   DLOG << "Conv";
-  // return 0;
-  uint64_t timer_cnt;
+  int ret = 0;
+  uint64_t output_scale = 0;
+  /*
   uint64_t output_scale;
   uint64_t image_scale;
   uint64_t filter_scale;

@@ -233,14 +215,10 @@ int ComputeBasicConv(const struct ConvArgs &args) {
   uint64_t sb_address_phy = 0;
   uint64_t filter_address_phy = 0;
   uint64_t output_address_phy = 0;
-  int ret = 0;
 
   fpga_copy(&image_scale, args.image.scale_address, 2 * sizeof(float));
   fpga_copy(&filter_scale, args.filter_scale_address, 2 * sizeof(float));
-  cout << "image_scale :" << hex << (image_scale) << endl;
-  cout << "filter_scale :" << hex << (filter_scale) << endl;
 
   uint64_t filterlen = (uint64_t)args.kernel.width *
                        (uint64_t)args.kernel.height *
                        (uint64_t)args.image.channels;

@@ -349,8 +327,8 @@ int ComputeBasicConv(const struct ConvArgs &args) {
   filter_address_phy = vaddr_to_paddr(args.filter_address);
   output_address_phy = vaddr_to_paddr(args.output.address);
 
-  /*SDK刷Cache保证数据一致性*/
   uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
+  */
 
   pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
   if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) {

@@ -359,78 +337,63 @@ int ComputeBasicConv(const struct ConvArgs &args) {
     pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
     return ret;
   }
 
   /*restart scale*/
   reg_writeq(output_scale, REG_SCALE_PARAMETER);
 
-  reg_writeq(image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
-  reg_writeq(filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
-  reg_writeq(sb_address_phy, REG_CONV_SB_BASE_ADDR);
-  reg_writeq(output_address_phy, REG_CONV_RESULT_BASE_ADDR);
   reg_writeq(
       ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
       REG_CONV_IMAGE_PIXEL);
   reg_writeq(
       ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
      REG_CONV_FILTER_PIXEL);
-  reg_writeq(output_height | (output_width << 32), REG_CONV_RESULT_PIXEL);
+  reg_writeq(args.driver.output_height | (args.driver.output_width << 32),
+             REG_CONV_RESULT_PIXEL);
   reg_writeq(((uint64_t)args.image.pad_height) |
                  (((uint64_t)args.image.pad_width) << 32),
             REG_CONV_PAD_PIXEL);
   reg_writeq(((uint64_t)args.kernel.stride_h) |
                  (((uint64_t)args.kernel.stride_w) << 32),
             REG_CONV_STEP_PIXEL);
   reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER);
   reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER);
   reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER);
-  reg_writeq(filter_per_group, REG_CONV_FILTER_PER_GROUP);
-  reg_writeq(channel_per_group, REG_CONV_CHANNEL_PER_GROUP);
-  reg_writeq(image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW);
-  reg_writeq(image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW);
-  reg_writeq(filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL);
-  reg_writeq(output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW);
-  reg_writeq(image_block_amount_per_row, 0xca8);
-  reg_writeq(filter_pad_width_mul_channel, 0xcb0);
-  reg_writeq(image_amount_per_row_multi_win_first, 0xcb8);
-  reg_writeq(image_amount_per_row_multi_win, 0xcc0);
-  reg_writeq(image_block_num, 0xcc8);
-  reg_writeq(image_block_len, 0xcd0);
-  reg_writeq(image_block_len_last, 0xcd8);
-  reg_writeq(image_win_cnt, 0xce0);
-  reg_writeq(image_win_cnt_last, 0xce8);
-  reg_writeq(res_row_data_align4_pad, 0xcf8);
-  reg_writeq(prog_full_cnt, 0xd08);
-  reg_writeq(post_prog_full_cnt, 0xd10);
-  reg_writeq(fpga_bias_scale_len / 4, 0xd20);
-
-  /*write scale*/
-  reg_writeq(image_scale, REG_CONV_IMAGE_SCALE);
-  reg_writeq(filter_scale, REG_CONV_FILTER_SCALE);
-
-  reg_writeq(cmd, REG_CONV_CMD);
+  reg_writeq(*(uint64_t *)args.image.scale_address, REG_CONV_IMAGE_SCALE);
+  reg_writeq(*(uint64_t *)args.filter_scale_address, REG_CONV_FILTER_SCALE);
+  reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
+  reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
+  reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR);
+  reg_writeq(args.driver.output_address_phy, REG_CONV_RESULT_BASE_ADDR);
+  reg_writeq(args.driver.filter_per_group, REG_CONV_FILTER_PER_GROUP);
+  reg_writeq(args.driver.channel_per_group, REG_CONV_CHANNEL_PER_GROUP);
+  reg_writeq(args.driver.image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW);
+  reg_writeq(args.driver.image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW);
+  reg_writeq(args.driver.filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL);
+  reg_writeq(args.driver.output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW);
+  reg_writeq(args.driver.image_block_amount_per_row, 0xca8);
+  reg_writeq(args.driver.filter_pad_width_mul_channel, 0xcb0);
+  reg_writeq(args.driver.image_amount_per_row_multi_win_first, 0xcb8);
+  reg_writeq(args.driver.image_amount_per_row_multi_win, 0xcc0);
+  reg_writeq(args.driver.image_block_num, 0xcc8);
+  reg_writeq(args.driver.image_block_len, 0xcd0);
+  reg_writeq(args.driver.image_block_len_last, 0xcd8);
+  reg_writeq(args.driver.image_win_cnt, 0xce0);
+  reg_writeq(args.driver.image_win_cnt_last, 0xce8);
+  reg_writeq(args.driver.res_row_data_align4_pad, 0xcf8);
+  reg_writeq(args.driver.prog_full_cnt, 0xd08);
+  reg_writeq(args.driver.post_prog_full_cnt, 0xd10);
+  reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
+  reg_writeq(args.driver.cmd, REG_CONV_CMD);
 
   DLOG << "before reg poll";
   if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
     g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
     ret = -EIO;
     DLOG << "Conv Wait Irq Timeout!";
   }
   DLOG << "after reg poll";
 
   usleep(40);
   /*SDK 无效 Cache保证数据一致性*/
   output_scale = reg_readq(REG_SCALE_PARAMETER);
   output_scale = (output_scale << 32) | (output_scale >> 32);
   fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
-  cout << "output_scale :" << hex << (output_scale) << endl;
   //*(args.output.scale_address) = output_scale;
 
   pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
   return ret;

@@ -575,9 +538,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
     DLOG << "Pooling Wait Irq Timeout!";
   }
   DLOG << "after reg poll";
 
-  usleep(40);
-  /*SDK 无效 Cache保证数据一致性*/
-  // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
   output_scale = reg_readq(REG_SCALE_PARAMETER);

@@ -615,11 +575,9 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
        << "   out_scale_address:" << args.output.scale_address;
 #endif
 #ifdef PADDLE_MOBILE_ZU5
   DLOG << "Conv";
-  // return 0;
   int ret = 0;
   uint64_t output_scale = 0;
-  uint64_t timer_cnt = 0;
+  /*
   uint64_t timer_cnt = 0;
   uint64_t image0_address_phy = 0;
   uint64_t image1_address_phy = 0;
   uint64_t output_address_phy = 0;

@@ -629,15 +587,6 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
                               (uint64_t)args.image0.height *
                               (uint64_t)args.image0.channels;
   uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1;
-
-  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
-  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
-    ret = -EIO;
-    DLOG << "Conv Status Error!";
-    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
-    return ret;
-  }
-
   image0_address_phy = vaddr_to_paddr(args.image0.address);
   image1_address_phy = vaddr_to_paddr(args.image1.address);
   output_address_phy = vaddr_to_paddr(args.output.address);

@@ -647,36 +596,35 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
                                                 IMAGE_ALIGN);
   uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
                                ((uint64_t)args.image0.width << 16) |
                                (uint64_t)args.image0.height;
+  */
   /*SDK刷Cache保证数据一致性*/
+  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
+  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) {
+    ret = -EIO;
+    DLOG << "EW Status Error!";
+    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+    return ret;
+  }
 
   /*restart scale*/
   reg_writeq(output_scale, REG_SCALE_PARAMETER);
 
-  reg_writeq(image0_address_phy, REG_EW_IMAGE0_BASE_ADDR);
-  reg_writeq(image1_address_phy, REG_EW_IMAGE1_BASE_ADDR);
-  reg_writeq(datalen, REG_EW_DATA_LEN);
-  reg_writeq(image_image_pixel, REG_EW_IMAGE_PIXEL);
-  reg_writeq(image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW);
-  reg_writeq(output_address_phy, REG_EW_RESULT_BASE_ADDR);
-  reg_writeq(coefficient, REG_EW_COEFFICIENT);
-  reg_writeq(cmd, REG_EW_CMD);
+  reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR);
+  reg_writeq(args.driver.image1_address_phy, REG_EW_IMAGE1_BASE_ADDR);
+  reg_writeq(args.driver.datalen, REG_EW_DATA_LEN);
+  reg_writeq(args.driver.image_image_pixel, REG_EW_IMAGE_PIXEL);
+  reg_writeq(args.driver.image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW);
+  reg_writeq(args.driver.output_address_phy, REG_EW_RESULT_BASE_ADDR);
+  reg_writeq(args.driver.coefficient, REG_EW_COEFFICIENT);
+  reg_writeq(args.driver.cmd, REG_EW_CMD);
 
   if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
-    g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR;
+    g_fpgainfo.pe_data->pes[PE_IDX_EW]->status = ERROR;
     ret = -EIO;
     DLOG << "EW Wait Irq Timeout!";
   }
   usleep(40);
   /*SDK 无效 Cache保证数据一致性*/
   output_scale = reg_readq(REG_SCALE_PARAMETER);
   output_scale = (output_scale << 32) | (output_scale >> 32);
   fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
   //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
   //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
   pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);

@@ -802,9 +750,7 @@ int PerformBypass(const struct BypassArgs &args) {
     DLOG << "BYPASS Wait Irq Timeout!";
   }
   DLOG << "after reg poll";
 
-  usleep(40);
-  /*SDK 无效 Cache保证数据一致性*/
   output_scale = reg_readq(REG_SCALE_PARAMETER);
   output_scale = (output_scale << 32) | (output_scale >> 32);
   fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);

@@ -883,8 +829,9 @@ void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel,
   *data_in = ptr_deconv;
   fpga_free(ptr_tmp);
 }
+
 int ComputeFpgaDeconv(const struct DeconvArgs &args) {
-#ifdef FPGA_TEST_MODE
+#ifdef FPGA_PRINT_MODE
   DLOG << "=============ComputeFPGADeConv===========";
   DLOG << "   filter_num:" << args.filter_num
        << "   group_num:" << args.group_num
src/fpga/V2/api.cpp

@@ -146,11 +146,11 @@ int format_conv_data(framework::Tensor *filter_tensor,
 }
 
 int format_fc_data(framework::Tensor *filter_tensor,
-                   framework::Tensor *ofm_tensor, float *bs_ptr) {
+                   framework::Tensor *ofm_tensor, float **bs_ptr) {
   float max_value = fpga::filter_find_max(filter_tensor);
   fpga::format_fc_filter(filter_tensor, max_value);
   int aligned_num = get_aligned_filter_num(filter_tensor);
-  fpga::format_bias_scale_array(&bs_ptr,
+  fpga::format_bias_scale_array(bs_ptr,
                                 (int)filter_tensor->dims()[0],  // NOLINT
                                 aligned_num);
   int aligned_channel = fpga::get_conv_output_channel(filter_tensor);

@@ -214,7 +214,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
     arg->conv_arg[i].output.scale_address = out->scale;
 
     int num_after_alignment = filter::calc_aligned_num(
-        (int)input->dims()[1], arg->filter_num);  // NOLINT
+        arg->filter_num, (int)input->dims()[1]);  // NOLINT
     arg->conv_arg[i].free_space =
         fpga_malloc(num_after_alignment * 2 * sizeof(half));
   }
src/fpga/V2/api.h

@@ -41,7 +41,7 @@ void format_concat_output(framework::Tensor* out, int height, int width,
 int format_conv_data(framework::Tensor* filter_tensor,
                      framework::Tensor* ofm_tensor, float** bs_ptr, int group);
 int format_fc_data(framework::Tensor* filter_tensor,
-                   framework::Tensor* ofm_tensor, float* bs_ptr);
+                   framework::Tensor* ofm_tensor, float** bs_ptr);
 void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
                     framework::Tensor* out, framework::Tensor* filter,
                     bool relu_enabled, int group_num, int stride_h,
src/fpga/common/driver.cpp

@@ -137,8 +137,6 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
   for (i = 0; i < timeout; i++) {
     if (val == reg_readq(reg)) {
-      std::cout << "fpga_regpoll:" << i << "val:" << val << "reg:" << reg
-                << std::endl;
       break;
     }
   }

@@ -401,8 +399,6 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) {
   DLOG << "dest:" << dest << " src:" << src << " size:" << num;
 
   for (i = 0; i < num; i++) {
-    // DLOG << "i:" << i << " val:" << *((int8_t *)src + i);
-    // usleep(1);
     *((int8_t *)dest + i) = *((int8_t *)src + i);  // NOLINT
   }
src/fpga/common/driver.h

@@ -103,22 +103,15 @@ struct FPGA_INFO {
 extern struct FPGA_INFO g_fpgainfo;
 
 inline uint64_t reg_readq(uint32_t offset) {
-  // DLOG << "offset : " << offset;
   uint64_t value =
       *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +  // NOLINT
                              offset);  // NOLINT
-  // DLOG << "read end";
-  usleep(10);
   return value;
 }
 
 inline void reg_writeq(uint64_t value, uint32_t offset) {
-  // DLOG << "offset : " << offset << ", value : " << value;
   *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +  // NOLINT
                          offset) = value;
-  // DLOG << "write end";
-  usleep(10);
 }
 
 int open_device_driver();
src/fpga/common/fpga_common.cpp

@@ -92,7 +92,8 @@ void fpga_free(void *ptr) {
 }
 
 void fpga_copy(void *dest, const void *src, size_t num) {
 #ifdef PADDLE_MOBILE_ZU5
-  driver::fpga_copy_driver(dest, src, num);
+  // driver::fpga_copy_driver(dest, src, num);
+  memcpy(dest, src, num);
 #else
   memcpy(dest, src, num);
 #endif
src/fpga/common/fpga_common.h

@@ -20,6 +20,13 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace fpga {
 
+#ifdef PADDLE_MOBILE_FPGA_V1
+#define IMAGE_ALIGNMENT 16           // Aligned to 16
+#define FILTER_NUM_ALIGNMENT 32      // Filter number aligned to 32
+#define FILTER_ELEMENT_ALIGNMENT 16  // Filter element number aligned to 16
+#define BS_NUM_ALIGNMENT 8
+#endif
+
 enum DataType {
   DATA_TYPE_FP32 = 1,
   DATA_TYPE_FP16 = 0,

@@ -52,19 +59,70 @@ struct ImageOutputArgs {
   float* scale_address;  // output scale address;
   uint64_t timer_cnt;    // time counter for FPGA computation
 };
 
+#ifdef PADDLE_MOBILE_FPGA_V1
+struct ConvDriverParam {
+  uint64_t image_address_phy;
+  uint64_t filter_address_phy;
+  uint64_t sb_address_phy;
+  uint64_t output_address_phy;
+
+  uint64_t output_height;
+  uint64_t output_width;
+  uint64_t filter_per_group;
+  uint64_t channel_per_group;
+
+  uint64_t image_amount_per_row;
+  uint64_t image_one_pad_per_row;
+  uint64_t filter_amount_all;
+  uint64_t output_amount_per_row;
+
+  uint64_t image_block_amount_per_row;
+  uint64_t filter_pad_width_mul_channel;
+  uint64_t image_amount_per_row_multi_win_first;
+  uint64_t image_amount_per_row_multi_win;
+  uint64_t image_block_num;
+  uint64_t image_block_len;
+  uint64_t image_block_len_last;
+  uint64_t image_win_cnt;
+  uint64_t image_win_cnt_last;
+  uint64_t res_row_data_align4_pad;
+  uint64_t prog_full_cnt;
+  uint64_t post_prog_full_cnt;
+  uint64_t fpga_bias_scale_len;
+  uint64_t cmd;
+};
+
+struct EWAddDriverParam {
+  uint64_t image0_address_phy;
+  uint64_t image1_address_phy;
+  uint64_t datalen;
+  uint64_t image_image_pixel;
+  uint64_t image_amount_per_row;
+  uint64_t output_address_phy;
+  uint64_t coefficient;
+  uint64_t cmd;
+};
+#endif
+
 struct ConvArgs {
   bool relu_enabled;
   void* sb_address;  // scale and bias
   void* filter_address;
   float* filter_scale_address;
-  void* free_space;  // used by FPGA logic
   uint32_t filter_num;
   uint32_t group_num;
 
   struct KernelArgs kernel;
   struct ImageInputArgs image;  // input image;
   struct ImageOutputArgs output;
+#ifdef PADDLE_MOBILE_FPGA_V2
+  void* free_space;  // used by FPGA logic
+#endif
+#ifdef PADDLE_MOBILE_FPGA_V1
+  struct ConvDriverParam driver;
+#endif
 };
 
 struct ConcatArgs {

@@ -115,6 +173,9 @@ struct EWAddArgs {
   struct ImageInputArgs image0;
   struct ImageInputArgs image1;
   struct ImageOutputArgs output;
+#ifdef PADDLE_MOBILE_FPGA_V1
+  struct EWAddDriverParam driver;
+#endif
 };
 
 struct BypassArgs {

@@ -150,5 +211,9 @@ void fpga_copy(void* dest, const void* src, size_t num);
 int fpga_flush(void* address, size_t size);
 int fpga_invalidate(void* address, size_t size);
 uint64_t vaddr_to_paddr(void* address);
+
+void expand_conv_arg(ConvArgs* arg);
+void expand_EW_arg(EWAddArgs* arg);
+
 }  // namespace fpga
 }  // namespace paddle_mobile
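Editor's note: the packed 64-bit fields added to EWAddDriverParam above follow the layout built by expand_EW_arg in src/fpga/V1/api.cpp (coefficient holds const0 in the high 32 bits and const1 in the low 32; image_image_pixel packs channels << 32 | width << 16 | height). The self-contained check below only restates that packing; the concrete values are arbitrary examples, not from the commit.

#include <cassert>
#include <cstdint>

int main() {
  // Pack exactly as expand_EW_arg does; values are illustrative only.
  uint32_t const0 = 0x3C00, const1 = 0x3800;
  uint64_t coefficient = (uint64_t)const0 << 32 | (uint64_t)const1;

  uint32_t channels = 64, width = 112, height = 112;
  uint64_t image_image_pixel =
      ((uint64_t)channels << 32) | ((uint64_t)width << 16) | (uint64_t)height;

  // Unpack and verify the field boundaries (width/height must fit in 16 bits).
  assert((uint32_t)(coefficient >> 32) == const0);
  assert((uint32_t)(coefficient & 0xFFFFFFFFu) == const1);
  assert((uint32_t)(image_image_pixel >> 32) == channels);
  assert((uint16_t)((image_image_pixel >> 16) & 0xFFFFu) == width);
  assert((uint16_t)(image_image_pixel & 0xFFFFu) == height);
  return 0;
}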
src/fpga/common/pe.h

@@ -26,6 +26,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs& args);
 int ComputeFpgaConv(const struct SplitConvArgs& args);
 int ComputeFPGAConcat(const struct ConcatArgs& args);
+int ComputeFPGASplit(const struct SplitArgs& args);
 int ComputeFpgaDeconv(const struct DeconvArgs& args);
 
 }  // namespace fpga
 }  // namespace paddle_mobile
src/operators/kernel/arm/quantize_kernel.cpp

@@ -19,10 +19,12 @@ limitations under the License. */
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 #include <arm_neon.h>
 #endif
 
 namespace paddle_mobile {
 namespace operators {
 
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#ifndef __aarch64__
 inline float32_t vmaxvq_f32(float32x4_t r) {
   float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r));
src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp

@@ -49,6 +49,7 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
   ewaddArgs.image1.pad_width = 0;
   ewaddArgs.output.scale_address = out->scale;
   ewaddArgs.output.address = out_ptr;
+  fpga::expand_EW_arg(&ewaddArgs);
   param->SetFpgaArgs(ewaddArgs);
   return true;
 }
src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp

@@ -50,6 +50,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
   ewaddArgs.image1.pad_width = 0;
   ewaddArgs.output.scale_address = out->scale;
   ewaddArgs.output.address = out_ptr;
+  fpga::expand_EW_arg(&ewaddArgs);
   param->SetFpgaArgs(ewaddArgs);
   return true;
 }
src/operators/kernel/fpga/V1/softmax_kernel.cpp

@@ -24,8 +24,12 @@ template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   auto input = const_cast<Tensor *>(param->InputX());
   auto input_ptr = input->data<float>();
+  auto out = param->Out();
+  fpga::format_fp32_ofm(out);
+
   auto float_input = new Tensor;
-  float_input->mutable_data<float>({1, input->dims()[1]});
+  float_input->mutable_data<float>(
+      {1, input->dims()[2], input->dims()[3], input->dims()[1]});
   fpga::format_fp32_ofm(float_input);
 
   fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};

@@ -34,8 +38,8 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   args.input_data_type = fpga::DATA_TYPE_FP16;
   args.output_data_type = fpga::DATA_TYPE_FP32;
   args.image.address = input_ptr;
-  args.image.height = 1;
-  args.image.width = 1;
+  args.image.height = (uint32_t)input->dims()[2];
+  args.image.width = (uint32_t)input->dims()[3];
   args.image.channels = (uint32_t)input->dims()[1];
   args.output.address = float_input->data<float>();
   args.output.scale_address = float_input->scale;

@@ -50,9 +54,9 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
   Tensor *out = param.Out();
 
   fpga::PerformBypass(param.FpgaArgs());
-  fpga::fpga_invalidate(
-      (void *)in_x->data<float>(),  // NOLINT
-      fpga::get_align_image_cw(in_x->dims()[1]) * sizeof(float));
+  fpga::fpga_invalidate((void *)in_x->data<float>(),  // NOLINT
+                        in_x->numel() * sizeof(float));
 
   // TODO: In general case, 0 should be squeezed before softmax input
   math::SoftmaxFuntor<CPU, float>()(in_x, out);
   fpga::fpga_flush(out->data<float>(), out->memory_size());
 }
src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp

@@ -21,7 +21,7 @@ namespace operators {
 template <>
 bool ElementwiseAddReluKernel<FPGA, float>::Init(
     ElementwiseAddReluParam<FPGA> *param) {
-  bool relu_enabled = false;
+  bool relu_enabled = true;
   auto *input_x = const_cast<LoDTensor *>(param->InputX());
   auto *input_y = const_cast<LoDTensor *>(param->InputY());
   auto *out = param->Out();
src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp

@@ -47,7 +47,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
   out->Resize(framework::make_ddim({1, channel, 1, 1}));
   filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
-  fpga::format_fc_data(filter, out, bs_ptr);
+  fpga::format_fc_data(filter, out, &bs_ptr);
 
   fpga::SplitConvArgs conv_arg = {0};
   fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
src/operators/math/depthwise_conv3x3_int8_arm64.cpp (new file, 0 → 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#if defined(__ARM_NEON__) && defined(__aarch64__)

#include "operators/math/depthwise_conv3x3.h"
#ifdef __ARM_NEON__
#include <arm_neon.h>
#endif

namespace paddle_mobile {
namespace operators {
namespace math {

// template<>
// void DepthwiseConv3x3<int8_t, int32_t>(
//     const framework::Tensor *input, const framework::Tensor *filter,
//     const std::vector<int> &strides, framework::Tensor *output) {
//   PADDLE_MOBILE_THROW_EXCEPTION(
//       "Depthwise conv with generic strides has not been implemented.");
// }

template <>
void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
                                         const framework::Tensor &filter,
                                         const std::vector<int> &paddings,
                                         framework::Tensor *output) {
  PADDLE_MOBILE_THROW_EXCEPTION(
      "Depthwise conv3x3 with stride 1 for arm v8 has not been implemented.");
}

template <>
void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
                                         const framework::Tensor &filter,
                                         const std::vector<int> &paddings,
                                         framework::Tensor *output) {
  PADDLE_MOBILE_THROW_EXCEPTION(
      "Depthwise conv3x3 with stride 2 for arm v8 has not been implemented.");
}

}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile

#endif
src/operators/math/winograd/winograd_transform_f6k3_arm64.cpp (new file, 0 → 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

// Inspired by https://arxiv.org/abs/1509.09308 and refered from nnpack and ncnn
// project.

#ifdef CONV_OP
#ifdef __aarch64__

#include "operators/math/pad.h"
#include "operators/math/winograd/winograd_transform.h"

namespace paddle_mobile {
namespace operators {
namespace math {

template <>
void winograd_transform_weight<8, 3>(const framework::Tensor &weight,
                                     framework::Tensor *output) {
  /*
   * w0 = g0
   * w1 = ((g0 + g2) + g1) * (-2.0 / 9)
   * w2 = ((g0 + g2) - g1) * (-2.0 / 9)
   * w3 = ((g0 + 4 * g2) + 2 * g1) * (1.0 / 90)
   * w4 = ((g0 + 4 * g2) - 2 * g1) * (1.0 / 90)
   * w5 = ((g2 + 4 * g0) + 2 * g1) * (1.0 / 180)
   * w6 = ((g2 + 4 * g0) - 2 * g1) * (1.0 / 180)
   * w7 = g2
   */
  // TODO(hjchen2)
  PADDLE_MOBILE_THROW_EXCEPTION(
      "Winograd for arm v8 has not been implemented.");
}

template <>
void winograd_transform_input<8, 3>(const framework::Tensor &input,
                                    framework::Tensor *output) {
  /*
   * x0 = (d0 - d6) + (d4 - d2) * 5.25
   * x1 = (d2 + d6) - 4.25 * (d4 + d3) + (d1 + d5)
   * x2 = (d2 + d6) - 4.25 * (d4 - d3) - (d1 + d5)
   * x3 = (0.25 * d2 - 1.25 * d4 + d6) + (0.5 * d1 - 2.5 * d3 + 2 * d5)
   * x4 = (0.25 * d2 - 1.25 * d4 + d6) - (0.5 * d1 - 2.5 * d3 + 2 * d5)
   * x5 = (4 * d2 - 5 * d4 + d6) + (2 * d1 - 2.5 * d3 + 0.5 * d5)
   * x6 = (4 * d2 - 5 * d4 + d6) - (2 * d1 - 2.5 * d3 + 0.5 * d5)
   * x7 = (d7 - d1) + (d3 - d5) * 5.25
   */
  // TODO(hjchen2)
  PADDLE_MOBILE_THROW_EXCEPTION(
      "Winograd for arm v8 has not been implemented.");
}

template <>
void winograd_transform_output<8, 3>(const framework::Tensor &input,
                                     const framework::Tensor &weight,
                                     framework::Tensor *output) {
  // TODO(hjchen2)
  PADDLE_MOBILE_THROW_EXCEPTION(
      "Winograd for arm v8 has not been implemented.");
}

}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile

#endif  // __aarch64__
#endif  // CONV_OP
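Editor's note: while the AArch64 kernels in the new file above are still TODO stubs, the scalar sketch below simply writes out the w0..w7 weight-transform formulas from the comment in winograd_transform_weight<8, 3> as runnable code. It is not part of the commit; the function name and std::array return type are chosen here for illustration.

#include <array>

// Expand one 3-tap filter row (g0, g1, g2) into the 8 Winograd-domain values
// w0..w7, exactly as listed in the comment above. The real kernel is expected
// to vectorize this per channel with NEON.
std::array<float, 8> winograd_f6k3_weight_1d(float g0, float g1, float g2) {
  std::array<float, 8> w;
  w[0] = g0;
  w[1] = ((g0 + g2) + g1) * (-2.f / 9);
  w[2] = ((g0 + g2) - g1) * (-2.f / 9);
  w[3] = ((g0 + 4 * g2) + 2 * g1) * (1.f / 90);
  w[4] = ((g0 + 4 * g2) - 2 * g1) * (1.f / 90);
  w[5] = ((g2 + 4 * g0) + 2 * g1) * (1.f / 180);
  w[6] = ((g2 + 4 * g0) - 2 * g1) * (1.f / 180);
  w[7] = g2;
  return w;
}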