Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
1e8dd938
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
1e8dd938
编写于
12月 15, 2018
作者:
Z
zhangyang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
remove redundancy for V1 for FPGA track
上级
d74fdd19
变更
10
展开全部
显示空白变更内容
内联
并排
Showing
10 changed file
with
152 addition
and
346 deletion
+152
-346
CMakeLists.txt
CMakeLists.txt
+3
-2
src/fpga/V1/api.cpp
src/fpga/V1/api.cpp
+44
-64
src/fpga/V1/api.h
src/fpga/V1/api.h
+0
-2
src/fpga/V1/deconv_filter.cpp
src/fpga/V1/deconv_filter.cpp
+5
-15
src/fpga/V1/deconv_filter.h
src/fpga/V1/deconv_filter.h
+0
-1
src/fpga/V1/pe.cpp
src/fpga/V1/pe.cpp
+71
-243
src/fpga/common/driver.cpp
src/fpga/common/driver.cpp
+6
-10
src/fpga/common/driver.h
src/fpga/common/driver.h
+3
-7
src/fpga/common/fpga_common.h
src/fpga/common/fpga_common.h
+20
-1
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
+0
-1
未找到文件。
CMakeLists.txt
浏览文件 @
1e8dd938
...
@@ -10,6 +10,7 @@ option(LOG_PROFILE "log profile" OFF)
...
@@ -10,6 +10,7 @@ option(LOG_PROFILE "log profile" OFF)
option
(
CPU
"armv7 with neon"
ON
)
option
(
CPU
"armv7 with neon"
ON
)
option
(
GPU_MALI
"mali gpu"
OFF
)
option
(
GPU_MALI
"mali gpu"
OFF
)
option
(
GPU_CL
"opencl gpu"
OFF
)
option
(
GPU_CL
"opencl gpu"
OFF
)
option
(
FPGA
"fpga"
OFF
)
option
(
FPGA
"fpga"
OFF
)
if
(
FPGA
)
if
(
FPGA
)
option
(
FPGAV1
"fpga v1"
ON
)
option
(
FPGAV1
"fpga v1"
ON
)
...
@@ -144,7 +145,7 @@ if(FPGA)
...
@@ -144,7 +145,7 @@ if(FPGA)
endforeach
()
endforeach
()
file
(
GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h
)
file
(
GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h
)
foreach
(
f
${
_tmp_list
}
)
foreach
(
f
${
_tmp_list
}
)
list
(
REMOVE_ITEM PADDLE_MOBILE_
CC
${
f
}
)
list
(
REMOVE_ITEM PADDLE_MOBILE_
H
${
f
}
)
endforeach
()
endforeach
()
endif
()
endif
()
if
(
FPGAV2
)
if
(
FPGAV2
)
...
@@ -156,7 +157,7 @@ if(FPGA)
...
@@ -156,7 +157,7 @@ if(FPGA)
endforeach
()
endforeach
()
file
(
GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h
)
file
(
GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h
)
foreach
(
f
${
_tmp_list
}
)
foreach
(
f
${
_tmp_list
}
)
list
(
REMOVE_ITEM PADDLE_MOBILE_
CC
${
f
}
)
list
(
REMOVE_ITEM PADDLE_MOBILE_
H
${
f
}
)
endforeach
()
endforeach
()
endif
()
endif
()
...
...
src/fpga/V1/api.cpp
浏览文件 @
1e8dd938
...
@@ -24,8 +24,6 @@ namespace fpga {
...
@@ -24,8 +24,6 @@ namespace fpga {
#define USE_RELU 1
#define USE_RELU 1
#define USE_BIAS 2
#define USE_BIAS 2
int
get_align_image_cw
(
int
cw
)
{
return
align_to_x
(
cw
,
IMAGE_ALIGNMENT
);
}
void
format_image
(
framework
::
Tensor
*
image_tensor
)
{
void
format_image
(
framework
::
Tensor
*
image_tensor
)
{
auto
dims
=
image_tensor
->
dims
();
auto
dims
=
image_tensor
->
dims
();
auto
channel
=
dims
[
1
],
height
=
dims
[
2
],
width
=
dims
[
3
];
auto
channel
=
dims
[
1
],
height
=
dims
[
2
],
width
=
dims
[
3
];
...
@@ -96,10 +94,6 @@ int get_aligned_filter_element_num(int chw) {
...
@@ -96,10 +94,6 @@ int get_aligned_filter_element_num(int chw) {
return
align_to_x
(
chw
,
FILTER_ELEMENT_ALIGNMENT
);
return
align_to_x
(
chw
,
FILTER_ELEMENT_ALIGNMENT
);
}
}
int
get_aligned_filter_num
(
int
num
)
{
return
align_to_x
(
num
,
FILTER_NUM_ALIGNMENT
);
}
void
format_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
,
void
format_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
,
int
group_num
)
{
int
group_num
)
{
filter_tensor
->
scale
[
0
]
=
float
(
max_value
/
127.0
);
// NOLINT
filter_tensor
->
scale
[
0
]
=
float
(
max_value
/
127.0
);
// NOLINT
...
@@ -177,46 +171,37 @@ void format_concat_output(framework::Tensor *out, int height, int width,
...
@@ -177,46 +171,37 @@ void format_concat_output(framework::Tensor *out, int height, int width,
void
expand_conv_arg
(
ConvArgs
*
arg
)
{
void
expand_conv_arg
(
ConvArgs
*
arg
)
{
ConvArgs
args
=
*
arg
;
ConvArgs
args
=
*
arg
;
uint64_t
filterlen
=
(
uint64_t
)
args
.
kernel
.
width
*
(
uint64_t
)
args
.
kernel
.
height
*
auto
fpga_bias_scale_len
=
(
uint64_t
)
args
.
image
.
channels
;
filterlen
=
align_to_x
(
filterlen
,
FILTER_ELEMENT_ALIGNMENT
);
filterlen
*=
align_to_x
((
uint64_t
)
args
.
filter_num
,
FILTER_NUM_ALIGNMENT
);
uint64_t
fpga_bias_scale_len
=
align_to_x
(
args
.
filter_num
/
args
.
group_num
,
8
)
*
args
.
group_num
;
align_to_x
(
args
.
filter_num
/
args
.
group_num
,
8
)
*
args
.
group_num
;
uint64_t
output_height
=
auto
output_height
=
(
args
.
image
.
height
+
args
.
image
.
pad_height
*
2
-
args
.
kernel
.
height
)
/
(
args
.
image
.
height
+
args
.
image
.
pad_height
*
2
-
args
.
kernel
.
height
)
/
args
.
kernel
.
stride_h
+
args
.
kernel
.
stride_h
+
1
;
1
;
uint64_t
output_width
=
auto
output_width
=
(
args
.
image
.
width
+
args
.
image
.
pad_width
*
2
-
args
.
kernel
.
width
)
/
(
args
.
image
.
width
+
args
.
image
.
pad_width
*
2
-
args
.
kernel
.
width
)
/
args
.
kernel
.
stride_w
+
args
.
kernel
.
stride_w
+
1
;
1
;
uint64_t
output_size
=
output_height
*
output_width
*
(
uint64_t
)
args
.
filter_num
;
auto
filter_per_group
=
args
.
filter_num
/
args
.
group_num
;
auto
channel_per_group
=
args
.
image
.
channels
/
args
.
group_num
;
auto
filter_per_group
=
(
uint64_t
)(
args
.
filter_num
/
args
.
group_num
);
auto
channel_per_group
=
(
uint64_t
)(
args
.
image
.
channels
/
args
.
group_num
);
auto
image_row_count
=
args
.
image
.
width
*
args
.
image
.
channels
;
auto
image_amount_per_row
=
align_to_x
(
image_row_count
,
IMAGE_ALIGNMENT
);
uint64_t
image_row_count
=
((
uint64_t
)
args
.
image
.
width
)
*
auto
image_one_pad_per_row
=
align_to_x
(
image_row_count
,
IMAGE_ALIGNMENT
)
+
((
uint64_t
)
args
.
image
.
channels
);
// without align
args
.
image
.
pad_width
*
args
.
image
.
channels
;
uint64_t
image_amount_per_row
=
align_to_x
(
image_row_count
,
IMAGE_ALIGNMENT
);
auto
filter_amount_all
=
uint64_t
image_one_pad_per_row
=
align_to_x
(
args
.
kernel
.
height
*
args
.
kernel
.
width
*
channel_per_group
,
align_to_x
(
image_row_count
,
IMAGE_ALIGNMENT
)
+
((
uint64_t
)
args
.
image
.
pad_width
)
*
((
uint64_t
)
args
.
image
.
channels
);
uint64_t
filter_amount_all
=
align_to_x
(((
uint64_t
)
args
.
kernel
.
height
)
*
((
uint64_t
)
args
.
kernel
.
width
)
*
channel_per_group
,
FILTER_ELEMENT_ALIGNMENT
);
FILTER_ELEMENT_ALIGNMENT
);
uint64_t
output_amount_per_row
=
auto
output_amount_per_row
=
align_to_x
(
output_width
*
((
uint64_t
)
args
.
filter_num
)
,
IMAGE_ALIGNMENT
);
align_to_x
(
output_width
*
args
.
filter_num
,
IMAGE_ALIGNMENT
);
// find the opt partition strategy
// find the opt partition strategy
uint64_t
res_win
;
uint64_t
res_win
;
uint64_t
res_fit
=
0
;
uint64_t
res_fit
=
0
;
for
(
res_win
=
1
;
res_win
<=
output_width
;
res_win
=
res_win
+
1
)
{
for
(
res_win
=
1
;
res_win
<=
output_width
;
res_win
++
)
{
if
((
align_to_x
(
if
((
align_to_x
(
(
args
.
image
.
channels
*
(
args
.
image
.
channels
*
(
args
.
kernel
.
width
+
(
res_win
-
1
)
*
args
.
kernel
.
stride_w
)),
(
args
.
kernel
.
width
+
(
res_win
-
1
)
*
args
.
kernel
.
stride_w
)),
...
@@ -238,48 +223,48 @@ void expand_conv_arg(ConvArgs *arg) {
...
@@ -238,48 +223,48 @@ void expand_conv_arg(ConvArgs *arg) {
}
}
res_fit
=
res_win
;
res_fit
=
res_win
;
uint64_t
block_num
=
(
output_width
+
res_fit
-
1
)
/
res_fit
;
auto
block_num
=
(
output_width
+
res_fit
-
1
)
/
res_fit
;
uint64_t
block_len
=
res_fit
;
auto
block_len
=
res_fit
;
uint64_t
block_last
=
output_width
-
res_fit
*
(
block_num
-
1
);
auto
block_last
=
output_width
-
res_fit
*
(
block_num
-
1
);
uint64_t
res_amount_per_row
=
output_width
*
args
.
filter_num
;
auto
res_amount_per_row
=
output_width
*
args
.
filter_num
;
uint64_t
res_amount_per_row_pad
=
output_amount_per_row
-
res_amount_per_row
;
auto
res_amount_per_row_pad
=
output_amount_per_row
-
res_amount_per_row
;
uint64_t
image_block_amount_per_row
=
auto
image_block_amount_per_row
=
args
.
kernel
.
stride_w
*
(
res_fit
)
*
args
.
image
.
channels
;
args
.
kernel
.
stride_w
*
res_fit
*
args
.
image
.
channels
;
uint64_t
filter_pad_width_mul_channel
=
auto
filter_pad_width_mul_channel
=
args
.
image
.
pad_width
*
args
.
image
.
channels
;
args
.
image
.
pad_width
*
args
.
image
.
channels
;
uint64_t
image_amount_per_row_multi_win_first
=
auto
image_amount_per_row_multi_win_first
=
image_amount_per_row
*
(
4
*
args
.
kernel
.
stride_h
-
args
.
image
.
pad_height
);
image_amount_per_row
*
(
4
*
args
.
kernel
.
stride_h
-
args
.
image
.
pad_height
);
uint64_t
image_amount_per_row_multi_win
=
auto
image_amount_per_row_multi_win
=
image_amount_per_row
*
(
4
*
args
.
kernel
.
stride_h
);
image_amount_per_row
*
(
4
*
args
.
kernel
.
stride_h
);
uint64_t
image_block_num
=
block_num
;
auto
image_block_num
=
block_num
;
uint64_t
image_block_len
=
auto
image_block_len
=
align_to_x
((
args
.
image
.
channels
*
align_to_x
((
args
.
image
.
channels
*
(
args
.
kernel
.
width
+
(
block_len
-
1
)
*
args
.
kernel
.
stride_w
)),
(
args
.
kernel
.
width
+
(
block_len
-
1
)
*
args
.
kernel
.
stride_w
)),
IMAGE_ALIGNMENT
)
/
IMAGE_ALIGNMENT
)
/
16
+
16
+
1
;
1
;
uint64_t
image_block_len_last
=
auto
image_block_len_last
=
align_to_x
(
align_to_x
(
(
args
.
image
.
channels
*
(
args
.
image
.
channels
*
(
args
.
kernel
.
width
+
(
block_last
-
1
)
*
args
.
kernel
.
stride_w
)),
(
args
.
kernel
.
width
+
(
block_last
-
1
)
*
args
.
kernel
.
stride_w
)),
IMAGE_ALIGNMENT
)
/
IMAGE_ALIGNMENT
)
/
16
+
16
+
1
;
1
;
uint64_t
image_win_cnt
=
block_len
;
auto
image_win_cnt
=
block_len
;
uint64_t
image_win_cnt_last
=
block_last
;
auto
image_win_cnt_last
=
block_last
;
uint64_t
res_row_data_align4_pad
=
res_amount_per_row_pad
/
8
;
auto
res_row_data_align4_pad
=
res_amount_per_row_pad
/
8
;
uint64_t
prog_full_cnt
=
2048
/
(
filter_amount_all
/
16
*
2
)
-
1
;
auto
prog_full_cnt
=
2048
/
(
filter_amount_all
/
16
*
2
)
-
1
;
if
(
prog_full_cnt
==
1023
)
{
if
(
prog_full_cnt
==
1023
)
{
prog_full_cnt
--
;
prog_full_cnt
--
;
}
}
uint64_t
post_prog_full_cnt
=
auto
post_prog_full_cnt
=
(
512
/
(
align_to_x
(
args
.
filter_num
,
4
)
/
4
*
2
)
>
2
)
(
512
/
(
align_to_x
(
args
.
filter_num
,
4
)
/
4
*
2
)
>
2
)
?
(
512
/
(
align_to_x
(
args
.
filter_num
,
4
)
/
4
*
2
)
-
2
)
?
(
512
/
(
align_to_x
(
args
.
filter_num
,
4
)
/
4
*
2
)
-
2
)
:
0
;
:
0
;
uint64_t
cmd
=
0UL
|
(
args
.
relu_enabled
?
USE_RELU
:
0
)
|
USE_BIAS
;
auto
cmd
=
0UL
|
(
args
.
relu_enabled
?
USE_RELU
:
0
)
|
USE_BIAS
;
(
*
arg
).
driver
.
image_address_phy
=
vaddr_to_paddr
(
args
.
image
.
address
);
(
*
arg
).
driver
.
image_address_phy
=
vaddr_to_paddr
(
args
.
image
.
address
);
(
*
arg
).
driver
.
sb_address_phy
=
vaddr_to_paddr
(
args
.
sb_address
);
(
*
arg
).
driver
.
sb_address_phy
=
vaddr_to_paddr
(
args
.
sb_address
);
...
@@ -449,7 +434,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
...
@@ -449,7 +434,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg
->
sub_conv_num
=
(
uint32_t
)
stride_h
;
arg
->
sub_conv_num
=
(
uint32_t
)
stride_h
;
arg
->
filter_num
=
(
uint32_t
)
filter
->
dims
()[
0
];
arg
->
filter_num
=
(
uint32_t
)
filter
->
dims
()[
0
];
int
sub_conv_num
=
arg
->
sub_conv_num
;
int
sub_conv_num
=
arg
->
sub_conv_num
;
int
sub_stride
=
1
;
int
sub_pad
=
deconv_filter
::
deconv_calc_sub_pad
((
int
)
filter
->
dims
()[
3
],
int
sub_pad
=
deconv_filter
::
deconv_calc_sub_pad
((
int
)
filter
->
dims
()[
3
],
padding_w
,
stride_w
);
padding_w
,
stride_w
);
int
sub_filter_width
=
deconv_filter
::
deconv_get_sub_filter_axis
(
int
sub_filter_width
=
deconv_filter
::
deconv_get_sub_filter_axis
(
...
@@ -466,16 +450,12 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
...
@@ -466,16 +450,12 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
stride_w
,
(
int
)
filter
->
dims
()[
3
],
padding_w
);
stride_w
,
(
int
)
filter
->
dims
()[
3
],
padding_w
);
arg
->
conv_args
=
(
ConvArgs
*
)
fpga_malloc
(
sub_conv_num
*
sizeof
(
ConvArgs
));
arg
->
conv_args
=
(
ConvArgs
*
)
fpga_malloc
(
sub_conv_num
*
sizeof
(
ConvArgs
));
int
sub_channels
=
(
int
)
input
->
dims
()[
1
];
auto
sub_channels
=
(
int
)
input
->
dims
()[
1
];
int
omit_size
=
arg
->
omit_size
;
int
real_out_width
=
sub_output_width
*
sub_conv_num
-
2
*
omit_size
;
int
real_out_height
=
sub_output_height
*
sub_conv_num
-
2
*
omit_size
;
int
sub_filter_num
=
sub_conv_num
*
(
arg
->
filter_num
);
int
sub_filter_num
=
sub_conv_num
*
(
arg
->
filter_num
);
int
conv_output_size
=
int
conv_output_size
=
(
align_to_x
(
sub_output_width
*
sub_filter_num
,
IMAGE_ALIGNMENT
))
*
(
align_to_x
(
sub_output_width
*
sub_filter_num
,
IMAGE_ALIGNMENT
))
*
sub_output_height
;
sub_output_height
;
int
ouput_size
=
conv_output_size
*
sub_conv_num
;
int
align_sub_filter_num
=
align_to_x
(
sub_filter_num
,
FILTER_NUM_ALIGNMENT
);
int
align_sub_filter_num
=
align_to_x
(
sub_filter_num
,
FILTER_NUM_ALIGNMENT
);
int
align_sub_filter_count
=
int
align_sub_filter_count
=
...
@@ -485,7 +465,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
...
@@ -485,7 +465,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
align_sub_filter_count
*
align_sub_filter_num
;
align_sub_filter_count
*
align_sub_filter_num
;
for
(
int
i
=
0
;
i
<
sub_conv_num
;
++
i
)
{
for
(
int
i
=
0
;
i
<
sub_conv_num
;
++
i
)
{
arg
->
conv_args
[
i
].
filter_num
=
(
arg
->
sub_conv_num
)
*
(
arg
->
filter_num
)
;
arg
->
conv_args
[
i
].
filter_num
=
arg
->
sub_conv_num
*
arg
->
filter_num
;
arg
->
conv_args
[
i
].
group_num
=
(
uint32_t
)
group_num
;
arg
->
conv_args
[
i
].
group_num
=
(
uint32_t
)
group_num
;
arg
->
conv_args
[
i
].
filter_scale_address
=
filter
->
scale
;
arg
->
conv_args
[
i
].
filter_scale_address
=
filter
->
scale
;
...
@@ -496,7 +476,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
...
@@ -496,7 +476,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg
->
conv_args
[
i
].
kernel
.
stride_w
=
1
;
arg
->
conv_args
[
i
].
kernel
.
stride_w
=
1
;
arg
->
conv_args
[
i
].
kernel
.
stride_h
=
1
;
arg
->
conv_args
[
i
].
kernel
.
stride_h
=
1
;
// DeconvParam.conv_args[i].image.address = (void*)ptr_image;
arg
->
conv_args
[
i
].
image
.
scale_address
=
input
->
scale
;
arg
->
conv_args
[
i
].
image
.
scale_address
=
input
->
scale
;
arg
->
conv_args
[
i
].
image
.
channels
=
(
uint32_t
)
sub_channels
;
arg
->
conv_args
[
i
].
image
.
channels
=
(
uint32_t
)
sub_channels
;
arg
->
conv_args
[
i
].
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
arg
->
conv_args
[
i
].
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
...
@@ -504,30 +483,31 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
...
@@ -504,30 +483,31 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg
->
conv_args
[
i
].
image
.
pad_width
=
(
uint32_t
)
sub_pad
;
arg
->
conv_args
[
i
].
image
.
pad_width
=
(
uint32_t
)
sub_pad
;
arg
->
conv_args
[
i
].
image
.
pad_height
=
(
uint32_t
)
sub_pad
;
arg
->
conv_args
[
i
].
image
.
pad_height
=
(
uint32_t
)
sub_pad
;
arg
->
conv_args
[
i
].
image
.
address
=
input_ptr
;
arg
->
conv_args
[
i
].
image
.
address
=
input_ptr
;
arg
->
conv_args
[
i
].
sb_address
=
(
void
*
)
bs_ptr
;
arg
->
conv_args
[
i
].
sb_address
=
bs_ptr
;
auto
filter_sub_space
=
auto
filter_sub_space
=
(
char
*
)
fpga_malloc
(
align_conv_sub_filter_count
*
sizeof
(
char
));
(
char
*
)
fpga_malloc
(
align_conv_sub_filter_count
*
sizeof
(
char
));
fpga_copy
(
filter_sub_space
,
fpga_copy
(
filter_sub_space
,
(
char
*
)
filter_ptr
+
i
*
align_conv_sub_filter_count
,
(
char
*
)
filter_ptr
+
i
*
align_conv_sub_filter_count
,
(
size_t
)
align_conv_sub_filter_count
);
(
size_t
)
align_conv_sub_filter_count
);
arg
->
conv_args
[
i
].
filter_address
=
(
void
*
)(
filter_sub_space
)
;
arg
->
conv_args
[
i
].
filter_address
=
filter_sub_space
;
fpga_flush
(
filter_sub_space
,
(
size_t
)
align_conv_sub_filter_count
);
fpga_flush
(
filter_sub_space
,
(
size_t
)
align_conv_sub_filter_count
);
if
(
sub_conv_num
==
1
)
{
if
(
sub_conv_num
==
1
)
{
arg
->
conv_args
[
i
].
output
.
address
=
out_ptr
;
arg
->
conv_args
[
i
].
output
.
address
=
out_ptr
;
arg
->
conv_args
[
i
].
output
.
scale_address
=
out
->
scale
;
arg
->
conv_args
[
i
].
output
.
scale_address
=
out
->
scale
;
}
else
{
}
else
{
auto
ptr_output
=
(
half
*
)
fpga_malloc
(
conv_output_size
*
sizeof
(
half
));
auto
ptr_output
=
fpga_malloc
(
conv_output_size
*
sizeof
(
half
));
arg
->
conv_args
[
i
].
output
.
address
=
(
void
*
)((
half
*
)
ptr_output
)
;
arg
->
conv_args
[
i
].
output
.
address
=
ptr_output
;
auto
ptr_output_scale
=
(
float
*
)
fpga_malloc
(
2
*
sizeof
(
float
));
auto
ptr_output_scale
=
(
float
*
)
fpga_malloc
(
2
*
sizeof
(
float
));
arg
->
conv_args
[
i
].
output
.
scale_address
=
ptr_output_scale
;
arg
->
conv_args
[
i
].
output
.
scale_address
=
ptr_output_scale
;
}
}
expand_conv_arg
(
&
arg
->
conv_args
[
i
]);
}
}
arg
->
output
.
address
=
out_ptr
;
arg
->
output
.
address
=
out_ptr
;
arg
->
output
.
scale_address
=
out
->
scale
;
arg
->
output
.
scale_address
=
out
->
scale
;
// fpga_free(filter_
ptr);
filter
->
reset_data_ptr
(
null
ptr
);
}
// fill_deconv_arg
}
// fill_deconv_arg
}
// namespace fpga
}
// namespace fpga
...
...
src/fpga/V1/api.h
浏览文件 @
1e8dd938
...
@@ -21,7 +21,6 @@ limitations under the License. */
...
@@ -21,7 +21,6 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
fpga
{
namespace
fpga
{
int
get_align_image_cw
(
int
cw
);
void
format_image
(
framework
::
Tensor
*
image_tensor
);
void
format_image
(
framework
::
Tensor
*
image_tensor
);
void
format_fp16_ofm
(
framework
::
Tensor
*
ofm_tensor
);
// only allocate memory
void
format_fp16_ofm
(
framework
::
Tensor
*
ofm_tensor
);
// only allocate memory
void
format_fp32_ofm
(
framework
::
Tensor
*
ofm_tensor
);
void
format_fp32_ofm
(
framework
::
Tensor
*
ofm_tensor
);
...
@@ -30,7 +29,6 @@ float filter_find_max(framework::Tensor* filter_tensor);
...
@@ -30,7 +29,6 @@ float filter_find_max(framework::Tensor* filter_tensor);
int
get_filter_num_per_div
(
framework
::
Tensor
*
filter_tensor
,
int
group_num
);
int
get_filter_num_per_div
(
framework
::
Tensor
*
filter_tensor
,
int
group_num
);
int
get_plit_num
(
framework
::
Tensor
*
filter_tensor
);
int
get_plit_num
(
framework
::
Tensor
*
filter_tensor
);
int
get_aligned_filter_element_num
(
int
chw
);
int
get_aligned_filter_element_num
(
int
chw
);
int
get_aligned_filter_num
(
int
num
);
void
format_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
,
void
format_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
,
int
group_num
);
int
group_num
);
void
format_fc_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
);
void
format_fc_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
);
...
...
src/fpga/V1/deconv_filter.cpp
浏览文件 @
1e8dd938
...
@@ -40,10 +40,9 @@ inverse kernel weights of each channel for every filter
...
@@ -40,10 +40,9 @@ inverse kernel weights of each channel for every filter
void
deconv_inverse_filter
(
float
**
data_in
,
int
num
,
int
channel
,
int
width
,
void
deconv_inverse_filter
(
float
**
data_in
,
int
num
,
int
channel
,
int
width
,
int
height
)
{
int
height
)
{
float
*
tmp
=
*
data_in
;
float
*
tmp
=
*
data_in
;
// float fix_range = 127;// float scale = fix_range / max;
int
data_size
=
num
*
channel
*
width
*
height
;
int
data_size
=
num
*
channel
*
width
*
height
;
int
hw_len
=
height
*
width
;
int
hw_len
=
height
*
width
;
float
*
tmp_data
=
(
float
*
)
fpga_malloc
(
data_size
*
sizeof
(
float
));
auto
tmp_data
=
(
float
*
)
fpga_malloc
(
data_size
*
sizeof
(
float
));
for
(
int
i
=
0
;
i
<
num
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num
;
++
i
)
{
for
(
int
j
=
0
;
j
<
channel
;
++
j
)
{
for
(
int
j
=
0
;
j
<
channel
;
++
j
)
{
for
(
int
k
=
0
;
k
<
hw_len
;
++
k
)
{
for
(
int
k
=
0
;
k
<
hw_len
;
++
k
)
{
...
@@ -52,7 +51,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width,
...
@@ -52,7 +51,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width,
}
}
}
}
}
}
*
data_in
=
(
float
*
)
tmp_data
;
//
*
data_in
=
tmp_data
;
fpga_free
(
tmp
);
fpga_free
(
tmp
);
}
}
...
@@ -61,8 +60,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width,
...
@@ -61,8 +60,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width,
*/
*/
int
deconv_calc_sub_pad
(
int
filter_axis
,
int
pad
,
int
stride
)
{
int
deconv_calc_sub_pad
(
int
filter_axis
,
int
pad
,
int
stride
)
{
if
(
stride
==
0
||
((
filter_axis
-
pad
-
1
)
<
0
))
{
if
(
stride
==
0
||
((
filter_axis
-
pad
-
1
)
<
0
))
{
// error
PADDLE_MOBILE_ENFORCE
(
false
,
"Wrong deconv parameters"
);
return
0
;
}
}
return
(
filter_axis
-
pad
-
1
)
/
stride
;
return
(
filter_axis
-
pad
-
1
)
/
stride
;
}
}
...
@@ -79,11 +77,8 @@ int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) {
...
@@ -79,11 +77,8 @@ int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) {
position. so the omit rows or columns is (stride - )
position. so the omit rows or columns is (stride - )
*/
*/
int
deconv_get_omit
(
int
stride
,
int
filter_width
,
int
pad
)
{
int
deconv_get_omit
(
int
stride
,
int
filter_width
,
int
pad
)
{
if
(((
filter_width
-
pad
)
<=
0
))
{
// ((filter_width-pad) > stride) ||
PADDLE_MOBILE_ENFORCE
(
filter_width
>
pad
,
"Wrong deconv parameters"
);
// error
int
idx
;
return
0
;
}
int
idx
=
1
;
bool
flag
=
false
;
bool
flag
=
false
;
for
(
idx
=
1
;
idx
<=
stride
;
++
idx
)
{
for
(
idx
=
1
;
idx
<=
stride
;
++
idx
)
{
int
j
=
idx
;
int
j
=
idx
;
...
@@ -102,10 +97,6 @@ int deconv_get_omit(int stride, int filter_width, int pad) {
...
@@ -102,10 +97,6 @@ int deconv_get_omit(int stride, int filter_width, int pad) {
return
(
stride
-
idx
);
return
(
stride
-
idx
);
}
}
int
deconv_get_sub_filter_num
(
int
filter_num
,
int
stride
)
{
return
filter_num
*
stride
;
}
void
deconv_get_sub_filter
(
char
**
data_in
,
int
height
,
int
width
,
void
deconv_get_sub_filter
(
char
**
data_in
,
int
height
,
int
width
,
int
sub_conv_n
,
int
kernel_num
,
int
channel
)
{
int
sub_conv_n
,
int
kernel_num
,
int
channel
)
{
char
*
ptr_tmp
=
*
data_in
;
char
*
ptr_tmp
=
*
data_in
;
...
@@ -245,7 +236,6 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
...
@@ -245,7 +236,6 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
char
*
ptr_space
=
(
char
*
)
fpga_malloc
(
sub_conv_n
*
align_offset
*
char
*
ptr_space
=
(
char
*
)
fpga_malloc
(
sub_conv_n
*
align_offset
*
sizeof
(
char
));
// continuous space
sizeof
(
char
));
// continuous space
for
(
int
i
=
0
;
i
<
sub_conv_n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
sub_conv_n
;
++
i
)
{
int
offset
=
i
*
origin_offset
;
char
*
ptr_tmp
=
(
ptr_ptr_data
)[
i
];
char
*
ptr_tmp
=
(
ptr_ptr_data
)[
i
];
filter
::
align_element
(
&
ptr_tmp
,
sub_num
,
sub_chw
);
filter
::
align_element
(
&
ptr_tmp
,
sub_num
,
sub_chw
);
...
...
src/fpga/V1/deconv_filter.h
浏览文件 @
1e8dd938
...
@@ -21,7 +21,6 @@ namespace deconv_filter {
...
@@ -21,7 +21,6 @@ namespace deconv_filter {
void
deconv_inverse_filter
(
float
**
data_in
,
int
num
,
int
channel
,
int
width
,
void
deconv_inverse_filter
(
float
**
data_in
,
int
num
,
int
channel
,
int
width
,
int
height
);
int
height
);
int
deconv_calc_sub_pad
(
int
filter_axis
,
int
pad
,
int
stride
);
int
deconv_calc_sub_pad
(
int
filter_axis
,
int
pad
,
int
stride
);
int
deconv_get_sub_filter_num
(
int
filter_num
,
int
stride
);
int
deconv_get_sub_filter_axis
(
int
filter_axis
,
int
stride
);
int
deconv_get_sub_filter_axis
(
int
filter_axis
,
int
stride
);
int
deconv_get_sub_out_axis
(
int
image_axis
,
int
sub_pad
,
int
sub_filter_axis
);
int
deconv_get_sub_out_axis
(
int
image_axis
,
int
sub_pad
,
int
sub_filter_axis
);
int
deconv_get_omit
(
int
stride
,
int
filter_width
,
int
pad
);
int
deconv_get_omit
(
int
stride
,
int
filter_width
,
int
pad
);
...
...
src/fpga/V1/pe.cpp
浏览文件 @
1e8dd938
此差异已折叠。
点击以展开。
src/fpga/common/driver.cpp
浏览文件 @
1e8dd938
...
@@ -153,10 +153,6 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
...
@@ -153,10 +153,6 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
uint64_t
_nr
=
DIV_ROUND_UP
(
size
,
FPGA_PAGE_SIZE
);
uint64_t
_nr
=
DIV_ROUND_UP
(
size
,
FPGA_PAGE_SIZE
);
unsigned
int
nr
=
(
unsigned
int
)
_nr
;
unsigned
int
nr
=
(
unsigned
int
)
_nr
;
int
ret
=
0
;
int
ret
=
0
;
DLOG
<<
size
;
DLOG
<<
_nr
;
DLOG
<<
nr
;
uint64_t
a_size
=
FPGA_PAGE_SIZE
*
nr
;
uint64_t
a_size
=
FPGA_PAGE_SIZE
*
nr
;
DLOG
<<
a_size
;
DLOG
<<
a_size
;
...
@@ -283,7 +279,7 @@ int fpga_memory_add() {
...
@@ -283,7 +279,7 @@ int fpga_memory_add() {
return
0
;
return
0
;
}
}
uint64_t
vaddr_to_paddr
(
void
*
address
)
{
uint64_t
vaddr_to_paddr
_driver
(
void
*
address
)
{
uint64_t
paddr
=
0
;
uint64_t
paddr
=
0
;
auto
iter
=
g_fpgainfo
.
fpga_vaddr2paddr_map
.
find
(
address
);
auto
iter
=
g_fpgainfo
.
fpga_vaddr2paddr_map
.
find
(
address
);
if
(
iter
!=
g_fpgainfo
.
fpga_vaddr2paddr_map
.
end
())
{
if
(
iter
!=
g_fpgainfo
.
fpga_vaddr2paddr_map
.
end
())
{
...
@@ -315,7 +311,7 @@ void *fpga_reg_free(void *ptr) {
...
@@ -315,7 +311,7 @@ void *fpga_reg_free(void *ptr) {
g_fpgainfo
.
fpga_addr2size_map
.
erase
(
iter
);
g_fpgainfo
.
fpga_addr2size_map
.
erase
(
iter
);
munmap
(
ptr
,
size
);
munmap
(
ptr
,
size
);
}
else
{
}
else
{
DLOG
<<
"Invalid pointer"
;
DLOG
<<
"Invalid pointer"
<<
ptr
;
}
}
}
}
...
@@ -347,7 +343,7 @@ void fpga_free_driver(void *ptr) {
...
@@ -347,7 +343,7 @@ void fpga_free_driver(void *ptr) {
g_fpgainfo
.
fpga_addr2size_map
.
erase
(
iter
);
g_fpgainfo
.
fpga_addr2size_map
.
erase
(
iter
);
munmap
(
ptr
,
size
);
munmap
(
ptr
,
size
);
p_addr
=
vaddr_to_paddr
(
ptr
);
p_addr
=
vaddr_to_paddr
_driver
(
ptr
);
pos
=
(
p_addr
-
g_fpgainfo
.
memory_info
->
mem_start
)
/
FPGA_PAGE_SIZE
;
pos
=
(
p_addr
-
g_fpgainfo
.
memory_info
->
mem_start
)
/
FPGA_PAGE_SIZE
;
/*clear bitmap*/
/*clear bitmap*/
...
@@ -361,7 +357,7 @@ void fpga_free_driver(void *ptr) {
...
@@ -361,7 +357,7 @@ void fpga_free_driver(void *ptr) {
g_fpgainfo
.
fpga_vaddr2paddr_map
.
erase
(
iter
);
g_fpgainfo
.
fpga_vaddr2paddr_map
.
erase
(
iter
);
}
}
}
else
{
}
else
{
DLOG
<<
"Invalid pointer"
;
DLOG
<<
"Invalid pointer"
<<
ptr
;
}
}
}
}
...
@@ -373,7 +369,7 @@ int fpga_flush_driver(void *address, size_t size) {
...
@@ -373,7 +369,7 @@ int fpga_flush_driver(void *address, size_t size) {
struct
MemoryCacheArgs
args
;
struct
MemoryCacheArgs
args
;
uint64_t
p_addr
;
uint64_t
p_addr
;
p_addr
=
vaddr_to_paddr
(
address
);
p_addr
=
vaddr_to_paddr
_driver
(
address
);
args
.
offset
=
(
void
*
)(
p_addr
-
FPGA_MEM_PHY_ADDR
);
// NOLINT
args
.
offset
=
(
void
*
)(
p_addr
-
FPGA_MEM_PHY_ADDR
);
// NOLINT
args
.
size
=
size
;
args
.
size
=
size
;
...
@@ -385,7 +381,7 @@ int fpga_invalidate_driver(void *address, size_t size) {
...
@@ -385,7 +381,7 @@ int fpga_invalidate_driver(void *address, size_t size) {
struct
MemoryCacheArgs
args
;
struct
MemoryCacheArgs
args
;
uint64_t
p_addr
;
uint64_t
p_addr
;
p_addr
=
vaddr_to_paddr
(
address
);
p_addr
=
vaddr_to_paddr
_driver
(
address
);
args
.
offset
=
(
void
*
)(
p_addr
-
FPGA_MEM_PHY_ADDR
);
// NOLINT
args
.
offset
=
(
void
*
)(
p_addr
-
FPGA_MEM_PHY_ADDR
);
// NOLINT
args
.
size
=
size
;
args
.
size
=
size
;
...
...
src/fpga/common/driver.h
浏览文件 @
1e8dd938
...
@@ -31,8 +31,8 @@ namespace driver {
...
@@ -31,8 +31,8 @@ namespace driver {
#define FPGA_REG_PHY_ADDR 0xa0000000
#define FPGA_REG_PHY_ADDR 0xa0000000
#define FPGA_REG_SIZE 0x1000
#define FPGA_REG_SIZE 0x1000
#define FPGA_MEM_PHY_ADDR 0x
2
0000000
#define FPGA_MEM_PHY_ADDR 0x
4
0000000
#define FPGA_MEM_SIZE 0x
2
0000000
#define FPGA_MEM_SIZE 0x
8
0000000
#define FPGA_PAGE_SIZE (16UL * 1024UL)
#define FPGA_PAGE_SIZE (16UL * 1024UL)
...
@@ -122,15 +122,11 @@ void *fpga_malloc_driver(size_t size);
...
@@ -122,15 +122,11 @@ void *fpga_malloc_driver(size_t size);
void
fpga_free_driver
(
void
*
ptr
);
void
fpga_free_driver
(
void
*
ptr
);
void
fpga_copy_driver
(
void
*
dest
,
const
void
*
src
,
size_t
num
);
int
fpga_flush_driver
(
void
*
address
,
size_t
size
);
int
fpga_flush_driver
(
void
*
address
,
size_t
size
);
int
fpga_invalidate_driver
(
void
*
address
,
size_t
size
);
int
fpga_invalidate_driver
(
void
*
address
,
size_t
size
);
/*pe*/
uint64_t
vaddr_to_paddr_driver
(
void
*
address
);
uint64_t
vaddr_to_paddr
(
void
*
address
);
int
fpga_regpoll
(
uint64_t
reg
,
uint64_t
val
,
int
time
);
int
fpga_regpoll
(
uint64_t
reg
,
uint64_t
val
,
int
time
);
...
...
src/fpga/common/fpga_common.h
浏览文件 @
1e8dd938
...
@@ -37,6 +37,18 @@ enum LayoutType {
...
@@ -37,6 +37,18 @@ enum LayoutType {
LAYOUT_HWC
=
0
,
LAYOUT_HWC
=
0
,
};
};
enum
ActivationType
{
NONE
=
0
,
LEAKYRELU
=
1
,
SIGMOID
=
2
,
TANH
=
3
,
};
struct
ActivationArgs
{
enum
ActivationType
activation_type
;
int16_t
leaky_relu_negative_slope
;
};
struct
KernelArgs
{
struct
KernelArgs
{
uint32_t
width
;
uint32_t
width
;
uint32_t
height
;
uint32_t
height
;
...
@@ -58,7 +70,10 @@ struct ImageOutputArgs {
...
@@ -58,7 +70,10 @@ struct ImageOutputArgs {
void
*
address
;
// output result address;
void
*
address
;
// output result address;
float
*
scale_address
;
// output scale address;
float
*
scale_address
;
// output scale address;
uint64_t
timer_cnt
;
// time counter for FPGA computation
uint64_t
timer_cnt
;
// time counter for FPGA computation
struct
ActivationArgs
activation
;
// To select activation and specify (Leaky)Relu parameter.
};
};
#ifdef PADDLE_MOBILE_FPGA_V1
#ifdef PADDLE_MOBILE_FPGA_V1
struct
ConvDriverParam
{
struct
ConvDriverParam
{
uint64_t
image_address_phy
;
uint64_t
image_address_phy
;
...
@@ -198,7 +213,11 @@ struct DeconvArgs {
...
@@ -198,7 +213,11 @@ struct DeconvArgs {
struct
ConvArgs
*
conv_args
;
struct
ConvArgs
*
conv_args
;
};
};
static
inline
int
align_to_x
(
int
num
,
int
x
)
{
return
(
num
+
x
-
1
)
/
x
*
x
;
}
// static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
// }
static
inline
uint32_t
align_to_x
(
int64_t
num
,
int64_t
x
)
{
return
((
uint32_t
)(
num
+
x
)
-
1
)
/
(
uint32_t
)
x
*
(
uint32_t
)
x
;
}
int16_t
fp32_2_fp16
(
float
fp32_num
);
int16_t
fp32_2_fp16
(
float
fp32_num
);
float
fp16_2_fp32
(
int16_t
fp16_num
);
float
fp16_2_fp32
(
int16_t
fp16_num
);
...
...
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
浏览文件 @
1e8dd938
...
@@ -14,7 +14,6 @@ limitations under the License. */
...
@@ -14,7 +14,6 @@ limitations under the License. */
#ifdef TRANSPOSE2_OP
#ifdef TRANSPOSE2_OP
#include "operators/kernel/transpose2_kernel.h"
#include "operators/kernel/transpose2_kernel.h"
#include "operators/kernel/central-arm-func/transpose2_arm_func.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录