Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
39c27adb
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
39c27adb
编写于
12月 16, 2018
作者:
Z
zhangyang0701
提交者:
GitHub
12月 16, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1351 from zhangyang0701/develop
remove redundancy for V1 for FPGA track close
#1350
上级
d74fdd19
23d78fdd
变更
11
展开全部
显示空白变更内容
内联
并排
Showing
11 changed files
with
153 additions
and
347 deletions
+153
-347
CMakeLists.txt
CMakeLists.txt
+3
-2
src/fpga/V1/api.cpp
src/fpga/V1/api.cpp
+44
-64
src/fpga/V1/api.h
src/fpga/V1/api.h
+0
-2
src/fpga/V1/deconv_filter.cpp
src/fpga/V1/deconv_filter.cpp
+5
-15
src/fpga/V1/deconv_filter.h
src/fpga/V1/deconv_filter.h
+0
-1
src/fpga/V1/pe.cpp
src/fpga/V1/pe.cpp
+71
-243
src/fpga/common/driver.cpp
src/fpga/common/driver.cpp
+6
-10
src/fpga/common/driver.h
src/fpga/common/driver.h
+3
-7
src/fpga/common/fpga_common.cpp
src/fpga/common/fpga_common.cpp
+1
-1
src/fpga/common/fpga_common.h
src/fpga/common/fpga_common.h
+20
-1
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
+0
-1
未找到文件。
CMakeLists.txt
浏览文件 @
39c27adb
...
...
@@ -10,6 +10,7 @@ option(LOG_PROFILE "log profile" OFF)
option
(
CPU
"armv7 with neon"
ON
)
option
(
GPU_MALI
"mali gpu"
OFF
)
option
(
GPU_CL
"opencl gpu"
OFF
)
option
(
FPGA
"fpga"
OFF
)
if
(
FPGA
)
option
(
FPGAV1
"fpga v1"
ON
)
...
...
@@ -144,7 +145,7 @@ if(FPGA)
endforeach
()
file
(
GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h
)
foreach
(
f
${
_tmp_list
}
)
list
(
REMOVE_ITEM PADDLE_MOBILE_
CC
${
f
}
)
list
(
REMOVE_ITEM PADDLE_MOBILE_
H
${
f
}
)
endforeach
()
endif
()
if
(
FPGAV2
)
...
...
@@ -156,7 +157,7 @@ if(FPGA)
endforeach
()
file
(
GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h
)
foreach
(
f
${
_tmp_list
}
)
list
(
REMOVE_ITEM PADDLE_MOBILE_
CC
${
f
}
)
list
(
REMOVE_ITEM PADDLE_MOBILE_
H
${
f
}
)
endforeach
()
endif
()
...
...
src/fpga/V1/api.cpp
浏览文件 @
39c27adb
...
...
@@ -24,8 +24,6 @@ namespace fpga {
#define USE_RELU 1
#define USE_BIAS 2
// Rounds `cw` up to a multiple of IMAGE_ALIGNMENT by delegating to align_to_x.
// `cw` is presumably a channels * width element count -- TODO confirm with callers.
int get_align_image_cw(int cw) {
  const int aligned_cw = align_to_x(cw, IMAGE_ALIGNMENT);
  return aligned_cw;
}
void
format_image
(
framework
::
Tensor
*
image_tensor
)
{
auto
dims
=
image_tensor
->
dims
();
auto
channel
=
dims
[
1
],
height
=
dims
[
2
],
width
=
dims
[
3
];
...
...
@@ -96,10 +94,6 @@ int get_aligned_filter_element_num(int chw) {
return
align_to_x
(
chw
,
FILTER_ELEMENT_ALIGNMENT
);
}
// Rounds a filter count up to a multiple of FILTER_NUM_ALIGNMENT by
// delegating to align_to_x.
int get_aligned_filter_num(int num) {
  const int aligned_num = align_to_x(num, FILTER_NUM_ALIGNMENT);
  return aligned_num;
}
void
format_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
,
int
group_num
)
{
filter_tensor
->
scale
[
0
]
=
float
(
max_value
/
127.0
);
// NOLINT
...
...
@@ -177,46 +171,37 @@ void format_concat_output(framework::Tensor *out, int height, int width,
void
expand_conv_arg
(
ConvArgs
*
arg
)
{
ConvArgs
args
=
*
arg
;
uint64_t
filterlen
=
(
uint64_t
)
args
.
kernel
.
width
*
(
uint64_t
)
args
.
kernel
.
height
*
(
uint64_t
)
args
.
image
.
channels
;
filterlen
=
align_to_x
(
filterlen
,
FILTER_ELEMENT_ALIGNMENT
);
filterlen
*=
align_to_x
((
uint64_t
)
args
.
filter_num
,
FILTER_NUM_ALIGNMENT
);
uint64_t
fpga_bias_scale_len
=
auto
fpga_bias_scale_len
=
align_to_x
(
args
.
filter_num
/
args
.
group_num
,
8
)
*
args
.
group_num
;
uint64_t
output_height
=
auto
output_height
=
(
args
.
image
.
height
+
args
.
image
.
pad_height
*
2
-
args
.
kernel
.
height
)
/
args
.
kernel
.
stride_h
+
1
;
uint64_t
output_width
=
auto
output_width
=
(
args
.
image
.
width
+
args
.
image
.
pad_width
*
2
-
args
.
kernel
.
width
)
/
args
.
kernel
.
stride_w
+
1
;
uint64_t
output_size
=
output_height
*
output_width
*
(
uint64_t
)
args
.
filter_num
;
auto
filter_per_group
=
(
uint64_t
)(
args
.
filter_num
/
args
.
group_num
);
auto
channel_per_group
=
(
uint64_t
)(
args
.
image
.
channels
/
args
.
group_num
);
uint64_t
image_row_count
=
((
uint64_t
)
args
.
image
.
width
)
*
((
uint64_t
)
args
.
image
.
channels
);
// without align
uint64_t
image_amount_per_row
=
align_to_x
(
image_row_count
,
IMAGE_ALIGNMENT
);
uint64_t
image_one_pad_per_row
=
align_to_x
(
image_row_count
,
IMAGE_ALIGNMENT
)
+
((
uint64_t
)
args
.
image
.
pad_width
)
*
((
uint64_t
)
args
.
image
.
channels
);
uint64_t
filter_amount_all
=
align_to_x
(((
uint64_t
)
args
.
kernel
.
height
)
*
((
uint64_t
)
args
.
kernel
.
width
)
*
channel_per_group
,
auto
filter_per_group
=
args
.
filter_num
/
args
.
group_num
;
auto
channel_per_group
=
args
.
image
.
channels
/
args
.
group_num
;
auto
image_row_count
=
args
.
image
.
width
*
args
.
image
.
channels
;
auto
image_amount_per_row
=
align_to_x
(
image_row_count
,
IMAGE_ALIGNMENT
);
auto
image_one_pad_per_row
=
align_to_x
(
image_row_count
,
IMAGE_ALIGNMENT
)
+
args
.
image
.
pad_width
*
args
.
image
.
channels
;
auto
filter_amount_all
=
align_to_x
(
args
.
kernel
.
height
*
args
.
kernel
.
width
*
channel_per_group
,
FILTER_ELEMENT_ALIGNMENT
);
uint64_t
output_amount_per_row
=
align_to_x
(
output_width
*
((
uint64_t
)
args
.
filter_num
)
,
IMAGE_ALIGNMENT
);
auto
output_amount_per_row
=
align_to_x
(
output_width
*
args
.
filter_num
,
IMAGE_ALIGNMENT
);
// find the opt partition strategy
uint64_t
res_win
;
uint64_t
res_fit
=
0
;
for
(
res_win
=
1
;
res_win
<=
output_width
;
res_win
=
res_win
+
1
)
{
for
(
res_win
=
1
;
res_win
<=
output_width
;
res_win
++
)
{
if
((
align_to_x
(
(
args
.
image
.
channels
*
(
args
.
kernel
.
width
+
(
res_win
-
1
)
*
args
.
kernel
.
stride_w
)),
...
...
@@ -238,48 +223,48 @@ void expand_conv_arg(ConvArgs *arg) {
}
res_fit
=
res_win
;
uint64_t
block_num
=
(
output_width
+
res_fit
-
1
)
/
res_fit
;
uint64_t
block_len
=
res_fit
;
uint64_t
block_last
=
output_width
-
res_fit
*
(
block_num
-
1
);
auto
block_num
=
(
output_width
+
res_fit
-
1
)
/
res_fit
;
auto
block_len
=
res_fit
;
auto
block_last
=
output_width
-
res_fit
*
(
block_num
-
1
);
uint64_t
res_amount_per_row
=
output_width
*
args
.
filter_num
;
uint64_t
res_amount_per_row_pad
=
output_amount_per_row
-
res_amount_per_row
;
auto
res_amount_per_row
=
output_width
*
args
.
filter_num
;
auto
res_amount_per_row_pad
=
output_amount_per_row
-
res_amount_per_row
;
uint64_t
image_block_amount_per_row
=
args
.
kernel
.
stride_w
*
(
res_fit
)
*
args
.
image
.
channels
;
uint64_t
filter_pad_width_mul_channel
=
auto
image_block_amount_per_row
=
args
.
kernel
.
stride_w
*
res_fit
*
args
.
image
.
channels
;
auto
filter_pad_width_mul_channel
=
args
.
image
.
pad_width
*
args
.
image
.
channels
;
uint64_t
image_amount_per_row_multi_win_first
=
auto
image_amount_per_row_multi_win_first
=
image_amount_per_row
*
(
4
*
args
.
kernel
.
stride_h
-
args
.
image
.
pad_height
);
uint64_t
image_amount_per_row_multi_win
=
auto
image_amount_per_row_multi_win
=
image_amount_per_row
*
(
4
*
args
.
kernel
.
stride_h
);
uint64_t
image_block_num
=
block_num
;
uint64_t
image_block_len
=
auto
image_block_num
=
block_num
;
auto
image_block_len
=
align_to_x
((
args
.
image
.
channels
*
(
args
.
kernel
.
width
+
(
block_len
-
1
)
*
args
.
kernel
.
stride_w
)),
IMAGE_ALIGNMENT
)
/
16
+
1
;
uint64_t
image_block_len_last
=
auto
image_block_len_last
=
align_to_x
(
(
args
.
image
.
channels
*
(
args
.
kernel
.
width
+
(
block_last
-
1
)
*
args
.
kernel
.
stride_w
)),
IMAGE_ALIGNMENT
)
/
16
+
1
;
uint64_t
image_win_cnt
=
block_len
;
uint64_t
image_win_cnt_last
=
block_last
;
uint64_t
res_row_data_align4_pad
=
res_amount_per_row_pad
/
8
;
uint64_t
prog_full_cnt
=
2048
/
(
filter_amount_all
/
16
*
2
)
-
1
;
auto
image_win_cnt
=
block_len
;
auto
image_win_cnt_last
=
block_last
;
auto
res_row_data_align4_pad
=
res_amount_per_row_pad
/
8
;
auto
prog_full_cnt
=
2048
/
(
filter_amount_all
/
16
*
2
)
-
1
;
if
(
prog_full_cnt
==
1023
)
{
prog_full_cnt
--
;
}
uint64_t
post_prog_full_cnt
=
auto
post_prog_full_cnt
=
(
512
/
(
align_to_x
(
args
.
filter_num
,
4
)
/
4
*
2
)
>
2
)
?
(
512
/
(
align_to_x
(
args
.
filter_num
,
4
)
/
4
*
2
)
-
2
)
:
0
;
uint64_t
cmd
=
0UL
|
(
args
.
relu_enabled
?
USE_RELU
:
0
)
|
USE_BIAS
;
auto
cmd
=
0UL
|
(
args
.
relu_enabled
?
USE_RELU
:
0
)
|
USE_BIAS
;
(
*
arg
).
driver
.
image_address_phy
=
vaddr_to_paddr
(
args
.
image
.
address
);
(
*
arg
).
driver
.
sb_address_phy
=
vaddr_to_paddr
(
args
.
sb_address
);
...
...
@@ -449,7 +434,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg
->
sub_conv_num
=
(
uint32_t
)
stride_h
;
arg
->
filter_num
=
(
uint32_t
)
filter
->
dims
()[
0
];
int
sub_conv_num
=
arg
->
sub_conv_num
;
int
sub_stride
=
1
;
int
sub_pad
=
deconv_filter
::
deconv_calc_sub_pad
((
int
)
filter
->
dims
()[
3
],
padding_w
,
stride_w
);
int
sub_filter_width
=
deconv_filter
::
deconv_get_sub_filter_axis
(
...
...
@@ -466,16 +450,12 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
stride_w
,
(
int
)
filter
->
dims
()[
3
],
padding_w
);
arg
->
conv_args
=
(
ConvArgs
*
)
fpga_malloc
(
sub_conv_num
*
sizeof
(
ConvArgs
));
int
sub_channels
=
(
int
)
input
->
dims
()[
1
];
int
omit_size
=
arg
->
omit_size
;
int
real_out_width
=
sub_output_width
*
sub_conv_num
-
2
*
omit_size
;
int
real_out_height
=
sub_output_height
*
sub_conv_num
-
2
*
omit_size
;
auto
sub_channels
=
(
int
)
input
->
dims
()[
1
];
int
sub_filter_num
=
sub_conv_num
*
(
arg
->
filter_num
);
int
conv_output_size
=
(
align_to_x
(
sub_output_width
*
sub_filter_num
,
IMAGE_ALIGNMENT
))
*
sub_output_height
;
int
ouput_size
=
conv_output_size
*
sub_conv_num
;
int
align_sub_filter_num
=
align_to_x
(
sub_filter_num
,
FILTER_NUM_ALIGNMENT
);
int
align_sub_filter_count
=
...
...
@@ -485,7 +465,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
align_sub_filter_count
*
align_sub_filter_num
;
for
(
int
i
=
0
;
i
<
sub_conv_num
;
++
i
)
{
arg
->
conv_args
[
i
].
filter_num
=
(
arg
->
sub_conv_num
)
*
(
arg
->
filter_num
)
;
arg
->
conv_args
[
i
].
filter_num
=
arg
->
sub_conv_num
*
arg
->
filter_num
;
arg
->
conv_args
[
i
].
group_num
=
(
uint32_t
)
group_num
;
arg
->
conv_args
[
i
].
filter_scale_address
=
filter
->
scale
;
...
...
@@ -496,7 +476,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg
->
conv_args
[
i
].
kernel
.
stride_w
=
1
;
arg
->
conv_args
[
i
].
kernel
.
stride_h
=
1
;
// DeconvParam.conv_args[i].image.address = (void*)ptr_image;
arg
->
conv_args
[
i
].
image
.
scale_address
=
input
->
scale
;
arg
->
conv_args
[
i
].
image
.
channels
=
(
uint32_t
)
sub_channels
;
arg
->
conv_args
[
i
].
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
...
...
@@ -504,30 +483,31 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg
->
conv_args
[
i
].
image
.
pad_width
=
(
uint32_t
)
sub_pad
;
arg
->
conv_args
[
i
].
image
.
pad_height
=
(
uint32_t
)
sub_pad
;
arg
->
conv_args
[
i
].
image
.
address
=
input_ptr
;
arg
->
conv_args
[
i
].
sb_address
=
(
void
*
)
bs_ptr
;
arg
->
conv_args
[
i
].
sb_address
=
bs_ptr
;
auto
filter_sub_space
=
(
char
*
)
fpga_malloc
(
align_conv_sub_filter_count
*
sizeof
(
char
));
fpga_copy
(
filter_sub_space
,
(
char
*
)
filter_ptr
+
i
*
align_conv_sub_filter_count
,
(
size_t
)
align_conv_sub_filter_count
);
arg
->
conv_args
[
i
].
filter_address
=
(
void
*
)(
filter_sub_space
)
;
arg
->
conv_args
[
i
].
filter_address
=
filter_sub_space
;
fpga_flush
(
filter_sub_space
,
(
size_t
)
align_conv_sub_filter_count
);
if
(
sub_conv_num
==
1
)
{
arg
->
conv_args
[
i
].
output
.
address
=
out_ptr
;
arg
->
conv_args
[
i
].
output
.
scale_address
=
out
->
scale
;
}
else
{
auto
ptr_output
=
(
half
*
)
fpga_malloc
(
conv_output_size
*
sizeof
(
half
));
arg
->
conv_args
[
i
].
output
.
address
=
(
void
*
)((
half
*
)
ptr_output
)
;
auto
ptr_output
=
fpga_malloc
(
conv_output_size
*
sizeof
(
half
));
arg
->
conv_args
[
i
].
output
.
address
=
ptr_output
;
auto
ptr_output_scale
=
(
float
*
)
fpga_malloc
(
2
*
sizeof
(
float
));
arg
->
conv_args
[
i
].
output
.
scale_address
=
ptr_output_scale
;
}
expand_conv_arg
(
&
arg
->
conv_args
[
i
]);
}
arg
->
output
.
address
=
out_ptr
;
arg
->
output
.
scale_address
=
out
->
scale
;
// fpga_free(filter_
ptr);
filter
->
reset_data_ptr
(
null
ptr
);
}
// fill_deconv_arg
}
// namespace fpga
...
...
src/fpga/V1/api.h
浏览文件 @
39c27adb
...
...
@@ -21,7 +21,6 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
fpga
{
int
get_align_image_cw
(
int
cw
);
void
format_image
(
framework
::
Tensor
*
image_tensor
);
void
format_fp16_ofm
(
framework
::
Tensor
*
ofm_tensor
);
// only allocate memory
void
format_fp32_ofm
(
framework
::
Tensor
*
ofm_tensor
);
...
...
@@ -30,7 +29,6 @@ float filter_find_max(framework::Tensor* filter_tensor);
int
get_filter_num_per_div
(
framework
::
Tensor
*
filter_tensor
,
int
group_num
);
int
get_plit_num
(
framework
::
Tensor
*
filter_tensor
);
int
get_aligned_filter_element_num
(
int
chw
);
int
get_aligned_filter_num
(
int
num
);
void
format_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
,
int
group_num
);
void
format_fc_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
);
...
...
src/fpga/V1/deconv_filter.cpp
浏览文件 @
39c27adb
...
...
@@ -40,10 +40,9 @@ inverse kernel weights of each channel for every filter
void
deconv_inverse_filter
(
float
**
data_in
,
int
num
,
int
channel
,
int
width
,
int
height
)
{
float
*
tmp
=
*
data_in
;
// float fix_range = 127;// float scale = fix_range / max;
int
data_size
=
num
*
channel
*
width
*
height
;
int
hw_len
=
height
*
width
;
float
*
tmp_data
=
(
float
*
)
fpga_malloc
(
data_size
*
sizeof
(
float
));
auto
tmp_data
=
(
float
*
)
fpga_malloc
(
data_size
*
sizeof
(
float
));
for
(
int
i
=
0
;
i
<
num
;
++
i
)
{
for
(
int
j
=
0
;
j
<
channel
;
++
j
)
{
for
(
int
k
=
0
;
k
<
hw_len
;
++
k
)
{
...
...
@@ -52,7 +51,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width,
}
}
}
*
data_in
=
(
float
*
)
tmp_data
;
//
*
data_in
=
tmp_data
;
fpga_free
(
tmp
);
}
...
...
@@ -61,8 +60,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width,
*/
// Computes the padding applied to each sub-convolution of a deconvolution:
// (filter_axis - pad - 1) / stride.
// Returns 0 when stride is 0 or when filter_axis - pad - 1 is negative.
// NOTE(review): this span looks like it interleaves the pre- and post-commit
// versions of the guard body -- confirm which of `return 0;` and the
// PADDLE_MOBILE_ENFORCE call is meant to remain.
int
deconv_calc_sub_pad
(
int
filter_axis
,
int
pad
,
int
stride
)
{
if
(
stride
==
0
||
((
filter_axis
-
pad
-
1
)
<
0
))
{
// error
return
0
;
// NOTE(review): unreachable as written, because the return above always
// leaves the function before this check can run.
PADDLE_MOBILE_ENFORCE
(
false
,
"Wrong deconv parameters"
);
}
return
(
filter_axis
-
pad
-
1
)
/
stride
;
}
...
...
@@ -79,11 +77,8 @@ int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) {
position. So the number of omitted rows or columns is (stride - idx)
*/
int
deconv_get_omit
(
int
stride
,
int
filter_width
,
int
pad
)
{
if
(((
filter_width
-
pad
)
<=
0
))
{
// ((filter_width-pad) > stride) ||
// error
return
0
;
}
int
idx
=
1
;
PADDLE_MOBILE_ENFORCE
(
filter_width
>
pad
,
"Wrong deconv parameters"
);
int
idx
;
bool
flag
=
false
;
for
(
idx
=
1
;
idx
<=
stride
;
++
idx
)
{
int
j
=
idx
;
...
...
@@ -102,10 +97,6 @@ int deconv_get_omit(int stride, int filter_width, int pad) {
return
(
stride
-
idx
);
}
// Returns the product of the filter count and the stride.
int deconv_get_sub_filter_num(int filter_num, int stride) {
  const int sub_filter_total = filter_num * stride;
  return sub_filter_total;
}
void
deconv_get_sub_filter
(
char
**
data_in
,
int
height
,
int
width
,
int
sub_conv_n
,
int
kernel_num
,
int
channel
)
{
char
*
ptr_tmp
=
*
data_in
;
...
...
@@ -245,7 +236,6 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
char
*
ptr_space
=
(
char
*
)
fpga_malloc
(
sub_conv_n
*
align_offset
*
sizeof
(
char
));
// continuous space
for
(
int
i
=
0
;
i
<
sub_conv_n
;
++
i
)
{
int
offset
=
i
*
origin_offset
;
char
*
ptr_tmp
=
(
ptr_ptr_data
)[
i
];
filter
::
align_element
(
&
ptr_tmp
,
sub_num
,
sub_chw
);
...
...
src/fpga/V1/deconv_filter.h
浏览文件 @
39c27adb
...
...
@@ -21,7 +21,6 @@ namespace deconv_filter {
void
deconv_inverse_filter
(
float
**
data_in
,
int
num
,
int
channel
,
int
width
,
int
height
);
int
deconv_calc_sub_pad
(
int
filter_axis
,
int
pad
,
int
stride
);
int
deconv_get_sub_filter_num
(
int
filter_num
,
int
stride
);
int
deconv_get_sub_filter_axis
(
int
filter_axis
,
int
stride
);
int
deconv_get_sub_out_axis
(
int
image_axis
,
int
sub_pad
,
int
sub_filter_axis
);
int
deconv_get_omit
(
int
stride
,
int
filter_width
,
int
pad
);
...
...
src/fpga/V1/pe.cpp
浏览文件 @
39c27adb
此差异已折叠。
点击以展开。
src/fpga/common/driver.cpp
浏览文件 @
39c27adb
...
...
@@ -153,10 +153,6 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
uint64_t
_nr
=
DIV_ROUND_UP
(
size
,
FPGA_PAGE_SIZE
);
unsigned
int
nr
=
(
unsigned
int
)
_nr
;
int
ret
=
0
;
DLOG
<<
size
;
DLOG
<<
_nr
;
DLOG
<<
nr
;
uint64_t
a_size
=
FPGA_PAGE_SIZE
*
nr
;
DLOG
<<
a_size
;
...
...
@@ -283,7 +279,7 @@ int fpga_memory_add() {
return
0
;
}
uint64_t
vaddr_to_paddr
(
void
*
address
)
{
uint64_t
vaddr_to_paddr
_driver
(
void
*
address
)
{
uint64_t
paddr
=
0
;
auto
iter
=
g_fpgainfo
.
fpga_vaddr2paddr_map
.
find
(
address
);
if
(
iter
!=
g_fpgainfo
.
fpga_vaddr2paddr_map
.
end
())
{
...
...
@@ -315,7 +311,7 @@ void *fpga_reg_free(void *ptr) {
g_fpgainfo
.
fpga_addr2size_map
.
erase
(
iter
);
munmap
(
ptr
,
size
);
}
else
{
DLOG
<<
"Invalid pointer"
;
DLOG
<<
"Invalid pointer"
<<
ptr
;
}
}
...
...
@@ -347,7 +343,7 @@ void fpga_free_driver(void *ptr) {
g_fpgainfo
.
fpga_addr2size_map
.
erase
(
iter
);
munmap
(
ptr
,
size
);
p_addr
=
vaddr_to_paddr
(
ptr
);
p_addr
=
vaddr_to_paddr
_driver
(
ptr
);
pos
=
(
p_addr
-
g_fpgainfo
.
memory_info
->
mem_start
)
/
FPGA_PAGE_SIZE
;
/*clear bitmap*/
...
...
@@ -361,7 +357,7 @@ void fpga_free_driver(void *ptr) {
g_fpgainfo
.
fpga_vaddr2paddr_map
.
erase
(
iter
);
}
}
else
{
DLOG
<<
"Invalid pointer"
;
DLOG
<<
"Invalid pointer"
<<
ptr
;
}
}
...
...
@@ -373,7 +369,7 @@ int fpga_flush_driver(void *address, size_t size) {
struct
MemoryCacheArgs
args
;
uint64_t
p_addr
;
p_addr
=
vaddr_to_paddr
(
address
);
p_addr
=
vaddr_to_paddr
_driver
(
address
);
args
.
offset
=
(
void
*
)(
p_addr
-
FPGA_MEM_PHY_ADDR
);
// NOLINT
args
.
size
=
size
;
...
...
@@ -385,7 +381,7 @@ int fpga_invalidate_driver(void *address, size_t size) {
struct
MemoryCacheArgs
args
;
uint64_t
p_addr
;
p_addr
=
vaddr_to_paddr
(
address
);
p_addr
=
vaddr_to_paddr
_driver
(
address
);
args
.
offset
=
(
void
*
)(
p_addr
-
FPGA_MEM_PHY_ADDR
);
// NOLINT
args
.
size
=
size
;
...
...
src/fpga/common/driver.h
浏览文件 @
39c27adb
...
...
@@ -31,8 +31,8 @@ namespace driver {
#define FPGA_REG_PHY_ADDR 0xa0000000
#define FPGA_REG_SIZE 0x1000
#define FPGA_MEM_PHY_ADDR 0x
2
0000000
#define FPGA_MEM_SIZE 0x
2
0000000
#define FPGA_MEM_PHY_ADDR 0x
4
0000000
#define FPGA_MEM_SIZE 0x
8
0000000
#define FPGA_PAGE_SIZE (16UL * 1024UL)
...
...
@@ -122,15 +122,11 @@ void *fpga_malloc_driver(size_t size);
void
fpga_free_driver
(
void
*
ptr
);
void
fpga_copy_driver
(
void
*
dest
,
const
void
*
src
,
size_t
num
);
int
fpga_flush_driver
(
void
*
address
,
size_t
size
);
int
fpga_invalidate_driver
(
void
*
address
,
size_t
size
);
/*pe*/
uint64_t
vaddr_to_paddr
(
void
*
address
);
uint64_t
vaddr_to_paddr_driver
(
void
*
address
);
int
fpga_regpoll
(
uint64_t
reg
,
uint64_t
val
,
int
time
);
...
...
src/fpga/common/fpga_common.cpp
浏览文件 @
39c27adb
...
...
@@ -115,7 +115,7 @@ int fpga_invalidate(void *address, size_t size) {
}
uint64_t
vaddr_to_paddr
(
void
*
address
)
{
#ifdef PADDLE_MOBILE_ZU5
return
driver
::
vaddr_to_paddr
(
address
);
return
driver
::
vaddr_to_paddr
_driver
(
address
);
#else
return
0
;
#endif
...
...
src/fpga/common/fpga_common.h
浏览文件 @
39c27adb
...
...
@@ -37,6 +37,18 @@ enum LayoutType {
LAYOUT_HWC
=
0
,
};
// Activation kinds carried in ActivationArgs::activation_type.
// Enumerators are numbered consecutively from 0 in declaration order.
enum ActivationType {
  NONE = 0,   // 0
  LEAKYRELU,  // 1
  SIGMOID,    // 2
  TANH,       // 3
};
// Bundles the activation selector with the leaky-ReLU slope parameter.
struct ActivationArgs {
  // Which activation to apply (see ActivationType).
  enum ActivationType activation_type;
  // Negative-side slope for leaky ReLU, stored in 16 bits.
  // NOTE(review): presumably an fp16 bit pattern (cf. fp32_2_fp16) -- confirm.
  int16_t leaky_relu_negative_slope;
};
struct
KernelArgs
{
uint32_t
width
;
uint32_t
height
;
...
...
@@ -58,7 +70,10 @@ struct ImageOutputArgs {
void
*
address
;
// output result address;
float
*
scale_address
;
// output scale address;
uint64_t
timer_cnt
;
// time counter for FPGA computation
struct
ActivationArgs
activation
;
// To select activation and specify (Leaky)Relu parameter.
};
#ifdef PADDLE_MOBILE_FPGA_V1
struct
ConvDriverParam
{
uint64_t
image_address_phy
;
...
...
@@ -198,7 +213,11 @@ struct DeconvArgs {
struct
ConvArgs
*
conv_args
;
};
// Rounds `num` up to the nearest multiple of `x` using truncating integer
// division. `x` must be non-zero.
static inline int align_to_x(int num, int x) {
  const int bumped = num + x - 1;
  const int multiples = bumped / x;
  return multiples * x;
}
// static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
// }
// Rounds `num` up to the nearest multiple of `x`, returning a 32-bit result.
// The sum num + x is formed in 64 bits, then narrowed to uint32_t before the
// "- 1", the division and the multiplication, so everything after the sum is
// unsigned 32-bit arithmetic. `x` must be non-zero after narrowing.
static inline uint32_t align_to_x(int64_t num, int64_t x) {
  const uint32_t step = static_cast<uint32_t>(x);
  const uint32_t bumped = static_cast<uint32_t>(num + x) - 1;
  return bumped / step * step;
}
int16_t
fp32_2_fp16
(
float
fp32_num
);
float
fp16_2_fp32
(
int16_t
fp16_num
);
...
...
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
浏览文件 @
39c27adb
...
...
@@ -14,7 +14,6 @@ limitations under the License. */
#ifdef TRANSPOSE2_OP
#include "operators/kernel/transpose2_kernel.h"
#include "operators/kernel/central-arm-func/transpose2_arm_func.h"
namespace
paddle_mobile
{
namespace
operators
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录