Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
a27e0055
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
332
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
a27e0055
编写于
12月 20, 2018
作者:
H
hjchen2
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'ocr_ctc' of
https://github.com/hjchen2/paddle-mobile
into ocr_ctc
上级
3cd9c13b
d1807b4c
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
259 addition
and
73 deletion
+259
-73
src/fpga/V1/api.cpp
src/fpga/V1/api.cpp
+170
-38
src/fpga/V1/api.h
src/fpga/V1/api.h
+5
-0
src/fpga/V1/pe.cpp
src/fpga/V1/pe.cpp
+73
-26
src/fpga/common/fpga_common.cpp
src/fpga/common/fpga_common.cpp
+3
-0
src/fpga/common/fpga_common.h
src/fpga/common/fpga_common.h
+1
-1
src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
+4
-4
src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
+3
-4
未找到文件。
src/fpga/V1/api.cpp
浏览文件 @
a27e0055
...
...
@@ -81,6 +81,13 @@ int get_plit_num(framework::Tensor *filter_tensor) {
int
div_capacity
=
filter
::
calc_division_capacity
(
chw
);
return
filter
::
calc_split_num
(
num
,
div_capacity
);
}
// Split count for a deconvolution filter, obtained from
// filter::calc_split_num().
//
// filter_tensor: tensor whose dims() entries 0..3 are read below
//                (presumably N, C, H, W -- TODO confirm the layout).
// stride:        divides the dims[1..3] product twice and multiplies dims[0];
//                it is used as a divisor, so it must be non-zero.
int get_deconv_plit_num(framework::Tensor *filter_tensor, int stride) {
  auto shape = filter_tensor->dims();

  // Same left-to-right evaluation as
  //   shape[1] * shape[2] / stride * shape[3] / stride
  // The order of the two divisions is kept on purpose: if these are integer
  // divisions, regrouping them would change the rounding.
  auto elems_per_filter = shape[1] * shape[2] / stride;
  elems_per_filter = elems_per_filter * shape[3] / stride;

  auto filter_count = shape[0] * stride;

  const int capacity = filter::calc_division_capacity(elems_per_filter);
  return filter::calc_split_num(filter_count, capacity);
}
int
get_filter_num_per_div
(
framework
::
Tensor
*
filter_tensor
,
int
group_num
)
{
auto
dims
=
filter_tensor
->
dims
();
...
...
@@ -90,6 +97,15 @@ int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) {
return
filter
::
calc_num_per_div
(
num
,
group_num
,
div_capacity
);
}
// Per-division filter count for a deconvolution filter, obtained from
// filter::calc_num_per_div().
//
// filter_tensor: tensor whose dims() entries 0..3 are read below
//                (presumably N, C, H, W -- TODO confirm the layout).
// group_num:     forwarded unchanged to filter::calc_num_per_div().
// stride:        divides the dims[1..3] product twice and multiplies dims[0];
//                it is used as a divisor, so it must be non-zero.
int get_deconv_filter_num_per_div(framework::Tensor *filter_tensor,
                                  int group_num, int stride) {
  auto shape = filter_tensor->dims();

  // Same left-to-right evaluation as
  //   shape[1] * shape[2] / stride * shape[3] / stride
  // The order of the two divisions is kept on purpose: if these are integer
  // divisions, regrouping them would change the rounding.
  auto elems_per_filter = shape[1] * shape[2] / stride;
  elems_per_filter = elems_per_filter * shape[3] / stride;

  auto filter_count = shape[0] * stride;

  const int capacity = filter::calc_division_capacity(elems_per_filter);
  return filter::calc_num_per_div(filter_count, group_num, capacity);
}
// Returns chw passed through align_to_x() with FILTER_ELEMENT_ALIGNMENT as
// the alignment argument.
int get_aligned_filter_element_num(int chw) {
  const int aligned_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  return aligned_chw;
}
...
...
@@ -448,14 +464,20 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg
->
sub_output_height
=
(
uint32_t
)
sub_output_height
;
arg
->
omit_size
=
(
uint32_t
)
deconv_filter
::
deconv_get_omit
(
stride_w
,
(
int
)
filter
->
dims
()[
3
],
padding_w
);
arg
->
conv_args
=
(
ConvArgs
*
)
fpga_malloc
(
sub_conv_num
*
sizeof
(
ConvArgs
));
auto
sub_channels
=
(
int
)
input
->
dims
()[
1
];
arg
->
output
.
address
=
out_ptr
;
arg
->
output
.
scale_address
=
out
->
scale
;
int
sub_channels
=
(
int
)
input
->
dims
()[
1
];
int
omit_size
=
arg
->
omit_size
;
int
real_out_width
=
sub_output_width
*
sub_conv_num
-
2
*
omit_size
;
int
real_out_height
=
sub_output_height
*
sub_conv_num
-
2
*
omit_size
;
int
sub_filter_num
=
sub_conv_num
*
(
arg
->
filter_num
);
int
conv_output_size
=
(
align_to_x
(
sub_output_width
*
sub_filter_num
,
IMAGE_ALIGNMENT
))
*
sub_output_height
;
int
ouput_size
=
conv_output_size
*
sub_conv_num
;
int
align_sub_filter_num
=
align_to_x
(
sub_filter_num
,
FILTER_NUM_ALIGNMENT
);
int
align_sub_filter_count
=
...
...
@@ -464,50 +486,160 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
int
align_conv_sub_filter_count
=
align_sub_filter_count
*
align_sub_filter_num
;
int
split_num
=
group_num
==
1
?
(
uint32_t
)
get_deconv_plit_num
(
filter
,
sub_conv_num
)
:
1
;
arg
->
split_conv_args
=
(
SplitConvArgs
*
)
fpga_malloc
(
sub_conv_num
*
sizeof
(
SplitConvArgs
));
for
(
int
i
=
0
;
i
<
sub_conv_num
;
++
i
)
{
arg
->
conv_args
[
i
].
filter_num
=
arg
->
sub_conv_num
*
arg
->
filter_num
;
arg
->
conv_args
[
i
].
group_num
=
(
uint32_t
)
group_num
;
arg
->
conv_args
[
i
].
filter_scale_address
=
filter
->
scale
;
arg
->
conv_args
[
i
].
relu_enabled
=
relu_enabled
;
arg
->
conv_args
[
i
].
kernel
.
width
=
(
uint32_t
)
sub_filter_width
;
arg
->
conv_args
[
i
].
kernel
.
height
=
(
uint32_t
)
sub_filter_width
;
arg
->
conv_args
[
i
].
kernel
.
stride_w
=
1
;
arg
->
conv_args
[
i
].
kernel
.
stride_h
=
1
;
arg
->
conv_args
[
i
].
image
.
scale_address
=
input
->
scale
;
arg
->
conv_args
[
i
].
image
.
channels
=
(
uint32_t
)
sub_channels
;
arg
->
conv_args
[
i
].
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
arg
->
conv_args
[
i
].
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
arg
->
conv_args
[
i
].
image
.
pad_width
=
(
uint32_t
)
sub_pad
;
arg
->
conv_args
[
i
].
image
.
pad_height
=
(
uint32_t
)
sub_pad
;
arg
->
conv_args
[
i
].
image
.
address
=
input_ptr
;
arg
->
conv_args
[
i
].
sb_address
=
bs_ptr
;
auto
filter_sub_space
=
(
char
*
)
fpga_malloc
(
align_conv_sub_filter_count
*
sizeof
(
char
));
fpga_copy
(
filter_sub_space
,
(
char
*
)
filter_ptr
+
i
*
align_conv_sub_filter_count
,
(
size_t
)
align_conv_sub_filter_count
);
arg
->
conv_args
[
i
].
filter_address
=
filter_sub_space
;
fpga_flush
(
filter_sub_space
,
(
size_t
)
align_conv_sub_filter_count
);
arg
->
split_conv_args
[
i
].
filter_num
=
(
arg
->
sub_conv_num
)
*
(
arg
->
filter_num
);
arg
->
split_conv_args
[
i
].
group_num
=
(
uint32_t
)
group_num
;
arg
->
split_conv_args
[
i
].
split_num
=
split_num
;
arg
->
split_conv_args
[
i
].
conv_arg
=
(
ConvArgs
*
)
fpga_malloc
(
split_num
*
sizeof
(
ConvArgs
));
arg
->
split_conv_args
[
i
].
concat_arg
.
height
=
sub_output_height
;
arg
->
split_conv_args
[
i
].
concat_arg
.
width
=
sub_output_width
;
arg
->
split_conv_args
[
i
].
concat_arg
.
image_num
=
split_num
;
arg
->
split_conv_args
[
i
].
concat_arg
.
images_in
=
(
half
**
)
fpga_malloc
(
split_num
*
sizeof
(
half
*
));
arg
->
split_conv_args
[
i
].
concat_arg
.
scales_in
=
(
float
**
)
fpga_malloc
(
split_num
*
sizeof
(
float
*
));
arg
->
split_conv_args
[
i
].
concat_arg
.
channel_num
=
(
uint32_t
*
)
fpga_malloc
(
split_num
*
sizeof
(
uint32_t
));
// arg->split_conv_args[i].concat_arg.image_out =
// fpga_malloc(conv_output_size * sizeof(half));
// arg->split_conv_args[i].concat_arg.scale_out = fpga_malloc(2 *
// sizeof(float));
}
int
filter_num_per_div
=
get_deconv_filter_num_per_div
(
filter
,
group_num
,
stride_w
);
int
element_num
=
get_aligned_filter_element_num
(
(
int
)(
sub_channels
*
sub_filter_width
*
sub_filter_width
));
int
chw
=
sub_channels
*
sub_filter_width
*
sub_filter_width
;
int
division_capacity
=
filter
::
calc_division_capacity
(
chw
);
int
num_per_div_before_alignment
=
filter
::
calc_num_per_div
(
sub_filter_num
,
group_num
,
division_capacity
);
int
num_per_div_after_alignment
=
align_to_x
(
num_per_div_before_alignment
,
FILTER_NUM_ALIGNMENT
);
int
div_num
=
(
sub_filter_num
+
num_per_div_before_alignment
-
1
)
/
num_per_div_before_alignment
;
int
residual
=
sub_filter_num
%
num_per_div_before_alignment
;
int
num_after_alignment
=
num_per_div_after_alignment
*
((
residual
==
0
)
?
div_num
:
(
div_num
-
1
))
+
align_to_x
(
residual
,
FILTER_NUM_ALIGNMENT
);
int
filter_sub_conv_offset
=
element_num
*
num_after_alignment
;
for
(
int
i
=
0
;
i
<
sub_conv_num
;
++
i
)
{
if
(
sub_conv_num
==
1
)
{
arg
->
conv_args
[
i
].
output
.
address
=
out_ptr
;
arg
->
conv_args
[
i
].
output
.
scale_address
=
out
->
scale
;
arg
->
split_conv_args
[
i
].
output
.
address
=
arg
->
output
.
address
;
arg
->
split_conv_args
[
i
].
output
.
scale_address
=
arg
->
output
.
scale_address
;
}
else
{
auto
ptr_output
=
fpga_malloc
(
conv_output_size
*
sizeof
(
half
));
arg
->
conv_args
[
i
].
output
.
address
=
ptr_output
;
auto
ptr_output
=
(
half
*
)
fpga_malloc
(
conv_output_size
*
sizeof
(
half
));
arg
->
split_conv_args
[
i
].
output
.
address
=
(
void
*
)((
half
*
)
ptr_output
)
;
auto
ptr_output_scale
=
(
float
*
)
fpga_malloc
(
2
*
sizeof
(
float
));
arg
->
conv_args
[
i
].
output
.
scale_address
=
ptr_output_scale
;
arg
->
split_
conv_args
[
i
].
output
.
scale_address
=
ptr_output_scale
;
}
expand_conv_arg
(
&
arg
->
conv_args
[
i
]);
}
arg
->
output
.
address
=
out_ptr
;
arg
->
output
.
scale_address
=
out
->
scale
;
for
(
int
j
=
0
;
j
<
split_num
;
++
j
)
{
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
relu_enabled
=
relu_enabled
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
group_num
=
(
uint32_t
)
group_num
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
kernel
.
width
=
(
uint32_t
)
sub_filter_width
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
kernel
.
height
=
(
uint32_t
)
sub_filter_width
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
kernel
.
stride_w
=
1
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
kernel
.
stride_h
=
1
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
image
.
scale_address
=
input
->
scale
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
image
.
channels
=
(
uint32_t
)
sub_channels
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
image
.
pad_width
=
(
uint32_t
)
sub_pad
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
image
.
pad_height
=
(
uint32_t
)
sub_pad
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
image
.
address
=
input_ptr
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_scale_address
=
filter
->
scale
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_num
=
(
uint32_t
)(
j
==
split_num
-
1
?
sub_filter_num
-
(
split_num
-
1
)
*
filter_num_per_div
// NOLINT
:
filter_num_per_div
);
size_t
filter_size
=
element_num
*
align_to_x
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_num
,
FILTER_NUM_ALIGNMENT
)
*
sizeof
(
int8_t
);
auto
filter_head
=
&
((
int8_t
*
)
filter_ptr
)[
j
*
element_num
*
filter_num_per_div
+
i
*
filter_sub_conv_offset
];
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_address
=
fpga_malloc
(
filter_size
);
memcpy
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_address
,
filter_head
,
filter_size
);
fpga_flush
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_address
,
filter_size
);
{
static
int
test_cnt
=
0
;
signed
char
result
=
0
;
if
(
test_cnt
<=
1
)
{
std
::
string
filename
=
"deconv_split_flt"
+
std
::
to_string
(
test_cnt
);
fpga
::
savefile
<
signed
char
>
(
filename
,
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_address
,
filter_size
,
result
);
test_cnt
++
;
}
}
size_t
bs_align_num
=
align_to_x
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_num
,
BS_NUM_ALIGNMENT
);
size_t
bs_size
=
2
*
bs_align_num
*
sizeof
(
float
);
auto
bs_head
=
&
bs_ptr
[
j
*
filter_num_per_div
*
2
];
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
sb_address
=
fpga_malloc
(
bs_size
);
memcpy
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
sb_address
,
bs_head
,
bs_size
);
fpga_flush
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
sb_address
,
bs_size
);
if
(
split_num
==
1
)
{
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
output
.
address
=
arg
->
split_conv_args
[
i
].
output
.
address
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
output
.
scale_address
=
arg
->
split_conv_args
[
i
].
output
.
scale_address
;
}
else
{
auto
ptr_output
=
(
half
*
)
fpga_malloc
(
conv_output_size
*
sizeof
(
half
));
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
output
.
address
=
(
void
*
)((
half
*
)
ptr_output
);
auto
ptr_output_scale
=
(
float
*
)
fpga_malloc
(
2
*
sizeof
(
float
));
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
output
.
scale_address
=
ptr_output_scale
;
}
arg
->
split_conv_args
[
i
].
concat_arg
.
images_in
[
j
]
=
(
half
*
)
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
output
.
address
;
// NOLINT
arg
->
split_conv_args
[
i
].
concat_arg
.
scales_in
[
j
]
=
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
output
.
scale_address
;
arg
->
split_conv_args
[
i
].
concat_arg
.
channel_num
[
j
]
=
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_num
;
expand_conv_arg
(
&
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
]));
}
arg
->
split_conv_args
[
i
].
concat_arg
.
image_out
=
arg
->
split_conv_args
[
i
].
output
.
address
;
arg
->
split_conv_args
[
i
].
concat_arg
.
scale_out
=
arg
->
split_conv_args
[
i
].
output
.
scale_address
;
}
filter
->
reset_data_ptr
(
nullptr
);
fpga_free
(
bs_ptr
);
}
// fill_deconv_arg
}
// namespace fpga
...
...
src/fpga/V1/api.h
浏览文件 @
a27e0055
...
...
@@ -27,7 +27,12 @@ void format_fp32_ofm(framework::Tensor* ofm_tensor);
// NOTE(review): presumably returns the largest value found in the filter
// tensor; the definition is not visible in this view -- confirm.
float
filter_find_max
(
framework
::
Tensor
*
filter_tensor
);
// Per-division filter count for a filter tensor and group count; the result
// comes from filter::calc_num_per_div().
int
get_filter_num_per_div
(
framework
::
Tensor
*
filter_tensor
,
int
group_num
);
// Deconvolution variant of get_filter_num_per_div(): takes an extra stride
// that scales the filter dims before filter::calc_num_per_div() is called.
// stride is used as a divisor and must be non-zero.
int
get_deconv_filter_num_per_div
(
framework
::
Tensor
*
filter_tensor
,
int
group_num
,
int
stride
);
// Split count for a filter tensor; the result comes from
// filter::calc_split_num().
int
get_plit_num
(
framework
::
Tensor
*
filter_tensor
);
// Deconvolution variant of get_plit_num(): takes an extra stride that scales
// the filter dims before filter::calc_split_num() is called.
// stride is used as a divisor and must be non-zero.
int
get_deconv_plit_num
(
framework
::
Tensor
*
filter_tensor
,
int
stride
);
// Returns chw aligned via align_to_x(chw, FILTER_ELEMENT_ALIGNMENT).
int
get_aligned_filter_element_num
(
int
chw
);
// NOTE(review): presumably rewrites the filter tensor into the layout the
// FPGA expects, using max_value for quantization; the definition is not
// visible in this view -- confirm.
void
format_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
,
int
group_num
);
...
...
src/fpga/V1/pe.cpp
浏览文件 @
a27e0055
...
...
@@ -13,15 +13,25 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/common/pe.h"
#include "common/types.h"
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"
#include "fpga/common/config.h"
#include "fpga/common/driver.h"
#ifdef COST_TIME_PRINT
#include <sys/time.h>
#include <time.h>
#include <iomanip>
#include <iostream>
//#include <iostream>
#endif
namespace
paddle_mobile
{
namespace
fpga
{
using
namespace
driver
;
// NOLINT
using
namespace
std
;
#define USE_RELU 1
#define USE_BIAS 2
...
...
@@ -162,15 +172,17 @@ int ComputeFpgaConv(const struct SplitConvArgs &args) {
<<
" group_num:"
<<
args
.
group_num
<<
" split_num:"
<<
args
.
split_num
;
#endif
int
ret
=
0
;
int
split_num
=
args
.
split_num
;
for
(
int
i
=
0
;
i
<
split_num
;
i
++
)
{
ComputeBasicConv
(
args
.
conv_arg
[
i
]);
ret
|=
ComputeBasicConv
(
args
.
conv_arg
[
i
]);
}
if
(
split_num
>
1
)
{
ComputeFPGAConcat
(
args
.
concat_arg
);
}
return
ret
;
}
int
ComputeBasicConv
(
const
struct
ConvArgs
&
args
)
{
...
...
@@ -250,12 +262,13 @@ int ComputeBasicConv(const struct ConvArgs &args) {
reg_writeq
(
args
.
driver
.
post_prog_full_cnt
,
0xd10
);
reg_writeq
(
args
.
driver
.
fpga_bias_scale_len
/
4
,
0xd20
);
reg_writeq
(
args
.
driver
.
cmd
,
REG_CONV_CMD
);
DLOG
<<
"before reg poll"
;
if
(
0
!=
fpga_regpoll
(
REG_INTERRUPT
,
INTERRUPT_CONV
,
PE_IRQ_TIMEOUT
))
{
g_fpgainfo
.
pe_data
->
pes
[
PE_IDX_CONV
]
->
status
=
ERROR
;
ret
=
-
EIO
;
DLOG
<<
"Conv Wait Irq Timeout!"
;
}
DLOG
<<
"after reg poll"
;
output_scale
=
reg_readq
(
REG_SCALE_PARAMETER
);
output_scale
=
(
output_scale
<<
32
)
|
(
output_scale
>>
32
);
...
...
@@ -289,6 +302,8 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
<<
" out_scale_address:"
<<
args
.
output
.
scale_address
;
#endif
#ifdef PADDLE_MOBILE_ZU5
DLOG
<<
"Polling"
;
// return 0;
uint64_t
output_scale
=
0
;
uint64_t
timer_cnt
=
0
;
int
ret
=
0
;
...
...
@@ -561,11 +576,13 @@ int PerformBypass(const struct BypassArgs &args) {
reg_writeq
(
datalen
,
REG_CONVERT_LENGTH
);
reg_writeq
(
cmd
,
REG_CONVERT_CMD
);
DLOG
<<
"before reg poll"
;
if
(
0
!=
fpga_regpoll
(
REG_INTERRUPT
,
INTERRUPT_BYPASS
,
PE_IRQ_TIMEOUT
))
{
g_fpgainfo
.
pe_data
->
pes
[
PE_IDX_BYPASS
]
->
status
=
ERROR
;
ret
=
-
EIO
;
DLOG
<<
"BYPASS Wait Irq Timeout!"
;
}
DLOG
<<
"after reg poll"
;
output_scale
=
reg_readq
(
REG_SCALE_PARAMETER
);
output_scale
=
(
output_scale
<<
32
)
|
(
output_scale
>>
32
);
...
...
@@ -619,37 +636,29 @@ void deconv_post_process(const struct DeconvArgs &args) {
int
align_deconv_row_len
=
align_to_x
(
deconv_row_len
,
16
);
for
(
int
idx
=
0
;
idx
<
sub_conv_n
;
++
idx
)
{
fpga_invalidate
(
args
.
conv_args
[
idx
].
output
.
address
,
align_origin_w
*
origin_h
*
sizeof
(
int16_t
));
paddle_mobile
::
fpga
::
fpga_invalidate
(
args
.
split_conv_args
[
idx
].
output
.
address
,
align_origin_w
*
origin_h
*
sizeof
(
int16_t
));
}
auto
ptr_deconv
=
(
int16_t
*
)
fpga_malloc
(
num
*
align_deconv_row_len
*
deconv_h
*
sizeof
(
int16_t
));
memset
(
ptr_deconv
,
0
,
num
*
align_deconv_row_len
*
deconv_h
*
sizeof
(
int16_t
));
int
deconv_idx
=
0
;
for
(
int
nn
=
0
;
nn
<
num
;
++
nn
)
{
for
(
int
hh
=
0
;
hh
<
origin_h
;
++
hh
)
{
int
hx
=
(
hh
%
sub_conv_n
);
auto
sub_t
=
(
int16_t
*
)(
args
.
conv_args
[
sub_conv_n
-
hx
-
1
].
output
.
address
);
(
int16_t
*
)(
args
.
split_
conv_args
[
sub_conv_n
-
hx
-
1
].
output
.
address
);
int
hi
=
(
hh
/
sub_conv_n
);
if
((
hh
<
omit_size
)
||
(
hh
>=
(
origin_h
-
omit_size
)))
continue
;
int
sidx
=
(
nn
*
origin_h
*
align_origin_w
+
hi
*
align_origin_w
+
omit_size
*
channel
);
fpga_copy
(
ptr_deconv
+
deconv_idx
,
sub_t
+
sidx
,
fpga_copy
((
int16_t
*
)(
args
.
output
.
address
)
+
deconv_idx
,
sub_t
+
sidx
,
sizeof
(
int16_t
)
*
deconv_row_len
);
deconv_idx
+=
align_deconv_row_len
;
}
}
fpga_copy
(
args
.
output
.
address
,
ptr_deconv
,
num
*
align_deconv_row_len
*
deconv_h
*
sizeof
(
int16_t
));
fpga_flush
(
args
.
output
.
address
,
num
*
align_deconv_row_len
*
deconv_h
*
sizeof
(
int16_t
));
fpga_free
(
ptr_deconv
);
}
// deconv_post_process
}
int
ComputeFpgaDeconv
(
const
struct
DeconvArgs
&
args
)
{
#ifdef FPGA_PRINT_MODE
...
...
@@ -661,32 +670,70 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
<<
" sub_conv_num:"
<<
args
.
sub_conv_num
;
DLOG
<<
"args.output.address: "
<<
args
.
output
.
address
<<
"args.output.scale_address: "
<<
args
.
output
.
scale_address
;
DLOG
<<
"args.conv_args.sb_address: "
<<
(
args
.
conv_args
)
->
sb_address
<<
"args.conv_args.filter_address: "
<<
(
args
.
conv_args
)
->
filter_address
;
#endif
#ifndef PADDLE_MOBILE_ZU5
return
0
;
#endif
int
sub_conv_num
=
args
.
sub_conv_num
;
#ifdef COST_TIME_PRINT
timeval
start
,
end
;
long
dif_sec
,
dif_usec
;
#endif
for
(
int
i
=
0
;
i
<
sub_conv_num
;
i
++
)
{
ComputeBasicConv
(
args
.
conv_args
[
i
]);
#ifdef COST_TIME_PRINT
gettimeofday
(
&
start
,
NULL
);
#endif
ComputeFpgaConv
(
args
.
split_conv_args
[
i
]);
#ifdef COST_TIME_PRINT
gettimeofday
(
&
end
,
NULL
);
dif_sec
=
end
.
tv_sec
-
start
.
tv_sec
;
dif_usec
=
end
.
tv_usec
-
start
.
tv_usec
;
std
::
cout
<<
"deconv basic_conv: "
<<
i
<<
" times: "
<<
" cost time: "
<<
(
dif_sec
*
1000000
+
dif_usec
)
<<
"us"
<<
std
::
endl
;
#endif
}
if
(
sub_conv_num
>
1
)
{
float
max_scale
=
-
1.0
f
;
#ifdef COST_TIME_PRINT
gettimeofday
(
&
start
,
NULL
);
#endif
for
(
int
i
=
0
;
i
<
sub_conv_num
;
i
++
)
{
paddle_mobile
::
fpga
::
fpga_invalidate
(
args
.
conv_args
[
i
].
output
.
scale_address
,
2
*
sizeof
(
float
));
float
ptr_scale
=
(
args
.
conv_args
[
i
].
output
.
scale_address
)[
0
];
args
.
split_
conv_args
[
i
].
output
.
scale_address
,
2
*
sizeof
(
float
));
float
ptr_scale
=
(
args
.
split_
conv_args
[
i
].
output
.
scale_address
)[
0
];
if
(
ptr_scale
>
max_scale
)
{
args
.
output
.
scale_address
[
0
]
=
ptr_scale
;
args
.
output
.
scale_address
[
1
]
=
(
args
.
conv_args
[
i
].
output
.
scale_address
)[
1
];
(
args
.
split_
conv_args
[
i
].
output
.
scale_address
)[
1
];
}
}
#ifdef COST_TIME_PRINT
gettimeofday
(
&
end
,
NULL
);
dif_sec
=
end
.
tv_sec
-
start
.
tv_sec
;
dif_usec
=
end
.
tv_usec
-
start
.
tv_usec
;
std
::
cout
<<
"deconv scale "
<<
" cost time: "
<<
(
dif_sec
*
1000000
+
dif_usec
)
<<
"us"
<<
std
::
endl
;
#endif
// fpga_flush(args.output.scale_address, 2 * sizeof(float));
#ifdef COST_TIME_PRINT
gettimeofday
(
&
start
,
NULL
);
#endif
deconv_post_process
(
args
);
#ifdef COST_TIME_PRINT
gettimeofday
(
&
end
,
NULL
);
dif_sec
=
end
.
tv_sec
-
start
.
tv_sec
;
dif_usec
=
end
.
tv_usec
-
start
.
tv_usec
;
std
::
cout
<<
"deconv_post_process "
<<
" cost time: "
<<
(
dif_sec
*
1000000
+
dif_usec
)
<<
"us"
<<
std
::
endl
;
#endif
}
return
0
;
...
...
src/fpga/common/fpga_common.cpp
浏览文件 @
a27e0055
...
...
@@ -59,6 +59,9 @@ int close_device() {
void
*
fpga_malloc
(
size_t
size
)
{
static
uint64_t
counter
=
0
;
if
(
size
<=
0
)
{
size
=
1
;
}
#ifdef PADDLE_MOBILE_ZU5
auto
ptr
=
driver
::
fpga_malloc_driver
(
size
);
#else
...
...
src/fpga/common/fpga_common.h
浏览文件 @
a27e0055
...
...
@@ -210,7 +210,7 @@ struct DeconvArgs {
uint32_t
sub_output_width
;
uint32_t
sub_output_height
;
struct
ImageOutputArgs
output
;
struct
ConvArgs
*
conv_args
;
struct
SplitConvArgs
*
split_
conv_args
;
};
// static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
...
...
src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
浏览文件 @
a27e0055
...
...
@@ -54,11 +54,11 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
fpga
::
format_deconv_filter
(
filter
,
max_value
,
param
->
Groups
(),
param
->
Strides
()[
0
]);
//
int element_num_per_div =
// fpga::get_filter_num_per_div(filter, param->Groups()
);
int
element_num_per_div
=
fpga
::
get_deconv_filter_num_per_div
(
filter
,
param
->
Groups
(),
sub_conv_n
);
//
deconv only support group=1 && no spilt
fpga
::
format_bias_scale_array
(
&
bs_ptr
,
channel
*
sub_conv_n
,
//
fpga
::
format_bias_scale_array
(
&
bs_ptr
,
element_num_per_div
,
channel
*
sub_conv_n
);
fpga
::
format_fp16_ofm
(
out
);
...
...
src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
浏览文件 @
a27e0055
...
...
@@ -55,11 +55,10 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
fpga
::
format_deconv_filter
(
filter
,
max_value
,
param
->
Groups
(),
param
->
Strides
()[
0
]);
//
int element_num_per_div =
// fpga::get_filter_num_per_div(filter, param->Groups()
);
int
element_num_per_div
=
fpga
::
get_deconv_filter_num_per_div
(
filter
,
param
->
Groups
(),
sub_conv_n
);
// deconv only support group=1 && no spilt
fpga
::
format_bias_scale_array
(
&
bs_ptr
,
channel
*
sub_conv_n
,
fpga
::
format_bias_scale_array
(
&
bs_ptr
,
element_num_per_div
,
channel
*
sub_conv_n
);
fpga
::
format_fp16_ofm
(
out
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录