Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
8bae119c
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8bae119c
编写于
12月 20, 2018
作者:
H
hjchen2
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'ocr_ctc' of
https://github.com/hjchen2/paddle-mobile
into ocr_ctc
上级
fd91b828
914d2eab
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
259 addition
and
73 deletion
+259
-73
src/fpga/V1/api.cpp
src/fpga/V1/api.cpp
+170
-38
src/fpga/V1/api.h
src/fpga/V1/api.h
+5
-0
src/fpga/V1/pe.cpp
src/fpga/V1/pe.cpp
+73
-26
src/fpga/common/fpga_common.cpp
src/fpga/common/fpga_common.cpp
+3
-0
src/fpga/common/fpga_common.h
src/fpga/common/fpga_common.h
+1
-1
src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
+4
-4
src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
+3
-4
未找到文件。
src/fpga/V1/api.cpp
浏览文件 @
8bae119c
...
@@ -81,6 +81,13 @@ int get_plit_num(framework::Tensor *filter_tensor) {
...
@@ -81,6 +81,13 @@ int get_plit_num(framework::Tensor *filter_tensor) {
int
div_capacity
=
filter
::
calc_division_capacity
(
chw
);
int
div_capacity
=
filter
::
calc_division_capacity
(
chw
);
return
filter
::
calc_split_num
(
num
,
div_capacity
);
return
filter
::
calc_split_num
(
num
,
div_capacity
);
}
}
// Returns the split count for a deconvolution filter.
//
// The per-filter element count and the filter count are derived from the
// tensor dims and `stride`, then handed to filter::calc_division_capacity and
// filter::calc_split_num.
//
// NOTE(review): presumably dims are [num, channel, height, width] and
// `stride` is the number of sub-convolutions -- confirm against callers.
// `stride` is used as a divisor and is not checked here.
int get_deconv_plit_num(framework::Tensor *filter_tensor, int stride) {
  auto shape = filter_tensor->dims();
  // Evaluated strictly left to right with integer division, i.e.
  // (((d1 * d2) / stride) * d3) / stride.
  auto elems_per_filter = shape[1] * shape[2] / stride * shape[3] / stride;
  auto filter_count = shape[0] * stride;
  int capacity = filter::calc_division_capacity(elems_per_filter);
  return filter::calc_split_num(filter_count, capacity);
}
int
get_filter_num_per_div
(
framework
::
Tensor
*
filter_tensor
,
int
group_num
)
{
int
get_filter_num_per_div
(
framework
::
Tensor
*
filter_tensor
,
int
group_num
)
{
auto
dims
=
filter_tensor
->
dims
();
auto
dims
=
filter_tensor
->
dims
();
...
@@ -90,6 +97,15 @@ int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) {
...
@@ -90,6 +97,15 @@ int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) {
return
filter
::
calc_num_per_div
(
num
,
group_num
,
div_capacity
);
return
filter
::
calc_num_per_div
(
num
,
group_num
,
div_capacity
);
}
}
// Returns the number of filters per division for a deconvolution filter.
//
// The per-filter element count and the filter count are derived from the
// tensor dims and `stride`, then handed to filter::calc_division_capacity and
// filter::calc_num_per_div together with `group_num`.
//
// NOTE(review): presumably dims are [num, channel, height, width] and
// `stride` is the number of sub-convolutions -- confirm against callers.
// `stride` is used as a divisor and is not checked here.
int get_deconv_filter_num_per_div(framework::Tensor *filter_tensor,
                                  int group_num, int stride) {
  auto shape = filter_tensor->dims();
  // Evaluated strictly left to right with integer division, i.e.
  // (((d1 * d2) / stride) * d3) / stride.
  auto elems_per_filter = shape[1] * shape[2] / stride * shape[3] / stride;
  auto filter_count = shape[0] * stride;
  int capacity = filter::calc_division_capacity(elems_per_filter);
  return filter::calc_num_per_div(filter_count, group_num, capacity);
}
int
get_aligned_filter_element_num
(
int
chw
)
{
int
get_aligned_filter_element_num
(
int
chw
)
{
return
align_to_x
(
chw
,
FILTER_ELEMENT_ALIGNMENT
);
return
align_to_x
(
chw
,
FILTER_ELEMENT_ALIGNMENT
);
}
}
...
@@ -448,14 +464,20 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
...
@@ -448,14 +464,20 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg
->
sub_output_height
=
(
uint32_t
)
sub_output_height
;
arg
->
sub_output_height
=
(
uint32_t
)
sub_output_height
;
arg
->
omit_size
=
(
uint32_t
)
deconv_filter
::
deconv_get_omit
(
arg
->
omit_size
=
(
uint32_t
)
deconv_filter
::
deconv_get_omit
(
stride_w
,
(
int
)
filter
->
dims
()[
3
],
padding_w
);
stride_w
,
(
int
)
filter
->
dims
()[
3
],
padding_w
);
arg
->
conv_args
=
(
ConvArgs
*
)
fpga_malloc
(
sub_conv_num
*
sizeof
(
ConvArgs
));
auto
sub_channels
=
(
int
)
input
->
dims
()[
1
];
arg
->
output
.
address
=
out_ptr
;
arg
->
output
.
scale_address
=
out
->
scale
;
int
sub_channels
=
(
int
)
input
->
dims
()[
1
];
int
omit_size
=
arg
->
omit_size
;
int
real_out_width
=
sub_output_width
*
sub_conv_num
-
2
*
omit_size
;
int
real_out_height
=
sub_output_height
*
sub_conv_num
-
2
*
omit_size
;
int
sub_filter_num
=
sub_conv_num
*
(
arg
->
filter_num
);
int
sub_filter_num
=
sub_conv_num
*
(
arg
->
filter_num
);
int
conv_output_size
=
int
conv_output_size
=
(
align_to_x
(
sub_output_width
*
sub_filter_num
,
IMAGE_ALIGNMENT
))
*
(
align_to_x
(
sub_output_width
*
sub_filter_num
,
IMAGE_ALIGNMENT
))
*
sub_output_height
;
sub_output_height
;
int
ouput_size
=
conv_output_size
*
sub_conv_num
;
int
align_sub_filter_num
=
align_to_x
(
sub_filter_num
,
FILTER_NUM_ALIGNMENT
);
int
align_sub_filter_num
=
align_to_x
(
sub_filter_num
,
FILTER_NUM_ALIGNMENT
);
int
align_sub_filter_count
=
int
align_sub_filter_count
=
...
@@ -464,50 +486,160 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
...
@@ -464,50 +486,160 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
int
align_conv_sub_filter_count
=
int
align_conv_sub_filter_count
=
align_sub_filter_count
*
align_sub_filter_num
;
align_sub_filter_count
*
align_sub_filter_num
;
int
split_num
=
group_num
==
1
?
(
uint32_t
)
get_deconv_plit_num
(
filter
,
sub_conv_num
)
:
1
;
arg
->
split_conv_args
=
(
SplitConvArgs
*
)
fpga_malloc
(
sub_conv_num
*
sizeof
(
SplitConvArgs
));
for
(
int
i
=
0
;
i
<
sub_conv_num
;
++
i
)
{
for
(
int
i
=
0
;
i
<
sub_conv_num
;
++
i
)
{
arg
->
conv_args
[
i
].
filter_num
=
arg
->
sub_conv_num
*
arg
->
filter_num
;
arg
->
split_conv_args
[
i
].
filter_num
=
arg
->
conv_args
[
i
].
group_num
=
(
uint32_t
)
group_num
;
(
arg
->
sub_conv_num
)
*
(
arg
->
filter_num
);
arg
->
split_conv_args
[
i
].
group_num
=
(
uint32_t
)
group_num
;
arg
->
conv_args
[
i
].
filter_scale_address
=
filter
->
scale
;
arg
->
split_conv_args
[
i
].
split_num
=
split_num
;
arg
->
conv_args
[
i
].
relu_enabled
=
relu_enabled
;
arg
->
split_conv_args
[
i
].
conv_arg
=
(
ConvArgs
*
)
fpga_malloc
(
split_num
*
sizeof
(
ConvArgs
));
arg
->
conv_args
[
i
].
kernel
.
width
=
(
uint32_t
)
sub_filter_width
;
arg
->
conv_args
[
i
].
kernel
.
height
=
(
uint32_t
)
sub_filter_width
;
arg
->
split_conv_args
[
i
].
concat_arg
.
height
=
sub_output_height
;
arg
->
conv_args
[
i
].
kernel
.
stride_w
=
1
;
arg
->
split_conv_args
[
i
].
concat_arg
.
width
=
sub_output_width
;
arg
->
conv_args
[
i
].
kernel
.
stride_h
=
1
;
arg
->
split_conv_args
[
i
].
concat_arg
.
image_num
=
split_num
;
arg
->
split_conv_args
[
i
].
concat_arg
.
images_in
=
arg
->
conv_args
[
i
].
image
.
scale_address
=
input
->
scale
;
(
half
**
)
fpga_malloc
(
split_num
*
sizeof
(
half
*
));
arg
->
conv_args
[
i
].
image
.
channels
=
(
uint32_t
)
sub_channels
;
arg
->
split_conv_args
[
i
].
concat_arg
.
scales_in
=
arg
->
conv_args
[
i
].
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
(
float
**
)
fpga_malloc
(
split_num
*
sizeof
(
float
*
));
arg
->
conv_args
[
i
].
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
arg
->
split_conv_args
[
i
].
concat_arg
.
channel_num
=
arg
->
conv_args
[
i
].
image
.
pad_width
=
(
uint32_t
)
sub_pad
;
(
uint32_t
*
)
fpga_malloc
(
split_num
*
sizeof
(
uint32_t
));
arg
->
conv_args
[
i
].
image
.
pad_height
=
(
uint32_t
)
sub_pad
;
// arg->split_conv_args[i].concat_arg.image_out =
arg
->
conv_args
[
i
].
image
.
address
=
input_ptr
;
// fpga_malloc(conv_output_size * sizeof(half));
arg
->
conv_args
[
i
].
sb_address
=
bs_ptr
;
// arg->split_conv_args[i].concat_arg.scale_out = fpga_malloc(2 *
// sizeof(float));
auto
filter_sub_space
=
}
(
char
*
)
fpga_malloc
(
align_conv_sub_filter_count
*
sizeof
(
char
));
fpga_copy
(
filter_sub_space
,
(
char
*
)
filter_ptr
+
i
*
align_conv_sub_filter_count
,
(
size_t
)
align_conv_sub_filter_count
);
arg
->
conv_args
[
i
].
filter_address
=
filter_sub_space
;
fpga_flush
(
filter_sub_space
,
(
size_t
)
align_conv_sub_filter_count
);
int
filter_num_per_div
=
get_deconv_filter_num_per_div
(
filter
,
group_num
,
stride_w
);
int
element_num
=
get_aligned_filter_element_num
(
(
int
)(
sub_channels
*
sub_filter_width
*
sub_filter_width
));
int
chw
=
sub_channels
*
sub_filter_width
*
sub_filter_width
;
int
division_capacity
=
filter
::
calc_division_capacity
(
chw
);
int
num_per_div_before_alignment
=
filter
::
calc_num_per_div
(
sub_filter_num
,
group_num
,
division_capacity
);
int
num_per_div_after_alignment
=
align_to_x
(
num_per_div_before_alignment
,
FILTER_NUM_ALIGNMENT
);
int
div_num
=
(
sub_filter_num
+
num_per_div_before_alignment
-
1
)
/
num_per_div_before_alignment
;
int
residual
=
sub_filter_num
%
num_per_div_before_alignment
;
int
num_after_alignment
=
num_per_div_after_alignment
*
((
residual
==
0
)
?
div_num
:
(
div_num
-
1
))
+
align_to_x
(
residual
,
FILTER_NUM_ALIGNMENT
);
int
filter_sub_conv_offset
=
element_num
*
num_after_alignment
;
for
(
int
i
=
0
;
i
<
sub_conv_num
;
++
i
)
{
if
(
sub_conv_num
==
1
)
{
if
(
sub_conv_num
==
1
)
{
arg
->
conv_args
[
i
].
output
.
address
=
out_ptr
;
arg
->
split_conv_args
[
i
].
output
.
address
=
arg
->
output
.
address
;
arg
->
conv_args
[
i
].
output
.
scale_address
=
out
->
scale
;
arg
->
split_conv_args
[
i
].
output
.
scale_address
=
arg
->
output
.
scale_address
;
}
else
{
}
else
{
auto
ptr_output
=
fpga_malloc
(
conv_output_size
*
sizeof
(
half
));
auto
ptr_output
=
(
half
*
)
fpga_malloc
(
conv_output_size
*
sizeof
(
half
));
arg
->
conv_args
[
i
].
output
.
address
=
ptr_output
;
arg
->
split_conv_args
[
i
].
output
.
address
=
(
void
*
)((
half
*
)
ptr_output
)
;
auto
ptr_output_scale
=
(
float
*
)
fpga_malloc
(
2
*
sizeof
(
float
));
auto
ptr_output_scale
=
(
float
*
)
fpga_malloc
(
2
*
sizeof
(
float
));
arg
->
conv_args
[
i
].
output
.
scale_address
=
ptr_output_scale
;
arg
->
split_
conv_args
[
i
].
output
.
scale_address
=
ptr_output_scale
;
}
}
expand_conv_arg
(
&
arg
->
conv_args
[
i
]);
}
arg
->
output
.
address
=
out_ptr
;
for
(
int
j
=
0
;
j
<
split_num
;
++
j
)
{
arg
->
output
.
scale_address
=
out
->
scale
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
relu_enabled
=
relu_enabled
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
group_num
=
(
uint32_t
)
group_num
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
kernel
.
width
=
(
uint32_t
)
sub_filter_width
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
kernel
.
height
=
(
uint32_t
)
sub_filter_width
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
kernel
.
stride_w
=
1
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
kernel
.
stride_h
=
1
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
image
.
scale_address
=
input
->
scale
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
image
.
channels
=
(
uint32_t
)
sub_channels
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
image
.
pad_width
=
(
uint32_t
)
sub_pad
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
image
.
pad_height
=
(
uint32_t
)
sub_pad
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
image
.
address
=
input_ptr
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_scale_address
=
filter
->
scale
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_num
=
(
uint32_t
)(
j
==
split_num
-
1
?
sub_filter_num
-
(
split_num
-
1
)
*
filter_num_per_div
// NOLINT
:
filter_num_per_div
);
size_t
filter_size
=
element_num
*
align_to_x
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_num
,
FILTER_NUM_ALIGNMENT
)
*
sizeof
(
int8_t
);
auto
filter_head
=
&
((
int8_t
*
)
filter_ptr
)[
j
*
element_num
*
filter_num_per_div
+
i
*
filter_sub_conv_offset
];
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_address
=
fpga_malloc
(
filter_size
);
memcpy
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_address
,
filter_head
,
filter_size
);
fpga_flush
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_address
,
filter_size
);
{
static
int
test_cnt
=
0
;
signed
char
result
=
0
;
if
(
test_cnt
<=
1
)
{
std
::
string
filename
=
"deconv_split_flt"
+
std
::
to_string
(
test_cnt
);
fpga
::
savefile
<
signed
char
>
(
filename
,
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_address
,
filter_size
,
result
);
test_cnt
++
;
}
}
size_t
bs_align_num
=
align_to_x
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_num
,
BS_NUM_ALIGNMENT
);
size_t
bs_size
=
2
*
bs_align_num
*
sizeof
(
float
);
auto
bs_head
=
&
bs_ptr
[
j
*
filter_num_per_div
*
2
];
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
sb_address
=
fpga_malloc
(
bs_size
);
memcpy
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
sb_address
,
bs_head
,
bs_size
);
fpga_flush
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
sb_address
,
bs_size
);
if
(
split_num
==
1
)
{
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
output
.
address
=
arg
->
split_conv_args
[
i
].
output
.
address
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
output
.
scale_address
=
arg
->
split_conv_args
[
i
].
output
.
scale_address
;
}
else
{
auto
ptr_output
=
(
half
*
)
fpga_malloc
(
conv_output_size
*
sizeof
(
half
));
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
output
.
address
=
(
void
*
)((
half
*
)
ptr_output
);
auto
ptr_output_scale
=
(
float
*
)
fpga_malloc
(
2
*
sizeof
(
float
));
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
output
.
scale_address
=
ptr_output_scale
;
}
arg
->
split_conv_args
[
i
].
concat_arg
.
images_in
[
j
]
=
(
half
*
)
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
output
.
address
;
// NOLINT
arg
->
split_conv_args
[
i
].
concat_arg
.
scales_in
[
j
]
=
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
output
.
scale_address
;
arg
->
split_conv_args
[
i
].
concat_arg
.
channel_num
[
j
]
=
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_num
;
expand_conv_arg
(
&
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
]));
}
arg
->
split_conv_args
[
i
].
concat_arg
.
image_out
=
arg
->
split_conv_args
[
i
].
output
.
address
;
arg
->
split_conv_args
[
i
].
concat_arg
.
scale_out
=
arg
->
split_conv_args
[
i
].
output
.
scale_address
;
}
filter
->
reset_data_ptr
(
nullptr
);
filter
->
reset_data_ptr
(
nullptr
);
fpga_free
(
bs_ptr
);
}
// fill_deconv_arg
}
// fill_deconv_arg
}
// namespace fpga
}
// namespace fpga
...
...
src/fpga/V1/api.h
浏览文件 @
8bae119c
...
@@ -27,7 +27,12 @@ void format_fp32_ofm(framework::Tensor* ofm_tensor);
...
@@ -27,7 +27,12 @@ void format_fp32_ofm(framework::Tensor* ofm_tensor);
float
filter_find_max
(
framework
::
Tensor
*
filter_tensor
);
float
filter_find_max
(
framework
::
Tensor
*
filter_tensor
);
int
get_filter_num_per_div
(
framework
::
Tensor
*
filter_tensor
,
int
group_num
);
int
get_filter_num_per_div
(
framework
::
Tensor
*
filter_tensor
,
int
group_num
);
// Deconvolution counterpart of get_filter_num_per_div: takes an extra
// `stride` argument that the implementation uses to scale the filter dims
// before computing the number of filters per division.
int
get_deconv_filter_num_per_div
(
framework
::
Tensor
*
filter_tensor
,
int
group_num
,
int
stride
);
int
get_plit_num
(
framework
::
Tensor
*
filter_tensor
);
int
get_plit_num
(
framework
::
Tensor
*
filter_tensor
);
// Deconvolution counterpart of get_plit_num: takes an extra `stride`
// argument that the implementation uses to scale the filter dims before
// computing the split count.
int
get_deconv_plit_num
(
framework
::
Tensor
*
filter_tensor
,
int
stride
);
int
get_aligned_filter_element_num
(
int
chw
);
int
get_aligned_filter_element_num
(
int
chw
);
void
format_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
,
void
format_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
,
int
group_num
);
int
group_num
);
...
...
src/fpga/V1/pe.cpp
浏览文件 @
8bae119c
...
@@ -13,15 +13,25 @@ See the License for the specific language governing permissions and
...
@@ -13,15 +13,25 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "fpga/common/pe.h"
#include "fpga/common/pe.h"
#include "common/types.h"
#include "fpga/V1/filter.h"
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"
#include "fpga/V1/image.h"
#include "fpga/common/config.h"
#include "fpga/common/config.h"
#include "fpga/common/driver.h"
#include "fpga/common/driver.h"
#ifdef COST_TIME_PRINT
#include <sys/time.h>
#include <time.h>
#include <iomanip>
#include <iostream>
//#include <iostream>
#endif
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
fpga
{
namespace
fpga
{
using
namespace
driver
;
// NOLINT
using
namespace
driver
;
// NOLINT
using
namespace
std
;
#define USE_RELU 1
#define USE_RELU 1
#define USE_BIAS 2
#define USE_BIAS 2
...
@@ -162,15 +172,17 @@ int ComputeFpgaConv(const struct SplitConvArgs &args) {
...
@@ -162,15 +172,17 @@ int ComputeFpgaConv(const struct SplitConvArgs &args) {
<<
" group_num:"
<<
args
.
group_num
<<
" group_num:"
<<
args
.
group_num
<<
" split_num:"
<<
args
.
split_num
;
<<
" split_num:"
<<
args
.
split_num
;
#endif
#endif
int
ret
=
0
;
int
split_num
=
args
.
split_num
;
int
split_num
=
args
.
split_num
;
for
(
int
i
=
0
;
i
<
split_num
;
i
++
)
{
for
(
int
i
=
0
;
i
<
split_num
;
i
++
)
{
ComputeBasicConv
(
args
.
conv_arg
[
i
]);
ret
|=
ComputeBasicConv
(
args
.
conv_arg
[
i
]);
}
}
if
(
split_num
>
1
)
{
if
(
split_num
>
1
)
{
ComputeFPGAConcat
(
args
.
concat_arg
);
ComputeFPGAConcat
(
args
.
concat_arg
);
}
}
return
ret
;
}
}
int
ComputeBasicConv
(
const
struct
ConvArgs
&
args
)
{
int
ComputeBasicConv
(
const
struct
ConvArgs
&
args
)
{
...
@@ -250,12 +262,13 @@ int ComputeBasicConv(const struct ConvArgs &args) {
...
@@ -250,12 +262,13 @@ int ComputeBasicConv(const struct ConvArgs &args) {
reg_writeq
(
args
.
driver
.
post_prog_full_cnt
,
0xd10
);
reg_writeq
(
args
.
driver
.
post_prog_full_cnt
,
0xd10
);
reg_writeq
(
args
.
driver
.
fpga_bias_scale_len
/
4
,
0xd20
);
reg_writeq
(
args
.
driver
.
fpga_bias_scale_len
/
4
,
0xd20
);
reg_writeq
(
args
.
driver
.
cmd
,
REG_CONV_CMD
);
reg_writeq
(
args
.
driver
.
cmd
,
REG_CONV_CMD
);
DLOG
<<
"before reg poll"
;
if
(
0
!=
fpga_regpoll
(
REG_INTERRUPT
,
INTERRUPT_CONV
,
PE_IRQ_TIMEOUT
))
{
if
(
0
!=
fpga_regpoll
(
REG_INTERRUPT
,
INTERRUPT_CONV
,
PE_IRQ_TIMEOUT
))
{
g_fpgainfo
.
pe_data
->
pes
[
PE_IDX_CONV
]
->
status
=
ERROR
;
g_fpgainfo
.
pe_data
->
pes
[
PE_IDX_CONV
]
->
status
=
ERROR
;
ret
=
-
EIO
;
ret
=
-
EIO
;
DLOG
<<
"Conv Wait Irq Timeout!"
;
DLOG
<<
"Conv Wait Irq Timeout!"
;
}
}
DLOG
<<
"after reg poll"
;
output_scale
=
reg_readq
(
REG_SCALE_PARAMETER
);
output_scale
=
reg_readq
(
REG_SCALE_PARAMETER
);
output_scale
=
(
output_scale
<<
32
)
|
(
output_scale
>>
32
);
output_scale
=
(
output_scale
<<
32
)
|
(
output_scale
>>
32
);
...
@@ -289,6 +302,8 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
...
@@ -289,6 +302,8 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
<<
" out_scale_address:"
<<
args
.
output
.
scale_address
;
<<
" out_scale_address:"
<<
args
.
output
.
scale_address
;
#endif
#endif
#ifdef PADDLE_MOBILE_ZU5
#ifdef PADDLE_MOBILE_ZU5
DLOG
<<
"Polling"
;
// return 0;
uint64_t
output_scale
=
0
;
uint64_t
output_scale
=
0
;
uint64_t
timer_cnt
=
0
;
uint64_t
timer_cnt
=
0
;
int
ret
=
0
;
int
ret
=
0
;
...
@@ -561,11 +576,13 @@ int PerformBypass(const struct BypassArgs &args) {
...
@@ -561,11 +576,13 @@ int PerformBypass(const struct BypassArgs &args) {
reg_writeq
(
datalen
,
REG_CONVERT_LENGTH
);
reg_writeq
(
datalen
,
REG_CONVERT_LENGTH
);
reg_writeq
(
cmd
,
REG_CONVERT_CMD
);
reg_writeq
(
cmd
,
REG_CONVERT_CMD
);
DLOG
<<
"before reg poll"
;
if
(
0
!=
fpga_regpoll
(
REG_INTERRUPT
,
INTERRUPT_BYPASS
,
PE_IRQ_TIMEOUT
))
{
if
(
0
!=
fpga_regpoll
(
REG_INTERRUPT
,
INTERRUPT_BYPASS
,
PE_IRQ_TIMEOUT
))
{
g_fpgainfo
.
pe_data
->
pes
[
PE_IDX_BYPASS
]
->
status
=
ERROR
;
g_fpgainfo
.
pe_data
->
pes
[
PE_IDX_BYPASS
]
->
status
=
ERROR
;
ret
=
-
EIO
;
ret
=
-
EIO
;
DLOG
<<
"BYPASS Wait Irq Timeout!"
;
DLOG
<<
"BYPASS Wait Irq Timeout!"
;
}
}
DLOG
<<
"after reg poll"
;
output_scale
=
reg_readq
(
REG_SCALE_PARAMETER
);
output_scale
=
reg_readq
(
REG_SCALE_PARAMETER
);
output_scale
=
(
output_scale
<<
32
)
|
(
output_scale
>>
32
);
output_scale
=
(
output_scale
<<
32
)
|
(
output_scale
>>
32
);
...
@@ -619,37 +636,29 @@ void deconv_post_process(const struct DeconvArgs &args) {
...
@@ -619,37 +636,29 @@ void deconv_post_process(const struct DeconvArgs &args) {
int
align_deconv_row_len
=
align_to_x
(
deconv_row_len
,
16
);
int
align_deconv_row_len
=
align_to_x
(
deconv_row_len
,
16
);
for
(
int
idx
=
0
;
idx
<
sub_conv_n
;
++
idx
)
{
for
(
int
idx
=
0
;
idx
<
sub_conv_n
;
++
idx
)
{
fpga_invalidate
(
args
.
conv_args
[
idx
].
output
.
address
,
paddle_mobile
::
fpga
::
fpga_invalidate
(
align_origin_w
*
origin_h
*
sizeof
(
int16_t
));
args
.
split_conv_args
[
idx
].
output
.
address
,
align_origin_w
*
origin_h
*
sizeof
(
int16_t
));
}
}
auto
ptr_deconv
=
(
int16_t
*
)
fpga_malloc
(
num
*
align_deconv_row_len
*
deconv_h
*
sizeof
(
int16_t
));
memset
(
ptr_deconv
,
0
,
num
*
align_deconv_row_len
*
deconv_h
*
sizeof
(
int16_t
));
int
deconv_idx
=
0
;
int
deconv_idx
=
0
;
for
(
int
nn
=
0
;
nn
<
num
;
++
nn
)
{
for
(
int
nn
=
0
;
nn
<
num
;
++
nn
)
{
for
(
int
hh
=
0
;
hh
<
origin_h
;
++
hh
)
{
for
(
int
hh
=
0
;
hh
<
origin_h
;
++
hh
)
{
int
hx
=
(
hh
%
sub_conv_n
);
int
hx
=
(
hh
%
sub_conv_n
);
auto
sub_t
=
auto
sub_t
=
(
int16_t
*
)(
args
.
conv_args
[
sub_conv_n
-
hx
-
1
].
output
.
address
);
(
int16_t
*
)(
args
.
split_
conv_args
[
sub_conv_n
-
hx
-
1
].
output
.
address
);
int
hi
=
(
hh
/
sub_conv_n
);
int
hi
=
(
hh
/
sub_conv_n
);
if
((
hh
<
omit_size
)
||
(
hh
>=
(
origin_h
-
omit_size
)))
continue
;
if
((
hh
<
omit_size
)
||
(
hh
>=
(
origin_h
-
omit_size
)))
continue
;
int
sidx
=
(
nn
*
origin_h
*
align_origin_w
+
hi
*
align_origin_w
+
int
sidx
=
(
nn
*
origin_h
*
align_origin_w
+
hi
*
align_origin_w
+
omit_size
*
channel
);
omit_size
*
channel
);
fpga_copy
((
int16_t
*
)(
args
.
output
.
address
)
+
deconv_idx
,
sub_t
+
sidx
,
fpga_copy
(
ptr_deconv
+
deconv_idx
,
sub_t
+
sidx
,
sizeof
(
int16_t
)
*
deconv_row_len
);
sizeof
(
int16_t
)
*
deconv_row_len
);
deconv_idx
+=
align_deconv_row_len
;
deconv_idx
+=
align_deconv_row_len
;
}
}
}
}
fpga_copy
(
args
.
output
.
address
,
ptr_deconv
,
num
*
align_deconv_row_len
*
deconv_h
*
sizeof
(
int16_t
));
fpga_flush
(
args
.
output
.
address
,
fpga_flush
(
args
.
output
.
address
,
num
*
align_deconv_row_len
*
deconv_h
*
sizeof
(
int16_t
));
num
*
align_deconv_row_len
*
deconv_h
*
sizeof
(
int16_t
));
fpga_free
(
ptr_deconv
);
}
}
// deconv_post_process
int
ComputeFpgaDeconv
(
const
struct
DeconvArgs
&
args
)
{
int
ComputeFpgaDeconv
(
const
struct
DeconvArgs
&
args
)
{
#ifdef FPGA_PRINT_MODE
#ifdef FPGA_PRINT_MODE
...
@@ -661,32 +670,70 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
...
@@ -661,32 +670,70 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
<<
" sub_conv_num:"
<<
args
.
sub_conv_num
;
<<
" sub_conv_num:"
<<
args
.
sub_conv_num
;
DLOG
<<
"args.output.address: "
<<
args
.
output
.
address
DLOG
<<
"args.output.address: "
<<
args
.
output
.
address
<<
"args.output.scale_address: "
<<
args
.
output
.
scale_address
;
<<
"args.output.scale_address: "
<<
args
.
output
.
scale_address
;
DLOG
<<
"args.conv_args.sb_address: "
<<
(
args
.
conv_args
)
->
sb_address
<<
"args.conv_args.filter_address: "
<<
(
args
.
conv_args
)
->
filter_address
;
#endif
#ifndef PADDLE_MOBILE_ZU5
return
0
;
#endif
#endif
int
sub_conv_num
=
args
.
sub_conv_num
;
int
sub_conv_num
=
args
.
sub_conv_num
;
#ifdef COST_TIME_PRINT
timeval
start
,
end
;
long
dif_sec
,
dif_usec
;
#endif
for
(
int
i
=
0
;
i
<
sub_conv_num
;
i
++
)
{
for
(
int
i
=
0
;
i
<
sub_conv_num
;
i
++
)
{
ComputeBasicConv
(
args
.
conv_args
[
i
]);
#ifdef COST_TIME_PRINT
gettimeofday
(
&
start
,
NULL
);
#endif
ComputeFpgaConv
(
args
.
split_conv_args
[
i
]);
#ifdef COST_TIME_PRINT
gettimeofday
(
&
end
,
NULL
);
dif_sec
=
end
.
tv_sec
-
start
.
tv_sec
;
dif_usec
=
end
.
tv_usec
-
start
.
tv_usec
;
std
::
cout
<<
"deconv basic_conv: "
<<
i
<<
" times: "
<<
" cost time: "
<<
(
dif_sec
*
1000000
+
dif_usec
)
<<
"us"
<<
std
::
endl
;
#endif
}
}
if
(
sub_conv_num
>
1
)
{
if
(
sub_conv_num
>
1
)
{
float
max_scale
=
-
1.0
f
;
float
max_scale
=
-
1.0
f
;
#ifdef COST_TIME_PRINT
gettimeofday
(
&
start
,
NULL
);
#endif
for
(
int
i
=
0
;
i
<
sub_conv_num
;
i
++
)
{
for
(
int
i
=
0
;
i
<
sub_conv_num
;
i
++
)
{
paddle_mobile
::
fpga
::
fpga_invalidate
(
paddle_mobile
::
fpga
::
fpga_invalidate
(
args
.
conv_args
[
i
].
output
.
scale_address
,
2
*
sizeof
(
float
));
args
.
split_
conv_args
[
i
].
output
.
scale_address
,
2
*
sizeof
(
float
));
float
ptr_scale
=
(
args
.
conv_args
[
i
].
output
.
scale_address
)[
0
];
float
ptr_scale
=
(
args
.
split_
conv_args
[
i
].
output
.
scale_address
)[
0
];
if
(
ptr_scale
>
max_scale
)
{
if
(
ptr_scale
>
max_scale
)
{
args
.
output
.
scale_address
[
0
]
=
ptr_scale
;
args
.
output
.
scale_address
[
0
]
=
ptr_scale
;
args
.
output
.
scale_address
[
1
]
=
args
.
output
.
scale_address
[
1
]
=
(
args
.
conv_args
[
i
].
output
.
scale_address
)[
1
];
(
args
.
split_
conv_args
[
i
].
output
.
scale_address
)[
1
];
}
}
}
}
#ifdef COST_TIME_PRINT
gettimeofday
(
&
end
,
NULL
);
dif_sec
=
end
.
tv_sec
-
start
.
tv_sec
;
dif_usec
=
end
.
tv_usec
-
start
.
tv_usec
;
std
::
cout
<<
"deconv scale "
<<
" cost time: "
<<
(
dif_sec
*
1000000
+
dif_usec
)
<<
"us"
<<
std
::
endl
;
#endif
// fpga_flush(args.output.scale_address, 2 * sizeof(float));
#ifdef COST_TIME_PRINT
gettimeofday
(
&
start
,
NULL
);
#endif
deconv_post_process
(
args
);
deconv_post_process
(
args
);
#ifdef COST_TIME_PRINT
gettimeofday
(
&
end
,
NULL
);
dif_sec
=
end
.
tv_sec
-
start
.
tv_sec
;
dif_usec
=
end
.
tv_usec
-
start
.
tv_usec
;
std
::
cout
<<
"deconv_post_process "
<<
" cost time: "
<<
(
dif_sec
*
1000000
+
dif_usec
)
<<
"us"
<<
std
::
endl
;
#endif
}
}
return
0
;
return
0
;
...
...
src/fpga/common/fpga_common.cpp
浏览文件 @
8bae119c
...
@@ -59,6 +59,9 @@ int close_device() {
...
@@ -59,6 +59,9 @@ int close_device() {
void
*
fpga_malloc
(
size_t
size
)
{
void
*
fpga_malloc
(
size_t
size
)
{
static
uint64_t
counter
=
0
;
static
uint64_t
counter
=
0
;
if
(
size
<=
0
)
{
size
=
1
;
}
#ifdef PADDLE_MOBILE_ZU5
#ifdef PADDLE_MOBILE_ZU5
auto
ptr
=
driver
::
fpga_malloc_driver
(
size
);
auto
ptr
=
driver
::
fpga_malloc_driver
(
size
);
#else
#else
...
...
src/fpga/common/fpga_common.h
浏览文件 @
8bae119c
...
@@ -210,7 +210,7 @@ struct DeconvArgs {
...
@@ -210,7 +210,7 @@ struct DeconvArgs {
uint32_t
sub_output_width
;
uint32_t
sub_output_width
;
uint32_t
sub_output_height
;
uint32_t
sub_output_height
;
struct
ImageOutputArgs
output
;
struct
ImageOutputArgs
output
;
struct
ConvArgs
*
conv_args
;
struct
SplitConvArgs
*
split_
conv_args
;
};
};
// static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
// static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
...
...
src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
浏览文件 @
8bae119c
...
@@ -54,11 +54,11 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
...
@@ -54,11 +54,11 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
fpga
::
format_deconv_filter
(
filter
,
max_value
,
param
->
Groups
(),
fpga
::
format_deconv_filter
(
filter
,
max_value
,
param
->
Groups
(),
param
->
Strides
()[
0
]);
param
->
Strides
()[
0
]);
//
int element_num_per_div =
int
element_num_per_div
=
// fpga::get_filter_num_per_div(filter, param->Groups()
);
fpga
::
get_deconv_filter_num_per_div
(
filter
,
param
->
Groups
(),
sub_conv_n
);
//
deconv only support group=1 && no spilt
//
fpga
::
format_bias_scale_array
(
&
bs_ptr
,
channel
*
sub_conv_n
,
fpga
::
format_bias_scale_array
(
&
bs_ptr
,
element_num_per_div
,
channel
*
sub_conv_n
);
channel
*
sub_conv_n
);
fpga
::
format_fp16_ofm
(
out
);
fpga
::
format_fp16_ofm
(
out
);
...
...
src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
浏览文件 @
8bae119c
...
@@ -55,11 +55,10 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
...
@@ -55,11 +55,10 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
fpga
::
format_deconv_filter
(
filter
,
max_value
,
param
->
Groups
(),
fpga
::
format_deconv_filter
(
filter
,
max_value
,
param
->
Groups
(),
param
->
Strides
()[
0
]);
param
->
Strides
()[
0
]);
//
int element_num_per_div =
int
element_num_per_div
=
// fpga::get_filter_num_per_div(filter, param->Groups()
);
fpga
::
get_deconv_filter_num_per_div
(
filter
,
param
->
Groups
(),
sub_conv_n
);
// deconv only support group=1 && no spilt
fpga
::
format_bias_scale_array
(
&
bs_ptr
,
element_num_per_div
,
fpga
::
format_bias_scale_array
(
&
bs_ptr
,
channel
*
sub_conv_n
,
channel
*
sub_conv_n
);
channel
*
sub_conv_n
);
fpga
::
format_fp16_ofm
(
out
);
fpga
::
format_fp16_ofm
(
out
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录