Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
7a8b998f
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
7a8b998f
编写于
12月 12, 2018
作者:
qnqinan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix some bugs in fpga V2 track and update fpga V2 pe code
上级
344c1df7
变更
6
显示空白变更内容
内联
并排
Showing
6 changed file
with
1552 addition
and
95 deletion
+1552
-95
src/fpga/V2/api.cpp
src/fpga/V2/api.cpp
+3
-2
src/fpga/V2/filter.cpp
src/fpga/V2/filter.cpp
+15
-2
src/fpga/V2/pe.cpp
src/fpga/V2/pe.cpp
+1521
-86
src/fpga/common/fpga_common.cpp
src/fpga/common/fpga_common.cpp
+7
-1
src/operators/kernel/fpga/V2/feed_kernel.cpp
src/operators/kernel/fpga/V2/feed_kernel.cpp
+5
-3
src/operators/kernel/fpga/V2/softmax_kernel.cpp
src/operators/kernel/fpga/V2/softmax_kernel.cpp
+1
-1
未找到文件。
src/fpga/V2/api.cpp
浏览文件 @
7a8b998f
...
...
@@ -204,7 +204,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg
->
conv_arg
[
i
].
image
.
address
=
input_ptr
;
arg
->
conv_arg
[
i
].
image
.
scale_address
=
input
->
scale
;
arg
->
conv_arg
[
i
].
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
arg
->
conv_arg
[
i
].
image
.
channels
=
(
uint32_t
)
get_aligned_channel_num
((
int
)(
input
->
dims
()[
1
]));
// NOLINT
arg
->
conv_arg
[
i
].
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
arg
->
conv_arg
[
i
].
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
arg
->
conv_arg
[
i
].
image
.
pad_height
=
(
uint32_t
)
padding_h
;
...
...
@@ -216,7 +217,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
int
num_after_alignment
=
filter
::
calc_aligned_num
(
arg
->
filter_num
,
(
int
)
input
->
dims
()[
1
]);
// NOLINT
arg
->
conv_arg
[
i
].
free_space
=
fpga_malloc
(
num_after_alignment
*
2
*
sizeof
(
half
));
fpga_malloc
(
num_after_alignment
*
2
*
sizeof
(
float
));
// half
}
}
...
...
src/fpga/V2/filter.cpp
浏览文件 @
7a8b998f
...
...
@@ -16,7 +16,6 @@ limitations under the License. */
#include <memory.h>
#include <algorithm>
#include "fpga/common/fpga_common.h"
namespace
paddle_mobile
{
namespace
fpga
{
namespace
filter
{
...
...
@@ -88,12 +87,25 @@ void align_filter(float **data_in, int num, int channel, int height,
*
data_in
=
new_data
;
fpga_free
(
temp
);
}
/**
 * Replace a float buffer with a freshly allocated 16-bit buffer holding the
 * fp16 encoding of every element.
 *
 * @param data_in    In/out: on entry points at `data_size` floats; on return
 *                   points at the new buffer of `data_size` 16-bit values
 *                   (still typed as float* to keep the caller's signature).
 * @param data_size  Number of elements to convert.
 *
 * The original float buffer is released with fpga_free() once the conversion
 * is complete.
 */
void convert_to_fp16(float **data_in, int data_size) {
  float *fp32_src = *data_in;

  // One 16-bit slot per source element.
  int16_t *fp16_dst =
      (int16_t *)fpga_malloc(data_size * sizeof(int16_t));  // NOLINT

  for (int idx = 0; idx < data_size; ++idx) {
    fp16_dst[idx] = fp32_2_fp16(fp32_src[idx]);
  }

  // Hand the converted buffer back through the same pointer slot, then drop
  // the float storage that is no longer referenced.
  *data_in = reinterpret_cast<float *>(fp16_dst);
  fpga_free(fp32_src);
}
/**
 * Prepare a convolution filter for the FPGA: reorder it with
 * convert_to_hwc(), pad it with align_filter(), convert it to fp16 and flush
 * the result.
 *
 * @param data_in    In/out pointer to the filter data; each stage may replace
 *                   the underlying buffer.
 * @param num        Number of filters.
 * @param channel    Channels per filter.
 * @param height     Kernel height.
 * @param width      Kernel width.
 * @param group_num  Unused here; kept for interface compatibility.
 * @param max        Unused here; kept for interface compatibility.
 */
void format_filter(float **data_in, int num, int channel, int height,
                   int width, int group_num, float max) {
  convert_to_hwc(data_in, num, channel, height, width);
  align_filter(data_in, num, channel, height, width);

  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
  convert_to_fp16(data_in, pixel_num);

  // convert_to_fp16() swapped *data_in for a buffer of pixel_num 16-bit
  // elements, so flush exactly that many bytes. Flushing
  // pixel_num * sizeof(float) would cover twice the allocated region.
  fpga_flush(*data_in, pixel_num * sizeof(int16_t));
}
...
...
@@ -115,6 +127,7 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
convert_fc_filter
(
data_in
,
num
,
chw
);
align_filter
(
data_in
,
num
,
channel
,
height
,
width
);
int
pixel_num
=
calc_aligned_total_pixel_num
(
num
,
channel
,
height
,
width
);
convert_to_fp16
(
data_in
,
pixel_num
);
fpga_flush
(
*
data_in
,
pixel_num
*
sizeof
(
float
));
}
...
...
src/fpga/V2/pe.cpp
浏览文件 @
7a8b998f
...
...
@@ -13,40 +13,53 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/common/pe.h"
#include "fpga/V2/api.h"
#include "fpga/V2/filter.h"
#include "fpga/V2/image.h"
#include "fpga/common/config.h"
#include "fpga/common/driver.h"
using
namespace
std
;
// NOLINT
using
namespace
paddle_mobile
::
fpga
::
driver
;
// NOLINT
namespace
paddle_mobile
{
namespace
fpga
{
#define MUL8(x) ((x)*8)
#define BYPASS_DONE 1
#define MUL8(x) (x * 8)
#define BYPASS_DONE 2
#define CONV_DONE 1
/**
 * Output length along one axis for a strided window operation.
 *
 * @param src_len      Input length along the axis.
 * @param pad          Padding applied to each side of the axis.
 * @param kernel_len   Window length along the axis.
 * @param kernel_step  Stride along the axis.
 * @return (src_len + 2 * pad - kernel_len) / kernel_step + 1, or 0 when the
 *         stride is 0 (avoids a division by zero).
 */
static inline int get_image_out_axis(int src_len, int pad, int kernel_len,
                                     int kernel_step) {
  if (kernel_step != 0) {
    const int padded_span = src_len + 2 * pad - kernel_len;
    return padded_span / kernel_step + 1;
  }
  return 0;
}
float
Findfp16Max
()
{
uint16_t
abs_vals
[
16
];
uint64_t
max_fp16
;
max_fp16
=
driver
::
reg_readq
(
MUL8
(
49
));
abs_vals
[
0
]
=
(
uint16_t
)(
0x0000007f
&
(
max_fp16
));
// NOLINT
abs_vals
[
1
]
=
(
uint16_t
)(
0x0000007f
&
(
max_fp16
>>
16
));
// NOLINT
abs_vals
[
2
]
=
(
uint16_t
)(
0x0000007f
&
(
max_fp16
>>
32
));
// NOLINT
abs_vals
[
3
]
=
(
uint16_t
)(
0x0000007f
&
(
max_fp16
>>
48
));
// NOLINT
max_fp16
=
driver
::
reg_readq
(
MUL8
(
50
));
abs_vals
[
4
]
=
(
uint16_t
)(
0x0000007f
&
(
max_fp16
));
// NOLINT
abs_vals
[
5
]
=
(
uint16_t
)(
0x0000007f
&
(
max_fp16
>>
16
));
// NOLINT
abs_vals
[
6
]
=
(
uint16_t
)(
0x0000007f
&
(
max_fp16
>>
32
));
// NOLINT
abs_vals
[
7
]
=
(
uint16_t
)(
0x0000007f
&
(
max_fp16
>>
48
));
// NOLINT
max_fp16
=
driver
::
reg_readq
(
MUL8
(
51
));
abs_vals
[
8
]
=
(
uint16_t
)(
0x0000007f
&
(
max_fp16
));
// NOLINT
abs_vals
[
9
]
=
(
uint16_t
)(
0x0000007f
&
(
max_fp16
>>
16
));
// NOLINT
abs_vals
[
10
]
=
(
uint16_t
)(
0x0000007f
&
(
max_fp16
>>
32
));
// NOLINT
abs_vals
[
11
]
=
(
uint16_t
)(
0x0000007f
&
(
max_fp16
>>
48
));
// NOLINT
max_fp16
=
driver
::
reg_readq
(
MUL8
(
52
));
abs_vals
[
12
]
=
(
uint16_t
)(
0x0000007f
&
(
max_fp16
));
abs_vals
[
13
]
=
(
uint16_t
)(
0x0000007f
&
(
max_fp16
>>
16
));
// NOLINT
abs_vals
[
14
]
=
(
uint16_t
)(
0x0000007f
&
(
max_fp16
>>
32
));
// NOLINT
abs_vals
[
15
]
=
(
uint16_t
)(
0x0000007f
&
(
max_fp16
>>
48
));
// NOLINT
max_fp16
=
reg_readq
(
MUL8
(
49
));
abs_vals
[
0
]
=
(
uint16_t
)(
0x0000007f
ff
&
(
max_fp16
));
// NOLINT
abs_vals
[
1
]
=
(
uint16_t
)(
0x0000007f
ff
&
(
max_fp16
>>
16
));
// NOLINT
abs_vals
[
2
]
=
(
uint16_t
)(
0x0000007f
ff
&
(
max_fp16
>>
32
));
// NOLINT
abs_vals
[
3
]
=
(
uint16_t
)(
0x0000007f
ff
&
(
max_fp16
>>
48
));
// NOLINT
max_fp16
=
reg_readq
(
MUL8
(
50
));
abs_vals
[
4
]
=
(
uint16_t
)(
0x0000007f
ff
&
(
max_fp16
));
// NOLINT
abs_vals
[
5
]
=
(
uint16_t
)(
0x0000007f
ff
&
(
max_fp16
>>
16
));
// NOLINT
abs_vals
[
6
]
=
(
uint16_t
)(
0x0000007f
ff
&
(
max_fp16
>>
32
));
// NOLINT
abs_vals
[
7
]
=
(
uint16_t
)(
0x0000007f
ff
&
(
max_fp16
>>
48
));
// NOLINT
max_fp16
=
reg_readq
(
MUL8
(
51
));
abs_vals
[
8
]
=
(
uint16_t
)(
0x0000007f
ff
&
(
max_fp16
));
// NOLINT
abs_vals
[
9
]
=
(
uint16_t
)(
0x0000007f
ff
&
(
max_fp16
>>
16
));
// NOLINT
abs_vals
[
10
]
=
(
uint16_t
)(
0x0000007f
ff
&
(
max_fp16
>>
32
));
// NOLINT
abs_vals
[
11
]
=
(
uint16_t
)(
0x0000007f
ff
&
(
max_fp16
>>
48
));
// NOLINT
max_fp16
=
reg_readq
(
MUL8
(
52
));
abs_vals
[
12
]
=
(
uint16_t
)(
0x0000007f
ff
&
(
max_fp16
));
abs_vals
[
13
]
=
(
uint16_t
)(
0x0000007f
ff
&
(
max_fp16
>>
16
));
// NOLINT
abs_vals
[
14
]
=
(
uint16_t
)(
0x0000007f
ff
&
(
max_fp16
>>
32
));
// NOLINT
abs_vals
[
15
]
=
(
uint16_t
)(
0x0000007f
ff
&
(
max_fp16
>>
48
));
// NOLINT
uint16_t
tmp
=
0
;
for
(
int
i
=
0
;
i
<
16
;
i
++
)
{
...
...
@@ -54,6 +67,7 @@ float Findfp16Max() {
tmp
=
abs_vals
[
i
];
}
}
DLOG
<<
"max value found: "
<<
fp16_2_fp32
(
tmp
);
return
fp16_2_fp32
(
tmp
)
/
127.0
f
;
}
...
...
@@ -88,7 +102,473 @@ int ComputeBasicConv(const struct ConvArgs &args) {
return
0
;
#endif
return
0
;
uint64_t
ifm_pixel_num
=
((
args
.
image
.
width
)
*
(
args
.
image
.
height
)
*
args
.
image
.
channels
);
uint64_t
ifm_memory_size
=
ifm_pixel_num
*
sizeof
(
short
);
// NOLINT
uint64_t
flt_pixel_num
=
(
args
.
filter_num
*
(
args
.
kernel
.
width
)
*
// NOLINT
(
args
.
kernel
.
height
)
*
args
.
image
.
channels
);
uint64_t
filter_memory_size
=
flt_pixel_num
*
sizeof
(
short
);
// NOLINT
uint64_t
bn_pixel_num
=
(
args
.
filter_num
*
2
);
// NOLINT
uint64_t
bn_memory_size
=
bn_pixel_num
*
sizeof
(
float
);
uint64_t
ofm_width
=
((
args
.
image
.
width
)
+
2
*
args
.
image
.
pad_width
-
args
.
kernel
.
width
)
/
(
args
.
kernel
.
stride_w
)
+
1
;
uint64_t
ofm_height
=
((
args
.
image
.
height
)
+
2
*
(
args
.
image
.
pad_height
)
-
(
args
.
kernel
.
height
))
/
(
args
.
kernel
.
stride_h
)
+
1
;
uint32_t
filter_num
=
args
.
filter_num
;
uint32_t
image_channels
=
args
.
image
.
channels
;
DLOG
<<
"filter_num: "
<<
filter_num
;
uint64_t
ifm_src_paddr
=
vaddr_to_paddr
((
args
.
image
.
address
));
uint64_t
flt_src_paddr
=
vaddr_to_paddr
((
args
.
filter_address
));
uint64_t
sb_src_paddr
=
vaddr_to_paddr
((
args
.
free_space
));
uint64_t
ifm_dst_paddr
=
vaddr_to_paddr
((
args
.
output
.
address
));
/**********BN******************/
float
image_inv_scale
=
(
args
.
image
.
scale_address
)[
0
];
float
filter_inv_scale
=
(
args
.
filter_scale_address
)[
0
];
float
scale_tmp
=
image_inv_scale
*
filter_inv_scale
;
int
idx
=
0
;
float
tmp
=
0.0
;
float
*
convert_sb_addr
=
(
float
*
)(
args
.
free_space
);
// NOLINT
for
(
idx
=
0
;
idx
<
args
.
filter_num
*
2
;
idx
++
)
{
if
(
idx
%
2
==
1
)
{
tmp
=
((
float
*
)(
args
.
sb_address
))[
idx
]
*
scale_tmp
;
// NOLINT
}
else
{
tmp
=
((
float
*
)(
args
.
sb_address
))[
idx
];
// NOLINT
}
convert_sb_addr
[
idx
]
=
tmp
;
// NOLINT
}
fpga_flush
(
convert_sb_addr
,
args
.
filter_num
*
2
*
sizeof
(
float
));
reg_writeq
(
1
,
MUL8
(
24
));
usleep
(
1
);
reg_writeq
(
0
,
MUL8
(
24
));
reg_writeq
(
sb_src_paddr
,
MUL8
(
27
));
reg_writeq
(
0
,
MUL8
(
0
));
uint64_t
bps_addr
=
0x8c00000000000000
;
bps_addr
+=
bn_memory_size
;
reg_writeq
(
bps_addr
,
MUL8
(
0
));
int
ret
=
-
1
;
ret
=
fpga_regpoll
(
MUL8
(
48
),
BYPASS_DONE
,
0xffffff
);
if
(
ret
)
{
DLOG
<<
"conv bypass failed"
;
return
ret
;
}
reg_readq
(
MUL8
(
63
));
/*********configuring registers*************/
uint32_t
cmd_image_vir_base_addr
=
(
uint32_t
)
ifm_src_paddr
;
uint32_t
cmd_filter_vir_base_addr
=
(
uint32_t
)
flt_src_paddr
;
uint32_t
cmd_scale_base_addr
=
(
uint32_t
)
sb_src_paddr
;
uint32_t
conv_ofm_addr_base
=
(
uint32_t
)
ifm_dst_paddr
;
uint64_t
cmd_group_num
=
args
.
group_num
;
uint64_t
cmd_filter_per_group
=
filter_num
/
cmd_group_num
;
uint64_t
cmd_flt_sqr_len
=
(
args
.
kernel
.
width
)
*
(
args
.
kernel
.
height
);
uint64_t
cmd_ifm_pre_row_num
=
0
;
if
(
1
==
args
.
image
.
height
)
{
cmd_ifm_pre_row_num
=
1
;
}
else
{
cmd_ifm_pre_row_num
=
(
args
.
kernel
.
height
)
-
(
args
.
image
.
pad_height
)
+
(
args
.
kernel
.
stride_h
);
}
uint64_t
cmd_flt_pre_batch_num
=
1
;
uint64_t
cmd_ifm_pack_num_per_row_mns1
=
(
uint64_t
)(((
args
.
image
.
channels
)
+
127
)
/
128
)
-
1
;
uint64_t
cmd_bn_num
=
filter_num
;
uint64_t
cmd_bias_num
=
filter_num
;
uint64_t
cmd_ifm_stride_row_length
=
args
.
image
.
width
*
args
.
kernel
.
stride_h
;
uint64_t
cmd_flt_pack_num_per_kernel_mns1
=
(
uint64_t
)(((
args
.
image
.
channels
)
+
127
)
/
128
)
-
1
;
uint64_t
cmd_ofm_width_mns1
=
(
uint64_t
)(
((
args
.
image
.
width
)
-
(
args
.
kernel
.
width
)
+
2
*
(
args
.
image
.
pad_width
))
/
(
args
.
kernel
.
stride_w
));
uint64_t
cmd_ofm_height
=
(
uint64_t
)(((
args
.
image
.
height
)
-
(
args
.
kernel
.
height
)
+
2
*
(
args
.
image
.
pad_height
))
/
(
args
.
kernel
.
stride_h
))
+
1
;
uint64_t
cmd_channel_num
=
0
;
uint64_t
cmd_ifm_pack_len
=
0
;
uint64_t
cmd_channel_per_group
=
0
;
uint64_t
cmd_flt_batch_num_mns1
=
0
;
uint64_t
cmd_flt_N_impl
=
8
;
uint64_t
cmd_ifm_C_impl
=
16
;
uint64_t
cmd_flt_pack_length
=
0
;
uint64_t
cmd_step_h_mul_row_byte_len
=
0
;
uint64_t
cmd_pad_h_mul_row_byte_len
=
0
;
uint64_t
cmd_ifm_pack_byte_length
=
16
*
((((
args
.
image
.
width
)
+
7
)
/
8
)
*
8
);
uint64_t
row_len_align
=
args
.
image
.
width
;
if
(
image_channels
>
64
)
{
cmd_channel_num
=
(
uint64_t
)((((
args
.
image
.
channels
)
+
127
))
/
128
)
*
128
;
cmd_ifm_pack_len
=
128
*
(
args
.
image
.
width
);
cmd_channel_per_group
=
128
;
cmd_flt_batch_num_mns1
=
(
uint64_t
)(((
args
.
filter_num
+
7
))
/
8
-
1
);
cmd_flt_N_impl
=
8
;
cmd_ifm_C_impl
=
128
;
cmd_flt_pack_length
=
(
args
.
kernel
.
width
)
*
(
args
.
kernel
.
height
)
*
128
;
cmd_step_h_mul_row_byte_len
=
(
args
.
kernel
.
stride_h
)
*
cmd_channel_num
*
(
args
.
image
.
width
);
cmd_pad_h_mul_row_byte_len
=
(
args
.
image
.
pad_height
)
*
cmd_channel_num
*
(
args
.
image
.
width
);
cmd_ifm_pack_byte_length
=
128
*
(
args
.
image
.
width
);
row_len_align
=
args
.
image
.
width
*
(
cmd_ifm_pack_num_per_row_mns1
+
1
);
}
else
if
(
image_channels
>
32
)
{
cmd_channel_num
=
64
;
cmd_ifm_pack_len
=
64
*
(
args
.
image
.
width
);
cmd_channel_per_group
=
64
;
cmd_flt_batch_num_mns1
=
(
uint64_t
)((((
args
.
filter_num
)
+
15
))
/
16
-
1
);
cmd_flt_N_impl
=
16
;
cmd_ifm_C_impl
=
64
;
cmd_flt_pack_length
=
(
args
.
kernel
.
width
)
*
(
args
.
kernel
.
height
)
*
64
;
cmd_step_h_mul_row_byte_len
=
(
args
.
kernel
.
stride_h
)
*
cmd_channel_num
*
((((
args
.
image
.
width
)
+
1
))
/
2
)
*
2
;
cmd_pad_h_mul_row_byte_len
=
(
args
.
image
.
pad_height
)
*
cmd_channel_num
*
((((
args
.
image
.
width
)
+
1
))
/
2
)
*
2
;
cmd_ifm_pack_byte_length
=
64
*
(
uint64_t
)((((
args
.
image
.
width
)
+
1
))
/
2
)
*
2
;
row_len_align
=
(
uint64_t
)((((
args
.
image
.
width
)
+
1
))
/
2
);
}
else
if
(
image_channels
>
16
)
{
cmd_channel_num
=
32
;
cmd_ifm_pack_len
=
32
*
(
args
.
image
.
width
);
cmd_channel_per_group
=
32
;
cmd_flt_batch_num_mns1
=
(
uint64_t
)((((
args
.
filter_num
)
+
31
))
/
32
-
1
);
cmd_flt_N_impl
=
32
;
cmd_ifm_C_impl
=
32
;
cmd_flt_pack_length
=
(
args
.
kernel
.
width
)
*
(
args
.
kernel
.
height
)
*
32
;
cmd_step_h_mul_row_byte_len
=
(
args
.
kernel
.
stride_h
)
*
cmd_channel_num
*
((((
args
.
image
.
width
)
+
3
))
/
4
)
*
4
;
cmd_pad_h_mul_row_byte_len
=
(
args
.
image
.
pad_height
)
*
cmd_channel_num
*
((((
args
.
image
.
width
)
+
3
))
/
4
)
*
4
;
cmd_ifm_pack_byte_length
=
32
*
(
uint64_t
)((((
args
.
image
.
width
)
+
3
))
/
4
)
*
4
;
row_len_align
=
(
uint64_t
)((((
args
.
image
.
width
)
+
3
))
/
4
);
}
else
{
cmd_channel_num
=
16
;
cmd_ifm_pack_len
=
16
*
(
args
.
image
.
width
);
cmd_channel_per_group
=
16
;
cmd_flt_batch_num_mns1
=
(
uint64_t
)((((
args
.
filter_num
)
+
63
))
/
64
-
1
);
cmd_flt_N_impl
=
64
;
cmd_ifm_C_impl
=
16
;
cmd_flt_pack_length
=
(
args
.
kernel
.
width
)
*
(
args
.
kernel
.
height
)
*
16
;
cmd_step_h_mul_row_byte_len
=
(
args
.
kernel
.
stride_h
)
*
cmd_channel_num
*
((((
args
.
image
.
width
)
+
7
))
/
8
)
*
8
;
cmd_pad_h_mul_row_byte_len
=
(
args
.
image
.
pad_height
)
*
cmd_channel_num
*
((((
args
.
image
.
width
)
+
7
))
/
8
)
*
8
;
cmd_ifm_pack_byte_length
=
16
*
((((
args
.
image
.
width
)
+
7
))
/
8
)
*
8
;
row_len_align
=
(
uint64_t
)((((
args
.
image
.
width
)
+
7
))
/
8
);
}
uint64_t
cmd_flt_length
=
(
args
.
kernel
.
width
)
*
(
args
.
kernel
.
height
)
*
cmd_channel_num
;
uint64_t
cmd_ifm_row_byte_length
=
cmd_channel_num
*
(
args
.
image
.
width
);
uint64_t
cmd_ifm_buf_col_len
=
0
;
uint64_t
ifm_one_batch_len
=
(
1048576
/
((
args
.
image
.
width
)
*
cmd_channel_num
));
uint64_t
cmd_ifm_batch_num_tmp
=
(
uint64_t
)(
((
args
.
image
.
height
)
+
ifm_one_batch_len
-
1
)
/
ifm_one_batch_len
);
if
(
1
==
cmd_ifm_batch_num_tmp
)
{
cmd_ifm_buf_col_len
=
args
.
image
.
height
;
}
else
{
if
(((
args
.
image
.
height
)
/
(
cmd_ifm_batch_num_tmp
)
%
2
)
==
0
)
{
cmd_ifm_buf_col_len
=
(
args
.
image
.
height
)
/
cmd_ifm_batch_num_tmp
;
}
else
{
cmd_ifm_buf_col_len
=
(
args
.
image
.
height
)
/
cmd_ifm_batch_num_tmp
-
1
;
}
}
uint64_t
cmd_ifm_batch_num_mns1
=
(((
args
.
image
.
height
)
+
cmd_ifm_buf_col_len
-
1
)
/
cmd_ifm_buf_col_len
)
-
1
;
uint64_t
cmd_flt_cycle_num_mns1
=
cmd_ifm_batch_num_mns1
;
uint64_t
cmd_flt_total_batch_num
=
filter_num
/
cmd_flt_N_impl
;
uint64_t
cmd_ifm_buf_col_len_rem
=
(
args
.
image
.
height
)
-
cmd_ifm_batch_num_mns1
*
cmd_ifm_buf_col_len
;
//= -4;
uint64_t
cmd_flt_N_len
=
args
.
kernel
.
width
*
args
.
kernel
.
height
*
(
cmd_flt_pack_num_per_kernel_mns1
+
1
);
//-------- ofm batch number reg && initial URAM reading address
// logic-----------------
uint64_t
cmd_init_raddr_cnt
=
1
;
uint64_t
cmd_init_raddr_flag
=
0
;
int64_t
cmd_init_raddr_index
=
-
8
;
int64_t
cmd_init_raddr_col_0
=
-
4
;
int64_t
cmd_init_raddr_col_1
=
-
4
;
uint64_t
conv_ofm_buf_col_len
=
0
;
uint64_t
conv_ofm_buf_col_len_rem
=
0
;
if
(((
args
.
image
.
pad_height
)
%
(
2
*
(
args
.
kernel
.
stride_h
)))
==
0
)
{
cmd_init_raddr_cnt
=
0
;
cmd_init_raddr_flag
=
0
;
cmd_init_raddr_index
=
0
-
(
int64_t
)
row_len_align
*
(((
args
.
image
.
pad_height
)
+
1
)
/
2
);
cmd_init_raddr_col_0
=
cmd_init_raddr_index
;
cmd_init_raddr_col_1
=
cmd_init_raddr_index
;
}
else
if
(((
args
.
image
.
pad_height
)
-
2
*
((
args
.
image
.
pad_height
)
/
(
2
*
(
args
.
kernel
.
stride_h
))))
<=
(
args
.
kernel
.
stride_h
))
{
cmd_init_raddr_cnt
=
(
args
.
kernel
.
stride_h
)
-
((
args
.
image
.
pad_height
)
-
((
args
.
image
.
pad_height
)
/
(
2
*
(
args
.
kernel
.
stride_h
))));
cmd_init_raddr_flag
=
1
;
cmd_init_raddr_index
=
0
-
(
int64_t
)
row_len_align
*
(
int64_t
)(
args
.
image
.
pad_height
)
-
(
int64_t
)
row_len_align
*
((
args
.
image
.
pad_height
)
/
(
2
*
args
.
kernel
.
stride_h
));
cmd_init_raddr_col_0
=
0
-
(
int64_t
)
row_len_align
*
(
int64_t
)(
args
.
image
.
pad_height
)
-
(
int64_t
)
row_len_align
*
((
args
.
image
.
pad_height
)
/
(
2
*
(
args
.
kernel
.
stride_h
)));
cmd_init_raddr_col_1
=
0
;
}
else
if
(((
args
.
image
.
pad_height
)
-
2
*
((
args
.
image
.
pad_height
)
/
(
2
*
(
args
.
kernel
.
stride_h
))))
<=
2
*
(
args
.
kernel
.
stride_h
))
{
cmd_init_raddr_cnt
=
2
*
(
args
.
kernel
.
stride_h
)
*
(((
args
.
image
.
pad_height
)
+
2
*
(
args
.
kernel
.
stride_h
)
-
1
)
/
(
2
*
(
args
.
kernel
.
stride_h
)))
-
(
args
.
image
.
pad_height
);
cmd_init_raddr_flag
=
0
;
cmd_init_raddr_index
=
0
-
(
int64_t
)
row_len_align
*
(
int64_t
)(
args
.
kernel
.
stride_h
)
*
(((
args
.
image
.
pad_height
)
+
2
*
(
args
.
kernel
.
stride_h
)
-
1
)
/
(
2
*
(
args
.
kernel
.
stride_h
)));
cmd_init_raddr_col_0
=
0
-
(
int64_t
)
row_len_align
*
((
args
.
image
.
pad_height
)
/
(
2
*
(
args
.
kernel
.
stride_h
)))
-
(
int64_t
)
row_len_align
*
(
2
*
(
args
.
kernel
.
stride_h
)
*
(((
args
.
image
.
pad_height
)
+
2
*
(
args
.
kernel
.
stride_h
)
-
1
)
/
(
2
*
(
args
.
kernel
.
stride_h
)))
-
(
args
.
image
.
pad_height
));
cmd_init_raddr_col_1
=
cmd_init_raddr_col_0
;
}
if
(
cmd_ifm_batch_num_mns1
==
0
)
{
if
((
args
.
kernel
.
height
)
<=
(
args
.
kernel
.
stride_h
))
{
conv_ofm_buf_col_len
=
(
args
.
image
.
height
)
+
2
*
(
args
.
image
.
pad_height
)
-
3
*
(
args
.
kernel
.
stride_h
);
}
else
{
conv_ofm_buf_col_len
=
(
args
.
image
.
height
)
+
2
*
(
args
.
image
.
pad_height
)
-
2
*
(
args
.
kernel
.
stride_h
)
-
(
args
.
kernel
.
height
);
}
conv_ofm_buf_col_len_rem
=
conv_ofm_buf_col_len
;
}
else
{
int
N_rem
=
0
;
int
row_rem
=
0
;
if
((
args
.
kernel
.
height
)
<=
(
args
.
kernel
.
stride_h
))
{
conv_ofm_buf_col_len
=
cmd_ifm_buf_col_len
-
3
*
(
args
.
kernel
.
stride_h
);
N_rem
=
(
cmd_ifm_buf_col_len
-
(
args
.
kernel
.
height
))
/
(
args
.
kernel
.
stride_h
)
+
1
;
row_rem
=
cmd_ifm_buf_col_len
-
(
args
.
kernel
.
stride_h
)
*
N_rem
;
conv_ofm_buf_col_len_rem
=
cmd_ifm_buf_col_len_rem
+
2
*
(
args
.
image
.
pad_height
)
+
row_rem
-
3
*
(
args
.
kernel
.
stride_h
);
}
else
{
conv_ofm_buf_col_len
=
cmd_ifm_buf_col_len
+
2
*
(
args
.
image
.
pad_height
)
-
2
*
(
args
.
kernel
.
stride_h
)
-
(
args
.
kernel
.
height
);
N_rem
=
(
cmd_ifm_buf_col_len
-
(
args
.
kernel
.
height
))
/
(
args
.
kernel
.
stride_h
)
+
1
;
row_rem
=
cmd_ifm_buf_col_len
-
(
args
.
kernel
.
stride_h
)
*
N_rem
;
conv_ofm_buf_col_len_rem
=
cmd_ifm_buf_col_len_rem
+
(
args
.
image
.
pad_height
)
+
row_rem
-
2
*
(
args
.
kernel
.
stride_h
)
-
(
args
.
kernel
.
height
);
}
}
//----------------------- para functions --------------------------------
float
filter_quant_scale_tmp
=
((
args
.
filter_scale_address
)[
1
]);
float
image_quant_scale_tmp
=
((
args
.
image
.
scale_address
)[
1
]);
uint32_t
cmd_filter_quant_scale
=
*
(
uint32_t
*
)(
&
filter_quant_scale_tmp
);
// NOLINT
uint32_t
cmd_image_quant_scale
=
*
(
uint32_t
*
)(
&
image_quant_scale_tmp
);
// NOLINT
uint64_t
wParallelsim
=
cmd_flt_N_impl
>>
3
;
uint64_t
wParallelsim_num
=
(
uint64_t
)(((
args
.
filter_num
)
+
cmd_flt_N_impl
-
1
)
/
cmd_flt_N_impl
)
-
1
;
uint64_t
win_size
=
(
args
.
kernel
.
width
)
*
(
args
.
kernel
.
height
)
*
(
cmd_ifm_pack_num_per_row_mns1
+
1
)
-
1
;
uint64_t
conv_ofm_width
=
(((
args
.
image
.
width
)
-
(
args
.
kernel
.
width
)
+
(
args
.
image
.
pad_width
)
+
(
args
.
image
.
pad_width
))
/
(
args
.
kernel
.
stride_w
));
uint64_t
conv_ofm_dma_length
=
cmd_flt_N_impl
*
sizeof
(
short
);
// NOLINT
uint64_t
conv_ofm_dma_stride
=
args
.
filter_num
*
sizeof
(
short
);
// NOLINT
uint64_t
conv_ofm_height_batch_tmp
=
get_image_out_axis
(
args
.
image
.
height
,
args
.
image
.
pad_height
,
args
.
kernel
.
height
,
args
.
kernel
.
stride_h
);
uint64_t
conv_ofm_height_batch
=
(
conv_ofm_height_batch_tmp
+
1
)
/
2
-
1
;
uint64_t
o_ust_rst
=
0
;
uint64_t
conv_ofm_dma_repeat
=
(
uint64_t
)(((((
args
.
image
.
width
)
-
(
args
.
kernel
.
width
)
+
(
args
.
image
.
pad_width
)
+
(
args
.
image
.
pad_width
)))
/
(
args
.
kernel
.
stride_w
))
+
1
);
uint64_t
conv_ofm_dma_offset
=
args
.
filter_num
*
conv_ofm_dma_repeat
*
sizeof
(
short
);
// NOLINT
uint64_t
conv_ofm_inter_stride
=
conv_ofm_dma_offset
*
2
;
//----------------- register contation ------------------
uint64_t
cmd_ifm_flt_base_addr
=
((
uint64_t
)
cmd_filter_vir_base_addr
<<
32
)
|
((
uint64_t
)
cmd_image_vir_base_addr
);
uint64_t
cmd_ifm_flt_dim
=
((
uint64_t
)(
args
.
kernel
.
height
)
<<
48
)
|
((
uint64_t
)(
args
.
kernel
.
width
)
<<
32
)
|
((
uint64_t
)(
args
.
image
.
height
)
<<
16
)
|
((
uint64_t
)(
args
.
image
.
width
));
uint64_t
cmd_pad_step_size
=
((
uint64_t
)(
args
.
kernel
.
stride_h
)
<<
48
)
|
((
uint64_t
)(
args
.
kernel
.
stride_w
)
<<
32
)
|
((
uint64_t
)(
args
.
image
.
pad_height
)
<<
16
)
|
((
uint64_t
)(
args
.
image
.
pad_width
));
uint64_t
cmd_param1
=
((
uint64_t
)
cmd_filter_per_group
<<
48
)
|
((
uint64_t
)
cmd_channel_num
<<
32
)
|
((
uint64_t
)
filter_num
<<
16
)
|
((
uint64_t
)
cmd_group_num
);
uint64_t
cmd_param2
=
((
uint64_t
)
cmd_flt_sqr_len
<<
48
)
|
((
uint64_t
)
cmd_ifm_pack_len
<<
32
)
|
((
uint64_t
)
cmd_ifm_pre_row_num
<<
16
)
|
((
uint64_t
)
cmd_channel_per_group
);
uint64_t
cmd_param3
=
((
uint64_t
)
cmd_flt_batch_num_mns1
<<
48
)
|
((
uint64_t
)
cmd_flt_total_batch_num
<<
32
)
|
((
uint64_t
)
cmd_flt_N_impl
<<
16
)
|
((
uint64_t
)
cmd_flt_pre_batch_num
);
uint64_t
cmd_param4
=
((
uint64_t
)
cmd_ifm_pack_num_per_row_mns1
<<
48
)
|
((
uint64_t
)
cmd_bn_num
<<
32
)
|
((
uint64_t
)
cmd_bias_num
<<
16
)
|
((
uint64_t
)
cmd_flt_N_len
);
uint64_t
cmd_param5
=
((
uint64_t
)
cmd_ifm_stride_row_length
<<
48
)
|
((
uint64_t
)
cmd_flt_pack_length
<<
32
)
|
((
uint64_t
)
cmd_flt_cycle_num_mns1
<<
16
)
|
((
uint64_t
)
cmd_flt_pack_num_per_kernel_mns1
);
uint64_t
cmd_param6
=
((
uint64_t
)
cmd_ofm_width_mns1
<<
48
)
|
((
uint64_t
)
cmd_ifm_batch_num_mns1
<<
32
)
|
((
uint64_t
)
cmd_ifm_buf_col_len
<<
16
)
|
((
uint64_t
)
cmd_ifm_C_impl
);
uint64_t
cmd_param7
=
((
uint64_t
)
conv_ofm_inter_stride
<<
32
)
|
((
uint64_t
)
cmd_ifm_buf_col_len_rem
<<
16
)
|
((
uint64_t
)
cmd_ofm_height
);
uint64_t
cmd_param8
=
((
uint64_t
)
cmd_flt_length
<<
32
)
|
((
uint64_t
)
cmd_ifm_row_byte_length
);
uint64_t
cmd_ifm_flt_quant_scale
=
(((
uint64_t
)
cmd_filter_quant_scale
)
<<
32
)
|
((
uint64_t
)
cmd_image_quant_scale
);
uint64_t
cmd_step_pad_mul_row_len
=
((
uint64_t
)
cmd_pad_h_mul_row_byte_len
<<
32
)
|
((
uint64_t
)
cmd_step_h_mul_row_byte_len
);
//---- ofm paras ----
uint64_t
cmd_conv_param_reg
=
((
uint64_t
)
wParallelsim_num
<<
32
)
|
((
uint64_t
)
wParallelsim
<<
16
)
|
((
uint64_t
)
win_size
);
uint64_t
cmd_ofm_addr_width_reg
=
((
uint64_t
)
conv_ofm_width
<<
32
)
|
((
uint64_t
)
conv_ofm_addr_base
);
uint64_t
cmd_intra_stride_atoms_reg
=
((
uint64_t
)
conv_ofm_dma_length
<<
32
)
|
((
uint64_t
)
conv_ofm_dma_stride
);
uint64_t
cmd_ofm_height_batch_reg
=
((
uint64_t
)
conv_ofm_buf_col_len_rem
<<
48
)
|
((
uint64_t
)
conv_ofm_buf_col_len
<<
32
)
|
((
uint64_t
)
conv_ofm_height_batch
+
0x80000000
);
uint64_t
cmd_user_ctrl_reg
=
((
uint64_t
)
o_ust_rst
);
uint64_t
cmd_wdma_param_reg
=
((
uint64_t
)(
conv_ofm_dma_repeat
|
0x80000000
)
<<
32
)
|
((
uint64_t
)
conv_ofm_dma_offset
);
uint64_t
cmd_init_raddr_reg
=
((
cmd_init_raddr_col_1
&
0xffff
)
<<
48
)
|
((
cmd_init_raddr_col_0
&
0xffff
)
<<
32
)
|
(((
cmd_init_raddr_index
&
0xffff
)
<<
16
))
|
(
cmd_init_raddr_flag
&
0xffff
)
<<
15
|
((
cmd_init_raddr_cnt
&
0xffff
));
uint64_t
cmd_para31
=
(
cmd_para31
&
0x1
)
|
args
.
relu_enabled
;
DLOG
<<
"cmd_init_raddr_col_1 = "
<<
hex
<<
cmd_init_raddr_col_1
;
DLOG
<<
"cmd_init_raddr_col_0 = "
<<
hex
<<
cmd_init_raddr_col_0
;
DLOG
<<
"cmd_init_raddr_index = "
<<
hex
<<
cmd_init_raddr_index
;
//
DLOG
<<
"cmd_init_raddr_cnt = "
<<
hex
<<
cmd_init_raddr_cnt
;
DLOG
<<
"conv_ofm_height_batch = "
<<
conv_ofm_height_batch
;
DLOG
<<
"cmd_ifm_flt_base_addr = "
<<
hex
<<
cmd_ifm_flt_base_addr
;
DLOG
<<
"cmd_scale_base_addr = "
<<
hex
<<
cmd_scale_base_addr
;
DLOG
<<
"cmd_ifm_flt_dim = "
<<
hex
<<
cmd_ifm_flt_dim
;
DLOG
<<
"cmd_pad_step_size = "
<<
hex
<<
cmd_pad_step_size
;
DLOG
<<
"cmd_param1 = "
<<
hex
<<
cmd_param1
;
DLOG
<<
"cmd_param2 = "
<<
hex
<<
cmd_param2
;
DLOG
<<
"cmd_param3 = "
<<
hex
<<
cmd_param3
;
DLOG
<<
"cmd_param4 = "
<<
hex
<<
cmd_param4
;
DLOG
<<
"cmd_param5 = "
<<
hex
<<
cmd_param5
;
DLOG
<<
"cmd_param6 = "
<<
hex
<<
cmd_param6
;
DLOG
<<
"cmd_param7 = "
<<
hex
<<
cmd_param7
;
DLOG
<<
"cmd_param8 = "
<<
hex
<<
cmd_param8
;
DLOG
<<
"cmd_ifm_flt_quant_scale = "
<<
hex
<<
cmd_ifm_flt_quant_scale
;
DLOG
<<
"cmd_step_pad_mul_row_len = "
<<
hex
<<
cmd_step_pad_mul_row_len
;
DLOG
<<
"cmd_ifm_pack_byte_length = "
<<
hex
<<
cmd_ifm_pack_byte_length
;
DLOG
<<
"cmd_conv_param_reg = "
<<
hex
<<
cmd_conv_param_reg
;
DLOG
<<
"cmd_ofm_addr_width_reg = "
<<
hex
<<
cmd_ofm_addr_width_reg
;
DLOG
<<
"cmd_intra_stride_atoms_reg = "
<<
hex
<<
cmd_intra_stride_atoms_reg
;
DLOG
<<
"cmd_init_raddr_reg = "
<<
hex
<<
cmd_init_raddr_reg
;
DLOG
<<
"cmd_ofm_height_batch_reg = "
<<
hex
<<
cmd_ofm_height_batch_reg
;
DLOG
<<
"cmd_wdma_param_reg = "
<<
hex
<<
cmd_wdma_param_reg
;
DLOG
<<
"cmd_para31 = "
<<
hex
<<
cmd_para31
;
reg_writeq
(
cmd_ifm_flt_base_addr
,
MUL8
(
1
));
reg_writeq
(
cmd_scale_base_addr
,
MUL8
(
2
));
reg_writeq
(
cmd_ifm_flt_dim
,
MUL8
(
3
));
reg_writeq
(
cmd_pad_step_size
,
MUL8
(
4
));
reg_writeq
(
cmd_param1
,
MUL8
(
5
));
reg_writeq
(
cmd_param2
,
MUL8
(
6
));
reg_writeq
(
cmd_param3
,
MUL8
(
7
));
reg_writeq
(
cmd_param4
,
MUL8
(
8
));
reg_writeq
(
cmd_param5
,
MUL8
(
9
));
reg_writeq
(
cmd_param6
,
MUL8
(
10
));
reg_writeq
(
cmd_param7
,
MUL8
(
11
));
reg_writeq
(
cmd_param8
,
MUL8
(
12
));
reg_writeq
(
cmd_ifm_flt_quant_scale
,
MUL8
(
13
));
reg_writeq
(
cmd_step_pad_mul_row_len
,
MUL8
(
14
));
reg_writeq
(
cmd_ifm_pack_byte_length
,
MUL8
(
15
));
reg_writeq
(
cmd_conv_param_reg
,
MUL8
(
16
));
reg_writeq
(
cmd_ofm_addr_width_reg
,
MUL8
(
17
));
reg_writeq
(
cmd_intra_stride_atoms_reg
,
MUL8
(
18
));
reg_writeq
(
cmd_init_raddr_reg
,
MUL8
(
29
));
reg_writeq
(
cmd_para31
,
MUL8
(
31
));
reg_writeq
(
0
,
MUL8
(
19
));
reg_writeq
(
cmd_ofm_height_batch_reg
,
MUL8
(
19
));
reg_writeq
(
cmd_ofm_height_batch_reg
&
0xffffffff00000000
,
MUL8
(
19
));
reg_writeq
(
cmd_wdma_param_reg
,
MUL8
(
25
));
reg_writeq
(
0
,
MUL8
(
0
));
reg_writeq
(
0x4000000000000000
,
MUL8
(
0
));
ret
=
fpga_regpoll
(
MUL8
(
48
),
CONV_DONE
,
0xffffff
);
if
(
ret
==
-
1
)
{
DLOG
<<
"fpga conv no interrupt!!"
;
return
ret
;
}
reg_readq
(
MUL8
(
63
));
usleep
(
10
);
float
scale
=
Findfp16Max
();
(
args
.
output
.
scale_address
)[
0
]
=
scale
;
// NOLINT
(
args
.
output
.
scale_address
)[
1
]
=
(
float
)(
1.0
/
scale
);
// NOLINT
DLOG
<<
"Findfp16Max scale = "
<<
scale
;
DLOG
<<
"ret="
<<
ret
;
return
ret
;
}
int
ComputeFpgaPool
(
const
struct
PoolingArgs
&
args
)
{
...
...
@@ -97,7 +577,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
DLOG
<<
" mode:"
<<
args
.
mode
<<
" kernel_reciprocal:"
<<
fp16_2_fp32
(
args
.
kernel_reciprocal
);
DLOG
<<
" image_address:"
<<
args
.
image
.
address
<<
" image_scale_address:"
<<
args
.
image
.
scale_address
<<
" image_channels:"
<<
args
.
image
.
channels
<<
" image_height:"
<<
args
.
image
.
height
<<
" image_width:"
<<
args
.
image
.
width
...
...
@@ -107,13 +586,467 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
<<
" kernel_width:"
<<
args
.
kernel
.
width
<<
" stride_h:"
<<
args
.
kernel
.
stride_h
<<
" stride_w:"
<<
args
.
kernel
.
stride_w
;
DLOG
<<
" out_address:"
<<
args
.
output
.
address
<<
" out_scale_address:"
<<
args
.
output
.
scale_address
;
DLOG
<<
" out_address:"
<<
args
.
output
.
address
;
#endif
#ifndef PADDLE_MOBILE_ZU5
return
0
;
#endif
return
0
;
uint32_t
filter_num_align
=
0
;
filter_num_align
=
args
.
image
.
channels
;
DLOG
<<
"______db_______: begin to set registers. "
;
uint64_t
ifm_pixel_num
=
((
args
.
image
.
width
)
*
(
args
.
image
.
height
)
*
args
.
image
.
channels
);
uint64_t
ifm_memory_size
=
ifm_pixel_num
*
sizeof
(
short
);
// NOLINT
uint64_t
flt_pixel_num
=
0
;
uint64_t
filter_memory_size
=
0
;
//!! ???
uint64_t
bn_pixel_num
=
(
filter_num_align
*
2
);
uint64_t
bn_memory_size
=
bn_pixel_num
*
sizeof
(
uint16_t
);
uint64_t
ofm_width
=
((
args
.
image
.
width
)
+
2
*
args
.
image
.
pad_width
-
args
.
kernel
.
width
)
/
(
args
.
kernel
.
stride_w
)
+
1
;
uint64_t
ofm_height
=
((
args
.
image
.
height
)
+
2
*
(
args
.
image
.
pad_height
)
-
(
args
.
kernel
.
height
))
/
(
args
.
kernel
.
stride_h
)
+
1
;
uint32_t
filter_num
=
filter_num_align
;
uint32_t
image_channels
=
args
.
image
.
channels
;
uint64_t
ifm_src_paddr
=
vaddr_to_paddr
((
args
.
image
.
address
));
uint64_t
flt_src_paddr
=
0
;
uint64_t
sb_src_paddr
=
0
;
uint64_t
ifm_dst_paddr
=
vaddr_to_paddr
((
args
.
output
.
address
));
/**********BN******************/
float
image_inv_scale
=
0
;
float
filter_inv_scale
=
0
;
int
idx
=
0
;
DLOG
<<
"______db_______: reset registers. "
;
reg_writeq
(
1
,
MUL8
(
24
));
usleep
(
1
);
reg_writeq
(
0
,
MUL8
(
24
));
/*********configuring registers*************/
uint32_t
cmd_image_vir_base_addr
=
(
uint32_t
)
ifm_src_paddr
;
uint32_t
cmd_filter_vir_base_addr
=
(
uint32_t
)
flt_src_paddr
;
uint32_t
cmd_scale_base_addr
=
(
uint32_t
)
sb_src_paddr
;
uint32_t
conv_ofm_addr_base
=
(
uint32_t
)
ifm_dst_paddr
;
uint64_t
cmd_group_num
=
1
;
// args.group_num;
uint64_t
cmd_filter_per_group
=
filter_num
/
cmd_group_num
;
uint64_t
cmd_flt_sqr_len
=
(
args
.
kernel
.
width
)
*
(
args
.
kernel
.
height
);
uint64_t
cmd_ifm_pre_row_num
=
args
.
kernel
.
height
;
if
((
args
.
kernel
.
height
==
args
.
image
.
height
)
&&
(
0
==
args
.
image
.
pad_height
))
{
cmd_ifm_pre_row_num
=
(
args
.
kernel
.
height
);
}
else
{
cmd_ifm_pre_row_num
=
(
args
.
kernel
.
height
)
-
(
args
.
image
.
pad_height
)
+
(
args
.
kernel
.
stride_h
);
}
uint64_t
cmd_flt_pre_batch_num
=
1
;
uint64_t
cmd_ifm_pack_num_per_row_mns1
=
(
uint64_t
)(((
args
.
image
.
channels
)
+
63
)
/
64
)
-
1
;
uint64_t
cmd_bn_num
=
filter_num
;
uint64_t
cmd_bias_num
=
filter_num
;
uint64_t
cmd_ifm_stride_row_length
=
args
.
image
.
width
*
args
.
kernel
.
stride_h
;
uint64_t
cmd_flt_pack_num_per_kernel_mns1
=
(
uint64_t
)(((
args
.
image
.
channels
)
+
63
)
/
64
)
-
1
;
uint64_t
cmd_ofm_width_mns1
=
(
uint64_t
)(
((
args
.
image
.
width
)
-
(
args
.
kernel
.
width
)
+
2
*
(
args
.
image
.
pad_width
))
/
(
args
.
kernel
.
stride_w
));
uint64_t
cmd_ofm_height
=
(
uint64_t
)(((
args
.
image
.
height
)
-
(
args
.
kernel
.
height
)
+
2
*
(
args
.
image
.
pad_height
))
/
(
args
.
kernel
.
stride_h
))
+
1
;
uint64_t
cmd_channel_num
=
0
;
uint64_t
cmd_ifm_pack_len
=
0
;
uint64_t
cmd_channel_per_group
=
0
;
uint64_t
cmd_flt_batch_num_mns1
=
0
;
uint64_t
cmd_flt_N_impl
=
8
;
uint64_t
cmd_ifm_C_impl
=
16
;
uint64_t
cmd_flt_pack_length
=
0
;
uint64_t
cmd_step_h_mul_row_byte_len
=
0
;
uint64_t
cmd_pad_h_mul_row_byte_len
=
0
;
uint64_t
cmd_ifm_pack_byte_length
=
16
*
((((
args
.
image
.
width
)
+
7
)
/
8
)
*
8
);
uint64_t
row_len_align
=
args
.
image
.
width
;
uint64_t
cmd_flt_cycle_num_mns1
=
0
;
if
(
image_channels
>
32
)
{
cmd_channel_num
=
(
uint64_t
)((((
args
.
image
.
channels
)
+
63
))
/
64
)
*
64
;
cmd_ifm_pack_len
=
64
*
(
args
.
image
.
width
);
cmd_channel_per_group
=
64
;
cmd_flt_batch_num_mns1
=
(
uint64_t
)(((
filter_num
+
7
))
/
8
-
1
);
cmd_flt_N_impl
=
8
;
cmd_ifm_C_impl
=
64
;
cmd_flt_pack_length
=
(
args
.
kernel
.
width
)
*
(
args
.
kernel
.
height
)
*
64
;
cmd_step_h_mul_row_byte_len
=
(
args
.
kernel
.
stride_h
)
*
cmd_channel_num
*
args
.
image
.
width
;
cmd_pad_h_mul_row_byte_len
=
(
args
.
image
.
pad_height
)
*
cmd_channel_num
*
args
.
image
.
width
;
cmd_ifm_pack_byte_length
=
64
*
args
.
image
.
width
;
row_len_align
=
args
.
image
.
width
*
(
cmd_ifm_pack_num_per_row_mns1
+
1
);
cmd_flt_cycle_num_mns1
=
(
cmd_channel_num
/
64
)
-
1
;
}
else
if
(
image_channels
>
16
)
{
cmd_channel_num
=
32
;
cmd_ifm_pack_len
=
32
*
(
args
.
image
.
width
);
cmd_channel_per_group
=
32
;
cmd_flt_batch_num_mns1
=
(
uint64_t
)((((
filter_num
)
+
15
))
/
16
-
1
);
cmd_flt_N_impl
=
16
;
cmd_ifm_C_impl
=
32
;
cmd_flt_pack_length
=
(
args
.
kernel
.
width
)
*
(
args
.
kernel
.
height
)
*
32
;
cmd_step_h_mul_row_byte_len
=
(
args
.
kernel
.
stride_h
)
*
cmd_channel_num
*
((((
args
.
image
.
width
)
+
1
))
/
2
)
*
2
;
cmd_pad_h_mul_row_byte_len
=
(
args
.
image
.
pad_height
)
*
cmd_channel_num
*
((((
args
.
image
.
width
)
+
1
))
/
2
)
*
2
;
cmd_ifm_pack_byte_length
=
32
*
(
uint64_t
)((((
args
.
image
.
width
)
+
1
))
/
2
)
*
2
;
row_len_align
=
(
uint64_t
)((((
args
.
image
.
width
)
+
1
))
/
2
);
cmd_flt_cycle_num_mns1
=
0
;
}
else
if
(
image_channels
>
8
)
{
cmd_channel_num
=
16
;
cmd_ifm_pack_len
=
16
*
(
args
.
image
.
width
);
cmd_channel_per_group
=
16
;
cmd_flt_batch_num_mns1
=
(
uint64_t
)((((
filter_num
)
+
15
))
/
16
-
1
);
cmd_flt_N_impl
=
32
;
cmd_ifm_C_impl
=
16
;
cmd_flt_pack_length
=
(
args
.
kernel
.
width
)
*
(
args
.
kernel
.
height
)
*
16
;
cmd_step_h_mul_row_byte_len
=
(
args
.
kernel
.
stride_h
)
*
cmd_channel_num
*
((((
args
.
image
.
width
)
+
3
))
/
4
)
*
4
;
cmd_pad_h_mul_row_byte_len
=
(
args
.
image
.
pad_height
)
*
cmd_channel_num
*
((((
args
.
image
.
width
)
+
3
))
/
4
)
*
4
;
cmd_ifm_pack_byte_length
=
16
*
(
uint64_t
)((((
args
.
image
.
width
)
+
3
))
/
4
)
*
4
;
row_len_align
=
(
uint64_t
)((((
args
.
image
.
width
)
+
3
))
/
4
);
cmd_flt_cycle_num_mns1
=
0
;
}
cmd_flt_N_impl
=
16
;
cmd_flt_batch_num_mns1
=
0
;
cmd_flt_pack_length
=
64
;
uint64_t
cmd_flt_N_len
=
0
;
uint64_t
cmd_flt_length
=
64
;
uint64_t
cmd_ifm_row_byte_length
=
cmd_channel_num
*
(
args
.
image
.
width
);
uint64_t
cmd_ifm_buf_col_len
=
0
;
uint64_t
ifm_one_batch_len
=
(
1048576
/
((
args
.
image
.
width
)
*
cmd_channel_num
));
uint64_t
cmd_ifm_batch_num_tmp
=
(
uint64_t
)(
((
args
.
image
.
height
)
+
ifm_one_batch_len
-
1
)
/
ifm_one_batch_len
);
if
(
1
==
cmd_ifm_batch_num_tmp
)
{
cmd_ifm_buf_col_len
=
args
.
image
.
height
;
}
else
{
if
(((
args
.
image
.
height
)
/
(
cmd_ifm_batch_num_tmp
)
%
2
)
==
0
)
{
cmd_ifm_buf_col_len
=
(
args
.
image
.
height
)
/
cmd_ifm_batch_num_tmp
;
}
else
{
cmd_ifm_buf_col_len
=
(
args
.
image
.
height
)
/
cmd_ifm_batch_num_tmp
-
1
;
}
}
uint64_t
cmd_ifm_batch_num_mns1
=
(((
args
.
image
.
height
)
+
cmd_ifm_buf_col_len
-
1
)
/
cmd_ifm_buf_col_len
)
-
1
;
uint64_t
cmd_flt_total_batch_num
=
1
;
uint64_t
cmd_ifm_buf_col_len_rem
=
(
args
.
image
.
height
)
-
cmd_ifm_batch_num_mns1
*
cmd_ifm_buf_col_len
;
//= -4;
//-------- ofm batch number reg && initial URAM reading address
uint64_t
cmd_init_raddr_cnt
=
1
;
uint64_t
cmd_init_raddr_flag
=
0
;
int64_t
cmd_init_raddr_index
=
-
8
;
int64_t
cmd_init_raddr_col_0
=
-
4
;
int64_t
cmd_init_raddr_col_1
=
-
4
;
int64_t
conv_ofm_buf_col_len
=
0
;
int64_t
conv_ofm_buf_col_len_rem
=
0
;
if
(((
args
.
image
.
pad_height
)
%
(
2
*
(
args
.
kernel
.
stride_h
)))
==
0
)
{
cmd_init_raddr_cnt
=
0
;
cmd_init_raddr_flag
=
0
;
cmd_init_raddr_index
=
0
-
(
int64_t
)
row_len_align
*
(((
args
.
image
.
pad_height
)
+
1
)
/
2
);
cmd_init_raddr_col_0
=
cmd_init_raddr_index
;
cmd_init_raddr_col_1
=
cmd_init_raddr_index
;
}
else
if
(((
args
.
image
.
pad_height
)
-
2
*
((
args
.
image
.
pad_height
)
/
(
2
*
(
args
.
kernel
.
stride_h
))))
<=
(
args
.
kernel
.
stride_h
))
{
cmd_init_raddr_cnt
=
(
args
.
kernel
.
stride_h
)
-
((
args
.
image
.
pad_height
)
-
((
args
.
image
.
pad_height
)
/
(
2
*
(
args
.
kernel
.
stride_h
))));
cmd_init_raddr_flag
=
1
;
cmd_init_raddr_index
=
0
-
(
int64_t
)
row_len_align
*
(
int64_t
)(
args
.
image
.
pad_height
)
-
(
int64_t
)
row_len_align
*
((
args
.
image
.
pad_height
)
/
(
2
*
args
.
kernel
.
stride_h
));
cmd_init_raddr_col_0
=
0
-
(
int64_t
)
row_len_align
*
(
int64_t
)(
args
.
image
.
pad_height
)
-
(
int64_t
)
row_len_align
*
((
args
.
image
.
pad_height
)
/
(
2
*
(
args
.
kernel
.
stride_h
)));
cmd_init_raddr_col_1
=
cmd_init_raddr_col_0
+
args
.
kernel
.
stride_h
*
(
int64_t
)
row_len_align
;
}
else
if
(((
args
.
image
.
pad_height
)
-
2
*
((
args
.
image
.
pad_height
)
/
(
2
*
(
args
.
kernel
.
stride_h
))))
<=
2
*
(
args
.
kernel
.
stride_h
))
{
cmd_init_raddr_cnt
=
2
*
(
args
.
kernel
.
stride_h
)
*
(((
args
.
image
.
pad_height
)
+
2
*
(
args
.
kernel
.
stride_h
)
-
1
)
/
(
2
*
(
args
.
kernel
.
stride_h
)))
-
(
args
.
image
.
pad_height
);
cmd_init_raddr_flag
=
0
;
cmd_init_raddr_index
=
0
-
(
int64_t
)
row_len_align
*
(
int64_t
)(
args
.
kernel
.
stride_h
)
*
(((
args
.
image
.
pad_height
)
+
2
*
(
args
.
kernel
.
stride_h
)
-
1
)
/
(
2
*
(
args
.
kernel
.
stride_h
)));
cmd_init_raddr_col_0
=
0
-
(
int64_t
)
row_len_align
*
((
args
.
image
.
pad_height
)
/
(
2
*
(
args
.
kernel
.
stride_h
)))
-
(
int64_t
)
row_len_align
*
(
2
*
(
args
.
kernel
.
stride_h
)
*
(((
args
.
image
.
pad_height
)
+
2
*
(
args
.
kernel
.
stride_h
)
-
1
)
/
(
2
*
(
args
.
kernel
.
stride_h
)))
-
(
args
.
image
.
pad_height
));
cmd_init_raddr_col_1
=
cmd_init_raddr_col_0
;
}
if
(
cmd_ifm_batch_num_mns1
==
0
)
{
if
((
args
.
kernel
.
height
)
<=
(
args
.
kernel
.
stride_h
))
{
conv_ofm_buf_col_len
=
(
args
.
image
.
height
)
+
2
*
(
args
.
image
.
pad_height
)
-
3
*
(
args
.
kernel
.
stride_h
);
}
else
{
conv_ofm_buf_col_len
=
(
args
.
image
.
height
)
+
2
*
(
args
.
image
.
pad_height
)
-
2
*
(
args
.
kernel
.
stride_h
)
-
(
args
.
kernel
.
height
);
}
conv_ofm_buf_col_len_rem
=
conv_ofm_buf_col_len
;
}
else
{
int
N_rem
=
0
;
int
row_rem
=
0
;
if
((
args
.
kernel
.
height
)
<=
(
args
.
kernel
.
stride_h
))
{
conv_ofm_buf_col_len
=
cmd_ifm_buf_col_len
-
3
*
(
args
.
kernel
.
stride_h
);
N_rem
=
(
cmd_ifm_buf_col_len
-
(
args
.
kernel
.
height
))
/
(
args
.
kernel
.
stride_h
)
+
1
;
row_rem
=
cmd_ifm_buf_col_len
-
(
args
.
kernel
.
stride_h
)
*
N_rem
;
conv_ofm_buf_col_len_rem
=
cmd_ifm_buf_col_len_rem
+
2
*
(
args
.
image
.
pad_height
)
+
row_rem
-
3
*
(
args
.
kernel
.
stride_h
);
}
else
{
conv_ofm_buf_col_len
=
cmd_ifm_buf_col_len
+
2
*
(
args
.
image
.
pad_height
)
-
2
*
(
args
.
kernel
.
stride_h
)
-
(
args
.
kernel
.
height
);
N_rem
=
(
cmd_ifm_buf_col_len
-
(
args
.
kernel
.
height
))
/
(
args
.
kernel
.
stride_h
)
+
1
;
row_rem
=
cmd_ifm_buf_col_len
-
(
args
.
kernel
.
stride_h
)
*
N_rem
;
conv_ofm_buf_col_len_rem
=
cmd_ifm_buf_col_len_rem
+
(
args
.
image
.
pad_height
)
+
row_rem
-
2
*
(
args
.
kernel
.
stride_h
)
-
(
args
.
kernel
.
height
);
}
}
//----------------------- para functions --------------------------------
uint64_t
cmd_filter_quant_scale
=
0x3c00
;
uint64_t
cmd_image_quant_scale
=
0x3c00
;
uint64_t
wParallelsim
=
cmd_ifm_C_impl
>>
3
;
uint64_t
wParallelsim_num
=
cmd_flt_cycle_num_mns1
;
uint64_t
win_size
=
(
args
.
kernel
.
width
)
*
(
args
.
kernel
.
height
)
*
(
cmd_ifm_pack_num_per_row_mns1
+
1
)
-
1
;
//
uint64_t
conv_ofm_width
=
(((
args
.
image
.
width
)
-
(
args
.
kernel
.
width
)
+
(
args
.
image
.
pad_width
)
+
(
args
.
image
.
pad_width
))
/
(
args
.
kernel
.
stride_w
));
uint64_t
conv_ofm_dma_length
=
cmd_channel_num
*
sizeof
(
short
);
// NOLINT
uint64_t
conv_ofm_dma_stride
=
conv_ofm_dma_length
;
uint64_t
conv_ofm_height_batch_tmp
=
(
args
.
image
.
height
+
2
*
args
.
image
.
pad_height
-
args
.
kernel
.
height
)
/
args
.
kernel
.
stride_h
+
1
;
uint64_t
conv_ofm_height_batch
=
(
conv_ofm_height_batch_tmp
+
1
)
/
2
-
1
;
uint64_t
o_ust_rst
=
0
;
uint64_t
conv_ofm_dma_repeat
=
(
uint64_t
)(((((
args
.
image
.
width
)
-
(
args
.
kernel
.
width
)
+
(
args
.
image
.
pad_width
)
+
(
args
.
image
.
pad_width
)))
/
(
args
.
kernel
.
stride_w
))
+
1
);
uint64_t
conv_ofm_dma_offset
=
args
.
image
.
channels
*
conv_ofm_dma_repeat
*
sizeof
(
short
);
// NOLINT
uint64_t
conv_ofm_inter_stride
=
conv_ofm_dma_offset
*
2
;
//----------------- register contation ------------------
uint64_t
cmd_ifm_flt_base_addr
=
((
uint64_t
)
cmd_filter_vir_base_addr
<<
32
)
|
((
uint64_t
)
cmd_image_vir_base_addr
);
uint64_t
cmd_ifm_flt_dim
=
((
uint64_t
)(
args
.
kernel
.
height
)
<<
48
)
|
((
uint64_t
)(
args
.
kernel
.
width
)
<<
32
)
|
((
uint64_t
)(
args
.
image
.
height
)
<<
16
)
|
((
uint64_t
)(
args
.
image
.
width
));
uint64_t
cmd_pad_step_size
=
((
uint64_t
)(
args
.
kernel
.
stride_h
)
<<
48
)
|
((
uint64_t
)(
args
.
kernel
.
stride_w
)
<<
32
)
|
((
uint64_t
)(
args
.
image
.
pad_height
)
<<
16
)
|
((
uint64_t
)(
args
.
image
.
pad_width
));
uint64_t
cmd_param1
=
((
uint64_t
)
cmd_filter_per_group
<<
48
)
|
((
uint64_t
)
cmd_channel_num
<<
32
)
|
((
uint64_t
)
filter_num
<<
16
)
|
((
uint64_t
)
cmd_group_num
);
uint64_t
cmd_param2
=
((
uint64_t
)
cmd_flt_sqr_len
<<
48
)
|
((
uint64_t
)
cmd_ifm_pack_len
<<
32
)
|
((
uint64_t
)
cmd_ifm_pre_row_num
<<
16
)
|
((
uint64_t
)
cmd_channel_per_group
);
uint64_t
cmd_param3
=
((
uint64_t
)
cmd_flt_batch_num_mns1
<<
48
)
|
((
uint64_t
)
cmd_flt_total_batch_num
<<
32
)
|
((
uint64_t
)
cmd_flt_N_impl
<<
16
)
|
((
uint64_t
)
cmd_flt_pre_batch_num
);
uint64_t
cmd_param4
=
((
uint64_t
)
cmd_ifm_pack_num_per_row_mns1
<<
48
)
|
((
uint64_t
)
cmd_bn_num
<<
32
)
|
((
uint64_t
)
cmd_bias_num
<<
16
)
|
((
uint64_t
)
cmd_flt_N_len
);
uint64_t
cmd_param5
=
((
uint64_t
)
cmd_ifm_stride_row_length
<<
48
)
|
((
uint64_t
)
cmd_flt_pack_length
<<
32
)
|
((
uint64_t
)
cmd_flt_cycle_num_mns1
<<
16
)
|
((
uint64_t
)
cmd_flt_pack_num_per_kernel_mns1
);
uint64_t
cmd_param6
=
((
uint64_t
)
cmd_ofm_width_mns1
<<
48
)
|
((
uint64_t
)
cmd_ifm_batch_num_mns1
<<
32
)
|
((
uint64_t
)
cmd_ifm_buf_col_len
<<
16
)
|
((
uint64_t
)
cmd_ifm_C_impl
);
uint64_t
cmd_param7
=
((
uint64_t
)
conv_ofm_inter_stride
<<
32
)
|
((
uint64_t
)
cmd_ifm_buf_col_len_rem
<<
16
)
|
((
uint64_t
)
cmd_ofm_height
);
uint64_t
cmd_param8
=
((
uint64_t
)
cmd_flt_length
<<
32
)
|
((
uint64_t
)
cmd_ifm_row_byte_length
);
uint64_t
cmd_ifm_flt_quant_scale
=
((
uint64_t
)
cmd_filter_quant_scale
<<
32
)
|
((
uint64_t
)
cmd_image_quant_scale
);
uint64_t
cmd_step_pad_mul_row_len
=
((
uint64_t
)
cmd_pad_h_mul_row_byte_len
<<
32
)
|
((
uint64_t
)
cmd_step_h_mul_row_byte_len
);
//---- ofm paras ----
uint64_t
cmd_conv_param_reg
=
((
uint64_t
)
wParallelsim_num
<<
32
)
|
((
uint64_t
)
wParallelsim
<<
16
)
|
((
uint64_t
)
win_size
);
uint64_t
cmd_ofm_addr_width_reg
=
((
uint64_t
)
conv_ofm_width
<<
32
)
|
((
uint64_t
)
conv_ofm_addr_base
);
uint64_t
cmd_intra_stride_atoms_reg
=
((
uint64_t
)
conv_ofm_dma_length
<<
32
)
|
((
uint64_t
)
conv_ofm_dma_stride
);
uint64_t
cmd_ofm_height_batch_reg
=
((
uint64_t
)(
conv_ofm_buf_col_len_rem
&
0xffff
)
<<
48
)
|
((
uint64_t
)(
conv_ofm_buf_col_len
&
0xffff
)
<<
32
)
|
((
uint64_t
)
conv_ofm_height_batch
+
0x80000000
);
uint64_t
cmd_user_ctrl_reg
=
((
uint64_t
)
o_ust_rst
);
uint64_t
cmd_wdma_param_reg
=
((
uint64_t
)(
conv_ofm_dma_repeat
|
0x80000000
)
<<
32
)
|
((
uint64_t
)
conv_ofm_dma_offset
);
uint64_t
cmd_init_raddr_reg
=
((
cmd_init_raddr_col_1
&
0xffff
)
<<
48
)
|
((
cmd_init_raddr_col_0
&
0xffff
)
<<
32
)
|
(((
cmd_init_raddr_index
&
0xffff
)
<<
16
))
|
(
cmd_init_raddr_flag
&
0xffff
)
<<
15
|
((
cmd_init_raddr_cnt
&
0xffff
));
DLOG
<<
"cmd_init_raddr_col_1 = "
<<
hex
<<
cmd_init_raddr_col_1
;
DLOG
<<
"cmd_init_raddr_col_0 = "
<<
hex
<<
cmd_init_raddr_col_0
;
DLOG
<<
"cmd_init_raddr_index = "
<<
hex
<<
cmd_init_raddr_index
;
//
DLOG
<<
"cmd_init_raddr_cnt = "
<<
hex
<<
cmd_init_raddr_cnt
;
DLOG
<<
"conv_ofm_buf_col_len = "
<<
hex
<<
conv_ofm_buf_col_len
;
DLOG
<<
"conv_ofm_buf_col_len_rem = "
<<
hex
<<
conv_ofm_buf_col_len_rem
;
DLOG
<<
"cmd_ifm_flt_base_addr = "
<<
hex
<<
cmd_ifm_flt_base_addr
;
DLOG
<<
"cmd_scale_base_addr = "
<<
hex
<<
cmd_scale_base_addr
;
DLOG
<<
"cmd_ifm_flt_dim = "
<<
hex
<<
cmd_ifm_flt_dim
;
DLOG
<<
"cmd_pad_step_size = "
<<
hex
<<
cmd_pad_step_size
;
DLOG
<<
"cmd_param1 = "
<<
hex
<<
cmd_param1
;
DLOG
<<
"cmd_param2 = "
<<
hex
<<
cmd_param2
;
DLOG
<<
"cmd_param3 = "
<<
hex
<<
cmd_param3
;
DLOG
<<
"cmd_param4 = "
<<
hex
<<
cmd_param4
;
DLOG
<<
"cmd_param5 = "
<<
hex
<<
cmd_param5
;
DLOG
<<
"cmd_param6 = "
<<
hex
<<
cmd_param6
;
DLOG
<<
"cmd_param7 = "
<<
hex
<<
cmd_param7
;
DLOG
<<
"cmd_param8 = "
<<
hex
<<
cmd_param8
;
DLOG
<<
"cmd_ifm_flt_quant_scale = "
<<
hex
<<
cmd_ifm_flt_quant_scale
;
DLOG
<<
"cmd_step_pad_mul_row_len = "
<<
hex
<<
cmd_step_pad_mul_row_len
;
DLOG
<<
"cmd_ifm_pack_byte_length = "
<<
hex
<<
cmd_ifm_pack_byte_length
;
DLOG
<<
"cmd_conv_param_reg = "
<<
hex
<<
cmd_conv_param_reg
;
DLOG
<<
"cmd_ofm_addr_width_reg = "
<<
hex
<<
cmd_ofm_addr_width_reg
;
DLOG
<<
"cmd_intra_stride_atoms_reg = "
<<
hex
<<
cmd_intra_stride_atoms_reg
;
DLOG
<<
"cmd_init_raddr_reg = "
<<
hex
<<
cmd_init_raddr_reg
;
DLOG
<<
"cmd_ofm_height_batch_reg = "
<<
hex
<<
cmd_ofm_height_batch_reg
;
DLOG
<<
"cmd_wdma_param_reg = "
<<
hex
<<
cmd_wdma_param_reg
;
DLOG
<<
"pooling_mode = "
<<
hex
<<
args
.
mode
;
reg_writeq
(
cmd_ifm_flt_base_addr
,
MUL8
(
1
));
reg_writeq
(
cmd_scale_base_addr
,
MUL8
(
2
));
reg_writeq
(
cmd_ifm_flt_dim
,
MUL8
(
3
));
reg_writeq
(
cmd_pad_step_size
,
MUL8
(
4
));
reg_writeq
(
cmd_param1
,
MUL8
(
5
));
reg_writeq
(
cmd_param2
,
MUL8
(
6
));
reg_writeq
(
cmd_param3
,
MUL8
(
7
));
reg_writeq
(
cmd_param4
,
MUL8
(
8
));
reg_writeq
(
cmd_param5
,
MUL8
(
9
));
reg_writeq
(
cmd_param6
,
MUL8
(
10
));
reg_writeq
(
cmd_param7
,
MUL8
(
11
));
reg_writeq
(
cmd_param8
,
MUL8
(
12
));
reg_writeq
(
cmd_ifm_flt_quant_scale
,
MUL8
(
13
));
reg_writeq
(
cmd_step_pad_mul_row_len
,
MUL8
(
14
));
reg_writeq
(
cmd_ifm_pack_byte_length
,
MUL8
(
15
));
reg_writeq
(
cmd_conv_param_reg
,
MUL8
(
16
));
reg_writeq
(
cmd_ofm_addr_width_reg
,
MUL8
(
17
));
reg_writeq
(
cmd_intra_stride_atoms_reg
,
MUL8
(
18
));
reg_writeq
(
cmd_init_raddr_reg
,
MUL8
(
29
));
reg_writeq
(
0
,
MUL8
(
19
));
reg_writeq
(
cmd_ofm_height_batch_reg
,
MUL8
(
19
));
reg_writeq
(
cmd_ofm_height_batch_reg
&
0xffffffff00000000
,
MUL8
(
19
));
reg_writeq
(
cmd_wdma_param_reg
,
MUL8
(
25
));
/******************************************************************/
uint64_t
cmd_mult_factor
=
((
uint64_t
)
args
.
kernel_reciprocal
)
|
((
uint64_t
)
args
.
kernel_reciprocal
<<
16
);
reg_writeq
(
cmd_mult_factor
,
MUL8
(
30
));
/******************************************************************/
reg_writeq
(
0
,
MUL8
(
0
));
if
(
args
.
mode
==
0
)
{
// max pooling
reg_writeq
(
0x2200000000000000
,
MUL8
(
0
));
}
else
{
// average pooling
reg_writeq
(
0x2400000000000000
,
MUL8
(
0
));
}
int
ret
=
-
1
;
ret
=
fpga_regpoll
(
MUL8
(
48
),
CONV_DONE
,
0x00ffff
);
if
(
ret
==
-
1
)
{
DLOG
<<
"fpga pooling no interrupt!!"
;
return
ret
;
}
reg_readq
(
MUL8
(
63
));
usleep
(
10
);
// get max value
float
scale
=
Findfp16Max
();
(
args
.
output
.
scale_address
)[
0
]
=
scale
;
// NOLINT
(
args
.
output
.
scale_address
)[
1
]
=
(
float
)(
1.0
/
scale
);
// NOLINT
DLOG
<<
"Findfp16Max scale = "
<<
scale
;
DLOG
<<
"ret="
<<
ret
;
return
ret
;
}
// Returns the element count of one image row after channel padding.
//
// The channel count is rounded up to a multiple of an alignment chosen from
// its range:
//   channel  > 64        -> multiple of 128
//   32 < channel <= 64   -> multiple of 64
//   16 < channel <= 32   -> multiple of 32
//   channel <= 16        -> multiple of 16
//
// @param width    number of pixels in one row
// @param channel  number of channels per pixel (before padding)
// @return         padded channel count multiplied by width
int get_ofm_batch_size(int width, int channel) {
  // Pick the alignment for the bucket the channel count falls into; the
  // smallest bucket (channel <= 16) is the default.
  int alignment = 16;
  if (channel > 64) {
    alignment = 128;
  } else if (channel > 32) {
    alignment = 64;
  } else if (channel > 16) {
    alignment = 32;
  }

  // Round the channel count up to the next multiple of the alignment.
  const int aligned_channel = ((channel + alignment - 1) / alignment) * alignment;

  return aligned_channel * width;
}
int
ComputeFpgaEWAdd
(
const
struct
EWAddArgs
&
args
)
{
...
...
@@ -123,26 +1056,525 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
<<
" const0:"
<<
fp16_2_fp32
(
int16_t
(
args
.
const0
))
<<
" const1:"
<<
fp16_2_fp32
(
int16_t
(
args
.
const1
));
DLOG
<<
" image0_address:"
<<
args
.
image0
.
address
<<
" image0_scale_address:"
<<
args
.
image0
.
scale_address
<<
" image0_channels:"
<<
args
.
image0
.
channels
<<
" image0_height:"
<<
args
.
image0
.
height
<<
" image0_width:"
<<
args
.
image0
.
width
<<
" pad0_height:"
<<
args
.
image0
.
pad_height
<<
" pad0_width:"
<<
args
.
image0
.
pad_width
;
<<
" image0_width:"
<<
args
.
image0
.
width
;
DLOG
<<
" image1_address:"
<<
args
.
image1
.
address
<<
" image1_scale_address:"
<<
args
.
image1
.
scale_address
<<
" image1_channels:"
<<
args
.
image1
.
channels
<<
" image1_height:"
<<
args
.
image1
.
height
<<
" image1_width:"
<<
args
.
image1
.
width
<<
" pad1_height:"
<<
args
.
image1
.
pad_height
<<
" pad_width:"
<<
args
.
image1
.
pad_width
;
DLOG
<<
" out_address:"
<<
args
.
output
.
address
<<
" out_scale_address:"
<<
args
.
output
.
scale_address
;
<<
" image1_width:"
<<
args
.
image1
.
width
;
DLOG
<<
" out_address:"
<<
args
.
output
.
address
;
#endif
#ifndef PADDLE_MOBILE_ZU5
return
0
;
#endif
return
0
;
uint32_t
filter_num_align
=
args
.
image0
.
channels
;
uint32_t
const_kernel_width_1
=
1
;
uint32_t
const_stride_width_1
=
1
;
uint32_t
const_kernel_height_2
=
2
;
uint32_t
const_stride_height_2
=
2
;
uint32_t
const_pad_height_0
=
0
;
uint32_t
const_pad_width_0
=
0
;
uint32_t
ew_image_height
=
args
.
image0
.
height
*
2
;
DLOG
<<
"______db_______: begin to set registers. "
;
uint64_t
ifm_pixel_num
=
((
args
.
image0
.
width
)
*
(
args
.
image0
.
height
)
*
args
.
image0
.
channels
);
uint64_t
ifm_memory_size
=
ifm_pixel_num
*
sizeof
(
short
);
// NOLINT
uint64_t
flt_pixel_num
=
0
;
uint64_t
filter_memory_size
=
0
;
uint64_t
bn_pixel_num
=
(
filter_num_align
*
2
);
uint64_t
bn_memory_size
=
bn_pixel_num
*
sizeof
(
uint16_t
);
uint64_t
ofm_width
=
((
args
.
image0
.
width
)
+
2
*
const_pad_width_0
-
const_kernel_width_1
)
/
(
const_stride_width_1
)
+
1
;
uint64_t
ofm_height
=
((
ew_image_height
)
+
2
*
(
const_pad_height_0
)
-
(
const_kernel_height_2
))
/
(
const_stride_height_2
)
+
1
;
uint32_t
filter_num
=
filter_num_align
;
uint32_t
image_channels
=
args
.
image0
.
channels
;
uint64_t
ifm_src_paddr
=
vaddr_to_paddr
((
args
.
image0
.
address
));
uint64_t
flt_src_paddr
=
vaddr_to_paddr
((
args
.
image1
.
address
));
uint64_t
ifm_dst_paddr
=
vaddr_to_paddr
((
args
.
output
.
address
));
float
image_inv_scale
=
0
;
float
filter_inv_scale
=
0
;
int
idx
=
0
;
DLOG
<<
"______db_______: reset registers. "
;
reg_writeq
(
1
,
MUL8
(
24
));
usleep
(
1
);
reg_writeq
(
0
,
MUL8
(
24
));
/*********configuring registers*************/
uint32_t
cmd_image_vir_base_addr
=
(
uint32_t
)
ifm_src_paddr
;
uint32_t
cmd_filter_vir_base_addr
=
(
uint32_t
)
flt_src_paddr
;
uint32_t
cmd_scale_base_addr
=
0
;
uint32_t
conv_ofm_addr_base
=
(
uint32_t
)
ifm_dst_paddr
;
uint64_t
cmd_group_num
=
1
;
uint64_t
cmd_filter_per_group
=
filter_num
/
cmd_group_num
;
uint64_t
cmd_flt_sqr_len
=
(
const_kernel_width_1
)
*
(
const_kernel_height_2
);
uint64_t
cmd_ifm_pre_row_num
=
const_kernel_height_2
;
if
((
const_kernel_height_2
==
ew_image_height
)
&&
(
0
==
const_pad_height_0
))
{
cmd_ifm_pre_row_num
=
(
const_kernel_height_2
);
}
else
{
cmd_ifm_pre_row_num
=
(
const_kernel_height_2
)
-
(
const_pad_height_0
)
+
(
const_stride_height_2
);
}
uint64_t
cmd_flt_pre_batch_num
=
1
;
uint64_t
cmd_ifm_pack_num_per_row_mns1
=
(
uint64_t
)(((
args
.
image0
.
channels
)
+
63
)
/
64
)
-
1
;
uint64_t
cmd_bn_num
=
filter_num
;
uint64_t
cmd_bias_num
=
filter_num
;
uint64_t
cmd_ifm_stride_row_length
=
args
.
image0
.
width
*
const_stride_height_2
;
uint64_t
cmd_flt_pack_num_per_kernel_mns1
=
(
uint64_t
)(((
args
.
image0
.
channels
)
+
63
)
/
64
)
-
1
;
uint64_t
cmd_ofm_width_mns1
=
(
uint64_t
)(
((
args
.
image0
.
width
)
-
(
const_kernel_width_1
)
+
2
*
(
const_pad_width_0
))
/
(
const_stride_width_1
));
uint64_t
cmd_ofm_height
=
(
uint64_t
)(((
args
.
image0
.
height
)
*
2
-
(
const_kernel_height_2
)
+
2
*
(
const_pad_height_0
))
/
(
const_stride_height_2
))
+
1
;
uint64_t
cmd_channel_num
=
0
;
uint64_t
cmd_ifm_pack_len
=
0
;
uint64_t
cmd_channel_per_group
=
0
;
uint64_t
cmd_flt_batch_num_mns1
=
0
;
uint64_t
cmd_flt_N_impl
=
8
;
uint64_t
cmd_ifm_C_impl
=
16
;
uint64_t
cmd_flt_pack_length
=
0
;
uint64_t
cmd_step_h_mul_row_byte_len
=
0
;
uint64_t
cmd_pad_h_mul_row_byte_len
=
0
;
uint64_t
cmd_ifm_pack_byte_length
=
16
*
((((
args
.
image0
.
width
)
+
7
)
/
8
)
*
8
);
uint64_t
row_len_align
=
args
.
image0
.
width
;
uint64_t
cmd_flt_cycle_num_mns1
=
0
;
if
(
image_channels
>
32
)
{
cmd_channel_num
=
(
uint64_t
)((((
args
.
image0
.
channels
)
+
63
))
/
64
)
*
64
;
cmd_ifm_pack_len
=
64
*
(
args
.
image0
.
width
);
cmd_channel_per_group
=
64
;
cmd_flt_batch_num_mns1
=
(
uint64_t
)(((
filter_num
+
7
))
/
8
-
1
);
cmd_flt_N_impl
=
8
;
cmd_ifm_C_impl
=
64
;
cmd_flt_pack_length
=
(
const_kernel_width_1
)
*
(
const_kernel_height_2
)
*
64
;
cmd_step_h_mul_row_byte_len
=
(
const_stride_height_2
)
*
cmd_channel_num
*
args
.
image0
.
width
;
cmd_pad_h_mul_row_byte_len
=
(
const_pad_height_0
)
*
cmd_channel_num
*
args
.
image0
.
width
;
cmd_ifm_pack_byte_length
=
64
*
args
.
image0
.
width
;
row_len_align
=
args
.
image0
.
width
;
cmd_flt_cycle_num_mns1
=
(
cmd_channel_num
/
64
)
-
1
;
}
else
if
(
image_channels
>
16
)
{
cmd_channel_num
=
32
;
cmd_ifm_pack_len
=
32
*
(
args
.
image0
.
width
);
cmd_channel_per_group
=
32
;
cmd_flt_batch_num_mns1
=
(
uint64_t
)((((
filter_num
)
+
15
))
/
16
-
1
);
cmd_flt_N_impl
=
16
;
cmd_ifm_C_impl
=
32
;
cmd_flt_pack_length
=
(
const_kernel_width_1
)
*
(
const_kernel_height_2
)
*
32
;
cmd_step_h_mul_row_byte_len
=
(
const_stride_height_2
)
*
cmd_channel_num
*
((((
args
.
image0
.
width
)
+
1
))
/
2
)
*
2
;
cmd_pad_h_mul_row_byte_len
=
(
const_pad_height_0
)
*
cmd_channel_num
*
((((
args
.
image0
.
width
)
+
1
))
/
2
)
*
2
;
cmd_ifm_pack_byte_length
=
32
*
(
uint64_t
)((((
args
.
image0
.
width
)
+
1
))
/
2
)
*
2
;
row_len_align
=
(
uint64_t
)((((
args
.
image0
.
width
)
+
1
))
/
2
);
cmd_flt_cycle_num_mns1
=
0
;
}
else
if
(
image_channels
>
8
)
{
cmd_channel_num
=
16
;
cmd_ifm_pack_len
=
16
*
(
args
.
image0
.
width
);
cmd_channel_per_group
=
16
;
cmd_flt_batch_num_mns1
=
(
uint64_t
)((((
filter_num
)
+
15
))
/
16
-
1
);
cmd_flt_N_impl
=
32
;
cmd_ifm_C_impl
=
16
;
cmd_flt_pack_length
=
(
const_kernel_width_1
)
*
(
const_kernel_height_2
)
*
16
;
cmd_step_h_mul_row_byte_len
=
(
const_stride_height_2
)
*
cmd_channel_num
*
((((
args
.
image0
.
width
)
+
3
))
/
4
)
*
4
;
cmd_pad_h_mul_row_byte_len
=
(
const_pad_height_0
)
*
cmd_channel_num
*
((((
args
.
image0
.
width
)
+
3
))
/
4
)
*
4
;
cmd_ifm_pack_byte_length
=
16
*
(
uint64_t
)((((
args
.
image0
.
width
)
+
3
))
/
4
)
*
4
;
row_len_align
=
(
uint64_t
)((((
args
.
image0
.
width
)
+
3
))
/
4
);
cmd_flt_cycle_num_mns1
=
0
;
}
cmd_flt_N_impl
=
16
;
cmd_flt_batch_num_mns1
=
0
;
cmd_flt_pack_length
=
64
;
uint64_t
cmd_flt_N_len
=
0
;
uint64_t
cmd_flt_length
=
64
;
uint64_t
cmd_ifm_row_byte_length
=
cmd_channel_num
*
(
args
.
image0
.
width
);
uint64_t
cmd_ifm_buf_col_len
=
0
;
uint64_t
ifm_one_batch_len
=
(
1048576
/
((
2
*
row_len_align
)
*
cmd_channel_num
));
uint64_t
cmd_ifm_batch_num_tmp
=
(
uint64_t
)(
((
ew_image_height
)
+
ifm_one_batch_len
-
1
)
/
ifm_one_batch_len
);
DLOG
<<
"ifm_one_batch_len = "
<<
hex
<<
ifm_one_batch_len
;
DLOG
<<
"cmd_ifm_batch_num_tmp = "
<<
hex
<<
cmd_ifm_batch_num_tmp
;
if
(
1
==
cmd_ifm_batch_num_tmp
)
{
cmd_ifm_buf_col_len
=
ew_image_height
;
}
else
{
cmd_ifm_buf_col_len
=
ifm_one_batch_len
;
}
uint64_t
cmd_ifm_batch_num_mns1
=
(((
ew_image_height
)
+
cmd_ifm_buf_col_len
-
1
)
/
cmd_ifm_buf_col_len
)
-
1
;
DLOG
<<
"___db____ew____:cmd_ifm_batch_num_mns1 = "
<<
hex
<<
cmd_ifm_batch_num_mns1
;
uint64_t
cmd_flt_total_batch_num
=
1
;
uint64_t
cmd_ifm_buf_col_len_rem
=
(
ew_image_height
)
-
cmd_ifm_batch_num_mns1
*
cmd_ifm_buf_col_len
;
//-------- ofm batch number reg && initial URAM reading address
// logic-----------------
uint64_t
cmd_init_raddr_cnt
=
1
;
uint64_t
cmd_init_raddr_flag
=
0
;
int64_t
cmd_init_raddr_index
=
-
8
;
int64_t
cmd_init_raddr_col_0
=
-
4
;
int64_t
cmd_init_raddr_col_1
=
-
4
;
int64_t
conv_ofm_buf_col_len
=
0
;
int64_t
conv_ofm_buf_col_len_rem
=
0
;
if
(((
const_pad_height_0
)
%
(
2
*
(
const_stride_height_2
)))
==
0
)
{
cmd_init_raddr_cnt
=
0
;
cmd_init_raddr_flag
=
0
;
cmd_init_raddr_index
=
0
-
(
int64_t
)
row_len_align
*
(((
const_pad_height_0
)
+
1
)
/
2
);
cmd_init_raddr_col_0
=
cmd_init_raddr_index
;
cmd_init_raddr_col_1
=
cmd_init_raddr_index
;
}
else
if
(((
const_pad_height_0
)
-
2
*
((
const_pad_height_0
)
/
(
2
*
(
const_stride_height_2
))))
<=
(
const_stride_height_2
))
{
cmd_init_raddr_cnt
=
(
const_stride_height_2
)
-
((
const_pad_height_0
)
-
((
const_pad_height_0
)
/
(
2
*
(
const_stride_height_2
))));
cmd_init_raddr_flag
=
1
;
cmd_init_raddr_index
=
0
-
(
int64_t
)
row_len_align
*
(
int64_t
)(
const_pad_height_0
)
-
(
int64_t
)
row_len_align
*
((
const_pad_height_0
)
/
(
2
*
const_stride_height_2
));
cmd_init_raddr_col_0
=
0
-
(
int64_t
)
row_len_align
*
(
int64_t
)(
const_pad_height_0
)
-
(
int64_t
)
row_len_align
*
((
const_pad_height_0
)
/
(
2
*
(
const_stride_height_2
)));
cmd_init_raddr_col_1
=
cmd_init_raddr_col_0
+
const_stride_height_2
*
(
int64_t
)
row_len_align
;
// 0;
}
else
if
(((
const_pad_height_0
)
-
2
*
((
const_pad_height_0
)
/
(
2
*
(
const_stride_height_2
))))
<=
2
*
(
const_stride_height_2
))
{
cmd_init_raddr_cnt
=
2
*
(
const_stride_height_2
)
*
(((
const_pad_height_0
)
+
2
*
(
const_stride_height_2
)
-
1
)
/
(
2
*
(
const_stride_height_2
)))
-
(
const_pad_height_0
);
cmd_init_raddr_flag
=
0
;
cmd_init_raddr_index
=
0
-
(
int64_t
)
row_len_align
*
(
int64_t
)(
const_stride_height_2
)
*
(((
const_pad_height_0
)
+
2
*
(
const_stride_height_2
)
-
1
)
/
(
2
*
(
const_stride_height_2
)));
cmd_init_raddr_col_0
=
0
-
(
int64_t
)
row_len_align
*
((
const_pad_height_0
)
/
(
2
*
(
const_stride_height_2
)))
-
(
int64_t
)
row_len_align
*
(
2
*
(
const_stride_height_2
)
*
(((
const_pad_height_0
)
+
2
*
(
const_stride_height_2
)
-
1
)
/
(
2
*
(
const_stride_height_2
)))
-
(
const_pad_height_0
));
cmd_init_raddr_col_1
=
cmd_init_raddr_col_0
;
}
if
(
cmd_ifm_batch_num_mns1
==
0
)
{
if
((
const_kernel_height_2
)
<=
(
const_stride_height_2
))
{
conv_ofm_buf_col_len
=
cmd_ifm_buf_col_len
+
2
*
(
const_pad_height_0
)
-
3
*
(
const_stride_height_2
);
}
else
{
conv_ofm_buf_col_len
=
cmd_ifm_buf_col_len
+
2
*
(
const_pad_height_0
)
-
3
*
(
const_stride_height_2
)
-
(
const_kernel_height_2
);
}
conv_ofm_buf_col_len_rem
=
conv_ofm_buf_col_len
;
}
else
{
int
N_rem
=
0
;
int
row_rem
=
0
;
if
((
const_kernel_height_2
)
<=
(
const_stride_height_2
))
{
conv_ofm_buf_col_len
=
cmd_ifm_buf_col_len
-
3
*
(
const_stride_height_2
);
N_rem
=
(
cmd_ifm_buf_col_len
-
(
const_kernel_height_2
))
/
(
const_stride_height_2
)
+
1
;
row_rem
=
cmd_ifm_buf_col_len
-
(
const_stride_height_2
)
*
N_rem
;
conv_ofm_buf_col_len_rem
=
cmd_ifm_buf_col_len_rem
+
2
*
(
const_pad_height_0
)
+
row_rem
-
3
*
(
const_stride_height_2
);
}
else
{
conv_ofm_buf_col_len
=
cmd_ifm_buf_col_len
+
2
*
(
const_pad_height_0
)
-
3
*
(
const_stride_height_2
)
-
(
const_kernel_height_2
);
N_rem
=
(
cmd_ifm_buf_col_len
-
(
const_kernel_height_2
))
/
(
const_stride_height_2
)
+
1
;
row_rem
=
cmd_ifm_buf_col_len
-
(
const_stride_height_2
)
*
N_rem
;
conv_ofm_buf_col_len_rem
=
cmd_ifm_buf_col_len_rem
+
(
const_pad_height_0
)
+
row_rem
-
3
*
(
const_stride_height_2
)
-
(
const_kernel_height_2
);
}
}
//*************************
uint64_t
ifm_height_raw_batch
=
0
;
uint64_t
cmd_ofm_height_batch_reg
;
uint64_t
conv_ofm_height_batch_tmp
=
0
;
uint64_t
conv_ofm_height_batch
[
16
];
int
ofm_height_norm_batch
;
int
height_batch_num
;
int
row_norm_size
=
get_ofm_batch_size
(
args
.
image0
.
width
,
cmd_channel_num
);
int
ifm_norm_size
=
ew_image_height
*
row_norm_size
*
sizeof
(
short
);
// NOLINT
if
(
ifm_norm_size
<=
(
1024
*
1024
))
{
conv_ofm_height_batch
[
0
]
=
get_image_out_axis
(
ew_image_height
,
const_pad_height_0
,
const_kernel_height_2
,
const_stride_height_2
);
height_batch_num
=
0
;
}
else
if
(
row_norm_size
<
(
1024
*
1024
))
{
// raw ifm batch ,should make ofm be 2*N
ifm_height_raw_batch
=
(
int
)(((
double
)(
1024
*
1024
)
-
row_norm_size
+
1
)
/
// NOLINT
(
double
)(
2
*
row_norm_size
));
// NOLINT
ofm_height_norm_batch
=
get_image_out_axis
(
ifm_height_raw_batch
,
0
,
const_kernel_height_2
,
const_stride_height_2
);
if
(
ofm_height_norm_batch
%
2
==
0
)
{
ofm_height_norm_batch
=
ofm_height_norm_batch
;
}
else
{
ofm_height_norm_batch
=
ofm_height_norm_batch
-
1
;
}
DLOG
<<
"ofm_height_norm_batch = "
<<
hex
<<
ofm_height_norm_batch
;
int
ofm_height_rems
=
cmd_ofm_height
;
int
i
=
0
;
for
(
i
=
0
;
0
<
ofm_height_rems
;
i
++
)
{
if
(
ofm_height_norm_batch
<=
ofm_height_rems
)
{
ofm_height_rems
=
ofm_height_rems
-
ofm_height_norm_batch
;
conv_ofm_height_batch
[
i
]
=
ofm_height_norm_batch
;
DLOG
<<
"ofm_height_norm_batch[i] = "
<<
hex
<<
conv_ofm_height_batch
[
i
];
}
else
{
conv_ofm_height_batch
[
i
]
=
ofm_height_rems
;
break
;
}
}
height_batch_num
=
i
;
}
//*************************
//----------------------- para functions --------------------------------
uint64_t
cmd_filter_quant_scale
=
0x3c00
;
uint64_t
cmd_image_quant_scale
=
0x3c00
;
uint64_t
wParallelsim
=
cmd_ifm_C_impl
>>
3
;
uint64_t
wParallelsim_num
=
cmd_flt_cycle_num_mns1
;
uint64_t
win_size
=
(
const_kernel_width_1
)
*
(
const_kernel_height_2
)
*
(
cmd_ifm_pack_num_per_row_mns1
+
1
)
-
1
;
//
uint64_t
conv_ofm_width
=
(((
args
.
image0
.
width
)
-
(
const_kernel_width_1
)
+
(
const_pad_width_0
)
+
(
const_pad_width_0
))
/
(
const_stride_width_1
));
uint64_t
conv_ofm_dma_length
=
cmd_channel_num
*
sizeof
(
short
);
// NOLINT
uint64_t
conv_ofm_dma_stride
=
cmd_channel_num
*
sizeof
(
short
);
// NOLINT
uint64_t
cmd_image_addr_low
=
0
;
uint64_t
cmd_image_addr_high
=
0
;
uint64_t
cmd_image_addr_diff
=
0
;
if
(
cmd_filter_vir_base_addr
<
cmd_image_vir_base_addr
)
{
cmd_image_addr_low
=
(
uint64_t
)
cmd_filter_vir_base_addr
;
cmd_image_addr_high
=
(
uint64_t
)
cmd_image_vir_base_addr
;
}
else
{
cmd_image_addr_low
=
(
uint64_t
)
cmd_image_vir_base_addr
;
cmd_image_addr_high
=
(
uint64_t
)
cmd_filter_vir_base_addr
;
}
cmd_image_addr_diff
=
cmd_image_addr_high
-
cmd_image_addr_low
;
uint64_t
o_ust_rst
=
0
;
uint64_t
conv_ofm_dma_repeat
=
(
uint64_t
)(((((
args
.
image0
.
width
)
-
(
const_kernel_width_1
)
+
(
const_pad_width_0
)
+
(
const_pad_width_0
)))
/
(
const_stride_width_1
))
+
1
);
uint64_t
conv_ofm_dma_offset
=
cmd_channel_num
*
conv_ofm_dma_repeat
*
sizeof
(
short
);
// NOLINT
uint64_t
conv_ofm_inter_stride
=
conv_ofm_dma_offset
*
2
;
//----------------- register contation ------------------
uint64_t
cmd_ifm_flt_base_addr
=
(
cmd_image_addr_high
<<
32
)
|
(
cmd_image_addr_low
);
uint64_t
cmd_ifm_flt_dim
=
((
uint64_t
)(
const_kernel_height_2
)
<<
48
)
|
((
uint64_t
)(
const_kernel_width_1
)
<<
32
)
|
((
uint64_t
)(
ew_image_height
)
<<
16
)
|
((
uint64_t
)(
args
.
image0
.
width
));
uint64_t
cmd_pad_step_size
=
((
uint64_t
)(
const_stride_height_2
)
<<
48
)
|
((
uint64_t
)(
const_stride_width_1
)
<<
32
)
|
((
uint64_t
)(
const_pad_height_0
)
<<
16
)
|
((
uint64_t
)(
const_pad_width_0
));
uint64_t
cmd_param1
=
((
uint64_t
)
cmd_filter_per_group
<<
48
)
|
((
uint64_t
)
cmd_channel_num
<<
32
)
|
((
uint64_t
)
filter_num
<<
16
)
|
((
uint64_t
)
cmd_group_num
);
uint64_t
cmd_param2
=
((
uint64_t
)
cmd_flt_sqr_len
<<
48
)
|
((
uint64_t
)
cmd_ifm_pack_len
<<
32
)
|
((
uint64_t
)
cmd_ifm_pre_row_num
<<
16
)
|
((
uint64_t
)
cmd_channel_per_group
);
uint64_t
cmd_param3
=
((
uint64_t
)
cmd_flt_batch_num_mns1
<<
48
)
|
((
uint64_t
)
cmd_flt_total_batch_num
<<
32
)
|
((
uint64_t
)
cmd_flt_N_impl
<<
16
)
|
((
uint64_t
)
cmd_flt_pre_batch_num
);
uint64_t
cmd_param4
=
((
uint64_t
)
cmd_ifm_pack_num_per_row_mns1
<<
48
)
|
((
uint64_t
)
cmd_bn_num
<<
32
)
|
((
uint64_t
)
cmd_bias_num
<<
16
)
|
((
uint64_t
)
cmd_flt_N_len
);
uint64_t
cmd_param5
=
((
uint64_t
)
cmd_ifm_stride_row_length
<<
48
)
|
((
uint64_t
)
cmd_flt_pack_length
<<
32
)
|
((
uint64_t
)
cmd_flt_cycle_num_mns1
<<
16
)
|
((
uint64_t
)
cmd_flt_pack_num_per_kernel_mns1
);
uint64_t
cmd_param6
=
((
uint64_t
)
cmd_ofm_width_mns1
<<
48
)
|
((
uint64_t
)
cmd_ifm_batch_num_mns1
<<
32
)
|
((
uint64_t
)
cmd_ifm_buf_col_len
<<
16
)
|
((
uint64_t
)
cmd_ifm_C_impl
);
uint64_t
cmd_param7
=
((
uint64_t
)
conv_ofm_inter_stride
<<
32
)
|
((
uint64_t
)
cmd_ifm_buf_col_len_rem
<<
16
)
|
((
uint64_t
)
cmd_ofm_height
);
uint64_t
cmd_param8
=
((
uint64_t
)
cmd_flt_length
<<
32
)
|
((
uint64_t
)
cmd_ifm_row_byte_length
);
uint64_t
cmd_ifm_flt_quant_scale
=
((
uint64_t
)
cmd_filter_quant_scale
<<
32
)
|
((
uint64_t
)
cmd_image_quant_scale
);
uint64_t
cmd_step_pad_mul_row_len
=
((
uint64_t
)
cmd_pad_h_mul_row_byte_len
<<
32
)
|
((
uint64_t
)
cmd_step_h_mul_row_byte_len
);
//---- ofm paras ----
uint64_t
cmd_conv_param_reg
=
((
uint64_t
)
wParallelsim_num
<<
32
)
|
((
uint64_t
)
wParallelsim
<<
16
)
|
((
uint64_t
)
win_size
);
uint64_t
cmd_ofm_addr_width_reg
=
((
uint64_t
)
conv_ofm_width
<<
32
)
|
((
uint64_t
)
conv_ofm_addr_base
);
uint64_t
cmd_intra_stride_atoms_reg
=
((
uint64_t
)
conv_ofm_dma_length
<<
32
)
|
((
uint64_t
)
conv_ofm_dma_stride
);
uint64_t
cmd_user_ctrl_reg
=
((
uint64_t
)
o_ust_rst
);
uint64_t
cmd_wdma_param_reg
=
((
uint64_t
)(
conv_ofm_dma_repeat
|
0x80000000
)
<<
32
)
|
((
uint64_t
)
conv_ofm_dma_offset
);
uint64_t
cmd_init_raddr_reg
=
((
cmd_init_raddr_col_1
&
0xffff
)
<<
48
)
|
((
cmd_init_raddr_col_0
&
0xffff
)
<<
32
)
|
(((
cmd_init_raddr_index
&
0xffff
)
<<
16
))
|
(
cmd_init_raddr_flag
&
0xffff
)
<<
15
|
((
cmd_init_raddr_cnt
&
0xffff
));
uint64_t
cmd_mult_factor
=
((
uint64_t
)
args
.
const0
)
|
((
uint64_t
)
args
.
const1
<<
16
);
uint64_t
cmd_para31
=
(
cmd_para31
&
0x1
)
|
args
.
relu_enabled
;
DLOG
<<
"cmd_init_raddr_col_1 = "
<<
hex
<<
cmd_init_raddr_col_1
;
DLOG
<<
"cmd_init_raddr_col_0 = "
<<
hex
<<
cmd_init_raddr_col_0
;
DLOG
<<
"cmd_init_raddr_index = "
<<
hex
<<
cmd_init_raddr_index
;
//
DLOG
<<
"cmd_init_raddr_cnt = "
<<
hex
<<
cmd_init_raddr_cnt
;
DLOG
<<
"cmd_ifm_buf_col_len = "
<<
hex
<<
cmd_ifm_buf_col_len
;
DLOG
<<
"cmd_ifm_buf_col_len_rem = "
<<
hex
<<
cmd_ifm_buf_col_len_rem
;
DLOG
<<
"conv_ofm_buf_col_len = "
<<
hex
<<
conv_ofm_buf_col_len
;
DLOG
<<
"conv_ofm_buf_col_len_rem = "
<<
hex
<<
conv_ofm_buf_col_len_rem
;
DLOG
<<
"cmd_ifm_flt_base_addr = "
<<
hex
<<
cmd_ifm_flt_base_addr
;
DLOG
<<
"cmd_scale_base_addr = "
<<
hex
<<
cmd_scale_base_addr
;
DLOG
<<
"cmd_ifm_flt_dim = "
<<
hex
<<
cmd_ifm_flt_dim
;
DLOG
<<
"cmd_pad_step_size = "
<<
hex
<<
cmd_pad_step_size
;
DLOG
<<
"cmd_param1 = "
<<
hex
<<
cmd_param1
;
DLOG
<<
"cmd_param2 = "
<<
hex
<<
cmd_param2
;
DLOG
<<
"cmd_param3 = "
<<
hex
<<
cmd_param3
;
DLOG
<<
"cmd_param4 = "
<<
hex
<<
cmd_param4
;
DLOG
<<
"cmd_param5 = "
<<
hex
<<
cmd_param5
;
DLOG
<<
"cmd_param6 = "
<<
hex
<<
cmd_param6
;
DLOG
<<
"cmd_param7 = "
<<
hex
<<
cmd_param7
;
DLOG
<<
"cmd_param8 = "
<<
hex
<<
cmd_param8
;
DLOG
<<
"cmd_ifm_flt_quant_scale = "
<<
hex
<<
cmd_ifm_flt_quant_scale
;
DLOG
<<
"cmd_step_pad_mul_row_len = "
<<
hex
<<
cmd_step_pad_mul_row_len
;
DLOG
<<
"cmd_ifm_pack_byte_length = "
<<
hex
<<
cmd_ifm_pack_byte_length
;
DLOG
<<
"cmd_conv_param_reg = "
<<
hex
<<
cmd_conv_param_reg
;
DLOG
<<
"cmd_ofm_addr_width_reg = "
<<
hex
<<
cmd_ofm_addr_width_reg
;
DLOG
<<
"cmd_intra_stride_atoms_reg = "
<<
hex
<<
cmd_intra_stride_atoms_reg
;
DLOG
<<
"cmd_init_raddr_reg = "
<<
hex
<<
cmd_init_raddr_reg
;
DLOG
<<
"cmd_mult_factor = "
<<
hex
<<
cmd_mult_factor
;
DLOG
<<
"cmd_wdma_param_reg = "
<<
hex
<<
cmd_wdma_param_reg
;
DLOG
<<
"cmd_para31 = "
<<
hex
<<
cmd_para31
;
reg_writeq
(
cmd_ifm_flt_base_addr
,
MUL8
(
1
));
reg_writeq
(
cmd_scale_base_addr
,
MUL8
(
2
));
reg_writeq
(
cmd_ifm_flt_dim
,
MUL8
(
3
));
reg_writeq
(
cmd_pad_step_size
,
MUL8
(
4
));
reg_writeq
(
cmd_param1
,
MUL8
(
5
));
reg_writeq
(
cmd_param2
,
MUL8
(
6
));
reg_writeq
(
cmd_param3
,
MUL8
(
7
));
reg_writeq
(
cmd_param4
,
MUL8
(
8
));
reg_writeq
(
cmd_param5
,
MUL8
(
9
));
reg_writeq
(
cmd_param6
,
MUL8
(
10
));
reg_writeq
(
cmd_param7
,
MUL8
(
11
));
reg_writeq
(
cmd_param8
,
MUL8
(
12
));
reg_writeq
(
cmd_ifm_flt_quant_scale
,
MUL8
(
13
));
reg_writeq
(
cmd_step_pad_mul_row_len
,
MUL8
(
14
));
reg_writeq
(
cmd_ifm_pack_byte_length
,
MUL8
(
15
));
reg_writeq
(
cmd_conv_param_reg
,
MUL8
(
16
));
reg_writeq
(
cmd_ofm_addr_width_reg
,
MUL8
(
17
));
reg_writeq
(
cmd_intra_stride_atoms_reg
,
MUL8
(
18
));
reg_writeq
(
cmd_init_raddr_reg
,
MUL8
(
29
));
reg_writeq
(
cmd_para31
,
MUL8
(
31
));
reg_writeq
(
0
,
MUL8
(
19
));
for
(
int
i
=
0
;
i
<
height_batch_num
+
1
;
i
++
)
{
conv_ofm_height_batch_tmp
=
int
((
conv_ofm_height_batch
[
i
]
+
1
)
/
2
)
-
1
;
// NOLINT
cmd_ofm_height_batch_reg
=
((
uint64_t
)(
conv_ofm_buf_col_len_rem
&
0xffff
)
<<
48
)
|
((
uint64_t
)(
conv_ofm_buf_col_len
&
0xffff
)
<<
32
)
|
((
uint64_t
)
conv_ofm_height_batch_tmp
+
0x80000000
);
reg_writeq
(
cmd_ofm_height_batch_reg
,
MUL8
(
19
));
reg_writeq
(
cmd_ofm_height_batch_reg
&
0xffffffff00000000
,
MUL8
(
19
));
usleep
(
1
);
}
reg_writeq
(
cmd_wdma_param_reg
,
MUL8
(
25
));
DLOG
<<
"cmd_ofm_height_batch_reg = "
<<
hex
<<
cmd_ofm_height_batch_reg
;
/******************************************************************/
reg_writeq
(
cmd_mult_factor
,
MUL8
(
30
));
/******************************************************************/
reg_writeq
(
0
,
MUL8
(
0
));
reg_writeq
(
0x2100000000000000
,
MUL8
(
0
));
int
ret
=
fpga_regpoll
(
MUL8
(
48
),
CONV_DONE
,
0xffffff
);
if
(
ret
==
-
1
)
{
DLOG
<<
"fpga EW no interrupt!!"
;
return
ret
;
}
reg_readq
(
MUL8
(
63
));
usleep
(
10
);
// get max value
float
scale
=
Findfp16Max
();
(
args
.
output
.
scale_address
)[
0
]
=
scale
;
// NOLINT
(
args
.
output
.
scale_address
)[
1
]
=
(
float
)(
1.0
/
scale
);
// NOLINT
DLOG
<<
"Findfp16Max scale = "
<<
scale
;
DLOG
<<
"ret="
<<
ret
;
return
ret
;
}
int
PerformBypass
(
const
struct
BypassArgs
&
args
)
{
...
...
@@ -166,60 +1598,63 @@ int PerformBypass(const struct BypassArgs &args) {
return
0
;
#endif
// uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address);
// uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address);
// uint64_t bp_enable;
// int64_t length;
// uint64_t pixels;
//
// // fp32->fp16
// if ((args.input_data_type) && (!args.output_data_type)) {
// pixels = (args.image.channels) * (args.image.width) *
// (args.image.height); length = pixels * sizeof(float); bp_enable =
// 0x8800000000000000 + length;
// }
// // fp16->fp32
// else if ((!args.input_data_type) && (args.output_data_type)) {
// pixels = filter::calc_aligned_channel((args.image.channels)) *
// (args.image.width) * (args.image.height);
// length = pixels * sizeof(short);
// length = align_to_x((int)length, 64); // NOLINT
// bp_enable = 0x8a00000000000000 + length;
// }
// // fp16->fp16 findmax
// else if ((!args.input_data_type) && (!args.output_data_type)) {
// pixels = (args.image.channels) * (args.image.width) *
// (args.image.height); length = pixels * sizeof(short); bp_enable =
// 0x8900000000000000 + length;
// } else {
// return -1;
// }
//
// // start bypass
// driver::reg_writeq(ifm_src_paddr, MUL8(27));
// driver::reg_writeq(ifm_dst_paddr, MUL8(28));
// driver::reg_writeq(0, MUL8(0));
// driver::reg_writeq(bp_enable, MUL8(0));
// // poll
// int ret = -1;
// ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
// if (ret != -1) {
// // clear "irq"
// driver::reg_readq(MUL8(63));
// }
// // get max value
// if ((!args.input_data_type) && (!args.output_data_type)) {
// float scale = Findfp16Max();
// args.output.scale_address[0] = (float)(1.0 / scale); // NOLINT
// args.output.scale_address[1] = scale;
// }
// return ret;
uint64_t
ifm_src_paddr
=
vaddr_to_paddr
(
args
.
image
.
address
);
uint64_t
ifm_dst_paddr
=
vaddr_to_paddr
(
args
.
output
.
address
);
uint64_t
bp_enable
;
int64_t
length
;
uint64_t
pixels
;
// fp32->fp16
if
((
args
.
input_data_type
)
&&
(
!
args
.
output_data_type
))
{
DLOG
<<
"fp32-fp16"
;
pixels
=
(
args
.
image
.
channels
)
*
(
args
.
image
.
width
)
*
(
args
.
image
.
height
);
length
=
pixels
*
sizeof
(
float
);
bp_enable
=
0x8800000000000000UL
+
(
uint64_t
)
length
;
}
// fp16->fp32
else
if
((
!
args
.
input_data_type
)
&&
(
args
.
output_data_type
))
{
// NOLINT
DLOG
<<
"fp16-fp32"
;
pixels
=
filter
::
calc_aligned_channel
((
args
.
image
.
channels
))
*
(
args
.
image
.
width
)
*
(
args
.
image
.
height
);
length
=
pixels
*
sizeof
(
short
);
// NOLINT
length
=
align_to_x
((
int
)
length
,
64
);
// NOLINT
bp_enable
=
0x8a00000000000000UL
+
length
;
}
// fp16->fp16 findmax
else
if
((
!
args
.
input_data_type
)
&&
(
!
args
.
output_data_type
))
{
// NOLINT
DLOG
<<
"16-16"
;
pixels
=
(
args
.
image
.
channels
)
*
(
args
.
image
.
width
)
*
(
args
.
image
.
height
);
length
=
pixels
*
sizeof
(
short
);
// NOLINT
bp_enable
=
0x8900000000000000
+
length
;
}
else
{
return
-
1
;
}
// start bypass
reg_writeq
(
0
,
MUL8
(
0
));
reg_writeq
(
ifm_src_paddr
,
MUL8
(
27
));
reg_writeq
(
ifm_dst_paddr
,
MUL8
(
28
));
reg_writeq
(
bp_enable
,
MUL8
(
0
));
int
ret
=
-
1
;
ret
=
fpga_regpoll
(
MUL8
(
48
),
BYPASS_DONE
,
0xffffff
);
if
(
ret
!=
-
1
)
{
DLOG
<<
"test done"
;
}
reg_readq
(
MUL8
(
63
));
usleep
(
10
);
// get max value
float
scale
=
Findfp16Max
();
args
.
output
.
scale_address
[
0
]
=
scale
;
// NOLINT
args
.
output
.
scale_address
[
1
]
=
(
float
)(
1.0
/
scale
);
// NOLINT
DLOG
<<
"ret="
<<
ret
;
return
ret
;
}
int
ComputeFPGAConcat
(
const
struct
ConcatArgs
&
args
)
{
#ifdef FPGA_PRINT_MODE
DLOG
<<
"=============ComputeFpgaConcat==========="
;
DLOG
<<
" Image_num: "
<<
args
.
image_num
<<
" out_address:"
<<
args
.
image_out
<<
" out_scale_address:"
<<
args
.
scale_out
<<
" out_channel:"
<<
args
.
out_channel
;
...
...
src/fpga/common/fpga_common.cpp
100644 → 100755
浏览文件 @
7a8b998f
...
...
@@ -113,6 +113,12 @@ int fpga_invalidate(void *address, size_t size) {
return
0
;
#endif
}
// Wrapper around the driver-level address lookup.
// When PADDLE_MOBILE_ZU5 is defined the call is forwarded to
// driver::vaddr_to_paddr and its result is returned; in every other build
// configuration the argument is ignored and 0 is returned.
uint64_t vaddr_to_paddr(void *address) {
#ifndef PADDLE_MOBILE_ZU5
  return 0;
#else
  return driver::vaddr_to_paddr(address);
#endif
}
}
// namespace fpga
}
// namespace paddle_mobile
src/operators/kernel/fpga/V2/feed_kernel.cpp
浏览文件 @
7a8b998f
...
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/feed_kernel.h"
#include "fpga/V2/filter.h"
namespace
paddle_mobile
{
namespace
operators
{
...
...
@@ -24,7 +24,6 @@ bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
fpga
::
format_fp16_ofm
(
output
,
aligned_channel
);
return
true
;
}
template
<
>
void
FeedKernel
<
FPGA
,
float
>::
Compute
(
const
FeedParam
<
FPGA
>
&
param
)
{
auto
input
=
...
...
@@ -33,6 +32,9 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> ¶m) {
auto
input_ptr
=
input
->
data
<
float
>
();
Tensor
*
output
=
param
.
Out
();
auto
output_ptr
=
output
->
data
<
float
>
();
auto
channel
=
input
->
dims
()[
1
];
uint32_t
aligned_channels
=
fpga
::
filter
::
calc_aligned_channel
((
int
)
channel
);
// NOLINT
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP32
};
...
...
@@ -41,7 +43,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> ¶m) {
args
.
input_layout_type
=
fpga
::
LAYOUT_CHW
;
args
.
output_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
image
.
address
=
reinterpret_cast
<
void
*>
(
input_ptr
);
args
.
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
]
;
args
.
image
.
channels
=
aligned_channels
;
args
.
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
args
.
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
args
.
image
.
pad_height
=
0
;
...
...
src/operators/kernel/fpga/V2/softmax_kernel.cpp
100644 → 100755
浏览文件 @
7a8b998f
...
...
@@ -25,7 +25,7 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
auto
input_ptr
=
input
->
data
<
float
>
();
auto
float_input
=
new
Tensor
;
float_input
->
mutable_data
<
float
>
({
1
,
input
->
dims
()[
1
]});
fpga
::
format_fp32_ofm
(
float_input
,
8
);
fpga
::
format_fp32_ofm
(
float_input
,
1024
);
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP16
};
args
.
input_layout_type
=
fpga
::
LAYOUT_HWC
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录