Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
8ab9036e
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8ab9036e
编写于
12月 30, 2018
作者:
qnqinan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add dwconv compute function and update some files to support FPGA dwconv
上级
19308114
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
186 addition
and
37 deletion
+186
-37
src/fpga/V1/bias_scale.cpp
src/fpga/V1/bias_scale.cpp
+6
-11
src/fpga/V1/filter.cpp
src/fpga/V1/filter.cpp
+14
-23
src/fpga/V1/filter.h
src/fpga/V1/filter.h
+1
-1
src/fpga/V1/pe.cpp
src/fpga/V1/pe.cpp
+162
-0
src/fpga/common/pe.h
src/fpga/common/pe.h
+1
-1
src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp
src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp
+1
-1
src/operators/kernel/fpga/V1/reshape_kernel.cpp
src/operators/kernel/fpga/V1/reshape_kernel.cpp
+1
-0
未找到文件。
src/fpga/V1/bias_scale.cpp
浏览文件 @
8ab9036e
...
...
@@ -86,20 +86,15 @@ void format_bias_array(float **bias_array, int num) {
float
*
ptr_unaligned
=
*
bias_array
;
int
num_before_align
=
num
;
int
num_after_align
=
align_to_x
(
num_before_align
,
BIAS_NUM_ALIGNMENT
);
floa
t
*
ptr_aligned
=
(
float
*
)
fpga_malloc
(
num_after_align
*
sizeof
(
floa
t
));
// NOLINT
int16_
t
*
ptr_aligned
=
(
int16_t
*
)
fpga_malloc
(
num_after_align
*
sizeof
(
int16_
t
));
// NOLINT
memset
(
ptr_aligned
,
0
,
num_after_align
*
sizeof
(
float
));
if
(
num
<
16
)
{
memcpy
(
ptr_aligned
,
ptr_unaligned
,
num
*
sizeof
(
float
));
for
(
int
i
=
num
;
i
<
num_after_align
;
i
++
)
{
ptr_aligned
[
i
]
=
ptr_unaligned
[
i
%
num
];
}
}
else
{
memcpy
(
ptr_aligned
,
ptr_unaligned
,
num
*
sizeof
(
float
));
memset
(
ptr_aligned
,
0
,
num_after_align
*
sizeof
(
int16_t
));
for
(
int
i
=
0
;
i
<
num_before_align
;
i
++
)
{
ptr_aligned
[
i
]
=
fp32_2_fp16
(
ptr_unaligned
[
i
]);
}
*
bias_array
=
(
float
*
)
ptr_aligned
;
// NOLINT
fpga_free
(
ptr_unaligned
);
*
bias_array
=
ptr_aligned
;
}
}
// namespace bias_scale
...
...
src/fpga/V1/filter.cpp
100644 → 100755
浏览文件 @
8ab9036e
...
...
@@ -292,34 +292,25 @@ void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
fpga_free
(
tmp
);
}
void
align_element_n
w
(
int16_t
**
data_in
,
int
num
,
int
height
,
int
width
)
{
int
unalign_n
w
=
num
*
width
;
int
align_n
w
=
align_to_x
(
num
*
width
,
FILTER_ELEMENT_ALIGNMENT
);
if
(
unalign_n
w
==
align_nw
)
{
void
align_element_n
(
int16_t
**
data_in
,
int
num
,
int
height
,
int
width
)
{
int
unalign_n
=
num
;
int
align_n
=
align_to_x
(
num
,
FILTER_ELEMENT_ALIGNMENT
);
if
(
unalign_n
==
align_n
)
{
return
;
}
else
{
int16_t
*
tmp
=
*
data_in
;
int
num_element
=
height
*
align_nw
;
int
num_element
=
height
*
width
*
align_n
;
int16_t
*
data_tmp
=
(
int16_t
*
)
fpga_malloc
(
num_element
*
sizeof
(
int16_t
));
// NOLINT
memset
(
data_tmp
,
0
,
num_element
*
sizeof
(
int16_t
));
if
(
unalign_nw
>=
FILTER_ELEMENT_ALIGNMENT
)
{
for
(
int
h
=
0
;
h
<
height
;
h
++
)
{
int
offset_unalign
=
h
*
unalign_nw
;
int
offset_align
=
h
*
align_nw
;
for
(
int
nw
=
0
;
nw
<
unalign_nw
;
nw
++
)
{
data_tmp
[
offset_align
+
nw
]
=
*
((
*
data_in
)
+
offset_unalign
+
nw
);
}
}
}
else
{
for
(
int
h
=
0
;
h
<
height
;
h
++
)
{
int
offset_unalign
=
h
*
unalign_nw
;
int
offset_align
=
h
*
align_nw
;
for
(
int
nw
=
0
;
nw
<
align_nw
;
nw
++
)
{
data_tmp
[
offset_align
+
nw
]
=
*
((
*
data_in
)
+
offset_unalign
+
nw
%
unalign_nw
);
for
(
int
h
=
0
;
h
<
height
;
h
++
)
{
for
(
int
w
=
0
;
w
<
width
;
w
++
)
{
int
offset_unalign
=
h
*
width
*
unalign_n
+
w
*
unalign_n
;
int
offset_align
=
h
*
width
*
align_n
+
w
*
align_n
;
for
(
int
n
=
0
;
n
<
unalign_n
;
n
++
)
{
data_tmp
[
offset_align
+
n
]
=
*
((
*
data_in
)
+
offset_unalign
+
n
);
}
}
}
...
...
@@ -351,9 +342,9 @@ void format_dwconv_filter(float **data_in, int num, int height, int width,
quantize_to_fp16
(
data_in
,
num
,
height
,
width
,
scale_ptr
);
int16_t
**
quantize_data
=
(
int16_t
**
)
data_in
;
// NOLINT
convert_to_hwn
(
quantize_data
,
num
,
height
,
width
);
align_element_n
w
(
quantize_data
,
num
,
height
,
width
);
fpga_flush
(
*
quantize_data
,
align_to_x
(
num
*
width
,
FILTER_ELEMENT_ALIGNMENT
)
*
height
*
sizeof
(
char
));
align_element_n
(
quantize_data
,
num
,
height
,
width
);
fpga_flush
(
*
quantize_data
,
align_to_x
(
num
,
FILTER_ELEMENT_ALIGNMENT
)
*
height
*
width
*
sizeof
(
int16_t
));
}
}
// namespace filter
}
// namespace fpga
...
...
src/fpga/V1/filter.h
100644 → 100755
浏览文件 @
8ab9036e
...
...
@@ -39,7 +39,7 @@ void format_fc_filter(float** data_in, int num, int channel, int height,
int
width
,
int
group_num
,
float
max
);
void
convert_to_hwn
(
int16_t
**
data_in
,
int
num
,
int
height
,
int
width
);
void
align_element_n
w
(
int16_t
**
data_in
,
int
num
,
int
height
,
int
width
);
void
align_element_n
(
int16_t
**
data_in
,
int
num
,
int
height
,
int
width
);
void
quantize_to_fp16
(
float
**
data_in
,
int
num
,
int
height
,
int
width
,
float
*
scale_ptr
);
void
format_dwconv_filter
(
float
**
data_in
,
int
num
,
int
height
,
int
width
,
...
...
src/fpga/V1/pe.cpp
浏览文件 @
8ab9036e
...
...
@@ -159,6 +159,12 @@ using namespace std; // NOLINT
#define REG_EW_IMAGE_PIXEL 0x0F30
#define REG_EW_IMAGE_AMOUNT_PER_ROW 0x0F38
/*dwconv*/
#define REG_DWCONV_FILTER_BASE_ADDR 0xe08
#define REG_DWCONV_FILTER_SHAPE 0xe10
#define REG_DWCONV_FILTER_N_ALIGN 0xe18
#define REG_DWCONV_CMD 0xe00
int
ComputeFpgaConv
(
const
struct
SplitConvArgs
&
args
)
{
// ComputeBasicConv(args.conv_arg[0]);
#ifdef FPGA_PRINT_MODE
...
...
@@ -746,6 +752,162 @@ int ComputeFPGASplit(const struct SplitArgs &args) {
args
.
height
,
args
.
width
);
return
0
;
}
// ComputeFPGASplit
int
ComputeDWConv
(
const
struct
DWconvArgs
&
args
)
{
#ifdef FPGA_PRINT_MODE
DLOG
<<
"=============ComputeDWConv==========="
;
DLOG
<<
" mode:"
<<
args
.
relu_enabled
;
DLOG
<<
" image_address:"
<<
args
.
image
.
address
<<
" image_scale_address:"
<<
args
.
image
.
scale_address
<<
" image_channels:"
<<
args
.
image
.
channels
<<
" image_height:"
<<
args
.
image
.
height
<<
" image_width:"
<<
args
.
image
.
width
<<
" pad_height:"
<<
args
.
image
.
pad_height
<<
" pad_width:"
<<
args
.
image
.
pad_width
;
DLOG
<<
" filter_address:"
<<
args
.
filter_address
<<
" bias_address:"
<<
args
.
bias_address
;
DLOG
<<
" kernel_height:"
<<
args
.
kernel
.
height
<<
" kernel_width:"
<<
args
.
kernel
.
width
<<
" stride_h:"
<<
args
.
kernel
.
stride_h
<<
" stride_w:"
<<
args
.
kernel
.
stride_w
;
DLOG
<<
" out_address:"
<<
args
.
output
.
address
<<
" out_scale_address:"
<<
args
.
output
.
scale_address
;
#endif
#ifdef PADDLE_MOBILE_ZU5
DLOG
<<
"DWConv"
;
// return 0;
uint64_t
output_scale
=
0
;
uint64_t
timer_cnt
=
0
;
int
ret
=
0
;
uint64_t
cmd
=
args
.
relu_enabled
;
uint64_t
image_physical_address
=
0
;
uint64_t
output_physical_address
=
0
;
uint64_t
filter_physical_address
=
0
;
uint64_t
bias_physical_address
=
0
;
image_physical_address
=
vaddr_to_paddr
(
args
.
image
.
address
);
output_physical_address
=
vaddr_to_paddr
(
args
.
output
.
address
);
filter_physical_address
=
vaddr_to_paddr
(
args
.
filter_address
);
bias_physical_address
=
vaddr_to_paddr
(
args
.
bias_address
);
uint64_t
filter_N_align
=
align_to_x
((
uint64_t
)
args
.
image
.
channels
,
IMAGE_ALIGNMENT
);
uint64_t
filter_amount_per_row_align
=
filter_N_align
*
(
uint64_t
)
args
.
kernel
.
width
;
uint64_t
filter_amount_align
=
filter_N_align
*
(
uint64_t
)
args
.
kernel
.
width
*
(
uint64_t
)
args
.
kernel
.
height
;
uint32_t
output_height
=
(
uint32_t
)(
(
args
.
image
.
height
+
args
.
image
.
pad_height
*
2
-
args
.
kernel
.
height
)
/
args
.
kernel
.
stride_h
+
1
);
uint32_t
output_width
=
(
uint32_t
)(
(
args
.
image
.
width
+
args
.
image
.
pad_width
*
2
-
args
.
kernel
.
width
)
/
args
.
kernel
.
stride_w
+
1
);
uint64_t
image_amount_per_row
=
align_to_x
((
uint64_t
)
args
.
image
.
width
*
(
uint64_t
)
args
.
image
.
channels
,
IMAGE_ALIGNMENT
);
uint64_t
image_one_pad_per_row
=
align_to_x
((
uint64_t
)
args
.
image
.
width
*
(
uint64_t
)
args
.
image
.
channels
,
FILTER_ELEMENT_ALIGNMENT
)
+
(
uint64_t
)
args
.
image
.
pad_width
*
(
uint64_t
)
args
.
image
.
channels
;
uint64_t
image_two_pad_per_row
=
align_to_x
(
((
uint64_t
)
args
.
image
.
width
+
(
uint64_t
)
args
.
image
.
pad_width
*
2
)
*
(
uint64_t
)
args
.
image
.
channels
,
IMAGE_ALIGNMENT
);
uint64_t
image_row_mul_pooling_hight
=
image_amount_per_row
*
(
uint64_t
)
args
.
kernel
.
height
;
uint64_t
image_row_mul_pad_hight
=
image_amount_per_row
*
(
uint64_t
)
args
.
image
.
pad_height
;
uint64_t
image_row_mul_step_hight
=
image_amount_per_row
*
(
uint64_t
)
args
.
kernel
.
stride_h
;
uint64_t
result_amount_align_32
=
align_to_x
((
uint64_t
)
output_width
*
(
uint64_t
)
args
.
image
.
channels
,
FILTER_ELEMENT_ALIGNMENT
);
uint64_t
result_amount_align_64
=
align_to_x
(
(
uint64_t
)
output_width
*
(
uint64_t
)
args
.
image
.
channels
,
IMAGE_ALIGNMENT
);
uint64_t
image_calcu_height
=
(
uint64_t
)
args
.
kernel
.
height
+
((
uint64_t
)
output_height
-
1
)
*
(
uint64_t
)
args
.
kernel
.
stride_h
;
uint64_t
image_pad_left
=
args
.
image
.
channels
*
args
.
image
.
pad_width
;
uint64_t
image_skip_window
=
args
.
image
.
channels
*
args
.
kernel
.
stride_w
;
uint64_t
image_padleft_skipwindow
=
(
image_skip_window
<<
32
)
|
image_pad_left
;
pthread_mutex_lock
(
&
g_fpgainfo
.
pe_data
->
mutex
);
if
(
ERROR
==
g_fpgainfo
.
pe_data
->
pes
[
PE_IDX_POOLING
]
->
status
)
{
ret
=
-
EIO
;
DLOG
<<
"Conv Status Error!"
;
pthread_mutex_unlock
(
&
g_fpgainfo
.
pe_data
->
mutex
);
return
ret
;
}
/*restart scale*/
reg_writeq
(
output_scale
,
REG_SCALE_PARAMETER
);
reg_writeq
(
image_physical_address
,
REG_POOLING_IMAGE_BASE_ADDR
);
reg_writeq
(
output_physical_address
,
REG_POOLING_RESULT_BASE_ADDR
);
reg_writeq
((
bias_physical_address
<<
32
|
filter_physical_address
),
REG_DWCONV_FILTER_BASE_ADDR
);
reg_writeq
(
filter_amount_per_row_align
|
(
filter_amount_align
<<
32
),
REG_DWCONV_FILTER_SHAPE
);
reg_writeq
(
filter_N_align
,
REG_DWCONV_FILTER_N_ALIGN
);
reg_writeq
(
((
uint64_t
)
args
.
image
.
height
)
|
(((
uint64_t
)
args
.
image
.
width
)
<<
32
),
REG_POOLING_IMAGE_PIXEL
);
reg_writeq
(
((
uint64_t
)
args
.
kernel
.
height
)
|
(((
uint64_t
)
args
.
kernel
.
width
)
<<
32
),
REG_POOLING_WINDOW_SIZE
);
reg_writeq
(((
uint64_t
)
output_height
)
|
(((
uint64_t
)
output_width
)
<<
32
),
REG_POOLING_RESULT_PIXEL
);
reg_writeq
(((
uint64_t
)
args
.
image
.
pad_height
)
|
(((
uint64_t
)
args
.
image
.
pad_width
)
<<
32
),
REG_POOLING_PAD_PIXEL
);
reg_writeq
(((
uint64_t
)
args
.
kernel
.
stride_h
)
|
(((
uint64_t
)
args
.
kernel
.
stride_w
)
<<
32
),
REG_POOLING_STEP_PIXEL
);
reg_writeq
((
uint64_t
)
args
.
image
.
channels
,
REG_POOLING_CHANNEL_NUMBER
);
reg_writeq
(
image_amount_per_row
,
REG_POOLING_IMAGE_AMOUNT_PER_ROW
);
reg_writeq
(
image_one_pad_per_row
,
REG_POOLING_IMAGE_ONE_PAD_PER_ROW
);
reg_writeq
(
image_two_pad_per_row
,
REG_POOLING_IMAGE_TWO_PAD_PER_ROW
);
reg_writeq
(
image_row_mul_pooling_hight
,
REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT
);
reg_writeq
(
image_row_mul_pad_hight
,
REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT
);
reg_writeq
(
image_row_mul_step_hight
,
REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT
);
reg_writeq
(
result_amount_align_32
,
REG_POOLING_RESULT_AMOUNT_ALIGN_32
);
reg_writeq
(
result_amount_align_64
,
REG_POOLING_RESULT_AMOUNT_ALIGN_64
);
reg_writeq
(
image_calcu_height
,
REG_POOLING_IMAGE_CALCU_HEIGHT
);
reg_writeq
(
image_padleft_skipwindow
,
REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW
);
/*SDK刷Cache保证数据一致性*/
reg_writeq
(
cmd
,
REG_DWCONV_CMD
);
DLOG
<<
"before reg poll"
;
if
(
0
!=
fpga_regpoll
(
REG_INTERRUPT
,
INTERRUPT_POOLING
,
PE_IRQ_TIMEOUT
))
{
g_fpgainfo
.
pe_data
->
pes
[
PE_IDX_POOLING
]
->
status
=
ERROR
;
ret
=
-
EIO
;
DLOG
<<
"Pooling Wait Irq Timeout!"
;
}
DLOG
<<
"after reg poll"
;
// *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
output_scale
=
reg_readq
(
REG_SCALE_PARAMETER
);
output_scale
=
(
output_scale
<<
32
)
|
(
output_scale
>>
32
);
fpga_copy
(
args
.
output
.
scale_address
,
&
output_scale
,
sizeof
(
float
)
*
2
);
pthread_mutex_unlock
(
&
g_fpgainfo
.
pe_data
->
mutex
);
return
ret
;
#endif
return
0
;
}
}
// namespace fpga
}
// namespace paddle_mobile
src/fpga/common/pe.h
浏览文件 @
8ab9036e
...
...
@@ -27,6 +27,6 @@ int ComputeFpgaConv(const struct SplitConvArgs& args);
int
ComputeFPGAConcat
(
const
struct
ConcatArgs
&
args
);
int
ComputeFPGASplit
(
const
struct
SplitArgs
&
args
);
int
ComputeFpgaDeconv
(
const
struct
DeconvArgs
&
args
);
int
ComputeDWConv
(
const
struct
DWconvArgs
&
args
);
}
// namespace fpga
}
// namespace paddle_mobile
src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp
浏览文件 @
8ab9036e
...
...
@@ -83,7 +83,7 @@ template <>
void
ConvAddBNReluKernel
<
FPGA
,
float
>::
Compute
(
const
FusionConvAddBNReluParam
<
FPGA
>
&
param
)
{
if
(
param
.
Groups
()
==
param
.
Output
()
->
dims
()[
1
])
{
// fpga::ComputeFpga
Conv(param.FpgaDwconvArgs());
fpga
::
ComputeDW
Conv
(
param
.
FpgaDwconvArgs
());
}
else
{
fpga
::
ComputeFpgaConv
(
param
.
FpgaArgs
());
}
...
...
src/operators/kernel/fpga/V1/reshape_kernel.cpp
浏览文件 @
8ab9036e
...
...
@@ -21,6 +21,7 @@ namespace operators {
template
<
>
bool
ReshapeKernel
<
FPGA
,
float
>::
Init
(
ReshapeParam
<
FPGA
>
*
param
)
{
param
->
Out
()
->
ShareDataWith
(
*
param
->
InputX
());
return
true
;
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录