Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
c2a649d3
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
337
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
c2a649d3
编写于
1月 21, 2019
作者:
qnqinan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add dw deconv with group in FPGA track
上级
c5ad3169
变更
12
隐藏空白更改
内联
并排
Showing
12 changed file
with
414 addition
and
46 deletion
+414
-46
src/fpga/V1/api.cpp
src/fpga/V1/api.cpp
+139
-0
src/fpga/V1/api.h
src/fpga/V1/api.h
+8
-0
src/fpga/V1/deconv_filter.cpp
src/fpga/V1/deconv_filter.cpp
+35
-22
src/fpga/V1/deconv_filter.h
src/fpga/V1/deconv_filter.h
+6
-2
src/fpga/V1/filter.cpp
src/fpga/V1/filter.cpp
+10
-0
src/fpga/V1/pe.cpp
src/fpga/V1/pe.cpp
+146
-5
src/fpga/common/fpga_common.h
src/fpga/common/fpga_common.h
+14
-0
src/fpga/common/pe.h
src/fpga/common/pe.h
+3
-0
src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
+22
-8
src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
+23
-8
src/operators/kernel/fpga/V1/softmax_kernel.cpp
src/operators/kernel/fpga/V1/softmax_kernel.cpp
+1
-1
src/operators/op_param.h
src/operators/op_param.h
+7
-0
未找到文件。
src/fpga/V1/api.cpp
浏览文件 @
c2a649d3
...
...
@@ -151,6 +151,30 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
filter_tensor
->
reset_data_ptr
(
new_data
);
}
// Rebuilds the weights of a depthwise-deconvolution filter tensor through
// deconv_filter::deconv_NC_convert and deconv_filter::DWDconv_format_filter,
// then installs the converted buffer as the tensor's data pointer.
//
// filter_tensor: tensor whose dims()[0..3] are read; its data pointer is
//                replaced through reset_data_ptr().
// scale_ptr:     forwarded unchanged to DWDconv_format_filter.
// stride:        forwarded unchanged to DWDconv_format_filter.
void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
                           int stride) {
  auto filter_dims = filter_tensor->dims();
  auto dim0 = filter_dims[0];
  auto filter_h = filter_dims[2];
  auto filter_w = filter_dims[3];
  auto src = filter_tensor->data<float>();

  // The conversion works on a private copy of dim0 * height * width floats;
  // dims()[1] does not enter the size computation.
  size_t byte_count = dim0 * filter_h * filter_w * sizeof(float);
  auto converted = static_cast<float *>(fpga_malloc(byte_count));
  fpga_copy(converted, src, byte_count);

  int plane = filter_h * filter_w;
  deconv_filter::deconv_NC_convert(&converted, dim0, 1, plane);

  // From here on dims()[1] is passed as `num` and dims()[0] as `channel`.
  auto dim1 = filter_dims[1];
  int channel = filter_dims[0];
  deconv_filter::DWDconv_format_filter(&converted, dim1, channel, filter_h,
                                       filter_w, scale_ptr, stride);

  // The tensor keeps its original dims; only the data pointer changes.
  filter_tensor->reset_data_ptr(converted);
}
void
format_fc_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
)
{
filter_tensor
->
scale
[
0
]
=
float
(
max_value
/
127.0
);
// NOLINT
filter_tensor
->
scale
[
1
]
=
float
(
127.0
/
max_value
);
// NOLINT
...
...
@@ -243,6 +267,17 @@ void format_dwconv_data(framework::Tensor *filter_tensor,
format_bias_array
(
bias_ptr
,
channel
);
format_fp16_ofm
(
ofm_tensor
);
}
// Prepares filter, bias/scale buffer and output tensor for a depthwise
// deconvolution.
//
// filter_tensor: passed to format_DWDconv_filter.
// ofm_tensor:    its dims()[1] is used as the channel count; it is then
//                passed to format_fp16_ofm.
// bs_ptr:        address of the bias/scale buffer pointer; passed to
//                format_bias_array after the filter has been formatted.
// group:         not used by this function.
// sub_conv_n:    passed to format_DWDconv_filter as its stride argument.
void format_DWDeconv_data(framework::Tensor *filter_tensor,
                          framework::Tensor *ofm_tensor, float **bs_ptr,
                          int group, int sub_conv_n) {
  int channel = ofm_tensor->dims()[1];

  // The scale pointer handed to the filter formatter starts
  // sub_conv_n * channel floats past the beginning of *bs_ptr.
  float *filter_scale = *bs_ptr + sub_conv_n * channel;
  format_DWDconv_filter(filter_tensor, filter_scale, sub_conv_n);

  format_bias_array(bs_ptr, channel);
  format_fp16_ofm(ofm_tensor);
}
void
expand_conv_arg
(
ConvArgs
*
arg
)
{
ConvArgs
args
=
*
arg
;
...
...
@@ -788,5 +823,109 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
arg
->
output
.
scale_address
=
out
->
scale
;
}
// end dwconv arg fill
void
fill_DWDeconv_arg
(
struct
DWDeconvArgs
*
arg
,
framework
::
Tensor
*
input
,
framework
::
Tensor
*
out
,
framework
::
Tensor
*
filter
,
bool
relu_enabled
,
int
stride_h
,
int
stride_w
,
int
padding_h
,
int
padding_w
,
float
*
bias_ptr
)
{
auto
filter_ptr
=
filter
->
data
<
float
>
();
auto
input_ptr
=
input
->
data
<
float
>
();
auto
output_ptr
=
out
->
mutable_data
<
float
>
();
auto
deleter
=
[](
void
*
p
)
{
fpga_free
(
p
);
};
arg
->
group_num
=
(
uint32_t
)
filter
->
dims
()[
0
];
arg
->
sub_conv_num
=
(
uint32_t
)
stride_w
;
arg
->
filter_num
=
(
uint32_t
)
filter
->
dims
()[
0
];
int
sub_conv_num
=
stride_w
;
int
sub_pad
=
deconv_filter
::
deconv_calc_sub_pad
((
int
)
filter
->
dims
()[
3
],
// NOLINT
padding_w
,
stride_w
);
auto
sub_filter_width
=
(
uint32_t
)
deconv_filter
::
deconv_get_sub_filter_axis
(
(
int
)
filter
->
dims
()[
3
],
stride_w
);
// NOLINT
auto
sub_output_width
=
(
uint32_t
)
deconv_filter
::
deconv_get_sub_out_axis
(
(
int
)
input
->
dims
()[
3
],
sub_pad
,
sub_filter_width
);
// NOLINT
auto
sub_output_height
=
(
uint32_t
)
deconv_filter
::
deconv_get_sub_out_axis
(
(
int
)
input
->
dims
()[
2
],
sub_pad
,
sub_filter_width
);
// NOLINT
arg
->
sub_output_width
=
(
uint32_t
)
sub_output_width
;
arg
->
sub_output_height
=
(
uint32_t
)
sub_output_height
;
arg
->
omit_size
=
(
uint32_t
)
deconv_filter
::
deconv_get_omit
(
stride_w
,
(
int
)
filter
->
dims
()[
3
],
padding_w
);
// NOLINT
auto
sub_channels
=
(
int
)
input
->
dims
()[
1
];
// NOLINT
uint32_t
omit_size
=
arg
->
omit_size
;
int
real_out_width
=
sub_output_width
*
sub_conv_num
-
2
*
omit_size
;
int
real_out_height
=
sub_output_height
*
sub_conv_num
-
2
*
omit_size
;
int
sub_filter_num
=
sub_conv_num
*
(
arg
->
filter_num
);
framework
::
DDim
dims_out_new
=
framework
::
make_ddim
(
{
1
,
arg
->
filter_num
,
real_out_height
,
real_out_width
});
fpga
::
format_fp16_ofm
(
out
,
dims_out_new
);
auto
out_ptr
=
out
->
data
<
float
>
();
/*====For Addition
arg->output.address =
(half *)out_ptr + // NOLINT
omit_size * sizeof(half) *
(align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
*/
arg
->
output
.
address
=
out_ptr
;
arg
->
output
.
scale_address
=
out
->
scale
;
int
filter_offset
=
sub_filter_width
*
sub_filter_width
*
align_to_x
(
sub_channels
,
FILTER_ELEMENT_ALIGNMENT
)
*
arg
->
sub_conv_num
;
for
(
int
i
=
0
;
i
<
sub_conv_num
;
++
i
)
{
arg
->
dw_conv_args
.
push_back
(
std
::
make_shared
<
DWconvArgs
>
());
arg
->
dw_conv_args
[
i
]
->
sub_conv_num
=
sub_conv_num
;
arg
->
dw_conv_args
[
i
]
->
relu_enabled
=
relu_enabled
;
arg
->
dw_conv_args
[
i
]
->
bias_address
=
bias_ptr
;
arg
->
dw_conv_args
[
i
]
->
filter_address
=
fpga_malloc
(
filter_offset
*
sizeof
(
int16_t
));
memcpy
(
arg
->
dw_conv_args
[
i
]
->
filter_address
,
(
reinterpret_cast
<
half
*>
(
filter_ptr
)
+
i
*
filter_offset
),
filter_offset
*
sizeof
(
int16_t
));
arg
->
vector_dw_conv_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
dw_conv_args
[
i
]
->
filter_address
),
deleter
));
arg
->
dw_conv_args
[
i
]
->
kernel
.
height
=
(
uint32_t
)
sub_filter_width
;
arg
->
dw_conv_args
[
i
]
->
kernel
.
width
=
(
uint32_t
)
sub_filter_width
;
arg
->
dw_conv_args
[
i
]
->
kernel
.
stride_h
=
(
uint32_t
)
1
;
arg
->
dw_conv_args
[
i
]
->
kernel
.
stride_w
=
(
uint32_t
)
1
;
arg
->
dw_conv_args
[
i
]
->
image
.
address
=
input_ptr
;
arg
->
dw_conv_args
[
i
]
->
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
arg
->
dw_conv_args
[
i
]
->
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
arg
->
dw_conv_args
[
i
]
->
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
arg
->
dw_conv_args
[
i
]
->
image
.
pad_height
=
sub_pad
;
arg
->
dw_conv_args
[
i
]
->
image
.
pad_width
=
sub_pad
;
arg
->
dw_conv_args
[
i
]
->
image
.
scale_address
=
input
->
scale
;
arg
->
dw_conv_args
[
i
]
->
output
.
address
=
fpga_malloc
(
sub_output_height
*
align_to_x
(
sub_output_width
*
sub_channels
*
sub_conv_num
,
IMAGE_ALIGNMENT
)
*
sizeof
(
int16_t
));
arg
->
dw_conv_args
[
i
]
->
output
.
scale_address
=
static_cast
<
float
*>
(
fpga_malloc
(
2
*
sizeof
(
float
)));
arg
->
vector_dw_conv_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
dw_conv_args
[
i
]
->
output
.
address
),
deleter
));
arg
->
vector_dw_conv_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
dw_conv_args
[
i
]
->
output
.
scale_address
),
deleter
));
}
// arg->output.scale_address = out->scale;
}
// end dwconv arg fill
}
// namespace fpga
}
// namespace paddle_mobile
src/fpga/V1/api.h
浏览文件 @
c2a649d3
...
...
@@ -57,6 +57,10 @@ void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input,
framework
::
Tensor
*
out
,
framework
::
Tensor
*
filter
,
bool
relu_enabled
,
int
stride_h
,
int
stride_w
,
int
padding_h
,
int
padding_w
,
float
*
bias_ptr
);
void
fill_DWDeconv_arg
(
struct
DWDeconvArgs
*
arg
,
framework
::
Tensor
*
input
,
framework
::
Tensor
*
out
,
framework
::
Tensor
*
filter
,
bool
relu_enabled
,
int
stride_h
,
int
stride_w
,
int
padding_h
,
int
padding_w
,
float
*
bs_ptr
);
void
format_deconv_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
,
int
group_num
,
int
stride
);
...
...
@@ -69,6 +73,10 @@ void format_deconv_data(framework::Tensor* filter_tensor,
void
format_dwconv_data
(
framework
::
Tensor
*
filter_tensor
,
framework
::
Tensor
*
ofm_tensor
,
float
*
scale_ptr
,
float
**
bias_ptr
);
void
format_DWDeconv_data
(
framework
::
Tensor
*
filter_tensor
,
framework
::
Tensor
*
ofm_tensor
,
float
**
bs_ptr
,
int
group
,
int
sub_conv_n
);
template
<
typename
Dtype
>
void
savefile
(
std
::
string
filename
,
void
*
buffer
,
int
dataSize
,
Dtype
tmp
)
{
float
data
;
...
...
src/fpga/V1/deconv_filter.cpp
浏览文件 @
c2a649d3
...
...
@@ -21,15 +21,6 @@ limitations under the License. */
#include "fpga/V1/api.h"
// #include "fpga_api.h"
// just for test
//#include <string>
//#include "deconv.h"
//#include "deconv_api.h"
// using namespace std;
// using namespace paddle_mobile::fpga;
// using namespace baidu::fpga::deconv::api;
// namespace api = baidu::fpga::deconv::api;
namespace
paddle_mobile
{
namespace
fpga
{
namespace
deconv_filter
{
...
...
@@ -42,7 +33,8 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width,
float
*
tmp
=
*
data_in
;
int
data_size
=
num
*
channel
*
width
*
height
;
int
hw_len
=
height
*
width
;
auto
tmp_data
=
(
float
*
)
fpga_malloc
(
data_size
*
sizeof
(
float
));
auto
tmp_data
=
reinterpret_cast
<
float
*>
(
fpga_malloc
(
data_size
*
sizeof
(
float
)));
for
(
int
i
=
0
;
i
<
num
;
++
i
)
{
for
(
int
j
=
0
;
j
<
channel
;
++
j
)
{
for
(
int
k
=
0
;
k
<
hw_len
;
++
k
)
{
...
...
@@ -97,9 +89,10 @@ int deconv_get_omit(int stride, int filter_width, int pad) {
return
(
stride
-
idx
);
}
void
deconv_get_sub_filter
(
char
**
data_in
,
int
height
,
int
width
,
int
sub_conv_n
,
int
kernel_num
,
int
channel
)
{
char
*
ptr_tmp
=
*
data_in
;
template
<
typename
T
>
void
deconv_get_sub_filter
(
T
**
data_in
,
int
height
,
int
width
,
int
sub_conv_n
,
int
kernel_num
,
int
channel
)
{
T
*
ptr_tmp
=
*
data_in
;
int
sub_num
=
kernel_num
*
sub_conv_n
;
int
sub_h
=
height
/
sub_conv_n
;
int
sub_w
=
width
/
sub_conv_n
;
...
...
@@ -107,7 +100,8 @@ void deconv_get_sub_filter(char** data_in, int height, int width,
int
sub_filter_size
=
kernel_num
*
sub_h
*
sub_w
*
channel
*
sub_conv_n
*
sub_conv_n
;
char
*
ptr_sub_filter
=
(
char
*
)
fpga_malloc
(
sub_filter_size
*
sizeof
(
char
));
T
*
ptr_sub_filter
=
reinterpret_cast
<
T
*>
(
fpga_malloc
(
sub_filter_size
*
sizeof
(
T
)));
for
(
int
idx
=
0
;
idx
<
sub_conv_n
;
++
idx
)
{
for
(
int
nn
=
0
;
nn
<
sub_num
;
++
nn
)
{
int
ni
=
nn
%
kernel_num
;
...
...
@@ -124,7 +118,7 @@ void deconv_get_sub_filter(char** data_in, int height, int width,
fpga_copy
(
ptr_sub_filter
+
idx
*
sub_h
*
sub_w
*
channel
*
sub_num
+
sidx
,
(
*
data_in
)
+
kidx
,
channel
*
sizeof
(
char
));
(
*
data_in
)
+
kidx
,
channel
*
sizeof
(
T
));
// for (int cc =0; cc < channel; ++cc) {
// ptr_sub_filter[idx*sub_h*sub_w*channel*sub_num + sidx + cc] =
// (*data_in)[kidx + cc];
...
...
@@ -140,7 +134,7 @@ void deconv_get_sub_filter(char** data_in, int height, int width,
void
deconv_NC_convert
(
float
**
filter_in
,
int
kernel_num
,
int
channels
,
int
hw
)
{
float
*
tmp
=
*
filter_in
;
float
*
ptr_filter
=
(
float
*
)
(
paddle_mobile
::
fpga
::
fpga_malloc
(
float
*
ptr_filter
=
reinterpret_cast
<
float
*>
(
paddle_mobile
::
fpga
::
fpga_malloc
(
hw
*
kernel_num
*
channels
*
sizeof
(
float
)));
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
...
...
@@ -188,7 +182,8 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
result2);
}*/
deconv_get_sub_filter
(
quantize_data
,
height
,
width
,
stride
,
num
,
channel
);
deconv_get_sub_filter
<
char
>
(
quantize_data
,
height
,
width
,
stride
,
num
,
channel
);
/*{
char result2 = (char)0;
string filename = "sub_filter_filter_data";
...
...
@@ -212,10 +207,12 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
((
residual
==
0
)
?
div_num
:
(
div_num
-
1
))
+
align_to_x
(
residual
,
FILTER_NUM_ALIGNMENT
);
char
**
ptr_ptr_data
=
(
char
**
)
fpga_malloc
(
sub_conv_n
*
sizeof
(
char
*
));
char
**
ptr_ptr_data
=
reinterpret_cast
<
char
**>
(
fpga_malloc
(
sub_conv_n
*
sizeof
(
char
*
)));
int
origin_offset
=
sub_chw
*
sub_num
;
for
(
int
i
=
0
;
i
<
sub_conv_n
;
++
i
)
{
(
ptr_ptr_data
)[
i
]
=
(
char
*
)
fpga_malloc
(
origin_offset
*
sizeof
(
char
));
(
ptr_ptr_data
)[
i
]
=
reinterpret_cast
<
char
*>
(
fpga_malloc
(
origin_offset
*
sizeof
(
char
)));
fpga_copy
((
ptr_ptr_data
)[
i
],
(
*
quantize_data
)
+
origin_offset
*
i
,
origin_offset
*
sizeof
(
char
));
...
...
@@ -233,8 +230,8 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
int
align_offset
=
align_to_x
(
sub_chw
,
FILTER_ELEMENT_ALIGNMENT
)
*
num_after_alignment
;
char
*
ptr_space
=
(
char
*
)
fpga_malloc
(
sub_conv_n
*
align_offset
*
sizeof
(
char
));
// continuous space
char
*
ptr_space
=
reinterpret_cast
<
char
*>
(
fpga_malloc
(
sub_conv_n
*
align_offset
*
sizeof
(
char
)
));
// continuous space
for
(
int
i
=
0
;
i
<
sub_conv_n
;
++
i
)
{
char
*
ptr_tmp
=
(
ptr_ptr_data
)[
i
];
...
...
@@ -251,7 +248,7 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
fpga_copy
(
ptr_space
+
i
*
align_offset
,
ptr_tmp
,
align_offset
);
fpga_free
(
ptr_tmp
);
}
*
data_in
=
(
float
*
)
ptr_space
;
*
data_in
=
reinterpret_cast
<
float
*>
(
ptr_space
)
;
/* {
char result2 = (char)0;
...
...
@@ -262,6 +259,22 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
fpga_flush
(
ptr_space
,
sub_conv_n
*
align_offset
*
sizeof
(
char
));
}
// Runs the depthwise-deconvolution filter buffer through the formatting
// pipeline: inverse, fp16 quantisation, HWN conversion, sub-filter
// extraction and element alignment, followed by a cache flush.
//
// data_in:   address of the filter buffer pointer; the helpers called here
//            receive this address and may replace the buffer.
// num:       passed to deconv_inverse_filter and, as kernel_num, to
//            deconv_get_sub_filter.
// channel:   passed to every helper as its channel/num argument.
// height, width: spatial size of the filter.
// scale_ptr: forwarded to filter::quantize_to_fp16.
// stride:    passed to deconv_get_sub_filter as the sub-convolution count.
void DWDconv_format_filter(float** data_in, int num, int channel, int height,
                           int width, float* scale_ptr, int stride) {
  // deconv_inverse_filter takes (width, height) in that order.
  deconv_inverse_filter(data_in, num, channel, width, height);

  filter::quantize_to_fp16(data_in, channel, height, width, scale_ptr);
  // From here on the same buffer pointer is accessed as int16_t data.
  auto fp16_data = reinterpret_cast<int16_t**>(data_in);

  filter::convert_to_hwn(fp16_data, channel, height, width);
  deconv_get_sub_filter<int16_t>(fp16_data, height, width, stride, num,
                                 channel);
  filter::align_element_n(fp16_data, channel, height, width);

  auto flush_bytes = align_to_x(channel, FILTER_ELEMENT_ALIGNMENT) * height *
                     width * sizeof(int16_t);
  fpga_flush(*fp16_data, flush_bytes);
}
}
// namespace deconv_filter
}
// namespace fpga
}
// namespace paddle_mobile
src/fpga/V1/deconv_filter.h
浏览文件 @
c2a649d3
...
...
@@ -24,11 +24,15 @@ int deconv_calc_sub_pad(int filter_axis, int pad, int stride);
int
deconv_get_sub_filter_axis
(
int
filter_axis
,
int
stride
);
int
deconv_get_sub_out_axis
(
int
image_axis
,
int
sub_pad
,
int
sub_filter_axis
);
int
deconv_get_omit
(
int
stride
,
int
filter_width
,
int
pad
);
void
deconv_get_sub_filter
(
char
**
data_in
,
int
height
,
int
width
,
int
sub_conv_n
,
int
kernel_num
,
int
channel
);
template
<
typename
T
>
void
deconv_get_sub_filter
(
T
**
data_in
,
int
height
,
int
width
,
int
sub_conv_n
,
int
kernel_num
,
int
channel
);
void
deconv_format_filter
(
float
**
data_in
,
int
num
,
int
channel
,
int
height
,
int
width
,
int
group_num
,
float
max
,
int
stride
);
void
deconv_NC_convert
(
float
**
filter_in
,
int
kernel_num
,
int
channels
,
int
hw
);
void
DWDconv_format_filter
(
float
**
data_in
,
int
num
,
int
channel
,
int
height
,
int
width
,
float
*
scale_ptr
,
int
stride
);
}
// namespace deconv_filter
}
// namespace fpga
...
...
src/fpga/V1/filter.cpp
100755 → 100644
浏览文件 @
c2a649d3
...
...
@@ -346,6 +346,16 @@ void format_dwconv_filter(float **data_in, int num, int height, int width,
fpga_flush
(
*
quantize_data
,
align_to_x
(
num
,
FILTER_ELEMENT_ALIGNMENT
)
*
height
*
width
*
sizeof
(
int16_t
));
}
// Formats a depthwise-deconvolution filter buffer: fp16 quantisation, HWN
// conversion and element alignment, followed by a cache flush.
//
// data_in:   address of the filter buffer pointer handed to each helper.
// num:       filter count passed to every helper.
// height, width: spatial size of the filter.
// scale_ptr: forwarded to quantize_to_fp16.
void format_DWDeconv_filter(float **data_in, int num, int height, int width,
                            float *scale_ptr) {
  quantize_to_fp16(data_in, num, height, width, scale_ptr);

  // From here on the same buffer pointer is accessed as int16_t data.
  auto fp16_data = reinterpret_cast<int16_t **>(data_in);
  convert_to_hwn(fp16_data, num, height, width);
  align_element_n(fp16_data, num, height, width);

  auto flush_bytes = align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * height *
                     width * sizeof(int16_t);
  fpga_flush(*fp16_data, flush_bytes);
}
}
// namespace filter
}
// namespace fpga
}
// namespace paddle_mobile
src/fpga/V1/pe.cpp
浏览文件 @
c2a649d3
...
...
@@ -19,6 +19,7 @@ limitations under the License. */
#include "fpga/common/config.h"
#include "fpga/common/driver.h"
#ifdef COST_TIME_PRINT
#include <sys/time.h>
#include <time.h>
...
...
@@ -163,6 +164,7 @@ using namespace std; // NOLINT
#define REG_DWCONV_FILTER_BASE_ADDR 0xe08
#define REG_DWCONV_FILTER_SHAPE 0xe10
#define REG_DWCONV_FILTER_N_ALIGN 0xe18
#define REG_DWCONV_FILTER_SUBNUMBER 0xe20
#define REG_DWCONV_CMD 0xe00
int
ComputeFpgaConv
(
const
struct
SplitConvArgs
&
args
)
{
...
...
@@ -591,6 +593,20 @@ int PerformBypass(const struct BypassArgs &args) {
return
0
;
}
// PerformBypass
// Returns the raw value of the REG_HARDWARE_STATUS register.
//
// When PADDLE_MOBILE_ZU5 is defined the register is read while holding
// g_fpgainfo.pe_data->mutex; otherwise the function returns 0.
uint64_t FPGAVersion() {
#ifdef FPGA_PRINT_MODE
  // The banner previously said "ComputeFpgaBypass", copied from
  // PerformBypass.
  DLOG << "=============FPGAVersion===========";
#endif
#ifdef PADDLE_MOBILE_ZU5
  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
  const uint64_t fpga_ver = reg_readq(REG_HARDWARE_STATUS);
  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
  return fpga_ver;
#else
  return 0;
#endif
}  // FPGAVersion
int
ComputeFPGAConcat
(
const
struct
ConcatArgs
&
args
)
{
#ifdef FPGA_PRINT_MODE
DLOG
<<
"=============ComputeFpgaConcat==========="
;
...
...
@@ -655,6 +671,45 @@ void deconv_post_process(const struct DeconvArgs &args) {
fpga_flush
(
args
.
output
.
address
,
num
*
align_deconv_row_len
*
deconv_h
*
sizeof
(
int16_t
));
}
// Assembles the final depthwise-deconvolution output from the outputs of the
// individual sub-convolutions.
//
// Row `r` of the full-size result is read from sub-convolution
// (sub_conv_num - 1 - r % sub_conv_num), at row r / sub_conv_num. The first
// and last `omit_size` rows are skipped, and each copied row starts
// omit_size * channel elements in and is deconv_w * channel elements long.
// Rows are written back to back into args.output.address with an aligned
// row stride, and the written range is flushed at the end.
void DWDeconv_post_process(const struct DWDeconvArgs &args) {
  int sub_conv_n = args.sub_conv_num;
  int sub_h = args.sub_output_height;
  int sub_w = args.sub_output_width;
  int omit = args.omit_size;
  int channel = args.filter_num;
  int batch = 1;

  // Size before cropping, and the aligned stride of one source row.
  int full_h = sub_h * sub_conv_n;
  int full_w = sub_w * sub_conv_n;
  int full_row_stride = align_to_x(full_w * channel, IMAGE_ALIGNMENT);

  // Size after cropping, and the aligned stride of one destination row.
  int cropped_h = full_h - 2 * omit;
  int cropped_w = full_w - 2 * omit;
  int cropped_row_len = cropped_w * channel;
  int cropped_row_stride = align_to_x(cropped_row_len, IMAGE_ALIGNMENT);

  // NOTE(review): full_h rows are invalidated per sub-output, while the copy
  // loop below only reads rows up to full_h / sub_conv_n of each one --
  // confirm this range matches the size of the sub-output buffers.
  for (int k = 0; k < sub_conv_n; ++k) {
    paddle_mobile::fpga::fpga_invalidate(
        args.dw_conv_args[k]->output.address,
        full_row_stride * full_h * sizeof(int16_t));
  }

  int dst_offset = 0;
  for (int n = 0; n < batch; ++n) {
    for (int row = 0; row < full_h; ++row) {
      // Rows inside the omitted border produce no output.
      if ((row < omit) || (row >= (full_h - omit))) {
        continue;
      }

      int sub_idx = sub_conv_n - (row % sub_conv_n) - 1;
      int sub_row = row / sub_conv_n;
      auto src = reinterpret_cast<int16_t *>(
          args.dw_conv_args[sub_idx]->output.address);
      int src_offset = (n * full_h * full_row_stride +
                        sub_row * full_row_stride + omit * channel);

      fpga_copy(reinterpret_cast<int16_t *>(args.output.address) + dst_offset,
                src + src_offset, sizeof(int16_t) * cropped_row_len);
      dst_offset += cropped_row_stride;
    }
  }

  fpga_flush(args.output.address,
             batch * cropped_row_stride * cropped_h * sizeof(int16_t));
}
int
ComputeFpgaDeconv
(
const
struct
DeconvArgs
&
args
)
{
#ifdef FPGA_PRINT_MODE
...
...
@@ -792,17 +847,21 @@ int ComputeDWConv(const struct DWconvArgs &args) {
align_to_x
((
uint64_t
)
args
.
image
.
channels
,
IMAGE_ALIGNMENT
);
uint64_t
filter_amount_per_row_align
=
filter_N_align
*
(
uint64_t
)
args
.
kernel
.
width
;
uint64_t
filter_amount_align
=
filter_N_align
*
(
uint64_t
)
args
.
kernel
.
width
*
(
uint64_t
)
args
.
kernel
.
height
;
uint64_t
sub_filter_amount_align
=
filter_N_align
*
(
uint64_t
)
args
.
kernel
.
width
*
(
uint64_t
)
args
.
kernel
.
height
;
uint64_t
filter_amount_align
=
sub_filter_amount_align
*
(
uint64_t
)
args
.
sub_conv_num
;
uint32_t
output_height
=
(
uint32_t
)(
(
args
.
image
.
height
+
args
.
image
.
pad_height
*
2
-
args
.
kernel
.
height
)
/
args
.
kernel
.
stride_h
+
1
);
uint32_t
output_width
=
(
uint32_t
)(
(
args
.
image
.
width
+
args
.
image
.
pad_width
*
2
-
args
.
kernel
.
width
)
/
args
.
kernel
.
stride_w
+
1
);
((
args
.
image
.
width
+
args
.
image
.
pad_width
*
2
-
args
.
kernel
.
width
)
/
args
.
kernel
.
stride_w
+
1
)
*
args
.
sub_conv_num
);
uint64_t
image_amount_per_row
=
align_to_x
((
uint64_t
)
args
.
image
.
width
*
(
uint64_t
)
args
.
image
.
channels
,
...
...
@@ -845,12 +904,15 @@ int ComputeDWConv(const struct DWconvArgs &args) {
/*restart scale*/
reg_writeq
(
output_scale
,
REG_SCALE_PARAMETER
);
reg_writeq
(
image_physical_address
,
REG_POOLING_IMAGE_BASE_ADDR
);
reg_writeq
(
output_physical_address
,
REG_POOLING_RESULT_BASE_ADDR
);
reg_writeq
((
bias_physical_address
<<
32
|
filter_physical_address
),
REG_DWCONV_FILTER_BASE_ADDR
);
reg_writeq
(
filter_amount_per_row_align
|
(
filter_amount_align
<<
32
),
REG_DWCONV_FILTER_SHAPE
);
reg_writeq
(
sub_filter_amount_align
|
(((
uint64_t
)
args
.
sub_conv_num
)
<<
32
),
REG_DWCONV_FILTER_SUBNUMBER
);
reg_writeq
(
filter_N_align
,
REG_DWCONV_FILTER_N_ALIGN
);
reg_writeq
(
...
...
@@ -904,10 +966,89 @@ int ComputeDWConv(const struct DWconvArgs &args) {
output_scale
=
reg_readq
(
REG_SCALE_PARAMETER
);
output_scale
=
(
output_scale
<<
32
)
|
(
output_scale
>>
32
);
fpga_copy
(
args
.
output
.
scale_address
,
&
output_scale
,
sizeof
(
float
)
*
2
);
DLOG
<<
"output_scale:"
<<
output_scale
;
pthread_mutex_unlock
(
&
g_fpgainfo
.
pe_data
->
mutex
);
return
ret
;
#endif
return
0
;
}
// Executes a depthwise deconvolution described by `args`.
//
// Steps:
//   1. run ComputeDWConv once for each of the args.sub_conv_num entries of
//      args.dw_conv_args;
//   2. when there is more than one sub-convolution, copy the two-float scale
//      of the sub-convolution whose scale[0] is largest into
//      args.output.scale_address;
//   3. call DWDeconv_post_process to build the final output.
//
// With COST_TIME_PRINT defined, the duration of each step is printed to
// std::cout. Always returns 0.
int ComputeDWDeconv(const struct DWDeconvArgs &args) {
#ifdef FPGA_PRINT_MODE
  DLOG << "=============ComputeDWDeconv===========";
  DLOG << " filter_num:" << args.filter_num
       << " group_num:" << args.group_num << "omit_size:" << args.omit_size
       << "sub_output_width: " << args.sub_output_width
       << "sub_output_height: " << args.sub_output_height
       << " sub_conv_num:" << args.sub_conv_num;
  DLOG << "args.output.address: " << args.output.address
       << "args.output.scale_address: " << args.output.scale_address;
#endif

  int sub_conv_num = args.sub_conv_num;

#ifdef COST_TIME_PRINT
  timeval start, end;
  long dif_sec, dif_usec;  // NOLINT
#endif

  for (int i = 0; i < sub_conv_num; i++) {
#ifdef COST_TIME_PRINT
    gettimeofday(&start, NULL);
#endif

    ComputeDWConv(*args.dw_conv_args[i]);

#ifdef COST_TIME_PRINT
    gettimeofday(&end, NULL);
    dif_sec = end.tv_sec - start.tv_sec;
    dif_usec = end.tv_usec - start.tv_usec;
    std::cout << "deconv basic_conv: " << i << " times: "
              << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
              << std::endl;
#endif
  }

  if (sub_conv_num > 1) {
    float max_scale = -1.0f;
#ifdef COST_TIME_PRINT
    gettimeofday(&start, NULL);
#endif
    for (int i = 0; i < sub_conv_num; i++) {
      // The scale was written by the device; drop any stale cached copy
      // before reading it.
      paddle_mobile::fpga::fpga_invalidate(
          args.dw_conv_args[i]->output.scale_address, 2 * sizeof(float));
      float ptr_scale = (args.dw_conv_args[i]->output.scale_address)[0];
      if (ptr_scale > max_scale) {
        // Remember the running maximum. Without this update every
        // sub-convolution passed the test and the last one always won.
        max_scale = ptr_scale;
        args.output.scale_address[0] = ptr_scale;
        args.output.scale_address[1] =
            (args.dw_conv_args[i]->output.scale_address)[1];
      }
    }
#ifdef COST_TIME_PRINT
    gettimeofday(&end, NULL);
    dif_sec = end.tv_sec - start.tv_sec;
    dif_usec = end.tv_usec - start.tv_usec;
    std::cout << "deconv scale "
              << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
              << std::endl;
#endif
  }

#ifdef COST_TIME_PRINT
  gettimeofday(&start, NULL);
#endif

  DWDeconv_post_process(args);

#ifdef COST_TIME_PRINT
  gettimeofday(&end, NULL);
  dif_sec = end.tv_sec - start.tv_sec;
  dif_usec = end.tv_usec - start.tv_usec;
  std::cout << "deconv_post_process "
            << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
            << std::endl;
#endif
// NOTE(review): the #endif below has no matching #if inside this function;
// presumably it closes a conditional opened elsewhere in the file (other
// functions here guard their body with PADDLE_MOBILE_ZU5) -- confirm before
// removing it.
#endif
  return 0;
}  // ComputeDWDeconv
}
// namespace fpga
}
// namespace paddle_mobile
src/fpga/common/fpga_common.h
100755 → 100644
浏览文件 @
c2a649d3
...
...
@@ -229,6 +229,7 @@ struct DeconvArgs {
std
::
vector
<
std
::
shared_ptr
<
SplitConvArgs
>>
split_conv_args
;
};
struct
DWconvArgs
{
uint32_t
sub_conv_num
;
bool
relu_enabled
;
void
*
bias_address
;
void
*
filter_address
;
...
...
@@ -236,6 +237,19 @@ struct DWconvArgs {
struct
ImageInputArgs
image
;
struct
ImageOutputArgs
output
;
};
struct
DWDeconvArgs
{
uint32_t
sub_conv_num
;
uint32_t
group_num
;
uint32_t
filter_num
;
uint32_t
omit_size
;
uint32_t
sub_output_width
;
uint32_t
sub_output_height
;
struct
ImageOutputArgs
output
;
std
::
vector
<
std
::
shared_ptr
<
DWconvArgs
>>
dw_conv_args
;
std
::
vector
<
std
::
shared_ptr
<
char
>>
vector_dw_conv_space
;
};
// static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
// }
static
inline
uint32_t
align_to_x
(
int64_t
num
,
int64_t
x
)
{
...
...
src/fpga/common/pe.h
浏览文件 @
c2a649d3
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
fpga
{
uint64_t
FPGAVersion
();
int
PerformBypass
(
const
struct
BypassArgs
&
args
);
int
ComputeBasicConv
(
const
struct
ConvArgs
&
args
);
int
ComputeFpgaPool
(
const
struct
PoolingArgs
&
args
);
...
...
@@ -28,5 +29,7 @@ int ComputeFPGAConcat(const struct ConcatArgs& args);
int
ComputeFPGASplit
(
const
struct
SplitArgs
&
args
);
int
ComputeFpgaDeconv
(
const
struct
DeconvArgs
&
args
);
int
ComputeDWConv
(
const
struct
DWconvArgs
&
args
);
int
ComputeDWDeconv
(
const
struct
DWDeconvArgs
&
args
);
}
// namespace fpga
}
// namespace paddle_mobile
src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
浏览文件 @
c2a649d3
...
...
@@ -49,13 +49,23 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
"filter width should be equal to filter height "
);
PADDLE_MOBILE_ENFORCE
(((
filter
->
dims
()[
2
]
%
param
->
Strides
()[
0
])
==
0
),
"filter axis should be the multiple of stride axis "
);
fpga
::
format_deconv_data
(
filter
,
out
,
&
bs_ptr
,
param
->
Groups
(),
sub_conv_n
);
fpga
::
DeconvArgs
deconv_arg
=
{
0
};
fpga
::
fill_deconv_arg
(
&
deconv_arg
,
input
,
out
,
filter
,
relu_enabled
,
param
->
Groups
(),
param
->
Strides
()[
0
],
param
->
Strides
()[
1
],
param
->
Paddings
()[
0
],
param
->
Paddings
()[
1
],
bs_ptr
);
param
->
SetFpgaArgs
(
deconv_arg
);
if
(
param
->
Groups
()
==
channel
)
{
fpga
::
format_DWDeconv_data
(
filter
,
out
,
&
bs_ptr
,
param
->
Groups
(),
sub_conv_n
);
fpga
::
DWDeconvArgs
DWDeconv_arg
=
{
0
};
fpga
::
fill_DWDeconv_arg
(
&
DWDeconv_arg
,
input
,
out
,
filter
,
relu_enabled
,
param
->
Strides
()[
0
],
param
->
Strides
()[
1
],
param
->
Paddings
()[
0
],
param
->
Paddings
()[
1
],
bs_ptr
);
param
->
SetFpgaArgs
(
DWDeconv_arg
);
}
else
{
fpga
::
format_deconv_data
(
filter
,
out
,
&
bs_ptr
,
param
->
Groups
(),
sub_conv_n
);
fpga
::
DeconvArgs
deconv_arg
=
{
0
};
fpga
::
fill_deconv_arg
(
&
deconv_arg
,
input
,
out
,
filter
,
relu_enabled
,
param
->
Groups
(),
param
->
Strides
()[
0
],
param
->
Strides
()[
1
],
param
->
Paddings
()[
0
],
param
->
Paddings
()[
1
],
bs_ptr
);
param
->
SetFpgaArgs
(
deconv_arg
);
}
return
true
;
}
...
...
@@ -63,7 +73,11 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
template
<
>
void
DeconvAddKernel
<
FPGA
,
float
>::
Compute
(
const
FusionDeconvAddParam
<
FPGA
>
&
param
)
{
fpga
::
ComputeFpgaDeconv
(
param
.
FpgaArgs
());
if
(
param
.
Groups
()
==
param
.
Output
()
->
dims
()[
1
])
{
fpga
::
ComputeDWDeconv
(
param
.
FpgaDWDconvArgs
());
}
else
{
fpga
::
ComputeFpgaDeconv
(
param
.
FpgaArgs
());
}
}
}
// namespace operators
...
...
src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
浏览文件 @
c2a649d3
...
...
@@ -50,20 +50,35 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
"filter width should be equal to filter height "
);
PADDLE_MOBILE_ENFORCE
(((
filter
->
dims
()[
2
]
%
param
->
Strides
()[
0
])
==
0
),
"filter axis should be the multiple of stride axis "
);
fpga
::
format_deconv_data
(
filter
,
out
,
&
bs_ptr
,
param
->
Groups
(),
sub_conv_n
);
fpga
::
DeconvArgs
deconv_arg
=
{
0
};
fpga
::
fill_deconv_arg
(
&
deconv_arg
,
input
,
out
,
filter
,
relu_enabled
,
param
->
Groups
(),
param
->
Strides
()[
0
],
param
->
Strides
()[
1
],
param
->
Paddings
()[
0
],
param
->
Paddings
()[
1
],
bs_ptr
);
param
->
SetFpgaArgs
(
deconv_arg
);
if
(
param
->
Groups
()
==
channel
)
{
fpga
::
format_DWDeconv_data
(
filter
,
out
,
&
bs_ptr
,
param
->
Groups
(),
sub_conv_n
);
fpga
::
DWDeconvArgs
DWDeconv_arg
=
{
0
};
fpga
::
fill_DWDeconv_arg
(
&
DWDeconv_arg
,
input
,
out
,
filter
,
relu_enabled
,
param
->
Strides
()[
0
],
param
->
Strides
()[
1
],
param
->
Paddings
()[
0
],
param
->
Paddings
()[
1
],
bs_ptr
);
param
->
SetFpgaArgs
(
DWDeconv_arg
);
}
else
{
fpga
::
format_deconv_data
(
filter
,
out
,
&
bs_ptr
,
param
->
Groups
(),
sub_conv_n
);
fpga
::
DeconvArgs
deconv_arg
=
{
0
};
fpga
::
fill_deconv_arg
(
&
deconv_arg
,
input
,
out
,
filter
,
relu_enabled
,
param
->
Groups
(),
param
->
Strides
()[
0
],
param
->
Strides
()[
1
],
param
->
Paddings
()[
0
],
param
->
Paddings
()[
1
],
bs_ptr
);
param
->
SetFpgaArgs
(
deconv_arg
);
}
return
true
;
}
template
<
>
void
DeconvAddReluKernel
<
FPGA
,
float
>::
Compute
(
const
FusionDeconvAddReluParam
<
FPGA
>
&
param
)
{
fpga
::
ComputeFpgaDeconv
(
param
.
FpgaArgs
());
// fpga::ComputeFpgaDeconv(param.FpgaArgs());
if
(
param
.
Groups
()
==
param
.
Output
()
->
dims
()[
1
])
{
fpga
::
ComputeDWDeconv
(
param
.
FpgaDWDconvArgs
());
}
else
{
fpga
::
ComputeFpgaDeconv
(
param
.
FpgaArgs
());
}
}
}
// namespace operators
...
...
src/operators/kernel/fpga/V1/softmax_kernel.cpp
浏览文件 @
c2a649d3
...
...
@@ -22,7 +22,7 @@ namespace operators {
template
<
>
bool
SoftmaxKernel
<
FPGA
,
float
>::
Init
(
SoftmaxParam
<
FPGA
>
*
param
)
{
auto
input
=
const_cast
<
Tensor
*>
(
param
->
InputX
());
auto
input
=
const_cast
<
LoD
Tensor
*>
(
param
->
InputX
());
auto
input_ptr
=
input
->
data
<
float
>
();
auto
out
=
param
->
Out
();
fpga
::
format_fp32_ofm
(
out
);
...
...
src/operators/op_param.h
浏览文件 @
c2a649d3
...
...
@@ -2357,10 +2357,17 @@ class ConvTransposeParam : public OpParam {
private:
fpga
::
DeconvArgs
fpga_conv_args
;
fpga
::
DWDeconvArgs
fpga_DWDeconv_args
;
public:
// Read-only access to the stored fpga::DeconvArgs.
const fpga::DeconvArgs &FpgaArgs() const {
  return fpga_conv_args;
}
// Read-only access to the stored fpga::DWDeconvArgs.
const fpga::DWDeconvArgs &FpgaDWDconvArgs() const {
  return fpga_DWDeconv_args;
}
// Stores a copy of `args` as the fpga::DeconvArgs returned by FpgaArgs().
void SetFpgaArgs(const fpga::DeconvArgs &args) {
  fpga_conv_args = args;
}
// Stores a copy of `args` as the fpga::DWDeconvArgs returned by
// FpgaDWDconvArgs().
void SetFpgaArgs(const fpga::DWDeconvArgs &args) {
  fpga_DWDeconv_args = args;
}
#endif
};
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录