Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
35cfe592
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
35cfe592
编写于
1月 03, 2019
作者:
Z
zhangyang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Avoid memory leakage for FPGA track
上级
e96e3dd5
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
160 addition
and
95 deletion
+160
-95
src/fpga/V1/api.cpp
src/fpga/V1/api.cpp
+143
-87
src/fpga/V1/pe.cpp
src/fpga/V1/pe.cpp
+6
-6
src/fpga/common/fpga_common.cpp
src/fpga/common/fpga_common.cpp
+4
-0
src/fpga/common/fpga_common.h
src/fpga/common/fpga_common.h
+7
-2
未找到文件。
src/fpga/V1/api.cpp
浏览文件 @
35cfe592
...
...
@@ -411,6 +411,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
auto
input_ptr
=
input
->
data
<
float
>
();
auto
filter_ptr
=
filter
->
data
<
float
>
();
auto
out_ptr
=
out
->
data
<
float
>
();
auto
deleter
=
[](
void
*
p
)
{
fpga_free
(
p
);
};
arg
->
group_num
=
(
uint32_t
)
group_num
;
// Either group_num or split_num = 1;
...
...
@@ -421,6 +422,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg
->
conv_arg
=
(
ConvArgs
*
)
fpga_malloc
(
arg
->
split_num
*
sizeof
(
ConvArgs
));
// NOLINT
arg
->
shared_conv_arg
=
std
::
shared_ptr
<
ConvArgs
>
(
arg
->
conv_arg
,
deleter
);
memset
(
arg
->
conv_arg
,
0
,
arg
->
split_num
*
sizeof
(
struct
ConvArgs
));
arg
->
concat_arg
.
image_num
=
arg
->
split_num
;
...
...
@@ -431,11 +434,17 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
int
n
=
arg
->
split_num
;
arg
->
concat_arg
.
images_in
=
(
half
**
)
fpga_malloc
(
n
*
sizeof
(
int
*
));
// NOLINT
static_cast
<
int16_t
**>
(
fpga_malloc
(
n
*
sizeof
(
int
*
)));
arg
->
concat_arg
.
scales_in
=
(
float
**
)
fpga_malloc
(
n
*
sizeof
(
float
*
));
// NOLINT
static_cast
<
float
**>
(
fpga_malloc
(
n
*
sizeof
(
float
*
)));
arg
->
concat_arg
.
channel_num
=
(
uint32_t
*
)
fpga_malloc
(
n
*
sizeof
(
uint32_t
));
// NOLINT
static_cast
<
uint32_t
*>
(
fpga_malloc
(
n
*
sizeof
(
uint32_t
)));
arg
->
vector_concat_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
concat_arg
.
images_in
),
deleter
));
arg
->
vector_concat_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
concat_arg
.
scales_in
),
deleter
));
arg
->
vector_concat_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
concat_arg
.
channel_num
),
deleter
));
auto
channel
=
(
int
)
out
->
dims
()[
1
];
// NOLINT
int
filter_num_per_div
=
get_filter_num_per_div
(
filter
,
group_num
);
...
...
@@ -469,6 +478,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
auto
filter_head
=
&
(
(
int8_t
*
)
filter_ptr
)[
i
*
element_num
*
filter_num_per_div
];
// NOLINT
arg
->
conv_arg
[
i
].
filter_address
=
fpga_malloc
(
filter_size
);
arg
->
vector_conv_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
conv_arg
[
i
].
filter_address
),
deleter
));
memcpy
(
arg
->
conv_arg
[
i
].
filter_address
,
filter_head
,
filter_size
);
fpga_flush
(
arg
->
conv_arg
[
i
].
filter_address
,
filter_size
);
...
...
@@ -477,18 +488,25 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
sizeof
(
float
);
auto
bs_head
=
&
bs_ptr
[
i
*
filter_num_per_div
*
2
];
arg
->
conv_arg
[
i
].
sb_address
=
fpga_malloc
(
bs_size
);
arg
->
vector_conv_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
conv_arg
[
i
].
sb_address
),
deleter
));
memcpy
(
arg
->
conv_arg
[
i
].
sb_address
,
bs_head
,
bs_size
);
fpga_flush
(
arg
->
conv_arg
[
i
].
sb_address
,
bs_size
);
if
(
n
>
1
)
{
arg
->
conv_arg
[
i
].
output
.
scale_address
=
(
float
*
)
fpga_malloc
(
2
*
sizeof
(
float
));
// NOLINT
static_cast
<
float
*>
(
fpga_malloc
(
2
*
sizeof
(
float
)));
arg
->
conv_arg
[
i
].
output
.
address
=
fpga_malloc
(
out
->
dims
()[
2
]
*
align_to_x
((
int
)(
out
->
dims
()[
3
]
*
// NOLINT
arg
->
conv_arg
[
i
].
filter_num
),
IMAGE_ALIGNMENT
)
*
sizeof
(
half
));
arg
->
vector_conv_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
conv_arg
[
i
].
output
.
scale_address
),
deleter
));
arg
->
vector_conv_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
conv_arg
[
i
].
output
.
address
),
deleter
));
}
else
{
arg
->
conv_arg
[
i
].
output
.
scale_address
=
out
->
scale
;
arg
->
conv_arg
[
i
].
output
.
address
=
out_ptr
;
...
...
@@ -512,6 +530,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
float
*
bs_ptr
)
{
auto
input_ptr
=
input
->
data
<
float
>
();
auto
filter_ptr
=
filter
->
data
<
float
>
();
auto
deleter
=
[](
void
*
p
)
{
fpga_free
(
p
);
};
arg
->
group_num
=
(
uint32_t
)
group_num
;
arg
->
sub_conv_num
=
(
uint32_t
)
stride_h
;
...
...
@@ -554,25 +573,41 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
uint32_t
split_num
=
group_num
==
1
?
(
uint32_t
)
get_deconv_plit_num
(
filter
,
sub_conv_num
)
:
1
;
arg
->
split_conv_args
=
(
SplitConvArgs
*
)
fpga_malloc
(
// NOLINT
sub_conv_num
*
sizeof
(
SplitConvArgs
));
// NOLINT
for
(
int
i
=
0
;
i
<
sub_conv_num
;
++
i
)
{
arg
->
split_conv_args
[
i
].
filter_num
=
arg
->
split_conv_args
.
push_back
(
std
::
make_shared
<
SplitConvArgs
>
());
arg
->
split_conv_args
[
i
]
->
filter_num
=
(
arg
->
sub_conv_num
)
*
(
arg
->
filter_num
);
arg
->
split_conv_args
[
i
].
group_num
=
(
uint32_t
)
group_num
;
arg
->
split_conv_args
[
i
].
split_num
=
split_num
;
arg
->
split_conv_args
[
i
].
conv_arg
=
(
ConvArgs
*
)
fpga_malloc
(
split_num
*
sizeof
(
ConvArgs
));
// NOLINT
arg
->
split_conv_args
[
i
].
concat_arg
.
height
=
sub_output_height
;
arg
->
split_conv_args
[
i
].
concat_arg
.
width
=
sub_output_width
;
arg
->
split_conv_args
[
i
].
concat_arg
.
image_num
=
split_num
;
arg
->
split_conv_args
[
i
].
concat_arg
.
images_in
=
(
half
**
)
fpga_malloc
(
split_num
*
sizeof
(
half
*
));
// NOLINT
arg
->
split_conv_args
[
i
].
concat_arg
.
scales_in
=
(
float
**
)
fpga_malloc
(
split_num
*
sizeof
(
float
*
));
// NOLINT
arg
->
split_conv_args
[
i
].
concat_arg
.
channel_num
=
(
uint32_t
*
)
fpga_malloc
(
split_num
*
sizeof
(
uint32_t
));
// NOLINT
arg
->
split_conv_args
[
i
]
->
group_num
=
(
uint32_t
)
group_num
;
arg
->
split_conv_args
[
i
]
->
split_num
=
split_num
;
arg
->
split_conv_args
[
i
]
->
concat_arg
.
height
=
sub_output_height
;
arg
->
split_conv_args
[
i
]
->
concat_arg
.
width
=
sub_output_width
;
arg
->
split_conv_args
[
i
]
->
concat_arg
.
image_num
=
split_num
;
arg
->
split_conv_args
[
i
]
->
conv_arg
=
static_cast
<
ConvArgs
*>
(
fpga_malloc
(
split_num
*
sizeof
(
ConvArgs
)));
arg
->
split_conv_args
[
i
]
->
concat_arg
.
images_in
=
static_cast
<
int16_t
**>
(
fpga_malloc
(
split_num
*
sizeof
(
int16_t
*
)));
arg
->
split_conv_args
[
i
]
->
concat_arg
.
scales_in
=
static_cast
<
float
**>
(
fpga_malloc
(
split_num
*
sizeof
(
float
*
)));
arg
->
split_conv_args
[
i
]
->
concat_arg
.
channel_num
=
static_cast
<
uint32_t
*>
(
fpga_malloc
(
split_num
*
sizeof
(
uint32_t
)));
arg
->
split_conv_args
[
i
]
->
shared_conv_arg
=
std
::
shared_ptr
<
ConvArgs
>
(
arg
->
split_conv_args
[
i
]
->
conv_arg
,
deleter
);
arg
->
split_conv_args
[
i
]
->
vector_concat_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
split_conv_args
[
i
]
->
concat_arg
.
images_in
),
deleter
));
arg
->
split_conv_args
[
i
]
->
vector_concat_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
split_conv_args
[
i
]
->
concat_arg
.
scales_in
),
deleter
));
arg
->
split_conv_args
[
i
]
->
vector_concat_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
split_conv_args
[
i
]
->
concat_arg
.
channel_num
),
deleter
));
}
auto
filter_num_per_div
=
...
...
@@ -597,111 +632,132 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
uint32_t
out_addr_offset
=
0
;
for
(
int
i
=
0
;
i
<
sub_conv_num
;
++
i
)
{
if
(
sub_conv_num
==
1
)
{
arg
->
split_conv_args
[
i
]
.
output
.
address
=
arg
->
output
.
address
;
arg
->
split_conv_args
[
i
]
.
output
.
scale_address
=
arg
->
output
.
scale_address
;
arg
->
split_conv_args
[
i
]
->
output
.
address
=
arg
->
output
.
address
;
arg
->
split_conv_args
[
i
]
->
output
.
scale_address
=
arg
->
output
.
scale_address
;
out_addr_offset
=
0
;
}
else
{
auto
ptr_output
=
(
half
*
)
out_ptr
;
// NOLINT
out_addr_offset
=
sizeof
(
half
)
*
(
sub_conv_num
-
1
-
i
)
*
sizeof
(
int16_t
)
*
(
sub_conv_num
-
1
-
i
)
*
(
align_to_x
(
real_out_width
*
arg
->
filter_num
,
IMAGE_ALIGNMENT
));
arg
->
split_conv_args
[
i
].
output
.
address
=
(
void
*
)(
ptr_output
);
// NOLINT
auto
ptr_output_scale
=
(
float
*
)
fpga_malloc
(
2
*
sizeof
(
float
));
// NOLINT
arg
->
split_conv_args
[
i
].
output
.
scale_address
=
ptr_output_scale
;
arg
->
split_conv_args
[
i
]
->
output
.
address
=
out_ptr
;
arg
->
split_conv_args
[
i
]
->
output
.
scale_address
=
static_cast
<
float
*>
(
fpga_malloc
(
2
*
sizeof
(
float
)));
arg
->
split_conv_args
[
i
]
->
vector_conv_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
split_conv_args
[
i
]
->
output
.
scale_address
),
deleter
));
}
for
(
int
j
=
0
;
j
<
split_num
;
++
j
)
{
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
relu_enabled
=
relu_enabled
;
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
group_num
=
(
uint32_t
)
group_num
;
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
relu_enabled
=
relu_enabled
;
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
group_num
=
(
uint32_t
)
group_num
;
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
kernel
.
width
=
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
kernel
.
width
=
(
uint32_t
)
sub_filter_width
;
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
kernel
.
height
=
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
kernel
.
height
=
(
uint32_t
)
sub_filter_width
;
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
kernel
.
stride_w
=
1
;
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
kernel
.
stride_h
=
1
;
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
kernel
.
stride_w
=
1
;
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
kernel
.
stride_h
=
1
;
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
deconv_tx_param
.
deconv_en
=
1
;
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
deconv_tx_param
.
sub_conv_num
=
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
deconv_tx_param
.
deconv_en
=
1
;
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
deconv_tx_param
.
sub_conv_num
=
sub_conv_num
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
deconv_tx_param
.
omit_size
=
omit_size
;
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
deconv_tx_param
.
out_addr_offset
=
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
deconv_tx_param
.
omit_size
=
omit_size
;
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
deconv_tx_param
.
out_addr_offset
=
out_addr_offset
;
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
image
.
scale_address
=
input
->
scale
;
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
image
.
channels
=
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
image
.
scale_address
=
input
->
scale
;
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
image
.
channels
=
(
uint32_t
)
sub_channels
;
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
image
.
width
=
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
image
.
height
=
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
image
.
pad_width
=
(
uint32_t
)
sub_pad
;
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
image
.
pad_height
=
(
uint32_t
)
sub_pad
;
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
image
.
address
=
input_ptr
;
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
image
.
pad_width
=
(
uint32_t
)
sub_pad
;
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
image
.
pad_height
=
(
uint32_t
)
sub_pad
;
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
image
.
address
=
input_ptr
;
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
filter_scale_address
=
filter
->
scale
;
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
filter_num
=
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
filter_scale_address
=
filter
->
scale
;
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
filter_num
=
(
uint32_t
)(
j
==
split_num
-
1
?
sub_filter_num
-
(
split_num
-
1
)
*
filter_num_per_div
:
filter_num_per_div
);
size_t
filter_size
=
element_num
*
align_to_x
(
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
filter_num
,
align_to_x
(
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
filter_num
,
FILTER_NUM_ALIGNMENT
)
*
sizeof
(
int8_t
);
auto
filter_head
=
&
((
int8_t
*
)
filter_ptr
)[
j
*
element_num
*
filter_num_per_div
+
// NOLINT
i
*
filter_sub_conv_offset
];
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
filter_address
=
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
filter_address
=
fpga_malloc
(
filter_size
);
memcpy
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
filter_address
,
filter_head
,
arg
->
split_conv_args
[
i
]
->
vector_conv_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
filter_address
),
deleter
));
memcpy
(
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
filter_address
,
filter_head
,
filter_size
);
fpga_flush
(
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
filter_address
,
fpga_flush
(
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
filter_address
,
filter_size
);
size_t
bs_align_num
=
align_to_x
(
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
filter_num
,
BS_NUM_ALIGNMENT
);
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
filter_num
,
BS_NUM_ALIGNMENT
);
size_t
bs_size
=
2
*
bs_align_num
*
sizeof
(
float
);
auto
bs_head
=
&
bs_ptr
[
j
*
filter_num_per_div
*
2
];
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
sb_address
=
fpga_malloc
(
bs_size
);
memcpy
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
sb_address
,
bs_head
,
bs_size
);
fpga_flush
(
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
sb_address
,
bs_size
);
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
sb_address
=
fpga_malloc
(
bs_size
);
arg
->
split_conv_args
[
i
]
->
vector_conv_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
sb_address
),
deleter
));
memcpy
(
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
sb_address
,
bs_head
,
bs_size
);
fpga_flush
(
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
sb_address
,
bs_size
);
if
(
split_num
==
1
)
{
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
output
.
address
=
arg
->
split_conv_args
[
i
]
.
output
.
address
;
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
output
.
scale_address
=
arg
->
split_conv_args
[
i
]
.
output
.
scale_address
;
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
output
.
address
=
arg
->
split_conv_args
[
i
]
->
output
.
address
;
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
output
.
scale_address
=
arg
->
split_conv_args
[
i
]
->
output
.
scale_address
;
}
else
{
auto
ptr_output
=
(
half
*
)
fpga_malloc
(
conv_output_size
*
sizeof
(
half
));
// NOLINT
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
output
.
address
=
(
void
*
)((
half
*
)
ptr_output
);
// NOLINT
auto
ptr_output_scale
=
(
float
*
)
fpga_malloc
(
2
*
sizeof
(
float
));
// NOLINT
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
output
.
scale_address
=
ptr_output_scale
;
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
output
.
address
=
fpga_malloc
(
conv_output_size
*
sizeof
(
int16_t
));
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
output
.
scale_address
=
static_cast
<
float
*>
(
fpga_malloc
(
2
*
sizeof
(
float
)));
arg
->
split_conv_args
[
i
]
->
vector_conv_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
output
.
address
),
deleter
));
arg
->
split_conv_args
[
i
]
->
vector_conv_space
.
push_back
(
std
::
shared_ptr
<
char
>
(
reinterpret_cast
<
char
*>
(
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
output
.
scale_address
),
deleter
));
}
arg
->
split_conv_args
[
i
]
.
concat_arg
.
images_in
[
j
]
=
(
half
*
)
arg
->
split_conv_args
[
i
].
conv_arg
[
j
].
output
.
address
;
// NOLINT
arg
->
split_conv_args
[
i
]
.
concat_arg
.
scales_in
[
j
]
=
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
output
.
scale_address
;
arg
->
split_conv_args
[
i
]
.
concat_arg
.
channel_num
[
j
]
=
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
].
filter_num
;
expand_conv_arg
(
&
(
arg
->
split_conv_args
[
i
]
.
conv_arg
[
j
]));
arg
->
split_conv_args
[
i
]
->
concat_arg
.
images_in
[
j
]
=
static_cast
<
int16_t
*>
(
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
output
.
address
);
arg
->
split_conv_args
[
i
]
->
concat_arg
.
scales_in
[
j
]
=
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
output
.
scale_address
;
arg
->
split_conv_args
[
i
]
->
concat_arg
.
channel_num
[
j
]
=
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
].
filter_num
;
expand_conv_arg
(
&
(
arg
->
split_conv_args
[
i
]
->
conv_arg
[
j
]));
}
arg
->
split_conv_args
[
i
]
.
concat_arg
.
image_out
=
arg
->
split_conv_args
[
i
]
.
output
.
address
;
arg
->
split_conv_args
[
i
]
.
concat_arg
.
scale_out
=
arg
->
split_conv_args
[
i
]
.
output
.
scale_address
;
arg
->
split_conv_args
[
i
]
->
concat_arg
.
image_out
=
arg
->
split_conv_args
[
i
]
->
output
.
address
;
arg
->
split_conv_args
[
i
]
->
concat_arg
.
scale_out
=
arg
->
split_conv_args
[
i
]
->
output
.
scale_address
;
}
filter
->
reset_data_ptr
(
nullptr
);
fpga_free
(
bs_ptr
);
...
...
@@ -717,16 +773,16 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
arg
->
relu_enabled
=
relu_enabled
;
arg
->
bias_address
=
bias_ptr
;
arg
->
filter_address
=
filter_ptr
;
arg
->
kernel
.
height
=
filter
->
dims
()[
2
];
arg
->
kernel
.
width
=
filter
->
dims
()[
3
];
arg
->
kernel
.
stride_h
=
stride_h
;
arg
->
kernel
.
stride_w
=
stride_w
;
arg
->
kernel
.
height
=
(
uint32_t
)
filter
->
dims
()[
2
];
arg
->
kernel
.
width
=
(
uint32_t
)
filter
->
dims
()[
3
];
arg
->
kernel
.
stride_h
=
(
uint32_t
)
stride_h
;
arg
->
kernel
.
stride_w
=
(
uint32_t
)
stride_w
;
arg
->
image
.
address
=
input_ptr
;
arg
->
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
arg
->
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
arg
->
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
arg
->
image
.
pad_height
=
padding_h
;
arg
->
image
.
pad_width
=
padding_w
;
arg
->
image
.
pad_height
=
(
uint32_t
)
padding_h
;
arg
->
image
.
pad_width
=
(
uint32_t
)
padding_w
;
arg
->
image
.
scale_address
=
input
->
scale
;
arg
->
output
.
address
=
output_ptr
;
arg
->
output
.
scale_address
=
out
->
scale
;
...
...
src/fpga/V1/pe.cpp
浏览文件 @
35cfe592
...
...
@@ -632,7 +632,7 @@ void deconv_post_process(const struct DeconvArgs &args) {
for
(
int
idx
=
0
;
idx
<
sub_conv_n
;
++
idx
)
{
paddle_mobile
::
fpga
::
fpga_invalidate
(
args
.
split_conv_args
[
idx
]
.
output
.
address
,
args
.
split_conv_args
[
idx
]
->
output
.
address
,
align_origin_w
*
origin_h
*
sizeof
(
int16_t
));
}
...
...
@@ -642,7 +642,7 @@ void deconv_post_process(const struct DeconvArgs &args) {
int
hx
=
(
hh
%
sub_conv_n
);
auto
sub_t
=
(
int16_t
*
)(
args
.
split_conv_args
[
sub_conv_n
-
hx
-
1
]
// NOLINT
.
output
.
address
);
->
output
.
address
);
int
hi
=
(
hh
/
sub_conv_n
);
if
((
hh
<
omit_size
)
||
(
hh
>=
(
origin_h
-
omit_size
)))
continue
;
int
sidx
=
(
nn
*
origin_h
*
align_origin_w
+
hi
*
align_origin_w
+
...
...
@@ -681,7 +681,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
gettimeofday
(
&
start
,
NULL
);
#endif
ComputeFpgaConv
(
args
.
split_conv_args
[
i
]);
ComputeFpgaConv
(
*
args
.
split_conv_args
[
i
]);
#ifdef COST_TIME_PRINT
gettimeofday
(
&
end
,
NULL
);
dif_sec
=
end
.
tv_sec
-
start
.
tv_sec
;
...
...
@@ -699,12 +699,12 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
#endif
for
(
int
i
=
0
;
i
<
sub_conv_num
;
i
++
)
{
paddle_mobile
::
fpga
::
fpga_invalidate
(
args
.
split_conv_args
[
i
]
.
output
.
scale_address
,
2
*
sizeof
(
float
));
float
ptr_scale
=
(
args
.
split_conv_args
[
i
]
.
output
.
scale_address
)[
0
];
args
.
split_conv_args
[
i
]
->
output
.
scale_address
,
2
*
sizeof
(
float
));
float
ptr_scale
=
(
args
.
split_conv_args
[
i
]
->
output
.
scale_address
)[
0
];
if
(
ptr_scale
>
max_scale
)
{
args
.
output
.
scale_address
[
0
]
=
ptr_scale
;
args
.
output
.
scale_address
[
1
]
=
(
args
.
split_conv_args
[
i
]
.
output
.
scale_address
)[
1
];
(
args
.
split_conv_args
[
i
]
->
output
.
scale_address
)[
1
];
}
}
...
...
src/fpga/common/fpga_common.cpp
浏览文件 @
35cfe592
...
...
@@ -75,6 +75,9 @@ void *fpga_malloc(size_t size) {
}
void
fpga_free
(
void
*
ptr
)
{
if
(
ptr
==
nullptr
)
{
return
;
}
static
uint64_t
counter
=
0
;
size_t
size
=
0
;
auto
iter
=
memory_map
.
find
(
ptr
);
// std::map<void *, size_t>::iterator
...
...
@@ -123,5 +126,6 @@ uint64_t vaddr_to_paddr(void *address) {
return
0
;
#endif
}
}
// namespace fpga
}
// namespace paddle_mobile
src/fpga/common/fpga_common.h
浏览文件 @
35cfe592
...
...
@@ -16,6 +16,8 @@ limitations under the License. */
#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>
namespace
paddle_mobile
{
namespace
fpga
{
...
...
@@ -158,7 +160,7 @@ struct ConcatArgs {
void
*
image_out
;
float
*
scale_out
;
uint32_t
*
channel_num
;
uint32_t
*
aligned_channel_num
;
uint32_t
*
aligned_channel_num
;
// Not used so far. Reserved for V2.
uint32_t
out_channel
;
uint32_t
height
;
uint32_t
width
;
...
...
@@ -171,6 +173,9 @@ struct SplitConvArgs {
struct
ImageOutputArgs
output
;
struct
ConvArgs
*
conv_arg
;
struct
ConcatArgs
concat_arg
;
std
::
shared_ptr
<
ConvArgs
>
shared_conv_arg
;
std
::
vector
<
std
::
shared_ptr
<
char
>>
vector_concat_space
;
std
::
vector
<
std
::
shared_ptr
<
char
>>
vector_conv_space
;
};
struct
SplitArgs
{
...
...
@@ -221,7 +226,7 @@ struct DeconvArgs {
uint32_t
sub_output_width
;
uint32_t
sub_output_height
;
struct
ImageOutputArgs
output
;
st
ruct
SplitConvArgs
*
split_conv_args
;
st
d
::
vector
<
std
::
shared_ptr
<
SplitConvArgs
>>
split_conv_args
;
};
struct
DWconvArgs
{
bool
relu_enabled
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录