PaddlePaddle / Paddle-Lite — commit 5892754e

Authored on Sep 10, 2018 by qnqinan; committed via GitHub on Sep 10, 2018.

Merge pull request #936 from zhangyang0701/develop

Implement concat op for FPGA track, close #935

Parents: 0cfa7ce3, f097d83e
Showing 14 changed files with 317 additions and 214 deletions (+317, −214):
src/fpga/api.cpp                                        (+36, −24)
src/fpga/api.h                                          (+16, −3)
src/fpga/image.cpp                                      (+4, −0)
src/fpga/image.h                                        (+7, −0)
src/operators/kernel/fpga/concat_kernel.cpp             (+33, −20)
src/operators/kernel/fpga/conv_add_bn_kernel.cpp        (+60, −27)
src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp   (+26, −24)
src/operators/kernel/fpga/conv_add_relu_kernel.cpp      (+24, −22)
src/operators/kernel/fpga/conv_bn_kernel.cpp            (+26, −24)
src/operators/kernel/fpga/conv_bn_relu_kernel.cpp       (+26, −24)
src/operators/kernel/fpga/fc_relu_kernel.cpp            (+25, −23)
src/operators/kernel/fpga/fusion_fc_kernel.cpp          (+24, −22)
src/operators/math/gemm.cpp                             (+1, −1)
src/operators/op_param.h                                (+9, −0)
src/fpga/api.cpp  (view file @ 5892754e)
@@ -89,8 +89,14 @@ DLOG << " kernel_height:" << args.kernel.height
   DLOG << "   out_address:" << args.output.address
        << "   out_scale_address:" << args.output.scale_address;*/
 #endif
+  int split_num = args.split_num;
+  for (int i = 0; i < split_num; i++) {
+    do_ioctl(IOCTL_CONFIG_CONV, &args.conv_args[i]);
+  }
-  return do_ioctl(IOCTL_CONFIG_CONV, &args);
+  if (split_num > 1) {
+    ComputeFPGAConcat(args.concat_arg);
+  }
 }

 int ComputeFpgaPool(const struct PoolingArgs &args) {
@@ -155,9 +161,16 @@ int PerformBypass(const struct BypassArgs &args) {
   return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
 }

+int ComputeFPGAConcat(const struct ConcatArgs &args) {
+  image::concat_images(args.images_in, args.scales_in, args.image_out,
+                       args.scale_out, args.image_num, args.channel_num,
+                       args.height, args.width);
+  return 0;
+}

 void format_image(framework::Tensor *image_tensor) {
   auto dims = image_tensor->dims();
-  int channel = dims[1], height = dims[2], width = dims[3];
+  auto channel = dims[1], height = dims[2], width = dims[3];
   auto data_ptr = image_tensor->mutable_data<float>();
   size_t memory_size = channel * height * width * sizeof(float);
   float *new_data = (float *)fpga_malloc(memory_size);
@@ -168,7 +181,7 @@ void format_image(framework::Tensor *image_tensor) {
 void format_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
-  int channel = dims[1], height = dims[2], width = dims[3];
+  auto channel = dims[1], height = dims[2], width = dims[3];
   size_t memory_size =
       height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
   ofm_tensor->reset_data_ptr(fpga_malloc(memory_size));
@@ -181,16 +194,16 @@ float filter_find_max(framework::Tensor *filter_tensor) {
 int get_plit_num(framework::Tensor *filter_tensor) {
   auto dims = filter_tensor->dims();
-  int chw = dims[1] * dims[2] * dims[3];
-  int num = dims[0];
+  auto chw = dims[1] * dims[2] * dims[3];
+  auto num = dims[0];
   int div_capacity = filter::calc_division_capacity(chw);
   return filter::calc_split_num(num, div_capacity);
 }

 int get_element_num_per_div(framework::Tensor *filter_tensor, int group_num) {
   auto dims = filter_tensor->dims();
-  int chw = dims[1] * dims[2] * dims[3];
-  int num = dims[0];
+  auto chw = dims[1] * dims[2] * dims[3];
+  auto num = dims[0];
   int div_capacity = filter::calc_division_capacity(chw);
   return filter::calc_num_per_div(num, group_num, div_capacity);
 }
@@ -206,25 +219,10 @@ int get_aligned_filter_num(int num) {
 void format_filter(framework::Tensor *filter_tensor, float max_value,
                    int group_num) {
   auto dims = filter_tensor->dims();
-  int num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
+  auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
   auto data_ptr = filter_tensor->mutable_data<float>();
   size_t memory_size = num * channel * height * width * sizeof(float);
   float *new_data = (float *)fpga_malloc(memory_size);
   fpga_copy(new_data, data_ptr, memory_size);
   filter::format_filter(&new_data, num, channel, height, width, group_num,
                         max_value);
   filter_tensor->reset_data_ptr(new_data);
 }

 void format_fc_matrix(framework::Tensor *filter_tensor, float max_value,
                       int group_num, int height, int width) {
   auto dims = filter_tensor->dims();
   PADDLE_MOBILE_ENFORCE(height == 1 && width == 1,
                         "IFM should be flattened for FC");
   int num = dims[1], channel = dims[0] / height / width;
   auto data_ptr = filter_tensor->mutable_data<float>();
   size_t memory_size = num * channel * height * width * sizeof(float);
-  float *new_data = (float *)fpga_malloc(memory_size);
+  auto new_data = (float *)fpga_malloc(memory_size);
   fpga_copy(new_data, data_ptr, memory_size);
   filter::format_filter(&new_data, num, channel, height, width, group_num,
                         max_value);
@@ -237,5 +235,19 @@ void format_bias_scale_array(float **bias_scale_array,
                                element_num_per_division, num);
 }

+void format_concat_output(framework::Tensor *out, int height, int width,
+                          int image_num, uint32_t *channel_num) {
+  int sum_channel = 0, sum_cw = 0;
+  for (int i = 0; i < image_num; i++) {
+    sum_channel += channel_num[i];
+  }
+  sum_cw = align_to_x(width * sum_channel, IMAGE_ALIGNMENT);
+  auto data_ptr = fpga_malloc(height * sum_cw * sizeof(half));
+  auto ddim = framework::make_ddim({-1, sum_channel, height, width});
+  out->Resize(ddim);
+  out->reset_data_ptr(data_ptr);
+}
 }  // namespace fpga
 }  // namespace paddle_mobile
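As a worked example of the sizing logic in format_concat_output (illustrative numbers only, not taken from the commit): concatenating two feature maps of 24 and 13 channels at width 5 gives sum_channel = 37, so each output row holds align_to_x(5 * 37, IMAGE_ALIGNMENT) = align_to_x(185, 16) = 192 half elements; with height 8 the allocated buffer is 8 * 192 * sizeof(half) = 3072 bytes.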
src/fpga/api.h  (view file @ 5892754e)
@@ -92,12 +92,24 @@ struct ConvArgs {
   struct ImageOutputArgs output;
 };

+struct ConcatArgs {
+  uint32_t image_num;
+  half** images_in;
+  float** scales_in;
+  void* image_out;
+  float* scale_out;
+  uint32_t* channel_num;
+  uint32_t height;
+  uint32_t width;
+};

 struct WrapperConvArgs {
   uint32_t split_num;
   uint32_t group_num;
   uint32_t filter_num;
   struct ImageOutputArgs output;
-  struct ConvArgs* args;
+  struct ConvArgs* conv_args;
+  struct ConcatArgs concat_arg;
 };

 struct PoolingArgs {
@@ -176,6 +188,7 @@ int PerformBypass(const struct BypassArgs& args);
 int ComputeFpgaConv(const struct WrapperConvArgs& args);
 int ComputeFpgaPool(const struct PoolingArgs& args);
 int ComputeFpgaEWAdd(const struct EWAddArgs& args);
+int ComputeFPGAConcat(const struct ConcatArgs& args);

 static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }

 void format_image(framework::Tensor* image_tensor);
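For orientation, a minimal host-side sketch of driving the new concat API declared above (hypothetical values; feature_a/feature_b, scale_a/scale_b and the output allocation are assumed, and real callers let format_concat_output size the output as in api.cpp):

  // Hypothetical two-way concat of 24- and 40-channel 14x14 feature maps.
  half* inputs[2] = {feature_a, feature_b};    // assumed pre-formatted device images
  float* scales[2] = {scale_a, scale_b};       // per-image quantization scales
  uint32_t channels[2] = {24, 40};
  float out_scale[2];                          // filled by the concat path
  void* out = fpga::fpga_malloc(
      14 * fpga::align_to_x(14 * 64, IMAGE_ALIGNMENT) * sizeof(half));

  fpga::ConcatArgs args;
  args.image_num = 2;
  args.images_in = inputs;
  args.scales_in = scales;
  args.image_out = out;
  args.scale_out = out_scale;
  args.channel_num = channels;
  args.height = 14;
  args.width = 14;
  fpga::ComputeFPGAConcat(args);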
@@ -188,10 +201,10 @@ int get_aligned_filter_num(int num);
 void format_filter(framework::Tensor* filter_tensor, float max_value, int group_num);
 void format_fc_matrix(framework::Tensor* filter_tensor, float max_value, int group_num,
                       int height = 1, int width = 1);
 void format_bias_scale_array(float** bias_scale_array, int element_num_per_division, int num);
+void format_concat_output(framework::Tensor* out, int height, int width, int image_num,
+                          uint32_t* channel_num);

 }  // namespace fpga
 }  // namespace paddle_mobile
src/fpga/image.cpp  (view file @ 5892754e)
@@ -62,6 +62,10 @@ void format_image(float **data_in, int channel, int height, int width) {
   align_element_conv(data_in, height, channel * width);
 }

+void concat_images(int16_t **images_in, float **scales_in, void *image_out,
+                   float *scale_out, int image_num, uint32_t *channel_num,
+                   int height, int width) {}

 }  // namespace image
 }  // namespace fpga
 }  // namespace paddle_mobile
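Note that concat_images() is added here only as an empty stub; the actual data movement is left for a later commit. For readers who want to see what "concat featuremaps along channel direction" amounts to, the following is a hedged, self-contained CPU sketch only — not the committed implementation. It assumes half data stored as int16_t in HWC order, each row padded to a 16-element boundary (mirroring IMAGE_ALIGNMENT), and it omits the scale handling entirely:

  #include <cstdint>
  #include <cstring>

  static inline int align16(int num) { return (num + 15) / 16 * 16; }  // mirrors align_to_x(num, 16)

  // Reference-only channel-wise concat of `image_num` HWC images into one buffer.
  void concat_images_reference(int16_t **images_in, int16_t *image_out,
                               int image_num, const uint32_t *channel_num,
                               int height, int width) {
    int sum_channel = 0;
    for (int i = 0; i < image_num; i++) sum_channel += channel_num[i];
    const int out_cw = align16(width * sum_channel);  // padded row length of the output

    for (int h = 0; h < height; h++) {
      int channel_offset = 0;                          // where image i's channels land per pixel
      for (int i = 0; i < image_num; i++) {
        const int in_cw = align16(width * channel_num[i]);
        for (int w = 0; w < width; w++) {
          // copy one pixel's channel block from input i into the fused output row
          std::memcpy(image_out + h * out_cw + w * sum_channel + channel_offset,
                      images_in[i] + h * in_cw + w * channel_num[i],
                      channel_num[i] * sizeof(int16_t));
        }
        channel_offset += channel_num[i];
      }
    }
  }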
src/fpga/image.h  (view file @ 5892754e)
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once

+#include <stdint.h>
+
+#define IMAGE_ALIGNMENT 16  // Aligned to 16

 namespace paddle_mobile {
 namespace fpga {
@@ -21,6 +24,10 @@ namespace image {
 void convert_to_hwc(float** data_in, int channel, int height, int width);
 void align_element_conv(float** data_in, int height, int cw);
 void format_image(float** data_in, int channel, int height, int width);
+void concat_images(int16_t** images_in, float** scales_in, void* image_out,
+                   float* scale_out, int image_num, uint32_t* channel_num,
+                   int height, int width);  // Concat featuremaps along channel direction

 }  // namespace image
 }  // namespace fpga
 }  // namespace paddle_mobile
src/operators/kernel/fpga/concat_kernel.cpp  (view file @ 5892754e)
@@ -21,31 +21,44 @@ namespace operators {
template <>
bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
  auto inputs = param->Inputs();
  auto out = param->Out();
  auto image_num = inputs.size();
  auto images_in = (half **)fpga::fpga_malloc(image_num * sizeof(int *));
  auto scales_in = (float **)fpga::fpga_malloc(image_num * sizeof(float *));
  auto channel_num = (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t));
  auto height = inputs[0]->dims()[2];
  auto width = inputs[0]->dims()[3];
  for (int i = 0; i < image_num; i++) {
    auto input = inputs[i];
    PADDLE_MOBILE_ENFORCE(input->dims()[2] == height && input->dims()[3] == width,
                          "Image height & width should be unified");
    images_in[i] = (half *)input->data<float>();
    channel_num[i] = (uint32_t)inputs[i]->dims()[1];
    scales_in[i] = input->scale;
  }
  fpga::format_concat_output(out, (int)height, (int)width, (int)image_num, channel_num);

  fpga::ConcatArgs concatArgs;
  concatArgs.image_num = (uint32_t)image_num;
  concatArgs.images_in = images_in;
  concatArgs.scales_in = scales_in;
  concatArgs.image_out = (half *)out->mutable_data<float>();
  concatArgs.scale_out = out->scale;
  concatArgs.channel_num = channel_num;
  concatArgs.height = (uint32_t)height;
  concatArgs.width = (uint32_t)width;
  param->SetFpgaArgs(concatArgs);
  return true;
}

template <>
void ConcatKernel<FPGA, float>::Compute(const ConcatParam<FPGA> &param) const {
  auto inputs = param.Inputs();
  auto *out = param.Out();
  int64_t axis = param.Axis();
  out->mutable_data<half>();
  DDim out_dim = out->dims();
  int pixels = out_dim[1] * out_dim[2];
  auto out_channel = out_dim[3];
  auto out_offset = 0;
  for (int i = 0; i < inputs.size(); ++i) {
    auto input = inputs[i];
    auto channels = input->dims()[3];
    out_offset += channels;
    auto src = input->data<half>();
    for (int j = 0; j < pixels; ++j) {
      auto dst = out->mutable_data<half>() + out_offset;
      memory::Copy(dst, src, sizeof(half));
    }
  }
  ComputeFPGAConcat(param.FpgaArgs());
}
template class ConcatKernel<FPGA, float>;
src/operators/kernel/fpga/conv_add_bn_kernel.cpp  (view file @ 5892754e)
@@ -22,13 +22,13 @@ namespace operators {
 template <>
 bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
   bool relu_enabled = false;
-  auto *input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<Tensor *>(param->Input());
   auto input_ptr = input->data<float>();
-  const Tensor *bias = param->Bias();
+  auto bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto *filter = const_cast<Tensor *>(param->Filter());
-  Tensor *out = param->Output();
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
@@ -40,10 +40,10 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
                         "Output channel should be equal to bias number");
   const int channel = out->dims()[1];
-  auto *bs_ptr =
+  auto bs_ptr =
       reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
-  auto *new_scale = new Tensor();
-  auto *new_bias = new Tensor();
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
   auto new_scale_ptr = new_scale->mutable_data<float>({channel});
   auto new_bias_ptr = new_bias->mutable_data<float>({channel});
@@ -75,35 +75,68 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
   convArgs.filter_num = (uint32_t)filter->dims()[0];
   convArgs.output.address = out_ptr;
   convArgs.output.scale_address = out->scale;
-  convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * sizeof(fpga::ConvArgs));
+  convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * sizeof(fpga::ConvArgs));
+  convArgs.concat_arg.image_num = convArgs.split_num;
+  convArgs.concat_arg.image_out = out_ptr;
+  convArgs.concat_arg.scale_out = out->scale;
+  convArgs.concat_arg.height = (uint32_t)filter->dims()[2];
+  convArgs.concat_arg.width = (uint32_t)filter->dims()[3];
+  int n = convArgs.split_num;
+  convArgs.concat_arg.images_in = (half **)fpga::fpga_malloc(n * sizeof(int *));
+  convArgs.concat_arg.scales_in = (float **)fpga::fpga_malloc(n * sizeof(float *));
+  convArgs.concat_arg.channel_num = (uint32_t *)fpga::fpga_malloc(n * sizeof(uint32_t));
+  convArgs.concat_arg.image_out = out_ptr;
   param->SetFpgaArgs(convArgs);
   int element_num = fpga::get_aligned_filter_element_num(
       filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
-  int n = convArgs.split_num;
   for (int i = 0; i < n; i++) {
-    convArgs.args[i].relu_enabled = relu_enabled;
-    convArgs.args[i].group_num = (uint32_t)param->Groups();
-    convArgs.args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
-    convArgs.args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
-    convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2];
-    convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3];
-    convArgs.args[i].image.address = input_ptr;
-    convArgs.args[i].image.channels = (uint32_t)input->dims()[1];
-    convArgs.args[i].image.height = (uint32_t)input->dims()[2];
-    convArgs.args[i].image.width = (uint32_t)input->dims()[3];
-    convArgs.args[i].image.pad_height = (uint32_t)param->Paddings()[0];
-    convArgs.args[i].image.pad_width = (uint32_t)param->Paddings()[1];
-    convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
-    convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
-    convArgs.args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(channel - (n - 1) * element_num_per_div) : element_num_per_div);
-    convArgs.args[i].image.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].relu_enabled = relu_enabled;
+    convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
+    convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
+    convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
+    convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
+    convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
+    convArgs.conv_args[i].image.address = input_ptr;
+    convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
+    convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
+    convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
+    convArgs.conv_args[i].image.scale_address = input->scale;
+    convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
+    convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
+    convArgs.conv_args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
+    convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
+    convArgs.conv_args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(channel - (n - 1) * element_num_per_div) : element_num_per_div);
+    if (n > 1) {
+      convArgs.conv_args[i].output.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float));
+      convArgs.conv_args[i].output.address = fpga::fpga_malloc(input->dims()[2] * input->dims()[3] * convArgs.conv_args[i].filter_num * sizeof(half));
+    } else {
+      convArgs.conv_args[i].output.scale_address = out->scale;
+      convArgs.conv_args[i].output.address = out_ptr;
+    }
+    convArgs.concat_arg.images_in[i] = (half *)convArgs.conv_args[i].output.address;
+    convArgs.concat_arg.scales_in[i] = (float *)convArgs.conv_args[i].sb_address;
+    convArgs.concat_arg.channel_num[i] = convArgs.conv_args[i].filter_num;
   }
   return true;
 }
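To make the split bookkeeping above concrete (illustrative numbers only, not taken from the commit): with channel = 300 output filters and element_num_per_div = 128 filters per division, split_num n would be 3; splits 0 and 1 each take element_num_per_div = 128 filters, and the last split takes get_aligned_filter_num(300 − 2 × 128) = get_aligned_filter_num(44) filters. Each split writes to its own intermediate output buffer when n > 1, and those partial outputs are stitched back into the final tensor by the concat_arg descriptor via ComputeFPGAConcat.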
src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp  (view file @ 5892754e)
@@ -23,12 +23,12 @@ template <>
 bool ConvAddBNReluKernel<FPGA, float>::Init(
     FusionConvAddBNReluParam<FPGA> *param) {
   bool relu_enabled = true;
-  Tensor *input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<Tensor *>(param->Input());
   auto input_ptr = input->data<float>();
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  Tensor *filter = const_cast<Tensor *>(param->Filter());
-  Tensor *out = param->Output();
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
   auto bn_scale_ptr = param->InputScale()->data<float>();
@@ -39,9 +39,9 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
                         "Output channel should be equal to bias number");
   const int channel = out->dims()[1];
-  float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
-  Tensor *new_scale = new Tensor();
-  Tensor *new_bias = new Tensor();
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
   auto new_scale_ptr = new_scale->mutable_data<float>({channel});
   auto new_bias_ptr = new_bias->mutable_data<float>({channel});
@@ -73,8 +73,8 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
   convArgs.filter_num = (uint32_t)filter->dims()[0];
   convArgs.output.address = out_ptr;
   convArgs.output.scale_address = out->scale;
-  convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * sizeof(fpga::ConvArgs));
+  convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * sizeof(fpga::ConvArgs));
   param->SetFpgaArgs(convArgs);
   int element_num = fpga::get_aligned_filter_element_num(
@@ -82,26 +82,28 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
   int n = convArgs.split_num;
   for (int i = 0; i < n; i++) {
-    convArgs.args[i].relu_enabled = relu_enabled;
-    convArgs.args[i].group_num = (uint32_t)param->Groups();
-    convArgs.args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
-    convArgs.args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
-    convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2];
-    convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3];
-    convArgs.args[i].image.address = input_ptr;
-    convArgs.args[i].image.channels = (uint32_t)input->dims()[1];
-    convArgs.args[i].image.height = (uint32_t)input->dims()[2];
-    convArgs.args[i].image.width = (uint32_t)input->dims()[3];
-    convArgs.args[i].image.pad_height = (uint32_t)param->Paddings()[0];
-    convArgs.args[i].image.pad_width = (uint32_t)param->Paddings()[1];
-    convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
-    convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
-    convArgs.args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(channel - (n - 1) * element_num_per_div) : element_num_per_div);
-    convArgs.args[i].image.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].relu_enabled = relu_enabled;
+    convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
+    convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
+    convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
+    convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
+    convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
+    convArgs.conv_args[i].image.address = input_ptr;
+    convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
+    convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
+    convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
+    convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
+    convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
+    convArgs.conv_args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
+    convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
+    convArgs.conv_args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(channel - (n - 1) * element_num_per_div) : element_num_per_div);
+    convArgs.conv_args[i].output.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].image.scale_address = input->scale;
   }
   return true;
src/operators/kernel/fpga/conv_add_relu_kernel.cpp  (view file @ 5892754e)
@@ -22,17 +22,17 @@ namespace operators {
 template <>
 bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
   bool relu_enabled = true;
-  Tensor *input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<Tensor *>(param->Input());
   auto input_ptr = input->data<float>();
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto *filter = const_cast<Tensor *>(param->Filter());
-  Tensor *out = param->Output();
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
                         "Output channel should be equal to bias number");
   int channel = out->dims()[1];
-  auto *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
   for (int i = 0; i < channel; i++) {
     bs_ptr[i + channel] = 1;
     bs_ptr[i] = bias_ptr[i];
@@ -55,8 +55,8 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
   convArgs.filter_num = (uint32_t)filter->dims()[0];
   convArgs.output.address = out_ptr;
   convArgs.output.scale_address = out->scale;
-  convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * sizeof(fpga::ConvArgs));
+  convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * sizeof(fpga::ConvArgs));
   param->SetFpgaArgs(convArgs);
   int element_num = fpga::get_aligned_filter_element_num(
@@ -64,26 +64,28 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
   int n = convArgs.split_num;
   for (int i = 0; i < n; i++) {
-    convArgs.args[i].relu_enabled = relu_enabled;
-    convArgs.args[i].group_num = (uint32_t)param->Groups();
-    convArgs.args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
-    convArgs.args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
-    convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2];
-    convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3];
-    convArgs.args[i].image.address = input_ptr;
-    convArgs.args[i].image.channels = (uint32_t)input->dims()[1];
-    convArgs.args[i].image.height = (uint32_t)input->dims()[2];
-    convArgs.args[i].image.width = (uint32_t)input->dims()[3];
-    convArgs.args[i].image.pad_height = (uint32_t)param->Paddings()[0];
-    convArgs.args[i].image.pad_width = (uint32_t)param->Paddings()[1];
-    convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
-    convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
-    convArgs.args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(channel - (n - 1) * element_num_per_div) : element_num_per_div);
-    convArgs.args[i].image.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].relu_enabled = relu_enabled;
+    convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
+    convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
+    convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
+    convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
+    convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
+    convArgs.conv_args[i].image.address = input_ptr;
+    convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
+    convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
+    convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
+    convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
+    convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
+    convArgs.conv_args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
+    convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
+    convArgs.conv_args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(channel - (n - 1) * element_num_per_div) : element_num_per_div);
+    convArgs.conv_args[i].output.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].image.scale_address = input->scale;
   }
   return true;
 }
src/operators/kernel/fpga/conv_bn_kernel.cpp  (view file @ 5892754e)
@@ -23,10 +23,10 @@ namespace operators {
 template <>
 bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
   bool relu_enabled = false;
-  Tensor *input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<Tensor *>(param->Input());
   auto input_ptr = input->data<float>();
-  Tensor *filter = const_cast<Tensor *>(param->Filter());
-  Tensor *out = param->Output();
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
   auto bn_scale_ptr = param->InputScale()->data<float>();
@@ -36,10 +36,10 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
                         "Output channel should be equal to bias number");
   const int channel = out->dims()[1];
-  float *bs_ptr =
+  auto bs_ptr =
       reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
-  Tensor *new_scale = new Tensor();
-  Tensor *new_bias = new Tensor();
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
   auto new_scale_ptr = new_scale->mutable_data<float>({channel});
   auto new_bias_ptr = new_bias->mutable_data<float>({channel});
@@ -70,8 +70,8 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
   convArgs.filter_num = (uint32_t)filter->dims()[0];
   convArgs.output.address = out_ptr;
   convArgs.output.scale_address = out->scale;
-  convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * sizeof(fpga::ConvArgs));
+  convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * sizeof(fpga::ConvArgs));
   param->SetFpgaArgs(convArgs);
   int element_num = fpga::get_aligned_filter_element_num(
@@ -79,26 +79,28 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
   int n = convArgs.split_num;
   for (int i = 0; i < n; i++) {
-    convArgs.args[i].relu_enabled = relu_enabled;
-    convArgs.args[i].group_num = (uint32_t)param->Groups();
-    convArgs.args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
-    convArgs.args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
-    convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2];
-    convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3];
-    convArgs.args[i].image.address = input_ptr;
-    convArgs.args[i].image.channels = (uint32_t)input->dims()[1];
-    convArgs.args[i].image.height = (uint32_t)input->dims()[2];
-    convArgs.args[i].image.width = (uint32_t)input->dims()[3];
-    convArgs.args[i].image.pad_height = (uint32_t)param->Paddings()[0];
-    convArgs.args[i].image.pad_width = (uint32_t)param->Paddings()[1];
-    convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
-    convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
-    convArgs.args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(channel - (n - 1) * element_num_per_div) : element_num_per_div);
-    convArgs.args[i].image.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].relu_enabled = relu_enabled;
+    convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
+    convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
+    convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
+    convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
+    convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
+    convArgs.conv_args[i].image.address = input_ptr;
+    convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
+    convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
+    convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
+    convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
+    convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
+    convArgs.conv_args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
+    convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
+    convArgs.conv_args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(channel - (n - 1) * element_num_per_div) : element_num_per_div);
+    convArgs.conv_args[i].output.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].image.scale_address = input->scale;
   }
   return true;
 }
src/operators/kernel/fpga/conv_bn_relu_kernel.cpp  (view file @ 5892754e)
@@ -22,10 +22,10 @@ namespace operators {
 template <>
 bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
   bool relu_enabled = true;
-  Tensor *input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<Tensor *>(param->Input());
   auto input_ptr = input->data<float>();
-  Tensor *filter = const_cast<Tensor *>(param->Filter());
-  Tensor *out = param->Output();
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
   auto bn_scale_ptr = param->InputScale()->data<float>();
@@ -34,9 +34,9 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
   PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
                         "Output channel should be equal to bias number");
   const int channel = out->dims()[1];
-  float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
-  Tensor *new_scale = new Tensor();
-  Tensor *new_bias = new Tensor();
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
   auto new_scale_ptr = new_scale->mutable_data<float>({channel});
   auto new_bias_ptr = new_bias->mutable_data<float>({channel});
@@ -67,8 +67,8 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
   convArgs.filter_num = (uint32_t)filter->dims()[0];
   convArgs.output.address = out_ptr;
   convArgs.output.scale_address = out->scale;
-  convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * sizeof(fpga::ConvArgs));
+  convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * sizeof(fpga::ConvArgs));
   param->SetFpgaArgs(convArgs);
   int element_num = fpga::get_aligned_filter_element_num(
@@ -76,26 +76,28 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
   int n = convArgs.split_num;
   for (int i = 0; i < n; i++) {
-    convArgs.args[i].relu_enabled = relu_enabled;
-    convArgs.args[i].group_num = (uint32_t)param->Groups();
-    convArgs.args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
-    convArgs.args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
-    convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2];
-    convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3];
-    convArgs.args[i].image.address = input_ptr;
-    convArgs.args[i].image.channels = (uint32_t)input->dims()[1];
-    convArgs.args[i].image.height = (uint32_t)input->dims()[2];
-    convArgs.args[i].image.width = (uint32_t)input->dims()[3];
-    convArgs.args[i].image.pad_height = (uint32_t)param->Paddings()[0];
-    convArgs.args[i].image.pad_width = (uint32_t)param->Paddings()[1];
-    convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
-    convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
-    convArgs.args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(channel - (n - 1) * element_num_per_div) : element_num_per_div);
-    convArgs.args[i].image.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].relu_enabled = relu_enabled;
+    convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
+    convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
+    convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
+    convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
+    convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
+    convArgs.conv_args[i].image.address = input_ptr;
+    convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
+    convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
+    convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
+    convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
+    convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
+    convArgs.conv_args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
+    convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
+    convArgs.conv_args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(channel - (n - 1) * element_num_per_div) : element_num_per_div);
+    convArgs.conv_args[i].output.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].image.scale_address = input->scale;
  }
   return true;
 }
src/operators/kernel/fpga/fc_relu_kernel.cpp  (view file @ 5892754e)
@@ -20,16 +20,16 @@ namespace operators {
 template <>
 bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
   bool relu_enabled = true;
-  auto *input_x = const_cast<LoDTensor *>(param->InputX());
+  auto input_x = const_cast<LoDTensor *>(param->InputX());
   auto input_x_ptr = input_x->data<float>();
-  auto *filter = const_cast<Tensor *>(param->InputY());
-  const Tensor *input_z = param->InputZ();
+  auto filter = const_cast<Tensor *>(param->InputY());
+  auto input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
-  Tensor *out = param->Out();
+  auto out = param->Out();
   PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
                         "Image channel should be equal to weight number");
   int channel = (uint32_t)out->dims()[1];
-  auto *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
   for (int i = 0; i < channel; i++) {
     bs_ptr[i + channel] = 1;
     bs_ptr[i] = input_z_ptr[i];
@@ -60,8 +60,8 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
   convArgs.filter_num = (uint32_t)filter->dims()[0];
   convArgs.output.address = out_ptr;
   convArgs.output.scale_address = out->scale;
-  convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * sizeof(fpga::ConvArgs));
+  convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * sizeof(fpga::ConvArgs));
   param->SetFpgaArgs(convArgs);
   int element_num = fpga::get_aligned_filter_element_num(
@@ -69,26 +69,28 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
   int n = convArgs.split_num;
   for (int i = 0; i < n; i++) {
-    convArgs.args[i].relu_enabled = relu_enabled;
-    convArgs.args[i].group_num = 1;
-    convArgs.args[i].kernel.stride_h = 1;
-    convArgs.args[i].kernel.stride_w = 1;
-    convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2];
-    convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3];
-    convArgs.args[i].image.address = input_x_ptr;
-    convArgs.args[i].image.channels = (uint32_t)input_x->dims()[1];
-    convArgs.args[i].image.height = (uint32_t)input_x->dims()[2];
-    convArgs.args[i].image.width = (uint32_t)input_x->dims()[3];
-    convArgs.args[i].image.pad_height = 0;
-    convArgs.args[i].image.pad_width = 0;
-    convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
-    convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
-    convArgs.args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(channel - (n - 1) * element_num_per_div) : element_num_per_div);
-    convArgs.args[i].image.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].relu_enabled = relu_enabled;
+    convArgs.conv_args[i].group_num = 1;
+    convArgs.conv_args[i].kernel.stride_h = 1;
+    convArgs.conv_args[i].kernel.stride_w = 1;
+    convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
+    convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
+    convArgs.conv_args[i].image.address = input_x_ptr;
+    convArgs.conv_args[i].image.channels = (uint32_t)input_x->dims()[1];
+    convArgs.conv_args[i].image.height = (uint32_t)input_x->dims()[2];
+    convArgs.conv_args[i].image.width = (uint32_t)input_x->dims()[3];
+    convArgs.conv_args[i].image.pad_height = 0;
+    convArgs.conv_args[i].image.pad_width = 0;
+    convArgs.conv_args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
+    convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
+    convArgs.conv_args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(channel - (n - 1) * element_num_per_div) : element_num_per_div);
+    convArgs.conv_args[i].output.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].image.scale_address = input_x->scale;
   }
   return true;
 }
src/operators/kernel/fpga/fusion_fc_kernel.cpp  (view file @ 5892754e)
@@ -21,17 +21,17 @@ namespace operators {
 template <>
 bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
   bool relu_enabled = false;
-  auto *input_x = const_cast<LoDTensor *>(param->InputX());
+  auto input_x = const_cast<LoDTensor *>(param->InputX());
   auto input_x_ptr = input_x->data<float>();
-  auto *filter = const_cast<Tensor *>(param->InputY());
+  auto filter = const_cast<Tensor *>(param->InputY());
   const Tensor *input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
-  Tensor *out = param->Out();
+  auto out = param->Out();
   PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
                         "Image channel should be equal to weight number");
   int channel = (uint32_t)out->dims()[1];
-  auto *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
   for (int i = 0; i < channel; i++) {
     bs_ptr[i + channel] = 1;
     bs_ptr[i] = input_z_ptr[i];
@@ -61,8 +61,8 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
   convArgs.filter_num = (uint32_t)filter->dims()[0];
   convArgs.output.address = out_ptr;
   convArgs.output.scale_address = out->scale;
-  convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * sizeof(fpga::ConvArgs));
+  convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * sizeof(fpga::ConvArgs));
   param->SetFpgaArgs(convArgs);
   int element_num = fpga::get_aligned_filter_element_num(
@@ -70,26 +70,28 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
   int n = convArgs.split_num;
   for (int i = 0; i < n; i++) {
-    convArgs.args[i].relu_enabled = relu_enabled;
-    convArgs.args[i].group_num = 1;
-    convArgs.args[i].kernel.stride_h = 1;
-    convArgs.args[i].kernel.stride_w = 1;
-    convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2];
-    convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3];
-    convArgs.args[i].image.address = input_x_ptr;
-    convArgs.args[i].image.channels = (uint32_t)input_x->dims()[1];
-    convArgs.args[i].image.height = (uint32_t)input_x->dims()[2];
-    convArgs.args[i].image.width = (uint32_t)input_x->dims()[3];
-    convArgs.args[i].image.pad_height = 0;
-    convArgs.args[i].image.pad_width = 0;
-    convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
-    convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
-    convArgs.args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(channel - (n - 1) * element_num_per_div) : element_num_per_div);
-    convArgs.args[i].image.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].relu_enabled = relu_enabled;
+    convArgs.conv_args[i].group_num = 1;
+    convArgs.conv_args[i].kernel.stride_h = 1;
+    convArgs.conv_args[i].kernel.stride_w = 1;
+    convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
+    convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
+    convArgs.conv_args[i].image.address = input_x_ptr;
+    convArgs.conv_args[i].image.channels = (uint32_t)input_x->dims()[1];
+    convArgs.conv_args[i].image.height = (uint32_t)input_x->dims()[2];
+    convArgs.conv_args[i].image.width = (uint32_t)input_x->dims()[3];
+    convArgs.conv_args[i].image.pad_height = 0;
+    convArgs.conv_args[i].image.pad_width = 0;
+    convArgs.conv_args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
+    convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
+    convArgs.conv_args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(channel - (n - 1) * element_num_per_div) : element_num_per_div);
+    convArgs.conv_args[i].output.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].image.scale_address = input_x->scale;
  }
   return true;
 }
src/operators/math/gemm.cpp  (view file @ 5892754e)
@@ -734,7 +734,7 @@ void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
 #endif
     }
   }
-  WriteWithBnAddRelu(mc, nc, c, C, ldc, new_scale, new_bias, bias);
+  //  WriteWithBnAddRelu(mc, nc, c, C, ldc, new_scale, new_bias, bias);
 }

 void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
src/operators/op_param.h  (view file @ 5892754e)
@@ -483,6 +483,15 @@ class ConcatParam : public OpParam {
   vector<GType *> inputs_;
   GType *out_;
   int axis_;
+#ifdef PADDLE_MOBILE_FPGA
+
+ private:
+  fpga::ConcatArgs fpga_concat_args;
+
+ public:
+  const fpga::ConcatArgs &FpgaArgs() const { return fpga_concat_args; }
+  void SetFpgaArgs(const fpga::ConcatArgs &args) { fpga_concat_args = args; }
+#endif
 };
 #endif
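The pattern this accessor pair enables, sketched below in the style of concat_kernel.cpp above (a sketch, not additional committed code): Init() builds the FPGA argument descriptor once and caches it on the op parameters, and Compute() only retrieves and submits it.

  // Inside ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param):
  fpga::ConcatArgs concatArgs;
  // ... fill images_in / scales_in / channel_num / image_out / height / width ...
  param->SetFpgaArgs(concatArgs);               // cached on the op parameters

  // Inside ConcatKernel<FPGA, float>::Compute(const ConcatParam<FPGA> &param):
  fpga::ComputeFPGAConcat(param.FpgaArgs());    // submit the cached descriptor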