PaddlePaddle / Paddle-Lite — commit 5e5aaeb0
Authored Sep 05, 2018 by liuruilong

Merge remote-tracking branch 'upstream/develop' into develop

Parents: aabcc54d, d39dc7a7

Showing 25 changed files with 1279 additions and 97 deletions (+1279 −97)
src/common/types.cpp                                                +2    -0
src/common/types.h                                                  +1    -0
src/fpga/api.cpp                                                    +58   -13
src/fpga/api.h                                                      +8    -0
src/fpga/bias_scale.cpp                                             +25   -4
src/fpga/bias_scale.h                                               +2    -0
src/fpga/filter.cpp                                                 +178  -9
src/fpga/filter.h                                                   +5    -4
src/fpga/image.cpp                                                  +54   -0
src/fpga/image.h                                                    +13   -0
src/framework/tensor.h                                              +32   -25
src/operators/fusion_conv_bn_add_relu_op.cpp                        +61   -0
src/operators/fusion_conv_bn_add_relu_op.h                          +125  -0
src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp                +66   -0
src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h   +147  -0
src/operators/kernel/conv_bn_add_relu_kernel.h                      +45   -0
src/operators/math/gemm.cpp                                         +271  -31
src/operators/math/gemm.h                                           +8    -4
src/operators/math/math_function.cpp                                +4    -4
src/operators/math/math_function.h                                  +1    -1
src/operators/op_param.h                                            +113  -0
test/CMakeLists.txt                                                 +3    -0
test/common/test_gemm_accuracy.cpp                                  +2    -2
test/fpga/test_format_data.cpp                                      +49   -0
tools/op.cmake                                                      +6    -0
src/common/types.cpp
@@ -26,6 +26,7 @@
 const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu";
 const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu";
 const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
+const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU = "fusion_conv_bn_add_relu";
 const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
 const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
 const char *G_OP_TYPE_FC = "fusion_fc";
@@ -79,6 +80,7 @@
     {G_OP_TYPE_BOX_CODER,
      {{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}},
     {G_OP_TYPE_FUSION_CONV_ADD_BN_RELU, {{"Input"}, {"Out"}}},
+    {G_OP_TYPE_FUSION_CONV_BN_ADD_RELU, {{"Input"}, {"Out"}}},
     {G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}},
     {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
     {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
src/common/types.h
@@ -90,6 +90,7 @@
 extern const char *G_OP_TYPE_FC;
 extern const char *G_OP_TYPE_FUSION_CONV_ADD;
 extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU;
 extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU;
 extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU;
src/fpga/api.cpp
@@ -12,27 +12,18 @@
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <errno.h>
+#include "api.h"
 #include <fcntl.h>
-#include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/ioctl.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <sys/types.h>
 #include <algorithm>
-#include <cmath>
-#include <cstdio>
 #include <cstring>
-#include "api.h"
+#include "bias_scale.h"
+#include "filter.h"
+#include "image.h"
 #define FPGA_TEST_MODE
+#ifdef FPGA_TEST_MODE
+#include "common/log.h"
+#endif

 namespace paddle_mobile {
 namespace fpga {
@@ -164,5 +155,59 @@ int PerformBypass(const struct BypassArgs &args) {
   return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
 }

+void format_image(framework::Tensor *image_tensor) {
+  auto dims = image_tensor->dims();
+  int channel = dims[1], height = dims[2], width = dims[3];
+  auto data_ptr = image_tensor->mutable_data<float>();
+  size_t memory_size = channel * height * width * sizeof(float);
+  float *new_data = (float *)fpga_malloc(memory_size);
+  fpga_copy(new_data, data_ptr, memory_size);
+  image::format_image(&new_data, channel, height, width);
+  image_tensor->reset_data_ptr(new_data);
+}
+
+void format_ofm(framework::Tensor *ofm_tensor) {
+  auto dims = ofm_tensor->dims();
+  int channel = dims[1], height = dims[2], width = dims[3];
+  size_t memory_size =
+      height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
+  ofm_tensor->reset_data_ptr(fpga_malloc(memory_size));
+}
+
+void format_filter(framework::Tensor *filter_tensor, int group_num) {
+  auto dims = filter_tensor->dims();
+  int num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
+  auto data_ptr = filter_tensor->mutable_data<float>();
+  size_t memory_size = num * channel * height * width * sizeof(float);
+  float *new_data = (float *)fpga_malloc(memory_size);
+  fpga_copy(new_data, data_ptr, memory_size);
+  float max_value = filter::find_max(new_data, num * channel * height * width);
+  filter::format_filter(&new_data, num, channel, height, width, group_num,
+                        max_value);
+  filter_tensor->reset_data_ptr(new_data);
+}
+
+void format_fc_matrix(framework::Tensor *filter_tensor, int group_num,
+                      int height, int width) {
+  auto dims = filter_tensor->dims();
+  PADDLE_MOBILE_ENFORCE(dims[0] % (height * width) == 0,
+                        "Filter number should be divisible by group number");
+  int num = dims[1], channel = dims[0] / height / width;
+  auto data_ptr = filter_tensor->mutable_data<float>();
+  size_t memory_size = num * channel * height * width * sizeof(float);
+  float *new_data = (float *)fpga_malloc(memory_size);
+  fpga_copy(new_data, data_ptr, memory_size);
+  float max_value = filter::find_max(new_data, num * channel * height * width);
+  filter::format_filter(&new_data, num, channel, height, width, group_num,
+                        max_value);
+  filter_tensor->reset_data_ptr(new_data);
+}
+
+void format_bias_scale_array(float **bias_scale_array,
+                             int element_num_per_division, int num) {
+  bias_scale::format_bias_scale_array(bias_scale_array,
+                                      element_num_per_division, num);
+}
+
 }  // namespace fpga
 }  // namespace paddle_mobile
src/fpga/api.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cstddef>
 #include <iostream>
 #include <limits>
+#include "framework/tensor.h"

 // memory management;
@@ -175,6 +176,13 @@ int ComputeFpgaPool(const struct PoolingArgs& args);
 int ComputeFpgaEWAdd(const struct EWAddArgs& args);

 static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
+void format_image(framework::Tensor* image_tensor);
+void format_ofm(framework::Tensor* ofm_tensor);  // only allocate memory
+void format_filter(framework::Tensor* filter_tensor, int group_num);
+void format_fc_matrix(framework::Tensor* filter_tensor, int group_num,
+                      int height = 1, int width = 1);
+void format_bias_scale_array(float** bias_scale_array,
+                             int element_num_per_division, int num);

 }  // namespace fpga
 }  // namespace paddle_mobile
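For reference, align_to_x rounds num up to the nearest multiple of x using only integer arithmetic; several of the format_* helpers above lean on it for buffer sizing. A standalone sketch (not part of this commit) that checks a few illustrative values:

#include <cassert>

static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }

int main() {
  assert(align_to_x(3, 16) == 16);   // rounds up to the next multiple
  assert(align_to_x(16, 16) == 16);  // already-aligned values are unchanged
  assert(align_to_x(17, 16) == 32);
  return 0;
}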
src/fpga/bias_scale.cpp
@@ -21,6 +21,7 @@ namespace fpga {
 namespace bias_scale {

 void align_element(float **data_in, int num_per_div_before_alignment, int num) {
+  int copynum = 0;
   float *ptr_unaligned = *data_in;
   int div_num =
       (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
@@ -33,8 +34,20 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
   memset(ptr_aligned, 0, num_element * sizeof(float));
   for (int i = 0; i < div_num; i++) {
-    memcpy(ptr_aligned + i * num_per_div_after_alignment, ptr_unaligned,
-           num_per_div_before_alignment * sizeof(float));
+    if (i == div_num - 1) {
+      copynum = (num_per_div_after_alignment * div_num > num)
+                    ? (num % num_per_div_after_alignment)
+                    : (num_per_div_before_alignment);
+    } else {
+      copynum = num_per_div_before_alignment;
+    }
+    memcpy(ptr_aligned + i * num_per_div_after_alignment,
+           ptr_unaligned + num_per_div_before_alignment * i,
+           copynum * sizeof(float));
+    memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment,
+           ptr_unaligned + num_per_div_before_alignment * i + num,
+           copynum * sizeof(float));
   }

   fpga_free(ptr_unaligned);
@@ -52,14 +65,22 @@ void interleave(float **data_in, int num_after_alignment) {
     memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i,
            4 * sizeof(float));
     memcpy(ptr_interleaved + 8 * i + 4,
-           ptr_uninterleaved + num_after_alignment * sizeof(float) + 4 * i,
+           ptr_uninterleaved + num_after_alignment + 4 * i,
            4 * sizeof(float));
   }

   fpga_free(ptr_uninterleaved);
   *data_in = ptr_interleaved;
 }

+void format_bias_scale_array(float **bias_scale_array,
+                             int element_num_per_division, int num) {
+  align_element(bias_scale_array, element_num_per_division, num);
+  int div_num = (num + element_num_per_division - 1) / element_num_per_division;
+  int element_num_after_division =
+      align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
+  interleave(bias_scale_array, div_num * element_num_after_division);
+}
+
 }  // namespace bias_scale
 }  // namespace fpga
 }  // namespace paddle_mobile
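Reading the new align_element: the source array holds a bias half followed by a scale half (offset by num), and each division's elements get padded to the aligned size, so the bias copies land in the first div_num blocks and the scale copies in the next div_num. A standalone sketch of the sizing arithmetic, with illustrative values and assuming an alignment of 8 stands in for BS_NUM_ALIGNMENT:

#include <cstdio>

static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }

int main() {
  int num = 20;                      // total bias (or scale) elements
  int element_num_per_division = 6;  // elements handled per hardware division
  int div_num = (num + element_num_per_division - 1) / element_num_per_division;
  int per_div_aligned = align_to_x(element_num_per_division, 8);
  printf("div_num=%d per_div_aligned=%d total_floats=%d\n",
         div_num, per_div_aligned, 2 * div_num * per_div_aligned);
  // Prints: div_num=4 per_div_aligned=8 total_floats=64
  // (the factor 2 covers the bias half plus the scale half of the array)
  return 0;
}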
src/fpga/bias_scale.h
@@ -22,6 +22,8 @@ namespace bias_scale {
 void align_element(float **data_in, int num_per_div_before_alignment, int num);
 void interleave(float **data_in, int num_after_alignment);
+void format_bias_scale_array(float **bias_scale_array,
+                             int element_num_per_division, int num);

 }  // namespace bias_scale
 }  // namespace fpga
src/fpga/filter.cpp
@@ -19,21 +19,190 @@ namespace paddle_mobile {
 namespace fpga {
 namespace filter {

-void convert_to_hwc(float **data_in, int num, int channel, int height,
-                    int width) {}
-float find_max(float *data_in, int num) { return 0; }
-void quantize(float *data_in, int num) {}
-void align_element(float **data_in, int num, int chw) {}
-void align_num(float **data_in, int num_per_div_before_alignment, int num,
-               int chw) {}
-void reorder(float **data_in, int num_after_alignment, int chw) {}
-void interleave(float **data_in, int num_after_alignment, int chw) {}
+int calc_division_capacity(int chw) { return 2048 / ((chw + 15) / 16) * 32; }
+
+int calc_split_num(int num, int division_capacity) {
+  return (num + division_capacity - 1) / division_capacity;
+}
+
+int calc_division_number(int num, int group_num, int division_capacity) {
+  PADDLE_MOBILE_ENFORCE(num % group_num == 0,
+                        "Filter number should be divisible by group number");
+  int split_num = calc_split_num(num, division_capacity);
+  PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
+                        "Split number or group number should be 1");
+  return group_num * split_num;
+}
+
+int calc_num_per_div(int num, int group_num, int division_capacity) {
+  if (group_num == 1) {
+    if (num > division_capacity) {
+      return division_capacity;
+    } else {
+      return num;
+    }
+  } else {
+    return (num + group_num - 1) / group_num;
+  }
+}
+
+void convert_to_hwc(char **data_in, int num, int channel, int height,
+                    int width) {
+  char *tmp = *data_in;
+  int chw = channel * height * width;
+  char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char));
+  for (int n = 0; n < num; n++) {
+    int64_t amount_per_row = width * channel;
+    for (int c = 0; c < channel; c++) {
+      for (int h = 0; h < height; h++) {
+        int64_t offset_height = h * amount_per_row;
+        for (int w = 0; w < width; w++) {
+          *(data_tmp + n * chw + offset_height + w * channel + c) =
+              *((*data_in)++);
+        }
+      }
+    }
+  }
+  *data_in = data_tmp;
+  fpga_free(tmp);
+}
+
+float find_max(float *data_in, int data_size) {
+  float max = 0.0;
+  for (int i = 0; i < data_size; ++i) {
+    float value = data_in[i];
+    float abs = value > 0 ? value : -value;
+    max = std::max(max, abs);
+  }
+  return max;
+}
+
+void quantize(float **data_in, int data_size, float max) {
+  float *tmp = *data_in;
+  float fix_range = 127;
+  float scale = fix_range / max;
+  char *tmp_data = (char *)fpga_malloc(data_size * sizeof(char));
+  for (int i = 0; i < data_size; i++) {
+    tmp_data[i] = (char)((*data_in)[i] * scale);
+  }
+  *data_in = (float *)tmp_data;
+  fpga_free(tmp);
+}
+
+void align_element(char **data_in, int num, int chw) {
+  int i = 0;
+  int j = 0;
+  int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+  if (align_chw != chw) {
+    printf("align %d \n", align_chw);
+    char *tmp = *data_in;
+    char *data_tmp = (char *)fpga_malloc(num * align_chw * sizeof(char));
+    memset(data_tmp, 0, num * align_chw);
+    for (j = 0; j < num; j++) {
+      memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw);
+    }
+    *data_in = data_tmp;
+    fpga_free(tmp);
+  }
+}
+
+void align_num(char **data_in, int num_per_div_before_alignment, int num,
+               int chw) {
+  int i = 0;
+  int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+  int num_per_div_after_alignment =
+      align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
+  if (num_per_div_after_alignment != num_per_div_before_alignment) {
+    char *tmp = *data_in;
+    int div_num = (num + num_per_div_before_alignment - 1) /
+                  num_per_div_before_alignment;
+    int num_element = div_num * num_per_div_after_alignment * align_chw;
+    char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char));
+    memset(data_tmp, 0, num_element * sizeof(char));
+    for (i = 0; i < div_num; i++) {
+      memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
+             *data_in + num_per_div_before_alignment * align_chw * i,
+             num_per_div_before_alignment * align_chw);
+    }
+    *data_in = data_tmp;
+    fpga_free(tmp);
+  }
+}
+
+void reorder(char **data_in, int num_after_alignment, int chw) {
+  int index = 0;
+  int new_index;
+  int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+  char *data_tmp =
+      (char *)fpga_malloc(chw_align * num_after_alignment * sizeof(char));
+  char *tmp = *data_in;
+  for (index = 0; index < num_after_alignment; index++) {
+    new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) +
+                (index / 16 % 2 * 4);
+    memcpy(data_tmp + index * chw_align, *data_in + new_index * chw_align,
+           chw_align);
+  }
+  *data_in = data_tmp;
+  fpga_free(tmp);
+}
+
+void interleave(char **data_in, int num_after_alignment, int chw) {
+  int i = 0;
+  int j = 0;
+  int k = 0;
+  int interleave_per_num = 16;
+  int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+  char *data_tmp =
+      (char *)fpga_malloc(chw_align * num_after_alignment * sizeof(char));
+  char *tmp = *data_in;
+  int interleave_num = chw_align * 2 / interleave_per_num;
+  for (i = 0; i < num_after_alignment; i += 2) {
+    for (j = 0, k = 0; j < interleave_num; j += 2, k++) {
+      memcpy(data_tmp + i * chw_align + interleave_per_num * j,
+             *data_in + i * chw_align + interleave_per_num * k,
+             interleave_per_num);
+      memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1),
+             *data_in + (i + 1) * chw_align + interleave_per_num * k,
+             interleave_per_num);
+    }
+  }
+  *data_in = data_tmp;
+  fpga_free(tmp);
+}
+
+void format_filter(float **data_in, int num, int channel, int height,
+                   int width, int group_num, float max) {
+  int data_size = channel * height * width * num;
+  int chw = channel * height * width;
+  int division_capacity = calc_division_capacity(chw);
+  int num_per_div_before_alignment =
+      calc_num_per_div(num, group_num, division_capacity);
+  int num_per_div_after_alignment =
+      align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
+  int div_num = (num + num_per_div_before_alignment - 1) /
+                num_per_div_before_alignment;
+  int num_after_alignment = num_per_div_after_alignment * div_num;
+  quantize(data_in, data_size, max);
+  char **quantize_data = (char **)data_in;
+  convert_to_hwc(quantize_data, num, channel, height, width);
+  align_element(quantize_data, num, chw);
+  align_num(quantize_data, num_per_div_before_alignment, num, chw);
+  reorder(quantize_data, num_after_alignment, chw);
+  interleave(quantize_data, num_after_alignment, chw);
+}

 }  // namespace filter
 }  // namespace fpga
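The quantize helper above maps floats into the signed 8-bit range symmetrically: scale = 127 / max, so the largest-magnitude weight lands on ±127 and everything else truncates toward zero. A standalone sketch of the same mapping (values here are illustrative, not from the diff):

#include <cstdio>

int main() {
  float data[4] = {0.5f, -1.0f, 0.25f, 2.0f};
  float max = 2.0f;            // what find_max() would return for this data
  float scale = 127.0f / max;  // symmetric int8 scale, as in quantize()
  for (int i = 0; i < 4; i++) {
    signed char q = (signed char)(data[i] * scale);
    printf("%g -> %d\n", data[i], q);  // 0.5->31, -1->-63, 0.25->15, 2->127
  }
  return 0;
}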
src/fpga/filter.h
@@ -22,14 +22,15 @@ namespace fpga {
 namespace filter {

 void convert_to_hwc(float **data_in, int num, int channel, int height,
                     int width);
-float find_max(float *data_in, int num);
-void quantize(float *data_in, int num);
+float find_max(float *data_in, int data_size);
+void quantize(float **data_in, int data_size, float max);
 void align_element(float **data_in, int num, int chw);
-void align_num(float **data_in, int num_per_div_before_alignment, int num,
+void align_num(char **data_in, int num_per_div_before_alignment, int num,
                int chw);
 void reorder(float **data_in, int num_after_alignment, int chw);
 void interleave(float **data_in, int num_after_alignment, int chw);
+void format_filter(float **data_in, int num, int channel, int height,
+                   int width, int group_num, float max);

 }  // namespace filter
 }  // namespace fpga
 }  // namespace paddle_mobile
src/fpga/image.cpp
@@ -11,3 +11,57 @@
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "image.h"
+#include <memory.h>
+#include "api.h"
+
+namespace paddle_mobile {
+namespace fpga {
+namespace image {
+
+void convert_to_hwc(float **data_in, int channel, int height, int width) {
+  float *tmp = *data_in;
+  float *data_tmp =
+      (float *)fpga_malloc(channel * height * width * sizeof(float));
+  int64_t amount_per_row = width * channel;
+  for (int c = 0; c < channel; c++) {
+    for (int h = 0; h < height; h++) {
+      int64_t offset_height = h * amount_per_row;
+      for (int w = 0; w < width; w++) {
+        *(data_tmp + offset_height + w * channel + c) = *((*data_in)++);
+      }
+    }
+  }
+  *data_in = data_tmp;
+  fpga_free(tmp);
+}
+
+void align_element_conv(float **data_in, int height, int cw) {
+  int i = 0;
+  int h = 0;
+  int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
+  if (align_cw != cw) {
+    float *tmp = *data_in;
+    float *data_tmp = (float *)fpga_malloc(height * align_cw * sizeof(float));
+    memset(data_tmp, 0, height * align_cw * sizeof(float));
+    for (h = 0; h < height; h++) {
+      memcpy((void *)(data_tmp + h * align_cw), (void *)(*data_in + h * cw),
+             cw * sizeof(float));
+    }
+    *data_in = data_tmp;
+    fpga_free(tmp);
+  }
+}
+
+void format_image(float **data_in, int channel, int height, int width) {
+  convert_to_hwc(data_in, channel, height, width);
+  align_element_conv(data_in, height, channel * width);
+}
+
+}  // namespace image
+}  // namespace fpga
+}  // namespace paddle_mobile
src/fpga/image.h
@@ -11,3 +11,16 @@
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#pragma once
+
+#define IMAGE_ALIGNMENT 16  // aligned to 16
+
+namespace paddle_mobile {
+namespace fpga {
+namespace image {
+
+void convert_to_hwc(float **data_in, int channel, int height, int width);
+void align_element_conv(float **data_in, int height, int cw);
+void format_image(float **data_in, int channel, int height, int width);
+
+}  // namespace image
+}  // namespace fpga
+}  // namespace paddle_mobile
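The convert_to_hwc routines walk the CHW source in order and scatter into HWC: element (c, h, w) lands at index h * (width * channel) + w * channel + c. A tiny self-contained demonstration of that index mapping (not part of this commit):

#include <cstdio>

int main() {
  const int C = 2, H = 2, W = 3;
  float chw[C * H * W], hwc[C * H * W];
  for (int i = 0; i < C * H * W; i++) chw[i] = (float)i;  // fill in CHW order
  const float *src = chw;
  for (int c = 0; c < C; c++)
    for (int h = 0; h < H; h++)
      for (int w = 0; w < W; w++)
        hwc[h * W * C + w * C + c] = *src++;  // same mapping as convert_to_hwc
  printf("hwc[0]=%g hwc[1]=%g\n", hwc[0], hwc[1]);
  // hwc[0] is chw(c=0,h=0,w=0)=0; hwc[1] is chw(c=1,h=0,w=0)=6
  return 0;
}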
src/framework/tensor.h
@@ -254,30 +254,6 @@ class Tensor {
                           "Tensor's dims_ is out of bound. ");
   }

-#ifdef PADDLE_MOBILE_FPGA
-  struct FPGAArgs {
-    friend class Tensor;
-
-    inline float *scale_pointer() { return scale_; }
-    inline float scale() { return *scale_; }
-
-   private:
-    float *scale_;
-  };
-
-  struct FPGAArgs fpga_args() const {
-    FPGAArgs args;
-    args.scale_ = scale.get();
-    return args;
-  }
-
-  void SetFpgaScale(float s) { *(scale.get()) = s; }
-
- private:
-  std::shared_ptr<float> scale = std::make_shared<float>(0);
-#endif
-
  private:
   /**
    * @note Placeholder hides type T, so it doesn't appear as a
@@ -313,9 +289,12 @@ class Tensor {
     virtual std::type_index type() const { return type_; }
     virtual void set_type(std::type_index type) { type_ = type; }

+#ifndef PADDLE_MOBILE_FPGA
     /*! the pointer of memory block. */
     std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t>> ptr_;
+#else
+    std::shared_ptr<uint8_t> ptr_;
+#endif

     /*! the size of memory block. */
     size_t size_;
@@ -344,6 +323,34 @@ class Tensor {
    * begins.
    */
   size_t offset_;
+
+#ifdef PADDLE_MOBILE_FPGA
+ public:
+  inline void reset_data_ptr(void *p) {
+    ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p);
+  }
+
+  struct FPGAArgs {
+    friend class Tensor;
+
+    inline float *scale_pointer() { return scale_; }
+    inline float scale() { return *scale_; }
+
+   private:
+    float *scale_;
+  };
+
+  struct FPGAArgs fpga_args() const {
+    FPGAArgs args;
+    args.scale_ = scale.get();
+    return args;
+  }
+
+  void SetFpgaScale(float s) { *(scale.get()) = s; }
+
+ private:
+  std::shared_ptr<float> scale = std::make_shared<float>(0);
+#endif
 };

 #ifdef PADDLE_MOBILE_DEBUG
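Design note: on the FPGA build the placeholder's buffer becomes a shared_ptr so that reset_data_ptr can swap in the reformatted copy produced by the fpga::format_* helpers above. A stripped-down sketch of that ownership pattern (the names and the free() deleter here are hypothetical; the real holder type is the tensor's PlaceholderImpl):

#include <cstdint>
#include <cstdlib>
#include <memory>

struct PlaceholderSketch {
  std::shared_ptr<uint8_t> ptr_;
  // Replacing the pointer releases the previous buffer automatically.
  void reset(void *p) { ptr_.reset((uint8_t *)p, free); }
};

int main() {
  PlaceholderSketch holder;
  holder.reset(malloc(64));  // original buffer
  holder.reset(malloc(64));  // FPGA-formatted replacement; old one is freed
  return 0;
}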
src/operators/fusion_conv_bn_add_relu_op.cpp (new file, mode 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef FUSION_CONVBNADDRELU_OP

#include "operators/fusion_conv_bn_add_relu_op.h"
#include "operators/math/conv_func.h"

namespace paddle_mobile {
namespace operators {

template <typename Dtype, typename T>
void FusionConvBNAddReluOp<Dtype, T>::InferShape() const {
  auto in_dims = this->param_.Input()->dims();
  auto filter_dims = this->param_.Filter()->dims();
  const std::vector<int> &strides = this->param_.Strides();
  std::vector<int> paddings = this->param_.Paddings();
  int groups = this->param_.Groups();
  std::vector<int> dilations = this->param_.Dilations();

  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
                         dilations.size() == paddings.size() &&
                         paddings.size() == strides.size()),
                        "ConvParam is not suitable");

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2],
                                                filter_dims[i + 2],
                                                dilations[i], paddings[i],
                                                strides[i]));
  }
  framework::DDim ddim = framework::make_ddim(output_shape);
  this->param_.Output()->Resize(ddim);
}

}  // namespace operators
}  // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif

#endif
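InferShape above delegates the spatial arithmetic to math::ConvOutputSize. For standard convolution that is output = (input + 2*padding - (dilation*(kernel-1)+1)) / stride + 1; the tiny check below mirrors that formula (the helper body here is a sketch, the real one lives in operators/math/conv_func.h):

#include <cstdio>

// Mirrors the standard convolution output-size formula used by InferShape.
int ConvOutputSizeSketch(int input, int filter, int dilation, int padding,
                         int stride) {
  int dkernel = dilation * (filter - 1) + 1;
  return (input + 2 * padding - dkernel) / stride + 1;
}

int main() {
  // 224x224 input, 3x3 filter, stride 2, pad 1, no dilation -> 112x112
  printf("%d\n", ConvOutputSizeSketch(224, 3, 1, 1, 2));  // prints 112
  return 0;
}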
src/operators/fusion_conv_bn_add_relu_op.h (new file, mode 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef FUSION_CONVBNADDRELU_OP

#pragma once

#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "op_param.h"
#include "operators/kernel/conv_bn_add_relu_kernel.h"

namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;

class FusionConvBNAddReluMatcher : public framework::FusionOpMatcher {
 public:
  FusionConvBNAddReluMatcher() {
    node_ = framework::Node(G_OP_TYPE_CONV);
    node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
        std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
  }

  void FolderNodes(
      framework::Node *node,
      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
    node->Folder(node_.Depth(), Type(),
                 {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}, {"X", "X"}}},
                  {G_OP_TYPE_BATCHNORM,
                   {{"Scale", "Scale"},
                    {"Mean", "Mean"},
                    {"Bias", "Bias"},
                    {"Variance", "Variance"},
                    {"Y", "BNY"}}}},
                 removed_nodes);
  }
  std::string Type() { return G_OP_TYPE_FUSION_CONV_BN_ADD_RELU; }
  std::vector<std::pair<int, std::string>> NeedCheck() {
    DLOG << " conv bn add relu check add X ";
    return {{2, "Y"}, {2, "X"}};
  }
};

template <typename DeviceType, typename T>
class FusionConvBNAddReluOp
    : public framework::OperatorWithKernel<
          DeviceType, FusionConvBNAddReluParam<DeviceType>,
          operators::ConvBNAddReluKernel<DeviceType, T>> {
 public:
  FusionConvBNAddReluOp(const string &type, const VariableNameMap &inputs,
                        const VariableNameMap &outputs,
                        const framework::AttributeMap &attrs,
                        std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionConvBNAddReluParam<DeviceType>,
            operators::ConvBNAddReluKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  using framework::OperatorWithKernel<
      DeviceType, FusionConvBNAddReluParam<DeviceType>,
      operators::ConvBNAddReluKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;

 protected:
};

#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar(
    new FusionConvBNAddReluMatcher());
#define FUSION_CONV_BN_ADD_RELU_REGISTER
#endif
#endif

#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar(
    new FusionConvBNAddReluMatcher());
#define FUSION_CONV_BN_ADD_RELU_REGISTER
#endif
#endif

#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar(
    new FusionConvBNAddReluMatcher());
#define FUSION_CONV_BN_ADD_RELU_REGISTER
#endif
#endif

}  // namespace operators
}  // namespace paddle_mobile

#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_bn_add_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_bn_add_relu);
#endif

#endif
src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp (new file, mode 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef FUSION_CONVBNADDRELU_OP

#include "operators/kernel/conv_bn_add_relu_kernel.h"
#include "operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h"

namespace paddle_mobile {
namespace operators {

template <>
bool ConvBNAddReluKernel<CPU, float>::Init(
    FusionConvBNAddReluParam<CPU> *param) {
  const Tensor *mean = param->InputMean();
  const Tensor *variance = param->InputVariance();
  const Tensor *scale = param->InputScale();
  const Tensor *bias = param->InputBias();
  const float epsilon = param->Epsilon();

  auto mean_ptr = mean->data<float>();
  auto variance_ptr = variance->data<float>();
  auto scale_ptr = scale->data<float>();
  auto bias_ptr = bias->data<float>();

  const int C = mean->numel();
  float inv_std_ptr[C];
  for (int i = 0; i < C; i++) {
    inv_std_ptr[i] =
        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
  }
  Tensor *new_scale = new Tensor();
  Tensor *new_bias = new Tensor();
  auto new_scale_ptr = new_scale->mutable_data<float>({C});
  auto new_bias_ptr = new_bias->mutable_data<float>({C});
  for (int i = 0; i < C; i++) {
    new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
  }
  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);
  return true;
}

template <>
void ConvBNAddReluKernel<CPU, float>::Compute(
    const FusionConvBNAddReluParam<CPU> &param) const {
  ConvBNAddReluCompute<float>(param);
}
template class ConvBNAddReluKernel<CPU, float>;

}  // namespace operators
}  // namespace paddle_mobile

#endif
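Init() folds batch norm into a per-channel scale and shift: new_scale = gamma / sqrt(var + eps) and new_bias = beta - mean * new_scale, so at inference the kernel applies a single multiply-add per channel instead of the full normalization. A standalone check that the folded form equals the textbook BN (illustrative numbers):

#include <cmath>
#include <cstdio>

int main() {
  // One channel's BN statistics.
  float mean = 0.5f, variance = 4.0f, gamma = 2.0f, beta = 1.0f, eps = 0.0f;
  float inv_std = 1.0f / std::sqrt(variance + eps);
  float new_scale = inv_std * gamma;               // as computed in Init()
  float new_bias = beta - mean * inv_std * gamma;  // as computed in Init()
  float x = 3.0f;  // a raw convolution output
  // Folded form must match gamma*(x-mean)/sqrt(var+eps)+beta; both print 3.5.
  printf("folded=%g direct=%g\n", x * new_scale + new_bias,
         gamma * (x - mean) / std::sqrt(variance + eps) + beta);
  return 0;
}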
src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h (new file, mode 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef FUSION_CONVBNADDRELU_OP

#pragma once

#include <vector>
#include "operators/math/depthwise_conv_3x3.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor new_bias = *param.NewBias();
  Tensor new_scale = *param.NewScale();
  Tensor *output = param.Output();
  Tensor *bias1 = param.Bias();
  int groups = param.Groups();
  DLOG << "yangfei2";
  DLOG << bias1->dims();
  std::vector<int> strides = param.Strides();
  std::vector<int> paddings = param.Paddings();
  std::vector<int> dilations = param.Dilations();

  const int batch_size = static_cast<int>(input->dims()[0]);

  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
  size_t data_dim = filter_shape_vec.size() - 2;
  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
  col_shape_vec[0] = input->dims()[1] / groups;
  for (size_t j = 0; j < data_dim; ++j) {
    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
  }
  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
  framework::DDim col_matrix_shape =
      framework::flatten_to_2d(col_shape, data_dim + 1);

  bool is_expand =
      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
  Tensor col;
  Tensor col_matrix;
  if (is_expand) {
    col.mutable_data<float>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }

  framework::DDim input_shape = framework::slice_ddim(
      input->dims(), 1, static_cast<int>(input->dims().size()));

  framework::DDim filter_matrix_shape = {filter.dims()[0],
                                         filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);
  framework::DDim output_matrix_shape = {
      output->dims()[1],
      output->numel() / (output->dims()[0] * output->dims()[1])};

  // convolution operator: im2col(or vol2col) + gemm
  int in_step = static_cast<int>(input->dims()[1]) / groups;
  int out_step = static_cast<int>(output->dims()[1]) / groups;

  math::Vol2ColFunctor<CPU, float> vol2col;
  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;

  for (int i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
    Tensor bias_batch = bias1->Slice(i, i + 1).Resize(output_matrix_shape);
    for (int g = 0; g < groups; g++) {
      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);

      if (!is_expand) {
        col.ShareDataWith(in_slice);
        col_matrix.ShareDataWith(col);
        col_matrix.Resize(col_matrix_shape);
      } else if (data_dim == 2U) {
        // im2col
        im2col(in_slice, dilations, strides,
               std::vector<int>{paddings[0], paddings[1], paddings[0],
                                paddings[1]},
               &col);
      } else if (data_dim == 3U) {
        // vol2col
        vol2col(in_slice, dilations, strides, paddings, &col);
      }

      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
      Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step);
      math::matmulWithBn<float>(filter_slice, false, col_matrix, false,
                                static_cast<float>(1), &out_slice,
                                static_cast<float>(1), true, &new_scale,
                                &new_bias, g, bias_data.data<float>());
    }
  }
}

template <typename P>
void ConvBNAddReluCompute(const FusionConvBNAddReluParam<CPU> &param) {
  Tensor Bias;
  Bias.mutable_data<float>({param.Groups()});
  if (param.Groups() == param.Input()->dims()[1] &&
      param.Input()->dims()[1] == param.Output()->dims()[1] &&
      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
    math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
                                        param.Output(), param.NewScale(),
                                        param.NewBias(), true);
  } else if (param.Groups() == param.Input()->dims()[1] &&
             param.Input()->dims()[1] == param.Output()->dims()[1] &&
             param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
    // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
    //                                     param.Output(), param.NewScale(),
    //                                     param.NewBias(), 1);
    math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(),
                                          param.Output(), param.NewScale(),
                                          param.NewBias(), true);
  } else {
    ConvBNAddReluBasic(param);
  }
}

}  // namespace operators
}  // namespace paddle_mobile

#endif
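In ConvBNAddReluBasic the col matrix that im2col produces has C/groups * Kh * Kw rows (col_shape_vec[0..data_dim]) and one column per output spatial position, which is what makes the convolution expressible as a GEMM. A quick standalone check of that sizing (illustrative numbers, not from the diff):

#include <cstdio>

int main() {
  int in_channels = 8, groups = 1, kh = 3, kw = 3, ho = 112, wo = 112;
  int col_rows = in_channels / groups * kh * kw;  // one input patch, flattened
  int col_cols = ho * wo;                         // output spatial positions
  printf("col matrix: %d x %d\n", col_rows, col_cols);  // 72 x 12544
  return 0;
}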
src/operators/kernel/conv_bn_add_relu_kernel.h (new file, mode 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#ifdef FUSION_CONVBNADDRELU_OP

#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

using framework::DDim;
using framework::OpKernelBase;

template <typename DeviceType, typename T>
class ConvBNAddReluKernel
    : public OpKernelBase<DeviceType, FusionConvBNAddReluParam<DeviceType>> {
 public:
  void Compute(const FusionConvBNAddReluParam<DeviceType> &param) const;
  bool Init(FusionConvBNAddReluParam<DeviceType> *param);
};

}  // namespace operators
}  // namespace paddle_mobile

#endif
src/operators/math/gemm.cpp
@@ -716,6 +716,27 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
   }
 }

+// Blocked matrix multiplication (分块矩阵乘法)
+void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
+                          const float *b, float beta, float *c, float *C,
+                          int ldc, bool relu, float *new_scale,
+                          float *new_bias, float *bias) {
+#pragma omp parallel for
+  for (int j = 0; j < nc; j += NR) {
+    for (int i = 0; i < mc; i += MR) {
+#if __aarch64__
+      // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+#else
+      // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+#endif
+    }
+  }
+  WriteWithBnAddRelu(mc, nc, c, C, ldc, new_scale, new_bias, bias);
+}
+
 void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
                           float *c, float *C, int ldc, float *p,
                           std::string mode, float *bias, float *bias1) {
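The comment 分块矩阵乘法 means "blocked matrix multiplication": the packed A and B panels are walked in MR x NR register tiles, and each tile is finished by an AddDot micro-kernel before the epilogue writes C. A plain-C++ sketch of the same tiling, with a naive micro-kernel standing in for the NEON AddDot6x8/AddDot6x16 (assumes mc and nc are multiples of MR and NR for brevity):

#include <cstdio>
#include <vector>

// Naive stand-in for the AddDot micro-kernels; a and b point at packed panels
// where each tile row/column stores K contiguous values.
void AddDotTile(int K, const float *a, const float *b, float *c, int ldc,
                int mr, int nr) {
  for (int i = 0; i < mr; i++)
    for (int j = 0; j < nr; j++) {
      float sum = 0;
      for (int k = 0; k < K; k++) sum += a[i * K + k] * b[j * K + k];
      c[i * ldc + j] = sum;
    }
}

void InnerKernelSketch(int mc, int nc, int K, const float *a, const float *b,
                       float *c, int MR, int NR) {
  for (int j = 0; j < nc; j += NR)
    for (int i = 0; i < mc; i += MR)
      AddDotTile(K, a + i * K, b + j * K, c + i * nc + j, nc, MR, NR);
}

int main() {
  int mc = 6, nc = 8, K = 4, MR = 6, NR = 8;
  std::vector<float> a(mc * K, 1.0f), b(nc * K, 1.0f), c(mc * nc, 0.0f);
  InnerKernelSketch(mc, nc, K, a.data(), b.data(), c.data(), MR, NR);
  printf("c[0] = %g\n", c[0]);  // every entry is K = 4
  return 0;
}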
@@ -1183,6 +1204,59 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
   }
 }

+// C = A * B, batchnorm(C), C = C + bias; relu(C)
+void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
+                        float *new_scale, float *new_bias, float *bias) {
+  int nc1 = nc / 4;
+  int _nc1 = nc % 4;
+
+  float *c_ptr, *C_ptr, *bias_ptr;
+  float32x4_t cv;
+  float32x4_t nbias;
+  float32x2_t scale;
+  float32x4_t biasv;
+  float32x4_t zero = vdupq_n_f32(0.0);
+  for (int i = 0; i < mc; ++i) {
+    c_ptr = c + i * NC;
+    C_ptr = C + i * ldc;
+    bias_ptr = bias + i * ldc;
+    nbias = vld1q_dup_f32(new_bias);
+    scale = vld1_dup_f32(new_scale);
+    new_bias++;
+    new_scale++;
+    float scale0 = vget_lane_f32(scale, 0);
+    for (int j = 0; j < nc1; ++j) {
+      cv = vld1q_f32(c_ptr);
+      biasv = vld1q_f32(bias_ptr);
+      cv = vmlaq_n_f32(nbias, cv, scale0);
+      cv = vaddq_f32(cv, biasv);
+      cv = vmaxq_f32(cv, zero);
+      vst1q_f32(C_ptr, cv);
+      c_ptr += 4;
+      C_ptr += 4;
+      bias_ptr += 4;
+    }
+    if (_nc1 != 0) {
+      cv = vld1q_f32(c_ptr);
+      biasv = vld1q_f32(bias_ptr);
+      cv = vmlaq_n_f32(nbias, cv, scale0);
+      cv = vaddq_f32(cv, biasv);
+      cv = vmaxq_f32(cv, zero);
+      if (_nc1 >= 1) {
+        vst1q_lane_f32(C_ptr, cv, 0);
+        C_ptr++;
+      }
+      if (_nc1 >= 2) {
+        vst1q_lane_f32(C_ptr, cv, 1);
+        C_ptr++;
+      }
+      if (_nc1 >= 3) {
+        vst1q_lane_f32(C_ptr, cv, 2);
+      }
+    }
+  }
+}
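WriteWithBnAddRelu fuses the whole epilogue into one pass over the block: each GEMM result is scaled and shifted by the folded BN parameters (one pair per output row, i.e. per output channel), the elementwise-add input is added, and ReLU clamps at zero. A scalar reference of what the NEON loop computes (NC, a global blocking constant in gemm.cpp, is passed explicitly here):

#include <algorithm>

// Scalar reference: C = max(c * new_scale + new_bias + bias, 0).
// new_scale/new_bias are per-row; bias is a full output-sized map.
void WriteWithBnAddReluRef(int mc, int nc, const float *c, float *C, int ldc,
                           const float *new_scale, const float *new_bias,
                           const float *bias, int NC) {
  for (int i = 0; i < mc; ++i)
    for (int j = 0; j < nc; ++j) {
      float r = c[i * NC + j] * new_scale[i] + new_bias[i] + bias[i * ldc + j];
      C[i * ldc + j] = std::max(r, 0.0f);
    }
}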
 #else

 void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
@@ -2081,34 +2155,32 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
   if (bias1 == nullptr) {
     for (int i = 0; i < mc; ++i) {
       for (int j = 0; j < nc; ++j) {
-        float r = *c + *bias;
+        float r = c[i * NC + j] + bias[i];
         if (r < 0) {
-          r = *p;
+          r *= p[i];
         }
-        c++;
+        C[i * ldc + j] = r;
       }
-      bias++;
-      p++;
     }
   } else {
     for (int i = 0; i < mc; ++i) {
       for (int j = 0; j < nc; ++j) {
-        float r = *c + *bias;
-        r += *bias1;
+        float r = c[i * NC + j] + bias[i];
+        r += bias1[i * ldc + j];
         if (r < 0) {
-          r *= *p;
+          r *= p[i];
         }
-        c++;
-        bias1++;
+        C[i * ldc + j] = r;
       }
-      bias++;
-      p++;
     }
   }
   return;
 }

-  int nc1 = nc / 8;
+  int nc1 = nc / 16;
+  int _nc1 = nc % 16;
+  int nc2 = _nc1 / 4;
+  int nc3 = 16 - 4 * (_nc1 % 4);
   int step = 4 * (ldc - nc);
   int step1 = 4 * (NC - nc);
@@ -2120,6 +2192,7 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
         "loop_mc_%=:                      \n\t"
         "mov r5, %[nc1]                   \n\t"
+        "mov r6, %[nc2]                   \n\t"
         "vld1.32 {d0}, [%[bias]]          \n\t"
         "vld1.32 {d1}, [%[p]]             \n\t"
         "vdup.32 q1, d0[0]                \n\t"
@@ -2131,20 +2204,64 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
         "pld [%[c], #32]                  \n\t"
         "vld1.32 {q3, q4}, [%[c]]!        \n\t"
+        "vld1.32 {q9, q10}, [%[c]]!       \n\t"
         "vadd.f32 q3, q3, q1              \n\t"
         "vadd.f32 q4, q4, q1              \n\t"
+        "vadd.f32 q9, q9, q1              \n\t"
+        "vadd.f32 q10, q10, q1            \n\t"
         "vmax.f32 q5, q3, q14             \n\t"
         "vmin.f32 q7, q3, q14             \n\t"
         "vmax.f32 q6, q4, q14             \n\t"
         "vmin.f32 q8, q4, q14             \n\t"
+        "vmax.f32 q11, q9, q14            \n\t"
+        "vmin.f32 q13, q9, q14            \n\t"
+        "vmax.f32 q12, q10, q14           \n\t"
+        "vmin.f32 q15, q10, q14           \n\t"
         "vmla.f32 q5, q7, q2              \n\t"
         "vmla.f32 q6, q8, q2              \n\t"
+        "vmla.f32 q11, q13, q2            \n\t"
+        "vmla.f32 q12, q15, q2            \n\t"
         "vst1.32 {q5, q6}, [%[C]]!        \n\t"
+        "vst1.32 {q11, q12}, [%[C]]!      \n\t"
         "subs r5, r5, #1                  \n\t"
         "bge loop_nc1_%=                  \n\t"
         "end_nc1_%=:                      \n\t"
+
+        "subs r6, r6, #1                  \n\t"
+        "blt end_nc2_%=                   \n\t"
+        "loop_nc2_%=:                     \n\t"
+        "vld1.32 {q3}, [%[c]]!            \n\t"
+        "vadd.f32 q3, q3, q1              \n\t"
+        "vmax.f32 q5, q3, q14             \n\t"
+        "vmin.f32 q7, q3, q14             \n\t"
+        "vmla.f32 q5, q7, q2              \n\t"
+        "vst1.32 {q5}, [%[C]]!            \n\t"
+        "subs r6, r6, #1                  \n\t"
+        "bge loop_nc2_%=                  \n\t"
+        "end_nc2_%=:                      \n\t"
+
+        "cmp %[nc3], #16                  \n\t"
+        "beq end_nc3_%=                   \n\t"
+        "sub %[c], %[c], %[nc3]           \n\t"
+        "sub %[C], %[C], %[nc3]           \n\t"
+        "vld1.32 {q4}, [%[c]]!            \n\t"
+        "vadd.f32 q4, q4, q1              \n\t"
+        "vmax.f32 q6, q4, q14             \n\t"
+        "vmin.f32 q8, q4, q14             \n\t"
+        "vmla.f32 q6, q8, q2              \n\t"
+        "vst1.32 {q6}, [%[C]]!            \n\t"
+        "end_nc3_%=:                      \n\t"
+
         "add %[p], %[p], #4               \n\t"
         "add %[bias], %[bias], #4         \n\t"
         "add %[c], %[c], %[step1]         \n\t"
@@ -2155,10 +2272,11 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
         "end_mc_%=:                       \n\t"
         :
-        : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1),
-          [step] "r"(step), [step1] "r"(step1), [p] "r"(p),
-          [bias] "r"(bias), [bias1] "r"(bias1)
-        : "memory", "r5", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-          "q8");
+        : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1),
+          [nc2] "r"(nc2), [nc3] "r"(nc3), [step] "r"(step),
+          [step1] "r"(step1), [p] "r"(p), [bias] "r"(bias), [bias1] "r"(bias1)
+        : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
+          "q7", "q8");
   } else {
     asm volatile(
         "vmov.f32 q14, #0.0               \n\t"
@@ -2167,6 +2285,7 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
         "loop_mc_%=:                      \n\t"
         "mov r5, %[nc1]                   \n\t"
+        "mov r6, %[nc2]                   \n\t"
        "vld1.32 {d0}, [%[bias]]          \n\t"
         "vld1.32 {d1}, [%[p]]             \n\t"
         "vdup.32 q1, d0[0]                \n\t"
@@ -2192,25 +2311,74 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
         "vmla.f32 q6, q8, q2              \n\t"
         "vst1.32 {q5, q6}, [%[C]]!        \n\t"
+
+        "vld1.32 {q3, q4}, [%[c]]!        \n\t"
+        "vld1.32 {q9, q10}, [%[bias1]]!   \n\t"
+        "vadd.f32 q3, q3, q1              \n\t"
+        "vadd.f32 q4, q4, q1              \n\t"
+        "vadd.f32 q3, q3, q9              \n\t"
+        "vadd.f32 q4, q4, q10             \n\t"
+        "vmax.f32 q5, q3, q14             \n\t"
+        "vmin.f32 q7, q3, q14             \n\t"
+        "vmax.f32 q6, q4, q14             \n\t"
+        "vmin.f32 q8, q4, q14             \n\t"
+        "vmla.f32 q5, q7, q2              \n\t"
+        "vmla.f32 q6, q8, q2              \n\t"
+        "vst1.32 {q5, q6}, [%[C]]!        \n\t"
+
         "subs r5, r5, #1                  \n\t"
         "bge loop_nc1_%=                  \n\t"
         "end_nc1_%=:                      \n\t"
+
+        "subs r6, r6, #1                  \n\t"
+        "blt end_nc2_%=                   \n\t"
+        "loop_nc2_%=:                     \n\t"
+        "vld1.32 {q3}, [%[c]]!            \n\t"
+        "vld1.32 {q9}, [%[bias1]]!        \n\t"
+        "vadd.f32 q3, q3, q1              \n\t"
+        "vadd.f32 q3, q3, q9              \n\t"
+        "vmax.f32 q5, q3, q14             \n\t"
+        "vmin.f32 q7, q3, q14             \n\t"
+        "vmla.f32 q5, q7, q2              \n\t"
+        "vst1.32 {q5}, [%[C]]!            \n\t"
+        "subs r6, r6, #1                  \n\t"
+        "bge loop_nc2_%=                  \n\t"
+        "end_nc2_%=:                      \n\t"
+
+        "cmp %[nc3], #16                  \n\t"
+        "beq end_nc3_%=                   \n\t"
+        "sub %[c], %[c], %[nc3]           \n\t"
+        "sub %[C], %[C], %[nc3]           \n\t"
+        "sub %[bias1], %[bias1], %[nc3]   \n\t"
+        "vld1.32 {q4}, [%[c]]!            \n\t"
+        "vld1.32 {q10}, [%[bias1]]!       \n\t"
+        "vadd.f32 q4, q4, q1              \n\t"
+        "vadd.f32 q4, q4, q10             \n\t"
+        "vmax.f32 q6, q4, q14             \n\t"
+        "vmin.f32 q8, q4, q14             \n\t"
+        "vmla.f32 q6, q8, q2              \n\t"
+        "vst1.32 {q6}, [%[C]]!            \n\t"
+        "end_nc3_%=:                      \n\t"
+
         "add %[p], %[p], #4               \n\t"
         "add %[bias], %[bias], #4         \n\t"
         "add %[c], %[c], %[step1]         \n\t"
         "add %[C], %[C], %[step]          \n\t"
+        "add %[bias1], %[bias1], %[step]  \n\t"
         "subs %[mc], %[mc], #1            \n\t"
         "bge loop_mc_%=                   \n\t"
         "end_mc_%=:                       \n\t"
         :
-        : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1),
-          [step] "r"(step), [step1] "r"(step1), [p] "r"(p),
-          [bias] "r"(bias), [bias1] "r"(bias1)
-        : "memory", "r5", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-          "q8", "q9", "q10");
+        : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1),
+          [nc2] "r"(nc2), [nc3] "r"(nc3), [step] "r"(step),
+          [step1] "r"(step1), [p] "r"(p), [bias] "r"(bias), [bias1] "r"(bias1)
+        : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
+          "q7", "q8", "q9", "q10");
   }
 }
@@ -2426,6 +2594,59 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
           "q8", "q10", "q11", "q12", "q13", "q14");
 }

+// C = A * B, batchnorm(C), C = C + bias; relu(C)
+void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
+                        float *new_scale, float *new_bias, float *bias) {
+  int nc1 = nc / 4;
+  int _nc1 = nc % 4;
+
+  float *c_ptr, *C_ptr, *bias_ptr;
+  float32x4_t cv;
+  float32x4_t nbias;
+  float32x2_t scale;
+  float32x4_t biasv;
+  float32x4_t zero = vdupq_n_f32(0.0);
+  for (int i = 0; i < mc; ++i) {
+    c_ptr = c + i * NC;
+    C_ptr = C + i * ldc;
+    bias_ptr = bias + i * ldc;
+    nbias = vld1q_dup_f32(new_bias);
+    scale = vld1_dup_f32(new_scale);
+    new_bias++;
+    new_scale++;
+    float scale0 = vget_lane_f32(scale, 0);
+    for (int j = 0; j < nc1; ++j) {
+      cv = vld1q_f32(c_ptr);
+      biasv = vld1q_f32(bias_ptr);
+      cv = vmlaq_n_f32(nbias, cv, scale0);
+      cv = vaddq_f32(cv, biasv);
+      cv = vmaxq_f32(cv, zero);
+      vst1q_f32(C_ptr, cv);
+      c_ptr += 4;
+      C_ptr += 4;
+      bias_ptr += 4;
+    }
+    if (_nc1 != 0) {
+      cv = vld1q_f32(c_ptr);
+      biasv = vld1q_f32(bias_ptr);
+      cv = vmlaq_n_f32(nbias, cv, scale0);
+      cv = vaddq_f32(cv, biasv);
+      cv = vmaxq_f32(cv, zero);
+      if (_nc1 >= 1) {
+        vst1q_lane_f32(C_ptr, cv, 0);
+        C_ptr++;
+      }
+      if (_nc1 >= 2) {
+        vst1q_lane_f32(C_ptr, cv, 1);
+        C_ptr++;
+      }
+      if (_nc1 >= 3) {
+        vst1q_lane_f32(C_ptr, cv, 2);
+      }
+    }
+  }
+}
+
 /*
 // C = A * B
 void VecWriteBasic(int n, float *c, float *C, int ldc) {
...
@@ -2835,7 +3056,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
...
@@ -2835,7 +3056,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
void
SgemmWithBn
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
void
SgemmWithBn
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
)
{
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
)
{
// L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
// L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
// L2 cache is 0.5~4 Mib (Contex-A72 cluster)
// L2 cache is 0.5~4 Mib (Contex-A72 cluster)
int
L1
=
32
*
1024
;
int
L1
=
32
*
1024
;
...
@@ -2882,8 +3103,14 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
 #else
       PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
 #endif
-      InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
-                        &C(i, j), ldc, relu, new_scale + i, new_bias + i);
+      if (bias == nullptr) {
+        InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
+                          &C(i, j), ldc, relu, new_scale + i, new_bias + i);
+      } else {
+        InnerKernelWithBnAdd(mc, nc, alpha, packedA, packedB, beta, packedC,
+                             &C(i, j), ldc, relu, new_scale + i, new_bias + i,
+                             bias + i * ldc + j);
+      }
     }
   }
 }
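The dispatch keeps the original conv+bn(+relu) behavior whenever the new bias argument is null; only the fused conv+bn+add+relu path passes the residual tensor. A hedged usage sketch (buffer shapes assumed row-major, matching the leading-dimension conventions elsewhere in gemm.cpp):

#include <vector>
// Assumes "operators/math/gemm.h" from this repo is on the include path.
void RunFusedGemmExample(int m, int n, int k) {
  std::vector<float> A(m * k, 1.f), B(k * n, 1.f), C(m * n, 0.f);
  std::vector<float> scale(m, 1.f), shift(m, 0.f);  // folded BN parameters
  std::vector<float> add_in(m * n, 0.5f);           // elementwise-add input

  // Old behavior (bn + relu only): pass nullptr for the add input.
  paddle_mobile::operators::math::SgemmWithBn(
      m, n, k, 1.f, A.data(), k, B.data(), n, 0.f, C.data(), n, true,
      scale.data(), shift.data(), nullptr);

  // New fused path (bn + add + relu): pass the residual tensor.
  paddle_mobile::operators::math::SgemmWithBn(
      m, n, k, 1.f, A.data(), k, B.data(), n, 0.f, C.data(), n, true,
      scale.data(), shift.data(), add_in.data());
}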
...
@@ -3071,7 +3298,8 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
 void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
                      const float *B, int ldb, float beta, float *C, int ldc,
-                     bool relu, float *new_scale, float *new_bias) {
+                     bool relu, float *new_scale, float *new_bias,
+                     float *bias) {
 #ifdef _OPENMP
   int max_threads = omp_get_max_threads();
 #else
...
@@ -3148,8 +3376,14 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
       float *local_A = packedA + MC * KC * local_threads;
       float *local_C = packedC + MC * NC * local_threads;
       procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A);
-      InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C,
-                        &C(i, 0), ldc, relu, new_scale + i, new_bias + i);
+      if (bias == nullptr) {
+        InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C,
+                          &C(i, 0), ldc, relu, new_scale + i, new_bias + i);
+      } else {
+        InnerKernelWithBnAdd(mc, n, alpha, local_A, packedB, beta, local_C,
+                             &C(i, 0), ldc, relu, new_scale + i, new_bias + i,
+                             bias + i * ldc);
+      }
     }
   } else {
 #pragma omp parallel for
...
@@ -3165,8 +3399,14 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
       float *local_B = packedB + KC * NC * local_threads;
       float *local_C = packedC + MC * NC * local_threads;
       procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B);
-      InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C,
-                        &C(0, j), ldc, relu, new_scale, new_bias);
+      if (bias == nullptr) {
+        InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C,
+                          &C(0, j), ldc, relu, new_scale, new_bias);
+      } else {
+        InnerKernelWithBnAdd(m, nc, alpha, packedA, local_B, beta, local_C,
+                             &C(0, j), ldc, relu, new_scale, new_bias,
+                             bias + j);
+      }
     }
   }
 }
...
@@ -3185,7 +3425,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
   int max_threads = 1;
 #endif

-  int L1 = 32 * 1024;
+  int L1 = 8 * 1024;
   KC = k;
   if (m > n) {
     // tile matrix A by rows
...
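Shrinking the assumed L1 budget from 32 KiB to 8 KiB makes the m-direction blocks smaller, so each thread's packed panel of A stays cache-resident. A sketch of the blocking arithmetic (modeled on this file's conventions, with MR = 6 to match PackMatrixA_6r; not an exact copy of the commit):

// Pick MC so an MC x KC panel of packed floats fits the L1 budget,
// rounded up to a multiple of the register-tile height MR.
int ChooseMC(int k) {
  const int MR = 6;
  const int L1 = 8 * 1024;  // the new, smaller L1 assumption
  int KC = k;
  int MC = L1 / (KC * static_cast<int>(sizeof(float)));
  if (MC == 0) return MR;
  return (MC + MR - 1) / MR * MR;
}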
src/operators/math/gemm.h  View file @ 5e5aaeb0
...
@@ -81,6 +81,10 @@ void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
 void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
                        const float *b, float beta, float *c, float *C, int ldc,
                        bool relu, float *new_scale, float *new_bias);
+void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
+                          const float *b, float beta, float *c, float *C,
+                          int ldc, bool relu, float *new_scale, float *new_bias,
+                          float *bias);
 void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
                           float *c, float *C, int ldc, float *p,
                           std::string mode, float *bias, float *bias1);
...
@@ -125,7 +129,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
 // C = A * B, batchnorm(C), relu(C)
 void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
                      float *new_scale, float *new_bias);
+void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
+                        float *new_scale, float *new_bias, float *bias1);
 /*
 // Write back the result of vector-matrix multiplication
 // C = A * B
...
@@ -152,8 +157,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
 // 32-bit float matrix multiplication, with batchnorm applied to the result
 void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
                  const float *B, int ldb, float beta, float *C, int ldc,
-                 bool relu, float *new_scale, float *new_bias);
+                 bool relu, float *new_scale, float *new_bias, float *bias);
 void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
                     const float *B, int ldb, float *C, int ldc, float *p,
                     std::string mode, float *bias, float *bias1);
...
@@ -166,7 +170,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
 // 32-bit float matrix multiplication with batchnorm (OpenMP multithreaded version)
 void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
                      const float *B, int ldb, float beta, float *C, int ldc,
-                     bool relu, float *new_scale, float *new_bias);
+                     bool relu, float *new_scale, float *new_bias, float *bias);
 void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
                         const float *B, int ldb, float *C, int ldc, float *p,
...
src/operators/math/math_function.cpp  View file @ 5e5aaeb0
...
@@ -56,7 +56,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
                          const framework::Tensor &matrix_b, bool trans_b,
                          float alpha, framework::Tensor *matrix_out, float beta,
                          bool relu, framework::Tensor *new_scale,
-                         framework::Tensor *new_bias, int group) {
+                         framework::Tensor *new_bias, int group, float *bias) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
   auto dim_out = matrix_out->dims();
...
@@ -79,12 +79,12 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
   SgemmWithBn_omp(M, N, K, alpha, matrix_a.data<float>(), K,
                   matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
                   relu, new_scale->data<float>() + group,
-                  new_bias->data<float>() + group);
+                  new_bias->data<float>() + group, bias);
 #else
   SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
               N, beta, matrix_out->data<float>(), N, relu,
               new_scale->data<float>() + group,
-              new_bias->data<float>() + group);
+              new_bias->data<float>() + group, bias);
 #endif
 }

 void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
...
src/operators/math/math_function.h  View file @ 5e5aaeb0
...
@@ -32,7 +32,7 @@ void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
                   const framework::Tensor &matrix_b, bool trans_b, T alpha,
                   framework::Tensor *matrix_out, T beta, bool relu,
                   framework::Tensor *new_scale, framework::Tensor *new_bias,
-                  int group);
+                  int group, float *bias = nullptr);
 void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
                      const framework::Tensor &matrix_b, bool trans_b,
...
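Note the default argument: `float *bias = nullptr` keeps every existing matmulWithBn call site source-compatible, so only the new conv+bn+add+relu kernel has to pass the residual pointer explicitly.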
src/operators/op_param.h  View file @ 5e5aaeb0
...
@@ -1472,6 +1472,119 @@ class FusionConvAddBNReluParam : public OpParam {
 };
 #endif

+#ifdef FUSION_CONVBNADDRELU_OP
+template <typename Dtype>
+class FusionConvBNAddReluParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  FusionConvBNAddReluParam(const VariableNameMap &inputs,
+                           const VariableNameMap &outputs,
+                           const AttributeMap &attrs, const Scope &scope) {
+    bias_ = InputYFrom<GType>(inputs, scope);
+    axis_ = GetAttr<int>("axis", attrs);
+    filter_ = FilterFrom<GType>(inputs, scope);
+    input_ = InputFrom<GType>(inputs, scope);
+    output_ = OutFrom<GType>(outputs, scope);
+    strides_ = GetAttr<vector<int>>("strides", attrs);
+    paddings_ = GetAttr<vector<int>>("paddings", attrs);
+    dilations_ = GetAttr<vector<int>>("dilations", attrs);
+    groups = GetAttr<int>("groups", attrs);
+    input_bias_ = InputBiasFrom<GType>(inputs, scope);
+    input_mean_ = InputMeanFrom<GType>(inputs, scope);
+    input_scale_ = InputScaleFrom<GType>(inputs, scope);
+    input_variance_ = InputVarianceFrom<GType>(inputs, scope);
+    epsilon_ = GetAttr<float>("epsilon", attrs);
+    momentum_ = GetAttr<float>("momentum", attrs);
+    keyBNY_ = getkey("BNY", inputs, 0);
+    keyX_ = getkey("X", inputs, 0);
+    keyY_ = getkey("Y", inputs, 0);
+    if (keyX_ == keyBNY_) {
+      bias_ = InputYFrom<GType>(inputs, scope);
+    } else if (keyY_ == keyBNY_) {
+      bias_ = InputXFrom<GType>(inputs, scope);
+    }
+    //    is_test_ = GetAttr<bool>("is_test", attrs);
+  }
+  RType *Bias() const { return bias_; }
+
+  const int &Axis() const { return axis_; }
+
+  const RType *Input() const { return input_; }
+
+#ifdef PADDLE_MOBILE_FPGA
+  RType *Filter() const { return filter_; }
+#else
+  const RType *Filter() const { return filter_; }
+#endif
+
+  RType *Output() const { return output_; }
+
+  const vector<int> &Strides() const { return strides_; }
+
+  const vector<int> &Paddings() const { return paddings_; }
+
+  const vector<int> &Dilations() const { return dilations_; }
+
+  const int &Groups() const { return groups; }
+
+  const RType *InputBias() const { return input_bias_; }
+
+  const RType *InputMean() const { return input_mean_; }
+
+  const RType *InputScale() const { return input_scale_; }
+
+  const RType *InputVariance() const { return input_variance_; }
+
+  const float &Epsilon() const { return epsilon_; }
+
+  const float &Momentum() const { return momentum_; }
+
+  const bool &IsTest() const { return is_test_; }
+
+  void SetNewScale(RType *new_scale) { new_scale_ = new_scale; }
+
+  void SetNewBias(RType *new_bias) { new_bias_ = new_bias; }
+
+  const RType *NewScale() const { return new_scale_; }
+
+  const RType *NewBias() const { return new_bias_; }
+
+ protected:
+  RType *bias_;
+  int axis_;
+  RType *input_;
+  RType *output_;
+  RType *filter_;
+  vector<int> strides_;
+  vector<int> paddings_;
+  vector<int> dilations_;
+  int groups;
+  RType *input_bias_;
+  RType *input_mean_;
+  RType *input_scale_;
+  RType *input_variance_;
+  float epsilon_;
+  float momentum_;
+  bool is_test_;
+  RType *new_bias_;
+  RType *new_scale_;
+  std::string keyBNY_;
+  std::string keyX_;
+  std::string keyY_;
+#ifdef PADDLE_MOBILE_FPGA
+
+ private:
+  fpga::ConvArgs fpga_conv_args;
+
+ public:
+  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
+#endif
+};
+#endif
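Two details of this class are worth spelling out. First, the constructor resolves which elementwise-add input is the external residual: the add takes X and Y, and whichever of them matches the batch-norm output key "BNY" is produced inside the fusion, so bias_ is taken from the other one. Second, SetNewScale/SetNewBias are filled in by the kernel's Init step, which folds the BN statistics into one per-channel scale and shift. A minimal sketch of that folding, assuming the standard BN parameterization (it mirrors what this repo's conv+bn kernels do, but is not copied from the commit):

#include <cmath>
#include <vector>

// Fold BN(x) = scale * (x - mean) / sqrt(var + eps) + shift into
// y = new_scale * x + new_bias, so the GEMM epilogue can apply it cheaply.
void FoldBatchNorm(const std::vector<float> &scale,
                   const std::vector<float> &shift,
                   const std::vector<float> &mean,
                   const std::vector<float> &var, float epsilon,
                   std::vector<float> *new_scale,
                   std::vector<float> *new_bias) {
  size_t channels = scale.size();
  new_scale->resize(channels);
  new_bias->resize(channels);
  for (size_t i = 0; i < channels; ++i) {
    float inv_std = 1.f / std::sqrt(var[i] + epsilon);
    (*new_scale)[i] = scale[i] * inv_std;
    (*new_bias)[i] = shift[i] - mean[i] * scale[i] * inv_std;
  }
}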
 #ifdef FUSION_CONVBN_OP
 template <typename Dtype>
 class FusionConvBNParam : public OpParam {
...
test/CMakeLists.txt  View file @ 5e5aaeb0
...
@@ -35,6 +35,9 @@ elseif("FPGAnets" IN_LIST NET)
     ADD_EXECUTABLE(test-fpga-concat-op fpga/test_concat_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-fpga-concat-op paddle-mobile)

+    ADD_EXECUTABLE(test-format-data fpga/test_format_data.cpp test_helper.h test_include.h)
+    target_link_libraries(test-format-data paddle-mobile)
+
 elseif ("mobilenetssd" IN_LIST NET)
     # gen test
     ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h)
...
test/common/test_gemm_accuracy.cpp  View file @ 5e5aaeb0
...
@@ -83,8 +83,8 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
     }
   }

-  paddle_mobile::operators::math::SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb,
-                                              0.3, c, ldc, relu, scale, bias);
+  paddle_mobile::operators::math::SgemmWithBn(
+      m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, nullptr);

   int eq = 0;
   int neq = 0;
   for (int i = 0; i < m * n; ++i) {
...
test/fpga/test_format_data.cpp  0 → 100644  View file @ 5e5aaeb0

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
#include "fpga/api.h"

namespace frame = paddle_mobile::framework;
namespace fpga = paddle_mobile::fpga;
using std::cout;
using std::endl;

int main() {
  std::vector<int> dims{1, 1, 3, 3};
  std::vector<float> elements{1, 2, 3, 4, 5, 6, 7, 8, 9};
  frame::DDim ddim = frame::make_ddim(dims);
  frame::Tensor image(elements, ddim);
  int num = image.numel();
  float *data_ptr = image.mutable_data<float>();

  for (int i = 0; i < num; i++) {
    cout << data_ptr[i] << " ";
  }
  cout << endl;

  fpga::format_image(&image);
  data_ptr = image.mutable_data<float>();

  for (int i = 0; i < 48; i++) {
    cout << data_ptr[i] << " ";
  }
  cout << endl;

  auto dd = image.dims();
  cout << dims[0] << dims[1] << dims[2] << dims[3] << endl;

  return 0;
}
tools/op.cmake  View file @ 5e5aaeb0
...
@@ -87,6 +87,7 @@ if ("resnet" IN_LIST NET)
     set(ELEMENTWISEADD_OP ON)
     set(POOL_OP ON)
     set(BATCHNORM_OP ON)
+    set(FUSION_CONVBNADDRELU_OP ON)
     set(MUL_OP ON)
     set(RESHAPE_OP ON)
     set(SOFTMAX_OP ON)
...
@@ -141,6 +142,7 @@ if(NOT FOUND_MATCH)
     set(FUSION_CONVADDADDPRELU_OP ON)
     set(FUSION_DWCONVBNRELU_OP ON)
     set(FUSION_CONVBNRELU_OP ON)
+    set(FUSION_CONVBNADDRELU_OP ON)
     set(PRELU_OP ON)
     set(RESIZE_OP ON)
     set(SCALE_OP ON)
...
@@ -244,6 +246,10 @@ if (FUSION_CONVBNRELU_OP)
     add_definitions(-DFUSION_CONVBNRELU_OP)
 endif()

+if (FUSION_CONVBNADDRELU_OP)
+    add_definitions(-DFUSION_CONVBNADDRELU_OP)
+endif()
+
 if (PRELU_OP)
     add_definitions(-DPRELU_OP)
 endif()
...
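The new FUSION_CONVBNADDRELU_OP switch gates compilation of the fused operator. A hypothetical sketch of how such a define is typically consumed (it mirrors the registration pattern the other fusion ops in this repo use; not copied from the commit):

#ifdef FUSION_CONVBNADDRELU_OP
namespace ops = paddle_mobile::operators;
// Compiled only when tools/op.cmake turned the op on, e.g. for NET=resnet.
REGISTER_OPERATOR_CPU(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif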