PaddlePaddle / Paddle-Lite

Commit 85ba3b69
Author: qnqinan
Date:   Mar 20, 2019

    Merge branch 'develop' of https://github.com/PaddlePaddle/paddle-mobile into develop

Parents: 4ea56919, 8c2d98f1

16 changed files with 4073 additions and 258 deletions (+4073 -258).
Changed files:

  src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp   +4    -0
  src/operators/kernel/arm/convolution/conv_add_kernel.cpp           +5    -3
  src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp      +4    -0
  src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp   +4    -0
  src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp       +4    -0
  src/operators/kernel/arm/convolution/conv_common.cpp               +22   -2
  src/operators/kernel/arm/convolution/conv_kernel.cpp               +4    -0
  src/operators/kernel/central-arm-func/conv_arm_func.cpp            +20   -0
  src/operators/kernel/central-arm-func/conv_arm_func.h              +3    -0
  src/operators/kernel/fpga/V1/proposal_kernel.cpp                   +9    -7
  src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp                 +109  -86
  src/operators/math/slidingwindow_conv3x3.cpp (new file)            +3710 -0
  src/operators/math/slidingwindow_conv3x3.h                         +38   -0
  src/operators/op_param.h                                           +2    -0
  test/fpga/test_marker.cpp                                          +78   -120
  test/fpga/test_marker_api.cpp                                      +57   -40
src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp

@@ -78,6 +78,10 @@ void ConvAddBNReluKernel<CPU, float>::Compute(
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
       GemmConv<float, float>(param);
       break;
+    case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
+    case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S2_FLOAT:
+      SlidingwindowConv3x3<float, float>(param);
+      break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
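All five ARM convolution kernels touched by this commit gain the same two case labels, routing both new sliding-window modes into one entry point. A minimal, self-contained sketch of that shared dispatch shape (ExecMode, GemmConv, and SlidingwindowConv3x3 here are simplified stand-ins for the project's ConvParam<CPU> machinery; only the structure of the switch is taken from the diff above):

#include <cstdio>

enum class ExecMode { kGemm, kSlidingWindow3x3S1, kSlidingWindow3x3S2 };

void GemmConv() { std::puts("im2col + GEMM path"); }
void SlidingwindowConv3x3() { std::puts("direct sliding-window path"); }

void Compute(ExecMode mode) {
  switch (mode) {
    case ExecMode::kGemm:
      GemmConv();
      break;
    // both stride variants share one entry point, as in the diff
    case ExecMode::kSlidingWindow3x3S1:
    case ExecMode::kSlidingWindow3x3S2:
      SlidingwindowConv3x3();
      break;
  }
}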
src/operators/kernel/arm/convolution/conv_add_kernel.cpp

@@ -32,10 +32,8 @@ template <>
 void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam<CPU> &param) {
   switch (param.ExecMode()) {
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
+      DepthwiseConv3x3<float, float>(param);
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
@@ -46,6 +44,10 @@ void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam<CPU> &param) {
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
       GemmConv<float, float>(param);
       break;
+    case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
+    case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S2_FLOAT:
+      SlidingwindowConv3x3<float, float>(param);
+      break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
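Besides the sliding-window cases, this file also folds the two depthwise 3x3 stride cases into the single param-based DepthwiseConv3x3<float, float>(param) helper (its explicit instantiation appears in the conv_arm_func.cpp diff below). A hypothetical, self-contained illustration of that call-style change, with "Param" as a stand-in for the project's ConvParam<CPU>; the internal stride dispatch is an assumption, not shown in this diff:

#include <iostream>

struct Param {
  int stride;
};

template <typename Itype, typename Otype>
void DepthwiseConv3x3(const Param &param) {
  // assumed: the wrapper picks the stride specialization itself,
  // so callers no longer unpack input/filter/paddings/output
  if (param.stride == 1) {
    std::cout << "depthwise 3x3, stride 1\n";
  } else if (param.stride == 2) {
    std::cout << "depthwise 3x3, stride 2\n";
  }
}

int main() {
  DepthwiseConv3x3<float, float>(Param{1});
  DepthwiseConv3x3<float, float>(Param{2});
}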
src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp

@@ -45,6 +45,10 @@ void ConvAddReluKernel<CPU, float>::Compute(
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
       GemmConv<float, float>(param);
       break;
+    case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
+    case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S2_FLOAT:
+      SlidingwindowConv3x3<float, float>(param);
+      break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp

@@ -76,6 +76,10 @@ void ConvBNAddReluKernel<CPU, float>::Compute(
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
       GemmConv<float, float>(param);
       break;
+    case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
+    case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S2_FLOAT:
+      SlidingwindowConv3x3<float, float>(param);
+      break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
...
src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp
浏览文件 @
85ba3b69
...
@@ -75,6 +75,10 @@ void ConvBNReluKernel<CPU, float>::Compute(
...
@@ -75,6 +75,10 @@ void ConvBNReluKernel<CPU, float>::Compute(
case
ConvParam
<
CPU
>::
EXEC_GEMM_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_GEMM_FLOAT
:
GemmConv
<
float
,
float
>
(
param
);
GemmConv
<
float
,
float
>
(
param
);
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_SLIDINGWINDOW3x3S1_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_SLIDINGWINDOW3x3S2_FLOAT
:
SlidingwindowConv3x3
<
float
,
float
>
(
param
);
break
;
default:
default:
PADDLE_MOBILE_THROW_EXCEPTION
(
"Invalid convolution execute mode %d"
,
PADDLE_MOBILE_THROW_EXCEPTION
(
"Invalid convolution execute mode %d"
,
param
.
ExecMode
());
param
.
ExecMode
());
...
...
src/operators/kernel/arm/convolution/conv_common.cpp
浏览文件 @
85ba3b69
...
@@ -57,8 +57,8 @@ void InitBaseConvKernel(ConvParam<CPU> *param) {
...
@@ -57,8 +57,8 @@ void InitBaseConvKernel(ConvParam<CPU> *param) {
param
->
Dilations
()[
0
]
==
param
->
Dilations
()[
1
]
&&
param
->
Dilations
()[
0
]
==
param
->
Dilations
()[
1
]
&&
param
->
Strides
()[
0
]
==
1
&&
param
->
Dilations
()[
0
]
==
1
param
->
Strides
()[
0
]
==
1
&&
param
->
Dilations
()[
0
]
==
1
#if 1
#if 1
&&
(
param
->
Input
()
->
dims
()[
1
]
>=
4
||
&&
(
param
->
Input
()
->
dims
()[
1
]
>=
8
&&
param
->
Output
()
->
dims
()[
1
]
>=
16
)
param
->
Output
()
->
dims
()[
1
]
>=
8
)
#endif
#endif
)
{
)
{
param
->
ExecMode
()
=
ConvParam
<
CPU
>::
EXEC_WINOGRAD3X3_FLOAT
;
param
->
ExecMode
()
=
ConvParam
<
CPU
>::
EXEC_WINOGRAD3X3_FLOAT
;
...
@@ -66,6 +66,26 @@ void InitBaseConvKernel(ConvParam<CPU> *param) {
...
@@ -66,6 +66,26 @@ void InitBaseConvKernel(ConvParam<CPU> *param) {
param
->
transformed_filter_
=
new
framework
::
LoDTensor
;
param
->
transformed_filter_
=
new
framework
::
LoDTensor
;
operators
::
math
::
winograd_transform_weight
<
8
,
3
>
(
operators
::
math
::
winograd_transform_weight
<
8
,
3
>
(
*
param
->
Filter
(),
param
->
transformed_filter_
);
*
param
->
Filter
(),
param
->
transformed_filter_
);
}
else
if
(
conv3x3
&&
!
depth3x3
&&
param
->
Strides
()[
0
]
==
param
->
Strides
()[
1
]
&&
param
->
Dilations
()[
0
]
==
param
->
Dilations
()[
1
]
&&
param
->
Strides
()[
0
]
==
1
&&
param
->
Dilations
()[
0
]
==
1
#if 1
&&
(
param
->
Input
()
->
dims
()[
2
]
>=
48
&&
param
->
Output
()
->
dims
()[
1
]
<=
24
)
#endif
)
{
param
->
ExecMode
()
=
ConvParam
<
CPU
>::
EXEC_SLIDINGWINDOW3x3S1_FLOAT
;
}
else
if
(
conv3x3
&&
!
depth3x3
&&
param
->
Strides
()[
0
]
==
param
->
Strides
()[
1
]
&&
param
->
Dilations
()[
0
]
==
param
->
Dilations
()[
1
]
&&
param
->
Strides
()[
0
]
==
2
&&
param
->
Dilations
()[
0
]
==
1
#if 1
&&
(
param
->
Input
()
->
dims
()[
2
]
>=
48
&&
param
->
Output
()
->
dims
()[
1
]
<=
24
)
#endif
)
{
param
->
ExecMode
()
=
ConvParam
<
CPU
>::
EXEC_SLIDINGWINDOW3x3S2_FLOAT
;
}
else
{
}
else
{
param
->
ExecMode
()
=
ConvParam
<
CPU
>::
EXEC_GEMM_FLOAT
;
param
->
ExecMode
()
=
ConvParam
<
CPU
>::
EXEC_GEMM_FLOAT
;
}
}
...
...
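The two new branches gate the direct sliding-window path on square stride and dilation, dilation 1, and a shape heuristic (input height >= 48, output channels <= 24), falling back to GEMM otherwise. A self-contained sketch of that selection logic; Mode and pick_mode are illustrative names, not the project's API, and the thresholds are taken from the reconstructed diff above:

#include <cstdio>

enum class Mode { kGemm, kSlidingWindow3x3S1, kSlidingWindow3x3S2 };

Mode pick_mode(int stride_h, int stride_w, int dilation, int input_h,
               int output_ch) {
  bool shape_ok = input_h >= 48 && output_ch <= 24;  // heuristic from diff
  if (stride_h == stride_w && dilation == 1 && shape_ok) {
    if (stride_h == 1) return Mode::kSlidingWindow3x3S1;
    if (stride_h == 2) return Mode::kSlidingWindow3x3S2;
  }
  return Mode::kGemm;  // everything else stays on im2col + GEMM
}

int main() {
  std::printf("%d\n", static_cast<int>(pick_mode(1, 1, 1, 64, 16)));  // S1
  std::printf("%d\n", static_cast<int>(pick_mode(2, 2, 1, 64, 16)));  // S2
  std::printf("%d\n", static_cast<int>(pick_mode(1, 1, 2, 64, 16)));  // GEMM
}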
src/operators/kernel/arm/convolution/conv_kernel.cpp

@@ -54,6 +54,10 @@ void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
       GemmConv<float, float>(param);
       break;
+    case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
+    case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S2_FLOAT:
+      SlidingwindowConv3x3<float, float>(param);
+      break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
src/operators/kernel/central-arm-func/conv_arm_func.cpp

@@ -19,6 +19,7 @@ limitations under the License. */
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/pad.h"
+#include "operators/math/slidingwindow_conv3x3.h"
 #include "operators/math/vol2col.h"
 #include "operators/math/winograd/winograd_transform.h"
 #include "operators/op_param.h"
@@ -232,10 +233,29 @@ void DepthwiseConv5x5(const ConvParam<CPU> &param) {
   }
 }

+template <typename Itype, typename Otype>
+void SlidingwindowConv3x3(const ConvParam<CPU> &param) {
+  const Tensor *input = param.Input();
+  const Tensor *filter = param.Filter();
+  const std::vector<int> &paddings = param.Paddings();
+  const std::vector<int> &strides = param.Strides();
+  Tensor *output = param.Output();
+  output->mutable_data<Otype>();
+
+  if (strides[0] == 1) {
+    math::SlidingwindowConv3x3s1<Itype, Otype>(input, filter, paddings,
+                                               output);
+  } else if (strides[0] == 2) {
+    math::SlidingwindowConv3x3s2<Itype, Otype>(input, filter, paddings,
+                                               output);
+  } else {
+    GemmConv<Itype, Otype>(param);
+  }
+}
+
 template void GemmConv<float, float>(const ConvParam<CPU> &param);
 template void WinogradConv3x3<8, 3>(const ConvParam<CPU> &param);
 template void DepthwiseConv3x3<float, float>(const ConvParam<CPU> &param);
 template void DepthwiseConv5x5<float, float>(const ConvParam<CPU> &param);
+template void SlidingwindowConv3x3<float, float>(const ConvParam<CPU> &param);
 #ifndef __aarch64__
 template void GemmConv<int8_t, int32_t>(const ConvParam<CPU> &param);
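Because SlidingwindowConv3x3 is a template defined in a .cpp file, the explicit instantiation line above is what makes the <float, float> specialization visible to the kernel files that call it. A minimal three-file illustration of the same pattern (file names and the run() function are illustrative, not from the project):

// lib.h
template <typename T> void run(const T &x);

// lib.cpp
#include <iostream>
template <typename T> void run(const T &x) { std::cout << x << "\n"; }
template void run<float>(const float &x);  // emit the definition here

// main.cpp -- run(1.0f) links only because lib.cpp explicitly
// instantiated run<float>; without that line the linker reports an
// undefined reference, the same failure the kernels above would hit.
#include "lib.h"
int main() { run(1.0f); }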
src/operators/kernel/central-arm-func/conv_arm_func.h

@@ -41,6 +41,9 @@ void DepthwiseConv3x3(const ConvParam<CPU> &param);
 template <typename Itype, typename Otype>
 void DepthwiseConv5x5(const ConvParam<CPU> &param);

+template <typename Itype, typename Otype>
+void SlidingwindowConv3x3(const ConvParam<CPU> &param);
+
 }  // namespace operators
 }  // namespace paddle_mobile
src/operators/kernel/fpga/V1/proposal_kernel.cpp

@@ -300,7 +300,7 @@ static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) {
 template <class T>
 static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold,
-                         float eta) {
+                         float eta, int post_nms_num = 100) {
   int64_t num_boxes = bbox->dims()[0];
   // 4: [xmin ymin xmax ymax]
   int64_t box_size = bbox->dims()[1];
@@ -314,7 +314,7 @@ static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold,
   int selected_num = 0;
   T adaptive_threshold = nms_threshold;
   const T *bbox_data = bbox->data<T>();
-  while (sorted_indices.size() != 0) {
+  while ((sorted_indices.size() != 0) && (selected_num < post_nms_num)) {
     int idx = sorted_indices.back().second;
     bool flag = true;
     for (int kept_idx : selected_indices) {
@@ -397,17 +397,19 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
     return std::make_pair(bbox_sel, scores_filter);
   }

-  Tensor keep_nms = NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta);
+  // Tensor keep_nms = NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta);
+  Tensor keep_nms =
+      NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta, post_nms_top_n);
   if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
     keep_nms.Resize({post_nms_top_n});
   }
-  proposals.mutable_data<T>({keep_nms.numel(), 4});
-  scores_sel.mutable_data<T>({keep_nms.numel(), 1});
+  // proposals.mutable_data<T>({keep_nms.numel(), 4});   // original
+  // scores_sel.mutable_data<T>({keep_nms.numel(), 1});  // original
+  proposals.mutable_data<T>({post_nms_top_n, 4});   // wong
+  scores_sel.mutable_data<T>({post_nms_top_n, 1});  // wong
   CPUGather<T>(bbox_sel, keep_nms, &proposals);
   CPUGather<T>(scores_filter, keep_nms, &scores_sel);
   return std::make_pair(proposals, scores_sel);
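The behavioural change to NMS is the new post_nms_num parameter: selection now stops as soon as that many boxes are kept, instead of scanning every candidate. A self-contained sketch of just that loop bound (scoring and IoU suppression are stubbed out; nms_cap is an illustrative name):

#include <cstdio>
#include <vector>

std::vector<int> nms_cap(std::vector<int> sorted_indices, int post_nms_num) {
  std::vector<int> selected;
  int selected_num = 0;
  // the added "selected_num < post_nms_num" condition bounds the loop
  while (!sorted_indices.empty() && selected_num < post_nms_num) {
    int idx = sorted_indices.back();
    sorted_indices.pop_back();
    selected.push_back(idx);  // real code first checks IoU against kept boxes
    ++selected_num;
  }
  return selected;
}

int main() {
  std::printf("kept %zu boxes\n", nms_cap({5, 4, 3, 2, 1}, 3).size());  // 3
}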
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp

@@ -15,7 +15,6 @@ limitations under the License. */
 #ifdef PSROI_POOL_OP

 #include <cmath>
-#include <memory>
 #include <vector>
 #include "operators/kernel/detection_kernel.h"

@@ -72,16 +71,72 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
   return true;
 }

+/*
+template <typename Dtype>
+void PSROIPoolingForward(
+    const Dtype* bottom_data,
+    const int height, const int width, const int input_channel,
+    Dtype* top_data,
+    const int pooled_height, const int pooled_width, const int output_channel,
+    const Dtype* bottom_rois,
+    const Dtype Bin_size_h, const Dtype Bin_size_w, const Dtype roi_start_h,
+    const Dtype roi_start_w, const int pw, const int ph,
+    const int roi_batch_ind) {
+  int hstart = floor(static_cast<Dtype>(ph) * Bin_size_h + roi_start_h);
+  int wstart = floor(static_cast<Dtype>(pw) * Bin_size_w + roi_start_w);
+  int hend = ceil(static_cast<Dtype>(ph + 1) * Bin_size_h + roi_start_h);
+  int wend = ceil(static_cast<Dtype>(pw + 1) * Bin_size_w + roi_start_w);
+  hstart = std::min(std::max(hstart, 0), height);
+  hend = std::min(std::max(hend, 0), height);
+  wstart = std::min(std::max(wstart, 0), width);
+  wend = std::min(std::max(wend, 0), width);
+  bool is_empty = (hend <= hstart) || (wend <= wstart);
+  float32x4_t sum_pixels_low_c = vdupq_n_f32(0);
+  float32x4_t sum_pixels_high_c = vdupq_n_f32(0);
+  if (!is_empty) {
+    Dtype bin_area = (hend - hstart) * (wend - wstart);
+    float rev_bin_area = 1 / bin_area;
+    float32x4_t q_bin_area = vdupq_n_f32(rev_bin_area);
+    // static_cast<float>(bin_area)
+    float pixels_c[output_channel];
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        int pixel_offset = (h * width + w) * input_channel;
+        for (int output_c = 0; output_c < output_channel; output_c++) {
+          int input_channel_offset = output_c * pooled_height * pooled_width;
+          int input_bias =
+              pixel_offset + input_channel_offset + ph * pooled_width + pw;
+          pixels_c[output_c] = bottom_data[input_bias];
+        }
+        float32x4_t pixel_low_c = vld1q_f32(pixels_c);
+        float32x4_t pixel_high_c = vld1q_f32(pixels_c + 4);
+        sum_pixels_low_c = vaddq_f32(sum_pixels_low_c, pixel_low_c);
+        sum_pixels_high_c = vaddq_f32(sum_pixels_high_c, pixel_high_c);
+      }
+    }
+    sum_pixels_low_c = vmulq_f32(sum_pixels_low_c, q_bin_area);
+    sum_pixels_high_c = vmulq_f32(sum_pixels_high_c, q_bin_area);
+  }
+  int output_index_base = (ph * pooled_width + pw) * output_channel;
+  top_data += output_index_base;
+  vst1q_f32(top_data, sum_pixels_low_c);
+  top_data += 4;
+  vst1q_f32(top_data, sum_pixels_high_c);
+}*/
+
 template <typename Dtype>
-void PSROIPooling(const Dtype* bottom_data, const int channels,
-                  const int height, const int width, const int pooled_height,
-                  const int pooled_width, const Dtype* bottom_rois,
-                  const int output_dim, const int group_size, Dtype* top_data,
-                  int index, int nid, const Dtype Bin_size_h,
-                  const Dtype Bin_size_w, const Dtype roi_start_h,
-                  const Dtype roi_start_w, const int ctop, const int ph,
-                  const int roi_batch_ind) {
-  int pw = index;
+void PSROIPoolingForward(const Dtype* bottom_data, const int height,
+                         const int width, const int input_channel,
+                         Dtype* top_data, const int pooled_height,
+                         const int pooled_width, const int output_channel,
+                         const Dtype* bottom_rois, const Dtype Bin_size_h,
+                         const Dtype Bin_size_w, const Dtype roi_start_h,
+                         const Dtype roi_start_w, const int pw, const int ph,
+                         const int roi_batch_ind) {
   int hstart = floor(static_cast<Dtype>(ph) * Bin_size_h + roi_start_h);
   int wstart = floor(static_cast<Dtype>(pw) * Bin_size_w + roi_start_w);
   int hend = ceil(static_cast<Dtype>(ph + 1) * Bin_size_h + roi_start_h);
@@ -94,60 +149,35 @@ void PSROIPooling(const Dtype* bottom_data, const int channels,
   wend = std::min(std::max(wend, 0), width);
   bool is_empty = (hend <= hstart) || (wend <= wstart);
-  int c = (ctop * group_size + ph) * group_size + pw;
+  float sum_pixels_c[output_channel] = {0};
+  float pixels_c[output_channel] = {0};
   if (!is_empty) {
     Dtype bin_area = (hend - hstart) * (wend - wstart);
-    bottom_data += (roi_batch_ind * channels + c) * height * width;
-    Dtype out_sum = 0;
+    float rec_bin_area = 1 / bin_area;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
-        int bottom_index = h * width + w;
-        out_sum += bottom_data[bottom_index];
+        int pixel_offset = (h * width + w) * input_channel;
+        for (int output_c = 0; output_c < output_channel; output_c++) {
+          int input_channel_offset = output_c * pooled_height * pooled_width;
+          int input_bias =
+              pixel_offset + input_channel_offset + ph * pooled_width + pw;
+          pixels_c[output_c] = bottom_data[input_bias];
+        }
+        for (int output_c = 0; output_c < output_channel; output_c++) {
+          sum_pixels_c[output_c] += pixels_c[output_c];
+        }
       }
     }
-    top_data[nid + index] = is_empty ? 0. : out_sum / bin_area;
+    for (int output_c = 0; output_c < output_channel; output_c++) {
+      sum_pixels_c[output_c] *= rec_bin_area;
+    }
   }
-}
-
-void convert_to_chw(float** data_in, int channel, int height, int width,
-                    int num) {
-  float* data_in_tmp = *data_in;
-  float* data_tmp = reinterpret_cast<float*>(
-      fpga::fpga_malloc(channel * height * width * sizeof(float)));  // NOLINT
-  int64_t amount_per_side = width * height;
-  for (int n = 0; n < num; n++) {
-    for (int h = 0; h < height; h++) {
-      for (int w = 0; w < width; w++) {
-        for (int c = 0; c < channel; c++) {
-          *(data_tmp + n * height * width * channel + c * amount_per_side +
-            width * h + w) = *((*data_in)++);
-        }
-      }
-    }
-  }
-  *data_in = data_tmp;
-  fpga::fpga_free(data_in_tmp);
-}
-
-void convert_to_hwc(float** data_in, int channel, int height, int width,
-                    int num) {
-  float* data_in_tmp = *data_in;
-  float* data_tmp = reinterpret_cast<float*>(
-      fpga::fpga_malloc(num * channel * height * width * sizeof(float)));
-  int64_t amount_per_row = width * channel;
-  for (int n = 0; n < num; n++) {
-    for (int c = 0; c < channel; c++) {
-      for (int h = 0; h < height; h++) {
-        int64_t offset_height = h * amount_per_row;
-        for (int w = 0; w < width; w++) {
-          *(data_tmp + n * channel * height * width + offset_height +
-            w * channel + c) = *((*data_in)++);
-        }
-      }
-    }
-  }
-  *data_in = data_tmp;
-  fpga::fpga_free(data_in_tmp);
+  int output_index_base = (ph * pooled_width + pw) * output_channel;
+  top_data += output_index_base;
+  memcpy(top_data, sum_pixels_c, output_channel * 4);
 }

@@ -174,14 +204,15 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
   int rois_num = rois->dims()[0];
   auto data_nhwc = in->mutable_data<float>();
-  fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width, 1);
+  // fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);
   framework::DDim dims_out_new = framework::make_ddim(
       {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
        (param.output_)->dims()[3]});
   (param.output_)->Resize(dims_out_new);
-  float* input_data = data_nhwc;  // in->data<float>();
+  const float* input_data = data_nhwc;  // in->data<float>();
+  // shared_ptr<float> input_data(data_nhwc);
   framework::Tensor rois_batch_id_list;
   rois_batch_id_list.Resize({rois_num});
   auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
@@ -203,18 +234,19 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
                     "output_channels x pooled_height x pooled_width");

   // calculate batch id index for each roi according to LoD
-  // for (int n = 0; n < rois_batch_size; ++n) {
-  //   for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-  //     rois_batch_id_data[i] = n;
-  //   }
-  // }
+  for (int n = 0; n < rois_batch_size; ++n) {
+    for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+      rois_batch_id_data[i] = n;
+    }
+  }
   auto output_data = out->mutable_data<float>();
   auto input_rois = rois->data<float>();
+  // calculate psroipooling, parallel processing can be implemented per ROI
   for (int n = 0; n < rois_num; ++n) {
     // [start, end) interval for spatial sampling
     auto offset_input_rois = input_rois + n * 4;
+    auto offset_output_data =
+        output_data + pooled_height * pooled_width * output_channels * n;
     auto roi_start_w =
         static_cast<float>(round(offset_input_rois[0])) * spatial_scale;
     auto roi_start_h =
@@ -232,27 +264,18 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
     auto bin_size_h = roi_height / static_cast<float>(pooled_height);
     auto bin_size_w = roi_width / static_cast<float>(pooled_width);

-    int roi_batch_ind = 0;  // rois_batch_id_data[n];
-    for (int c = 0; c < output_channels; ++c) {
-      for (int ph = 0; ph < pooled_height; ph++) {
-        int index = pooled_width;
-        for (int pw = 0; pw < pooled_width; pw++) {
-          int nid = n * output_channels * pooled_height * pooled_width +
-                    c * pooled_width * pooled_height + ph * pooled_width;
-          for (int idx = 0; idx < index; idx++) {
-            PSROIPooling<float>(input_data, input_channels, height, width,
-                                pooled_height, pooled_width, input_rois,
-                                output_channels, pooled_height, output_data,
-                                idx, nid, bin_size_h, bin_size_w, roi_start_h,
-                                roi_start_w, c, ph, roi_batch_ind);
-          }
-        }
-      }
-    }
-  }
+    int roi_batch_ind = rois_batch_id_data[n];
+    // std::cout << "roi_batch_ind: " << roi_batch_ind << std::endl;
+    for (int ph = 0; ph < pooled_height; ph++) {
+      for (int pw = 0; pw < pooled_width; pw++) {
+        PSROIPoolingForward<float>(input_data, height, width, input_channels,
+                                   offset_output_data, pooled_height,
+                                   pooled_width, output_channels, input_rois,
+                                   bin_size_h, bin_size_w, roi_start_h,
+                                   roi_start_w, pw, ph, roi_batch_ind);
+      }
+    }
+  }
+  fpga::fpga_free(input_data);
+  fpga::image::convert_to_hwc(&output_data, output_channels, pooled_height,
+                              pooled_width, rois_num);
+  out->reset_data_ptr(output_data);
 }

 }  // namespace operators
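Both the old and the new pooling routines share the same bin-boundary arithmetic: each (ph, pw) cell of the pooled grid maps to a [hstart, hend) x [wstart, wend) window, clamped to the feature map. A self-contained sketch of just that step (bin_bounds is an illustrative name):

#include <algorithm>
#include <cmath>
#include <cstdio>

void bin_bounds(float bin_size_h, float bin_size_w, float roi_start_h,
                float roi_start_w, int ph, int pw, int height, int width) {
  int hstart = std::floor(ph * bin_size_h + roi_start_h);
  int wstart = std::floor(pw * bin_size_w + roi_start_w);
  int hend = std::ceil((ph + 1) * bin_size_h + roi_start_h);
  int wend = std::ceil((pw + 1) * bin_size_w + roi_start_w);
  // clamp to the feature map; empty bins contribute zeros
  hstart = std::min(std::max(hstart, 0), height);
  hend = std::min(std::max(hend, 0), height);
  wstart = std::min(std::max(wstart, 0), width);
  wend = std::min(std::max(wend, 0), width);
  bool is_empty = (hend <= hstart) || (wend <= wstart);
  std::printf("h:[%d,%d) w:[%d,%d) empty:%d\n", hstart, hend, wstart, wend,
              is_empty);
}

int main() { bin_bounds(2.5f, 2.5f, 4.f, 4.f, 1, 2, 32, 32); }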
src/operators/math/slidingwindow_conv3x3.cpp (new file, 0 → 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "operators/math/slidingwindow_conv3x3.h"
#include <vector>
#if __ARM_NEON
#include <arm_neon.h>
#endif
#ifdef _OPENMP
#include <omp.h>
#endif

namespace paddle_mobile {
namespace operators {
namespace math {
template <>
void SlidingwindowConv3x3s1<float, float>(const framework::Tensor *input,
                                          const framework::Tensor *filter,
                                          const std::vector<int> &paddings,
                                          framework::Tensor *output) {
  const int batch = input->dims()[0];
  const int input_ch = input->dims()[1];
  const int input_h = input->dims()[2];
  const int input_w = input->dims()[3];
  const int output_ch = output->dims()[1];
  const int output_h = output->dims()[2];
  const int output_w = output->dims()[3];
  const int padding_h = paddings[0];
  const int padding_w = paddings[1];

  const float *input_data = input->data<float>();
  float *output_data = output->mutable_data<float>();
  const float *filter_data = filter->data<float>();

  const int in_ch_size = input_h * input_w;
  const int in_batch_size = input_ch * in_ch_size;
  const int out_ch_size = output_h * output_w;
  const int out_batch_size = output_ch * out_ch_size;
  const int out_size = batch * out_batch_size;
  const int filter_ch_size = 9;
  const int pad_filter_ch_size = (2 * padding_h + 3) * (2 * padding_w + 3);
  const int pad_filter_start =
      2 * padding_h * (2 * padding_w + 3) + 2 * padding_w;
  const int pad_filter_w = 3 + padding_w * 2;
  bool if_nopadding = false;
#if __ARM_NEON
  // zero the output, four floats at a time, then handle the tail
  float *out_ptr = output_data;
  int remain = out_size & 0x3;
  float32x4_t _zero = vdupq_n_f32(0.0);

  for (int i = 0; i < out_size; i += 4) {
    vst1q_f32(out_ptr, _zero);
    out_ptr += 4;
  }
  switch (remain) {
    case 1:
      vst1q_lane_f32(out_ptr, _zero, 0);
      break;
    case 2:
      vst1_f32(out_ptr, vget_low_f32(_zero));
      break;
    case 3:
      vst1_f32(out_ptr, vget_low_f32(_zero));
      vst1q_lane_f32(out_ptr + 2, _zero, 0);
      break;
  }
#else
#pragma omp parallel for
  for (int i = 0; i < out_size; ++i) {
    output_data[i] = 0;
  }
#endif
  if (padding_h == 0 && padding_w == 0) {
    if_nopadding = true;
  }
  for (int b = 0; b < batch; ++b) {
#pragma omp parallel for
    // two output channels are produced per iteration
    for (int o_c = 0; o_c < output_ch - 1; o_c += 2) {
      bool issamefilter;
      const float *f1;
      const float *f1_c2;
      const float *in_ptr1, *in_ptr2, *in_ptr3, *in_ptr4;
      const float *pad_filter0, *pad_filter1, *pad_filter2, *pad_filter3;
      const float *pad_filter0_c2, *pad_filter1_c2, *pad_filter2_c2,
          *pad_filter3_c2;
      float pad_filter_arr[pad_filter_ch_size];
      float pad_filter_arr_c2[pad_filter_ch_size];

      float *output_data_ch;
      float *output_data_ch_2;
      const float *input_data_ch;
      const float *filter_data_ch;
      const float *filter_data_ch_c2;

      filter_data_ch = filter_data + o_c * filter_ch_size * input_ch;
      filter_data_ch_c2 = filter_data + (o_c + 1) * filter_ch_size * input_ch;
      input_data_ch = input_data;
      output_data_ch = output_data + o_c * out_ch_size;
      output_data_ch_2 = output_data + (o_c + 1) * out_ch_size;

      for (int i_c = 0; i_c < input_ch; ++i_c) {
        f1 = filter_data_ch;
        f1_c2 = filter_data_ch_c2;

        if (!if_nopadding) {
          // embed the 3x3 filter in a zero-padded array so border taps
          // read zeros
          memset(pad_filter_arr, 0.f, sizeof(pad_filter_arr));
          memset(pad_filter_arr_c2, 0.f, sizeof(pad_filter_arr_c2));
          for (int i = 0; i < 9; i++) {
            int j = i / 3 * (2 * padding_w + 3) + i % 3 + padding_h * 3 +
                    padding_w * (2 * padding_h + 1);
            pad_filter_arr[j] = filter_data_ch[i];
            pad_filter_arr_c2[j] = filter_data_ch_c2[i];
          }
          pad_filter1 = pad_filter_arr;
          pad_filter1 += pad_filter_start;
          pad_filter0 = pad_filter1 - pad_filter_w;
          pad_filter2 = pad_filter1 + pad_filter_w;
          pad_filter3 = pad_filter2 + pad_filter_w;

          pad_filter1_c2 = pad_filter_arr_c2;
          pad_filter1_c2 += pad_filter_start;
          pad_filter0_c2 = pad_filter1_c2 - pad_filter_w;
          pad_filter2_c2 = pad_filter1_c2 + pad_filter_w;
          pad_filter3_c2 = pad_filter2_c2 + pad_filter_w;
        } else {
          pad_filter1 = filter_data_ch;
          pad_filter2 = pad_filter1 + 3;
          pad_filter3 = pad_filter2 + 3;

          pad_filter1_c2 = filter_data_ch_c2;
          pad_filter2_c2 = pad_filter1_c2 + 3;
          pad_filter3_c2 = pad_filter2_c2 + 3;
        }
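The index arithmetic above embeds the 3x3 filter into a zero-padded (2*padding_h + 3) x (2*padding_w + 3) array so that out-of-image taps multiply against zeros. A worked check of that mapping for padding_h == padding_w == 1, where the filter should land in the centre of a 5x5 array (positions 6..8, 11..13, 16..18); the formula is copied verbatim from the code above:

#include <cstdio>

int main() {
  const int padding_h = 1, padding_w = 1;
  for (int i = 0; i < 9; i++) {
    int j = i / 3 * (2 * padding_w + 3) + i % 3 + padding_h * 3 +
            padding_w * (2 * padding_h + 1);
    std::printf("filter[%d] -> pad_filter[%d]\n", i, j);  // 6,7,8,11,...,18
  }
}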
        float *out_ptr1, *out_ptr2;
        float *out_ptr1_c2, *out_ptr2_c2;

        out_ptr1 = output_data_ch;
        out_ptr2 = out_ptr1 + output_w;
        out_ptr1_c2 = output_data_ch_2;
        out_ptr2_c2 = out_ptr1_c2 + output_w;

        in_ptr1 = input_data_ch;
        in_ptr2 = in_ptr1 + input_w;
        in_ptr3 = in_ptr2 + input_w;
        in_ptr4 = in_ptr3 + input_w;

        int o_h = 0;
        for (; o_h < output_h - 1; o_h = o_h + 2) {
          if (!if_nopadding &&
              (o_h < padding_h || o_h > output_h - padding_h - 2)) {
            issamefilter = false;
          } else {
            issamefilter = true;
          }
          int o_w = 0;
          // pad left
          for (; o_w < padding_w; ++o_w) {
            float sum1 = 0;
            float sum2 = 0;
            float sum1_c2 = 0;
            float sum2_c2 = 0;

            if (issamefilter) {
#if __ARM_NEON
              float32x4_t _in_ptr1 = vld1q_f32(in_ptr1);
              float32x4_t _pad_filter1 = vld1q_f32(pad_filter1);
              float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2);
              float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1);
              float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2);

              float32x4_t _in_ptr2 = vld1q_f32(in_ptr2);
              float32x4_t _pad_filter2 = vld1q_f32(pad_filter2);
              float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2);
              _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2);
              _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2);
              float32x4_t _sum2 = vmulq_f32(_in_ptr2, _pad_filter1);
              float32x4_t _sum2_c2 = vmulq_f32(_in_ptr2, _pad_filter1_c2);

              float32x4_t _in_ptr3 = vld1q_f32(in_ptr3);
              float32x4_t _pad_filter3 = vld1q_f32(pad_filter3);
              float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2);
              _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3);
              _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2);
              _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2);
              _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr3, _pad_filter2_c2);

              float32x4_t _in_ptr4 = vld1q_f32(in_ptr4);
              _sum2 = vmlaq_f32(_sum2, _in_ptr4, _pad_filter3);
              _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr4, _pad_filter3_c2);

              _sum1 = vsetq_lane_f32(sum1, _sum1, 3);
              _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3);
              _sum2 = vsetq_lane_f32(sum2, _sum2, 3);
              _sum2_c2 = vsetq_lane_f32(sum2_c2, _sum2_c2, 3);

              float32x2_t _ss1 =
                  vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
              float32x2_t _ss1_2 =
                  vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2));
              float32x2_t _ss2 =
                  vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
              float32x2_t _ss2_2 =
                  vadd_f32(vget_low_f32(_sum2_c2), vget_high_f32(_sum2_c2));
              float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2);
              float32x2_t _ssss1_2_ssss2_2 = vpadd_f32(_ss1_2, _ss2_2);

              sum1 += vget_lane_f32(_ssss1_ssss2, 0);
              sum1_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 0);
              sum2 += vget_lane_f32(_ssss1_ssss2, 1);
              sum2_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 1);
#else
              // scalar fallback: 3x3 multiply-accumulate, rows 1-3 for
              // output row 1 and rows 2-4 for output row 2 (the original
              // source unrolls all 36 products; written as a loop here,
              // same behaviour)
              for (int k = 0; k < 3; ++k) {
                sum1 += in_ptr1[k] * pad_filter1[k];
                sum1 += in_ptr2[k] * pad_filter2[k];
                sum1 += in_ptr3[k] * pad_filter3[k];
                sum2 += in_ptr2[k] * pad_filter1[k];
                sum2 += in_ptr3[k] * pad_filter2[k];
                sum2 += in_ptr4[k] * pad_filter3[k];
                sum1_c2 += in_ptr1[k] * pad_filter1_c2[k];
                sum1_c2 += in_ptr2[k] * pad_filter2_c2[k];
                sum1_c2 += in_ptr3[k] * pad_filter3_c2[k];
                sum2_c2 += in_ptr2[k] * pad_filter1_c2[k];
                sum2_c2 += in_ptr3[k] * pad_filter2_c2[k];
                sum2_c2 += in_ptr4[k] * pad_filter3_c2[k];
              }
#endif
            } else {
#if __ARM_NEON
              float32x4_t _in_ptr1 = vld1q_f32(in_ptr1);
              float32x4_t _pad_filter1 = vld1q_f32(pad_filter1);
              float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2);
              float32x4_t _pad_filter0 = vld1q_f32(pad_filter0);
              float32x4_t _pad_filter0_c2 = vld1q_f32(pad_filter0_c2);

              float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1);
              float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2);
              float32x4_t _sum2 = vmulq_f32(_in_ptr1, _pad_filter0);
              float32x4_t _sum2_c2 = vmulq_f32(_in_ptr1, _pad_filter0_c2);

              float32x4_t _in_ptr2 = vld1q_f32(in_ptr2);
              float32x4_t _pad_filter2 = vld1q_f32(pad_filter2);
              float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2);
              _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2);
              _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2);
              _sum2 = vmlaq_f32(_sum2, _in_ptr2, _pad_filter1);
              _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr2, _pad_filter1_c2);

              float32x4_t _in_ptr3 = vld1q_f32(in_ptr3);
              float32x4_t _pad_filter3 = vld1q_f32(pad_filter3);
              float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2);
              _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3);
              _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2);
              _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2);
              _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr3, _pad_filter2_c2);

              _sum1 = vsetq_lane_f32(sum1, _sum1, 3);
              _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3);
              _sum2 = vsetq_lane_f32(sum2, _sum2, 3);
              _sum2_c2 = vsetq_lane_f32(sum2_c2, _sum2_c2, 3);

              float32x2_t _ss1 =
                  vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
              float32x2_t _ss1_2 =
                  vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2));
              float32x2_t _ss2 =
                  vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
              float32x2_t _ss2_2 =
                  vadd_f32(vget_low_f32(_sum2_c2), vget_high_f32(_sum2_c2));
              float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2);
              float32x2_t _ssss1_2_ssss2_2 = vpadd_f32(_ss1_2, _ss2_2);

              sum1 += vget_lane_f32(_ssss1_ssss2, 0);
              sum1_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 0);
              sum2 += vget_lane_f32(_ssss1_ssss2, 1);
              sum2_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 1);
#else
              // scalar fallback: second output row uses the shifted
              // filter rows 0-2 against input rows 1-3 (loop form of the
              // original's unrolled products, same behaviour)
              for (int k = 0; k < 3; ++k) {
                sum1 += in_ptr1[k] * pad_filter1[k];
                sum1 += in_ptr2[k] * pad_filter2[k];
                sum1 += in_ptr3[k] * pad_filter3[k];
                sum2 += in_ptr1[k] * pad_filter0[k];
                sum2 += in_ptr2[k] * pad_filter1[k];
                sum2 += in_ptr3[k] * pad_filter2[k];
                sum1_c2 += in_ptr1[k] * pad_filter1_c2[k];
                sum1_c2 += in_ptr2[k] * pad_filter2_c2[k];
                sum1_c2 += in_ptr3[k] * pad_filter3_c2[k];
                sum2_c2 += in_ptr1[k] * pad_filter0_c2[k];
                sum2_c2 += in_ptr2[k] * pad_filter1_c2[k];
                sum2_c2 += in_ptr3[k] * pad_filter2_c2[k];
              }
#endif
            }
            if (!if_nopadding &&
                (o_w < padding_w || o_w > output_w - padding_w - 2)) {
              pad_filter0--;
              pad_filter1--;
              pad_filter2--;
              pad_filter3--;
              pad_filter0_c2--;
              pad_filter1_c2--;
              pad_filter2_c2--;
              pad_filter3_c2--;
            } else {
              in_ptr1++;
              in_ptr2++;
              in_ptr3++;
              in_ptr4++;
            }
            *out_ptr1 += sum1;
            *out_ptr2 += sum2;
            *out_ptr1_c2 += sum1_c2;
            *out_ptr2_c2 += sum2_c2;

            out_ptr1++;
            out_ptr2++;
            out_ptr1_c2++;
            out_ptr2_c2++;
          }
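The vadd/vpadd tail in the blocks above reduces two four-lane accumulators to two scalar sums in one step. A small stand-alone helper showing the same reduction (NEON-only; hsum2 is an illustrative name, not from the file):

#if __ARM_NEON
#include <arm_neon.h>

// Reduce two 4-lane accumulators to their lane sums, exactly as the
// vadd_f32/vpadd_f32 sequence above does: add low and high halves,
// then one pairwise add leaves sum(a) in lane 0 and sum(b) in lane 1.
static inline float32x2_t hsum2(float32x4_t a, float32x4_t b) {
  float32x2_t sa = vadd_f32(vget_low_f32(a), vget_high_f32(a));
  float32x2_t sb = vadd_f32(vget_low_f32(b), vget_high_f32(b));
  return vpadd_f32(sa, sb);  // lane 0: sum of a, lane 1: sum of b
}
#endif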
          // valid
#if __ARM_NEON
#if __aarch64__
          if (issamefilter) {
            int loop = (output_w - 2 * padding_w) >> 2;
            o_w += loop * 4;

            if (loop > 0) {
              asm volatile(
                  "prfm pldl1keep, [%[f1], #256]          \n\t"
                  "prfm pldl1keep, [%[f1_c2], #256]       \n\t"
                  "ld1 {v0.4s, v1.4s}, [%[f1]], #32       \n\t"
                  "ld1 {v2.4s, v3.4s}, [%[f1_c2]], #32    \n\t"
                  "ld1 {v4.s}[0], [%[f1]]                 \n\t"
                  "sub %[f1],%[f1], #32                   \n\t"
                  "ld1 {v4.s}[1], [%[f1_c2]]              \n\t"
                  "sub %[f1_c2],%[f1_c2], #32             \n\t"

                  "prfm pldl1keep, [%[in_ptr1], #192]     \n\t"
                  "prfm pldl1keep, [%[in_ptr4], #192]     \n\t"
                  "ld1 {v5.4s, v6.4s}, [%[in_ptr1]]       \n\t"
                  "add %[in_ptr1],%[in_ptr1], #16         \n\t"
                  "ld1 {v6.d}[1], [%[in_ptr4]]            \n\t"
                  "add %[in_ptr4],%[in_ptr4], #8          \n\t"
                  "ld1 {v7.4s}, [%[in_ptr4]]              \n\t"
                  "add %[in_ptr4],%[in_ptr4], #8          \n\t"

                  "0:                                     \n\t"
                  // load out_ptr
                  "prfm pldl1keep, [%[out_ptr1], #128]    \n\t"
                  "prfm pldl1keep, [%[out_ptr1_c2], #128] \n\t"
                  "prfm pldl1keep, [%[out_ptr2], #128]    \n\t"
                  "prfm pldl1keep, [%[out_ptr2_c2], #128] \n\t"
                  "ld1 {v12.4s}, [%[out_ptr1]]            \n\t"
                  "ld1 {v13.4s}, [%[out_ptr1_c2]]         \n\t"
                  "ld1 {v14.4s}, [%[out_ptr2]]            \n\t"
                  "ld1 {v15.4s}, [%[out_ptr2_c2]]         \n\t"

                  // in_ptr1 and in_ptr4 multiply
                  "ext v8.16b, v5.16b, v6.16b, #4         \n\t"
                  "fmla v12.4s, v5.4s, v0.s[0]            \n\t"
                  "fmla v13.4s, v5.4s, v2.s[0]            \n\t"
                  "ext v9.16b, v6.16b, v7.16b, #8         \n\t"
                  "fmla v14.4s, v7.4s, v4.s[0]            \n\t"
                  "fmla v15.4s, v7.4s, v4.s[1]            \n\t"
                  "ext v10.16b, v5.16b, v6.16b, #8        \n\t"
                  "fmla v12.4s, v8.4s, v0.s[1]            \n\t"
                  "fmla v13.4s, v8.4s, v2.s[1]            \n\t"
                  "ext v11.16b, v6.16b, v7.16b, #12       \n\t"
                  "fmla v14.4s, v9.4s, v1.s[2]            \n\t"
                  "fmla v15.4s, v9.4s, v3.s[2]            \n\t"
                  "ld1 {v5.4s, v6.4s}, [%[in_ptr2]]       \n\t"
                  "fmla v12.4s, v10.4s, v0.s[2]           \n\t"
                  "fmla v13.4s, v10.4s, v2.s[2]           \n\t"
                  "add %[in_ptr2],%[in_ptr2], #16         \n\t"
                  "fmla v14.4s, v11.4s, v1.s[3]           \n\t"
                  "fmla v15.4s, v11.4s, v3.s[3]           \n\t"

                  // in_ptr2 multiply
                  "ext v8.16b, v5.16b, v6.16b, #4         \n\t"
                  "fmla v12.4s, v5.4s, v0.s[3]            \n\t"
                  "fmla v13.4s, v5.4s, v2.s[3]            \n\t"
                  "fmla v14.4s, v5.4s, v0.s[0]            \n\t"
                  "fmla v15.4s, v5.4s, v2.s[0]            \n\t"
                  "ext v9.16b, v5.16b, v6.16b, #8         \n\t"
                  "fmla v12.4s, v8.4s, v1.s[0]            \n\t"
                  "fmla v13.4s, v8.4s, v3.s[0]            \n\t"
                  "ld1 {v6.d}[1], [%[in_ptr3]]            \n\t"
                  "add %[in_ptr3],%[in_ptr3], #8          \n\t"
                  "fmla v14.4s, v8.4s, v0.s[1]            \n\t"
                  "fmla v15.4s, v8.4s, v2.s[1]            \n\t"
                  "ld1 {v7.4s}, [%[in_ptr3]]              \n\t"
                  "add %[in_ptr3],%[in_ptr3], #8          \n\t"
                  "fmla v12.4s, v9.4s, v1.s[1]            \n\t"
                  "fmla v13.4s, v9.4s, v3.s[1]            \n\t"
                  "ext v10.16b, v6.16b, v7.16b, #8        \n\t"
                  "fmla v14.4s, v9.4s, v0.s[2]            \n\t"
                  "fmla v15.4s, v9.4s, v2.s[2]            \n\t"

                  // in_ptr3 multiply
                  "fmla v12.4s, v7.4s, v4.s[0]            \n\t"
                  "fmla v13.4s, v7.4s, v4.s[1]            \n\t"
                  "ext v11.16b, v6.16b, v7.16b, #12       \n\t"
                  "fmla v14.4s, v7.4s, v1.s[1]            \n\t"
                  "fmla v15.4s, v7.4s, v3.s[1]            \n\t"
                  "fmla v12.4s, v10.4s, v1.s[2]           \n\t"
                  "fmla v13.4s, v10.4s, v3.s[2]           \n\t"
                  "fmla v14.4s, v10.4s, v0.s[3]           \n\t"
                  "fmla v15.4s, v10.4s, v2.s[3]           \n\t"
                  "fmla v12.4s, v11.4s, v1.s[3]           \n\t"
                  "fmla v13.4s, v11.4s, v3.s[3]           \n\t"
                  "prfm pldl1keep, [%[in_ptr1], #192]     \n\t"
                  "fmla v14.4s, v11.4s, v1.s[0]           \n\t"
                  "fmla v15.4s, v11.4s, v3.s[0]           \n\t"

                  // store out_ptr
                  "prfm pldl1keep, [%[in_ptr4], #192]     \n\t"
                  "ld1 {v5.4s, v6.4s}, [%[in_ptr1]]       \n\t"
                  "add %[in_ptr1],%[in_ptr1], #16         \n\t"
                  "st1 {v12.4s}, [%[out_ptr1]], #16       \n\t"
                  "ld1 {v6.d}[1], [%[in_ptr4]]            \n\t"
                  "add %[in_ptr4],%[in_ptr4], #8          \n\t"
                  "st1 {v13.4s}, [%[out_ptr1_c2]], #16    \n\t"
                  "ld1 {v7.4s}, [%[in_ptr4]]              \n\t"
                  "add %[in_ptr4],%[in_ptr4], #8          \n\t"
                  "st1 {v14.4s}, [%[out_ptr2]], #16       \n\t"
                  "subs %[loop],%[loop], #1               \n\t"
                  "st1 {v15.4s}, [%[out_ptr2_c2]], #16    \n\t"

                  // cycle
                  "bne 0b                                 \n\t"
                  "sub %[in_ptr1],%[in_ptr1], #16         \n\t"
                  "sub %[in_ptr4],%[in_ptr4], #16         \n\t"

                  : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1),
                    [out_ptr2] "+r"(out_ptr2), [out_ptr1_c2] "+r"(out_ptr1_c2),
                    [out_ptr2_c2] "+r"(out_ptr2_c2), [in_ptr1] "+r"(in_ptr1),
                    [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3),
                    [in_ptr4] "+r"(in_ptr4)
                  : [f1] "r"(f1), [f1_c2] "r"(f1_c2)
                  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
                    "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
                    "v15");
            }
          }
          if (!if_nopadding && o_w == output_w - padding_w) {
            pad_filter0--;
            pad_filter1--;
            pad_filter2--;
            pad_filter3--;
            pad_filter0_c2--;
            pad_filter1_c2--;
            pad_filter2_c2--;
            pad_filter3_c2--;

            in_ptr1--;
            in_ptr2--;
            in_ptr3--;
            in_ptr4--;
          }
#else
          if (issamefilter) {
            int loop = (output_w - 2 * padding_w) >> 2;
            o_w += loop * 4;

            if (loop > 0) {
              asm volatile(
                  "pld [%[f1], #256]                      \n\t"
                  "pld [%[f1_c2], #256]                   \n\t"
                  "vld1.f32 {d0-d3}, [%[f1]]              \n\t"
                  "add %[f1], #32                         \n\t"
                  "vld1.f32 {d4-d7}, [%[f1_c2]]           \n\t"
                  "add %[f1_c2], #32                      \n\t"
                  "vld1.f32 {d8[0]}, [%[f1]]              \n\t"
                  "sub %[f1], #32                         \n\t"
                  "vld1.f32 {d8[1]}, [%[f1_c2]]           \n\t"
                  "sub %[f1_c2], #32                      \n\t"

                  "pld [%[in_ptr1], #192]                 \n\t"
                  "pld [%[in_ptr4], #192]                 \n\t"
                  "vld1.f32 {d10-d12}, [%[in_ptr1]]       \n\t"
                  "add %[in_ptr1], #16                    \n\t"
                  "vld1.f32 {d13-d15}, [%[in_ptr4]]       \n\t"
                  "add %[in_ptr4], #16                    \n\t"

                  "0:                                     \n\t"
                  // load out_ptr
                  "pld [%[out_ptr1], #128]                \n\t"
                  "pld [%[out_ptr1_c2], #128]             \n\t"
                  "pld [%[out_ptr2], #128]                \n\t"
                  "pld [%[out_ptr2_c2], #128]             \n\t"
                  "vld1.f32 {d24, d25}, [%[out_ptr1]]     \n\t"
                  "vld1.f32 {d26, d27}, [%[out_ptr1_c2]]  \n\t"
                  "vld1.f32 {d28, d29}, [%[out_ptr2]]     \n\t"
                  "vld1.f32 {d30, d31}, [%[out_ptr2_c2]]  \n\t"

                  // in_ptr1 + in_ptr4 multiply
                  "vext.32 q8, q5, q6, #1                 \n\t"
                  "vmla.f32 q12, q5, d0[0]                \n\t"
                  "vmla.f32 q13, q5, d4[0]                \n\t"
                  "vext.32 q9, q6, q7, #2                 \n\t"
                  "vmla.f32 q14, q7, d8[0]                \n\t"
                  "vmla.f32 q15, q7, d8[1]                \n\t"
                  "vext.32 q10, q5, q6, #2                \n\t"
                  "vmla.f32 q12, q8, d0[1]                \n\t"
                  "vmla.f32 q13, q8, d4[1]                \n\t"
                  "vext.32 q11, q6, q7, #3                \n\t"
                  "vmla.f32 q14, q9, d3[0]                \n\t"
                  "vmla.f32 q15, q9, d7[0]                \n\t"
                  "vld1.f32 {d10-d12}, [%[in_ptr2]]       \n\t"
                  "add %[in_ptr2], #16                    \n\t"
                  "vmla.f32 q12, q10, d1[0]               \n\t"
                  "vmla.f32 q13, q10, d5[0]               \n\t"
                  "vmla.f32 q14, q11, d3[1]               \n\t"
                  "vmla.f32 q15, q11, d7[1]               \n\t"

                  // in_ptr2 multiply
                  "vext.32 q8, q5, q6, #1                 \n\t"
                  "vmla.f32 q12, q5, d1[1]                \n\t"
                  "vmla.f32 q13, q5, d5[1]                \n\t"
                  "vmla.f32 q14, q5, d0[0]                \n\t"
                  "vmla.f32 q15, q5, d4[0]                \n\t"
                  "vext.32 q9, q5, q6, #2                 \n\t"
                  "vmla.f32 q12, q8, d2[0]                \n\t"
                  "vmla.f32 q13, q8, d6[0]                \n\t"
                  "vld1.f32 {d13-d15}, [%[in_ptr3]]       \n\t"
                  "add %[in_ptr3], #16                    \n\t"
                  "vmla.f32 q14, q8, d0[1]                \n\t"
                  "vmla.f32 q15, q8, d4[1]                \n\t"
                  "vmla.f32 q12, q9, d2[1]                \n\t"
                  "vmla.f32 q13, q9, d6[1]                \n\t"
                  "vmla.f32 q14, q9, d1[0]                \n\t"
                  "vmla.f32 q15, q9, d5[0]                \n\t"

                  // in_ptr3 multiply
                  "vext.32 q10, q6, q7, #2                \n\t"
                  "vmla.f32 q12, q7, d8[0]                \n\t"
                  "vmla.f32 q13, q7, d8[1]                \n\t"
                  "vmla.f32 q14, q7, d2[1]                \n\t"
                  "vmla.f32 q15, q7, d6[1]                \n\t"
                  "vext.32 q11, q6, q7, #3                \n\t"
                  "vmla.f32 q12, q10, d3[0]               \n\t"
                  "vmla.f32 q13, q10, d7[0]               \n\t"
                  "vmla.f32 q14, q10, d1[1]               \n\t"
                  "vmla.f32 q15, q10, d5[1]               \n\t"
                  "vmla.f32 q12, q11, d3[1]               \n\t"
                  "vmla.f32 q13, q11, d7[1]               \n\t"
                  "vmla.f32 q14, q11, d2[0]               \n\t"
                  "vmla.f32 q15, q11, d6[0]               \n\t"

                  // store out_ptr
                  "pld [%[in_ptr1], #192]                 \n\t"
                  "pld [%[in_ptr4], #192]                 \n\t"
                  "vld1.f32 {d10-d12}, [%[in_ptr1]]       \n\t"
                  "add %[in_ptr1], #16                    \n\t"
                  "vst1.f32 {d24, d25}, [%[out_ptr1]]!    \n\t"
                  "vst1.f32 {d26, d27}, [%[out_ptr1_c2]]! \n\t"
                  "vld1.f32 {d13-d15}, [%[in_ptr4]]       \n\t"
                  "add %[in_ptr4], #16                    \n\t"
                  "vst1.f32 {d28, d29}, [%[out_ptr2]]!    \n\t"
                  "subs %[loop], #1                       \n\t"
                  "vst1.f32 {d30, d31}, [%[out_ptr2_c2]]! \n\t"

                  // cycle
                  "bne 0b                                 \n\t"
                  "sub %[in_ptr1], #16                    \n\t"
                  "sub %[in_ptr4], #16                    \n\t"

                  : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1),
                    [out_ptr2] "+r"(out_ptr2), [out_ptr1_c2] "+r"(out_ptr1_c2),
                    [out_ptr2_c2] "+r"(out_ptr2_c2), [in_ptr1] "+r"(in_ptr1),
                    [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3),
                    [in_ptr4] "+r"(in_ptr4)
                  : [f1] "r"(f1), [f1_c2] "r"(f1_c2)
                  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
                    "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
                    "q15");
            }
          }
          if (!if_nopadding && o_w == output_w - padding_w) {
            pad_filter0--;
            pad_filter1--;
            pad_filter2--;
            pad_filter3--;
            pad_filter0_c2--;
            pad_filter1_c2--;
            pad_filter2_c2--;
            pad_filter3_c2--;

            in_ptr1--;
            in_ptr2--;
            in_ptr3--;
            in_ptr4--;
          }
#endif  // __aarch64__
#endif  // __ARM_NEON
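Each ext/fmla (or vext/vmla) pair in the assembly above multiplies a shifted four-wide input window by one broadcast filter tap. The same step written with intrinsics, as a sketch (mac_shifted is an illustrative name; the asm keeps everything in registers across the whole loop, which is the point of writing it by hand):

#if __ARM_NEON
#include <arm_neon.h>

// One asm step, "ext v8, v5, v6, #4; fmla v12, v8, v0.s[1]", as
// intrinsics: shift the window one float to the right, then
// multiply-accumulate by filter lane 1.
static inline float32x4_t mac_shifted(float32x4_t acc, float32x4_t lo,
                                      float32x4_t hi, float32x4_t filt) {
  float32x4_t shifted = vextq_f32(lo, hi, 1);  // elements 1..4 of the window
  return vmlaq_lane_f32(acc, shifted, vget_low_f32(filt), 1);
}
#endif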
// remain output_width
for
(;
o_w
<
output_w
;
++
o_w
)
{
float
sum1
=
0
;
float
sum2
=
0
;
float
sum1_c2
=
0
;
float
sum2_c2
=
0
;
if
(
issamefilter
)
{
#if __ARM_NEON
float32x4_t
_in_ptr1
=
vld1q_f32
(
in_ptr1
);
float32x4_t
_pad_filter1
=
vld1q_f32
(
pad_filter1
);
float32x4_t
_pad_filter1_c2
=
vld1q_f32
(
pad_filter1_c2
);
float32x4_t
_sum1
=
vmulq_f32
(
_in_ptr1
,
_pad_filter1
);
float32x4_t
_sum1_c2
=
vmulq_f32
(
_in_ptr1
,
_pad_filter1_c2
);
float32x4_t
_in_ptr2
=
vld1q_f32
(
in_ptr2
);
float32x4_t
_pad_filter2
=
vld1q_f32
(
pad_filter2
);
float32x4_t
_pad_filter2_c2
=
vld1q_f32
(
pad_filter2_c2
);
_sum1
=
vmlaq_f32
(
_sum1
,
_in_ptr2
,
_pad_filter2
);
_sum1_c2
=
vmlaq_f32
(
_sum1_c2
,
_in_ptr2
,
_pad_filter2_c2
);
float32x4_t
_sum2
=
vmulq_f32
(
_in_ptr2
,
_pad_filter1
);
float32x4_t
_sum2_c2
=
vmulq_f32
(
_in_ptr2
,
_pad_filter1_c2
);
float32x4_t
_in_ptr3
=
vld1q_f32
(
in_ptr3
);
float32x4_t
_pad_filter3
=
vld1q_f32
(
pad_filter3
);
float32x4_t
_pad_filter3_c2
=
vld1q_f32
(
pad_filter3_c2
);
_sum1
=
vmlaq_f32
(
_sum1
,
_in_ptr3
,
_pad_filter3
);
_sum1_c2
=
vmlaq_f32
(
_sum1_c2
,
_in_ptr3
,
_pad_filter3_c2
);
_sum2
=
vmlaq_f32
(
_sum2
,
_in_ptr3
,
_pad_filter2
);
_sum2_c2
=
vmlaq_f32
(
_sum2_c2
,
_in_ptr3
,
_pad_filter2_c2
);
float32x4_t
_in_ptr4
=
vld1q_f32
(
in_ptr4
);
_sum2
=
vmlaq_f32
(
_sum2
,
_in_ptr4
,
_pad_filter3
);
_sum2_c2
=
vmlaq_f32
(
_sum2_c2
,
_in_ptr4
,
_pad_filter3_c2
);
_sum1
=
vsetq_lane_f32
(
sum1
,
_sum1
,
3
);
_sum1_c2
=
vsetq_lane_f32
(
sum1_c2
,
_sum1_c2
,
3
);
_sum2
=
vsetq_lane_f32
(
sum2
,
_sum2
,
3
);
_sum2_c2
=
vsetq_lane_f32
(
sum2_c2
,
_sum2_c2
,
3
);
float32x2_t
_ss1
=
vadd_f32
(
vget_low_f32
(
_sum1
),
vget_high_f32
(
_sum1
));
float32x2_t
_ss1_2
=
vadd_f32
(
vget_low_f32
(
_sum1_c2
),
vget_high_f32
(
_sum1_c2
));
float32x2_t
_ss2
=
vadd_f32
(
vget_low_f32
(
_sum2
),
vget_high_f32
(
_sum2
));
float32x2_t
_ss2_2
=
vadd_f32
(
vget_low_f32
(
_sum2_c2
),
vget_high_f32
(
_sum2_c2
));
float32x2_t
_ssss1_ssss2
=
vpadd_f32
(
_ss1
,
_ss2
);
float32x2_t
_ssss1_2_ssss2_2
=
vpadd_f32
(
_ss1_2
,
_ss2_2
);
sum1
+=
vget_lane_f32
(
_ssss1_ssss2
,
0
);
sum1_c2
+=
vget_lane_f32
(
_ssss1_2_ssss2_2
,
0
);
sum2
+=
vget_lane_f32
(
_ssss1_ssss2
,
1
);
sum2_c2
+=
vget_lane_f32
(
_ssss1_2_ssss2_2
,
1
);
#else
sum1
+=
in_ptr1
[
0
]
*
pad_filter1
[
0
];
sum1
+=
in_ptr1
[
1
]
*
pad_filter1
[
1
];
sum1
+=
in_ptr1
[
2
]
*
pad_filter1
[
2
];
sum1
+=
in_ptr2
[
0
]
*
pad_filter2
[
0
];
sum1
+=
in_ptr2
[
1
]
*
pad_filter2
[
1
];
sum1
+=
in_ptr2
[
2
]
*
pad_filter2
[
2
];
sum1
+=
in_ptr3
[
0
]
*
pad_filter3
[
0
];
sum1
+=
in_ptr3
[
1
]
*
pad_filter3
[
1
];
sum1
+=
in_ptr3
[
2
]
*
pad_filter3
[
2
];
sum2
+=
in_ptr2
[
0
]
*
pad_filter1
[
0
];
sum2
+=
in_ptr2
[
1
]
*
pad_filter1
[
1
];
              sum2 += in_ptr2[2] * pad_filter1[2];
              sum2 += in_ptr3[0] * pad_filter2[0];
              sum2 += in_ptr3[1] * pad_filter2[1];
              sum2 += in_ptr3[2] * pad_filter2[2];
              sum2 += in_ptr4[0] * pad_filter3[0];
              sum2 += in_ptr4[1] * pad_filter3[1];
              sum2 += in_ptr4[2] * pad_filter3[2];

              sum1_c2 += in_ptr1[0] * pad_filter1_c2[0];
              sum1_c2 += in_ptr1[1] * pad_filter1_c2[1];
              sum1_c2 += in_ptr1[2] * pad_filter1_c2[2];
              sum1_c2 += in_ptr2[0] * pad_filter2_c2[0];
              sum1_c2 += in_ptr2[1] * pad_filter2_c2[1];
              sum1_c2 += in_ptr2[2] * pad_filter2_c2[2];
              sum1_c2 += in_ptr3[0] * pad_filter3_c2[0];
              sum1_c2 += in_ptr3[1] * pad_filter3_c2[1];
              sum1_c2 += in_ptr3[2] * pad_filter3_c2[2];

              sum2_c2 += in_ptr2[0] * pad_filter1_c2[0];
              sum2_c2 += in_ptr2[1] * pad_filter1_c2[1];
              sum2_c2 += in_ptr2[2] * pad_filter1_c2[2];
              sum2_c2 += in_ptr3[0] * pad_filter2_c2[0];
              sum2_c2 += in_ptr3[1] * pad_filter2_c2[1];
              sum2_c2 += in_ptr3[2] * pad_filter2_c2[2];
              sum2_c2 += in_ptr4[0] * pad_filter3_c2[0];
              sum2_c2 += in_ptr4[1] * pad_filter3_c2[1];
              sum2_c2 += in_ptr4[2] * pad_filter3_c2[2];
#endif
            } else {
#if __ARM_NEON
              float32x4_t _in_ptr1 = vld1q_f32(in_ptr1);
              float32x4_t _pad_filter1 = vld1q_f32(pad_filter1);
              float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2);
              float32x4_t _pad_filter0 = vld1q_f32(pad_filter0);
              float32x4_t _pad_filter0_c2 = vld1q_f32(pad_filter0_c2);

              float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1);
              float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2);
              float32x4_t _sum2 = vmulq_f32(_in_ptr1, _pad_filter0);
              float32x4_t _sum2_c2 = vmulq_f32(_in_ptr1, _pad_filter0_c2);

              float32x4_t _in_ptr2 = vld1q_f32(in_ptr2);
              float32x4_t _pad_filter2 = vld1q_f32(pad_filter2);
              float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2);

              _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2);
              _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2);
              _sum2 = vmlaq_f32(_sum2, _in_ptr2, _pad_filter1);
              _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr2, _pad_filter1_c2);

              float32x4_t _in_ptr3 = vld1q_f32(in_ptr3);
              float32x4_t _pad_filter3 = vld1q_f32(pad_filter3);
              float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2);

              _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3);
              _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2);
              _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2);
              _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr3, _pad_filter2_c2);

              _sum1 = vsetq_lane_f32(sum1, _sum1, 3);
              _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3);
              _sum2 = vsetq_lane_f32(sum2, _sum2, 3);
              _sum2_c2 = vsetq_lane_f32(sum2_c2, _sum2_c2, 3);

              float32x2_t _ss1 =
                  vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
              float32x2_t _ss1_2 =
                  vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2));
              float32x2_t _ss2 =
                  vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
              float32x2_t _ss2_2 =
                  vadd_f32(vget_low_f32(_sum2_c2), vget_high_f32(_sum2_c2));
              float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2);
              float32x2_t _ssss1_2_ssss2_2 = vpadd_f32(_ss1_2, _ss2_2);

              sum1 += vget_lane_f32(_ssss1_ssss2, 0);
              sum1_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 0);
              sum2 += vget_lane_f32(_ssss1_ssss2, 1);
              sum2_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 1);
#else
              sum1 += in_ptr1[0] * pad_filter1[0];
              sum1 += in_ptr1[1] * pad_filter1[1];
              sum1 += in_ptr1[2] * pad_filter1[2];
              sum1 += in_ptr2[0] * pad_filter2[0];
              sum1 += in_ptr2[1] * pad_filter2[1];
              sum1 += in_ptr2[2] * pad_filter2[2];
              sum1 += in_ptr3[0] * pad_filter3[0];
              sum1 += in_ptr3[1] * pad_filter3[1];
              sum1 += in_ptr3[2] * pad_filter3[2];

              sum2 += in_ptr1[0] * pad_filter0[0];
              sum2 += in_ptr1[1] * pad_filter0[1];
              sum2 += in_ptr1[2] * pad_filter0[2];
              sum2 += in_ptr2[0] * pad_filter1[0];
              sum2 += in_ptr2[1] * pad_filter1[1];
              sum2 += in_ptr2[2] * pad_filter1[2];
              sum2 += in_ptr3[0] * pad_filter2[0];
              sum2 += in_ptr3[1] * pad_filter2[1];
              sum2 += in_ptr3[2] * pad_filter2[2];

              sum1_c2 += in_ptr1[0] * pad_filter1_c2[0];
              sum1_c2 += in_ptr1[1] * pad_filter1_c2[1];
              sum1_c2 += in_ptr1[2] * pad_filter1_c2[2];
              sum1_c2 += in_ptr2[0] * pad_filter2_c2[0];
              sum1_c2 += in_ptr2[1] * pad_filter2_c2[1];
              sum1_c2 += in_ptr2[2] * pad_filter2_c2[2];
              sum1_c2 += in_ptr3[0] * pad_filter3_c2[0];
              sum1_c2 += in_ptr3[1] * pad_filter3_c2[1];
              sum1_c2 += in_ptr3[2] * pad_filter3_c2[2];

              sum2_c2 += in_ptr1[0] * pad_filter0_c2[0];
              sum2_c2 += in_ptr1[1] * pad_filter0_c2[1];
              sum2_c2 += in_ptr1[2] * pad_filter0_c2[2];
              sum2_c2 += in_ptr2[0] * pad_filter1_c2[0];
              sum2_c2 += in_ptr2[1] * pad_filter1_c2[1];
              sum2_c2 += in_ptr2[2] * pad_filter1_c2[2];
              sum2_c2 += in_ptr3[0] * pad_filter2_c2[0];
              sum2_c2 += in_ptr3[1] * pad_filter2_c2[1];
              sum2_c2 += in_ptr3[2] * pad_filter2_c2[2];
#endif
            }
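            // In the horizontal padding region the filter, not the input,
            // slides: decrementing the pad_filter pointers below moves the
            // 3x3 window one column inside the zero-padded filter copy,
            // which is effectively the same as convolving zero-padded input.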
            if (!if_nopadding &&
                (o_w < padding_w || o_w > output_w - padding_w - 2)) {
              pad_filter0--;
              pad_filter1--;
              pad_filter2--;
              pad_filter3--;

              pad_filter0_c2--;
              pad_filter1_c2--;
              pad_filter2_c2--;
              pad_filter3_c2--;
            } else {
              in_ptr1++;
              in_ptr2++;
              in_ptr3++;
              in_ptr4++;
            }
            *out_ptr1 += sum1;
            *out_ptr2 += sum2;
            *out_ptr1_c2 += sum1_c2;
            *out_ptr2_c2 += sum2_c2;

            out_ptr1++;
            out_ptr2++;
            out_ptr1_c2++;
            out_ptr2_c2++;
          }
          if (if_nopadding) {
            in_ptr1 += 2 + input_w;
            in_ptr2 += 2 + input_w;
            in_ptr3 += 2 + input_w;
            in_ptr4 += 2 + input_w;
          } else if (o_h == padding_h - 1 ||
                     o_h == output_h - padding_h - 2) {
            in_ptr1 += 3;
            in_ptr2 += 3;
            in_ptr3 += 3;
            in_ptr4 += 3;

            pad_filter0 -= 2;
            pad_filter1 -= 2;
            pad_filter2 -= 2;
            pad_filter3 -= 2;

            pad_filter0_c2 -= 2;
            pad_filter1_c2 -= 2;
            pad_filter2_c2 -= 2;
            pad_filter3_c2 -= 2;
          } else if (issamefilter) {
            in_ptr1 += 3 + input_w;
            in_ptr2 += 3 + input_w;
            in_ptr3 += 3 + input_w;
            in_ptr4 += 3 + input_w;

            pad_filter0 += 2 * padding_w + 1;
            pad_filter1 += 2 * padding_w + 1;
            pad_filter2 += 2 * padding_w + 1;
            pad_filter3 += 2 * padding_w + 1;

            pad_filter0_c2 += 2 * padding_w + 1;
            pad_filter1_c2 += 2 * padding_w + 1;
            pad_filter2_c2 += 2 * padding_w + 1;
            pad_filter3_c2 += 2 * padding_w + 1;
          } else {
            pad_filter0 -= 3 + 2 * padding_w + 2;
            pad_filter1 -= 3 + 2 * padding_w + 2;
            pad_filter2 -= 3 + 2 * padding_w + 2;
            pad_filter3 -= 3 + 2 * padding_w + 2;

            pad_filter0_c2 -= 3 + 2 * padding_w + 2;
            pad_filter1_c2 -= 3 + 2 * padding_w + 2;
            pad_filter2_c2 -= 3 + 2 * padding_w + 2;
            pad_filter3_c2 -= 3 + 2 * padding_w + 2;

            in_ptr1 -= input_w - 3;
            in_ptr2 -= input_w - 3;
            in_ptr3 -= input_w - 3;
            in_ptr4 -= input_w - 3;
          }
          out_ptr1 += output_w;
          out_ptr2 += output_w;
          out_ptr1_c2 += output_w;
          out_ptr2_c2 += output_w;
        }
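        // The loop above produces two output rows per iteration; when
        // output_h is odd the last row is handled below by a single-row
        // (sum1-only) copy of the same computation.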
        // remain output_height
        for (; o_h < output_h; ++o_h) {
          int o_w = 0;
          // pad left
          for (; o_w < padding_w; ++o_w) {
            float sum1 = 0;
            float sum1_c2 = 0;
#if __ARM_NEON
            float32x4_t _in_ptr1 = vld1q_f32(in_ptr1);
            float32x4_t _pad_filter1 = vld1q_f32(pad_filter1);
            float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2);

            float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1);
            float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2);

            float32x4_t _in_ptr2 = vld1q_f32(in_ptr2);
            float32x4_t _pad_filter2 = vld1q_f32(pad_filter2);
            float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2);

            _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2);
            _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2);

            float32x4_t _in_ptr3 = vld1q_f32(in_ptr3);
            float32x4_t _pad_filter3 = vld1q_f32(pad_filter3);
            float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2);

            _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3);
            _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2);

            _sum1 = vsetq_lane_f32(sum1, _sum1, 3);
            _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3);

            float32x2_t _ss1 =
                vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
            float32x2_t _ss1_2 =
                vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2));
            float32x2_t _ssss1_ssss1_2 = vpadd_f32(_ss1, _ss1_2);

            sum1 += vget_lane_f32(_ssss1_ssss1_2, 0);
            sum1_c2 += vget_lane_f32(_ssss1_ssss1_2, 1);
#else
            sum1 += in_ptr1[0] * pad_filter1[0];
            sum1 += in_ptr1[1] * pad_filter1[1];
            sum1 += in_ptr1[2] * pad_filter1[2];
            sum1 += in_ptr2[0] * pad_filter2[0];
            sum1 += in_ptr2[1] * pad_filter2[1];
            sum1 += in_ptr2[2] * pad_filter2[2];
            sum1 += in_ptr3[0] * pad_filter3[0];
            sum1 += in_ptr3[1] * pad_filter3[1];
            sum1 += in_ptr3[2] * pad_filter3[2];

            sum1_c2 += in_ptr1[0] * pad_filter1_c2[0];
            sum1_c2 += in_ptr1[1] * pad_filter1_c2[1];
            sum1_c2 += in_ptr1[2] * pad_filter1_c2[2];
            sum1_c2 += in_ptr2[0] * pad_filter2_c2[0];
            sum1_c2 += in_ptr2[1] * pad_filter2_c2[1];
            sum1_c2 += in_ptr2[2] * pad_filter2_c2[2];
            sum1_c2 += in_ptr3[0] * pad_filter3_c2[0];
            sum1_c2 += in_ptr3[1] * pad_filter3_c2[1];
            sum1_c2 += in_ptr3[2] * pad_filter3_c2[2];
#endif
            if (!if_nopadding &&
                (o_w < padding_w || o_w > output_w - padding_w - 2)) {
              pad_filter0--;
              pad_filter1--;
              pad_filter2--;
              pad_filter3--;

              pad_filter0_c2--;
              pad_filter1_c2--;
              pad_filter2_c2--;
              pad_filter3_c2--;
            } else {
              in_ptr1++;
              in_ptr2++;
              in_ptr3++;
              in_ptr4++;
            }
            *out_ptr1 += sum1;
            *out_ptr1_c2 += sum1_c2;

            out_ptr1++;
            out_ptr1_c2++;
          }
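          // The inline assembly below computes four adjacent outputs per
          // iteration for two output channels at once: v0/v1 (q0/q1 on
          // armv7) hold the first eight filter taps, v4 (d8) the ninth,
          // and ext builds the shifted input windows for the three filter
          // columns.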
          // valid
#if __ARM_NEON
#if __aarch64__
          if (if_nopadding) {
            int loop = (output_w - 2 * padding_w) >> 2;
            o_w += loop * 4;

            if (loop > 0) {
              asm volatile(
                  "prfm  pldl1keep, [%[f1], #256]           \n\t"
                  "prfm  pldl1keep, [%[f1_c2], #256]        \n\t"

                  "ld1  {v0.4s, v1.4s}, [%[f1]]             \n\t"
                  "add  %[f1], %[f1], #32                   \n\t"
                  "ld1  {v2.4s, v3.4s}, [%[f1_c2]]          \n\t"
                  "add  %[f1_c2], %[f1_c2], #32             \n\t"

                  "ld1  {v4.s}[0], [%[f1]]                  \n\t"
                  "sub  %[f1],%[f1], #32                    \n\t"
                  "ld1  {v4.s}[1], [%[f1_c2]]               \n\t"
                  "sub  %[f1_c2],%[f1_c2], #32              \n\t"

                  "0:                                       \n\t"
                  // load out_ptr
                  "prfm  pldl1keep, [%[out_ptr1], #128]     \n\t"
                  "prfm  pldl1keep, [%[out_ptr1_c2], #128]  \n\t"
                  "ld1  {v12.4s}, [%[out_ptr1]]             \n\t"
                  "ld1  {v13.4s}, [%[out_ptr1_c2]]          \n\t"

                  // in_ptr1 multiply
                  "prfm  pldl1keep, [%[in_ptr1], #192]      \n\t"
                  "ld1  {v5.4s, v6.4s}, [%[in_ptr1]]        \n\t"
                  "add  %[in_ptr1],%[in_ptr1], #16          \n\t"

                  "ext  v8.16b, v5.16b, v6.16b, #4          \n\t"
                  "fmla  v12.4s, v5.4s, v0.s[0]             \n\t"
                  "fmla  v13.4s, v5.4s, v2.s[0]             \n\t"

                  "ext  v10.16b, v5.16b, v6.16b, #8         \n\t"
                  "fmla  v12.4s, v8.4s, v0.s[1]             \n\t"
                  "fmla  v13.4s, v8.4s, v2.s[1]             \n\t"

                  "ld1  {v5.4s, v6.4s}, [%[in_ptr2]]        \n\t"
                  "add  %[in_ptr2],%[in_ptr2], #16          \n\t"
                  "fmla  v12.4s, v10.4s, v0.s[2]            \n\t"
                  "fmla  v13.4s, v10.4s, v2.s[2]            \n\t"

                  // in_ptr2 multiply
                  "ext  v8.16b, v5.16b, v6.16b, #4          \n\t"
                  "fmla  v12.4s, v5.4s, v0.s[3]             \n\t"
                  "fmla  v13.4s, v5.4s, v2.s[3]             \n\t"

                  "ext  v9.16b, v5.16b, v6.16b, #8          \n\t"
                  "fmla  v12.4s, v8.4s, v1.s[0]             \n\t"
                  "fmla  v13.4s, v8.4s, v3.s[0]             \n\t"

                  "ld1  {v6.d}[1], [%[in_ptr3]]             \n\t"
                  "add  %[in_ptr3],%[in_ptr3], #8           \n\t"
                  "ld1  {v7.4s}, [%[in_ptr3]]               \n\t"
                  "add  %[in_ptr3],%[in_ptr3], #8           \n\t"
                  "fmla  v12.4s, v9.4s, v1.s[1]             \n\t"
                  "fmla  v13.4s, v9.4s, v3.s[1]             \n\t"

                  // in_ptr3 multiply
                  "ext  v10.16b, v6.16b, v7.16b, #8         \n\t"
                  "fmla  v12.4s, v7.4s, v4.s[0]             \n\t"
                  "fmla  v13.4s, v7.4s, v4.s[1]             \n\t"

                  "ext  v11.16b, v6.16b, v7.16b, #12        \n\t"
                  "fmla  v12.4s, v10.4s, v1.s[2]            \n\t"
                  "fmla  v13.4s, v10.4s, v3.s[2]            \n\t"

                  "fmla  v12.4s, v11.4s, v1.s[3]            \n\t"
                  "fmla  v13.4s, v11.4s, v3.s[3]            \n\t"

                  // store out_ptr
                  "st1  {v12.4s}, [%[out_ptr1]], #16        \n\t"
                  "st1  {v13.4s}, [%[out_ptr1_c2]], #16     \n\t"

                  // cycle
                  "subs  %[loop],%[loop], #1                \n\t"
                  "bne  0b                                  \n\t"

                  : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1),
                    [out_ptr2] "+r"(out_ptr2), [out_ptr1_c2] "+r"(out_ptr1_c2),
                    [out_ptr2_c2] "+r"(out_ptr2_c2), [in_ptr1] "+r"(in_ptr1),
                    [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3),
                    [in_ptr4] "+r"(in_ptr4)
                  : [f1] "r"(f1), [f1_c2] "r"(f1_c2)
                  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
                    "v7", "v8", "v9", "v10", "v11", "v12", "v13");
            }
          }
#else
          if (if_nopadding) {
            int loop = (output_w - 2 * padding_w) >> 2;
            o_w += loop * 4;

            if (loop > 0) {
              asm volatile(
                  "pld  [%[f1], #256]                       \n\t"
                  "pld  [%[f1_c2], #256]                    \n\t"

                  "vld1.f32  {d0-d3}, [%[f1]]               \n\t"
                  "add  %[f1], #32                          \n\t"
                  "vld1.f32  {d4-d7}, [%[f1_c2]]            \n\t"
                  "add  %[f1_c2], #32                       \n\t"

                  "vld1.f32  {d8[0]}, [%[f1]]               \n\t"
                  "sub  %[f1], #32                          \n\t"
                  "vld1.f32  {d8[1]}, [%[f1_c2]]            \n\t"
                  "sub  %[f1_c2], #32                       \n\t"

                  "0:                                       \n\t"
                  // load out_ptr
                  "pld  [%[out_ptr1], #128]                 \n\t"
                  "pld  [%[out_ptr1_c2], #128]              \n\t"
                  "vld1.f32  {d24, d25}, [%[out_ptr1]]      \n\t"
                  "vld1.f32  {d26, d27}, [%[out_ptr1_c2]]   \n\t"

                  // in_ptr1 multiply
                  "pld  [%[in_ptr1], #128]                  \n\t"
                  "vld1.f32  {d10-d12}, [%[in_ptr1]]        \n\t"
                  "add  %[in_ptr1], #16                     \n\t"

                  "vext.32  q8, q5, q6, #1                  \n\t"
                  "pld  [%[in_ptr2], #128]                  \n\t"
                  "vmla.f32  q12, q5, d0[0]                 \n\t"
                  "vmla.f32  q13, q5, d4[0]                 \n\t"

                  "vext.32  q10, q5, q6, #2                 \n\t"
                  "vld1.f32  {d10-d12}, [%[in_ptr2]]        \n\t"
                  "add  %[in_ptr2], #16                     \n\t"
                  "vmla.f32  q12, q8, d0[1]                 \n\t"
                  "vmla.f32  q13, q8, d4[1]                 \n\t"

                  "vmla.f32  q12, q10, d1[0]                \n\t"
                  "vmla.f32  q13, q10, d5[0]                \n\t"

                  // in_ptr2 multiply
                  "vext.32  q8, q5, q6, #1                  \n\t"
                  "pld  [%[in_ptr3], #128]                  \n\t"
                  "vmla.f32  q12, q5, d1[1]                 \n\t"
                  "vmla.f32  q13, q5, d5[1]                 \n\t"

                  "vext.32  q9, q5, q6, #2                  \n\t"
                  "vld1.f32  {d13-d15}, [%[in_ptr3]]        \n\t"
                  "add  %[in_ptr3], #16                     \n\t"
                  "vmla.f32  q12, q8, d2[0]                 \n\t"
                  "vmla.f32  q13, q8, d6[0]                 \n\t"

                  "vmla.f32  q12, q9, d2[1]                 \n\t"
                  "vmla.f32  q13, q9, d6[1]                 \n\t"

                  // in_ptr3 multiply
                  "vext.32  q10, q6, q7, #2                 \n\t"
                  "vmla.f32  q12, q7, d8[0]                 \n\t"
                  "vmla.f32  q13, q7, d8[1]                 \n\t"

                  "vext.32  q11, q6, q7, #3                 \n\t"
                  "vmla.f32  q12, q10, d3[0]                \n\t"
                  "vmla.f32  q13, q10, d7[0]                \n\t"

                  "vmla.f32  q12, q11, d3[1]                \n\t"
                  "vmla.f32  q13, q11, d7[1]                \n\t"

                  // store out_ptr
                  "subs  %[loop], #1                        \n\t"
                  "vst1.f32  {d24, d25}, [%[out_ptr1]]!     \n\t"
                  "vst1.f32  {d26, d27}, [%[out_ptr1_c2]]!  \n\t"

                  // cycle
                  "bne  0b                                  \n\t"

                  : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1),
                    [out_ptr2] "+r"(out_ptr2), [out_ptr1_c2] "+r"(out_ptr1_c2),
                    [out_ptr2_c2] "+r"(out_ptr2_c2), [in_ptr1] "+r"(in_ptr1),
                    [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3),
                    [in_ptr4] "+r"(in_ptr4)
                  : [f1] "r"(f1), [f1_c2] "r"(f1_c2)
                  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
                    "q7", "q8", "q9", "q10", "q11", "q12", "q13");
            }
          }
#endif  //__aarch64__
#endif  // __ARM_NEON
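          // Scalar tail for the columns the vectorized loop above did not
          // cover, including the right padding region.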
          // remain output_width
          for (; o_w < output_w; ++o_w) {
            float sum1 = 0;
            float sum1_c2 = 0;
#if __ARM_NEON
            float32x4_t _in_ptr1 = vld1q_f32(in_ptr1);
            float32x4_t _pad_filter1 = vld1q_f32(pad_filter1);
            float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2);

            float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1);
            float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2);

            float32x4_t _in_ptr2 = vld1q_f32(in_ptr2);
            float32x4_t _pad_filter2 = vld1q_f32(pad_filter2);
            float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2);

            _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2);
            _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2);

            float32x4_t _in_ptr3 = vld1q_f32(in_ptr3);
            float32x4_t _pad_filter3 = vld1q_f32(pad_filter3);
            float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2);

            _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3);
            _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2);

            _sum1 = vsetq_lane_f32(sum1, _sum1, 3);
            _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3);

            float32x2_t _ss1 =
                vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
            float32x2_t _ss1_2 =
                vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2));
            float32x2_t _ssss1_ssss1_2 = vpadd_f32(_ss1, _ss1_2);

            sum1 += vget_lane_f32(_ssss1_ssss1_2, 0);
            sum1_c2 += vget_lane_f32(_ssss1_ssss1_2, 1);
#else
            sum1 += in_ptr1[0] * pad_filter1[0];
            sum1 += in_ptr1[1] * pad_filter1[1];
            sum1 += in_ptr1[2] * pad_filter1[2];
            sum1 += in_ptr2[0] * pad_filter2[0];
            sum1 += in_ptr2[1] * pad_filter2[1];
            sum1 += in_ptr2[2] * pad_filter2[2];
            sum1 += in_ptr3[0] * pad_filter3[0];
            sum1 += in_ptr3[1] * pad_filter3[1];
            sum1 += in_ptr3[2] * pad_filter3[2];

            sum1_c2 += in_ptr1[0] * pad_filter1_c2[0];
            sum1_c2 += in_ptr1[1] * pad_filter1_c2[1];
            sum1_c2 += in_ptr1[2] * pad_filter1_c2[2];
            sum1_c2 += in_ptr2[0] * pad_filter2_c2[0];
            sum1_c2 += in_ptr2[1] * pad_filter2_c2[1];
            sum1_c2 += in_ptr2[2] * pad_filter2_c2[2];
            sum1_c2 += in_ptr3[0] * pad_filter3_c2[0];
            sum1_c2 += in_ptr3[1] * pad_filter3_c2[1];
            sum1_c2 += in_ptr3[2] * pad_filter3_c2[2];
#endif
            if (!if_nopadding &&
                (o_w < padding_w || o_w > output_w - padding_w - 2)) {
              pad_filter0--;
              pad_filter1--;
              pad_filter2--;
              pad_filter3--;

              pad_filter0_c2--;
              pad_filter1_c2--;
              pad_filter2_c2--;
              pad_filter3_c2--;
            } else {
              in_ptr1++;
              in_ptr2++;
              in_ptr3++;
              in_ptr4++;
            }
            *out_ptr1 += sum1;
            *out_ptr1_c2 += sum1_c2;

            out_ptr1++;
            out_ptr1_c2++;
          }
          out_ptr1 += output_w;
          out_ptr1_c2 += output_w;
        }
        filter_data_ch += filter_ch_size;
        filter_data_ch_c2 += filter_ch_size;
        input_data_ch += in_ch_size;
      }
    }
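    // Output channels were processed in pairs above; an odd leftover
    // channel is handled below with the same structure minus the *_c2
    // buffers.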
    int out_ch_remain_start = output_ch - output_ch % 2;

    // remain output_channel
    for (int o_c = out_ch_remain_start; o_c < output_ch; ++o_c) {
      bool issamefilter;
      const float *in_ptr1, *in_ptr2, *in_ptr3, *in_ptr4;
      const float *f1;
      const float *pad_filter0, *pad_filter1, *pad_filter2, *pad_filter3;
      float pad_filter_arr[pad_filter_ch_size];
      float *output_data_ch;
      const float *input_data_ch;
      const float *filter_data_ch;

      input_data_ch = input_data;
      output_data_ch = output_data + o_c * out_ch_size;
      filter_data_ch = filter_data + o_c * filter_ch_size * input_ch;

      for (int i_c = 0; i_c < input_ch; ++i_c) {
        f1 = filter_data_ch;
        if (!if_nopadding) {
          memset(pad_filter_arr, 0.f, sizeof(pad_filter_arr));
          for (int i = 0; i < 9; ++i) {
            int j = i / 3 * (2 * padding_w + 3) + i % 3 + padding_h * 3 +
                    padding_w * (2 * padding_h + 1);
            pad_filter_arr[j] = filter_data_ch[i];
          }
          pad_filter1 = pad_filter_arr;
          pad_filter1 += pad_filter_start;
          pad_filter0 = pad_filter1 - pad_filter_w;
          pad_filter2 = pad_filter1 + pad_filter_w;
          pad_filter3 = pad_filter2 + pad_filter_w;
        } else {
          pad_filter1 = filter_data_ch;
          pad_filter2 = pad_filter1 + 3;
          pad_filter3 = pad_filter2 + 3;
        }
        float *out_ptr1, *out_ptr2;
        out_ptr1 = output_data_ch;
        out_ptr2 = out_ptr1 + output_w;

        in_ptr1 = input_data_ch;
        in_ptr2 = in_ptr1 + input_w;
        in_ptr3 = in_ptr2 + input_w;
        in_ptr4 = in_ptr3 + input_w;

        int o_h = 0;
        for (; o_h < output_h - 1; o_h = o_h + 2) {
          if (!if_nopadding &&
              (o_h < padding_h || o_h > output_h - padding_h - 2)) {
            issamefilter = false;
          } else {
            issamefilter = true;
          }
          int o_w = 0;
          // pad left
          for (; o_w < padding_w; ++o_w) {
            float sum1 = 0;
            float sum2 = 0;
            if (issamefilter) {
#if __ARM_NEON
              float32x4_t _in_ptr1 = vld1q_f32(in_ptr1);
              float32x4_t _pad_filter1 = vld1q_f32(pad_filter1);
              float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1);

              float32x4_t _in_ptr2 = vld1q_f32(in_ptr2);
              float32x4_t _pad_filter2 = vld1q_f32(pad_filter2);
              _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2);
              float32x4_t _sum2 = vmulq_f32(_in_ptr2, _pad_filter1);

              float32x4_t _in_ptr3 = vld1q_f32(in_ptr3);
              float32x4_t _pad_filter3 = vld1q_f32(pad_filter3);
              _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3);
              _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2);

              float32x4_t _in_ptr4 = vld1q_f32(in_ptr4);
              _sum2 = vmlaq_f32(_sum2, _in_ptr4, _pad_filter3);

              _sum1 = vsetq_lane_f32(sum1, _sum1, 3);
              _sum2 = vsetq_lane_f32(sum2, _sum2, 3);

              float32x2_t _ss1 =
                  vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
              float32x2_t _ss2 =
                  vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
              float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2);

              sum1 += vget_lane_f32(_ssss1_ssss2, 0);
              sum2 += vget_lane_f32(_ssss1_ssss2, 1);
#else
              sum1 += in_ptr1[0] * pad_filter1[0];
              sum1 += in_ptr1[1] * pad_filter1[1];
              sum1 += in_ptr1[2] * pad_filter1[2];
              sum1 += in_ptr2[0] * pad_filter2[0];
              sum1 += in_ptr2[1] * pad_filter2[1];
              sum1 += in_ptr2[2] * pad_filter2[2];
              sum1 += in_ptr3[0] * pad_filter3[0];
              sum1 += in_ptr3[1] * pad_filter3[1];
              sum1 += in_ptr3[2] * pad_filter3[2];

              sum2 += in_ptr2[0] * pad_filter1[0];
              sum2 += in_ptr2[1] * pad_filter1[1];
              sum2 += in_ptr2[2] * pad_filter1[2];
              sum2 += in_ptr3[0] * pad_filter2[0];
              sum2 += in_ptr3[1] * pad_filter2[1];
              sum2 += in_ptr3[2] * pad_filter2[2];
              sum2 += in_ptr4[0] * pad_filter3[0];
              sum2 += in_ptr4[1] * pad_filter3[1];
              sum2 += in_ptr4[2] * pad_filter3[2];
#endif
            } else {
#if __ARM_NEON
              float32x4_t _in_ptr1 = vld1q_f32(in_ptr1);
              float32x4_t _pad_filter1 = vld1q_f32(pad_filter1);
              float32x4_t _pad_filter0 = vld1q_f32(pad_filter0);

              float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1);
              float32x4_t _sum2 = vmulq_f32(_in_ptr1, _pad_filter0);

              float32x4_t _in_ptr2 = vld1q_f32(in_ptr2);
              float32x4_t _pad_filter2 = vld1q_f32(pad_filter2);
              _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2);
              _sum2 = vmlaq_f32(_sum2, _in_ptr2, _pad_filter1);

              float32x4_t _in_ptr3 = vld1q_f32(in_ptr3);
              float32x4_t _pad_filter3 = vld1q_f32(pad_filter3);
              _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3);
              _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2);

              _sum1 = vsetq_lane_f32(sum1, _sum1, 3);
              _sum2 = vsetq_lane_f32(sum2, _sum2, 3);

              float32x2_t _ss1 =
                  vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
              float32x2_t _ss2 =
                  vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
              float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2);

              sum1 += vget_lane_f32(_ssss1_ssss2, 0);
              sum2 += vget_lane_f32(_ssss1_ssss2, 1);
#else
              sum1 += in_ptr1[0] * pad_filter1[0];
              sum1 += in_ptr1[1] * pad_filter1[1];
              sum1 += in_ptr1[2] * pad_filter1[2];
              sum1 += in_ptr2[0] * pad_filter2[0];
              sum1 += in_ptr2[1] * pad_filter2[1];
              sum1 += in_ptr2[2] * pad_filter2[2];
              sum1 += in_ptr3[0] * pad_filter3[0];
              sum1 += in_ptr3[1] * pad_filter3[1];
              sum1 += in_ptr3[2] * pad_filter3[2];

              sum2 += in_ptr1[0] * pad_filter0[0];
              sum2 += in_ptr1[1] * pad_filter0[1];
              sum2 += in_ptr1[2] * pad_filter0[2];
              sum2 += in_ptr2[0] * pad_filter1[0];
              sum2 += in_ptr2[1] * pad_filter1[1];
              sum2 += in_ptr2[2] * pad_filter1[2];
              sum2 += in_ptr3[0] * pad_filter2[0];
              sum2 += in_ptr3[1] * pad_filter2[1];
              sum2 += in_ptr3[2] * pad_filter2[2];
#endif
            }
            if (!if_nopadding &&
                (o_w < padding_w || o_w > output_w - padding_w - 2)) {
              pad_filter0--;
              pad_filter1--;
              pad_filter2--;
              pad_filter3--;
            } else {
              in_ptr1++;
              in_ptr2++;
              in_ptr3++;
              in_ptr4++;
            }
            *out_ptr1 += sum1;
            *out_ptr2 += sum2;

            out_ptr1++;
            out_ptr2++;
          }
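          // When both output rows see the same filter position
          // (issamefilter), the assembly accumulates out_ptr1 and out_ptr2
          // together; in_ptr1..in_ptr4 are the four input rows touched by
          // two vertically adjacent 3x3 windows.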
          // valid
#if __ARM_NEON
#if __aarch64__
          if (issamefilter) {
            int loop = (output_w - 2 * padding_w) >> 2;
            o_w += loop * 4;

            if (loop > 0) {
              asm volatile(
                  "prfm  pldl1keep, [%[f1], #256]           \n\t"
                  "ld1  {v0.4s, v1.4s}, [%[f1]]             \n\t"
                  "add  %[f1], %[f1], #32                   \n\t"
                  "ld1  {v4.s}[0], [%[f1]]                  \n\t"
                  "sub  %[f1],%[f1], #32                    \n\t"

                  "0:                                       \n\t"
                  // load out_ptr
                  "prfm  pldl1keep, [%[out_ptr1], #128]     \n\t"
                  "prfm  pldl1keep, [%[out_ptr2], #128]     \n\t"
                  "ld1  {v12.4s}, [%[out_ptr1]]             \n\t"
                  "ld1  {v14.4s}, [%[out_ptr2]]             \n\t"

                  // in_ptr1 + in_ptr4 multiply
                  "prfm  pldl1keep, [%[in_ptr1], #192]      \n\t"
                  "prfm  pldl1keep, [%[in_ptr4], #192]      \n\t"
                  "ld1  {v5.4s, v6.4s}, [%[in_ptr1]]        \n\t"
                  "add  %[in_ptr1],%[in_ptr1], #16          \n\t"
                  "ld1  {v6.d}[1], [%[in_ptr4]]             \n\t"
                  "add  %[in_ptr4],%[in_ptr4], #8           \n\t"
                  "ld1  {v7.4s}, [%[in_ptr4]]               \n\t"
                  "add  %[in_ptr4],%[in_ptr4], #8           \n\t"

                  "ext  v8.16b, v5.16b, v6.16b, #4          \n\t"
                  "fmla  v12.4s, v5.4s, v0.s[0]             \n\t"
                  "ext  v9.16b, v6.16b, v7.16b, #8          \n\t"
                  "fmla  v14.4s, v7.4s, v4.s[0]             \n\t"

                  "ext  v10.16b, v5.16b, v6.16b, #8         \n\t"
                  "fmla  v12.4s, v8.4s, v0.s[1]             \n\t"
                  "ext  v11.16b, v6.16b, v7.16b, #12        \n\t"
                  "fmla  v14.4s, v9.4s, v1.s[2]             \n\t"

                  "ld1  {v5.4s, v6.4s}, [%[in_ptr2]]        \n\t"
                  "add  %[in_ptr2],%[in_ptr2], #16          \n\t"
                  "fmla  v12.4s, v10.4s, v0.s[2]            \n\t"
                  "fmla  v14.4s, v11.4s, v1.s[3]            \n\t"

                  // in_ptr2 multiply
                  "ext  v8.16b, v5.16b, v6.16b, #4          \n\t"
                  "fmla  v12.4s, v5.4s, v0.s[3]             \n\t"
                  "fmla  v14.4s, v5.4s, v0.s[0]             \n\t"

                  "ext  v9.16b, v5.16b, v6.16b, #8          \n\t"
                  "fmla  v12.4s, v8.4s, v1.s[0]             \n\t"
                  "fmla  v14.4s, v8.4s, v0.s[1]             \n\t"

                  "ld1  {v6.d}[1], [%[in_ptr3]]             \n\t"
                  "add  %[in_ptr3],%[in_ptr3], #8           \n\t"
                  "ld1  {v7.4s}, [%[in_ptr3]]               \n\t"
                  "add  %[in_ptr3],%[in_ptr3], #8           \n\t"
                  "fmla  v12.4s, v9.4s, v1.s[1]             \n\t"
                  "fmla  v14.4s, v9.4s, v0.s[2]             \n\t"

                  // in_ptr3 multiply
                  "ext  v10.16b, v6.16b, v7.16b, #8         \n\t"
                  "fmla  v12.4s, v7.4s, v4.s[0]             \n\t"
                  "fmla  v14.4s, v7.4s, v1.s[1]             \n\t"

                  "ext  v11.16b, v6.16b, v7.16b, #12        \n\t"
                  "fmla  v12.4s, v10.4s, v1.s[2]            \n\t"
                  "fmla  v14.4s, v10.4s, v0.s[3]            \n\t"

                  "fmla  v12.4s, v11.4s, v1.s[3]            \n\t"
                  "fmla  v14.4s, v11.4s, v1.s[0]            \n\t"

                  // store out_ptr
                  "st1  {v12.4s}, [%[out_ptr1]], #16        \n\t"
                  "st1  {v14.4s}, [%[out_ptr2]], #16        \n\t"

                  // cycle
                  "subs  %[loop],%[loop], #1                \n\t"
                  "bne  0b                                  \n\t"

                  : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1),
                    [out_ptr2] "+r"(out_ptr2), [in_ptr1] "+r"(in_ptr1),
                    [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3),
                    [in_ptr4] "+r"(in_ptr4)
                  : [f1] "r"(f1)
                  : "cc", "memory", "v0", "v1", "v4", "v5", "v6", "v7", "v8",
                    "v9", "v10", "v11", "v12", "v14");
            }
          }
          if (!if_nopadding && o_w == output_w - padding_w) {
            pad_filter0--;
            pad_filter1--;
            pad_filter2--;
            pad_filter3--;

            in_ptr1--;
            in_ptr2--;
            in_ptr3--;
            in_ptr4--;
          }
#else
          if (issamefilter) {
            int loop = (output_w - 2 * padding_w) >> 2;
            o_w += loop * 4;

            if (loop > 0) {
              asm volatile(
                  "pld  [%[f1], #256]                       \n\t"
                  "vld1.f32  {d0-d3}, [%[f1]]               \n\t"
                  "add  %[f1], #32                          \n\t"
                  "vld1.f32  {d8[0]}, [%[f1]]               \n\t"
                  "sub  %[f1], #32                          \n\t"

                  "0:                                       \n\t"
                  // load out_ptr
                  "pld  [%[out_ptr1], #128]                 \n\t"
                  "pld  [%[out_ptr2], #128]                 \n\t"
                  "vld1.f32  {d24, d25}, [%[out_ptr1]]      \n\t"
                  "vld1.f32  {d28, d29}, [%[out_ptr2]]      \n\t"

                  // in_ptr1 + in_ptr4 multiply
                  "pld  [%[in_ptr1], #192]                  \n\t"
                  "pld  [%[in_ptr4], #192]                  \n\t"
                  "vld1.f32  {d10-d12}, [%[in_ptr1]]        \n\t"
                  "add  %[in_ptr1], #16                     \n\t"
                  "vld1.f32  {d13-d15}, [%[in_ptr4]]        \n\t"
                  "add  %[in_ptr4], #16                     \n\t"

                  "vext.32  q8, q5, q6, #1                  \n\t"
                  "vmla.f32  q12, q5, d0[0]                 \n\t"
                  "vext.32  q9, q6, q7, #2                  \n\t"
                  "vmla.f32  q14, q7, d8[0]                 \n\t"

                  "vext.32  q10, q5, q6, #2                 \n\t"
                  "vmla.f32  q12, q8, d0[1]                 \n\t"
                  "vext.32  q11, q6, q7, #3                 \n\t"
                  "vmla.f32  q14, q9, d3[0]                 \n\t"

                  "vld1.f32  {d10-d12}, [%[in_ptr2]]        \n\t"
                  "add  %[in_ptr2], #16                     \n\t"
                  "vmla.f32  q12, q10, d1[0]                \n\t"
                  "vmla.f32  q14, q11, d3[1]                \n\t"

                  // in_ptr2 multiply
                  "vext.32  q8, q5, q6, #1                  \n\t"
                  "vmla.f32  q12, q5, d1[1]                 \n\t"
                  "vmla.f32  q14, q5, d0[0]                 \n\t"

                  "vext.32  q9, q5, q6, #2                  \n\t"
                  "vmla.f32  q12, q8, d2[0]                 \n\t"
                  "vmla.f32  q14, q8, d0[1]                 \n\t"

                  "vld1.f32  {d13-d15}, [%[in_ptr3]]        \n\t"
                  "add  %[in_ptr3], #16                     \n\t"
                  "vmla.f32  q12, q9, d2[1]                 \n\t"
                  "vmla.f32  q14, q9, d1[0]                 \n\t"

                  // in_ptr3 multiply
                  "vext.32  q10, q6, q7, #2                 \n\t"
                  "vmla.f32  q12, q7, d8[0]                 \n\t"
                  "vmla.f32  q14, q7, d2[1]                 \n\t"

                  "vext.32  q11, q6, q7, #3                 \n\t"
                  "vmla.f32  q12, q10, d3[0]                \n\t"
                  "vmla.f32  q14, q10, d1[1]                \n\t"

                  "vmla.f32  q12, q11, d3[1]                \n\t"
                  "vmla.f32  q14, q11, d2[0]                \n\t"

                  // store out_ptr
                  "subs  %[loop], #1                        \n\t"
                  "vst1.f32  {d24, d25}, [%[out_ptr1]]!     \n\t"
                  "vst1.f32  {d28, d29}, [%[out_ptr2]]!     \n\t"

                  // cycle
                  "bne  0b                                  \n\t"

                  : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1),
                    [out_ptr2] "+r"(out_ptr2), [in_ptr1] "+r"(in_ptr1),
                    [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3),
                    [in_ptr4] "+r"(in_ptr4)
                  : [f1] "r"(f1)
                  : "cc", "memory", "q0", "q1", "q4", "q5", "q6", "q7", "q8",
                    "q9", "q10", "q11", "q12", "q14");
            }
          }
          if (!if_nopadding && o_w == output_w - padding_w) {
            pad_filter0--;
            pad_filter1--;
            pad_filter2--;
            pad_filter3--;

            in_ptr1--;
            in_ptr2--;
            in_ptr3--;
            in_ptr4--;
          }
#endif  //__aarch64__
#endif  // __ARM_NEON
          // remain output_width
          for (; o_w < output_w; ++o_w) {
            float sum1 = 0;
            float sum2 = 0;
            if (issamefilter) {
#if __ARM_NEON
              float32x4_t _in_ptr1 = vld1q_f32(in_ptr1);
              float32x4_t _pad_filter1 = vld1q_f32(pad_filter1);
              float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1);

              float32x4_t _in_ptr2 = vld1q_f32(in_ptr2);
              float32x4_t _pad_filter2 = vld1q_f32(pad_filter2);
              _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2);
              float32x4_t _sum2 = vmulq_f32(_in_ptr2, _pad_filter1);

              float32x4_t _in_ptr3 = vld1q_f32(in_ptr3);
              float32x4_t _pad_filter3 = vld1q_f32(pad_filter3);
              _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3);
              _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2);

              float32x4_t _in_ptr4 = vld1q_f32(in_ptr4);
              _sum2 = vmlaq_f32(_sum2, _in_ptr4, _pad_filter3);

              _sum1 = vsetq_lane_f32(sum1, _sum1, 3);
              _sum2 = vsetq_lane_f32(sum2, _sum2, 3);

              float32x2_t _ss1 =
                  vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
              float32x2_t _ss2 =
                  vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
              float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2);

              sum1 += vget_lane_f32(_ssss1_ssss2, 0);
              sum2 += vget_lane_f32(_ssss1_ssss2, 1);
#else
              sum1 += in_ptr1[0] * pad_filter1[0];
              sum1 += in_ptr1[1] * pad_filter1[1];
              sum1 += in_ptr1[2] * pad_filter1[2];
              sum1 += in_ptr2[0] * pad_filter2[0];
              sum1 += in_ptr2[1] * pad_filter2[1];
              sum1 += in_ptr2[2] * pad_filter2[2];
              sum1 += in_ptr3[0] * pad_filter3[0];
              sum1 += in_ptr3[1] * pad_filter3[1];
              sum1 += in_ptr3[2] * pad_filter3[2];

              sum2 += in_ptr2[0] * pad_filter1[0];
              sum2 += in_ptr2[1] * pad_filter1[1];
              sum2 += in_ptr2[2] * pad_filter1[2];
              sum2 += in_ptr3[0] * pad_filter2[0];
              sum2 += in_ptr3[1] * pad_filter2[1];
              sum2 += in_ptr3[2] * pad_filter2[2];
              sum2 += in_ptr4[0] * pad_filter3[0];
              sum2 += in_ptr4[1] * pad_filter3[1];
              sum2 += in_ptr4[2] * pad_filter3[2];
#endif
            } else {
#if __ARM_NEON
              float32x4_t _in_ptr1 = vld1q_f32(in_ptr1);
              float32x4_t _pad_filter1 = vld1q_f32(pad_filter1);
              float32x4_t _pad_filter0 = vld1q_f32(pad_filter0);

              float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1);
              float32x4_t _sum2 = vmulq_f32(_in_ptr1, _pad_filter0);

              float32x4_t _in_ptr2 = vld1q_f32(in_ptr2);
              float32x4_t _pad_filter2 = vld1q_f32(pad_filter2);
              _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2);
              _sum2 = vmlaq_f32(_sum2, _in_ptr2, _pad_filter1);

              float32x4_t _in_ptr3 = vld1q_f32(in_ptr3);
              float32x4_t _pad_filter3 = vld1q_f32(pad_filter3);
              _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3);
              _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2);

              _sum1 = vsetq_lane_f32(sum1, _sum1, 3);
              _sum2 = vsetq_lane_f32(sum2, _sum2, 3);

              float32x2_t _ss1 =
                  vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
              float32x2_t _ss2 =
                  vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
              float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2);

              sum1 += vget_lane_f32(_ssss1_ssss2, 0);
              sum2 += vget_lane_f32(_ssss1_ssss2, 1);
#else
              sum1 += in_ptr1[0] * pad_filter1[0];
              sum1 += in_ptr1[1] * pad_filter1[1];
              sum1 += in_ptr1[2] * pad_filter1[2];
              sum1 += in_ptr2[0] * pad_filter2[0];
              sum1 += in_ptr2[1] * pad_filter2[1];
              sum1 += in_ptr2[2] * pad_filter2[2];
              sum1 += in_ptr3[0] * pad_filter3[0];
              sum1 += in_ptr3[1] * pad_filter3[1];
              sum1 += in_ptr3[2] * pad_filter3[2];

              sum2 += in_ptr1[0] * pad_filter0[0];
              sum2 += in_ptr1[1] * pad_filter0[1];
              sum2 += in_ptr1[2] * pad_filter0[2];
              sum2 += in_ptr2[0] * pad_filter1[0];
              sum2 += in_ptr2[1] * pad_filter1[1];
              sum2 += in_ptr2[2] * pad_filter1[2];
              sum2 += in_ptr3[0] * pad_filter2[0];
              sum2 += in_ptr3[1] * pad_filter2[1];
              sum2 += in_ptr3[2] * pad_filter2[2];
#endif
            }
            if (!if_nopadding &&
                (o_w < padding_w || o_w > output_w - padding_w - 2)) {
              pad_filter0--;
              pad_filter1--;
              pad_filter2--;
              pad_filter3--;
            } else {
              in_ptr1++;
              in_ptr2++;
              in_ptr3++;
              in_ptr4++;
            }
            *out_ptr1 += sum1;
            *out_ptr2 += sum2;

            out_ptr1++;
            out_ptr2++;
          }
          if (if_nopadding) {
            in_ptr1 += 2 + input_w;
            in_ptr2 += 2 + input_w;
            in_ptr3 += 2 + input_w;
            in_ptr4 += 2 + input_w;
          } else if (o_h == padding_h - 1 ||
                     o_h == output_h - padding_h - 2) {
            in_ptr1 += 3;
            in_ptr2 += 3;
            in_ptr3 += 3;
            in_ptr4 += 3;

            pad_filter0 -= 2;
            pad_filter1 -= 2;
            pad_filter2 -= 2;
            pad_filter3 -= 2;
          } else if (issamefilter) {
            in_ptr1 += 3 + input_w;
            in_ptr2 += 3 + input_w;
            in_ptr3 += 3 + input_w;
            in_ptr4 += 3 + input_w;

            pad_filter0 += 2 * padding_w + 1;
            pad_filter1 += 2 * padding_w + 1;
            pad_filter2 += 2 * padding_w + 1;
            pad_filter3 += 2 * padding_w + 1;
          } else {
            pad_filter0 -= 3 + 2 * padding_w + 2;
            pad_filter1 -= 3 + 2 * padding_w + 2;
            pad_filter2 -= 3 + 2 * padding_w + 2;
            pad_filter3 -= 3 + 2 * padding_w + 2;

            in_ptr1 -= input_w - 3;
            in_ptr2 -= input_w - 3;
            in_ptr3 -= input_w - 3;
            in_ptr4 -= input_w - 3;
          }
          out_ptr1 += output_w;
          out_ptr2 += output_w;
        }
        // remain output_height
        for (; o_h < output_h; ++o_h) {
          for (int o_w = 0; o_w < output_w; ++o_w) {
            float sum1 = 0;
#if __ARM_NEON
            float32x4_t _in_ptr1 = vld1q_f32(in_ptr1);
            float32x4_t _pad_filter1 = vld1q_f32(pad_filter1);
            float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1);

            float32x4_t _in_ptr2 = vld1q_f32(in_ptr2);
            float32x4_t _pad_filter2 = vld1q_f32(pad_filter2);
            _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2);

            float32x4_t _in_ptr3 = vld1q_f32(in_ptr3);
            float32x4_t _pad_filter3 = vld1q_f32(pad_filter3);
            _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3);

            _sum1 = vsetq_lane_f32(sum1, _sum1, 3);
            float32x2_t _ss1 =
                vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
            float32x2_t _ssss1_ssss1 = vpadd_f32(_ss1, _ss1);
            sum1 += vget_lane_f32(_ssss1_ssss1, 0);
#else
            sum1 += in_ptr1[0] * pad_filter1[0];
            sum1 += in_ptr1[1] * pad_filter1[1];
            sum1 += in_ptr1[2] * pad_filter1[2];
            sum1 += in_ptr2[0] * pad_filter2[0];
            sum1 += in_ptr2[1] * pad_filter2[1];
            sum1 += in_ptr2[2] * pad_filter2[2];
            sum1 += in_ptr3[0] * pad_filter3[0];
            sum1 += in_ptr3[1] * pad_filter3[1];
            sum1 += in_ptr3[2] * pad_filter3[2];
#endif
            if (!if_nopadding &&
                (o_w < padding_w || o_w > output_w - padding_w - 2)) {
              pad_filter0--;
              pad_filter1--;
              pad_filter2--;
              pad_filter3--;
            } else {
              in_ptr1++;
              in_ptr2++;
              in_ptr3++;
              in_ptr4++;
            }
            *out_ptr1 += sum1;
            out_ptr1++;
          }
          out_ptr1 += output_w;
        }
        filter_data_ch += filter_ch_size;
        input_data_ch += in_ch_size;
      }
    }
    input_data += in_batch_size;
    output_data += out_batch_size;
  }
}
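// Stride-2 variant. It mirrors SlidingwindowConv3x3s1: borders are
// handled by sliding a window inside a zero-padded filter copy while the
// valid interior is vectorized, but output channels are processed eight
// at a time here instead of two.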
template <>
void SlidingwindowConv3x3s2<float, float>(const framework::Tensor *input,
                                          const framework::Tensor *filter,
                                          const std::vector<int> &paddings,
                                          framework::Tensor *output) {
  const int batch = input->dims()[0];
  const int input_ch = input->dims()[1];
  const int input_h = input->dims()[2];
  const int input_w = input->dims()[3];
  const int output_ch = output->dims()[1];
  const int output_h = output->dims()[2];
  const int output_w = output->dims()[3];
  const int padding_h = paddings[0];
  const int padding_w = paddings[1];

  const float *input_data = input->data<float>();
  float *output_data = output->mutable_data<float>();
  const float *filter_data = filter->data<float>();

  const int in_ch_size = input_h * input_w;
  const int in_batch_size = input_ch * in_ch_size;
  const int out_ch_size = output_h * output_w;
  const int out_batch_size = output_ch * out_ch_size;
  const int out_size = batch * out_batch_size;
  const int filter_ch_size = 9;
  const int pad_filter_ch_size = (2 * padding_h + 3) * (2 * padding_w + 3);
  const int pad_filter_start =
      2 * padding_h * (2 * padding_w + 3) + 2 * padding_w;
  const int pad_filter_w = 3 + padding_w * 2;

  bool if_nopadding = false;
  const bool if_exact_in_w = (input_w + 2 * padding_w - 3) % 2 == 0;
  const bool if_exact_in_h = (input_h + 2 * padding_h - 3) % 2 == 0;
  const bool if_odd_pad_w = padding_w % 2 == 1;
  const bool if_odd_pad_h = padding_h % 2 == 1;

  int valid_w_start = padding_w >> 1;
  int valid_h_start = padding_h >> 1;
  int valid_w_end = output_w - valid_w_start - 2;
  int valid_h_end = output_h - valid_h_start - 2;
  const int remain_stride_w = input_w + 2 * padding_w - 2 * output_w;

#if __ARM_NEON
  // Zero the output: whole vectors first (the bound stops before a
  // partial vector so the store cannot run past the buffer), then the
  // 1-3 leftover elements.
  float *out_ptr = output_data;
  int remain = out_size & 0x3;
  float32x4_t _zero = vdupq_n_f32(0.0);

  for (int i = 0; i < out_size - 3; i += 4) {
    vst1q_f32(out_ptr, _zero);
    out_ptr += 4;
  }
  switch (remain) {
    case 1:
      vst1q_lane_f32(out_ptr, _zero, 0);
      break;
    case 2:
      vst1_f32(out_ptr, vget_low_f32(_zero));
      break;
    case 3:
      vst1_f32(out_ptr, vget_low_f32(_zero));
      vst1q_lane_f32(out_ptr + 2, _zero, 0);
      break;
  }
#else
#pragma omp parallel for
  for (int i = 0; i < out_size; ++i) {
    output_data[i] = 0;
  }
#endif
  if (padding_h == 0 && padding_w == 0) {
    if_nopadding = true;
    valid_w_start = -1;
    valid_h_start = -1;
    valid_w_end = output_w;
    valid_h_end = output_h;
  }
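  // With no padding, valid_w_start/valid_h_start are set to -1 so the
  // "pad left"/"pad top" loops (which run while o_w <= valid_w_start) are
  // skipped and the whole output is treated as the valid region.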
  for (int b = 0; b < batch; ++b) {
#pragma omp parallel for
    for (int o_c = 0; o_c < output_ch - 7; o_c += 8) {
      const float *f1;
      const float *in_ptr1, *in_ptr2, *in_ptr3;
      const float *pad_filter1, *pad_filter2, *pad_filter3;
      const float *pad_filter1_c2, *pad_filter2_c2, *pad_filter3_c2;
      const float *pad_filter1_c3, *pad_filter2_c3, *pad_filter3_c3;
      const float *pad_filter1_c4, *pad_filter2_c4, *pad_filter3_c4;
      const float *pad_filter1_c5, *pad_filter2_c5, *pad_filter3_c5;
      const float *pad_filter1_c6, *pad_filter2_c6, *pad_filter3_c6;
      const float *pad_filter1_c7, *pad_filter2_c7, *pad_filter3_c7;
      const float *pad_filter1_c8, *pad_filter2_c8, *pad_filter3_c8;

      float reform_filter_arr[72];
      float pad_filter_arr[pad_filter_ch_size];
      float pad_filter_arr_c2[pad_filter_ch_size];
      float pad_filter_arr_c3[pad_filter_ch_size];
      float pad_filter_arr_c4[pad_filter_ch_size];
      float pad_filter_arr_c5[pad_filter_ch_size];
      float pad_filter_arr_c6[pad_filter_ch_size];
      float pad_filter_arr_c7[pad_filter_ch_size];
      float pad_filter_arr_c8[pad_filter_ch_size];

      float *output_data_ch;
      float *output_data_ch_2;
      float *output_data_ch_3;
      float *output_data_ch_4;
      float *output_data_ch_5;
      float *output_data_ch_6;
      float *output_data_ch_7;
      float *output_data_ch_8;

      const float *input_data_ch;
      const float *filter_data_ch;
      const float *filter_data_ch_c2;
      const float *filter_data_ch_c3;
      const float *filter_data_ch_c4;
      const float *filter_data_ch_c5;
      const float *filter_data_ch_c6;
      const float *filter_data_ch_c7;
      const float *filter_data_ch_c8;

      filter_data_ch = filter_data + o_c * filter_ch_size * input_ch;
      filter_data_ch_c2 = filter_data + (o_c + 1) * filter_ch_size * input_ch;
      filter_data_ch_c3 = filter_data + (o_c + 2) * filter_ch_size * input_ch;
      filter_data_ch_c4 = filter_data + (o_c + 3) * filter_ch_size * input_ch;
      filter_data_ch_c5 = filter_data + (o_c + 4) * filter_ch_size * input_ch;
      filter_data_ch_c6 = filter_data + (o_c + 5) * filter_ch_size * input_ch;
      filter_data_ch_c7 = filter_data + (o_c + 6) * filter_ch_size * input_ch;
      filter_data_ch_c8 = filter_data + (o_c + 7) * filter_ch_size * input_ch;

      input_data_ch = input_data;
      output_data_ch = output_data + o_c * out_ch_size;
      output_data_ch_2 = output_data + (o_c + 1) * out_ch_size;
      output_data_ch_3 = output_data + (o_c + 2) * out_ch_size;
      output_data_ch_4 = output_data + (o_c + 3) * out_ch_size;
      output_data_ch_5 = output_data + (o_c + 4) * out_ch_size;
      output_data_ch_6 = output_data + (o_c + 5) * out_ch_size;
      output_data_ch_7 = output_data + (o_c + 6) * out_ch_size;
      output_data_ch_8 = output_data + (o_c + 7) * out_ch_size;

      for (int i_c = 0; i_c < input_ch; ++i_c) {
        int k = 0;
        for (int i = 0; i < 9; ++i) {
          for (int j = 0; j < 8; ++j) {
            reform_filter_arr[k++] = filter_data_ch[i + input_ch * 9 * j];
          }
        }
        f1 = reform_filter_arr;

        if (!if_nopadding) {
          memset(pad_filter_arr, 0.f, sizeof(pad_filter_arr));
          memset(pad_filter_arr_c2, 0.f, sizeof(pad_filter_arr_c2));
          memset(pad_filter_arr_c3, 0.f, sizeof(pad_filter_arr_c3));
          memset(pad_filter_arr_c4, 0.f, sizeof(pad_filter_arr_c4));
          memset(pad_filter_arr_c5, 0.f, sizeof(pad_filter_arr_c5));
          memset(pad_filter_arr_c6, 0.f, sizeof(pad_filter_arr_c6));
          memset(pad_filter_arr_c7, 0.f, sizeof(pad_filter_arr_c7));
          memset(pad_filter_arr_c8, 0.f, sizeof(pad_filter_arr_c8));

          for (int i = 0; i < 9; ++i) {
            int j = i / 3 * (2 * padding_w + 3) + i % 3 + padding_h * 3 +
                    padding_w * (2 * padding_h + 1);
            pad_filter_arr[j] = filter_data_ch[i];
            pad_filter_arr_c2[j] = filter_data_ch_c2[i];
            pad_filter_arr_c3[j] = filter_data_ch_c3[i];
            pad_filter_arr_c4[j] = filter_data_ch_c4[i];
            pad_filter_arr_c5[j] = filter_data_ch_c5[i];
            pad_filter_arr_c6[j] = filter_data_ch_c6[i];
            pad_filter_arr_c7[j] = filter_data_ch_c7[i];
            pad_filter_arr_c8[j] = filter_data_ch_c8[i];
          }
          pad_filter1 = pad_filter_arr;
          pad_filter1 += pad_filter_start;
          pad_filter2 = pad_filter1 + pad_filter_w;
          pad_filter3 = pad_filter2 + pad_filter_w;

          pad_filter1_c2 = pad_filter_arr_c2;
          pad_filter1_c2 += pad_filter_start;
          pad_filter2_c2 = pad_filter1_c2 + pad_filter_w;
          pad_filter3_c2 = pad_filter2_c2 + pad_filter_w;

          pad_filter1_c3 = pad_filter_arr_c3;
          pad_filter1_c3 += pad_filter_start;
          pad_filter2_c3 = pad_filter1_c3 + pad_filter_w;
          pad_filter3_c3 = pad_filter2_c3 + pad_filter_w;

          pad_filter1_c4 = pad_filter_arr_c4;
          pad_filter1_c4 += pad_filter_start;
          pad_filter2_c4 = pad_filter1_c4 + pad_filter_w;
          pad_filter3_c4 = pad_filter2_c4 + pad_filter_w;

          pad_filter1_c5 = pad_filter_arr_c5;
          pad_filter1_c5 += pad_filter_start;
          pad_filter2_c5 = pad_filter1_c5 + pad_filter_w;
          pad_filter3_c5 = pad_filter2_c5 + pad_filter_w;

          pad_filter1_c6 = pad_filter_arr_c6;
          pad_filter1_c6 += pad_filter_start;
          pad_filter2_c6 = pad_filter1_c6 + pad_filter_w;
          pad_filter3_c6 = pad_filter2_c6 + pad_filter_w;

          pad_filter1_c7 = pad_filter_arr_c7;
          pad_filter1_c7 += pad_filter_start;
          pad_filter2_c7 = pad_filter1_c7 + pad_filter_w;
          pad_filter3_c7 = pad_filter2_c7 + pad_filter_w;

          pad_filter1_c8 = pad_filter_arr_c8;
          pad_filter1_c8 += pad_filter_start;
          pad_filter2_c8 = pad_filter1_c8 + pad_filter_w;
          pad_filter3_c8 = pad_filter2_c8 + pad_filter_w;
        } else {
          pad_filter1 = filter_data_ch;
          pad_filter2 = pad_filter1 + 3;
          pad_filter3 = pad_filter2 + 3;

          pad_filter1_c2 = filter_data_ch_c2;
          pad_filter2_c2 = pad_filter1_c2 + 3;
          pad_filter3_c2 = pad_filter2_c2 + 3;

          pad_filter1_c3 = filter_data_ch_c3;
          pad_filter2_c3 = pad_filter1_c3 + 3;
          pad_filter3_c3 = pad_filter2_c3 + 3;

          pad_filter1_c4 = filter_data_ch_c4;
          pad_filter2_c4 = pad_filter1_c4 + 3;
          pad_filter3_c4 = pad_filter2_c4 + 3;

          pad_filter1_c5 = filter_data_ch_c5;
          pad_filter2_c5 = pad_filter1_c5 + 3;
          pad_filter3_c5 = pad_filter2_c5 + 3;

          pad_filter1_c6 = filter_data_ch_c6;
          pad_filter2_c6 = pad_filter1_c6 + 3;
          pad_filter3_c6 = pad_filter2_c6 + 3;

          pad_filter1_c7 = filter_data_ch_c7;
          pad_filter2_c7 = pad_filter1_c7 + 3;
          pad_filter3_c7 = pad_filter2_c7 + 3;

          pad_filter1_c8 = filter_data_ch_c8;
          pad_filter2_c8 = pad_filter1_c8 + 3;
          pad_filter3_c8 = pad_filter2_c8 + 3;
        }
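        // reform_filter_arr interleaves the 9 taps of the 8 output
        // channels as [tap][channel], so the valid-region assembly can
        // load the weights for all eight accumulators with consecutive
        // ld1/vld1 reads of f1.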
        float *out_ptr1;
        float *out_ptr1_c2;
        float *out_ptr1_c3;
        float *out_ptr1_c4;
        float *out_ptr1_c5;
        float *out_ptr1_c6;
        float *out_ptr1_c7;
        float *out_ptr1_c8;

        out_ptr1 = output_data_ch;
        out_ptr1_c2 = output_data_ch_2;
        out_ptr1_c3 = output_data_ch_3;
        out_ptr1_c4 = output_data_ch_4;
        out_ptr1_c5 = output_data_ch_5;
        out_ptr1_c6 = output_data_ch_6;
        out_ptr1_c7 = output_data_ch_7;
        out_ptr1_c8 = output_data_ch_8;

        in_ptr1 = input_data_ch;
        in_ptr2 = in_ptr1 + input_w;
        in_ptr3 = in_ptr2 + input_w;

        int o_h = 0;
        for (; o_h < output_h; ++o_h) {
          int o_w = 0;
          // pad left
          for (; o_w <= valid_w_start; ++o_w) {
            float sum1 = 0;
            float sum1_c2 = 0;
            float sum1_c3 = 0;
            float sum1_c4 = 0;
            float sum1_c5 = 0;
            float sum1_c6 = 0;
            float sum1_c7 = 0;
            float sum1_c8 = 0;
#if __ARM_NEON
            float32x4_t _in_ptr1 = vld1q_f32(in_ptr1);
            float32x4_t _pad_filter1 = vld1q_f32(pad_filter1);
            float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2);
            float32x4_t _pad_filter1_c3 = vld1q_f32(pad_filter1_c3);
            float32x4_t _pad_filter1_c4 = vld1q_f32(pad_filter1_c4);
            float32x4_t _pad_filter1_c5 = vld1q_f32(pad_filter1_c5);
            float32x4_t _pad_filter1_c6 = vld1q_f32(pad_filter1_c6);
            float32x4_t _pad_filter1_c7 = vld1q_f32(pad_filter1_c7);
            float32x4_t _pad_filter1_c8 = vld1q_f32(pad_filter1_c8);

            float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1);
            float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2);
            float32x4_t _sum1_c3 = vmulq_f32(_in_ptr1, _pad_filter1_c3);
            float32x4_t _sum1_c4 = vmulq_f32(_in_ptr1, _pad_filter1_c4);
            float32x4_t _sum1_c5 = vmulq_f32(_in_ptr1, _pad_filter1_c5);
            float32x4_t _sum1_c6 = vmulq_f32(_in_ptr1, _pad_filter1_c6);
            float32x4_t _sum1_c7 = vmulq_f32(_in_ptr1, _pad_filter1_c7);
            float32x4_t _sum1_c8 = vmulq_f32(_in_ptr1, _pad_filter1_c8);

            float32x4_t _in_ptr2 = vld1q_f32(in_ptr2);
            float32x4_t _pad_filter2 = vld1q_f32(pad_filter2);
            float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2);
            float32x4_t _pad_filter2_c3 = vld1q_f32(pad_filter2_c3);
            float32x4_t _pad_filter2_c4 = vld1q_f32(pad_filter2_c4);
            float32x4_t _pad_filter2_c5 = vld1q_f32(pad_filter2_c5);
            float32x4_t _pad_filter2_c6 = vld1q_f32(pad_filter2_c6);
            float32x4_t _pad_filter2_c7 = vld1q_f32(pad_filter2_c7);
            float32x4_t _pad_filter2_c8 = vld1q_f32(pad_filter2_c8);

            _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2);
            _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2);
            _sum1_c3 = vmlaq_f32(_sum1_c3, _in_ptr2, _pad_filter2_c3);
            _sum1_c4 = vmlaq_f32(_sum1_c4, _in_ptr2, _pad_filter2_c4);
            _sum1_c5 = vmlaq_f32(_sum1_c5, _in_ptr2, _pad_filter2_c5);
            _sum1_c6 = vmlaq_f32(_sum1_c6, _in_ptr2, _pad_filter2_c6);
            _sum1_c7 = vmlaq_f32(_sum1_c7, _in_ptr2, _pad_filter2_c7);
            _sum1_c8 = vmlaq_f32(_sum1_c8, _in_ptr2, _pad_filter2_c8);

            float32x4_t _in_ptr3 = vld1q_f32(in_ptr3);
            float32x4_t _pad_filter3 = vld1q_f32(pad_filter3);
            float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2);
            float32x4_t _pad_filter3_c3 = vld1q_f32(pad_filter3_c3);
            float32x4_t _pad_filter3_c4 = vld1q_f32(pad_filter3_c4);
            float32x4_t _pad_filter3_c5 = vld1q_f32(pad_filter3_c5);
            float32x4_t _pad_filter3_c6 = vld1q_f32(pad_filter3_c6);
            float32x4_t _pad_filter3_c7 = vld1q_f32(pad_filter3_c7);
            float32x4_t _pad_filter3_c8 = vld1q_f32(pad_filter3_c8);

            _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3);
            _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2);
            _sum1_c3 = vmlaq_f32(_sum1_c3, _in_ptr3, _pad_filter3_c3);
            _sum1_c4 = vmlaq_f32(_sum1_c4, _in_ptr3, _pad_filter3_c4);
            _sum1_c5 = vmlaq_f32(_sum1_c5, _in_ptr3, _pad_filter3_c5);
            _sum1_c6 = vmlaq_f32(_sum1_c6, _in_ptr3, _pad_filter3_c6);
            _sum1_c7 = vmlaq_f32(_sum1_c7, _in_ptr3, _pad_filter3_c7);
            _sum1_c8 = vmlaq_f32(_sum1_c8, _in_ptr3, _pad_filter3_c8);

            _sum1 = vsetq_lane_f32(sum1, _sum1, 3);
            _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3);
            _sum1_c3 = vsetq_lane_f32(sum1_c3, _sum1_c3, 3);
            _sum1_c4 = vsetq_lane_f32(sum1_c4, _sum1_c4, 3);
            _sum1_c5 = vsetq_lane_f32(sum1_c5, _sum1_c5, 3);
            _sum1_c6 = vsetq_lane_f32(sum1_c6, _sum1_c6, 3);
            _sum1_c7 = vsetq_lane_f32(sum1_c7, _sum1_c7, 3);
            _sum1_c8 = vsetq_lane_f32(sum1_c8, _sum1_c8, 3);

            float32x2_t _ss1 =
                vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
            float32x2_t _ss1_2 =
                vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2));
            float32x2_t _ss1_3 =
                vadd_f32(vget_low_f32(_sum1_c3), vget_high_f32(_sum1_c3));
            float32x2_t _ss1_4 =
                vadd_f32(vget_low_f32(_sum1_c4), vget_high_f32(_sum1_c4));
            float32x2_t _ss1_5 =
                vadd_f32(vget_low_f32(_sum1_c5), vget_high_f32(_sum1_c5));
            float32x2_t _ss1_6 =
                vadd_f32(vget_low_f32(_sum1_c6), vget_high_f32(_sum1_c6));
            float32x2_t _ss1_7 =
                vadd_f32(vget_low_f32(_sum1_c7), vget_high_f32(_sum1_c7));
            float32x2_t _ss1_8 =
                vadd_f32(vget_low_f32(_sum1_c8), vget_high_f32(_sum1_c8));

            float32x2_t _ssss1_ssss1_2 = vpadd_f32(_ss1, _ss1_2);
            float32x2_t _ssss1_3_ssss1_4 = vpadd_f32(_ss1_3, _ss1_4);
            float32x2_t _ssss1_5_ssss1_6 = vpadd_f32(_ss1_5, _ss1_6);
            float32x2_t _ssss1_7_ssss1_8 = vpadd_f32(_ss1_7, _ss1_8);

            sum1 += vget_lane_f32(_ssss1_ssss1_2, 0);
            sum1_c2 += vget_lane_f32(_ssss1_ssss1_2, 1);
            sum1_c3 += vget_lane_f32(_ssss1_3_ssss1_4, 0);
            sum1_c4 += vget_lane_f32(_ssss1_3_ssss1_4, 1);
            sum1_c5 += vget_lane_f32(_ssss1_5_ssss1_6, 0);
            sum1_c6 += vget_lane_f32(_ssss1_5_ssss1_6, 1);
            sum1_c7 += vget_lane_f32(_ssss1_7_ssss1_8, 0);
            sum1_c8 += vget_lane_f32(_ssss1_7_ssss1_8, 1);
#else
            sum1 += in_ptr1[0] * pad_filter1[0];
            sum1 += in_ptr1[1] * pad_filter1[1];
            sum1 += in_ptr1[2] * pad_filter1[2];
            sum1 += in_ptr2[0] * pad_filter2[0];
            sum1 += in_ptr2[1] * pad_filter2[1];
            sum1 += in_ptr2[2] * pad_filter2[2];
            sum1 += in_ptr3[0] * pad_filter3[0];
            sum1 += in_ptr3[1] * pad_filter3[1];
            sum1 += in_ptr3[2] * pad_filter3[2];

            sum1_c2 += in_ptr1[0] * pad_filter1_c2[0];
            sum1_c2 += in_ptr1[1] * pad_filter1_c2[1];
            sum1_c2 += in_ptr1[2] * pad_filter1_c2[2];
            sum1_c2 += in_ptr2[0] * pad_filter2_c2[0];
            sum1_c2 += in_ptr2[1] * pad_filter2_c2[1];
            sum1_c2 += in_ptr2[2] * pad_filter2_c2[2];
            sum1_c2 += in_ptr3[0] * pad_filter3_c2[0];
            sum1_c2 += in_ptr3[1] * pad_filter3_c2[1];
            sum1_c2 += in_ptr3[2] * pad_filter3_c2[2];

            sum1_c3 += in_ptr1[0] * pad_filter1_c3[0];
            sum1_c3 += in_ptr1[1] * pad_filter1_c3[1];
            sum1_c3 += in_ptr1[2] * pad_filter1_c3[2];
            sum1_c3 += in_ptr2[0] * pad_filter2_c3[0];
            sum1_c3 += in_ptr2[1] * pad_filter2_c3[1];
            sum1_c3 += in_ptr2[2] * pad_filter2_c3[2];
            sum1_c3 += in_ptr3[0] * pad_filter3_c3[0];
            sum1_c3 += in_ptr3[1] * pad_filter3_c3[1];
            sum1_c3 += in_ptr3[2] * pad_filter3_c3[2];

            sum1_c4 += in_ptr1[0] * pad_filter1_c4[0];
            sum1_c4 += in_ptr1[1] * pad_filter1_c4[1];
            sum1_c4 += in_ptr1[2] * pad_filter1_c4[2];
            sum1_c4 += in_ptr2[0] * pad_filter2_c4[0];
            sum1_c4 += in_ptr2[1] * pad_filter2_c4[1];
            sum1_c4 += in_ptr2[2] * pad_filter2_c4[2];
            sum1_c4 += in_ptr3[0] * pad_filter3_c4[0];
            sum1_c4 += in_ptr3[1] * pad_filter3_c4[1];
            sum1_c4 += in_ptr3[2] * pad_filter3_c4[2];

            sum1_c5 += in_ptr1[0] * pad_filter1_c5[0];
            sum1_c5 += in_ptr1[1] * pad_filter1_c5[1];
            sum1_c5 += in_ptr1[2] * pad_filter1_c5[2];
            sum1_c5 += in_ptr2[0] * pad_filter2_c5[0];
            sum1_c5 += in_ptr2[1] * pad_filter2_c5[1];
            sum1_c5 += in_ptr2[2] * pad_filter2_c5[2];
            sum1_c5 += in_ptr3[0] * pad_filter3_c5[0];
            sum1_c5 += in_ptr3[1] * pad_filter3_c5[1];
            sum1_c5 += in_ptr3[2] * pad_filter3_c5[2];

            sum1_c6 += in_ptr1[0] * pad_filter1_c6[0];
            sum1_c6 += in_ptr1[1] * pad_filter1_c6[1];
            sum1_c6 += in_ptr1[2] * pad_filter1_c6[2];
            sum1_c6 += in_ptr2[0] * pad_filter2_c6[0];
            sum1_c6 += in_ptr2[1] * pad_filter2_c6[1];
            sum1_c6 += in_ptr2[2] * pad_filter2_c6[2];
            sum1_c6 += in_ptr3[0] * pad_filter3_c6[0];
            sum1_c6 += in_ptr3[1] * pad_filter3_c6[1];
            sum1_c6 += in_ptr3[2] * pad_filter3_c6[2];

            sum1_c7 += in_ptr1[0] * pad_filter1_c7[0];
            sum1_c7 += in_ptr1[1] * pad_filter1_c7[1];
            sum1_c7 += in_ptr1[2] * pad_filter1_c7[2];
            sum1_c7 += in_ptr2[0] * pad_filter2_c7[0];
            sum1_c7 += in_ptr2[1] * pad_filter2_c7[1];
            sum1_c7 += in_ptr2[2] * pad_filter2_c7[2];
            sum1_c7 += in_ptr3[0] * pad_filter3_c7[0];
            sum1_c7 += in_ptr3[1] * pad_filter3_c7[1];
            sum1_c7 += in_ptr3[2] * pad_filter3_c7[2];

            sum1_c8 += in_ptr1[0] * pad_filter1_c8[0];
            sum1_c8 += in_ptr1[1] * pad_filter1_c8[1];
            sum1_c8 += in_ptr1[2] * pad_filter1_c8[2];
            sum1_c8 += in_ptr2[0] * pad_filter2_c8[0];
            sum1_c8 += in_ptr2[1] * pad_filter2_c8[1];
            sum1_c8 += in_ptr2[2] * pad_filter2_c8[2];
            sum1_c8 += in_ptr3[0] * pad_filter3_c8[0];
            sum1_c8 += in_ptr3[1] * pad_filter3_c8[1];
            sum1_c8 += in_ptr3[2] * pad_filter3_c8[2];
#endif
            if (if_nopadding) {
              in_ptr1 += 2;
              in_ptr2 += 2;
              in_ptr3 += 2;
            } else if (input_w > 3 &&
                       (if_odd_pad_w && o_w == valid_w_start ||
                        o_w == valid_w_end && if_odd_pad_w && if_exact_in_w ||
                        o_w == valid_w_end + 1 && !if_odd_pad_w &&
                            !if_exact_in_w)) {
              pad_filter1--;
              pad_filter2--;
              pad_filter3--;

              pad_filter1_c2--;
              pad_filter2_c2--;
              pad_filter3_c2--;

              pad_filter1_c3--;
              pad_filter2_c3--;
              pad_filter3_c3--;

              pad_filter1_c4--;
              pad_filter2_c4--;
              pad_filter3_c4--;

              pad_filter1_c5--;
              pad_filter2_c5--;
              pad_filter3_c5--;

              pad_filter1_c6--;
              pad_filter2_c6--;
              pad_filter3_c6--;

              pad_filter1_c7--;
              pad_filter2_c7--;
              pad_filter3_c7--;

              pad_filter1_c8--;
              pad_filter2_c8--;
              pad_filter3_c8--;

              in_ptr1++;
              in_ptr2++;
              in_ptr3++;
            } else if (input_w <= 3 || o_w < valid_w_start ||
                       o_w > valid_w_end) {
              pad_filter1 -= 2;
              pad_filter2 -= 2;
              pad_filter3 -= 2;

              pad_filter1_c2 -= 2;
              pad_filter2_c2 -= 2;
              pad_filter3_c2 -= 2;

              pad_filter1_c3 -= 2;
              pad_filter2_c3 -= 2;
              pad_filter3_c3 -= 2;

              pad_filter1_c4 -= 2;
              pad_filter2_c4 -= 2;
              pad_filter3_c4 -= 2;

              pad_filter1_c5 -= 2;
              pad_filter2_c5 -= 2;
              pad_filter3_c5 -= 2;

              pad_filter1_c6 -= 2;
              pad_filter2_c6 -= 2;
              pad_filter3_c6 -= 2;

              pad_filter1_c7 -= 2;
              pad_filter2_c7 -= 2;
              pad_filter3_c7 -= 2;

              pad_filter1_c8 -= 2;
              pad_filter2_c8 -= 2;
              pad_filter3_c8 -= 2;
            } else {
              in_ptr1 += 2;
              in_ptr2 += 2;
              in_ptr3 += 2;
            }
            *out_ptr1 += sum1;
            *out_ptr1_c2 += sum1_c2;
            *out_ptr1_c3 += sum1_c3;
            *out_ptr1_c4 += sum1_c4;
            *out_ptr1_c5 += sum1_c5;
            *out_ptr1_c6 += sum1_c6;
            *out_ptr1_c7 += sum1_c7;
            *out_ptr1_c8 += sum1_c8;

            out_ptr1++;
            out_ptr1_c2++;
            out_ptr1_c3++;
            out_ptr1_c4++;
            out_ptr1_c5++;
            out_ptr1_c6++;
            out_ptr1_c7++;
            out_ptr1_c8++;
          }
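          // For stride 2 the valid-region assembly uses ld2/vld2 to
          // deinterleave even and odd input columns, so each filter tap
          // again becomes a plain vector multiply-accumulate over four
          // consecutive outputs.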
          // valid
#if __ARM_NEON
#if __aarch64__
          if (o_h > valid_h_start && o_h <= valid_h_end) {
            int loop = (valid_w_end - valid_w_start - 1) >> 2;
            o_w += loop * 4;

            if (loop > 0) {
              asm volatile(
                  "prfm  pldl1keep, [%[f1], #256]            \n\t"
                  "prfm  pldl1keep, [%[in_ptr1], #288]       \n\t"

                  "ld1  {v0.4s, v1.4s}, [%[f1]], #32         \n\t"
                  "ld2  {v4.4s, v5.4s}, [%[in_ptr1]], #32    \n\t"
                  "ld2  {v6.4s, v7.4s}, [%[in_ptr1]]         \n\t"

                  "0:                                        \n\t"
                  // load out_ptr
                  "prfm  pldl1keep, [%[out_ptr1], #128]      \n\t"
                  "prfm  pldl1keep, [%[out_ptr1_c2], #128]   \n\t"
                  "prfm  pldl1keep, [%[out_ptr1_c3], #128]   \n\t"
                  "prfm  pldl1keep, [%[out_ptr1_c4], #128]   \n\t"
                  "prfm  pldl1keep, [%[out_ptr1_c5], #128]   \n\t"
                  "prfm  pldl1keep, [%[out_ptr1_c6], #128]   \n\t"
                  "prfm  pldl1keep, [%[out_ptr1_c7], #128]   \n\t"
                  "prfm  pldl1keep, [%[out_ptr1_c8], #128]   \n\t"

                  "ld1  {v8.4s}, [%[out_ptr1]]               \n\t"
                  "ld1  {v9.4s}, [%[out_ptr1_c2]]            \n\t"
                  "ld1  {v10.4s}, [%[out_ptr1_c3]]           \n\t"
                  "ld1  {v11.4s}, [%[out_ptr1_c4]]           \n\t"
                  "ld1  {v12.4s}, [%[out_ptr1_c5]]           \n\t"
                  "ld1  {v13.4s}, [%[out_ptr1_c6]]           \n\t"
                  "ld1  {v14.4s}, [%[out_ptr1_c7]]           \n\t"
                  "ld1  {v15.4s}, [%[out_ptr1_c8]]           \n\t"

                  // in_ptr1 multiply
                  "prfm  pldl1keep, [%[f1], #256]            \n\t"
                  "ld1  {v2.4s, v3.4s}, [%[f1]], #32         \n\t"
                  "fmla  v8.4s, v4.4s, v0.s[0]               \n\t"
                  "fmla  v9.4s, v4.4s, v0.s[1]               \n\t"
                  "fmla  v10.4s, v4.4s, v0.s[2]              \n\t"
                  "fmla  v11.4s, v4.4s, v0.s[3]              \n\t"
                  "fmla  v12.4s, v4.4s, v1.s[0]              \n\t"
                  "fmla  v13.4s, v4.4s, v1.s[1]              \n\t"
                  "fmla  v14.4s, v4.4s, v1.s[2]              \n\t"
                  "fmla  v15.4s, v4.4s, v1.s[3]              \n\t"

                  "ext  v7.16b, v4.16b, v6.16b, #4           \n\t"
                  "fmla  v8.4s, v5.4s, v2.s[0]               \n\t"
                  "fmla  v9.4s, v5.4s, v2.s[1]               \n\t"
                  "fmla  v10.4s, v5.4s, v2.s[2]              \n\t"
                  "fmla  v11.4s, v5.4s, v2.s[3]              \n\t"

                  "prfm  pldl1keep, [%[f1], #256]            \n\t"
                  "ld1  {v0.4s, v1.4s}, [%[f1]], #32         \n\t"
                  "fmla  v12.4s, v5.4s, v3.s[0]              \n\t"
                  "fmla  v13.4s, v5.4s, v3.s[1]              \n\t"
                  "fmla  v14.4s, v5.4s, v3.s[2]              \n\t"
                  "fmla  v15.4s, v5.4s, v3.s[3]              \n\t"

                  "prfm  pldl1keep, [%[in_ptr2], #288]       \n\t"
                  "ld2  {v4.4s, v5.4s}, [%[in_ptr2]], #32    \n\t"
                  "fmla  v8.4s, v7.4s, v0.s[0]               \n\t"
                  "fmla  v9.4s, v7.4s, v0.s[1]               \n\t"
                  "fmla  v10.4s, v7.4s, v0.s[2]              \n\t"
                  "fmla  v11.4s, v7.4s, v0.s[3]              \n\t"

                  "prfm  pldl1keep, [%[f1], #256]            \n\t"
                  "ld1  {v2.4s, v3.4s}, [%[f1]], #32         \n\t"
                  "fmla  v12.4s, v7.4s, v1.s[0]              \n\t"
                  "fmla  v13.4s, v7.4s, v1.s[1]              \n\t"
                  "fmla  v14.4s, v7.4s, v1.s[2]              \n\t"
                  "fmla  v15.4s, v7.4s, v1.s[3]              \n\t"

                  // in_ptr2 multiply
                  "ld2  {v6.4s, v7.4s}, [%[in_ptr2]]         \n\t"
                  "fmla  v8.4s, v4.4s, v2.s[0]               \n\t"
                  "fmla  v9.4s, v4.4s, v2.s[1]               \n\t"
                  "fmla  v10.4s, v4.4s, v2.s[2]              \n\t"
                  "fmla  v11.4s, v4.4s, v2.s[3]              \n\t"

                  "prfm  pldl1keep, [%[f1], #256]            \n\t"
                  "ld1  {v0.4s, v1.4s}, [%[f1]], #32         \n\t"
                  "fmla  v12.4s, v4.4s, v3.s[0]              \n\t"
                  "fmla  v13.4s, v4.4s, v3.s[1]              \n\t"
                  "fmla  v14.4s, v4.4s, v3.s[2]              \n\t"
                  "fmla  v15.4s, v4.4s, v3.s[3]              \n\t"

                  "ext  v7.16b, v4.16b, v6.16b, #4           \n\t"
                  "fmla  v8.4s, v5.4s, v0.s[0]               \n\t"
                  "fmla  v9.4s, v5.4s, v0.s[1]               \n\t"
                  "fmla  v10.4s, v5.4s, v0.s[2]              \n\t"
                  "fmla  v11.4s, v5.4s, v0.s[3]              \n\t"

                  "prfm  pldl1keep, [%[f1], #256]            \n\t"
                  "ld1  {v2.4s, v3.4s}, [%[f1]], #32         \n\t"
                  "fmla  v12.4s, v5.4s, v1.s[0]              \n\t"
                  "fmla  v13.4s, v5.4s, v1.s[1]              \n\t"

                  "prfm  pldl1keep, [%[f1], #256]            \n\t"
                  "prfm  pldl1keep, [%[in_ptr3], #288]       \n\t"
                  "fmla  v14.4s, v5.4s, v1.s[2]              \n\t"
                  "fmla  v15.4s, v5.4s, v1.s[3]              \n\t"

                  "ld1  {v0.4s, v1.4s}, [%[f1]], #32         \n\t"
                  "ld2  {v4.4s, v5.4s}, [%[in_ptr3]], #32    \n\t"
                  "fmla  v8.4s, v7.4s, v2.s[0]               \n\t"
                  "fmla  v9.4s, v7.4s, v2.s[1]               \n\t"
                  "fmla  v10.4s, v7.4s, v2.s[2]              \n\t"
                  "fmla  v11.4s, v7.4s, v2.s[3]              \n\t"
                  "fmla  v12.4s, v7.4s, v3.s[0]              \n\t"
                  "fmla  v13.4s, v7.4s, v3.s[1]              \n\t"
                  "fmla  v14.4s, v7.4s, v3.s[2]              \n\t"
                  "fmla  v15.4s, v7.4s, v3.s[3]              \n\t"

                  // in_ptr3 multiply
                  "ld2  {v6.4s, v7.4s}, [%[in_ptr3]]         \n\t"
                  "fmla  v8.4s, v4.4s, v0.s[0]               \n\t"
                  "fmla  v9.4s, v4.4s, v0.s[1]               \n\t"
                  "fmla  v10.4s, v4.4s, v0.s[2]              \n\t"
                  "fmla  v11.4s, v4.4s, v0.s[3]              \n\t"

                  "prfm  pldl1keep, [%[f1], #256]            \n\t"
                  "ld1  {v2.4s, v3.4s}, [%[f1]], #32         \n\t"
                  "fmla  v12.4s, v4.4s, v1.s[0]              \n\t"
                  "fmla  v13.4s, v4.4s, v1.s[1]              \n\t"
                  "fmla  v14.4s, v4.4s, v1.s[2]              \n\t"
                  "fmla  v15.4s, v4.4s, v1.s[3]              \n\t"

                  "ext  v7.16b, v4.16b, v6.16b, #4           \n\t"
                  "fmla  v8.4s, v5.4s, v2.s[0]               \n\t"
                  "fmla  v9.4s, v5.4s, v2.s[1]               \n\t"
                  "fmla  v10.4s, v5.4s, v2.s[2]              \n\t"
                  "fmla  v11.4s, v5.4s, v2.s[3]              \n\t"

                  "prfm  pldl1keep, [%[f1], #256]            \n\t"
                  "ld1  {v0.4s, v1.4s}, [%[f1]], #32         \n\t"
                  "fmla  v12.4s, v5.4s, v3.s[0]              \n\t"
                  "fmla  v13.4s, v5.4s, v3.s[1]              \n\t"
                  "fmla  v14.4s, v5.4s, v3.s[2]              \n\t"
                  "fmla  v15.4s, v5.4s, v3.s[3]              \n\t"

                  "sub  %[f1], %[f1], #288                   \n\t"
                  "fmla  v8.4s, v7.4s, v0.s[0]               \n\t"
                  "fmla  v9.4s, v7.4s, v0.s[1]               \n\t"
                  "fmla  v10.4s, v7.4s, v0.s[2]              \n\t"
                  "fmla  v11.4s, v7.4s, v0.s[3]              \n\t"
                  "fmla  v12.4s, v7.4s, v1.s[0]              \n\t"
                  "fmla  v13.4s, v7.4s, v1.s[1]              \n\t"
                  "fmla  v14.4s, v7.4s, v1.s[2]              \n\t"
                  "fmla  v15.4s, v7.4s, v1.s[3]              \n\t"

                  // store out_ptr
                  "prfm  pldl1keep, [%[f1], #256]            \n\t"
                  "prfm  pldl1keep, [%[in_ptr1], #288]       \n\t"
                  "ld1  {v0.4s, v1.4s}, [%[f1]], #32         \n\t"
                  "ld2  {v4.4s, v5.4s}, [%[in_ptr1]], #32    \n\t"

                  "st1  {v8.4s}, [%[out_ptr1]], #16          \n\t"
                  "st1  {v9.4s}, [%[out_ptr1_c2]], #16       \n\t"
                  "st1  {v10.4s}, [%[out_ptr1_c3]], #16      \n\t"
                  "st1  {v11.4s}, [%[out_ptr1_c4]], #16      \n\t"
                  "st1  {v12.4s}, [%[out_ptr1_c5]], #16      \n\t"
                  "st1  {v13.4s}, [%[out_ptr1_c6]], #16      \n\t"
                  "ld2  {v6.4s, v7.4s}, [%[in_ptr1]]         \n\t"
                  "st1  {v14.4s}, [%[out_ptr1_c7]], #16      \n\t"
                  "subs  %[loop], %[loop], #1                \n\t"
                  "st1  {v15.4s}, [%[out_ptr1_c8]], #16      \n\t"

                  // cycle
                  "bne  0b                                   \n\t"
                  "sub  %[f1], %[in_ptr1], #32               \n\t"
                  "sub  %[in_ptr1], %[in_ptr1], #32          \n\t"

                  : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1),
                    [out_ptr1_c2] "+r"(out_ptr1_c2),
                    [out_ptr1_c3] "+r"(out_ptr1_c3),
                    [out_ptr1_c4] "+r"(out_ptr1_c4),
                    [out_ptr1_c5] "+r"(out_ptr1_c5),
                    [out_ptr1_c6] "+r"(out_ptr1_c6),
                    [out_ptr1_c7] "+r"(out_ptr1_c7),
                    [out_ptr1_c8] "+r"(out_ptr1_c8), [in_ptr1] "+r"(in_ptr1),
                    [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3)
                  : [f1] "r"(f1)
                  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
                    "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
                    "v15");
            }
          }
#else
        if (o_h > valid_h_start && o_h <= valid_h_end) {
          int loop = (valid_w_end - valid_w_start - 1) >> 2;
          o_w += loop * 4;
          int in_stride = (input_w - 8) * 4;

          if (loop > 0) {
            asm volatile(
"pld [%[f1], #256]
\n\t
"
"pld [%[in_ptr1], #288]
\n\t
"
"vld1.f32 {d0-d3}, [%[f1]]!
\n\t
"
"vld2.f32 {d8-d11}, [%[in_ptr1]]!
\n\t
"
"vld2.f32 {d12, d13}, [%[in_ptr1]]
\n\t
"
"add %[in_ptr1], %[in_stride]
\n\t
"
"0:
\n\t
"
// load out_ptr
"pld [%[out_ptr1], #128]
\n\t
"
"pld [%[out_ptr1_c2], #128]
\n\t
"
"pld [%[out_ptr1_c3], #128]
\n\t
"
"pld [%[out_ptr1_c4], #128]
\n\t
"
"pld [%[out_ptr1_c5], #128]
\n\t
"
"pld [%[out_ptr1_c6], #128]
\n\t
"
"pld [%[out_ptr1_c7], #128]
\n\t
"
"pld [%[out_ptr1_c8], #128]
\n\t
"
"vld1.f32 {d16, d17}, [%[out_ptr1]]
\n\t
"
"vld1.f32 {d18, d19}, [%[out_ptr1_c2]]
\n\t
"
"vld1.f32 {d20, d21}, [%[out_ptr1_c3]]
\n\t
"
"vld1.f32 {d22, d23}, [%[out_ptr1_c4]]
\n\t
"
"vld1.f32 {d24, d25}, [%[out_ptr1_c5]]
\n\t
"
"vld1.f32 {d26, d27}, [%[out_ptr1_c6]]
\n\t
"
"vld1.f32 {d28, d29}, [%[out_ptr1_c7]]
\n\t
"
"vld1.f32 {d30, d31}, [%[out_ptr1_c8]]
\n\t
"
// in_ptr1 multiply
"pld [%[f1], #256]
\n\t
"
"vld1.f32 {d4-d7}, [%[f1]]!
\n\t
"
"vmla.f32 q8, q4, d0[0]
\n\t
"
"vmla.f32 q9, q4, d0[1]
\n\t
"
"vmla.f32 q10, q4, d1[0]
\n\t
"
"vmla.f32 q11, q4, d1[1]
\n\t
"
"vmla.f32 q12, q4, d2[0]
\n\t
"
"vmla.f32 q13, q4, d2[1]
\n\t
"
"pld [%[f1], #256]
\n\t
"
"vmla.f32 q14, q4, d3[0]
\n\t
"
"vmla.f32 q15, q4, d3[1]
\n\t
"
"vld1.f32 {d0-d3}, [%[f1]]!
\n\t
"
"vmla.f32 q8, q5, d4[0]
\n\t
"
"vmla.f32 q9, q5, d4[1]
\n\t
"
"vext.32 q7, q4, q6, #1
\n\t
"
"vmla.f32 q10, q5, d5[0]
\n\t
"
"vmla.f32 q11, q5, d5[1]
\n\t
"
"vmla.f32 q12, q5, d6[0]
\n\t
"
"vmla.f32 q13, q5, d6[1]
\n\t
"
"pld [%[in_ptr1], #288]
\n\t
"
"vmla.f32 q14, q5, d7[0]
\n\t
"
"vmla.f32 q15, q5, d7[1]
\n\t
"
"vld2.f32 {d8-d11}, [%[in_ptr1]]!
\n\t
"
"vmla.f32 q8, q7, d0[0]
\n\t
"
"vmla.f32 q9, q7, d0[1]
\n\t
"
"pld [%[f1], #256]
\n\t
"
"vld1.f32 {d4-d7}, [%[f1]]!
\n\t
"
"vmla.f32 q10, q7, d1[0]
\n\t
"
"vmla.f32 q11, q7, d1[1]
\n\t
"
"vld2.f32 {d12, d13}, [%[in_ptr1]]
\n\t
"
"add %[in_ptr1], %[in_stride]
\n\t
"
"vmla.f32 q12, q7, d2[0]
\n\t
"
"vmla.f32 q13, q7, d2[1]
\n\t
"
"pld [%[f1], #256]
\n\t
"
"vmla.f32 q14, q7, d3[0]
\n\t
"
"vmla.f32 q15, q7, d3[1]
\n\t
"
// in_ptr2 multiply
"vld1.f32 {d0-d3}, [%[f1]]!
\n\t
"
"vmla.f32 q8, q4, d4[0]
\n\t
"
"vmla.f32 q9, q4, d4[1]
\n\t
"
"vmla.f32 q10, q4, d5[0]
\n\t
"
"vmla.f32 q11, q4, d5[1]
\n\t
"
"vmla.f32 q12, q4, d6[0]
\n\t
"
"vmla.f32 q13, q4, d6[1]
\n\t
"
"pld [%[f1], #256]
\n\t
"
"vmla.f32 q14, q4, d7[0]
\n\t
"
"vmla.f32 q15, q4, d7[1]
\n\t
"
"vld1.f32 {d4-d7}, [%[f1]]!
\n\t
"
"vmla.f32 q8, q5, d0[0]
\n\t
"
"vmla.f32 q9, q5, d0[1]
\n\t
"
"vext.32 q7, q4, q6, #1
\n\t
"
"vmla.f32 q10, q5, d1[0]
\n\t
"
"vmla.f32 q11, q5, d1[1]
\n\t
"
"vmla.f32 q12, q5, d2[0]
\n\t
"
"vmla.f32 q13, q5, d2[1]
\n\t
"
"pld [%[in_ptr1], #288]
\n\t
"
"vmla.f32 q14, q5, d3[0]
\n\t
"
"vmla.f32 q15, q5, d3[1]
\n\t
"
"vld2.f32 {d8-d11}, [%[in_ptr1]]!
\n\t
"
"vmla.f32 q8, q7, d4[0]
\n\t
"
"vmla.f32 q9, q7, d4[1]
\n\t
"
"pld [%[f1], #256]
\n\t
"
"vld1.f32 {d0-d3}, [%[f1]]!
\n\t
"
"vmla.f32 q10, q7, d5[0]
\n\t
"
"vmla.f32 q11, q7, d5[1]
\n\t
"
"vld2.f32 {d12, d13}, [%[in_ptr1]]
\n\t
"
"sub %[in_ptr1], %[in_stride]
\n\t
"
"sub %[in_ptr1], %[in_stride]
\n\t
"
"vmla.f32 q12, q7, d6[0]
\n\t
"
"vmla.f32 q13, q7, d6[1]
\n\t
"
"sub %[in_ptr1], #64
\n\t
"
"pld [%[f1], #256]
\n\t
"
"vmla.f32 q14, q7, d7[0]
\n\t
"
"vmla.f32 q15, q7, d7[1]
\n\t
"
// in_ptr3 multiply
"vld1.f32 {d4-d7}, [%[f1]]!
\n\t
"
"vmla.f32 q8, q4, d0[0]
\n\t
"
"vmla.f32 q9, q4, d0[1]
\n\t
"
"vmla.f32 q10, q4, d1[0]
\n\t
"
"vmla.f32 q11, q4, d1[1]
\n\t
"
"vmla.f32 q12, q4, d2[0]
\n\t
"
"vmla.f32 q13, q4, d2[1]
\n\t
"
"pld [%[f1], #256]
\n\t
"
"vmla.f32 q14, q4, d3[0]
\n\t
"
"vmla.f32 q15, q4, d3[1]
\n\t
"
"vld1.f32 {d0-d3}, [%[f1]]!
\n\t
"
"vmla.f32 q8, q5, d4[0]
\n\t
"
"vmla.f32 q9, q5, d4[1]
\n\t
"
"vext.32 q7, q4, q6, #1
\n\t
"
"vmla.f32 q10, q5, d5[0]
\n\t
"
"vmla.f32 q11, q5, d5[1]
\n\t
"
"vmla.f32 q12, q5, d6[0]
\n\t
"
"vmla.f32 q13, q5, d6[1]
\n\t
"
"vmla.f32 q14, q5, d7[0]
\n\t
"
"vmla.f32 q15, q5, d7[1]
\n\t
"
"sub %[f1], %[f1], #288
\n\t
"
"vmla.f32 q8, q7, d0[0]
\n\t
"
"vmla.f32 q9, q7, d0[1]
\n\t
"
"vmla.f32 q10, q7, d1[0]
\n\t
"
"vmla.f32 q11, q7, d1[1]
\n\t
"
"vmla.f32 q12, q7, d2[0]
\n\t
"
"vmla.f32 q13, q7, d2[1]
\n\t
"
"vmla.f32 q14, q7, d3[0]
\n\t
"
"vmla.f32 q15, q7, d3[1]
\n\t
"
// store out_ptr
"pld [%[f1], #256]
\n\t
"
"vld1.f32 {d0-d3}, [%[f1]]!
\n\t
"
"pld [%[in_ptr1], #288]
\n\t
"
"vld2.f32 {d8-d11}, [%[in_ptr1]]!
\n\t
"
"vst1.f32 {d16, d17}, [%[out_ptr1]]!
\n\t
"
"vst1.f32 {d18, d19}, [%[out_ptr1_c2]]!
\n\t
"
"vst1.f32 {d20, d21}, [%[out_ptr1_c3]]!
\n\t
"
"vst1.f32 {d22, d23}, [%[out_ptr1_c4]]!
\n\t
"
"vst1.f32 {d24, d25}, [%[out_ptr1_c5]]!
\n\t
"
"vst1.f32 {d26, d27}, [%[out_ptr1_c6]]!
\n\t
"
"vld2.f32 {d12, d13}, [%[in_ptr1]]
\n\t
"
"add %[in_ptr1], %[in_stride]
\n\t
"
"vst1.f32 {d28, d29}, [%[out_ptr1_c7]]!
\n\t
"
"subs %[loop], #1
\n\t
"
"vst1.f32 {d30, d31}, [%[out_ptr1_c8]]!
\n\t
"
// cycle
"bne 0b
\n\t
"
"sub %[f1], %[f1], #32
\n\t
"
"sub %[in_ptr1], %[in_ptr1], #32
\n\t
"
"sub %[in_ptr1], %[in_stride]
\n\t
"
            : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1),
              [out_ptr1_c2] "+r"(out_ptr1_c2), [out_ptr1_c3] "+r"(out_ptr1_c3),
              [out_ptr1_c4] "+r"(out_ptr1_c4), [out_ptr1_c5] "+r"(out_ptr1_c5),
              [out_ptr1_c6] "+r"(out_ptr1_c6), [out_ptr1_c7] "+r"(out_ptr1_c7),
              [out_ptr1_c8] "+r"(out_ptr1_c8), [in_ptr1] "+r"(in_ptr1)
            : [f1] "r"(f1), [in_stride] "r"(in_stride)
            : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
            in_ptr2 = in_ptr1 + input_w;
            in_ptr3 = in_ptr2 + input_w;
          }
        }
#endif //__aarch64__
#endif // __ARM_NEON
// remain output_width
      for (; o_w < output_w; ++o_w) {
        float sum1 = 0;
        float sum1_c2 = 0;
        float sum1_c3 = 0;
        float sum1_c4 = 0;
        float sum1_c5 = 0;
        float sum1_c6 = 0;
        float sum1_c7 = 0;
        float sum1_c8 = 0;
#if __ARM_NEON
        float32x4_t _in_ptr1 = vld1q_f32(in_ptr1);
        float32x4_t _pad_filter1 = vld1q_f32(pad_filter1);
        float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2);
        float32x4_t _pad_filter1_c3 = vld1q_f32(pad_filter1_c3);
        float32x4_t _pad_filter1_c4 = vld1q_f32(pad_filter1_c4);
        float32x4_t _pad_filter1_c5 = vld1q_f32(pad_filter1_c5);
        float32x4_t _pad_filter1_c6 = vld1q_f32(pad_filter1_c6);
        float32x4_t _pad_filter1_c7 = vld1q_f32(pad_filter1_c7);
        float32x4_t _pad_filter1_c8 = vld1q_f32(pad_filter1_c8);

        float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1);
        float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2);
        float32x4_t _sum1_c3 = vmulq_f32(_in_ptr1, _pad_filter1_c3);
        float32x4_t _sum1_c4 = vmulq_f32(_in_ptr1, _pad_filter1_c4);
        float32x4_t _sum1_c5 = vmulq_f32(_in_ptr1, _pad_filter1_c5);
        float32x4_t _sum1_c6 = vmulq_f32(_in_ptr1, _pad_filter1_c6);
        float32x4_t _sum1_c7 = vmulq_f32(_in_ptr1, _pad_filter1_c7);
        float32x4_t _sum1_c8 = vmulq_f32(_in_ptr1, _pad_filter1_c8);

        float32x4_t _in_ptr2 = vld1q_f32(in_ptr2);
        float32x4_t _pad_filter2 = vld1q_f32(pad_filter2);
        float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2);
        float32x4_t _pad_filter2_c3 = vld1q_f32(pad_filter2_c3);
        float32x4_t _pad_filter2_c4 = vld1q_f32(pad_filter2_c4);
        float32x4_t _pad_filter2_c5 = vld1q_f32(pad_filter2_c5);
        float32x4_t _pad_filter2_c6 = vld1q_f32(pad_filter2_c6);
        float32x4_t _pad_filter2_c7 = vld1q_f32(pad_filter2_c7);
        float32x4_t _pad_filter2_c8 = vld1q_f32(pad_filter2_c8);

        _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2);
        _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2);
        _sum1_c3 = vmlaq_f32(_sum1_c3, _in_ptr2, _pad_filter2_c3);
        _sum1_c4 = vmlaq_f32(_sum1_c4, _in_ptr2, _pad_filter2_c4);
        _sum1_c5 = vmlaq_f32(_sum1_c5, _in_ptr2, _pad_filter2_c5);
        _sum1_c6 = vmlaq_f32(_sum1_c6, _in_ptr2, _pad_filter2_c6);
        _sum1_c7 = vmlaq_f32(_sum1_c7, _in_ptr2, _pad_filter2_c7);
        _sum1_c8 = vmlaq_f32(_sum1_c8, _in_ptr2, _pad_filter2_c8);

        float32x4_t _in_ptr3 = vld1q_f32(in_ptr3);
        float32x4_t _pad_filter3 = vld1q_f32(pad_filter3);
        float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2);
        float32x4_t _pad_filter3_c3 = vld1q_f32(pad_filter3_c3);
        float32x4_t _pad_filter3_c4 = vld1q_f32(pad_filter3_c4);
        float32x4_t _pad_filter3_c5 = vld1q_f32(pad_filter3_c5);
        float32x4_t _pad_filter3_c6 = vld1q_f32(pad_filter3_c6);
        float32x4_t _pad_filter3_c7 = vld1q_f32(pad_filter3_c7);
        float32x4_t _pad_filter3_c8 = vld1q_f32(pad_filter3_c8);

        _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3);
        _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2);
        _sum1_c3 = vmlaq_f32(_sum1_c3, _in_ptr3, _pad_filter3_c3);
        _sum1_c4 = vmlaq_f32(_sum1_c4, _in_ptr3, _pad_filter3_c4);
        _sum1_c5 = vmlaq_f32(_sum1_c5, _in_ptr3, _pad_filter3_c5);
        _sum1_c6 = vmlaq_f32(_sum1_c6, _in_ptr3, _pad_filter3_c6);
        _sum1_c7 = vmlaq_f32(_sum1_c7, _in_ptr3, _pad_filter3_c7);
        _sum1_c8 = vmlaq_f32(_sum1_c8, _in_ptr3, _pad_filter3_c8);

        _sum1 = vsetq_lane_f32(sum1, _sum1, 3);
        _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3);
        _sum1_c3 = vsetq_lane_f32(sum1_c3, _sum1_c3, 3);
        _sum1_c4 = vsetq_lane_f32(sum1_c4, _sum1_c4, 3);
        _sum1_c5 = vsetq_lane_f32(sum1_c5, _sum1_c5, 3);
        _sum1_c6 = vsetq_lane_f32(sum1_c6, _sum1_c6, 3);
        _sum1_c7 = vsetq_lane_f32(sum1_c7, _sum1_c7, 3);
        _sum1_c8 = vsetq_lane_f32(sum1_c8, _sum1_c8, 3);

        float32x2_t _ss1 =
            vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
        float32x2_t _ss1_2 =
            vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2));
        float32x2_t _ss1_3 =
            vadd_f32(vget_low_f32(_sum1_c3), vget_high_f32(_sum1_c3));
        float32x2_t _ss1_4 =
            vadd_f32(vget_low_f32(_sum1_c4), vget_high_f32(_sum1_c4));
        float32x2_t _ss1_5 =
            vadd_f32(vget_low_f32(_sum1_c5), vget_high_f32(_sum1_c5));
        float32x2_t _ss1_6 =
            vadd_f32(vget_low_f32(_sum1_c6), vget_high_f32(_sum1_c6));
        float32x2_t _ss1_7 =
            vadd_f32(vget_low_f32(_sum1_c7), vget_high_f32(_sum1_c7));
        float32x2_t _ss1_8 =
            vadd_f32(vget_low_f32(_sum1_c8), vget_high_f32(_sum1_c8));

        float32x2_t _ssss1_ssss1_2 = vpadd_f32(_ss1, _ss1_2);
        float32x2_t _ssss1_3_ssss1_4 = vpadd_f32(_ss1_3, _ss1_4);
        float32x2_t _ssss1_5_ssss1_6 = vpadd_f32(_ss1_5, _ss1_6);
        float32x2_t _ssss1_7_ssss1_8 = vpadd_f32(_ss1_7, _ss1_8);

        sum1 += vget_lane_f32(_ssss1_ssss1_2, 0);
        sum1_c2 += vget_lane_f32(_ssss1_ssss1_2, 1);
        sum1_c3 += vget_lane_f32(_ssss1_3_ssss1_4, 0);
        sum1_c4 += vget_lane_f32(_ssss1_3_ssss1_4, 1);
        sum1_c5 += vget_lane_f32(_ssss1_5_ssss1_6, 0);
        sum1_c6 += vget_lane_f32(_ssss1_5_ssss1_6, 1);
        sum1_c7 += vget_lane_f32(_ssss1_7_ssss1_8, 0);
        sum1_c8 += vget_lane_f32(_ssss1_7_ssss1_8, 1);
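        // Note on the reduction above: each _sum1* register holds four
        // partial products for one output channel; vadd_f32 folds the high
        // and low halves together, and vpadd_f32 then packs the horizontal
        // sums of two channels into one float32x2_t, so lane 0 and lane 1
        // feed two different per-channel accumulators.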
#else
        sum1 += in_ptr1[0] * pad_filter1[0];
        sum1 += in_ptr1[1] * pad_filter1[1];
        sum1 += in_ptr1[2] * pad_filter1[2];
        sum1 += in_ptr2[0] * pad_filter2[0];
        sum1 += in_ptr2[1] * pad_filter2[1];
        sum1 += in_ptr2[2] * pad_filter2[2];
        sum1 += in_ptr3[0] * pad_filter3[0];
        sum1 += in_ptr3[1] * pad_filter3[1];
        sum1 += in_ptr3[2] * pad_filter3[2];

        sum1_c2 += in_ptr1[0] * pad_filter1_c2[0];
        sum1_c2 += in_ptr1[1] * pad_filter1_c2[1];
        sum1_c2 += in_ptr1[2] * pad_filter1_c2[2];
        sum1_c2 += in_ptr2[0] * pad_filter2_c2[0];
        sum1_c2 += in_ptr2[1] * pad_filter2_c2[1];
        sum1_c2 += in_ptr2[2] * pad_filter2_c2[2];
        sum1_c2 += in_ptr3[0] * pad_filter3_c2[0];
        sum1_c2 += in_ptr3[1] * pad_filter3_c2[1];
        sum1_c2 += in_ptr3[2] * pad_filter3_c2[2];

        sum1_c3 += in_ptr1[0] * pad_filter1_c3[0];
        sum1_c3 += in_ptr1[1] * pad_filter1_c3[1];
        sum1_c3 += in_ptr1[2] * pad_filter1_c3[2];
        sum1_c3 += in_ptr2[0] * pad_filter2_c3[0];
        sum1_c3 += in_ptr2[1] * pad_filter2_c3[1];
        sum1_c3 += in_ptr2[2] * pad_filter2_c3[2];
        sum1_c3 += in_ptr3[0] * pad_filter3_c3[0];
        sum1_c3 += in_ptr3[1] * pad_filter3_c3[1];
        sum1_c3 += in_ptr3[2] * pad_filter3_c3[2];

        sum1_c4 += in_ptr1[0] * pad_filter1_c4[0];
        sum1_c4 += in_ptr1[1] * pad_filter1_c4[1];
        sum1_c4 += in_ptr1[2] * pad_filter1_c4[2];
        sum1_c4 += in_ptr2[0] * pad_filter2_c4[0];
        sum1_c4 += in_ptr2[1] * pad_filter2_c4[1];
        sum1_c4 += in_ptr2[2] * pad_filter2_c4[2];
        sum1_c4 += in_ptr3[0] * pad_filter3_c4[0];
        sum1_c4 += in_ptr3[1] * pad_filter3_c4[1];
        sum1_c4 += in_ptr3[2] * pad_filter3_c4[2];

        sum1_c5 += in_ptr1[0] * pad_filter1_c5[0];
        sum1_c5 += in_ptr1[1] * pad_filter1_c5[1];
        sum1_c5 += in_ptr1[2] * pad_filter1_c5[2];
        sum1_c5 += in_ptr2[0] * pad_filter2_c5[0];
        sum1_c5 += in_ptr2[1] * pad_filter2_c5[1];
        sum1_c5 += in_ptr2[2] * pad_filter2_c5[2];
        sum1_c5 += in_ptr3[0] * pad_filter3_c5[0];
        sum1_c5 += in_ptr3[1] * pad_filter3_c5[1];
        sum1_c5 += in_ptr3[2] * pad_filter3_c5[2];

        sum1_c6 += in_ptr1[0] * pad_filter1_c6[0];
        sum1_c6 += in_ptr1[1] * pad_filter1_c6[1];
        sum1_c6 += in_ptr1[2] * pad_filter1_c6[2];
        sum1_c6 += in_ptr2[0] * pad_filter2_c6[0];
        sum1_c6 += in_ptr2[1] * pad_filter2_c6[1];
        sum1_c6 += in_ptr2[2] * pad_filter2_c6[2];
        sum1_c6 += in_ptr3[0] * pad_filter3_c6[0];
        sum1_c6 += in_ptr3[1] * pad_filter3_c6[1];
        sum1_c6 += in_ptr3[2] * pad_filter3_c6[2];

        sum1_c7 += in_ptr1[0] * pad_filter1_c7[0];
        sum1_c7 += in_ptr1[1] * pad_filter1_c7[1];
        sum1_c7 += in_ptr1[2] * pad_filter1_c7[2];
        sum1_c7 += in_ptr2[0] * pad_filter2_c7[0];
        sum1_c7 += in_ptr2[1] * pad_filter2_c7[1];
        sum1_c7 += in_ptr2[2] * pad_filter2_c7[2];
        sum1_c7 += in_ptr3[0] * pad_filter3_c7[0];
        sum1_c7 += in_ptr3[1] * pad_filter3_c7[1];
        sum1_c7 += in_ptr3[2] * pad_filter3_c7[2];

        sum1_c8 += in_ptr1[0] * pad_filter1_c8[0];
        sum1_c8 += in_ptr1[1] * pad_filter1_c8[1];
        sum1_c8 += in_ptr1[2] * pad_filter1_c8[2];
        sum1_c8 += in_ptr2[0] * pad_filter2_c8[0];
        sum1_c8 += in_ptr2[1] * pad_filter2_c8[1];
        sum1_c8 += in_ptr2[2] * pad_filter2_c8[2];
        sum1_c8 += in_ptr3[0] * pad_filter3_c8[0];
        sum1_c8 += in_ptr3[1] * pad_filter3_c8[1];
        sum1_c8 += in_ptr3[2] * pad_filter3_c8[2];
#endif
        if (if_nopadding) {
          in_ptr1 += 2;
          in_ptr2 += 2;
          in_ptr3 += 2;
        } else if (input_w > 3 &&
                   (if_odd_pad_w && o_w == valid_w_start ||
                    o_w == valid_w_end && if_odd_pad_w && if_exact_in_w ||
                    o_w == valid_w_end + 1 && !if_odd_pad_w &&
                        !if_exact_in_w)) {
          pad_filter1--;
          pad_filter2--;
          pad_filter3--;
          pad_filter1_c2--;
          pad_filter2_c2--;
          pad_filter3_c2--;
          pad_filter1_c3--;
          pad_filter2_c3--;
          pad_filter3_c3--;
          pad_filter1_c4--;
          pad_filter2_c4--;
          pad_filter3_c4--;
          pad_filter1_c5--;
          pad_filter2_c5--;
          pad_filter3_c5--;
          pad_filter1_c6--;
          pad_filter2_c6--;
          pad_filter3_c6--;
          pad_filter1_c7--;
          pad_filter2_c7--;
          pad_filter3_c7--;
          pad_filter1_c8--;
          pad_filter2_c8--;
          pad_filter3_c8--;

          in_ptr1++;
          in_ptr2++;
          in_ptr3++;
        } else if (input_w <= 3 || o_w < valid_w_start || o_w > valid_w_end) {
          pad_filter1 -= 2;
          pad_filter2 -= 2;
          pad_filter3 -= 2;
          pad_filter1_c2 -= 2;
          pad_filter2_c2 -= 2;
          pad_filter3_c2 -= 2;
          pad_filter1_c3 -= 2;
          pad_filter2_c3 -= 2;
          pad_filter3_c3 -= 2;
          pad_filter1_c4 -= 2;
          pad_filter2_c4 -= 2;
          pad_filter3_c4 -= 2;
          pad_filter1_c5 -= 2;
          pad_filter2_c5 -= 2;
          pad_filter3_c5 -= 2;
          pad_filter1_c6 -= 2;
          pad_filter2_c6 -= 2;
          pad_filter3_c6 -= 2;
          pad_filter1_c7 -= 2;
          pad_filter2_c7 -= 2;
          pad_filter3_c7 -= 2;
          pad_filter1_c8 -= 2;
          pad_filter2_c8 -= 2;
          pad_filter3_c8 -= 2;
        } else {
          in_ptr1 += 2;
          in_ptr2 += 2;
          in_ptr3 += 2;
        }
        *out_ptr1 += sum1;
        *out_ptr1_c2 += sum1_c2;
        *out_ptr1_c3 += sum1_c3;
        *out_ptr1_c4 += sum1_c4;
        *out_ptr1_c5 += sum1_c5;
        *out_ptr1_c6 += sum1_c6;
        *out_ptr1_c7 += sum1_c7;
        *out_ptr1_c8 += sum1_c8;

        out_ptr1++;
        out_ptr1_c2++;
        out_ptr1_c3++;
        out_ptr1_c4++;
        out_ptr1_c5++;
        out_ptr1_c6++;
        out_ptr1_c7++;
        out_ptr1_c8++;
      }
      if (if_nopadding) {
        in_ptr1 += remain_stride_w + input_w;
        in_ptr2 += remain_stride_w + input_w;
        in_ptr3 += remain_stride_w + input_w;
      } else if (input_h > 3 &&
                 (if_odd_pad_h && o_h == valid_h_start ||
                  o_h == valid_h_end && if_odd_pad_h && if_exact_in_h ||
                  o_h == valid_h_end + 1 && !if_odd_pad_h &&
                      !if_exact_in_h)) {
        in_ptr1 += 3;
        in_ptr2 += 3;
        in_ptr3 += 3;

        pad_filter1 -= remain_stride_w;
        pad_filter2 -= remain_stride_w;
        pad_filter3 -= remain_stride_w;
        pad_filter1_c2 -= remain_stride_w;
        pad_filter2_c2 -= remain_stride_w;
        pad_filter3_c2 -= remain_stride_w;
        pad_filter1_c3 -= remain_stride_w;
        pad_filter2_c3 -= remain_stride_w;
        pad_filter3_c3 -= remain_stride_w;
        pad_filter1_c4 -= remain_stride_w;
        pad_filter2_c4 -= remain_stride_w;
        pad_filter3_c4 -= remain_stride_w;
        pad_filter1_c5 -= remain_stride_w;
        pad_filter2_c5 -= remain_stride_w;
        pad_filter3_c5 -= remain_stride_w;
        pad_filter1_c6 -= remain_stride_w;
        pad_filter2_c6 -= remain_stride_w;
        pad_filter3_c6 -= remain_stride_w;
        pad_filter1_c7 -= remain_stride_w;
        pad_filter2_c7 -= remain_stride_w;
        pad_filter3_c7 -= remain_stride_w;
        pad_filter1_c8 -= remain_stride_w;
        pad_filter2_c8 -= remain_stride_w;
        pad_filter3_c8 -= remain_stride_w;
      } else if (input_h <= 3 || o_h < valid_h_start || o_h > valid_h_end) {
        in_ptr1 -= input_w - 3;
        in_ptr2 -= input_w - 3;
        in_ptr3 -= input_w - 3;

        pad_filter1 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter2 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter3 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter1_c2 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter2_c2 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter3_c2 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter1_c3 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter2_c3 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter3_c3 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter1_c4 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter2_c4 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter3_c4 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter1_c5 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter2_c5 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter3_c5 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter1_c6 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter2_c6 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter3_c6 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter1_c7 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter2_c7 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter3_c7 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter1_c8 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter2_c8 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter3_c8 -= 3 + 2 * padding_w + remain_stride_w;
      } else {
        pad_filter1 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter2 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter3 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter1_c2 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter2_c2 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter3_c2 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter1_c3 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter2_c3 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter3_c3 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter1_c4 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter2_c4 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter3_c4 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter1_c5 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter2_c5 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter3_c5 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter1_c6 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter2_c6 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter3_c6 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter1_c7 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter2_c7 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter3_c7 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter1_c8 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter2_c8 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter3_c8 += 3 + 2 * padding_w - remain_stride_w;

        in_ptr1 += input_w + 3;
        in_ptr2 += input_w + 3;
        in_ptr3 += input_w + 3;
      }
    }
    filter_data_ch += filter_ch_size;
    filter_data_ch_c2 += filter_ch_size;
    filter_data_ch_c3 += filter_ch_size;
    filter_data_ch_c4 += filter_ch_size;
    filter_data_ch_c5 += filter_ch_size;
    filter_data_ch_c6 += filter_ch_size;
    filter_data_ch_c7 += filter_ch_size;
    filter_data_ch_c8 += filter_ch_size;
    input_data_ch += in_ch_size;
  }
}
int out_ch_remain_start = output_ch - output_ch % 8;
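// out_ch_remain_start marks where the leftover (output_ch % 8) channels
// begin; the blocked loop above covered channels in groups of 8, and the
// loop below processes the remainder one output channel at a time.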
// remain output_channel
#pragma omp parallel for
for (int o_c = out_ch_remain_start; o_c < output_ch; ++o_c) {
  const float *f1, *f9;
  const float *in_ptr1, *in_ptr2, *in_ptr3;
  const float *pad_filter1, *pad_filter2, *pad_filter3;
  float pad_filter_arr[pad_filter_ch_size];
  float *output_data_ch;
  const float *input_data_ch;
  const float *filter_data_ch;

  filter_data_ch = filter_data + o_c * filter_ch_size * input_ch;
  input_data_ch = input_data;
  output_data_ch = output_data + o_c * out_ch_size;

  for (int i_c = 0; i_c < input_ch; ++i_c) {
    f1 = filter_data_ch;
    f9 = f1 + 8;

    if (!if_nopadding) {
      memset(pad_filter_arr, 0.f, sizeof(pad_filter_arr));
      for (int i = 0; i < 9; ++i) {
        int j = i / 3 * (2 * padding_w + 3) + i % 3 + padding_h * 3 +
                padding_w * (2 * padding_h + 1);
        pad_filter_arr[j] = filter_data_ch[i];
      }
      pad_filter1 = pad_filter_arr;
      pad_filter1 += pad_filter_start;
      pad_filter2 = pad_filter1 + pad_filter_w;
      pad_filter3 = pad_filter2 + pad_filter_w;
    } else {
      pad_filter1 = filter_data_ch;
      pad_filter2 = pad_filter1 + 3;
      pad_filter3 = pad_filter2 + 3;
    }
    float *out_ptr1;
    out_ptr1 = output_data_ch;
    in_ptr1 = input_data_ch;
    in_ptr2 = in_ptr1 + input_w;
    in_ptr3 = in_ptr2 + input_w;

    int o_h = 0;
    for (; o_h < output_h; ++o_h) {
      int o_w = 0;
      // pad left
      for (; o_w <= valid_w_start; ++o_w) {
        float sum1 = 0;
#if __ARM_NEON
        float32x4_t _in_ptr1 = vld1q_f32(in_ptr1);
        float32x4_t _pad_filter1 = vld1q_f32(pad_filter1);
        float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1);

        float32x4_t _in_ptr2 = vld1q_f32(in_ptr2);
        float32x4_t _pad_filter2 = vld1q_f32(pad_filter2);
        _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2);

        float32x4_t _in_ptr3 = vld1q_f32(in_ptr3);
        float32x4_t _pad_filter3 = vld1q_f32(pad_filter3);
        _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3);

        _sum1 = vsetq_lane_f32(sum1, _sum1, 3);
        float32x2_t _ss1 =
            vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
        float32x2_t _ssss1_ssss1 = vpadd_f32(_ss1, _ss1);
        sum1 += vget_lane_f32(_ssss1_ssss1, 0);
#else
        sum1 += in_ptr1[0] * pad_filter1[0];
        sum1 += in_ptr1[1] * pad_filter1[1];
        sum1 += in_ptr1[2] * pad_filter1[2];
        sum1 += in_ptr2[0] * pad_filter2[0];
        sum1 += in_ptr2[1] * pad_filter2[1];
        sum1 += in_ptr2[2] * pad_filter2[2];
        sum1 += in_ptr3[0] * pad_filter3[0];
        sum1 += in_ptr3[1] * pad_filter3[1];
        sum1 += in_ptr3[2] * pad_filter3[2];
#endif
        if (if_nopadding) {
          in_ptr1 += 2;
          in_ptr2 += 2;
          in_ptr3 += 2;
        } else if (input_w > 3 &&
                   (if_odd_pad_w && o_w == valid_w_start ||
                    o_w == valid_w_end && if_odd_pad_w && if_exact_in_w ||
                    o_w == valid_w_end + 1 && !if_odd_pad_w &&
                        !if_exact_in_w)) {
          pad_filter1--;
          pad_filter2--;
          pad_filter3--;
          in_ptr1++;
          in_ptr2++;
          in_ptr3++;
        } else if (input_w <= 3 || o_w < valid_w_start ||
                   o_w > valid_w_end) {
          pad_filter1 -= 2;
          pad_filter2 -= 2;
          pad_filter3 -= 2;
        } else {
          in_ptr1 += 2;
          in_ptr2 += 2;
          in_ptr3 += 2;
        }
        *out_ptr1 += sum1;
        out_ptr1++;
      }
// valid
#if __ARM_NEON
#if __aarch64__
      if (o_h > valid_h_start && o_h < valid_h_end) {
        int loop = (valid_w_end - valid_w_start - 1) >> 2;
        o_w += loop * 4;

        if (loop > 0) {
          asm volatile(
"prfm pldl1keep, [%[f1], #256]
\n\t
"
"prfm pldl1keep, [%[f9], #256]
\n\t
"
"ld1 {v0.4s, v1.4s}, [%[f1]]
\n\t
"
"ld1 {v4.s}[0], [%[f9]]
\n\t
"
"0:
\n\t
"
// load out_ptr
"prfm pldl1keep, [%[out_ptr1], #128]
\n\t
"
"ld1 {v12.4s}, [%[out_ptr1]]
\n\t
"
// in_ptr1 multiply
"prfm pldl1keep, [%[in_ptr1], #256]
\n\t
"
"ld2 {v5.4s, v6.4s}, [%[in_ptr1]], #32
\n\t
"
"ld2 {v7.4s, v8.4s}, [%[in_ptr1]]
\n\t
"
"fmla v12.4s, v5.4s, v0.s[0]
\n\t
"
"fmla v14.4s, v5.4s, v2.s[0]
\n\t
"
"ext v8.16b, v5.16b, v7.16b, #4
\n\t
"
"fmul v13.4s, v6.4s, v0.s[1]
\n\t
"
"fmla v12.4s, v8.4s, v0.s[2]
\n\t
"
"ld2 {v5.4s, v6.4s}, [%[in_ptr2]], #32
\n\t
"
"ld2 {v7.4s, v8.4s}, [%[in_ptr2]]
\n\t
"
// in_ptr2 multiply
"fmla v13.4s, v5.4s, v0.s[3]
\n\t
"
"ext v8.16b, v5.16b, v7.16b, #4
\n\t
"
"fmla v12.4s, v6.4s, v1.s[0]
\n\t
"
"fmla v13.4s, v8.4s, v1.s[1]
\n\t
"
"ld2 {v5.4s, v6.4s}, [%[in_ptr3]], #32
\n\t
"
"ld2 {v7.4s, v8.4s}, [%[in_ptr3]]
\n\t
"
// in_ptr3 multiply
"fmla v12.4s, v5.4s, v1.s[2]
\n\t
"
"ext v8.16b, v5.16b, v7.16b, #4
\n\t
"
"fmla v13.4s, v6.4s, v1.s[3]
\n\t
"
"fmla v12.4s, v8.4s, v4.s[0]
\n\t
"
// store out_ptr
"fadd v12.4s, v12.4s, v13.4s
\n\t
"
"st1 {v12.4s}, [%[out_ptr1]], #16
\n\t
"
// cycle
"subs %[loop], %[loop], #1
\n\t
"
"bne 0b
\n\t
"
            : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1),
              [in_ptr1] "+r"(in_ptr1), [in_ptr2] "+r"(in_ptr2),
              [in_ptr3] "+r"(in_ptr3)
            : [f1] "r"(f1), [f9] "r"(f9)
            : "cc", "memory", "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v12",
              "v13");
        }
      }
#else
      if (o_h > valid_h_start && o_h < valid_h_end) {
        int loop = (valid_w_end - valid_w_start - 1) >> 2;
        o_w += loop * 4;

        if (loop > 0) {
          asm volatile(
"pld [%[f1], #256]
\n\t
"
"pld [%[f9], #256]
\n\t
"
"vld1.f32 {d0-d3}, [%[f1]]
\n\t
"
"vld1.f32 {d8[0]}, [%[f9]]
\n\t
"
"pld [%[in_ptr1], #256]
\n\t
"
"vld2.f32 {d10-d13}, [%[in_ptr1]]!
\n\t
"
"vld2.f32 {d14, d15}, [%[in_ptr1]]
\n\t
"
"0:
\n\t
"
// load out_ptr
"pld [%[out_ptr1], #128]
\n\t
"
"vld1.f32 {d24, d25}, [%[out_ptr1]]
\n\t
"
// in_ptr1 multiply
"pld [%[in_ptr2], #256]
\n\t
"
"vld2.f32 {d4-d7}, [%[in_ptr2]]!
\n\t
"
"vmla.f32 q12, q5, d0[0]
\n\t
"
"vld2.f32 {d20, d21}, [%[in_ptr2]]
\n\t
"
"vext.32 q8, q5, q7, #1
\n\t
"
"pld [%[in_ptr3], #256]
\n\t
"
"vmul.f32 q13, q6, d0[1]
\n\t
"
"vld2.f32 {d10-d13}, [%[in_ptr3]]!
\n\t
"
"vmul.f32 q14, q8, d1[0]
\n\t
"
"vld2.f32 {d14, d15}, [%[in_ptr3]]
\n\t
"
// in_ptr2 multiply
"vmul.f32 q15, q2, d1[1]
\n\t
"
"vext.32 q8, q2, q10, #1
\n\t
"
"vmla.f32 q12, q3, d2[0]
\n\t
"
"vmla.f32 q13, q8, d2[1]
\n\t
"
// in_ptr3 multiply
"vmla.f32 q14, q5, d3[0]
\n\t
"
"vext.32 q8, q5, q7, #1
\n\t
"
"pld [%[in_ptr1], #256]
\n\t
"
"vmla.f32 q15, q6, d3[1]
\n\t
"
"vld2.f32 {d10-d13}, [%[in_ptr1]]!
\n\t
"
"vmla.f32 q13, q8, d8[0]
\n\t
"
// store out_ptr
"vld2.f32 {d14, d15}, [%[in_ptr1]]
\n\t
"
"vadd.f32 q12, q12, q13
\n\t
"
"subs %[loop], #1
\n\t
"
"vadd.f32 q14, q14, q15
\n\t
"
"vadd.f32 q12, q12, q14
\n\t
"
"vst1.f32 {d24, d25}, [%[out_ptr1]]!
\n\t
"
// cycle
"bne 0b
\n\t
"
"subs %[in_ptr1], %[in_ptr1], #32
\n\t
"
            : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1),
              [in_ptr1] "+r"(in_ptr1), [in_ptr2] "+r"(in_ptr2),
              [in_ptr3] "+r"(in_ptr3)
            : [f1] "r"(f1), [f9] "r"(f9)
            : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
              "q8", "q10", "q12", "q13", "q14", "q15");
        }
      }
#endif //__aarch64__
#endif // __ARM_NEON
      out_ptr1 -= 4;
      out_ptr1 += 4;
      // remain output_width
      for (; o_w < output_w; ++o_w) {
        float sum1 = 0;
#if __ARM_NEON
        float32x4_t _in_ptr1 = vld1q_f32(in_ptr1);
        float32x4_t _pad_filter1 = vld1q_f32(pad_filter1);
        float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1);

        float32x4_t _in_ptr2 = vld1q_f32(in_ptr2);
        float32x4_t _pad_filter2 = vld1q_f32(pad_filter2);
        _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2);

        float32x4_t _in_ptr3 = vld1q_f32(in_ptr3);
        float32x4_t _pad_filter3 = vld1q_f32(pad_filter3);
        _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3);

        _sum1 = vsetq_lane_f32(sum1, _sum1, 3);
        float32x2_t _ss1 =
            vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
        float32x2_t _ssss1_ssss1 = vpadd_f32(_ss1, _ss1);
        sum1 += vget_lane_f32(_ssss1_ssss1, 0);
#else
        sum1 += in_ptr1[0] * pad_filter1[0];
        sum1 += in_ptr1[1] * pad_filter1[1];
        sum1 += in_ptr1[2] * pad_filter1[2];
        sum1 += in_ptr2[0] * pad_filter2[0];
        sum1 += in_ptr2[1] * pad_filter2[1];
        sum1 += in_ptr2[2] * pad_filter2[2];
        sum1 += in_ptr3[0] * pad_filter3[0];
        sum1 += in_ptr3[1] * pad_filter3[1];
        sum1 += in_ptr3[2] * pad_filter3[2];
#endif
        if (if_nopadding) {
          in_ptr1 += 2;
          in_ptr2 += 2;
          in_ptr3 += 2;
        } else if (input_w > 3 &&
                   (if_odd_pad_w && o_w == valid_w_start ||
                    o_w == valid_w_end && if_odd_pad_w && if_exact_in_w ||
                    o_w == valid_w_end + 1 && !if_odd_pad_w &&
                        !if_exact_in_w)) {
          pad_filter1--;
          pad_filter2--;
          pad_filter3--;
          in_ptr1++;
          in_ptr2++;
          in_ptr3++;
        } else if (input_w <= 3 || o_w < valid_w_start ||
                   o_w > valid_w_end) {
          pad_filter1 -= 2;
          pad_filter2 -= 2;
          pad_filter3 -= 2;
        } else {
          in_ptr1 += 2;
          in_ptr2 += 2;
          in_ptr3 += 2;
        }
        *out_ptr1 += sum1;
        out_ptr1++;
      }
      if (if_nopadding) {
        in_ptr1 += remain_stride_w + input_w;
        in_ptr2 += remain_stride_w + input_w;
        in_ptr3 += remain_stride_w + input_w;
      } else if (input_h > 3 &&
                 (if_odd_pad_h && o_h == valid_h_start ||
                  o_h == valid_h_end && if_odd_pad_h && if_exact_in_h ||
                  o_h == valid_h_end + 1 && !if_odd_pad_h &&
                      !if_exact_in_h)) {
        in_ptr1 += 3;
        in_ptr2 += 3;
        in_ptr3 += 3;
        pad_filter1 -= remain_stride_w;
        pad_filter2 -= remain_stride_w;
        pad_filter3 -= remain_stride_w;
      } else if (input_h <= 3 || o_h < valid_h_start || o_h > valid_h_end) {
        in_ptr1 -= input_w - 3;
        in_ptr2 -= input_w - 3;
        in_ptr3 -= input_w - 3;
        pad_filter1 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter2 -= 3 + 2 * padding_w + remain_stride_w;
        pad_filter3 -= 3 + 2 * padding_w + remain_stride_w;
      } else {
        pad_filter1 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter2 += 3 + 2 * padding_w - remain_stride_w;
        pad_filter3 += 3 + 2 * padding_w - remain_stride_w;
        in_ptr1 += input_w + 3;
        in_ptr2 += input_w + 3;
        in_ptr3 += input_w + 3;
      }
    }
    filter_data_ch += filter_ch_size;
    input_data_ch += in_ch_size;
  }
}
input_data += in_batch_size;
output_data += out_batch_size;
}
}
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
src/operators/math/slidingwindow_conv3x3.h
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <typename Itype, typename Otype>
void SlidingwindowConv3x3s1(const framework::Tensor *input,
                            const framework::Tensor *filter,
                            const std::vector<int> &paddings,
                            framework::Tensor *output);

template <typename Itype, typename Otype>
void SlidingwindowConv3x3s2(const framework::Tensor *input,
                            const framework::Tensor *filter,
                            const std::vector<int> &paddings,
                            framework::Tensor *output);
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
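A hedged usage sketch of the two declarations above (the NCHW shapes, the {pad_h, pad_w} padding layout, and the <float, float> instantiation are assumptions drawn from the kernel sources in this commit, not part of the header itself):

#include <vector>
#include "framework/tensor.h"
#include "operators/math/slidingwindow_conv3x3.h"

// input: NCHW float tensor, e.g. {1, in_ch, H, W};
// filter: {out_ch, in_ch, 3, 3}; output must be sized by the caller.
void RunSlidingwindowS1(const paddle_mobile::framework::Tensor *input,
                        const paddle_mobile::framework::Tensor *filter,
                        paddle_mobile::framework::Tensor *output) {
  std::vector<int> paddings = {1, 1};  // pad_h, pad_w (assumed order)
  paddle_mobile::operators::math::SlidingwindowConv3x3s1<float, float>(
      input, filter, paddings, output);
}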
src/operators/op_param.h
...
@@ -476,6 +476,8 @@ class ConvParam : public OpParam {
     EXEC_GEMM_INT8,
     EXEC_DEPTHWISE3x3_INT8,
     EXEC_DEPTHWISE5x5_INT8,
+    EXEC_SLIDINGWINDOW3x3S1_FLOAT,
+    EXEC_SLIDINGWINDOW3x3S2_FLOAT,
   };

   ExecMode &ExecMode() const { return exec_mode_; }
...
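For context, these two new modes are what a 3x3 float convolution can now be dispatched to by stride. A minimal sketch of that stride-based selection (the names and surrounding checks here are assumptions for illustration, not verbatim from conv_common.cpp):

// Sketch only: assumes groups == 1, unit dilation, and a 3x3 float filter
// have already been verified by the caller.
if (param->Strides()[0] == param->Strides()[1] && param->Strides()[0] == 1) {
  param->ExecMode() = ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S1_FLOAT;
} else if (param->Strides()[0] == param->Strides()[1] &&
           param->Strides()[0] == 2) {
  param->ExecMode() = ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S2_FLOAT;
}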
test/fpga/test_marker.cpp
...
@@ -12,17 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <iostream>
+#ifndef PADDLE_MOBILE_FPGA
+#define PADDLE_MOBILE_FPGA
+#endif
 #include "../test_helper.h"
 #include "../test_include.h"
 #ifdef PADDLE_MOBILE_FPGA_V1
 #include "fpga/V1/api.h"
 #endif
 #ifdef PADDLE_MOBILE_FPGA_V2
 #include "fpga/V2/api.h"
 #endif
+#include <string>
+#include <fstream>
+#include <iostream>
+#include "../../src/io/paddle_inference_api.h"
+
+using namespace paddle_mobile;        // NOLINT
+using namespace paddle_mobile::fpga;  // NOLINT
+
+static const char *g_image = "../models/marker/marker1/image.bin";
+static const char *g_model = "../models/marker/marker1/model";
+static const char *g_param = "../models/marker/marker1/params";

 void readStream(std::string filename, char *buf) {
   std::ifstream in;
...
@@ -36,132 +48,78 @@ void readStream(std::string filename, char *buf) {
   auto length = in.tellg();  // report location (this is the length)
   in.seekg(0, std::ios::beg);  // go back to the beginning
   in.read(buf, length);
+  DLOG << length;
   in.close();
 }

-void convert_to_chw(int16_t **data_in, int channel, int height, int width,
-                    int num, int16_t *data_tmp) {
-  int64_t amount_per_side = width * height;
-  for (int n = 0; n < num; n++) {
-    for (int h = 0; h < height; h++) {
-      for (int w = 0; w < width; w++) {
-        for (int c = 0; c < channel; c++) {
-          *(data_tmp + n * amount_per_side * channel + c * amount_per_side +
-            width * h + w) = *((*data_in)++);
-        }
-      }
-    }
-  }
-}
+PaddleMobileConfig GetConfig() {
+  PaddleMobileConfig config;
+  config.precision = PaddleMobileConfig::FP32;
+  config.device = PaddleMobileConfig::kFPGA;
+  config.prog_file = g_model;
+  config.param_file = g_param;
+  config.thread_num = 1;
+  config.batch_size = 1;
+  config.optimize = true;
+  config.lod_mode = true;
+  config.quantification = false;
+  return config;
+}

-void dump_stride_half(std::string filename, Tensor input_tensor,
-                      const int dumpnum, bool use_chw) {
-  // bool use_chw = true;
-  if (input_tensor.dims().size() != 4) return;
-  int c = (input_tensor.dims())[1];
-  int h = (input_tensor.dims())[2];
-  int w = (input_tensor.dims())[3];
-  int n = (input_tensor.dims())[0];
-  auto data_ptr = input_tensor.get_data();
-  auto *data_ptr_16 = reinterpret_cast<half *>(data_ptr);
-  auto data_tmp = data_ptr_16;
-  if (use_chw) {
-    data_tmp =
-        reinterpret_cast<half *>(malloc(n * c * h * w * sizeof(int16_t)));
-    convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp);
-  }
-  std::ofstream out(filename.c_str());
-  float result = 0;
-  int stride = input_tensor.numel() / dumpnum;
-  stride = stride > 0 ? stride : 1;
-  for (int i = 0; i < input_tensor.numel(); i += stride) {
-    result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]);
-    out << result << std::endl;
-  }
-  out.close();
-  if (data_tmp != data_ptr_16) {
-    free(data_tmp);
-  }
-}
+int main() {
+  open_device();
+  PaddleMobileConfig config = GetConfig();
+  auto predictor =
+      CreatePaddlePredictor<PaddleMobileConfig,
+                            PaddleEngineKind::kPaddleMobile>(config);
+  std::cout << "Finishing loading model" << std::endl;
+
+  float img_info[3] = {432, 1280, 1.0f};
+  int img_length = 432 * 1280 * 3;
+  auto img =
+      reinterpret_cast<float *>(fpga_malloc(img_length * sizeof(float)));
+  readStream(g_image, reinterpret_cast<char *>(img));
+  std::cout << "Finishing initializing data" << std::endl;
+
+  struct PaddleTensor t_img_info, t_img;
+  t_img.dtypeid = typeid(float);
+  t_img_info.layout = LAYOUT_HWC;
+  t_img_info.shape = std::vector<int>({1, 3});
+  t_img_info.name = "Image information";
+  t_img_info.data.Reset(img_info, 3 * sizeof(float));
+
+  t_img.dtypeid = typeid(float);
+  t_img.layout = LAYOUT_HWC;
+  t_img.shape = std::vector<int>({1, 432, 1280, 3});
+  t_img.name = "Image information";
+  t_img.data.Reset(img, img_length * sizeof(float));
+  predictor->FeedPaddleTensors({t_img_info, t_img});
+  std::cout << "Finishing feeding data " << std::endl;
+
+  predictor->Predict_From_To(0, -1);
+  std::cout << "Finishing predicting " << std::endl;
+
+  std::vector<PaddleTensor> v;        // No need to initialize v
+  predictor->FetchPaddleTensors(&v);  // Old data in v will be cleared
+  for (int i = 0; i < v.size(); ++i) {
+    auto p = reinterpret_cast<float *>(v[i].data.data());
+    int len = v[i].data.length();
+    float result = 0.0f;
+    std::string str = "fetch" + std::to_string(i);
+    fpga::savefile<float>(str, p, len, result);
+  }
+  std::cout << "Finish getting vector values" << std::endl;
+  ////////////////////////////////////////////////////

-void dump_stride_float(std::string filename, Tensor input_tensor,
-                       const int dumpnum) {
-  auto data_ptr = reinterpret_cast<float *>(input_tensor.get_data());
-  std::ofstream out(filename.c_str());
-  float result = 0;
-  int stride = input_tensor.numel() / dumpnum;
-  stride = stride > 0 ? stride : 1;
-  for (int i = 0; i < input_tensor.numel(); i += stride) {
-    result = data_ptr[i];
-    out << result << std::endl;
-  }
-  out.close();
-}
-
-void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum,
-                 bool use_chw) {
-  static int i = 0;
-  if (input_tensor.numel() == 0) {
-    return;
-  }
-  if (input_tensor.type() == typeid(float)) {
-    DLOG << "op: " << i++ << ", float data  " << input_tensor.numel();
-    dump_stride_float(filename, input_tensor, dumpnum);
-  } else {
-    DLOG << "op: " << i++ << ", half data  " << input_tensor.numel();
-    dump_stride_half(filename, input_tensor, dumpnum, use_chw);
-  }
-  DLOG << "dump input address: " << input_tensor.get_data();
-}
-
-static const char *g_marker_combine = "../models/marker/model";
-static const char *g_image_src_float = "../models/marker/model/input_0.bin";
-int main() {
-  paddle_mobile::fpga::open_device();
-  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
-  // if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model",
-  //                        std::string(g_rfcn_combine) + "/params", true,
-  //                        false, 1, true)) {
-  if (paddle_mobile.Load(std::string(g_marker_combine), true)) {
-    float img_info[3] = {720, 1280, 800.0f / 960.0f};
-    auto img = reinterpret_cast<float *>(
-        fpga::fpga_malloc(720 * 1280 * 3 * sizeof(float)));
-    readStream(g_image_src_float, reinterpret_cast<char *>(img));
-    std::vector<void *> v(3, nullptr);
-    paddle_mobile.FeedData({img});
-    paddle_mobile.Predict_To(-1);
-    for (int i = 47; i < 52; i++) {
-      auto tensor_ptr = paddle_mobile.FetchResult(i);
-      std::string saveName = "marker_" + std::to_string(i);
-      // if(i != 58)
-      paddle_mobile::fpga::fpga_invalidate(
-          (*tensor_ptr).get_data(), tensor_ptr->numel() * sizeof(float));
-      // tensor_ptr->numel() * sizeof(float));
-      dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), true);
-      // 20);//tensor_ptr->numel());
-      /* float result = 0;
-      std::string str = "softmax_input_data";
-      float* data =
-      static_cast<float*>(fpga::fpga_malloc(tensor_ptr->numel() *
-      sizeof(float))); str = "softmax_output_data"; auto output_ptr =
-      static_cast<half*>((*tensor_ptr).get_data()); for (int idx = 0; idx <
-      tensor_ptr->numel(); ++idx)
-      {
-        data[idx] = fpga::fp16_2_fp32(output_ptr[idx]);
-      }
-      fpga::savefile<float>(str,data, tensor_ptr->numel(), result ); */
-    }
-    DLOG << "Computation done";
-    fpga::fpga_free(img);
-  }
+  // paddle_mobile.GetResults(&v);
+  // PaddleTensor tensor;
+  // predictor->GetPaddleTensor("fetch2", &tensor);
+  // for (int i = 0; i < post_nms; i++) {
+  //   auto p = reinterpret_cast<float *>(tensor.data.data());
+  //   std::cout << p[+i] << std::endl;
+  // }
   return 0;
 }
test/fpga/test_marker_api.cpp
...
@@ -15,12 +15,15 @@ limitations under the License. */
 #ifndef PADDLE_MOBILE_FPGA
 #define PADDLE_MOBILE_FPGA
 #endif
+#include <sys/time.h>
+#include <time.h>
 #include <fstream>
+#include <iomanip>
 #include <iostream>
 #include "../../src/io/paddle_inference_api.h"
-using namespace paddle_mobile;
-using namespace paddle_mobile::fpga;
+using namespace paddle_mobile;        // NOLINT
+using namespace paddle_mobile::fpga;  // NOLINT
 static const char *g_image = "../models/marker/model/image.bin";
 static const char *g_model = "../models/marker/model/model";
...
@@ -136,44 +139,6 @@ PaddleMobileConfig GetConfig1() {
 int main() {
   open_device();
-  PaddleMobileConfig config1 = GetConfig1();
-  auto predictor1 =
-      CreatePaddlePredictor<PaddleMobileConfig,
-                            PaddleEngineKind::kPaddleMobile>(config1);
-  std::cout << "Finishing loading model" << std::endl;
-  for (int i = 0; i < 1; ++i) {
-    int img_length1 = 144 * 14 * 14;
-    auto img1 =
-        reinterpret_cast<float *>(fpga_malloc(img_length1 * sizeof(float)));
-    readStream(g_image1, reinterpret_cast<char *>(img1));
-    std::cout << "Finishing initializing data" << std::endl;
-    struct PaddleTensor t_img1;
-    t_img1.dtypeid = typeid(float);
-    t_img1.layout = LAYOUT_HWC;
-    t_img1.shape = std::vector<int>({1, 14, 14, 144});
-    t_img1.name = "Image information";
-    t_img1.data.Reset(img1, img_length1 * sizeof(float));
-    predictor1->FeedPaddleTensors({t_img1});
-    std::cout << "Finishing feeding data " << std::endl;
-    predictor1->Predict_From_To(0, -1);
-    std::cout << "Finishing predicting " << std::endl;
-    std::vector<paddle_mobile::PaddleTensor> v1;  // No need to initialize v
-    predictor1->FetchPaddleTensors(&v1);  // Old data in v will be cleared
-    std::cout << "Output number is " << v1.size() << std::endl;
-    for (int fetchNum = 0; fetchNum < v1.size(); fetchNum++) {
-      std::string dumpName = "marker2_api_fetch_" + std::to_string(fetchNum);
-      dump_stride(dumpName, v1[fetchNum]);
-    }
-  }
-  /////////////////////////////////////
   PaddleMobileConfig config = GetConfig();
   auto predictor =
       CreatePaddlePredictor<PaddleMobileConfig,
...
@@ -207,7 +172,16 @@ int main() {
   std::cout << "Finishing feeding data " << std::endl;
+  timeval start11, end11;
+  long dif_sec, dif_usec;  // NOLINT
+  gettimeofday(&start11, NULL);
   predictor->Predict_From_To(0, -1);
+  gettimeofday(&end11, NULL);
+  dif_sec = end11.tv_sec - start11.tv_sec;
+  dif_usec = end11.tv_usec - start11.tv_usec;
+  std::cout << "marker1 total"
+            << " cost time: " << (dif_sec * 1000000 + dif_usec) << " us"
+            << std::endl;
   std::cout << "Finishing predicting " << std::endl;
   std::vector<paddle_mobile::PaddleTensor> v;  // No need to initialize v
...
@@ -217,5 +191,48 @@ int main() {
     std::string dumpName = "marker_api_fetch_" + std::to_string(fetchNum);
     dump_stride(dumpName, v[fetchNum]);
   }
+  PaddleMobileConfig config1 = GetConfig1();
+  auto predictor1 =
+      CreatePaddlePredictor<PaddleMobileConfig,
+                            PaddleEngineKind::kPaddleMobile>(config1);
+  std::cout << "Finishing loading model" << std::endl;
+  for (int i = 0; i < 1; ++i) {
+    int img_length1 = 144 * 14 * 14;
+    auto img1 =
+        reinterpret_cast<float *>(fpga_malloc(img_length1 * sizeof(float)));
+    readStream(g_image1, reinterpret_cast<char *>(img1));
+    std::cout << "Finishing initializing data" << std::endl;
+    struct PaddleTensor t_img1;
+    t_img1.dtypeid = typeid(float);
+    t_img1.layout = LAYOUT_HWC;
+    t_img1.shape = std::vector<int>({1, 14, 14, 144});
+    t_img1.name = "Image information";
+    t_img1.data.Reset(img1, img_length1 * sizeof(float));
+    predictor1->FeedPaddleTensors({t_img1});
+    std::cout << "Finishing feeding data " << std::endl;
+    gettimeofday(&start11, NULL);
+    predictor1->Predict_From_To(0, -1);
+    gettimeofday(&end11, NULL);
+    dif_sec = end11.tv_sec - start11.tv_sec;
+    dif_usec = end11.tv_usec - start11.tv_usec;
+    std::cout << "marker2 total"
+              << " cost time: " << (dif_sec * 1000000 + dif_usec) << " us"
+              << std::endl;
+    std::cout << "Finishing predicting " << std::endl;
+    std::vector<paddle_mobile::PaddleTensor> v1;  // No need to initialize v
+    predictor1->FetchPaddleTensors(&v1);  // Old data in v will be cleared
+    std::cout << "Output number is " << v1.size() << std::endl;
+    for (int fetchNum = 0; fetchNum < v1.size(); fetchNum++) {
+      std::string dumpName = "marker2_api_fetch_" + std::to_string(fetchNum);
+      dump_stride(dumpName, v1[fetchNum]);
+    }
+  }
   return 0;
 }
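The timing added above is plain gettimeofday() arithmetic; a small helper capturing the same microsecond computation (hypothetical, not part of the test) would be:

#include <sys/time.h>

// Microseconds elapsed between two gettimeofday() samples.
static long elapsed_us(const timeval &start, const timeval &end) {
  return (end.tv_sec - start.tv_sec) * 1000000L +
         (end.tv_usec - start.tv_usec);
}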