PaddlePaddle / Paddle-Lite

Commit afa836d9

Refactor pooling implementation

Author: hjchen2
Authored: Dec 09, 2018
Parent: 889c8ebc

Showing 11 changed files with 1,043 additions and 2,211 deletions (+1043 −2211)
Changed files:

  src/common/types.h                                        +5     −0
  src/operators/kernel/central-arm-func/pool_arm_func.h     +26    −76
  src/operators/math/pool_2x2.cpp                           +0     −304
  src/operators/math/pool_2x2.h                             +0     −37
  src/operators/math/pool_3x3.cpp                           +0     −904
  src/operators/math/pool_3x3.h                             +0     −50
  src/operators/math/pool_3x3_int8.cpp                      +0     −564
  src/operators/math/pooling.cpp                            +50    −69
  src/operators/math/pooling.h                              +121   −53
  src/operators/math/pooling3x3.cpp                         +819   −0
  test/operators/test_pool_op.cpp                           +22    −154
src/common/types.h  (+5 −0)  view file @ afa836d9

@@ -102,6 +102,11 @@ enum ActivationType {
   Sigmoid = 6,
 };

+enum PoolingType {
+  Max = 0,
+  Avg = 1,
+};
+
 extern const char *G_OP_TYPE_CONV;
 extern const char *G_OP_TYPE_BATCHNORM;
 extern const char *G_OP_TYPE_BOX_CODER;
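The added PoolingType enum is what the refactored math kernels dispatch on (see the Pooling<Max>/Pooling<Avg> and Pooling3x3<Max, S> call sites below), replacing string comparisons at the kernel level. A minimal sketch of mapping the operator's string attribute onto the enum; the helper name is hypothetical and not part of this commit:

    #include <string>

    // Mirrors the enum added to src/common/types.h by this commit.
    enum PoolingType { Max = 0, Avg = 1 };

    // Hypothetical helper, shown only for illustration: translate the op
    // attribute ("max"/"avg") into the typed tag used by the new kernels.
    inline PoolingType to_pooling_type(const std::string &pooling_type) {
      return pooling_type == "avg" ? Avg : Max;
    }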
src/operators/kernel/central-arm-func/pool_arm_func.h  (+26 −76)  view file @ afa836d9

@@ -17,103 +17,53 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "common/types.h"
 #include "operators/math/pooling.h"

 namespace paddle_mobile {
 namespace operators {

-using framework::Tensor;
-
-template <typename T, typename S>
-void PoolBasic(std::string pooling_type, std::vector<int> ksize,
-               std::vector<int> strides, std::vector<int> paddings,
-               const Tensor *in_x, Tensor *out) {
-  if (pooling_type == "max") {
-    math::PoolFunctor<CPU, math::MaxPool<T>, T> pool2d_forward;
-    math::MaxPool<T> pool_process;
-    pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
-  } else if (pooling_type == "avg") {
-    math::PoolFunctor<CPU, math::AvgPool<T, S>, T> pool2d_forward;
-    math::AvgPool<T, S> pool_process;
-    pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
-  }
-}
-
 template <typename P>
 void PoolCompute(const PoolParam<CPU> &param) {
-  const Tensor *in_x = param.Input();
-  Tensor *out = param.Output();
-  std::string pooling_type = param.PoolingType();
+  const framework::Tensor *input = param.Input();
+  framework::Tensor *output = param.Output();
+  const std::string &pooling_type = param.PoolingType();
   std::vector<int> ksize = param.Ksize();
   std::vector<int> strides = param.Strides();
   std::vector<int> paddings = param.Paddings();
   if (ksize.size() != 2) {
     LOG(paddle_mobile::LogLevel::kLOG_ERROR)
         << "Pool op only supports 2D and 3D input.";
   }
   if (param.isGlobalPooling()) {
     for (size_t i = 0; i < ksize.size(); ++i) {
       paddings[i] = 0;
-      ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+      ksize[i] = static_cast<int>(input->dims()[i + 2]);
     }
   }
-  if (in_x->type() == typeid(int8_t)) {
-    if (pooling_type == "max" && ksize[0] == 3 && ksize[0] == ksize[1]) {
-      if (strides[0] == strides[1] && strides[0] == 1) {
-        math::Pool3x3Maxs1_int8(in_x, out, paddings[0], paddings[1]);
-      } else if (strides[0] == strides[1] && strides[0] == 2) {
-        math::Pool3x3Maxs2_int8(in_x, out, paddings[0], paddings[1]);
+  if (ksize[0] == 3 && ksize[0] == ksize[1]) {
+    if (pooling_type == "max" && strides[0] == strides[1]) {
+      if (strides[0] == 1) {
+        math::Pooling3x3<Max, 1>()(*input, paddings, output);
+      } else if (strides[0] == 2) {
+        math::Pooling3x3<Max, 2>()(*input, paddings, output);
       } else {
-        math::Pool3x3Max_int8(strides, paddings, in_x, out);
+        math::Pooling<Max>()(*input, ksize, strides, paddings, output);
       }
-    } else {
-      PoolBasic<int8_t, int32_t>(pooling_type, ksize, strides, paddings, in_x,
-                                 out);
-    }
-  } else {
-    if (ksize[0] == 3 && ksize[0] == ksize[1]) {
-      if (pooling_type == "max") {
-        if (strides[0] == strides[1] && strides[0] == 1 &&
-            paddings[0] == paddings[1] && paddings[1] == 1) {
-          math::Pool3x3Maxs1p1(in_x, out);
-        } else {
-          math::Pool3x3Max(strides, paddings, in_x, out);
-        }
-      } else if (pooling_type == "avg") {
-        if (strides[0] == strides[1] && strides[0] == 1 &&
-            paddings[0] == paddings[1] && paddings[1] == 1) {
-          math::Pool3x3Avgs1p1(in_x, out);
-        } else {
-          math::Pool3x3Avg(strides, paddings, in_x, out);
-        }
+    } else if (pooling_type == "avg" && strides[0] == strides[1]) {
+      if (strides[0] == 1) {
+        math::Pooling3x3<Avg, 1>()(*input, paddings, output);
+      } else if (strides[0] == 2) {
+        math::Pooling3x3<Avg, 2>()(*input, paddings, output);
+      } else {
+        math::Pooling<Avg>()(*input, ksize, strides, paddings, output);
       }
-    } else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 &&
-               strides[0] == strides[1] && paddings[0] == paddings[1] &&
-               paddings[1] == 0) {
-#if __ARM_NEON
-#if __aarch64__
-      PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
-                              out);
-#else
-      /// todo: fix bug in Pool2x2
-      if (pooling_type == "max") {
-        math::Pool2x2Maxs2p0(strides, paddings, in_x, out);
-      } else if (pooling_type == "avg") {
-        math::Pool2x2Avgs2p0(strides, paddings, in_x, out);
-      }
-#endif
-#else
-      PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
-                              out);
-#endif  // __ARM_NEON
     } else {
-      PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
-                              out);
+      // Others
+    }
+  } else {
+    if (pooling_type == "max") {
+      math::Pooling<Max>()(*input, ksize, strides, paddings, output);
+    } else if (pooling_type == "avg") {
+      math::Pooling<Avg>()(*input, ksize, strides, paddings, output);
+    } else {
+      // Others
     }
   }
 }
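The diffs for src/operators/math/pooling.h and pooling3x3.cpp are not expanded in this capture, so the exact interface of the new functors is not shown here. Judging from the call sites above, it presumably looks roughly like the sketch below; the template-parameter names and default argument are assumptions, not quoted from the commit:

    // Interface sketch only, inferred from the dispatch code above.
    #include <vector>
    #include "common/types.h"      // defines PoolingType { Max, Avg }
    #include "framework/tensor.h"

    namespace paddle_mobile {
    namespace operators {
    namespace math {

    // Generic pooling over an arbitrary kernel size, stride and padding.
    template <PoolingType P = Max>
    struct Pooling {
      void operator()(const framework::Tensor &input,
                      const std::vector<int> &kernel_size,
                      const std::vector<int> &strides,
                      const std::vector<int> &paddings,
                      framework::Tensor *output);
    };

    // Specialized 3x3 pooling with the stride folded into the type.
    template <PoolingType P, int Stride>
    struct Pooling3x3 {
      void operator()(const framework::Tensor &input,
                      const std::vector<int> &paddings,
                      framework::Tensor *output);
    };

    }  // namespace math
    }  // namespace operators
    }  // namespace paddle_mobile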
src/operators/math/pool_2x2.cpp  deleted, 100644 → 0  (−304)  view file @ 889c8ebc

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef POOL_OP

#include "operators/math/pool_2x2.h"
#include <algorithm>
#include <vector>

namespace paddle_mobile {
namespace operators {
namespace math {

#define FLT_MAX __FLT_MAX__

void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
                    const Tensor *input, Tensor *output) {
  const int batch_size = input->dims()[0];
  const int input_height = input->dims()[2];
  const int input_width = input->dims()[3];
  const int output_channels = output->dims()[1];
  int output_height = output->dims()[2];
  const int output_width = output->dims()[3];
  const int ksize_height = 2;
  const int ksize_width = 2;
  const int stride_height = strides[0];
  const int stride_width = strides[1];
  const int padding_height = paddings[0];
  const int padding_width = paddings[1];

  const int input_channel_stride = input_height * input_width;
  const int output_channel_stride = output_height * output_width;
  const int input_batch_stride = output_channels * input_channel_stride;
  const int output_batch_stride = output_channels * output_channel_stride;

  const float *input_data = input->data<float>();
  float *output_data = output->mutable_data<float>();

  int w1 = input_width / 16;
  int _w1 = input_width % 16;
  int w2 = _w1 / 4;
  int _w2 = _w1 % 4;

  for (int i = 0; i < batch_size; ++i) {
    for (int c = 0; c < output_channels; ++c) {
      for (int ph = 0; ph < input_height; ph += 2) {
        const float *in_ptr1 = input_data + i * input_batch_stride +
                               c * input_channel_stride + ph * input_width;
        const float *in_ptr2 = in_ptr1 + input_width;
        if (ph != input_height && ph + 1 >= input_height) {
          in_ptr2 = static_cast<float *>(
              paddle_mobile::memory::Alloc(sizeof(float) * input_width));
          memset(static_cast<void *>(const_cast<float *>(in_ptr2)), -FLT_MAX,
                 sizeof(float) * input_width);
        }
        float *out_ptr = output_data + i * output_batch_stride +
                         c * output_channel_stride + ph / 2 * output_width;
#if __ARM_NEON
#if __aarch64__
#else
        asm volatile(
            "subs %[w1], %[w1], #1              \n\t"
            "blt end_w1_%=                      \n\t"
            "loop_w1_%=:                        \n\t"
            "pld [%[in_ptr1], #64]              \n\t"
            "pld [%[in_ptr2], #64]              \n\t"
            "vld1.f32 {q0, q1}, [%[in_ptr1]]!   \n\t"
            "vld1.f32 {q2, q3}, [%[in_ptr2]]!   \n\t"
            "vld1.f32 {q6, q7}, [%[in_ptr1]]!   \n\t"
            "vld1.f32 {q8, q9}, [%[in_ptr2]]!   \n\t"
            "vmax.f32 q0, q0, q2                \n\t"
            "vmax.f32 q1, q1, q3                \n\t"
            "vmax.f32 q6, q6, q8                \n\t"
            "vmax.f32 q7, q7, q9                \n\t"
            "vpmax.f32 d8, d0, d1               \n\t"
            "vpmax.f32 d9, d2, d3               \n\t"
            "vpmax.f32 d10, d12, d13            \n\t"
            "vpmax.f32 d11, d14, d15            \n\t"
            "vst1.32 {q4, q5}, [%[out_ptr]]!    \n\t"
            "subs %[w1], %[w1], #1              \n\t"
            "bge loop_w1_%=                     \n\t"
            "end_w1_%=:                         \n\t"
            "subs %[w2], %[w2], #1              \n\t"
            "blt end_w2_%=                      \n\t"
            "loop_w2_%=:                        \n\t"
            "vld1.f32 {q0}, [%[in_ptr1]]!       \n\t"
            "vld1.f32 {q1}, [%[in_ptr2]]!       \n\t"
            "vmax.f32 q0, q0, q1                \n\t"
            "vpmax.f32 d4, d0, d1               \n\t"
            "vst1.32 {d4}, [%[out_ptr]]!        \n\t"
            "subs %[w2], %[w2], #1              \n\t"
            "bge loop_w2_%=                     \n\t"
            "end_w2_%=:                         \n\t"
            :
            : [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1),
              [in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr)
            : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
              "q9");
#endif
#endif
        if (_w2 != 0) {
          in_ptr1 = input_data + i * input_batch_stride +
                    c * input_channel_stride + ph * input_width + 16 * w1 +
                    4 * w2;
          in_ptr2 = in_ptr1 + input_width;
          out_ptr = output_data + i * output_batch_stride +
                    c * output_channel_stride + ph / 2 * output_width +
                    8 * w1 + 2 * w2;
          if (_w2 == 1) {
            *out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
          } else if (_w2 == 2) {
            float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
            in_ptr1++;
            in_ptr2++;
            float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
            *out_ptr = (temp > temp1) ? temp : temp1;
          } else if (_w2 == 3) {
            float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
            in_ptr1++;
            in_ptr2++;
            float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
            in_ptr1++;
            in_ptr2++;
            *out_ptr = (temp > temp1) ? temp : temp1;
            out_ptr++;
            *out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
          }
        }
      }
    }
  }
}

void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
                    const Tensor *input, Tensor *output) {
  const int batch_size = input->dims()[0];
  const int input_height = input->dims()[2];
  const int input_width = input->dims()[3];
  const int output_channels = output->dims()[1];
  int output_height = output->dims()[2];
  const int output_width = output->dims()[3];
  const int ksize_height = 2;
  const int ksize_width = 2;
  const int stride_height = strides[0];
  const int stride_width = strides[1];
  const int padding_height = paddings[0];
  const int padding_width = paddings[1];

  const int input_channel_stride = input_height * input_width;
  const int output_channel_stride = output_height * output_width;
  const int input_batch_stride = output_channels * input_channel_stride;
  const int output_batch_stride = output_channels * output_channel_stride;

  const float *input_data = input->data<float>();
  float *output_data = output->mutable_data<float>();

  int w1 = input_width / 16;
  int _w1 = input_width % 16;
  int w2 = _w1 / 4;
  int _w2 = _w1 % 4;
  float quarter = 0.25;

  for (int i = 0; i < batch_size; ++i) {
    for (int c = 0; c < output_channels; ++c) {
      for (int ph = 0; ph < input_height; ph += 2) {
        const float *in_ptr1 = input_data + i * input_batch_stride +
                               c * input_channel_stride + ph * input_width;
        const float *in_ptr2 = in_ptr1 + input_width;
        if (ph + 1 >= input_height) {
          in_ptr2 = static_cast<float *>(
              paddle_mobile::memory::Alloc(sizeof(float) * input_width));
          memset(static_cast<void *>(const_cast<float *>(in_ptr2)), 0,
                 sizeof(float) * input_width);
        }
        float *out_ptr = output_data + i * output_batch_stride +
                         c * output_channel_stride + ph / 2 * output_width;
#if __ARM_NEON
#if __aarch64__
#else
        asm volatile(
            "subs %[w1], %[w1], #1              \n\t"
            "blt end_w1_%=                      \n\t"
            "loop_w1_%=:                        \n\t"
            "pld [%[in_ptr1], #64]              \n\t"
            "pld [%[in_ptr2], #64]              \n\t"
            "vmov.f32 d0[0], %[quarter]         \n\t"
            "vld1.f32 {q1, q2}, [%[in_ptr1]]!   \n\t"
            "vld1.f32 {q3, q4}, [%[in_ptr2]]!   \n\t"
            "vld1.f32 {q7, q8}, [%[in_ptr1]]!   \n\t"
            "vld1.f32 {q9, q10}, [%[in_ptr2]]!  \n\t"
            "vadd.f32 q1, q1, q3                \n\t"
            "vadd.f32 q2, q2, q4                \n\t"
            "vadd.f32 q7, q7, q9                \n\t"
            "vadd.f32 q8, q8, q10               \n\t"
            "vpadd.f32 d10, d2, d3              \n\t"
            "vpadd.f32 d11, d4, d5              \n\t"
            "vpadd.f32 d12, d14, d15            \n\t"
            "vpadd.f32 d13, d16, d17            \n\t"
            "vmul.f32 q5, q5, d0[0]             \n\t"
            "vmul.f32 q6, q6, d0[0]             \n\t"
            "vst1.32 {q5, q6}, [%[out_ptr]]!    \n\t"
            "subs %[w1], %[w1], #1              \n\t"
            "bge loop_w1_%=                     \n\t"
            "end_w1_%=:                         \n\t"
            "subs %[w2], %[w2], #1              \n\t"
            "blt end_w2_%=                      \n\t"
            "loop_w2_%=:                        \n\t"
            "vld1.f32 {q1}, [%[in_ptr1]]!       \n\t"
            "vld1.f32 {q2}, [%[in_ptr2]]!       \n\t"
            "vadd.f32 q1, q1, q2                \n\t"
            "vpadd.f32 d4, d2, d3               \n\t"
            "vmul.f32 d4, d4, d0[0]             \n\t"
            "vst1.32 {d4}, [%[out_ptr]]!        \n\t"
            "subs %[w2], %[w2], #1              \n\t"
            "bge loop_w2_%=                     \n\t"
            "end_w2_%=:                         \n\t"
            :
            : [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1),
              [in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr),
              [quarter] "r"(quarter)
            : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
              "q9", "q10");
#endif
#endif
        if (_w2 != 0) {
          in_ptr1 = input_data + i * input_batch_stride +
                    c * input_channel_stride + ph * input_width + 16 * w1 +
                    4 * w2;
          in_ptr2 = in_ptr1 + input_width;
          out_ptr = output_data + i * output_batch_stride +
                    c * output_channel_stride + ph / 2 * output_width +
                    8 * w1 + 2 * w2;
          if (_w2 == 1) {
            *out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
          } else if (_w2 == 2) {
            float temp = 0;
            temp += *in_ptr1;
            temp += *in_ptr2;
            in_ptr1++;
            in_ptr2++;
            temp += *in_ptr1;
            temp += *in_ptr2;
            *out_ptr = 0.25 * temp;
          } else if (_w2 == 3) {
            float temp = 0;
            temp += *in_ptr1++;
            temp += *in_ptr2++;
            temp += *in_ptr1++;
            temp += *in_ptr2++;
            *out_ptr = 0.25 * temp;
            out_ptr++;
            *out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
          }
        }
      }
    }
  }
}
//}

}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile

#endif
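The deleted file implements 2x2, stride-2 max and average pooling with NEON plus scalar tails. As a reading aid, here is a minimal scalar sketch of what Pool2x2Maxs2p0 computes for one batch item (plain C++, not part of the repository; the function name and the even height/width assumption are mine):

    #include <algorithm>

    // Scalar reference for 2x2 max pooling, stride 2, no padding.
    // Assumes in_h and in_w are even; layout is CHW.
    void max_pool_2x2_s2_reference(const float *in, float *out, int channels,
                                   int in_h, int in_w) {
      const int out_h = in_h / 2, out_w = in_w / 2;
      for (int c = 0; c < channels; ++c) {
        const float *src = in + c * in_h * in_w;
        float *dst = out + c * out_h * out_w;
        for (int oh = 0; oh < out_h; ++oh) {
          for (int ow = 0; ow < out_w; ++ow) {
            const float *p = src + (2 * oh) * in_w + 2 * ow;  // window corner
            dst[oh * out_w + ow] =
                std::max(std::max(p[0], p[1]), std::max(p[in_w], p[in_w + 1]));
          }
        }
      }
    }

The NEON version above produces the same result per 2x2 window; for average pooling the max is replaced by a sum scaled by 0.25.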
src/operators/math/pool_2x2.h  deleted, 100644 → 0  (−37)  view file @ 889c8ebc

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef POOL_OP

#pragma once

#include "framework/tensor.h"
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif  // __ARM_NEON

namespace paddle_mobile {
namespace operators {
namespace math {

using framework::Tensor;
using std::vector;

void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
                    const Tensor *input, Tensor *output);

void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
                    const Tensor *in_x, Tensor *out);

}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile

#endif
src/operators/math/pool_3x3.cpp  deleted, 100644 → 0  (−904)  view file @ 889c8ebc

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef POOL_OP
#ifdef _OPENMP
#include <omp.h>
#endif
#include "framework/tensor.h"
#include "operators/math/pool_3x3.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif  // __ARM_NEON
#include <climits>

namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::max;
using std::min;
using std::vector;

void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
#if __ARM_NEON
  const int batch_size = static_cast<int>(input->dims()[0]);
  const int input_channel = static_cast<int>(input->dims()[1]);
  const int input_height = static_cast<int>(input->dims()[2]);
  const int input_width = static_cast<int>(input->dims()[3]);
  const int output_height = static_cast<int>(output->dims()[2]);
  const int output_width = static_cast<int>(output->dims()[3]);
  output->mutable_data<float>();

  const int hxw = input_height * input_width;
  const int l = input_height;

  const float coef = 1.0 / 9.0;
  const float coef1 = 1.0 / 6.0;
  const float coef2 = 1.0 / 4.0;
  float32x4_t v_coef = vdupq_n_f32(coef);
  float32x4_t v_coef1 = vdupq_n_f32(coef1);

  for (int b = 0; b < batch_size; b++) {
#pragma omp parallel for
    for (int c = 0; c < input_channel; c++) {
      const float *input_data = input->data<float>() + c * hxw;
      float *output_data = output->data<float>() + c * hxw;
      // interior rows: sum of the 3x3 neighbourhood built with vext/vaddq,
      // scaled by 1/9
      for (int i = 1; i < output_height - 1; i++) {
        float *output_ptr;
        float32x4_t in0, in1, in2, in3, in4, in5, tmp0, tmp1, tmp2, tmp3,
            tmp4, tmp5, out0;
        for (int m = 1; m < output_width - 4; m += 4) {
          output_ptr = output_data + i * output_width + m;
          in0 = vld1q_f32(input_data + (i - 1) * input_width + m - 1);
          in1 = vld1q_f32(input_data + (i - 1) * input_width + m + 3);
          in2 = vld1q_f32(input_data + i * input_width + m - 1);
          in3 = vld1q_f32(input_data + i * input_width + m + 3);
          in4 = vld1q_f32(input_data + (i + 1) * input_width + m - 1);
          in5 = vld1q_f32(input_data + (i + 1) * input_width + m + 3);
          tmp0 = vextq_f32(in0, in1, 1);
          tmp1 = vextq_f32(in0, in1, 2);
          tmp2 = vextq_f32(in2, in3, 1);
          tmp3 = vextq_f32(in2, in3, 2);
          tmp4 = vextq_f32(in4, in5, 1);
          tmp5 = vextq_f32(in4, in5, 2);
          out0 = in0;
          out0 = vaddq_f32(out0, tmp0);
          out0 = vaddq_f32(out0, tmp1);
          out0 = vaddq_f32(out0, in2);
          out0 = vaddq_f32(out0, tmp2);
          out0 = vaddq_f32(out0, tmp3);
          out0 = vaddq_f32(out0, in4);
          out0 = vaddq_f32(out0, tmp4);
          out0 = vaddq_f32(out0, tmp5);
          vst1q_f32(output_ptr, vmulq_f32(out0, v_coef));
        }
        // columns not covered by the vector loop, handled scalar
        int m;
        for (m = 1; (m + 3) < output_width - 1; m = m + 4) {
        }
        for (int j = m; j < output_width - 1; j++) {
          output_data[i * output_width + j] =
              input_data[(i - 1) * input_width + j - 1] +
              input_data[(i - 1) * input_width + j] +
              input_data[(i - 1) * input_width + j + 1] +
              input_data[(i)*input_width + j - 1] +
              input_data[(i)*input_width + j] +
              input_data[(i)*input_width + j + 1] +
              input_data[(i + 1) * input_width + j - 1] +
              input_data[(i + 1) * input_width + j] +
              input_data[(i + 1) * input_width + j + 1];
          output_data[i * output_width + j] =
              output_data[i * output_width + j] * coef;
        }
      }
      // four corner points, scaled by 1/4
      output_data[0] =
          input_data[0] + input_data[1] + input_data[l] + input_data[l + 1];
      output_data[l - 1] = input_data[l - 2] + input_data[l - 1] +
                           input_data[2 * l - 2] + input_data[2 * l - 1];
      output_data[(l - 1) * l] =
          input_data[(l - 2) * l] + input_data[(l - 2) * l + 1] +
          input_data[(l - 1) * l] + input_data[(l - 1) * l + 1];
      output_data[l * l - 1] =
          input_data[(l - 2) * (l + 1)] + input_data[(l - 2) * (l + 1) + 1] +
          input_data[l * l - 2] + input_data[l * l - 1];
      output_data[0] = output_data[0] * coef2;
      output_data[l - 1] = output_data[l - 1] * coef2;
      output_data[(l - 1) * l] = output_data[(l - 1) * l] * coef2;
      output_data[l * l - 1] = output_data[l * l - 1] * coef2;
      // left and right borders, scaled by 1/6
      for (int i = 1; i < l - 1; ++i) {
        output_data[i * l] = input_data[i * l - l] + input_data[i * l - l + 1] +
                             input_data[i * l] + input_data[i * l + 1] +
                             input_data[i * l + l] + input_data[i * l + l + 1];
        output_data[i * l + l - 1] =
            input_data[i * l + l - 1 - l - 1] + input_data[i * l + l - 1 - l] +
            input_data[i * l + l - 1 - 1] + input_data[i * l + l - 1] +
            input_data[i * l + l - 1 + l - 1] + input_data[i * l + l - 1 + l];
        output_data[i * l] = output_data[i * l] * coef1;
        output_data[i * l + l - 1] = output_data[i * l + l - 1] * coef1;
      }
      // top row, vectorized with the same vext/vaddq pattern, scaled by 1/6
      int m;
      for (m = 1; m < output_width - 4; m += 4) {
        float *output_ptr = output_data + m;
        float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0;
        in0 = vld1q_f32(input_data + m - 1);
        in1 = vld1q_f32(input_data + m + 3);
        in2 = vld1q_f32(input_data + input_width + m - 1);
        in3 = vld1q_f32(input_data + input_width + m + 3);
        tmp0 = vextq_f32(in0, in1, 1);
        tmp1 = vextq_f32(in0, in1, 2);
        tmp2 = vextq_f32(in2, in3, 1);
        tmp3 = vextq_f32(in2, in3, 2);
        out0 = in0;
        out0 = vaddq_f32(out0, tmp0);
        out0 = vaddq_f32(out0, tmp1);
        out0 = vaddq_f32(out0, in2);
        out0 = vaddq_f32(out0, tmp2);
        out0 = vaddq_f32(out0, tmp3);
        vst1q_f32(output_ptr, vmulq_f32(out0, v_coef1));
      }
      for (m = 1; (m + 3) < output_width - 1; m += 4) {
      }
      for (int j = m; j < output_width - 1; j++) {
        output_data[j] = input_data[j - 1] + input_data[j] +
                         input_data[j + 1] + input_data[input_width + j - 1] +
                         input_data[input_width + j] +
                         input_data[input_width + j + 1];
        output_data[j] = output_data[j] * coef1;
      }
      // bottom row, same pattern on the last two input rows, scaled by 1/6
      for (m = 1; m < output_width - 4; m += 4) {
        float *output_ptr =
            output_data + (output_height - 1) * output_width + m;
        float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0;
        in0 =
            vld1q_f32(input_data + (output_height - 2) * input_width + m - 1);
        in1 =
            vld1q_f32(input_data + (output_height - 2) * input_width + m + 3);
        in2 =
            vld1q_f32(input_data + (output_height - 1) * input_width + m - 1);
        in3 =
            vld1q_f32(input_data + (output_height - 1) * input_width + m + 3);
        tmp0 = vextq_f32(in0, in1, 1);
        tmp1 = vextq_f32(in0, in1, 2);
        tmp2 = vextq_f32(in2, in3, 1);
        tmp3 = vextq_f32(in2, in3, 2);
        out0 = in0;
        out0 = vaddq_f32(out0, tmp0);
        out0 = vaddq_f32(out0, tmp1);
        out0 = vaddq_f32(out0, in2);
        out0 = vaddq_f32(out0, tmp2);
        out0 = vaddq_f32(out0, tmp3);
        vst1q_f32(output_ptr, vmulq_f32(out0, v_coef1));
      }
      for (m = 1; (m + 3) < output_width - 1; m = m + 4) {
      }
      for (int j = m; j < output_width - 1; j++) {
        output_data[(output_height - 1) * input_width + j] =
            input_data[(output_height - 2) * input_width + j - 1] +
            input_data[(output_height - 2) * input_width + j] +
            input_data[(output_height - 2) * input_width + j + 1] +
            input_data[(output_height - 1) * input_width + j - 1] +
            input_data[(output_height - 1) * input_width + j] +
            input_data[(output_height - 1) * input_width + j + 1];
        output_data[(output_height - 1) * output_width + j] =
            output_data[(output_height - 1) * output_width + j] * coef1;
      }
    }
  }
// const int batch_size = input->dims()[0];
//
// const int h_in = input->dims()[2];
//
// const int w_in = input->dims()[3];
//
// const int output_channels = output->dims()[1];
//
// const int h_out = output->dims()[2];
// const int w_out = output->dims()[3];
// const int outputdata_channel_stride = h_out * w_out;
// const int inputdata_channel_stride = h_in * w_in;
// const int input_batch_stride = output_channels * inputdata_channel_stride;
// const int output_batch_stride = output_channels *
// outputdata_channel_stride; float *out_data = output->data<float>(); const
// float *input_data = input->data<float>();
//
// const float coef = 1.0 / 9.0;
// for (int k = 0; k < batch_size; ++k) {
// #pragma omp parallel for
// for (int c = 0; c < output_channels; ++c) {
// const float *input_seg = input_data + c * inputdata_channel_stride;
// float *output_seg = out_data + c * outputdata_channel_stride;
// // four corner point
// output_seg[0] = (input_seg[0] + input_seg[1] + input_seg[w_in] +
// input_seg[w_in + 1]) *
// coef;
// output_seg[w_out - 1] =
// (input_seg[w_in - 2] + input_seg[w_in - 1] + input_seg[w_in * 2 -
// 2] +
// input_seg[2 * w_in - 1]) *
// coef;
// output_seg[(h_out - 1) * w_out] =
// (input_seg[(h_in - 2) * w_in] + input_seg[(h_in - 2) * w_in + 1] +
// input_seg[(h_in - 1) * w_in] + input_seg[(h_in - 1) * w_in + 1])
// *
// coef;
// output_seg[h_out * w_out - 1] =
// (input_seg[h_in * w_in - 1] + input_seg[h_in * w_in - 2] +
// input_seg[(h_in - 1) * w_in - 1] +
// input_seg[(h_in - 1) * w_in - 2]) *
// coef;
// // left side & right side
// for (int i = 1; i < h_in - 1; ++i) {
// output_seg[i * w_out] =
// (input_seg[i * w_in - w_in] + input_seg[i * w_in - w_in + 1] +
// input_seg[i * w_in] + input_seg[i * w_in + 1] +
// input_seg[i * w_in + w_in] + input_seg[i * w_in + w_in + 1]) *
// coef;
// output_seg[i * w_out + w_out - 1] =
// (input_seg[i * w_in - w_in + w_in - 2] +
// input_seg[i * w_in - w_in + 1 + w_in - 2] +
// input_seg[i * w_in + w_in - 2] +
// input_seg[i * w_in + 1 + w_in - 2] +
// input_seg[i * w_in + w_in + w_in - 2] +
// input_seg[i * w_in + w_in + 1 + w_in - 2]) *
// coef;
// }
// // top 1 row & bottom 1 row
// const float *input_tmp = input_seg;
//
// float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
// tmp3, tmp4, tmp5, sum, out0;
// float32x4_t v_coef = vdupq_n_f32(coef);
// in0 = vld1q_f32(input_tmp);
// in2 = vld1q_f32(input_tmp + w_in);
// const float *input_tmp_end = input_tmp + (h_in - 2) * w_in;
// in4 = vld1q_f32(input_tmp_end);
// in6 = vld1q_f32(input_tmp_end + w_in);
// int c_mid = w_out - 2;
// auto output_ptr = output_seg + 1;
// for (; c_mid > 3; c_mid -= 4) {
// in1 = vld1q_f32(input_tmp + 4);
// in3 = vld1q_f32(input_tmp + w_in + 4);
//
// tmp0 = vextq_f32(in0, in1, 1);
// tmp1 = vextq_f32(in0, in1, 2);
//
// tmp2 = vextq_f32(in2, in3, 1);
// tmp3 = vextq_f32(in2, in3, 2);
//
// sum = vaddq_f32(in0, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in2);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
//
// vst1q_f32(output_ptr, vmulq_f32(sum, v_coef));
//
// in5 = vld1q_f32(input_tmp_end + 4);
// in7 = vld1q_f32(input_tmp_end + w_in + 4);
//
// tmp0 = vextq_f32(in4, in5, 1);
// tmp1 = vextq_f32(in4, in5, 2);
// tmp2 = vextq_f32(in6, in7, 1);
// tmp3 = vextq_f32(in6, in7, 2);
//
// sum = vaddq_f32(in0, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in2);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
//
// vst1q_f32(output_ptr + (h_out - 1) * w_out, vmulq_f32(sum, v_coef));
//
// // can optimize to each 8 stride.
// input_tmp += 4;
// input_tmp_end += 4;
// output_ptr += 4;
// in0 = in1;
// in2 = in3;
// in4 = in5;
// in6 = in7;
// }
// // top right remain
// float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]);
// float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]);
//
// tmp0 = vextq_f32(in0, pad0, 1);
// tmp1 = vextq_f32(in0, pad0, 2);
// tmp2 = vextq_f32(in2, pad1, 2);
// tmp3 = vextq_f32(in2, pad1, 2);
//
// sum = vaddq_f32(in0, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in2);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
// out0 = vmulq_f32(sum, v_coef);
//
// for (int i = 0; i < c_mid; ++i) {
// if (i == 0) {
// vst1q_lane_f32(output_ptr + i, out0, 0);
// }
// if (i == 1) {
// vst1q_lane_f32(output_ptr + i, out0, 1);
// }
// if (i == 2) {
// vst1q_lane_f32(output_ptr + i, out0, 2);
// }
// }
//
// // bottom_right remain
// float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]);
// float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]);
//
// tmp0 = vextq_f32(in4, pad2, 1);
// tmp1 = vextq_f32(in4, pad2, 2);
// tmp2 = vextq_f32(in6, pad3, 2);
// tmp3 = vextq_f32(in6, pad3, 2);
//
// sum = vaddq_f32(in4, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in6);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
// out0 = vmulq_f32(sum, v_coef);
//
// for (int i = 0; i < c_mid; ++i) {
// if (i == 0) {
// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 0);
// }
// if (i == 1) {
// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 1);
// }
// if (i == 2) {
// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 2);
// }
// }
// // mid
// for (int j = 0; j < h_out - 2; ++j) {
// output_ptr = output_seg + w_out * (j + 1) + 1;
// input_tmp = input_seg + j * w_in;
//
// in0 = vld1q_f32(input_tmp);
// in2 = vld1q_f32(input_tmp + w_in);
// in4 = vld1q_f32(input_tmp + 2 * w_in);
// c_mid = w_out - 2;
// for (; c_mid > 3; c_mid -= 4) {
// in1 = vld1q_f32(input_tmp + 4);
// in3 = vld1q_f32(input_tmp + w_in + 4);
// in5 = vld1q_f32(input_tmp + 2 * w_in + 4);
//
// tmp0 = vextq_f32(in0, in1, 1);
// tmp1 = vextq_f32(in0, in1, 2);
// tmp2 = vextq_f32(in2, in3, 1);
// tmp3 = vextq_f32(in2, in3, 2);
// tmp4 = vextq_f32(in4, in5, 1);
// tmp5 = vextq_f32(in4, in5, 2);
//
// sum = vaddq_f32(in0, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in2);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
// sum = vaddq_f32(sum, in4);
// sum = vaddq_f32(sum, tmp4);
// sum = vaddq_f32(sum, tmp5);
//
// out0 = vmulq_f32(sum, v_coef);
// vst1q_f32(output_ptr, out0);
// output_ptr += 4;
// input_tmp += 4;
// in0 = in1;
// in2 = in3;
// in4 = in5;
// }
// // mid remain
// float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]);
// float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]);
// float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]);
//
// tmp0 = vextq_f32(in0, pad0, 1);
// tmp1 = vextq_f32(in0, pad0, 2);
// tmp2 = vextq_f32(in2, pad1, 1);
// tmp3 = vextq_f32(in2, pad1, 2);
// tmp4 = vextq_f32(in4, pad2, 1);
// tmp5 = vextq_f32(in4, pad2, 2);
//
// sum = vaddq_f32(in0, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in2);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
// sum = vaddq_f32(sum, in4);
// sum = vaddq_f32(sum, tmp4);
// sum = vaddq_f32(sum, tmp5);
// out0 = vmulq_f32(sum, v_coef);
//
// for (int i = 0; i < c_mid; ++i) {
// if (i == 0) {
// vst1q_lane_f32(output_ptr + i, out0, 0);
// }
// if (i == 1) {
// vst1q_lane_f32(output_ptr + i, out0, 1);
// }
// if (i == 2) {
// vst1q_lane_f32(output_ptr + i, out0, 2);
// }
// }
// }
// // input_data += inputdata_channel_stride;
// // out_data += outputdata_channel_stride;
// }
// input_data += input_batch_stride;
// out_data += output_batch_stride;
// }
#endif
}

void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
#if __ARM_NEON
  const int batch_size = input->dims()[0];
  const int h_in = input->dims()[2];
  const int w_in = input->dims()[3];
  const int output_channels = output->dims()[1];
  const int h_out = output->dims()[2];
  const int w_out = output->dims()[3];
  const int outputdata_channel_stride = h_out * w_out;
  const int inputdata_channel_stride = h_in * w_in;
  const int input_batch_stride = output_channels * inputdata_channel_stride;
  const int output_batch_stride = output_channels * outputdata_channel_stride;
  float *out_data = output->mutable_data<float>();
  const float *input_data = input->data<float>();
  for (int k = 0; k < batch_size; ++k) {
#pragma omp parallel for
    for (int c = 0; c < output_channels; ++c) {
      const float *input_seg = input_data + c * inputdata_channel_stride;
      float *output_seg = out_data + c * outputdata_channel_stride;
      // four corner point
      output_seg[0] = std::max(std::max(input_seg[0], input_seg[1]),
                               std::max(input_seg[w_in], input_seg[w_in + 1]));
      output_seg[w_out - 1] =
          std::max(std::max(input_seg[w_in - 2], input_seg[w_in - 1]),
                   std::max(input_seg[w_in * 2 - 2], input_seg[2 * w_in - 1]));
      output_seg[(h_out - 1) * w_out] =
          std::max(std::max(input_seg[(h_in - 2) * w_in],
                            input_seg[(h_in - 2) * w_in + 1]),
                   std::max(input_seg[(h_in - 1) * w_in],
                            input_seg[(h_in - 1) * w_in + 1]));
      output_seg[h_out * w_out - 1] =
          std::max(std::max(input_seg[(h_in - 1) * w_in - 1],
                            input_seg[(h_in - 1) * w_in - 2]),
                   std::max(input_seg[h_in * w_in - 1],
                            input_seg[h_in * w_in - 2]));
      // left side & right side
      for (int i = 1; i < h_in - 1; ++i) {
        float max1 = std::max(input_seg[i * w_in - w_in],
                              input_seg[i * w_in - w_in + 1]);
        float max2 = std::max(input_seg[i * w_in], input_seg[i * w_in + 1]);
        float max3 = std::max(input_seg[i * w_in + w_in],
                              input_seg[i * w_in + w_in + 1]);
        output_seg[i * w_out] = std::max(std::max(max1, max2), max3);

        max1 = std::max(input_seg[i * w_in - w_in + w_in - 2],
                        input_seg[i * w_in - w_in + 1 + w_in - 2]);
        max2 = std::max(input_seg[i * w_in + w_in - 2],
                        input_seg[i * w_in + 1 + w_in - 2]);
        max3 = std::max(input_seg[i * w_in + w_in + w_in - 2],
                        input_seg[i * w_in + w_in + 1 + w_in - 2]);
        output_seg[i * w_out + w_out - 1] =
            std::max(std::max(max1, max2), max3);
      }
      // top 1 row & bottom 1 row
      const float *input_tmp = input_seg;
      float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
          tmp3, tmp4, tmp5, max;
      in0 = vld1q_f32(input_tmp);
      in2 = vld1q_f32(input_tmp + w_in);
      const float *input_tmp_end = input_tmp + (h_in - 2) * w_in;
      in4 = vld1q_f32(input_tmp_end);
      in6 = vld1q_f32(input_tmp_end + w_in);
      int c_mid = w_out - 2;
      auto output_ptr = output_seg + 1;
      for (; c_mid > 3; c_mid -= 4) {
        in1 = vld1q_f32(input_tmp + 4);
        in3 = vld1q_f32(input_tmp + w_in + 4);
        tmp0 = vextq_f32(in0, in1, 1);
        tmp1 = vextq_f32(in0, in1, 2);
        tmp2 = vextq_f32(in2, in3, 1);
        tmp3 = vextq_f32(in2, in3, 2);
        max = vmaxq_f32(in0, tmp0);
        max = vmaxq_f32(max, tmp1);
        max = vmaxq_f32(max, in2);
        max = vmaxq_f32(max, tmp2);
        max = vmaxq_f32(max, tmp3);
        vst1q_f32(output_ptr, max);

        in5 = vld1q_f32(input_tmp_end + 4);
        in7 = vld1q_f32(input_tmp_end + w_in + 4);
        tmp0 = vextq_f32(in4, in5, 1);
        tmp1 = vextq_f32(in4, in5, 2);
        tmp2 = vextq_f32(in6, in7, 1);
        tmp3 = vextq_f32(in6, in7, 2);
        max = vmaxq_f32(in4, tmp0);
        max = vmaxq_f32(max, tmp1);
        max = vmaxq_f32(max, in6);
        max = vmaxq_f32(max, tmp2);
        max = vmaxq_f32(max, tmp3);
        vst1q_f32(output_ptr + (h_out - 1) * w_out, max);

        input_tmp += 4;
        input_tmp_end += 4;
        output_ptr += 4;
        in0 = in1;
        in2 = in3;
        in4 = in5;
        in6 = in7;
      }
      // top right remain
      float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]);
      float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]);
      tmp0 = vextq_f32(in0, pad0, 1);
      tmp1 = vextq_f32(in0, pad0, 2);
      tmp2 = vextq_f32(in2, pad1, 1);
      tmp3 = vextq_f32(in2, pad1, 2);
      max = vmaxq_f32(in0, tmp0);
      max = vmaxq_f32(max, tmp1);
      max = vmaxq_f32(max, in2);
      max = vmaxq_f32(max, tmp2);
      max = vmaxq_f32(max, tmp3);
      for (int i = 0; i < c_mid; ++i) {
        if (i == 0) {
          vst1q_lane_f32(output_ptr + i, max, 0);
        }
        if (i == 1) {
          vst1q_lane_f32(output_ptr + i, max, 1);
        }
        if (i == 2) {
          vst1q_lane_f32(output_ptr + i, max, 2);
        }
      }
      // bottom_right remain
      float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]);
      float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]);
      tmp0 = vextq_f32(in4, pad2, 1);
      tmp1 = vextq_f32(in4, pad2, 2);
      tmp2 = vextq_f32(in6, pad3, 1);
      tmp3 = vextq_f32(in6, pad3, 2);
      max = vmaxq_f32(in4, tmp0);
      max = vmaxq_f32(max, tmp1);
      max = vmaxq_f32(max, in6);
      max = vmaxq_f32(max, tmp2);
      max = vmaxq_f32(max, tmp3);
      for (int i = 0; i < c_mid; ++i) {
        if (i == 0) {
          vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 0);
        }
        if (i == 1) {
          vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 1);
        }
        if (i == 2) {
          vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 2);
        }
      }
      // mid
      for (int j = 0; j < h_out - 2; ++j) {
        output_ptr = output_seg + (j + 1) * w_out + 1;
        input_tmp = input_seg + j * w_in;
        in0 = vld1q_f32(input_tmp);
        in2 = vld1q_f32(input_tmp + w_in);
        in4 = vld1q_f32(input_tmp + 2 * w_in);
        c_mid = w_out - 2;
        for (; c_mid > 3; c_mid -= 4) {
          in1 = vld1q_f32(input_tmp + 4);
          in3 = vld1q_f32(input_tmp + w_in + 4);
          in5 = vld1q_f32(input_tmp + 2 * w_in + 4);
          tmp0 = vextq_f32(in0, in1, 1);
          tmp1 = vextq_f32(in0, in1, 2);
          tmp2 = vextq_f32(in2, in3, 1);
          tmp3 = vextq_f32(in2, in3, 2);
          tmp4 = vextq_f32(in4, in5, 1);
          tmp5 = vextq_f32(in4, in5, 2);
          max = vmaxq_f32(in0, tmp0);
          max = vmaxq_f32(max, tmp1);
          max = vmaxq_f32(max, in2);
          max = vmaxq_f32(max, tmp2);
          max = vmaxq_f32(max, tmp3);
          max = vmaxq_f32(max, in4);
          max = vmaxq_f32(max, tmp4);
          max = vmaxq_f32(max, tmp5);
          vst1q_f32(output_ptr, max);
          output_ptr += 4;
          input_tmp += 4;
          in0 = in1;
          in2 = in3;
          in4 = in5;
        }
        // mid remain
        float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]);
        float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]);
        float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 3) * w_in - 1]);
        tmp0 = vextq_f32(in0, pad0, 1);
        tmp1 = vextq_f32(in0, pad0, 2);
        tmp2 = vextq_f32(in2, pad1, 1);
        tmp3 = vextq_f32(in2, pad1, 2);
        tmp4 = vextq_f32(in4, pad2, 1);
        tmp5 = vextq_f32(in4, pad2, 2);
        max = vmaxq_f32(in0, tmp0);
        max = vmaxq_f32(max, tmp1);
        max = vmaxq_f32(max, in2);
        max = vmaxq_f32(max, tmp2);
        max = vmaxq_f32(max, tmp3);
        max = vmaxq_f32(max, in4);
        max = vmaxq_f32(max, tmp4);
        max = vmaxq_f32(max, tmp5);
        for (int i = 0; i < c_mid; ++i) {
          if (i == 0) {
            vst1q_lane_f32(output_ptr + i, max, 0);
          }
          if (i == 1) {
            vst1q_lane_f32(output_ptr + i, max, 1);
          }
          if (i == 2) {
            vst1q_lane_f32(output_ptr + i, max, 2);
          }
        }
      }
      // input_data += inputdata_channel_stride;
      // out_data += outputdata_channel_stride;
    }
    input_data += input_batch_stride;
    out_data += output_batch_stride;
  }
#else
#endif
}

void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
                Tensor *output) {
#if __ARM_NEON
  const int batch_size = input->dims()[0];
  const int input_height = input->dims()[2];
  const int input_width = input->dims()[3];
  const int output_channels = output->dims()[1];
  const int output_height = output->dims()[2];
  const int output_width = output->dims()[3];
  // const int _kernel_size = 3;
  const int stride = strides[0];
  // const int stride_width = strides[1];
  const int padding = paddings[0];
  // const int padding_width = paddings[1];
  const float negative_max = -INT_MAX;
  const int input_channel_stride = input_height * input_width;
  const int output_channel_stride = output_height * output_width;
  const float *input_data = input->data<float>();
  float *output_data = output->mutable_data<float>();
  const int input_batch_stride = output_channels * input_channel_stride;
  const int output_batch_stride = output_channels * output_channel_stride;
  const float *pos1, *output_ptr;
  int hstart, wstart, hend, wend;
  for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
    for (int c = 0; c < output_channels; ++c) {
      const float *input_seg = input_data + c * input_channel_stride;
      float *output_seg = output_data + c * output_channel_stride;
      for (int ph = 0; ph < output_height; ph++) {
        int hstart = ph * stride - padding;
        int hend = min(hstart + 3, input_height);
        hstart = max(hstart, 0);
        for (int pw = 0; pw < output_width; pw++) {
          int wstart = pw * stride - padding;
          int wend = min(wstart + 3, input_width);
          wstart = max(wstart, 0);
          const float *pos1 = input_seg + hstart * input_width + wstart;
          const float *pos2 = input_seg + (hstart + 1) * input_width + wstart;
          const float *pos3 = input_seg + (hstart + 2) * input_width + wstart;
          output_ptr = output_seg + ph * output_width + pw;
          if (hend - hstart != 3 || wend - wstart != 3) {
            float max_value = -INT_MAX;
            for (int h = hstart; h < hend; h++) {
              for (int w = wstart; w < wend; w++) {
                float value = input_seg[h * input_width + w];
                if (value > max_value) {
                  max_value = value;
                }
              }
            }
            output_seg[ph * output_width + pw] = max_value;
          } else {
#if __aarch64__
            const float32x4_t data1 = vld1q_f32(pos1);
            const float32x4_t data2 = vld1q_f32(pos1 + input_width);
            const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width);
            const float32x4_t max_data =
                vmaxq_f32(vmaxq_f32(data1, data2), data3);
            float32x2_t res =
                vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)),
                          vget_low_f32(max_data));
            res = vpmax_f32(res, res);
            output_seg[ph * output_width + pw] = vget_lane_f32(res, 0);
#else
            asm volatile(
                "vld1.32 {q1}, [%[pos1]]            \n\t"
                "vld1.32 {q2}, [%[pos2]]            \n\t"
                "vld1.32 {q3}, [%[pos3]]            \n\t"
                "vmax.f32 q1, q1, q2                \n\t"
                "vmax.f32 q2, q1, q3                \n\t"
                "vmov.f32 d5[1], %[negative_max]    \n\t"
                "vpmax.f32 d6, d4, d5               \n\t"
                "vpmax.f32 d7, d6, d6               \n\t"
                "vst1.32 {d7[0]},[%[output_ptr]]    \n\t"
                :
                : [input_seg] "r"(input_seg), [pos1] "r"(pos1),
                  [pos2] "r"(pos2), [pos3] "r"(pos3),
                  [output_ptr] "r"(output_ptr),
                  [negative_max] "r"(negative_max)
                : "memory", "q1", "q2", "q3", "q4");
#endif
          }
        }
      }
    }
    input_data += input_batch_stride;
    output_data += output_batch_stride;
  }
#endif
}

void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
                Tensor *output) {
#if __ARM_NEON
  const int batch_size = input->dims()[0];
  const int input_height = input->dims()[2];
  const int input_width = input->dims()[3];
  const int output_channels = output->dims()[1];
  const int output_height = output->dims()[2];
  const int output_width = output->dims()[3];
  const int stride = strides[0];
  const int padding = paddings[0];

  const int input_channel_stride = input_height * input_width;
  const int output_channel_stride = output_height * output_width;

  const float *input_data = input->data<float>();
  float *output_data = output->mutable_data<float>();

  const float zero = 0;
  const float nine = 1.0 / 9.0;
  const float nine_ptr[] = {nine, nine};

  const int input_batch_stride = output_channels * input_channel_stride;
  const int output_batch_stride = output_channels * output_channel_stride;
  for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
    for (int c = 0; c < output_channels; ++c) {
      const float *input_seg = input_data + c * input_channel_stride;
      float *output_seg = output_data + c * output_channel_stride;
      for (int ph = 0; ph < output_height; ph++) {
        for (int pw = 0; pw < output_width; pw++) {
          int hstart = ph * stride - padding;
          int wstart = pw * stride - padding;
          int hend = min(hstart + 3, input_height + padding);
          int wend = min(wstart + 3, input_width + padding);
          hstart = max(hstart, 0);
          wstart = max(wstart, 0);
          hend = min(hend, input_height);
          wend = min(wend, input_width);

          const float *pos1 = input_seg + hstart * input_width + wstart;
          const float *pos2 = input_seg + (hstart + 1) * input_width + wstart;
          const float *pos3 = input_seg + (hstart + 2) * input_width + wstart;
          float *output_ptr = output_seg + ph * output_width + pw;

          if (hend - hstart != 3 || wend - wstart != 3) {
            float sum = 0;
            for (int h = hstart; h < hend; h++) {
              for (int w = wstart; w < wend; w++) {
                sum += input_seg[h * input_width + w];
              }
            }
            output_seg[ph * output_width + pw] =
                sum / ((hend - hstart) * (wend - wstart) * 1.0);
          } else {
#if __aarch64__
#else
            asm volatile(
                "vld1.32 {q1}, [%[pos1]]            \n\t"
                "vld1.32 {q2}, [%[pos2]]            \n\t"
                "vld1.32 {q3}, [%[pos3]]            \n\t"
                "vadd.f32 q1, q1, q2                \n\t"
                "vadd.f32 q2, q1, q3                \n\t"
                "vmov.f32 d5[1], %[zero]            \n\t"
                "vpadd.f32 d6, d4, d5               \n\t"
                "vpadd.f32 d6, d6, d6               \n\t"
                "vld1.f32 d7, [%[nine_ptr]]!        \n\t"
                "vmul.f32 d6,d7                     \n\t"
                "vst1.32 {d6[0]},[%[output_ptr]]    \n\t"
                :
                : [input_seg] "r"(input_seg), [pos1] "r"(pos1),
                  [pos2] "r"(pos2), [pos3] "r"(pos3),
                  [output_ptr] "r"(output_ptr), [zero] "r"(zero),
                  [nine_ptr] "r"(nine_ptr)
                : "memory", "r6", "q1", "q2", "q3", "q4");
#endif
            const float32x4_t data1 = vld1q_f32(pos1);
            const float32x4_t data2 = vld1q_f32(pos2);
            const float32x4_t data3 = vld1q_f32(pos3);
            const float32x4_t sum_data =
                vaddq_f32(vaddq_f32(data1, data3), data2);
            float32x2_t res =
                vpadd_f32(vget_high_f32(vsetq_lane_f32(0, sum_data, 3)),
                          vget_low_f32(sum_data));
            res = vpadd_f32(res, res);
            output_seg[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0;
          }
        }
      }
    }
    input_data += input_batch_stride;
    output_data += output_batch_stride;
  }
#else
#endif
}
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
#endif
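Both Pool3x3Max and Pool3x3Avg above fall back to a scalar loop whenever the 3x3 window is clipped by the border, and the average divides by the clipped window size. A compact scalar sketch of that general case, for reference while reading the NEON paths (plain C++; the function name and parameter layout are mine, not from the repository):

    #include <algorithm>

    // Scalar reference for one channel of 3x3 pooling with stride and padding.
    void pool_3x3_reference(const float *in, float *out, int in_h, int in_w,
                            int out_h, int out_w, int stride, int padding,
                            bool is_max) {
      for (int ph = 0; ph < out_h; ++ph) {
        for (int pw = 0; pw < out_w; ++pw) {
          // Clip the 3x3 window against the input borders.
          int hstart = std::max(ph * stride - padding, 0);
          int wstart = std::max(pw * stride - padding, 0);
          int hend = std::min(ph * stride - padding + 3, in_h);
          int wend = std::min(pw * stride - padding + 3, in_w);
          float acc = is_max ? -3.402823466e+38f : 0.f;
          for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
              float v = in[h * in_w + w];
              acc = is_max ? std::max(acc, v) : acc + v;
            }
          }
          out[ph * out_w + pw] =
              is_max ? acc : acc / ((hend - hstart) * (wend - wstart));
        }
      }
    }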
src/operators/math/pool_3x3.h  deleted, 100644 → 0  (−50)  view file @ 889c8ebc

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef POOL_OP

#pragma once

#ifdef _OPENMP
#include <omp.h>
#endif
#include <algorithm>
#include <vector>
#include "framework/tensor.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif  // __ARM_NEON

namespace paddle_mobile {
namespace operators {
namespace math {

void Pool3x3Avgs1p1(const framework::Tensor *input,
                    framework::Tensor *output);
void Pool3x3Maxs1p1(const framework::Tensor *input,
                    framework::Tensor *output);
void Pool3x3Max(std::vector<int> strides, std::vector<int> paddings,
                const framework::Tensor *input, framework::Tensor *output);
void Pool3x3Avg(std::vector<int> strides, std::vector<int> paddings,
                const framework::Tensor *in_x, framework::Tensor *out);

void Pool3x3Maxs1_int8(const framework::Tensor *input,
                       framework::Tensor *output, int32_t pad_h,
                       int32_t pad_w);
void Pool3x3Maxs2_int8(const framework::Tensor *input,
                       framework::Tensor *output, int32_t pad_h,
                       int32_t pad_w);
void Pool3x3Max_int8(const std::vector<int> &strides,
                     const std::vector<int> &paddings,
                     const framework::Tensor *input,
                     framework::Tensor *output);

}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile

#endif
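For callers, the migration implied by this commit is from the hand-picked free functions declared here to the templated functors, roughly as follows (an illustrative mapping, assuming the functor interface sketched after the pool_arm_func.h diff; it is not quoted from the new headers):

    // Before this commit (deleted API):
    //   math::Pool3x3Maxs2_int8(in_x, out, paddings[0], paddings[1]);  // int8
    //   math::Pool3x3Max(strides, paddings, in_x, out);                // float
    //
    // After this commit (templated functor, stride folded into the type):
    //   math::Pooling3x3<Max, 2>()(*input, paddings, output);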
src/operators/math/pool_3x3_int8.cpp  deleted, 100644 → 0  (−564)  view file @ 889c8ebc

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef POOL_OP
#ifdef _OPENMP
#include <omp.h>
#endif
#include "framework/tensor.h"
#include "operators/math/pool_3x3.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif  // __ARM_NEON
#include <climits>
#include <iostream>

namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::max;
using std::min;
using std::vector;

template <typename T>
static void make_paddings(const Tensor *input, Tensor *padded_input,
                          int32_t top, int32_t bottom, int32_t left,
                          int32_t right, T value) {
  const int32_t batch_size = input->dims()[0];
  const int32_t c_in = input->dims()[1];
  const int32_t h_in = input->dims()[2];
  const int32_t w_in = input->dims()[3];
  const int32_t h_padded = h_in + top + bottom;
  const int32_t w_padded = w_in + left + right;
  padded_input->Resize({batch_size, c_in, h_padded, w_padded});
  T *padded_input_data = padded_input->mutable_data<T>();
  const T *input_data = input->data<T>();
  const int32_t input_channel_stride = h_in * w_in;
  const int32_t input_batch_stride = c_in * input_channel_stride;
  const int32_t padded_channel_stride = h_padded * w_padded;
  const int32_t padded_batch_stride = c_in * padded_channel_stride;
  for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
    for (int j = 0; j < c_in; ++j) {
      const T *img_in = input_data + j * input_channel_stride;
      T *img_padded = padded_input_data + j * padded_channel_stride;
      int k = 0;
      for (; k < top; ++k) {
        for (int l = 0; l < w_padded; ++l) {
          img_padded[l] = value;
        }
        img_padded += w_padded;
      }
      for (; k < top + h_in; ++k) {
        int l = 0;
        for (; l < left; ++l) {
          img_padded[l] = value;
        }
        memcpy(img_padded + left, img_in, w_in * sizeof(T));
        l += w_in;
        img_in += w_in;
        for (; l < w_padded; ++l) {
          img_padded[l] = value;
        }
        img_padded += w_padded;
      }
      for (; k < h_padded; ++k) {
        for (int l = 0; l < w_padded; ++l) {
          img_padded[l] = value;
        }
        img_padded += w_padded;
      }
    }
    input_data += input_batch_stride;
    padded_input_data += padded_batch_stride;
  }
// input_data = input->data<T>();
// std::cout << "+++++++++++++++++++Origin begin++++++++++++++++++++"
// << std::endl;
// for (int i = 0; i < 1; ++i) {
// for (int j = 0; j < 1; ++j) {
// const T *img_in = input_data + j * input_channel_stride;
// for (int k = 0; k < h_in; ++k) {
// for (int l = 0; l < w_in; ++l) {
// std::cout << (int32_t)*img_in << "\t";
// img_in++;
// }
// std::cout << std::endl;
// }
// }
// input_data += input_batch_stride;
// }
// std::cout << "+++++++++++++++++++Origin end++++++++++++++++++++" <<
// std::endl;
//
// padded_input_data = padded_input->data<T>();
// std::cout << "******************Padding begin**********************"
// << std::endl;
// for (int i = 0; i < 1; ++i) {
// for (int j = 0; j < 1; ++j) {
// T *img_padded = padded_input_data + j * padded_channel_stride;
// for (int k = 0; k < h_padded; ++k) {
// for (int l = 0; l < w_padded; ++l) {
// std::cout << (int32_t)*img_padded << "\t";
// img_padded++;
// }
// std::cout << std::endl;
// }
// }
// padded_input_data += padded_batch_stride;
// }
// std::cout << "******************Padding end**********************"
// << std::endl;
}

void Pool3x3Maxs1_int8(const Tensor *input, Tensor *output, int32_t pad_h,
                       int32_t pad_w) {
  Tensor padded_input;
  if (pad_h != 0 && pad_w != 0) {
    int8_t value = -SCHAR_MAX;
    make_paddings(input, &padded_input, pad_h, pad_h, pad_w, pad_w, value);
    input = &padded_input;
  }
  const int32_t batch_size = input->dims()[0];
  const int32_t h_in = input->dims()[2];
  const int32_t w_in = input->dims()[3];
  const int8_t *input_data = input->data<int8_t>();
  const int32_t output_channels = output->dims()[1];
  const int32_t h_out = output->dims()[2];
  const int32_t w_out = output->dims()[3];
  int8_t *output_data = output->mutable_data<int8_t>();
  const int32_t outputdata_channel_stride = h_out * w_out;
  const int32_t inputdata_channel_stride = h_in * w_in;
  const int32_t input_batch_stride =
      output_channels * inputdata_channel_stride;
  const int32_t output_batch_stride =
      output_channels * outputdata_channel_stride;
  // std::cout << "h_out = " << h_out << ", w_out=" << w_out << std::endl;
  for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
    for (int j = 0; j < output_channels; ++j) {
      const int8_t *img_in = input_data + j * inputdata_channel_stride;
      int8_t *img_out = output_data + j * outputdata_channel_stride;
      for (int k = 0; k < h_out; ++k) {
        const int8_t *row0 = img_in + k * w_in;
        const int8_t *row1 = img_in + (k + 1) * w_in;
        const int8_t *row2 = img_in + (k + 2) * w_in;
#if __ARM_NEON
        int32_t nw = w_out >> 4;
        int32_t left_w = w_out & 0xf;
        int32_t nw1 = left_w >> 3;
        int32_t left_w1 = left_w & 0x7;
#if __aarch64__
        // TODO
#else
        if (nw > 0) {
#define LOOP_LABEL "1"
          // result: q15
          asm volatile(
              "vld1.8 {q0}, [%[row0]]!        \n\t"  // q0=0-15
              "vld1.8 {q2}, [%[row1]]!        \n\t"
              "vld1.8 {q4}, [%[row2]]!        \n\t"

              LOOP_LABEL
              ":                              \n\t"
              "vld1.8 {q1}, [%[row0]]!        \n\t"  // q1=16-31
              "vext.8 q6, q0, q1, #1          \n\t"
              "vext.8 q7, q0, q1, #2          \n\t"
              "vld1.8 {q3}, [%[row1]]!        \n\t"
              "vmax.s8 q15, q0, q6            \n\t"
              "vmax.s8 q15, q15, q7           \n\t"
              "vext.8 q6, q2, q3, #1          \n\t"
              "vext.8 q7, q2, q3, #2          \n\t"
              "vld1.8 {q5}, [%[row2]]!        \n\t"
              "vmax.s8 q14, q2, q6            \n\t"
              "vmax.s8 q14, q14, q7           \n\t"
              "vext.8 q6, q4, q5, #1          \n\t"
              "vext.8 q7, q4, q5, #2          \n\t"
              "vmax.s8 q13, q4, q6            \n\t"
              "vmax.s8 q13, q13, q7           \n\t"
              "vmax.s8 q15, q15, q14          \n\t"
              "vmax.s8 q15, q15, q13          \n\t"
              "vmov.s8 q0, q1                 \n\t"
              "vmov.s8 q2, q3                 \n\t"
              "vmov.s8 q4, q5                 \n\t"
              "vst1.8 {q15}, [%[img_out]]!    \n\t"
              "subs %[nw], #1                 \n\t"
              "bne " LOOP_LABEL "b            \n\t"
              "sub %[row0], #16               \n\t"
              "sub %[row1], #16               \n\t"
              "sub %[row2], #16               \n\t"
              : [nw] "+r"(nw), [row0] "+r"(row0), [row1] "+r"(row1),
                [row2] "+r"(row2), [img_out] "+r"(img_out)
              :
              : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
                "q7", "q13", "q14", "q15");
#undef LOOP_LABEL
        }
        if (nw1 > 0 || left_w1 > 0) {
#define PADDLE_LABEL_LESS8 "1"
#define PADDLE_LABEL_LESS8_SAVE "2"
#define PADDLE_LABEL_OVER "3"
          // result: d15
          asm volatile(
              "vld1.8 {d0}, [%[row0]]!        \n\t"  // d0=0-8
              "vld1.8 {d2}, [%[row1]]!        \n\t"
              "vld1.8 {d4}, [%[row2]]!        \n\t"
              "mov r0, #1                     \n\t"
              "cmp %[nw1], #0                 \n\t"
              "beq " PADDLE_LABEL_LESS8 "f    \n\t"
              "vld1.8 {d1}, [%[row0]]!        \n\t"  // d1=9-15
              "vext.8 d6, d0, d1, #1          \n\t"
              "vext.8 d7, d0, d1, #2          \n\t"
              "vld1.8 {d3}, [%[row1]]!        \n\t"
              "vmax.s8 d15, d0, d6            \n\t"
              "vmax.s8 d15, d15, d7           \n\t"
              "vext.8 d6, d2, d3, #1          \n\t"
              "vext.8 d7, d2, d3, #2          \n\t"
              "vld1.8 {d5}, [%[row2]]!        \n\t"
              "vmax.s8 d14, d2, d6            \n\t"
              "vmax.s8 d14, d14, d7           \n\t"
              "vext.8 d6, d4, d5, #1          \n\t"
              "vext.8 d7, d4, d5, #2          \n\t"
              "vmax.s8 d13, d4, d6            \n\t"
              "vmax.s8 d13, d13, d7           \n\t"
              "vmax.s8 d15, d15, d14          \n\t"
              "vmax.s8 d15, d15, d13          \n\t"
              "vmov.s8 d0, d1                 \n\t"
              "vmov.s8 d2, d3                 \n\t"
              "vmov.s8 d4, d5                 \n\t"
              "vst1.8 {d15}, [%[img_out]]!    \n\t"

              PADDLE_LABEL_LESS8
              ":                              \n\t"
              "cmp %[left_w1], #0             \n\t"
              "beq " PADDLE_LABEL_OVER "f     \n\t"
              "vld1.8 {d1}, [%[row0]]         \n\t"  // d1=9-15
              "vext.8 d6, d0, d1, #1          \n\t"
              "vext.8 d7, d0, d1, #2          \n\t"
              "vld1.8 {d3}, [%[row1]]         \n\t"
              "vmax.s8 d15, d0, d6            \n\t"
              "vmax.s8 d15, d15, d7           \n\t"
              "vext.8 d6, d2, d3, #1          \n\t"
              "vext.8 d7, d2, d3, #2          \n\t"
              "vld1.8 {d5}, [%[row2]]         \n\t"
              "vmax.s8 d14, d2, d6            \n\t"
              "vmax.s8 d14, d14, d7           \n\t"
              "vext.8 d6, d4, d5, #1          \n\t"
              "vext.8 d7, d4, d5, #2          \n\t"
              "vmax.s8 d13, d4, d6            \n\t"
              "vmax.s8 d13, d13, d7           \n\t"
              "vmax.s8 d15, d15, d14          \n\t"
              "vmax.s8 d15, d15, d13          \n\t"

              PADDLE_LABEL_LESS8_SAVE
              ":                              \n\t"
              "vst1.8 {d15[0]}, [%[img_out]], r0 \n\t"
              "add %[row0], %[row0], #1       \n\t"
              "add %[row1], %[row1], #1       \n\t"
              "add %[row2], %[row2], #1       \n\t"
              "vext.8 d15, d15, d15, #1       \n\t"
              "subs %[left_w1], #1            \n\t"
              "bgt " PADDLE_LABEL_LESS8_SAVE "b \n\t"

              PADDLE_LABEL_OVER
              ":                              \n\t"
              : [nw1] "+r"(nw1), [left_w1] "+r"(left_w1), [row0] "+r"(row0),
                [row1] "+r"(row1), [row2] "+r"(row2), [img_out] "+r"(img_out)
              :
              : "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5",
                "d6", "d7", "d13", "d14", "d15");
#undef PADDLE_LABEL_OVER
#undef PADDLE_LABEL_LESS8_SAVE
#undef PADDLE_LABEL_LESS8
        }
#endif  // __aarch64__
#else
        int32_t left = w_out;
        while (left > 0) {
          const int8_t max0 = std::max(std::max(row0[0], row0[1]), row0[2]);
          const int8_t max1 = std::max(std::max(row1[0], row1[1]), row1[2]);
          const int8_t max2 = std::max(std::max(row2[0], row2[1]), row2[2]);
          *img_out = std::max(std::max(max0, max1), max2);
          row0 += 1;
          row1 += 1;
          row2 += 1;
          img_out++;
          left--;
        }
#endif  // __ARM_NEON
      }
    }
    input_data += input_batch_stride;
    output_data += output_batch_stride;
  }
}

void Pool3x3Maxs2_int8(const Tensor *input, Tensor *output, int32_t pad_h,
                       int32_t pad_w) {
  Tensor padded_input;
  if (pad_h != 0 && pad_w != 0) {
    int8_t value = -SCHAR_MAX;
    make_paddings(input, &padded_input, pad_h, pad_h, pad_w, pad_w, value);
    input = &padded_input;
  }
  const int32_t batch_size = input->dims()[0];
  const int32_t h_in = input->dims()[2];
  const int32_t w_in = input->dims()[3];
  const int32_t output_channels = output->dims()[1];
  const int32_t h_out = output->dims()[2];
  const int32_t w_out = output->dims()[3];
  const int32_t outputdata_channel_stride = h_out * w_out;
  const int32_t inputdata_channel_stride = h_in * w_in;
  const int32_t output_batch_stride =
      output_channels * outputdata_channel_stride;
  const int32_t input_batch_stride =
      output_channels * inputdata_channel_stride;
  const int8_t *input_data = input->data<int8_t>();
  int8_t *output_data = output->mutable_data<int8_t>();
  for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
    for (int j = 0; j < output_channels; ++j) {
      const int8_t *img_in = input_data + j * inputdata_channel_stride;
      int8_t *img_out = output_data + j * outputdata_channel_stride;
      for (int k = 0; k < h_out; ++k) {
        const int8_t *row0 = img_in + 2 * k * w_in;
        const int8_t *row1 = img_in + (2 * k + 1) * w_in;
        const int8_t *row2 = img_in + (2 * k + 2) * w_in;
#if __ARM_NEON
        int32_t nw = w_out >> 4;
        int32_t left_w = w_out & 0xf;
        int32_t nw1 = left_w >> 3;
        int32_t left_w1 = left_w & 0x7;
#if __aarch64__
        // TODO
#else
        if (nw > 0) {
#define LOOP_LABEL "1"
          // result: q15
          asm volatile(
              "vld2.8 {q0, q1}, [%[row0]]!    \n\t"  // q0=0-30, q1=1-31
              "vld2.8 {q2, q3}, [%[row1]]!    \n\t"
              "vld2.8 {q4, q5}, [%[row2]]!    \n\t"

              LOOP_LABEL
              ":                              \n\t"
              "vmax.s8 q15, q0, q1            \n\t"
              "vld2.8 {q6, q7}, [%[row0]]!    \n\t"  // q0=32-62, q1=33-63
              "vmax.s8 q14, q2, q3            \n\t"
              "vmax.s8 q13, q4, q5            \n\t"
              "vld2.8 {q8, q9}, [%[row1]]!    \n\t"
              "vext.8 q0, q0, q6, #1          \n\t"
              "vmax.s8 q15, q15, q0           \n\t"
              "vld2.8 {q10, q11}, [%[row2]]!  \n\t"
              "vext.8 q2, q2, q8, #1          \n\t"
              "vmax.s8 q14, q14, q2           \n\t"
              "vext.8 q4, q4, q10, #1         \n\t"
              "vmax.s8 q13, q13, q4           \n\t"
              "vmax.s8 q15, q15, q14          \n\t"
              "vmax.s8 q15, q15, q13          \n\t"
              "vmov.s8 q0, q6                 \n\t"
              "vmov.s8 q1, q7                 \n\t"
              "vmov.s8 q2, q8                 \n\t"
              "vmov.s8 q3, q9                 \n\t"
              "vmov.s8 q4, q10                \n\t"
              "vmov.s8 q5, q11                \n\t"
              "vst1.8 {q15}, [%[img_out]]!    \n\t"
              "subs %[nw], #1                 \n\t"
              "bne " LOOP_LABEL "b            \n\t"
              "sub %[row0], #32               \n\t"
              "sub %[row1], #32               \n\t"
              "sub %[row2], #32               \n\t"
              : [nw] "+r"(nw), [row0] "+r"(row0), [row1] "+r"(row1),
                [row2] "+r"(row2), [img_out] "+r"(img_out)
              :
              : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
                "q7", "q8", "q9", "q10", "q11", "q13", "q14", "q15");
#undef LOOP_LABEL
        }
        if (nw1 > 0 || left_w1 > 0) {
#define PADDLE_LABEL_LESS8 "1"
#define PADDLE_LABEL_LESS8_SAVE "2"
#define PADDLE_LABEL_OVER "3"
// result: d15
asm
volatile
(
"vld2.8 {d0, d1}, [%[row0]]!
\n\t
"
// d0=0-14, d1=1-15
"vld2.8 {d2, d3}, [%[row1]]!
\n\t
"
"vld2.8 {d4, d5}, [%[row2]]!
\n\t
"
"mov r0, #1
\n\t
"
"cmp %[nw1], #0
\n\t
"
"beq "
PADDLE_LABEL_LESS8
"f
\n\t
"
"vmax.s8 d15, d0, d1
\n\t
"
"vld2.8 {d6, d7}, [%[row0]]!
\n\t
"
// d0=32-62, d1=33-63
"vmax.s8 d14, d2, d3
\n\t
"
"vmax.s8 d13, d4, d5
\n\t
"
"vld2.8 {d8, d9}, [%[row1]]!
\n\t
"
"vext.8 d0, d0, d6, #1
\n\t
"
"vmax.s8 d15, d15, d0
\n\t
"
"vld2.8 {d10, d11}, [%[row2]]!
\n\t
"
"vext.8 d2, d2, d8, #1
\n\t
"
"vmax.s8 d14, d14, d2
\n\t
"
"vext.8 d4, d4, d10, #1
\n\t
"
"vmax.s8 d13, d13, d4
\n\t
"
"vmax.s8 d15, d15, d14
\n\t
"
"vmax.s8 d15, d15, d13
\n\t
"
"vmov.s8 d0, d6
\n\t
"
"vmov.s8 d1, d7
\n\t
"
"vmov.s8 d2, d8
\n\t
"
"vmov.s8 d3, d9
\n\t
"
"vmov.s8 d4, d10
\n\t
"
"vmov.s8 d5, d11
\n\t
"
"vst1.8 {d15}, [%[img_out]]!
\n\t
"
PADDLE_LABEL_LESS8
":
\n\t
"
"cmp %[left_w1], #0
\n\t
"
"beq "
PADDLE_LABEL_OVER
"f
\n\t
"
"vmax.s8 d15, d0, d1
\n\t
"
"vld2.8 {d6, d7}, [%[row0]]
\n\t
"
// d0=32-62, d1=33-63
"vmax.s8 d14, d2, d3
\n\t
"
"vmax.s8 d13, d4, d5
\n\t
"
"vld2.8 {d8, d9}, [%[row1]]
\n\t
"
"vext.8 d0, d0, d6, #1
\n\t
"
"vmax.s8 d15, d15, d0
\n\t
"
"vld2.8 {d10, d11}, [%[row2]]
\n\t
"
"vext.8 d2, d2, d8, #1
\n\t
"
"vmax.s8 d14, d14, d2
\n\t
"
"vext.8 d4, d4, d10, #1
\n\t
"
"vmax.s8 d13, d13, d4
\n\t
"
"vmax.s8 d15, d15, d14
\n\t
"
"vmax.s8 d15, d15, d13
\n\t
"
PADDLE_LABEL_LESS8_SAVE
":
\n\t
"
"vst1.8 {d15[0]}, [%[img_out]], r0
\n\t
"
"add %[row0], %[row0], #2
\n\t
"
"add %[row1], %[row1], #2
\n\t
"
"add %[row2], %[row2], #2
\n\t
"
"vext.8 d15, d15, d15, #1
\n\t
"
"subs %[left_w1], #1
\n\t
"
"bgt "
PADDLE_LABEL_LESS8_SAVE
"b
\n\t
"
PADDLE_LABEL_OVER
":
\n\t
"
:
[
nw1
]
"+r"
(
nw1
),
[
left_w1
]
"+r"
(
left_w1
),
[
row0
]
"+r"
(
row0
),
[
row1
]
"+r"
(
row1
),
[
row2
]
"+r"
(
row2
),
[
img_out
]
"+r"
(
img_out
)
:
:
"cc"
,
"memory"
,
"r0"
,
"d0"
,
"d1"
,
"d2"
,
"d3"
,
"d4"
,
"d5"
,
"d6"
,
"d7"
,
"d8"
,
"d9"
,
"d10"
,
"d11"
,
"d13"
,
"d14"
,
"d15"
);
#undef PADDLE_LABEL_OVER
#undef PADDLE_LABEL_LESS8_SAVE
#undef PADDLE_LABEL_LESS8
}
#endif // __aarch64__
#else
int32_t
left
=
w_out
;
while
(
left
>
0
)
{
const
int8_t
max0
=
std
::
max
(
std
::
max
(
row0
[
0
],
row0
[
1
]),
row0
[
2
]);
const
int8_t
max1
=
std
::
max
(
std
::
max
(
row1
[
0
],
row1
[
1
]),
row1
[
2
]);
const
int8_t
max2
=
std
::
max
(
std
::
max
(
row2
[
0
],
row2
[
1
]),
row2
[
2
]);
*
img_out
=
std
::
max
(
std
::
max
(
max0
,
max1
),
max2
);
row0
+=
2
;
row1
+=
2
;
row2
+=
2
;
img_out
++
;
left
--
;
}
#endif // __ARM_NEON
}
}
input_data
+=
input_batch_stride
;
output_data
+=
output_batch_stride
;
}
}
void
Pool3x3Max_int8
(
const
vector
<
int
>
&
strides
,
const
vector
<
int
>
&
paddings
,
const
Tensor
*
input
,
Tensor
*
output
)
{
const
int
batch_size
=
input
->
dims
()[
0
];
const
int
input_height
=
input
->
dims
()[
2
];
const
int
input_width
=
input
->
dims
()[
3
];
const
int
output_channels
=
output
->
dims
()[
1
];
const
int
output_height
=
output
->
dims
()[
2
];
const
int
output_width
=
output
->
dims
()[
3
];
// const int _kernel_size = 3;
const
int
stride
=
strides
[
0
];
// const int stride_width = strides[1];
const
int
padding
=
paddings
[
0
];
// const int padding_width = paddings[1];
const
int8_t
negative_max
=
-
SCHAR_MAX
;
const
int
input_channel_stride
=
input_height
*
input_width
;
const
int
output_channel_stride
=
output_height
*
output_width
;
const
int8_t
*
input_data
=
input
->
data
<
int8_t
>
();
int8_t
*
output_data
=
output
->
mutable_data
<
int8_t
>
();
const
int
input_batch_stride
=
output_channels
*
input_channel_stride
;
const
int
output_batch_stride
=
output_channels
*
output_channel_stride
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
#pragma omp parallel for
for
(
int
c
=
0
;
c
<
output_channels
;
++
c
)
{
const
int8_t
*
input_seg
=
input_data
+
c
*
input_channel_stride
;
int8_t
*
output_seg
=
output_data
+
c
*
output_channel_stride
;
for
(
int
ph
=
0
;
ph
<
output_height
;
ph
++
)
{
int
hstart
=
ph
*
stride
-
padding
;
int
hend
=
min
(
hstart
+
3
,
input_height
);
hstart
=
max
(
hstart
,
0
);
for
(
int
pw
=
0
;
pw
<
output_width
;
pw
++
)
{
int
wstart
=
pw
*
stride
-
padding
;
int
wend
=
min
(
wstart
+
3
,
input_width
);
wstart
=
max
(
wstart
,
0
);
const
int8_t
*
pos1
=
input_seg
+
hstart
*
input_width
+
wstart
;
const
int8_t
*
pos2
=
input_seg
+
(
hstart
+
1
)
*
input_width
+
wstart
;
const
int8_t
*
pos3
=
input_seg
+
(
hstart
+
2
)
*
input_width
+
wstart
;
int8_t
*
output_ptr
=
output_seg
+
ph
*
output_width
+
pw
;
if
(
hend
-
hstart
!=
3
||
wend
-
wstart
!=
3
)
{
int8_t
max_value
=
-
SCHAR_MAX
;
for
(
int
h
=
hstart
;
h
<
hend
;
h
++
)
{
for
(
int
w
=
wstart
;
w
<
wend
;
w
++
)
{
int8_t
value
=
input_seg
[
h
*
input_width
+
w
];
if
(
value
>
max_value
)
{
max_value
=
value
;
}
}
}
output_seg
[
ph
*
output_width
+
pw
]
=
max_value
;
}
else
{
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm
volatile
(
"vld1.8 {d0}, [%[pos1]]
\n\t
"
"vld1.8 {d1}, [%[pos2]]
\n\t
"
"vld1.8 {d2}, [%[pos3]]
\n\t
"
"vmax.s8 d3, d0, d1
\n\t
"
"vmax.s8 d4, d2, d3
\n\t
"
"vmov.s8 d4[3], %[negative_max]
\n\t
"
"vpmax.s8 d5, d4, d4
\n\t
"
"vpmax.s8 d6, d5, d5
\n\t
"
"vst1.8 {d6[0]},[%[output_ptr]]
\n\t
"
:
:
[
pos1
]
"r"
(
pos1
),
[
pos2
]
"r"
(
pos2
),
[
pos3
]
"r"
(
pos3
),
[
output_ptr
]
"r"
(
output_ptr
),
[
negative_max
]
"r"
(
negative_max
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
);
#endif
#else
const
int8_t
max0
=
std
::
max
(
std
::
max
(
pos1
[
0
],
pos1
[
1
]),
pos1
[
2
]);
const
int8_t
max1
=
std
::
max
(
std
::
max
(
pos2
[
0
],
pos2
[
1
]),
pos2
[
2
]);
const
int8_t
max2
=
std
::
max
(
std
::
max
(
pos3
[
0
],
pos3
[
1
]),
pos3
[
2
]);
*
output_ptr
=
std
::
max
(
std
::
max
(
max0
,
max1
),
max2
);
#endif // __ARM_NEON
}
}
}
}
input_data
+=
input_batch_stride
;
output_data
+=
output_batch_stride
;
}
}
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
#endif
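The deleted NEON paths above all compute the same thing: for every output pixel, the maximum of a 3x3 window of int8 values, advanced by the stride. A minimal scalar sketch of that contract, for reference only (the function name and the plain-pointer interface here are illustrative, not part of this patch):

// Reference 3x3 max pooling over one channel, stride s, no padding.
// Assumes h_out = (h_in - 3) / s + 1 and w_out = (w_in - 3) / s + 1.
static void pool3x3_max_ref(const int8_t *in, int8_t *out,
                            int h_in, int w_in, int s) {
  int h_out = (h_in - 3) / s + 1;
  int w_out = (w_in - 3) / s + 1;
  for (int oh = 0; oh < h_out; ++oh) {
    for (int ow = 0; ow < w_out; ++ow) {
      int8_t m = in[oh * s * w_in + ow * s];
      for (int kh = 0; kh < 3; ++kh) {
        for (int kw = 0; kw < 3; ++kw) {
          int8_t v = in[(oh * s + kh) * w_in + (ow * s + kw)];
          m = v > m ? v : m;
        }
      }
      out[oh * w_out + ow] = m;
    }
  }
}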
src/operators/math/pooling.cpp
浏览文件 @
afa836d9
...
...
@@ -15,54 +15,41 @@ limitations under the License. */
#ifdef POOL_OP
#include "operators/math/pooling.h"
#include <algorithm>
#include <vector>
#include "common/types.h"
#ifdef _OPENMP
#include <omp.h>
#endif
namespace paddle_mobile {
namespace operators {
namespace math {

/*
 * All tensors are in NCHW format.
 * Ksize, strides, paddings are two elements. These two elements represent
 * height and width, respectively.
 */
template <typename PoolProcess, typename T>
class PoolFunctor<CPU, PoolProcess, T> {
 public:
  void operator()(const framework::Tensor &input, const std::vector<int> &ksize,
template <PoolingType P>
void Pooling<P>::operator()(const framework::Tensor &input,
                            const std::vector<int> &kernel_size,
                            const std::vector<int> &strides,
                  const std::vector<int> &paddings, PoolProcess pool_process,
                            const std::vector<int> &paddings,
                            framework::Tensor *output) {
  const int batch_size = input.dims()[0];
  const int input_height = input.dims()[2];
  const int input_width = input.dims()[3];
  const int output_channels = output->dims()[1];
  const int output_height = output->dims()[2];
  const int output_width = output->dims()[3];
  const int ksize_height = ksize[0];
  const int ksize_width = ksize[1];
  const int ksize_height = kernel_size[0];
  const int ksize_width = kernel_size[1];
  const int stride_height = strides[0];
  const int stride_width = strides[1];
  const int padding_height = paddings[0];
  const int padding_width = paddings[1];
  const int input_stride = input_height * input_width;
  const int output_stride = output_height * output_width;
  const float *input_data = input.data<float>();
  float *output_data = output->mutable_data<float>();
  const size_t input_spatial_size = input_height * input_width;
  const size_t output_spatial_size = output_height * output_width;
  const T *input_data = input.data<T>();
  T *output_data = output->mutable_data<T>();
  #pragma omp parallel for collapse(2)
  for (int i = 0; i < batch_size; i++) {
    for (int c = 0; c < output_channels; ++c) {
#pragma omp parallel for
      int channel = i * output_channels + c;
      const float *input_ptr = input_data + channel * input_spatial_size;
      float *output_ptr = output_data + channel * output_spatial_size;
      for (int ph = 0; ph < output_height; ++ph) {
        int hstart = ph * stride_height - padding_height;
        int hend = std::min(hstart + ksize_height, input_height);
...
...
@@ -72,30 +59,24 @@ class PoolFunctor<CPU, PoolProcess, T> {
        int wend = std::min(wstart + ksize_width, input_width);
        wstart = std::max(wstart, 0);
        auto ele = pool_process.initial();
        PoolingVal<P> val;
        for (int h = hstart; h < hend; ++h) {
          for (int w = wstart; w < wend; ++w) {
            pool_process.compute(input_data[h * input_width + w], &ele);
            val += input_ptr[h * input_width + w];
          }
        }
        int pool_size = (hend - hstart) * (wend - wstart);
        pool_process.finalize(static_cast<float>(pool_size), &ele);
        output_data[ph * output_width + pw] = static_cast<T>(ele);
        output_data[ph * output_width + pw] = val.Value();
      }
    }
    input_data += input_stride;
    output_data += output_stride;
  }
  }
}
};
}

template struct Pooling<Max>;
template struct Pooling<Avg>;

template class PoolFunctor<CPU, math::AvgPool<float, float>, float>;
template class PoolFunctor<CPU, math::MaxPool<float>, float>;
template class PoolFunctor<CPU, math::AvgPool<int8_t, int32_t>, int8_t>;
template class PoolFunctor<CPU, math::MaxPool<int8_t>, int8_t>;
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
#endif
#endif  // POOL_OP
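The rewritten functor folds max and average pooling into a single loop: the PoolingVal&lt;P&gt; accumulator decides, per specialization, whether operator+= keeps a running maximum or a running sum, and Value() returns either the maximum or sum / count. A minimal standalone sketch of that accumulator pattern (the helper name PoolWindow is hypothetical, shown only for illustration):

template <PoolingType P>
float PoolWindow(const float *window, int n) {
  PoolingVal<P> val;
  for (int i = 0; i < n; ++i) {
    val += window[i];  // Max: val = max(val, x); Avg: val += x and ++count
  }
  return val.Value();  // Max: val; Avg: val / count
}
// PoolWindow<Max>(w, 9) and PoolWindow<Avg>(w, 9) share the same body.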
src/operators/math/pooling.h
浏览文件 @
afa836d9
...
...
@@ -16,75 +16,143 @@ limitations under the License. */
#pragma once
#include <climits>
#include <algorithm>
#include <cmath>
#include "common/log.h"
#include <limits>
#include <vector>
#include "common/types.h"
#include "framework/tensor.h"
#include "pool_2x2.h"
#include "pool_3x3.h"
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
namespace math {
#define FLT_MAX __FLT_MAX__
/*
* \brief Extracting simple operations from pooling.
* Both MaxPool and AvgPool need "initial", "compute" and "finalize"
* operation.
* MaxPool initializes temp variable to the negative maximum to find the
* maximum value in the pooling field.
* AvgPool initializes temp variable to the zero to accumulate all values
* in pool pooling, and finally takes the average.
* MaxPoolGrad and AvgPoolGrad are gradient operations respectively.
*/
template <typename T>
class MaxPool {
 public:
  inline T initial() {
    if (typeid(T) == typeid(int8_t)) {
      return static_cast<T>(-SCHAR_MAX);
template <PoolingType P = Max>
struct PoolingVal {
  float val;
  int count;
  PoolingVal() {
    val = std::numeric_limits<float>::min();
    count = 0;
  }
    return static_cast<T>(-FLT_MAX);
  inline PoolingVal<P> &operator+=(const float &x) {
    val = std::max(val, x);
    count += 1;
    return *this;
  }
  float Value() const {
    if (count > 0) {
      return val;
    }
    return 0.f;
  }
  inline void compute(const T &x, T *y) { *y = *y > x ? *y : x; }
  inline void finalize(const T &pool_field, T *y) {}
};

template <typename Itype, typename Otype>
class AvgPool {
 public:
  inline Otype initial() { return static_cast<Otype>(0); }
  inline void compute(const Itype &x, Otype *y) { *y += x; }
  inline void finalize(const float &pool_field, Otype *y) {
    if (typeid(Itype) == typeid(int8_t)) {
      float tmp = *y / pool_field;
      if (tmp > SCHAR_MAX) {
        *y = SCHAR_MAX;
      } else if (tmp < -SCHAR_MAX) {
        *y = -SCHAR_MAX;
      } else {
        *y = static_cast<Otype>(std::round(tmp));
template <>
struct PoolingVal<Avg> {
  float val;
  int count;
  PoolingVal() {
    val = 0.f;
    count = 0;
  }
      }
    } else {
      *y /= pool_field;
  inline PoolingVal<Avg> &operator+=(const float &x) {
    val += x;
    count += 1;
    return *this;
  }
  float Value() const {
    if (count > 0) {
      return val / count;
    }
    return 0.f;
  }
};

template <typename DeviceType, typename PoolProcess, typename T>
class PoolFunctor {
 public:
  void operator()(const framework::Tensor &input, const std::vector<int> &ksize,
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
template <PoolingType P = Max>
inline float32x4_t vPoolPreq_f32(const float32x4_t &x1, const float32x4_t &x2) {
  return vmaxq_f32(x1, x2);
}

template <>
inline float32x4_t vPoolPreq_f32<Avg>(const float32x4_t &x1,
                                      const float32x4_t &x2) {
  return vaddq_f32(x1, x2);
}

template <PoolingType P = Max>
inline float32x4_t vPoolPostq_f32(const float32x4_t &x) {
  return x;
}

template <>
inline float32x4_t vPoolPostq_f32<Avg>(const float32x4_t &x) {
  float32x4_t avg = vdupq_n_f32(1.f / 9);
  return vmulq_f32(avg, x);
}
#endif  // __ARM_NEON__

template <PoolingType P = Max>
inline float PoolPre(const float &x1, const float &x2) {
  return std::max(x1, x2);
}

template <>
inline float PoolPre<Avg>(const float &x1, const float &x2) {
  return x1 + x2;
}

template <PoolingType P = Max>
inline float PoolPost(const float &x) {
  return x;
}

template <>
inline float PoolPost<Avg>(const float &x) {
  return 1.f / 9 * x;
}

template <PoolingType P>
struct Pooling {
  inline void operator()(const framework::Tensor &input,
                         const std::vector<int> &kernel_size,
                         const std::vector<int> &strides,
                  const std::vector<int> &paddings, PoolProcess pool_compute,
                         const std::vector<int> &paddings,
                         framework::Tensor *output);
};

template <PoolingType P, int Stride>
struct Pooling2x2 {
  inline void operator()(const framework::Tensor &input,
                         const std::vector<int> &paddings,
                         framework::Tensor *output);
};

template <PoolingType P, int Stride>
struct Pooling3x3 {
  inline void operator()(const framework::Tensor &input,
                         const std::vector<int> &paddings,
                         framework::Tensor *output);
};

template <PoolingType P, int Stride>
struct Pooling5x5 {
  inline void operator()(const framework::Tensor &input,
                         const std::vector<int> &paddings,
                         framework::Tensor *output);
};

template <PoolingType P, int Stride>
struct Pooling7x7 {
  inline void operator()(const framework::Tensor &input,
                         const std::vector<int> &paddings,
                         framework::Tensor *output);
};

}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
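Note that PoolPost&lt;Avg&gt; and vPoolPostq_f32&lt;Avg&gt; divide by a fixed 9, so these helpers only make sense for the 3x3 kernels that feed them full windows; partially covered border windows go through PoolingVal&lt;Avg&gt;, which divides by the actual element count. A small illustrative pairing of the two helpers (not part of the patch):

// average of one full 3x3 window laid out as 9 consecutive floats v[0..8]
float acc = PoolPre<Avg>(PoolPre<Avg>(v[0], v[1]), v[2]);
acc = PoolPre<Avg>(acc, PoolPre<Avg>(PoolPre<Avg>(v[3], v[4]), v[5]));
acc = PoolPre<Avg>(acc, PoolPre<Avg>(PoolPre<Avg>(v[6], v[7]), v[8]));
float avg = PoolPost<Avg>(acc);  // == (v[0] + ... + v[8]) / 9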
...
...
src/operators/math/pooling3x3.cpp
0 → 100644
浏览文件 @
afa836d9
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#include "operators/math/pooling.h"
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif // __ARM_NEON
namespace paddle_mobile {
namespace operators {
namespace math {
#define POOLING3X3_NORMAL_BORDER(start, end) \
for (int w = start; w < end; ++w) { \
const int w_in_start = -padding_w + w * Stride; \
const int w_in_end = w_in_start + 3; \
const int w_start = w_in_start > 0 ? w_in_start : 0; \
const int w_end = w_in_end < input_w ? w_in_end : input_w; \
PoolingVal<P> val; \
for (int h_in = h_start; h_in < h_end; ++h_in) { \
for (int w_in = w_start; w_in < w_end; ++w_in) { \
val += input[h_in * input_w + w_in]; \
} \
} \
output_ptr[w] = val.Value(); \
}
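For a border column the macro clips the window before accumulating: with padding_w = 1 and Stride = 1, output column w = 0 gives w_in_start = -1, so w_start = 0 and w_end = 2 and only two input columns contribute. PoolingVal&lt;Avg&gt; then divides by the clipped count rather than by 9, while PoolingVal&lt;Max&gt; is unaffected.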
template <PoolingType P, int Stride>
inline void Pooling3x3ValidCol(const float *input, const int h_output,
                               const int h_output_end, const int w_output,
                               const int input_h, const int input_w,
                               const int padding_h, const int padding_w,
                               const int output_w, float *output) {
  const int w_in_start = -padding_w + w_output * Stride;
  const int w_in_end = w_in_start + 3;
  const int w_start = w_in_start > 0 ? w_in_start : 0;
  const int w_end = w_in_end < input_w ? w_in_end : input_w;
  for (int h = h_output; h < h_output_end; ++h) {
    PoolingVal<P> val;
    const int h_in_start = -padding_h + h * Stride;
    for (int i = 0; i < 3; ++i) {
      for (int w_in = w_start; w_in < w_end; ++w_in) {
        val += input[(h_in_start + i) * input_w + w_in];
      }
    }
    output[h * output_w + w_output] = val.Value();
  }
}
template <PoolingType P, int Stride>
inline void Pooling3x3NormalRow(const float *input, const int h_output,
                                const int input_h, const int input_w,
                                const int padding_h, const int padding_w,
                                const int output_w, float *output) {
  const int h_in_start = -padding_h + h_output * Stride;
  const int h_in_end = h_in_start + 3;
  const int h_start = h_in_start > 0 ? h_in_start : 0;
  const int h_end = h_in_end < input_h ? h_in_end : input_h;

  int valid_w_start = (padding_w + Stride - 1) / Stride;
  int valid_w_end = output_w - valid_w_start;

  float *output_ptr = output + h_output * output_w;
  // border left
  POOLING3X3_NORMAL_BORDER(0, valid_w_start)
  // middle
  for (int w = valid_w_start; w < valid_w_end; ++w) {
    PoolingVal<P> val;
    int input_start = -padding_w + w * Stride;
    for (int h_in = h_start; h_in < h_end; ++h_in) {
      for (int j = 0; j < 3; ++j) {
        val += input[h_in * input_w + j + input_start];
      }
    }
    output_ptr[w] = val.Value();
  }
  // border right
  POOLING3X3_NORMAL_BORDER(valid_w_end, output_w)
}
template <PoolingType P>
struct Pooling3x3<P, 1> {
  inline void operator()(const framework::Tensor &input,
                         const std::vector<int> &paddings,
                         framework::Tensor *output) {
    const float *input_data = input.data<float>();
    float *output_data = output->mutable_data<float>();
    int input_h = input.dims()[2];
    int input_w = input.dims()[3];
    int output_h = output->dims()[2];
    int output_w = output->dims()[3];
    int padding_h = paddings[0];
    int padding_w = paddings[1];
    int image_size = input_h * input_w;
    int out_image_size = output_h * output_w;
    int valid_h_start = padding_h;
    int valid_h_end = output_h - valid_h_start;
    int valid_h = valid_h_end - valid_h_start;
    int valid_w_start = padding_w;
    int valid_w_end = output_w - valid_w_start;
    int valid_w = valid_w_end - valid_w_start;
#pragma omp parallel for
    for (int c = 0; c < output->dims()[1]; ++c) {
      const float *input_ptr = input_data + c * image_size;
      float *output_ptr = output_data + c * out_image_size;
      // top
      for (int h = 0; h < valid_h_start; ++h) {
        Pooling3x3NormalRow<P, 1>(input_ptr, h, input_h, input_w, padding_h,
                                  padding_w, output_w, output_ptr);
      }
      // left
      for (int w = 0; w < valid_w_start; ++w) {
        Pooling3x3ValidCol<P, 1>(input_ptr, valid_h_start, valid_h_end, w,
                                 input_h, input_w, padding_h, padding_w,
                                 output_w, output_ptr);
      }
      // right
      for (int w = valid_w_end; w < output_w; ++w) {
        Pooling3x3ValidCol<P, 1>(input_ptr, valid_h_start, valid_h_end, w,
                                 input_h, input_w, padding_h, padding_w,
                                 output_w, output_ptr);
      }
      // bottom
      for (int h = valid_h_end; h < output_h; ++h) {
        Pooling3x3NormalRow<P, 1>(input_ptr, h, input_h, input_w, padding_h,
                                  padding_w, output_w, output_ptr);
      }
      // valid
      int output_w_tiles = valid_w / 6;
      int output_w_remain = valid_w - output_w_tiles * 6;
      for (int h = valid_h_start; h < valid_h_end - 3; h += 4) {
        const float *input_ptr0 = input_ptr + (h - padding_h) * input_w;
        const float *input_ptr1 = input_ptr0 + input_w;
        const float *input_ptr2 = input_ptr1 + input_w;
        const float *input_ptr3 = input_ptr2 + input_w;
        const float *input_ptr4 = input_ptr3 + input_w;
        const float *input_ptr5 = input_ptr4 + input_w;
        float *output_ptr0 = output_ptr + h * output_w + valid_w_start;
        float *output_ptr1 = output_ptr0 + output_w;
        float *output_ptr2 = output_ptr1 + output_w;
        float *output_ptr3 = output_ptr2 + output_w;
        int remain = output_w_remain;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
        float32x4x2_t x0, x1, x2;
        float32x4x2_t y0, y1, y2;
        for (int loop = 0; loop < output_w_tiles; ++loop) {
          // Each of the six input rows is processed with the same pattern:
          // load 8 floats with vld1q_f32, build the two shifted taps with
          // vextq_f32(.., 1) and vextq_f32(.., 2), reduce them horizontally
          // with vPoolPreq_f32<P> (row 0 shown below), then merge the rows
          // vertically into y0..y2 and write four output rows with
          // vPoolPostq_f32<P> + vst1q_f32/vst1_f32.
          x0.val[0] = vld1q_f32(input_ptr0);
          x0.val[1] = vld1q_f32(input_ptr0 + 4);
          x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
          x1.val[1] = vextq_f32(x0.val[0], x0.val[1], 1);
          x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
          x2.val[1] = vextq_f32(x0.val[0], x0.val[1], 2);
          x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
          x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x1.val[1]);
          y0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
          y0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
          // rows 1-5 repeat the block above, accumulating rows 0-2 into y0
          // (stored to output_ptr0), 1-3 into y1 (output_ptr1), 2-4 into y2
          // (output_ptr2) and 3-5 back into y0 (output_ptr3).
          input_ptr0 += 6;
          input_ptr1 += 6;
          input_ptr2 += 6;
          input_ptr3 += 6;
          input_ptr4 += 6;
          input_ptr5 += 6;
          output_ptr0 += 6;
          output_ptr1 += 6;
          output_ptr2 += 6;
          output_ptr3 += 6;
        }
        // remain w
        if (remain >= 4) {
          // Single 4-lane variant of the same sequence, producing one
          // vst1q_f32 store per output row.
          input_ptr0 += 4;
          input_ptr1 += 4;
          input_ptr2 += 4;
          input_ptr3 += 4;
          input_ptr4 += 4;
          input_ptr5 += 4;
          output_ptr0 += 4;
          output_ptr1 += 4;
          output_ptr2 += 4;
          output_ptr3 += 4;
          remain -= 4;
        }
#endif  // __ARM_NEON__
        for (int r = 0; r < remain; ++r) {
          float m0 = PoolPre<P>(input_ptr0[r], input_ptr0[r + 1]);
          m0 = PoolPre<P>(m0, input_ptr0[r + 2]);
          float m1 = PoolPre<P>(input_ptr1[r], input_ptr1[r + 1]);
          m1 = PoolPre<P>(m1, input_ptr1[r + 2]);
          float m2 = PoolPre<P>(input_ptr2[r], input_ptr2[r + 1]);
          m2 = PoolPre<P>(m2, input_ptr2[r + 2]);
          float m3 = PoolPre<P>(input_ptr3[r], input_ptr3[r + 1]);
          m3 = PoolPre<P>(m3, input_ptr3[r + 2]);
          float m4 = PoolPre<P>(input_ptr4[r], input_ptr4[r + 1]);
          m4 = PoolPre<P>(m4, input_ptr4[r + 2]);
          float m5 = PoolPre<P>(input_ptr5[r], input_ptr5[r + 1]);
          m5 = PoolPre<P>(m5, input_ptr5[r + 2]);
          m0 = PoolPre<P>(PoolPre<P>(m0, m1), m2);
          m1 = PoolPre<P>(PoolPre<P>(m1, m2), m3);
          m2 = PoolPre<P>(PoolPre<P>(m2, m3), m4);
          m3 = PoolPre<P>(PoolPre<P>(m3, m4), m5);
          output_ptr0[r] = PoolPost<P>(m0);
          output_ptr1[r] = PoolPost<P>(m1);
          output_ptr2[r] = PoolPost<P>(m2);
          output_ptr3[r] = PoolPost<P>(m3);
        }
      }
      // remain h
      int start_h = valid_h_start + (valid_h & 0xFFFC);
      for (int h = start_h; h < valid_h_end; ++h) {
        const float *input_ptr0 = input_ptr + (h - padding_h) * input_w;
        const float *input_ptr1 = input_ptr0 + input_w;
        const float *input_ptr2 = input_ptr1 + input_w;
        float *output_ptr0 = output_ptr + h * output_w + valid_w_start;
        int remain = output_w_remain;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
        float32x4x2_t x0, x1, x2, y0;
        for (int loop = 0; loop < output_w_tiles; ++loop) {
          // Single-output-row variant of the tile loop above: rows 0-2 are
          // reduced into y0 and stored to output_ptr0.
          input_ptr0 += 6;
          input_ptr1 += 6;
          input_ptr2 += 6;
          output_ptr0 += 6;
        }
        // remain w
        if (remain >= 4) {
          // 4-lane variant of the same single-row sequence.
          input_ptr0 += 4;
          input_ptr1 += 4;
          input_ptr2 += 4;
          output_ptr0 += 4;
          remain -= 4;
        }
#endif  // __ARM_NEON__
        for (int r = 0; r < remain; ++r) {
          float m0 = PoolPre<P>(input_ptr0[r], input_ptr0[r + 1]);
          m0 = PoolPre<P>(m0, input_ptr0[r + 2]);
          float m1 = PoolPre<P>(input_ptr1[r], input_ptr1[r + 1]);
          m1 = PoolPre<P>(m1, input_ptr1[r + 2]);
          float m2 = PoolPre<P>(input_ptr2[r], input_ptr2[r + 1]);
          m2 = PoolPre<P>(m2, input_ptr2[r + 2]);
          m0 = PoolPre<P>(PoolPre<P>(m0, m1), m2);
          output_ptr0[r] = PoolPost<P>(m0);
        }
      }
    }
  }
};
template <PoolingType P>
struct Pooling3x3<P, 2> {
  inline void operator()(const framework::Tensor &input,
                         const std::vector<int> &paddings,
                         framework::Tensor *output) {
    const float *input_data = input.data<float>();
    float *output_data = output->mutable_data<float>();
    int input_h = input.dims()[2];
    int input_w = input.dims()[3];
    int output_h = output->dims()[2];
    int output_w = output->dims()[3];
    int padding_h = paddings[0];
    int padding_w = paddings[1];
    int image_size = input_h * input_w;
    int out_image_size = output_h * output_w;
    int valid_h_start = (padding_h + 1) / 2;
    int valid_h_end = output_h - valid_h_start;
    int valid_h = valid_h_end - valid_h_start;
    int valid_w_start = (padding_w + 1) / 2;
    int valid_w_end = output_w - valid_w_start;
    int valid_w = valid_w_end - valid_w_start;
#pragma omp parallel for
    for (int c = 0; c < output->dims()[1]; ++c) {
      const float *input_ptr = input_data + c * image_size;
      float *output_ptr = output_data + c * out_image_size;
      // top
      for (int h = 0; h < valid_h_start; ++h) {
        Pooling3x3NormalRow<P, 2>(input_ptr, h, input_h, input_w, padding_h,
                                  padding_w, output_w, output_ptr);
      }
      // left
      for (int w = 0; w < valid_w_start; ++w) {
        Pooling3x3ValidCol<P, 2>(input_ptr, valid_h_start, valid_h_end, w,
                                 input_h, input_w, padding_h, padding_w,
                                 output_w, output_ptr);
      }
      // right
      for (int w = valid_w_end; w < output_w; ++w) {
        Pooling3x3ValidCol<P, 2>(input_ptr, valid_h_start, valid_h_end, w,
                                 input_h, input_w, padding_h, padding_w,
                                 output_w, output_ptr);
      }
      // bottom
      for (int h = valid_h_end; h < output_h; ++h) {
        Pooling3x3NormalRow<P, 2>(input_ptr, h, input_h, input_w, padding_h,
                                  padding_w, output_w, output_ptr);
      }
      // valid
      int input_w_start = 2 * valid_w_start - padding_w;
      int output_w_tiles = valid_w / 6;
      int output_w_remain = valid_w - output_w_tiles * 6;
      for (int h = valid_h_start; h < valid_h_end - 2; h += 3) {
        size_t offset = (2 * h - padding_h) * input_w + input_w_start;
        const float *input_ptr0 = input_ptr + offset;
        const float *input_ptr1 = input_ptr0 + input_w;
        const float *input_ptr2 = input_ptr1 + input_w;
        const float *input_ptr3 = input_ptr2 + input_w;
        const float *input_ptr4 = input_ptr3 + input_w;
        const float *input_ptr5 = input_ptr4 + input_w;
        const float *input_ptr6 = input_ptr5 + input_w;
        float *output_ptr0 = output_ptr + h * output_w + valid_w_start;
        float *output_ptr1 = output_ptr0 + output_w;
        float *output_ptr2 = output_ptr1 + output_w;
        int remain = output_w_remain;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
        float32x4x2_t x0, x1, x2;
        float32x4x2_t y0, y1, y2;
        for (int loop = 0; loop < output_w_tiles; ++loop) {
          // Stride-2 tile loop: each of the seven input rows is read with the
          // de-interleaving loads x0 = vld2q_f32(ptr) and x1 = vld2q_f32(ptr + 8),
          // the third tap is built with vextq_f32, and the three taps are
          // combined with vPoolPreq_f32<P> (row 0 shown below).  Rows 0-2,
          // 2-4 and 4-6 are then reduced into y0/y1/y0 and finished with
          // vPoolPostq_f32<P> before the vst1q_f32/vst1_f32 stores to
          // output_ptr0..output_ptr2.
          x0 = vld2q_f32(input_ptr0);
          x1 = vld2q_f32(input_ptr0 + 8);
          x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
          x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1);
          x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
          x0.val[1] = vPoolPreq_f32<P>(x1.val[0], x1.val[1]);
          y0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
          y0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
          // rows 1-6 repeat the block above.
          input_ptr0 += 12;
          input_ptr1 += 12;
          input_ptr2 += 12;
          input_ptr3 += 12;
          input_ptr4 += 12;
          input_ptr5 += 12;
          output_ptr0 += 6;
          output_ptr1 += 6;
          output_ptr2 += 6;
        }
        // remain w
        if (remain >= 4) {
          // 4-lane variant of the same sequence, using vdupq_n_f32 on the
          // ninth element of each row for the last column.
          input_ptr0 += 8;
          input_ptr1 += 8;
          input_ptr2 += 8;
          input_ptr3 += 8;
          input_ptr4 += 8;
          input_ptr5 += 8;
          output_ptr0 += 4;
          output_ptr1 += 4;
          output_ptr2 += 4;
          remain -= 4;
        }
#endif  // __ARM_NEON__
        for (int r = 0; r < remain; ++r) {
          float m0 = PoolPre<P>(input_ptr0[2 * r], input_ptr0[2 * r + 1]);
          m0 = PoolPre<P>(m0, input_ptr0[2 * r + 2]);
          float m1 = PoolPre<P>(input_ptr1[2 * r], input_ptr1[2 * r + 1]);
          m1 = PoolPre<P>(m1, input_ptr1[2 * r + 2]);
          float m2 = PoolPre<P>(input_ptr2[2 * r], input_ptr2[2 * r + 1]);
          m2 = PoolPre<P>(m2, input_ptr2[2 * r + 2]);
          float m3 = PoolPre<P>(input_ptr3[2 * r], input_ptr3[2 * r + 1]);
          m3 = PoolPre<P>(m3, input_ptr3[2 * r + 2]);
          float m4 = PoolPre<P>(input_ptr4[2 * r], input_ptr4[2 * r + 1]);
          m4 = PoolPre<P>(m4, input_ptr4[2 * r + 2]);
          float m5 = PoolPre<P>(input_ptr5[2 * r], input_ptr5[2 * r + 1]);
          m5 = PoolPre<P>(m5, input_ptr5[2 * r + 2]);
          float m6 = PoolPre<P>(input_ptr6[2 * r], input_ptr6[2 * r + 1]);
          m6 = PoolPre<P>(m6, input_ptr6[2 * r + 2]);
          m0 = PoolPre<P>(PoolPre<P>(m0, m1), m2);
          m1 = PoolPre<P>(PoolPre<P>(m2, m3), m4);
          m2 = PoolPre<P>(PoolPre<P>(m4, m5), m6);
          output_ptr0[r] = PoolPost<P>(m0);
          output_ptr1[r] = PoolPost<P>(m1);
          output_ptr2[r] = PoolPost<P>(m2);
        }
      }
      // remain h
      int start_h = valid_h_start + valid_h / 3 * 3;
      for (int h = start_h; h < valid_h_end; ++h) {
        size_t offset = (2 * h - padding_h) * input_w + input_w_start;
        const float *input_ptr0 = input_ptr + offset;
        const float *input_ptr1 = input_ptr0 + input_w;
        const float *input_ptr2 = input_ptr1 + input_w;
        float *output_ptr0 = output_ptr + h * output_w + valid_w_start;
        int remain = output_w_remain;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
        float32x4x2_t x0, x1, x2, y0;
        for (int loop = 0; loop < output_w_tiles; ++loop) {
          // Single-output-row variant of the stride-2 tile loop above.
          input_ptr0 += 12;
          input_ptr1 += 12;
          input_ptr2 += 12;
          output_ptr0 += 6;
        }
        // remain w
        if (remain >= 4) {
          // 4-lane variant of the same single-row sequence.
          input_ptr0 += 8;
          input_ptr1 += 8;
          input_ptr2 += 8;
          output_ptr0 += 4;
          remain -= 4;
        }
#endif  // __ARM_NEON__
        for (int r = 0; r < remain; ++r) {
          float m0 = PoolPre<P>(input_ptr0[2 * r], input_ptr0[2 * r + 1]);
          m0 = PoolPre<P>(m0, input_ptr0[2 * r + 2]);
          float m1 = PoolPre<P>(input_ptr1[2 * r], input_ptr1[2 * r + 1]);
          m1 = PoolPre<P>(m1, input_ptr1[2 * r + 2]);
          float m2 = PoolPre<P>(input_ptr2[2 * r], input_ptr2[2 * r + 1]);
          m2 = PoolPre<P>(m2, input_ptr2[2 * r + 2]);
          m0 = PoolPre<P>(PoolPre<P>(m0, m1), m2);
          output_ptr0[r] = PoolPost<P>(m0);
        }
      }
    }
  }
};
template struct Pooling3x3<Max, 1>;
template struct Pooling3x3<Avg, 1>;
template struct Pooling3x3<Max, 2>;
template struct Pooling3x3<Avg, 2>;

}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile

#endif  // POOL_OP
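With the explicit instantiations above, a pooling kernel can dispatch on kernel size and stride and fall back to the generic functor otherwise. A hedged sketch of such a dispatcher for the max case (the surrounding kernel code is not shown in this commit, so the exact variable names and control flow here are assumptions):

if (ksize[0] == 3 && ksize[0] == ksize[1]) {
  if (strides[0] == strides[1] && strides[0] == 1) {
    math::Pooling3x3<Max, 1>()(*input, paddings, output);
  } else if (strides[0] == strides[1] && strides[0] == 2) {
    math::Pooling3x3<Max, 2>()(*input, paddings, output);
  }
} else {
  math::Pooling<Max>()(*input, ksize, strides, paddings, output);
}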
test/operators/test_pool_op.cpp
浏览文件 @
afa836d9
...
...
@@ -14,10 +14,13 @@ limitations under the License. */
#include <iostream>
#include "../test_include.h"
#include "operators/
kernel/central-arm-func/pool_arm_func
.h"
#include "operators/
math/pooling
.h"
#include "operators/pool_op.h"
namespace paddle_mobile {
namespace math = operators::math;
static int PoolOutputSize(int input_size, int filter_size, int padding,
                          int stride, bool ceil_mode) {
  int output_size;
...
...
@@ -30,70 +33,6 @@ static int PoolOutputSize(int input_size, int filter_size, int padding,
  return output_size;
}
template <typename T>
static void PoolAvgPad0(std::vector<int> ksize, std::vector<int> strides,
                        const framework::Tensor *input,
                        framework::Tensor *out) {
  const int32_t batch_size = input->dims()[0];
  const int32_t input_c = input->dims()[1];
  const int32_t input_h = input->dims()[2];
  const int32_t input_w = input->dims()[3];
  const int32_t out_c = out->dims()[1];
  const int32_t out_h = out->dims()[2];
  const int32_t out_w = out->dims()[3];
  const int32_t kernel_h = ksize[0];
  const int32_t kernel_w = ksize[1];
  const int32_t stride_h = strides[0];
  const int32_t stride_w = strides[1];
  const int32_t inputdata_channel_stride = input_h * input_w;
  const int32_t input_batch_stride = input_c * inputdata_channel_stride;
  const int32_t outputdata_channel_stride = out_h * out_w;
  const int32_t output_batch_stride = out_c * outputdata_channel_stride;
  T *out_data = out->mutable_data<T>();
  const T *input_data = input->data<T>();
  const T **rows = new const T *[kernel_h];
  for (int i = 0; i < batch_size; ++i) {
    for (int j = 0; j < out_c; ++j) {
      const T *img_in = input_data + j * inputdata_channel_stride;
      T *img_out = out_data + j * outputdata_channel_stride;
      for (int k = 0; k < out_h; ++k) {
        for (int m = 0; m < kernel_h; ++m) {
          rows[m] = img_in + (stride_h * k + m) * input_w;
        }
        int32_t left = out_w;
        while (left > 0) {
          float tmp = 0;
          for (int m = 0; m < kernel_h; ++m) {
            for (int l = 0; l < kernel_w; ++l) {
              tmp += rows[m][l];
            }
          }
          if (typeid(T) == typeid(int8_t)) {
            tmp = tmp / (kernel_h * kernel_w);
            if (tmp < -127) {
              *img_out = -127;
            } else if (tmp > 127) {
              *img_out = 127;
            } else {
              *img_out = static_cast<T>(std::round(tmp));
            }
          } else {
            *img_out = static_cast<T>(tmp / (kernel_h * kernel_w));
          }
          for (int m = 0; m < kernel_h; ++m) {
            rows[m] += stride_w;
          }
          img_out++;
          left--;
        }
      }
    }
    input_data += input_batch_stride;
    out_data += output_batch_stride;
  }
  delete[] rows;
}
template <typename T, int CeilMode, int PoolType, int Kernel, int Pad,
          int Stride>
int TestPoolOp(int in_channels, int in_height, int in_width) {
...
...
@@ -149,41 +88,27 @@ int TestPoolOp(int in_channels, int in_height, int in_width) {
  framework::Tensor output_cmp;
  output_cmp.mutable_data<T>(output_shape);
  // Reference implementation before this commit (removed):
  if (pooling_type == "avg" && pad_h == 0 && pad_h == pad_w) {
    PoolAvgPad0<T>(std::vector<int>{kernel_h, kernel_w},
                   std::vector<int>{stride_h, stride_w}, input, &output_cmp);
  } else {
    if (typeid(T) == typeid(int8_t)) {
      operators::PoolBasic<int8_t, int32_t>(
          pooling_type, std::vector<int>{kernel_h, kernel_w},
          std::vector<int>{stride_h, stride_w}, std::vector<int>{pad_h, pad_w},
          input, &output_cmp);
    } else {
      operators::PoolBasic<float, float>(
          pooling_type, std::vector<int>{kernel_h, kernel_w},
          std::vector<int>{stride_h, stride_w}, std::vector<int>{pad_h, pad_w},
          input, &output_cmp);
    }
  }
  // Reference implementation after this commit (added):
  if (pooling_type == "avg") {
    math::Pooling<Avg>()(*input, std::vector<int>{kernel_h, kernel_w},
                         std::vector<int>{stride_h, stride_w},
                         std::vector<int>{pad_h, pad_w}, &output_cmp);
  } else {
    math::Pooling<Max>()(*input, std::vector<int>{kernel_h, kernel_w},
                         std::vector<int>{stride_h, stride_w},
                         std::vector<int>{pad_h, pad_w}, &output_cmp);
  }
// compare results
  int eq = 0;
  int neq = 0;
  auto output = output_var->template Get<framework::LoDTensor>();
  const T *output_data = output->data<T>();
  T *output_cmp_data = output_cmp.data<T>();
  for (int i = 0; i < output->numel(); ++i) {
    PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
                          "The execution of test_pool_op is failed!");
    if (output_data[i] == output_cmp_data[i]) {
      ++eq;
    } else {
      ++neq;
    }
    // Removed message arguments of the old ENFORCE:
    //   "output[%d] = %d, output_cmp[%d] = %d", i, output_data[i], i,
    //   output_cmp_data[i]);
  }
  std::cout << "eq = " << eq << ", neq = " << neq << std::endl;
  delete op;
  return 0;
}
}  // namespace paddle_mobile
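The tests above drive the refactored reference path through math::Pooling<Max> and math::Pooling<Avg>. A rough sketch of the functor interface, inferred only from those call sites (the authoritative declaration is in operators/math/pooling.h and may differ in qualifiers or defaults), is:

#include <vector>

enum PoolingType { Max = 0, Avg = 1 };  // from common/types.h
namespace framework { class Tensor; }   // from framework/tensor.h

// Inferred declaration sketch, not the verbatim header.
template <PoolingType P = Max>
struct Pooling {
  // Pools `input` (NCHW) into `output` with the given kernel size,
  // strides and paddings.
  void operator()(const framework::Tensor &input,
                  const std::vector<int> &kernel_size,
                  const std::vector<int> &strides,
                  const std::vector<int> &paddings,
                  framework::Tensor *output);
};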
...
...
@@ -202,7 +127,6 @@ int main(int argc, char *argv[]) {
  int in_channels = atoi(argv[1]);
  int in_height = atoi(argv[2]);
  int in_width = atoi(argv[3]);
#if __ARM_NEON
// kernel = 3, pad = 1, stride = 1
  LOG(paddle_mobile::kLOG_INFO)
      << "float, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1";
...
...
@@ -213,66 +137,15 @@ int main(int argc, char *argv[]) {
      << "float, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2";
  paddle_mobile::TestPoolOp<float, 0, 0, 3, 0, 2>(in_channels, in_height,
                                                  in_width);
#endif
// kernel = 3, pad = 0, stride = 1
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=1";
  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 0, 1>(in_channels, in_height,
                                                   in_width);
  // kernel = 3, pad = 1, stride = 1
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1";
  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 1, 1>(in_channels, in_height,
                                                   in_width);
  // kernel = 3, pad = 2, stride = 1
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=1";
  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 2, 1>(in_channels, in_height,
                                                   in_width);
  // kernel = 3, pad = 0, stride = 2
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2";
  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 0, 2>(in_channels, in_height,
                                                   in_width);
  // kernel = 3, pad = 1, stride = 2
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=2";
  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 1, 2>(in_channels, in_height,
                                                   in_width);
  // kernel = 3, pad = 2, stride = 2
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=2";
  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 2, 2>(in_channels, in_height,
                                                   in_width);
  // kernel = 3, pad = 3, stride = 3
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=3, stride=3";
  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 3, 3>(in_channels, in_height,
                                                   in_width);
  // kernel = 7, pad = 0, stride = 1
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1";
  paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 1>(in_channels, in_height,
                                                   in_width);
  // kernel = 7, pad = 0, stride = 2
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=2";
  paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 2>(in_channels, in_height,
                                                   in_width);
  // kernel = 7, pad = 0, stride = 3
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=3";
  paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 3>(in_channels, in_height,
                                                   in_width);
  // Old tests (removed): int8_t average pooling with kernel = 3
  // kernel = 3, pad = 0, stride = 1
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=1";
  paddle_mobile::TestPoolOp<int8_t, 0, 1, 3, 0, 1>(in_channels, in_height,
                                                   in_width);
  // kernel = 3, pad = 0, stride = 3
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=3";
  paddle_mobile::TestPoolOp<int8_t, 0, 1, 3, 0, 3>(in_channels, in_height,
                                                   in_width);
  // New tests (added): float average pooling with kernel = 5
  // kernel = 5, pad = 0, stride = 1
  LOG(paddle_mobile::kLOG_INFO)
      << "float, ceil_mode=false, pooling_type=avg, kernel=5, pad=0, stride=1";
  paddle_mobile::TestPoolOp<float, 0, 1, 5, 0, 1>(in_channels, in_height,
                                                  in_width);
  // kernel = 5, pad = 0, stride = 2
  LOG(paddle_mobile::kLOG_INFO)
      << "float, ceil_mode=false, pooling_type=avg, kernel=5, pad=0, stride=2";
  paddle_mobile::TestPoolOp<float, 0, 1, 5, 0, 2>(in_channels, in_height,
                                                  in_width);
// kernel = 7, pad = 0, stride = 1
  LOG(paddle_mobile::kLOG_INFO)
...
...
@@ -284,9 +157,4 @@ int main(int argc, char *argv[]) {
      << "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=4";
  paddle_mobile::TestPoolOp<float, 0, 1, 7, 0, 4>(in_channels, in_height,
                                                  in_width);
// kernel = 5, pad = 0, stride = 1
  LOG(paddle_mobile::kLOG_INFO)
      << "float, ceil_mode=false, pooling_type=avg, kernel=5, pad=0, stride=1";
  paddle_mobile::TestPoolOp<float, 0, 1, 5, 0, 1>(in_channels, in_height,
                                                  in_width);
}