Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
07329c60
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
338
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
07329c60
编写于
12月 04, 2018
作者:
Z
ZhenWang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add int8_t type pooling support.
上级
475d7e91
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
933 addition
and
76 deletion
+933
-76
src/operators/kernel/central-arm-func/pool_arm_func.h
src/operators/kernel/central-arm-func/pool_arm_func.h
+51
-34
src/operators/math/pool_3x3.cpp
src/operators/math/pool_3x3.cpp
+2
-1
src/operators/math/pool_3x3.h
src/operators/math/pool_3x3.h
+15
-9
src/operators/math/pool_3x3_int8.cpp
src/operators/math/pool_3x3_int8.cpp
+562
-0
src/operators/math/pooling.cpp
src/operators/math/pooling.cpp
+6
-4
src/operators/math/pooling.h
src/operators/math/pooling.h
+28
-8
test/CMakeLists.txt
test/CMakeLists.txt
+2
-2
test/operators/test_fusion_conv_add_relu_int8_op.cpp
test/operators/test_fusion_conv_add_relu_int8_op.cpp
+1
-1
test/operators/test_pool_op.cpp
test/operators/test_pool_op.cpp
+266
-17
未找到文件。
src/operators/kernel/central-arm-func/pool_arm_func.h
浏览文件 @
07329c60
...
...
@@ -23,20 +23,22 @@ namespace paddle_mobile {
namespace
operators
{
using
framework
::
Tensor
;
inline
void
PoolBasic
(
std
::
string
pooling_type
,
std
::
vector
<
int
>
ksize
,
std
::
vector
<
int
>
strides
,
std
::
vector
<
int
>
paddings
,
const
Tensor
*
in_x
,
Tensor
*
out
)
{
template
<
typename
T
,
typename
S
>
void
PoolBasic
(
std
::
string
pooling_type
,
std
::
vector
<
int
>
ksize
,
std
::
vector
<
int
>
strides
,
std
::
vector
<
int
>
paddings
,
const
Tensor
*
in_x
,
Tensor
*
out
)
{
if
(
pooling_type
==
"max"
)
{
math
::
PoolFunctor
<
CPU
,
math
::
MaxPool
<
float
>
,
float
>
pool2d_forward
;
math
::
MaxPool
<
float
>
pool_process
;
math
::
PoolFunctor
<
CPU
,
math
::
MaxPool
<
T
>
,
T
>
pool2d_forward
;
math
::
MaxPool
<
T
>
pool_process
;
pool2d_forward
(
*
in_x
,
ksize
,
strides
,
paddings
,
pool_process
,
out
);
}
else
if
(
pooling_type
==
"avg"
)
{
math
::
PoolFunctor
<
CPU
,
math
::
AvgPool
<
float
>
,
float
>
pool2d_forward
;
math
::
AvgPool
<
float
>
pool_process
;
math
::
PoolFunctor
<
CPU
,
math
::
AvgPool
<
T
,
S
>
,
T
>
pool2d_forward
;
math
::
AvgPool
<
T
,
S
>
pool_process
;
pool2d_forward
(
*
in_x
,
ksize
,
strides
,
paddings
,
pool_process
,
out
);
}
}
template
<
typename
P
>
void
PoolCompute
(
const
PoolParam
<
CPU
>
&
param
)
{
const
Tensor
*
in_x
=
param
.
Input
();
...
...
@@ -52,50 +54,65 @@ void PoolCompute(const PoolParam<CPU> ¶m) {
LOG
(
paddle_mobile
::
LogLevel
::
kLOG_ERROR
)
<<
"Pool op only supports 2D and 3D input."
;
}
if
(
param
.
isGlobalPooling
())
{
for
(
size_t
i
=
0
;
i
<
ksize
.
size
();
++
i
)
{
paddings
[
i
]
=
0
;
ksize
[
i
]
=
static_cast
<
int
>
(
in_x
->
dims
()[
i
+
2
]);
}
}
if
(
ksize
[
0
]
==
3
&&
ksize
[
0
]
==
ksize
[
1
])
{
if
(
pooling_type
==
"max"
)
{
if
(
strides
[
0
]
==
strides
[
1
]
&&
strides
[
0
]
==
1
&&
paddings
[
0
]
==
paddings
[
1
]
&&
paddings
[
1
]
==
1
)
{
math
::
Pool3x3Maxs1p1
(
in_x
,
out
);
if
(
in_x
->
type
()
==
typeid
(
int8_t
))
{
if
(
pooling_type
==
"max"
&&
ksize
[
0
]
==
3
&&
ksize
[
0
]
==
ksize
[
1
])
{
if
(
strides
[
0
]
==
strides
[
1
]
&&
strides
[
0
]
==
1
)
{
math
::
Pool3x3Maxs1_int8
(
in_x
,
out
,
paddings
[
0
],
paddings
[
1
]);
}
else
if
(
strides
[
0
]
==
strides
[
1
]
&&
strides
[
0
]
==
2
)
{
math
::
Pool3x3Maxs2_int8
(
in_x
,
out
,
paddings
[
0
],
paddings
[
1
]);
}
else
{
math
::
Pool3x3Max
(
strides
,
paddings
,
in_x
,
out
);
}
}
else
if
(
pooling_type
==
"avg"
)
{
if
(
strides
[
0
]
==
strides
[
1
]
&&
strides
[
0
]
==
1
&&
paddings
[
0
]
==
paddings
[
1
]
&&
paddings
[
1
]
==
1
)
{
math
::
Pool3x3Avgs1p1
(
in_x
,
out
);
}
else
{
math
::
Pool3x3Avg
(
strides
,
paddings
,
in_x
,
out
);
math
::
Pool3x3Max_int8
(
strides
,
paddings
,
in_x
,
out
);
}
}
else
{
PoolBasic
<
int8_t
,
int32_t
>
(
pooling_type
,
ksize
,
strides
,
paddings
,
in_x
,
out
);
}
}
else
{
if
(
ksize
[
0
]
==
3
&&
ksize
[
0
]
==
ksize
[
1
])
{
if
(
pooling_type
==
"max"
)
{
if
(
strides
[
0
]
==
strides
[
1
]
&&
strides
[
0
]
==
1
&&
paddings
[
0
]
==
paddings
[
1
]
&&
paddings
[
1
]
==
1
)
{
math
::
Pool3x3Maxs1p1
(
in_x
,
out
);
}
else
{
math
::
Pool3x3Max
(
strides
,
paddings
,
in_x
,
out
);
}
}
else
if
(
pooling_type
==
"avg"
)
{
if
(
strides
[
0
]
==
strides
[
1
]
&&
strides
[
0
]
==
1
&&
paddings
[
0
]
==
paddings
[
1
]
&&
paddings
[
1
]
==
1
)
{
math
::
Pool3x3Avgs1p1
(
in_x
,
out
);
}
else
{
math
::
Pool3x3Avg
(
strides
,
paddings
,
in_x
,
out
);
}
}
}
else
if
(
ksize
[
0
]
==
2
&&
ksize
[
0
]
==
ksize
[
1
]
&&
strides
[
0
]
==
2
&&
strides
[
0
]
==
strides
[
1
]
&&
paddings
[
0
]
==
paddings
[
1
]
&&
paddings
[
1
]
==
0
)
{
}
else
if
(
ksize
[
0
]
==
2
&&
ksize
[
0
]
==
ksize
[
1
]
&&
strides
[
0
]
==
2
&&
strides
[
0
]
==
strides
[
1
]
&&
paddings
[
0
]
==
paddings
[
1
]
&&
paddings
[
1
]
==
0
)
{
#if __ARM_NEON
#if __aarch64__
PoolBasic
(
pooling_type
,
ksize
,
strides
,
paddings
,
in_x
,
out
);
PoolBasic
<
float
,
float
>
(
pooling_type
,
ksize
,
strides
,
paddings
,
in_x
,
out
);
#else
/// todo: fix bug in Pool2x2
if
(
pooling_type
==
"max"
)
{
math
::
Pool2x2Maxs2p0
(
strides
,
paddings
,
in_x
,
out
);
}
else
if
(
pooling_type
==
"avg"
)
{
math
::
Pool2x2Avgs2p0
(
strides
,
paddings
,
in_x
,
out
);
}
/// todo: fix bug in Pool2x2
if
(
pooling_type
==
"max"
)
{
math
::
Pool2x2Maxs2p0
(
strides
,
paddings
,
in_x
,
out
);
}
else
if
(
pooling_type
==
"avg"
)
{
math
::
Pool2x2Avgs2p0
(
strides
,
paddings
,
in_x
,
out
);
}
#endif
#else
PoolBasic
(
pooling_type
,
ksize
,
strides
,
paddings
,
in_x
,
out
);
PoolBasic
<
float
,
float
>
(
pooling_type
,
ksize
,
strides
,
paddings
,
in_x
,
out
);
#endif // __ARM_NEON
}
else
{
PoolBasic
(
pooling_type
,
ksize
,
strides
,
paddings
,
in_x
,
out
);
}
else
{
PoolBasic
<
float
,
float
>
(
pooling_type
,
ksize
,
strides
,
paddings
,
in_x
,
out
);
}
}
}
...
...
src/operators/math/pool_3x3.cpp
浏览文件 @
07329c60
...
...
@@ -38,6 +38,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
const
int
input_width
=
static_cast
<
int
>
(
input
->
dims
()[
3
]);
const
int
output_height
=
static_cast
<
int
>
(
output
->
dims
()[
2
]);
const
int
output_width
=
static_cast
<
int
>
(
output
->
dims
()[
3
]);
output
->
mutable_data
<
float
>
();
const
int
hxw
=
input_height
*
input_width
;
...
...
@@ -472,7 +473,7 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
const
int
inputdata_channel_stride
=
h_in
*
w_in
;
const
int
input_batch_stride
=
output_channels
*
inputdata_channel_stride
;
const
int
output_batch_stride
=
output_channels
*
outputdata_channel_stride
;
float
*
out_data
=
output
->
data
<
float
>
();
float
*
out_data
=
output
->
mutable_
data
<
float
>
();
const
float
*
input_data
=
input
->
data
<
float
>
();
for
(
int
k
=
0
;
k
<
batch_size
;
++
k
)
{
#pragma omp parallel for
...
...
src/operators/math/pool_3x3.h
浏览文件 @
07329c60
...
...
@@ -28,15 +28,21 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
namespace
math
{
using
framework
::
Tensor
;
using
std
::
vector
;
void
Pool3x3Avgs1p1
(
const
Tensor
*
input
,
Tensor
*
output
);
void
Pool3x3Maxs1p1
(
const
Tensor
*
input
,
Tensor
*
output
);
void
Pool3x3Max
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
input
,
Tensor
*
output
);
void
Pool3x3Avg
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
in_x
,
Tensor
*
out
);
void
Pool3x3Avgs1p1
(
const
framework
::
Tensor
*
input
,
framework
::
Tensor
*
output
);
void
Pool3x3Maxs1p1
(
const
framework
::
Tensor
*
input
,
framework
::
Tensor
*
output
);
void
Pool3x3Max
(
std
::
vector
<
int
>
strides
,
std
::
vector
<
int
>
paddings
,
const
framework
::
Tensor
*
input
,
framework
::
Tensor
*
output
);
void
Pool3x3Avg
(
std
::
vector
<
int
>
strides
,
std
::
vector
<
int
>
paddings
,
const
framework
::
Tensor
*
in_x
,
framework
::
Tensor
*
out
);
void
Pool3x3Maxs1_int8
(
const
framework
::
Tensor
*
input
,
framework
::
Tensor
*
output
,
int32_t
pad_h
,
int32_t
pad_w
);
void
Pool3x3Maxs2_int8
(
const
framework
::
Tensor
*
input
,
framework
::
Tensor
*
output
,
int32_t
pad_h
,
int32_t
pad_w
);
void
Pool3x3Max_int8
(
const
std
::
vector
<
int
>
&
strides
,
const
std
::
vector
<
int
>
&
paddings
,
const
framework
::
Tensor
*
input
,
framework
::
Tensor
*
output
);
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/math/pool_3x3_int8.cpp
0 → 100644
浏览文件 @
07329c60
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#ifdef _OPENMP
#include <omp.h>
#endif
#include "framework/tensor.h"
#include "operators/math/pool_3x3.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
#include <climits>
#include <iostream>
namespace
paddle_mobile
{
namespace
operators
{
namespace
math
{
using
framework
::
Tensor
;
using
std
::
max
;
using
std
::
min
;
using
std
::
vector
;
template
<
typename
T
>
static
void
make_paddings
(
const
Tensor
*
input
,
Tensor
*
padded_input
,
int32_t
top
,
int32_t
bottom
,
int32_t
left
,
int32_t
right
,
T
value
)
{
const
int32_t
batch_size
=
input
->
dims
()[
0
];
const
int32_t
c_in
=
input
->
dims
()[
1
];
const
int32_t
h_in
=
input
->
dims
()[
2
];
const
int32_t
w_in
=
input
->
dims
()[
3
];
const
int32_t
h_padded
=
h_in
+
top
+
bottom
;
const
int32_t
w_padded
=
w_in
+
left
+
right
;
padded_input
->
Resize
({
batch_size
,
c_in
,
h_padded
,
w_padded
});
T
*
padded_input_data
=
padded_input
->
mutable_data
<
T
>
();
const
T
*
input_data
=
input
->
data
<
T
>
();
const
int32_t
input_channel_stride
=
h_in
*
w_in
;
const
int32_t
input_batch_stride
=
c_in
*
input_channel_stride
;
const
int32_t
padded_channel_stride
=
h_padded
*
w_padded
;
const
int32_t
padded_batch_stride
=
c_in
*
padded_channel_stride
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
#pragma omp parallel for
for
(
int
j
=
0
;
j
<
c_in
;
++
j
)
{
const
T
*
img_in
=
input_data
+
j
*
input_channel_stride
;
T
*
img_padded
=
padded_input_data
+
j
*
padded_channel_stride
;
int
k
=
0
;
for
(;
k
<
top
;
++
k
)
{
for
(
int
l
=
0
;
l
<
w_padded
;
++
l
)
{
img_padded
[
l
]
=
value
;
}
img_padded
+=
w_padded
;
}
for
(;
k
<
top
+
h_in
;
++
k
)
{
int
l
=
0
;
for
(;
l
<
left
;
++
l
)
{
img_padded
[
l
]
=
value
;
}
memcpy
(
img_padded
+
left
,
img_in
,
w_in
*
sizeof
(
T
));
l
+=
w_in
;
img_in
+=
w_in
;
for
(;
l
<
w_padded
;
++
l
)
{
img_padded
[
l
]
=
value
;
}
img_padded
+=
w_padded
;
}
for
(;
k
<
h_padded
;
++
k
)
{
for
(
int
l
=
0
;
l
<
w_padded
;
++
l
)
{
img_padded
[
l
]
=
value
;
}
img_padded
+=
w_padded
;
}
}
input_data
+=
input_batch_stride
;
padded_input_data
+=
padded_batch_stride
;
}
// input_data = input->data<T>();
// std::cout << "+++++++++++++++++++Origin begin++++++++++++++++++++"
// << std::endl;
// for (int i = 0; i < 1; ++i) {
// for (int j = 0; j < 1; ++j) {
// const T *img_in = input_data + j * input_channel_stride;
// for (int k = 0; k < h_in; ++k) {
// for (int l = 0; l < w_in; ++l) {
// std::cout << (int32_t)*img_in << "\t";
// img_in++;
// }
// std::cout << std::endl;
// }
// }
// input_data += input_batch_stride;
// }
// std::cout << "+++++++++++++++++++Origin end++++++++++++++++++++" <<
// std::endl;
//
// padded_input_data = padded_input->data<T>();
// std::cout << "******************Padding begin**********************"
// << std::endl;
// for (int i = 0; i < 1; ++i) {
// for (int j = 0; j < 1; ++j) {
// T *img_padded = padded_input_data + j * padded_channel_stride;
// for (int k = 0; k < h_padded; ++k) {
// for (int l = 0; l < w_padded; ++l) {
// std::cout << (int32_t)*img_padded << "\t";
// img_padded++;
// }
// std::cout << std::endl;
// }
// }
// padded_input_data += padded_batch_stride;
// }
// std::cout << "******************Padding end**********************"
// << std::endl;
}
void
Pool3x3Maxs1_int8
(
const
Tensor
*
input
,
Tensor
*
output
,
int32_t
pad_h
,
int32_t
pad_w
)
{
Tensor
padded_input
;
if
(
pad_h
!=
0
&&
pad_w
!=
0
)
{
int8_t
value
=
-
SCHAR_MAX
;
make_paddings
(
input
,
&
padded_input
,
pad_h
,
pad_h
,
pad_w
,
pad_w
,
value
);
input
=
&
padded_input
;
}
const
int32_t
batch_size
=
input
->
dims
()[
0
];
const
int32_t
h_in
=
input
->
dims
()[
2
];
const
int32_t
w_in
=
input
->
dims
()[
3
];
const
int8_t
*
input_data
=
input
->
data
<
int8_t
>
();
const
int32_t
output_channels
=
output
->
dims
()[
1
];
const
int32_t
h_out
=
output
->
dims
()[
2
];
int32_t
w_out
=
output
->
dims
()[
3
];
int8_t
*
output_data
=
output
->
mutable_data
<
int8_t
>
();
const
int32_t
outputdata_channel_stride
=
h_out
*
w_out
;
const
int32_t
inputdata_channel_stride
=
h_in
*
w_in
;
const
int32_t
input_batch_stride
=
output_channels
*
inputdata_channel_stride
;
const
int32_t
output_batch_stride
=
output_channels
*
outputdata_channel_stride
;
// std::cout << "h_out = " << h_out << ", w_out=" << w_out << std::endl;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
#pragma omp parallel for
for
(
int
j
=
0
;
j
<
output_channels
;
++
j
)
{
const
int8_t
*
img_in
=
input_data
+
j
*
inputdata_channel_stride
;
int8_t
*
img_out
=
output_data
+
j
*
outputdata_channel_stride
;
for
(
int
k
=
0
;
k
<
h_out
;
++
k
)
{
const
int8_t
*
row0
=
img_in
+
k
*
w_in
;
const
int8_t
*
row1
=
img_in
+
(
k
+
1
)
*
w_in
;
const
int8_t
*
row2
=
img_in
+
(
k
+
2
)
*
w_in
;
#if __ARM_NEON
int32_t
nw
=
w_out
>>
4
;
int32_t
left_w
=
w_out
&
0xf
;
int32_t
nw1
=
left_w
>>
3
;
int32_t
left_w1
=
left_w
&
0x7
;
#if __aarch64__
// TODO(wzzju)
#else
if
(
nw
>
0
)
{
#define LOOP_LABEL "1"
// result: q15
asm
volatile
(
"vld1.8 {q0}, [%[row0]]!
\n\t
"
// q0=0-15
"vld1.8 {q2}, [%[row1]]!
\n\t
"
"vld1.8 {q4}, [%[row2]]!
\n\t
"
LOOP_LABEL
":
\n\t
"
"vld1.8 {q1}, [%[row0]]!
\n\t
"
// q1=16-31
"vext.8 q6, q0, q1, #1
\n\t
"
"vext.8 q7, q0, q1, #2
\n\t
"
"vld1.8 {q3}, [%[row1]]!
\n\t
"
"vmax.s8 q15, q0, q6
\n\t
"
"vmax.s8 q15, q15, q7
\n\t
"
"vext.8 q6, q2, q3, #1
\n\t
"
"vext.8 q7, q2, q3, #2
\n\t
"
"vld1.8 {q5}, [%[row2]]!
\n\t
"
"vmax.s8 q14, q2, q6
\n\t
"
"vmax.s8 q14, q14, q7
\n\t
"
"vext.8 q6, q4, q5, #1
\n\t
"
"vext.8 q7, q4, q5, #2
\n\t
"
"vmax.s8 q13, q4, q6
\n\t
"
"vmax.s8 q13, q13, q7
\n\t
"
"vmax.s8 q15, q15, q14
\n\t
"
"vmax.s8 q15, q15, q13
\n\t
"
"vmov.s8 q0, q1
\n\t
"
"vmov.s8 q2, q3
\n\t
"
"vmov.s8 q4, q5
\n\t
"
"vst1.8 {q15}, [%[img_out]]!
\n\t
"
"subs %[nw], #1
\n\t
"
"bne "
LOOP_LABEL
"b
\n\t
"
"sub %[row0], #16
\n\t
"
"sub %[row1], #16
\n\t
"
"sub %[row2], #16
\n\t
"
:
[
nw
]
"+r"
(
nw
),
[
row0
]
"+r"
(
row0
),
[
row1
]
"+r"
(
row1
),
[
row2
]
"+r"
(
row2
),
[
img_out
]
"+r"
(
img_out
)
:
:
"cc"
,
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q13"
,
"q14"
,
"q15"
);
#undef LOOP_LABEL
}
if
(
nw1
>
0
||
left_w1
>
0
)
{
#define PADDLE_LABEL_LESS8 "1"
#define PADDLE_LABEL_LESS8_SAVE "2"
#define PADDLE_LABEL_OVER "3"
// result: d15
asm
volatile
(
"vld1.8 {d0}, [%[row0]]!
\n\t
"
// d0=0-8
"vld1.8 {d2}, [%[row1]]!
\n\t
"
"vld1.8 {d4}, [%[row2]]!
\n\t
"
"mov r0, #1
\n\t
"
"cmp %[nw1], #0
\n\t
"
"beq "
PADDLE_LABEL_LESS8
"f
\n\t
"
"vld1.8 {d1}, [%[row0]]!
\n\t
"
// d1=9-15
"vext.8 d6, d0, d1, #1
\n\t
"
"vext.8 d7, d0, d1, #2
\n\t
"
"vld1.8 {d3}, [%[row1]]!
\n\t
"
"vmax.s8 d15, d0, d6
\n\t
"
"vmax.s8 d15, d15, d7
\n\t
"
"vext.8 d6, d2, d3, #1
\n\t
"
"vext.8 d7, d2, d3, #2
\n\t
"
"vld1.8 {d5}, [%[row2]]!
\n\t
"
"vmax.s8 d14, d2, d6
\n\t
"
"vmax.s8 d14, d14, d7
\n\t
"
"vext.8 d6, d4, d5, #1
\n\t
"
"vext.8 d7, d4, d5, #2
\n\t
"
"vmax.s8 d13, d4, d6
\n\t
"
"vmax.s8 d13, d13, d7
\n\t
"
"vmax.s8 d15, d15, d14
\n\t
"
"vmax.s8 d15, d15, d13
\n\t
"
"vmov.s8 d0, d1
\n\t
"
"vmov.s8 d2, d3
\n\t
"
"vmov.s8 d4, d5
\n\t
"
"vst1.8 {d15}, [%[img_out]]!
\n\t
"
PADDLE_LABEL_LESS8
":
\n\t
"
"cmp %[left_w1], #0
\n\t
"
"beq "
PADDLE_LABEL_OVER
"f
\n\t
"
"vld1.8 {d1}, [%[row0]]
\n\t
"
// d1=9-15
"vext.8 d6, d0, d1, #1
\n\t
"
"vext.8 d7, d0, d1, #2
\n\t
"
"vld1.8 {d3}, [%[row1]]
\n\t
"
"vmax.s8 d15, d0, d6
\n\t
"
"vmax.s8 d15, d15, d7
\n\t
"
"vext.8 d6, d2, d3, #1
\n\t
"
"vext.8 d7, d2, d3, #2
\n\t
"
"vld1.8 {d5}, [%[row2]]
\n\t
"
"vmax.s8 d14, d2, d6
\n\t
"
"vmax.s8 d14, d14, d7
\n\t
"
"vext.8 d6, d4, d5, #1
\n\t
"
"vext.8 d7, d4, d5, #2
\n\t
"
"vmax.s8 d13, d4, d6
\n\t
"
"vmax.s8 d13, d13, d7
\n\t
"
"vmax.s8 d15, d15, d14
\n\t
"
"vmax.s8 d15, d15, d13
\n\t
"
PADDLE_LABEL_LESS8_SAVE
":
\n\t
"
"vst1.8 {d15}, [%[img_out]], r0
\n\t
"
"add %[row0], %[row0], #1
\n\t
"
"add %[row1], %[row1], #1
\n\t
"
"add %[row2], %[row2], #1
\n\t
"
"vext.8 d15, d15, d15, #1
\n\t
"
"subs %[left_w1], #1
\n\t
"
"bgt "
PADDLE_LABEL_LESS8_SAVE
"b
\n\t
"
PADDLE_LABEL_OVER
":
\n\t
"
:
[
nw1
]
"+r"
(
nw1
),
[
left_w1
]
"+r"
(
left_w1
),
[
row0
]
"+r"
(
row0
),
[
row1
]
"+r"
(
row1
),
[
row2
]
"+r"
(
row2
),
[
img_out
]
"+r"
(
img_out
)
:
:
"cc"
,
"memory"
,
"r0"
,
"d0"
,
"d1"
,
"d2"
,
"d3"
,
"d4"
,
"d5"
,
"d6"
,
"d7"
,
"d13"
,
"d14"
,
"d15"
);
#undef PADDLE_LABEL_OVER
#undef PADDLE_LABEL_LESS8_SAVE
#undef PADDLE_LABEL_LESS8
}
#endif // __aarch64__
#else
int32_t
left
=
w_out
;
while
(
left
>
0
)
{
const
int8_t
max0
=
std
::
max
(
std
::
max
(
row0
[
0
],
row0
[
1
]),
row0
[
2
]);
const
int8_t
max1
=
std
::
max
(
std
::
max
(
row1
[
0
],
row1
[
1
]),
row1
[
2
]);
const
int8_t
max2
=
std
::
max
(
std
::
max
(
row2
[
0
],
row2
[
1
]),
row2
[
2
]);
*
img_out
=
std
::
max
(
std
::
max
(
max0
,
max1
),
max2
);
row0
+=
1
;
row1
+=
1
;
row2
+=
1
;
img_out
++
;
left
--
;
}
#endif // __ARM_NEON
}
}
input_data
+=
input_batch_stride
;
output_data
+=
output_batch_stride
;
}
}
void
Pool3x3Maxs2_int8
(
const
Tensor
*
input
,
Tensor
*
output
,
int32_t
pad_h
,
int32_t
pad_w
)
{
Tensor
padded_input
;
if
(
pad_h
!=
0
&&
pad_w
!=
0
)
{
int8_t
value
=
-
SCHAR_MAX
;
make_paddings
(
input
,
&
padded_input
,
pad_h
,
pad_h
,
pad_w
,
pad_w
,
value
);
input
=
&
padded_input
;
}
const
int32_t
batch_size
=
input
->
dims
()[
0
];
const
int32_t
h_in
=
input
->
dims
()[
2
];
const
int32_t
w_in
=
input
->
dims
()[
3
];
const
int32_t
output_channels
=
output
->
dims
()[
1
];
const
int32_t
h_out
=
output
->
dims
()[
2
];
int32_t
w_out
=
output
->
dims
()[
3
];
const
int32_t
outputdata_channel_stride
=
h_out
*
w_out
;
const
int32_t
inputdata_channel_stride
=
h_in
*
w_in
;
const
int32_t
output_batch_stride
=
output_channels
*
outputdata_channel_stride
;
const
int32_t
input_batch_stride
=
output_channels
*
inputdata_channel_stride
;
const
int8_t
*
input_data
=
input
->
data
<
int8_t
>
();
int8_t
*
output_data
=
output
->
mutable_data
<
int8_t
>
();
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
#pragma omp parallel for
for
(
int
j
=
0
;
j
<
output_channels
;
++
j
)
{
const
int8_t
*
img_in
=
input_data
+
j
*
inputdata_channel_stride
;
int8_t
*
img_out
=
output_data
+
j
*
outputdata_channel_stride
;
for
(
int
k
=
0
;
k
<
h_out
;
++
k
)
{
const
int8_t
*
row0
=
img_in
+
2
*
k
*
w_in
;
const
int8_t
*
row1
=
img_in
+
(
2
*
k
+
1
)
*
w_in
;
const
int8_t
*
row2
=
img_in
+
(
2
*
k
+
2
)
*
w_in
;
#if __ARM_NEON
int32_t
nw
=
w_out
>>
4
;
int32_t
left_w
=
w_out
&
0xf
;
int32_t
nw1
=
left_w
>>
3
;
int32_t
left_w1
=
left_w
&
0x7
;
#if __aarch64__
// TODO(wzzju)
#else
if
(
nw
>
0
)
{
#define LOOP_LABEL "1"
// result: q15
asm
volatile
(
"vld2.8 {q0, q1}, [%[row0]]!
\n\t
"
// q0=0-30, q1=1-31
"vld2.8 {q2, q3}, [%[row1]]!
\n\t
"
"vld2.8 {q4, q5}, [%[row2]]!
\n\t
"
LOOP_LABEL
":
\n\t
"
"vmax.s8 q15, q0, q1
\n\t
"
"vld2.8 {q6, q7}, [%[row0]]!
\n\t
"
// q0=32-62, q1=33-63
"vmax.s8 q14, q2, q3
\n\t
"
"vmax.s8 q13, q4, q5
\n\t
"
"vld2.8 {q8, q9}, [%[row1]]!
\n\t
"
"vext.8 q0, q0, q6, #1
\n\t
"
"vmax.s8 q15, q15, q0
\n\t
"
"vld2.8 {q10, q11}, [%[row2]]!
\n\t
"
"vext.8 q2, q2, q8, #1
\n\t
"
"vmax.s8 q14, q14, q2
\n\t
"
"vext.8 q4, q4, q10, #1
\n\t
"
"vmax.s8 q13, q13, q4
\n\t
"
"vmax.s8 q15, q15, q14
\n\t
"
"vmax.s8 q15, q15, q13
\n\t
"
"vmov.s8 q0, q6
\n\t
"
"vmov.s8 q1, q7
\n\t
"
"vmov.s8 q2, q8
\n\t
"
"vmov.s8 q3, q9
\n\t
"
"vmov.s8 q4, q10
\n\t
"
"vmov.s8 q5, q11
\n\t
"
"vst1.8 {q15}, [%[img_out]]!
\n\t
"
"subs %[nw], #1
\n\t
"
"bne "
LOOP_LABEL
"b
\n\t
"
"sub %[row0], #32
\n\t
"
"sub %[row1], #32
\n\t
"
"sub %[row2], #32
\n\t
"
:
[
nw
]
"+r"
(
nw
),
[
row0
]
"+r"
(
row0
),
[
row1
]
"+r"
(
row1
),
[
row2
]
"+r"
(
row2
),
[
img_out
]
"+r"
(
img_out
)
:
:
"cc"
,
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
,
"q13"
,
"q14"
,
"q15"
);
#undef LOOP_LABEL
}
if
(
nw1
>
0
||
left_w1
>
0
)
{
#define PADDLE_LABEL_LESS8 "1"
#define PADDLE_LABEL_LESS8_SAVE "2"
#define PADDLE_LABEL_OVER "3"
// result: d15
asm
volatile
(
"vld2.8 {d0, d1}, [%[row0]]!
\n\t
"
// d0=0-14, d1=1-15
"vld2.8 {d2, d3}, [%[row1]]!
\n\t
"
"vld2.8 {d4, d5}, [%[row2]]!
\n\t
"
"mov r0, #1
\n\t
"
"cmp %[nw1], #0
\n\t
"
"beq "
PADDLE_LABEL_LESS8
"f
\n\t
"
"vmax.s8 d15, d0, d1
\n\t
"
"vld2.8 {d6, d7}, [%[row0]]!
\n\t
"
// d0=32-62, d1=33-63
"vmax.s8 d14, d2, d3
\n\t
"
"vmax.s8 d13, d4, d5
\n\t
"
"vld2.8 {d8, d9}, [%[row1]]!
\n\t
"
"vext.8 d0, d0, d6, #1
\n\t
"
"vmax.s8 d15, d15, d0
\n\t
"
"vld2.8 {d10, d11}, [%[row2]]!
\n\t
"
"vext.8 d2, d2, d8, #1
\n\t
"
"vmax.s8 d14, d14, d2
\n\t
"
"vext.8 d4, d4, d10, #1
\n\t
"
"vmax.s8 d13, d13, d4
\n\t
"
"vmax.s8 d15, d15, d14
\n\t
"
"vmax.s8 d15, d15, d13
\n\t
"
"vmov.s8 d0, d6
\n\t
"
"vmov.s8 d1, d7
\n\t
"
"vmov.s8 d2, d8
\n\t
"
"vmov.s8 d3, d9
\n\t
"
"vmov.s8 d4, d10
\n\t
"
"vmov.s8 d5, d11
\n\t
"
"vst1.8 {d15}, [%[img_out]]!
\n\t
"
PADDLE_LABEL_LESS8
":
\n\t
"
"cmp %[left_w1], #0
\n\t
"
"beq "
PADDLE_LABEL_OVER
"f
\n\t
"
"vmax.s8 d15, d0, d1
\n\t
"
"vld2.8 {d6, d7}, [%[row0]]
\n\t
"
// d0=32-62, d1=33-63
"vmax.s8 d14, d2, d3
\n\t
"
"vmax.s8 d13, d4, d5
\n\t
"
"vld2.8 {d8, d9}, [%[row1]]
\n\t
"
"vext.8 d0, d0, d6, #1
\n\t
"
"vmax.s8 d15, d15, d0
\n\t
"
"vld2.8 {d10, d11}, [%[row2]]
\n\t
"
"vext.8 d2, d2, d8, #1
\n\t
"
"vmax.s8 d14, d14, d2
\n\t
"
"vext.8 d4, d4, d10, #1
\n\t
"
"vmax.s8 d13, d13, d4
\n\t
"
"vmax.s8 d15, d15, d14
\n\t
"
"vmax.s8 d15, d15, d13
\n\t
"
PADDLE_LABEL_LESS8_SAVE
":
\n\t
"
"vst1.8 {d15}, [%[img_out]], r0
\n\t
"
"add %[row0], %[row0], #2
\n\t
"
"add %[row1], %[row1], #2
\n\t
"
"add %[row2], %[row2], #2
\n\t
"
"vext.8 d15, d15, d15, #1
\n\t
"
"subs %[left_w1], #1
\n\t
"
"bgt "
PADDLE_LABEL_LESS8_SAVE
"b
\n\t
"
PADDLE_LABEL_OVER
":
\n\t
"
:
[
nw1
]
"+r"
(
nw1
),
[
left_w1
]
"+r"
(
left_w1
),
[
row0
]
"+r"
(
row0
),
[
row1
]
"+r"
(
row1
),
[
row2
]
"+r"
(
row2
),
[
img_out
]
"+r"
(
img_out
)
:
:
"cc"
,
"memory"
,
"r0"
,
"d0"
,
"d1"
,
"d2"
,
"d3"
,
"d4"
,
"d5"
,
"d6"
,
"d7"
,
"d8"
,
"d9"
,
"d10"
,
"d11"
,
"d13"
,
"d14"
,
"d15"
);
#undef PADDLE_LABEL_OVER
#undef PADDLE_LABEL_LESS8_SAVE
#undef PADDLE_LABEL_LESS8
}
#endif // __aarch64__
#else
int32_t
left
=
w_out
;
while
(
left
>
0
)
{
const
int8_t
max0
=
std
::
max
(
std
::
max
(
row0
[
0
],
row0
[
1
]),
row0
[
2
]);
const
int8_t
max1
=
std
::
max
(
std
::
max
(
row1
[
0
],
row1
[
1
]),
row1
[
2
]);
const
int8_t
max2
=
std
::
max
(
std
::
max
(
row2
[
0
],
row2
[
1
]),
row2
[
2
]);
*
img_out
=
std
::
max
(
std
::
max
(
max0
,
max1
),
max2
);
row0
+=
2
;
row1
+=
2
;
row2
+=
2
;
img_out
++
;
left
--
;
}
#endif // __ARM_NEON
}
}
input_data
+=
input_batch_stride
;
output_data
+=
output_batch_stride
;
}
}
void
Pool3x3Max_int8
(
const
vector
<
int
>
&
strides
,
const
vector
<
int
>
&
paddings
,
const
Tensor
*
input
,
Tensor
*
output
)
{
const
int
batch_size
=
input
->
dims
()[
0
];
const
int
input_height
=
input
->
dims
()[
2
];
const
int
input_width
=
input
->
dims
()[
3
];
const
int
output_channels
=
output
->
dims
()[
1
];
const
int
output_height
=
output
->
dims
()[
2
];
const
int
output_width
=
output
->
dims
()[
3
];
// const int _kernel_size = 3;
const
int
stride
=
strides
[
0
];
// const int stride_width = strides[1];
const
int
padding
=
paddings
[
0
];
// const int padding_width = paddings[1];
const
int8_t
negative_max
=
-
SCHAR_MAX
;
const
int
input_channel_stride
=
input_height
*
input_width
;
const
int
output_channel_stride
=
output_height
*
output_width
;
const
int8_t
*
input_data
=
input
->
data
<
int8_t
>
();
int8_t
*
output_data
=
output
->
mutable_data
<
int8_t
>
();
const
int
input_batch_stride
=
output_channels
*
input_channel_stride
;
const
int
output_batch_stride
=
output_channels
*
output_channel_stride
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
#pragma omp parallel for
for
(
int
c
=
0
;
c
<
output_channels
;
++
c
)
{
const
int8_t
*
input_seg
=
input_data
+
c
*
input_channel_stride
;
int8_t
*
output_seg
=
output_data
+
c
*
output_channel_stride
;
for
(
int
ph
=
0
;
ph
<
output_height
;
ph
++
)
{
int
hstart
=
ph
*
stride
-
padding
;
int
hend
=
min
(
hstart
+
3
,
input_height
);
hstart
=
max
(
hstart
,
0
);
for
(
int
pw
=
0
;
pw
<
output_width
;
pw
++
)
{
int
wstart
=
pw
*
stride
-
padding
;
int
wend
=
min
(
wstart
+
3
,
input_width
);
wstart
=
max
(
wstart
,
0
);
const
int8_t
*
pos1
=
input_seg
+
hstart
*
input_width
+
wstart
;
const
int8_t
*
pos2
=
input_seg
+
(
hstart
+
1
)
*
input_width
+
wstart
;
const
int8_t
*
pos3
=
input_seg
+
(
hstart
+
2
)
*
input_width
+
wstart
;
int8_t
*
output_ptr
=
output_seg
+
ph
*
output_width
+
pw
;
if
(
hend
-
hstart
!=
3
||
wend
-
wstart
!=
3
)
{
int8_t
max_value
=
-
SCHAR_MAX
;
for
(
int
h
=
hstart
;
h
<
hend
;
h
++
)
{
for
(
int
w
=
wstart
;
w
<
wend
;
w
++
)
{
int8_t
value
=
input_seg
[
h
*
input_width
+
w
];
if
(
value
>
max_value
)
{
max_value
=
value
;
}
}
}
output_seg
[
ph
*
output_width
+
pw
]
=
max_value
;
}
else
{
#if __ARM_NEON
#if __aarch64__
// TODO(wzzju)
#else
asm
volatile
(
"vld1.8 {d0}, [%[pos1]]
\n\t
"
"vld1.8 {d1}, [%[pos2]]
\n\t
"
"vld1.8 {d2}, [%[pos3]]
\n\t
"
"vmax.s8 d3, d0, d1
\n\t
"
"vmax.s8 d4, d2, d3
\n\t
"
"vmov.s8 d4[3], %[negative_max]
\n\t
"
"vpmax.s8 d5, d4, d4
\n\t
"
"vpmax.s8 d6, d5, d5
\n\t
"
"vst1.8 {d6[0]},[%[output_ptr]]
\n\t
"
:
:
[
pos1
]
"r"
(
pos1
),
[
pos2
]
"r"
(
pos2
),
[
pos3
]
"r"
(
pos3
),
[
output_ptr
]
"r"
(
output_ptr
),
[
negative_max
]
"r"
(
negative_max
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
);
#endif
#else
const
int8_t
max0
=
std
::
max
(
std
::
max
(
pos1
[
0
],
pos1
[
1
]),
pos1
[
2
]);
const
int8_t
max1
=
std
::
max
(
std
::
max
(
pos2
[
0
],
pos2
[
1
]),
pos2
[
2
]);
const
int8_t
max2
=
std
::
max
(
std
::
max
(
pos3
[
0
],
pos3
[
1
]),
pos3
[
2
]);
*
output_ptr
=
std
::
max
(
std
::
max
(
max0
,
max1
),
max2
);
#endif // __ARM_NEON
}
}
}
}
input_data
+=
input_batch_stride
;
output_data
+=
output_batch_stride
;
}
}
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/math/pooling.cpp
浏览文件 @
07329c60
...
...
@@ -70,15 +70,15 @@ class PoolFunctor<CPU, PoolProcess, T> {
int
wend
=
std
::
min
(
wstart
+
ksize_width
,
input_width
);
wstart
=
std
::
max
(
wstart
,
0
);
T
ele
=
pool_process
.
initial
();
auto
ele
=
pool_process
.
initial
();
for
(
int
h
=
hstart
;
h
<
hend
;
++
h
)
{
for
(
int
w
=
wstart
;
w
<
wend
;
++
w
)
{
pool_process
.
compute
(
input_data
[
h
*
input_width
+
w
],
&
ele
);
}
}
int
pool_size
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
pool_process
.
finalize
(
static_cast
<
T
>
(
pool_size
),
&
ele
);
output_data
[
ph
*
output_width
+
pw
]
=
ele
;
pool_process
.
finalize
(
static_cast
<
float
>
(
pool_size
),
&
ele
);
output_data
[
ph
*
output_width
+
pw
]
=
static_cast
<
T
>
(
ele
)
;
}
}
input_data
+=
input_stride
;
...
...
@@ -88,8 +88,10 @@ class PoolFunctor<CPU, PoolProcess, T> {
}
};
template
class
PoolFunctor
<
CPU
,
math
::
AvgPool
<
float
>,
float
>
;
template
class
PoolFunctor
<
CPU
,
math
::
AvgPool
<
float
,
float
>,
float
>
;
template
class
PoolFunctor
<
CPU
,
math
::
MaxPool
<
float
>,
float
>
;
template
class
PoolFunctor
<
CPU
,
math
::
AvgPool
<
int8_t
,
int32_t
>,
int8_t
>
;
template
class
PoolFunctor
<
CPU
,
math
::
MaxPool
<
int8_t
>,
int8_t
>
;
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/math/pooling.h
浏览文件 @
07329c60
...
...
@@ -16,6 +16,8 @@ limitations under the License. */
#pragma once
#include <climits>
#include <cmath>
#include "common/log.h"
#include "framework/tensor.h"
#include "pool_2x2.h"
...
...
@@ -37,24 +39,42 @@ namespace math {
* in pool pooling, and finally takes the average.
* MaxPoolGrad and AvgPoolGrad are gradient operations respectively.
*/
template
<
class
T
>
template
<
typename
T
>
class
MaxPool
{
public:
inline
T
initial
()
{
return
static_cast
<
T
>
(
-
FLT_MAX
);
}
inline
T
initial
()
{
if
(
typeid
(
T
)
==
typeid
(
int8_t
))
{
return
static_cast
<
T
>
(
-
SCHAR_MAX
);
}
return
static_cast
<
T
>
(
-
FLT_MAX
);
}
inline
void
compute
(
const
T
&
x
,
T
*
y
)
{
*
y
=
*
y
>
x
?
*
y
:
x
;
}
inline
void
finalize
(
const
T
&
pool_field
,
T
*
y
)
{}
};
template
<
class
T
>
template
<
typename
Itype
,
typename
Otype
>
class
AvgPool
{
public:
inline
T
initial
()
{
return
static_cast
<
T
>
(
0
);
}
inline
void
compute
(
const
T
&
x
,
T
*
y
)
{
*
y
+=
x
;
}
inline
void
finalize
(
const
T
&
pool_field
,
T
*
y
)
{
*
y
/=
pool_field
;
}
inline
Otype
initial
()
{
return
static_cast
<
Otype
>
(
0
);
}
inline
void
compute
(
const
Itype
&
x
,
Otype
*
y
)
{
*
y
+=
x
;
}
inline
void
finalize
(
const
float
&
pool_field
,
Otype
*
y
)
{
if
(
typeid
(
Itype
)
==
typeid
(
int8_t
))
{
float
tmp
=
*
y
/
pool_field
;
if
(
tmp
>
SCHAR_MAX
)
{
*
y
=
SCHAR_MAX
;
}
else
if
(
tmp
<
-
SCHAR_MAX
)
{
*
y
=
-
SCHAR_MAX
;
}
else
{
*
y
=
static_cast
<
Otype
>
(
std
::
round
(
tmp
));
}
}
else
{
*
y
/=
pool_field
;
}
}
};
template
<
typename
DeviceType
,
typename
PoolProcess
,
typename
T
>
...
...
test/CMakeLists.txt
浏览文件 @
07329c60
...
...
@@ -269,8 +269,8 @@ if (NOT FOUND_MATCH)
#gen test
ADD_EXECUTABLE
(
test-pool operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-pool paddle-mobile
)
ADD_EXECUTABLE
(
test-pool
-op
operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-pool
-op
paddle-mobile
)
#gen test
ADD_EXECUTABLE
(
test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h
)
...
...
test/operators/test_fusion_conv_add_relu_int8_op.cpp
浏览文件 @
07329c60
...
...
@@ -251,7 +251,7 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
attrs
[
"groups"
].
Set
<
int
>
(
1
);
attrs
[
"axis"
].
Set
<
int
>
(
0
);
auto
*
op
=
new
operators
::
FusionConvAddReluInt8Op
<
CPU
,
int8_t
>
(
auto
*
op
=
new
operators
::
FusionConvAddReluInt8Op
<
CPU
,
T
>
(
"fusion_conv_add_relu_int8"
,
inputs
,
outputs
,
attrs
,
scope
);
op
->
InferShape
();
op
->
Init
();
...
...
test/operators/test_pool_op.cpp
浏览文件 @
07329c60
...
...
@@ -12,30 +12,279 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_include.h"
#include "operators/kernel/central-arm-func/pool_arm_func.h"
#include "operators/pool_op.h"
int
main
()
{
paddle_mobile
::
framework
::
Loader
<
paddle_mobile
::
CPU
>
loader
;
auto
program
=
loader
.
Load
(
std
::
string
(
g_googlenet
));
if
(
program
.
originProgram
==
nullptr
)
{
DLOG
<<
"program read file"
;
namespace
paddle_mobile
{
static
int
PoolOutputSize
(
int
input_size
,
int
filter_size
,
int
padding
,
int
stride
,
bool
ceil_mode
)
{
int
output_size
;
if
(
!
ceil_mode
)
{
output_size
=
(
input_size
-
filter_size
+
2
*
padding
)
/
stride
+
1
;
}
else
{
output_size
=
(
input_size
-
filter_size
+
2
*
padding
+
stride
-
1
)
/
stride
+
1
;
}
return
output_size
;
}
// Reference implementation of average pooling with zero padding (pad == 0),
// used to validate the optimized pooling kernels.
//
// Layout is NCHW; `ksize` is {kernel_h, kernel_w} and `strides` is
// {stride_h, stride_w}.  The window sum is accumulated in float.  For
// int8_t outputs the average is rounded to nearest and saturated to
// [-127, 127] (matching the int8 kernel's behavior); for other element
// types the average is simply cast (truncated for integral T).
//
// NOTE(review): assumes pad == 0, so every window lies fully inside the
// input — callers must guarantee this (the test harness does).
template <typename T>
static void PoolAvgPad0(std::vector<int> ksize, std::vector<int> strides,
                        const framework::Tensor *input,
                        framework::Tensor *out) {
  const int32_t batch_size = input->dims()[0];
  const int32_t input_c = input->dims()[1];
  const int32_t input_h = input->dims()[2];
  const int32_t input_w = input->dims()[3];
  const int32_t out_c = out->dims()[1];
  const int32_t out_h = out->dims()[2];
  const int32_t out_w = out->dims()[3];
  const int32_t kernel_h = ksize[0];
  const int32_t kernel_w = ksize[1];
  const int32_t stride_h = strides[0];
  const int32_t stride_w = strides[1];
  const int32_t inputdata_channel_stride = input_h * input_w;
  const int32_t input_batch_stride = input_c * inputdata_channel_stride;
  const int32_t outputdata_channel_stride = out_h * out_w;
  const int32_t output_batch_stride = out_c * outputdata_channel_stride;
  T *out_data = out->mutable_data<T>();
  const T *input_data = input->data<T>();
  // One read pointer per kernel row.  RAII container instead of the raw
  // `new const T *[kernel_h]` / `delete[]` pair — no leak if anything
  // below throws.
  std::vector<const T *> rows(kernel_h);
  for (int i = 0; i < batch_size; ++i) {
    for (int j = 0; j < out_c; ++j) {
      const T *img_in = input_data + j * inputdata_channel_stride;
      T *img_out = out_data + j * outputdata_channel_stride;
      for (int k = 0; k < out_h; ++k) {
        // Point each row pointer at the top-left of the current window row.
        for (int m = 0; m < kernel_h; ++m) {
          rows[m] = img_in + (stride_h * k + m) * input_w;
        }
        int32_t left = out_w;
        while (left > 0) {
          // Accumulate the kernel_h x kernel_w window sum in float.
          float tmp = 0;
          for (int m = 0; m < kernel_h; ++m) {
            for (int l = 0; l < kernel_w; ++l) {
              tmp += rows[m][l];
            }
          }
          if (typeid(T) == typeid(int8_t)) {
            // int8 path: round-to-nearest average, saturated to [-127, 127].
            tmp = tmp / (kernel_h * kernel_w);
            if (tmp < -127) {
              *img_out = -127;
            } else if (tmp > 127) {
              *img_out = 127;
            } else {
              *img_out = static_cast<T>(std::round(tmp));
            }
          } else {
            // Non-int8 path: plain average, converted by static_cast.
            *img_out = static_cast<T>(tmp / (kernel_h * kernel_w));
          }
          // Slide the window one stride to the right.
          for (int m = 0; m < kernel_h; ++m) {
            rows[m] += stride_w;
          }
          img_out++;
          left--;
        }
      }
    }
    input_data += input_batch_stride;
    out_data += output_batch_stride;
  }
}
template
<
typename
T
,
int
CeilMode
,
int
PoolType
,
int
Kernel
,
int
Pad
,
int
Stride
>
int
TestPoolOp
(
int
in_channels
,
int
in_height
,
int
in_width
)
{
int
kernel_h
=
Kernel
;
int
kernel_w
=
Kernel
;
int
pad_h
=
Pad
;
int
pad_w
=
Pad
;
int
stride_h
=
Stride
;
int
stride_w
=
Stride
;
bool
ceil_mode
=
CeilMode
!=
0
;
std
::
string
pooling_type
=
(
PoolType
==
0
?
"max"
:
"avg"
);
int
batch_size
=
1
;
int
input_c
=
in_channels
;
int
input_h
=
in_height
;
int
input_w
=
in_width
;
framework
::
DDim
input_shape
=
framework
::
make_ddim
({
batch_size
,
input_c
,
input_h
,
input_w
});
std
::
vector
<
int64_t
>
output_shape_v
({
batch_size
,
input_c
});
output_shape_v
.
push_back
(
PoolOutputSize
(
input_h
,
kernel_h
,
pad_h
,
stride_h
,
ceil_mode
));
output_shape_v
.
push_back
(
PoolOutputSize
(
input_w
,
kernel_w
,
pad_w
,
stride_w
,
ceil_mode
));
framework
::
DDim
output_shape
=
framework
::
make_ddim
(
output_shape_v
);
Executor4Test
<
paddle_mobile
::
CPU
,
paddle_mobile
::
operators
::
PoolOp
<
paddle_mobile
::
CPU
,
float
>>
executor
(
program
,
"pool2d"
);
VariableNameMap
inputs
;
VariableNameMap
outputs
;
auto
scope
=
std
::
make_shared
<
framework
::
Scope
>
();
inputs
[
"X"
]
=
std
::
vector
<
std
::
string
>
({
"input"
});
outputs
[
"Out"
]
=
std
::
vector
<
std
::
string
>
({
"output"
});
paddle_mobile
::
framework
::
Tensor
input
;
SetupTensor
<
float
>
(
&
input
,
{
1
,
64
,
112
,
112
},
static_cast
<
float
>
(
0
),
static_cast
<
float
>
(
1
));
auto
out_ddim
=
paddle_mobile
::
framework
::
make_ddim
({
1
,
64
,
56
,
56
});
auto
output
=
executor
.
Predict
(
input
,
"conv2d_0.tmp_1"
,
"pool2d_0.tmp_0"
,
out_ddim
);
auto
input_var
=
scope
.
get
()
->
Var
(
"input"
);
auto
input
=
input_var
->
template
GetMutable
<
framework
::
LoDTensor
>();
SetupTensor
<
T
>
(
input
,
input_shape
,
-
127
,
127
);
float
*
output_ptr
=
output
->
data
<
float
>
();
for
(
int
j
=
0
;
j
<
output
->
numel
();
++
j
)
{
DLOG
<<
" value of output: "
<<
output_ptr
[
j
];
auto
output_var
=
scope
.
get
()
->
Var
(
"output"
);
framework
::
AttributeMap
attrs
;
attrs
[
"pooling_type"
].
SetString
(
pooling_type
);
attrs
[
"ksize"
].
Set
<
vector
<
int
>>
(
std
::
vector
<
int
>
({
kernel_h
,
kernel_w
}));
attrs
[
"strides"
].
Set
<
vector
<
int
>>
(
std
::
vector
<
int
>
({
stride_h
,
stride_w
}));
attrs
[
"paddings"
].
Set
<
vector
<
int
>>
(
std
::
vector
<
int
>
({
pad_h
,
pad_w
}));
attrs
[
"ceil_mode"
].
Set
<
bool
>
(
false
);
attrs
[
"global_pooling"
].
Set
<
bool
>
(
false
);
auto
*
op
=
new
operators
::
PoolOp
<
CPU
,
float
>
(
"pool2d"
,
inputs
,
outputs
,
attrs
,
scope
);
op
->
InferShape
();
op
->
Init
();
op
->
Run
();
framework
::
Tensor
output_cmp
;
output_cmp
.
mutable_data
<
T
>
(
output_shape
);
if
(
pooling_type
==
"avg"
&&
pad_h
==
0
&&
pad_h
==
pad_w
)
{
PoolAvgPad0
<
T
>
(
std
::
vector
<
int
>
{
kernel_h
,
kernel_w
},
std
::
vector
<
int
>
{
stride_h
,
stride_w
},
input
,
&
output_cmp
);
}
else
{
if
(
typeid
(
T
)
==
typeid
(
int8_t
))
{
operators
::
PoolBasic
<
int8_t
,
int32_t
>
(
pooling_type
,
std
::
vector
<
int
>
{
kernel_h
,
kernel_w
},
std
::
vector
<
int
>
{
stride_h
,
stride_w
},
std
::
vector
<
int
>
{
pad_h
,
pad_w
},
input
,
&
output_cmp
);
}
else
{
operators
::
PoolBasic
<
float
,
float
>
(
pooling_type
,
std
::
vector
<
int
>
{
kernel_h
,
kernel_w
},
std
::
vector
<
int
>
{
stride_h
,
stride_w
},
std
::
vector
<
int
>
{
pad_h
,
pad_w
},
input
,
&
output_cmp
);
}
}
// compare results
int
eq
=
0
;
int
neq
=
0
;
auto
output
=
output_var
->
template
Get
<
framework
::
LoDTensor
>();
const
T
*
output_data
=
output
->
data
<
T
>
();
T
*
output_cmp_data
=
output_cmp
.
data
<
T
>
();
for
(
int
i
=
0
;
i
<
output
->
numel
();
++
i
)
{
PADDLE_MOBILE_ENFORCE
(
output_data
[
i
]
==
output_cmp_data
[
i
],
"The execution of test_pool_op is failed!"
);
if
(
output_data
[
i
]
==
output_cmp_data
[
i
])
{
++
eq
;
}
else
{
++
neq
;
}
}
std
::
cout
<<
"eq = "
<<
eq
<<
", neq = "
<<
neq
<<
std
::
endl
;
delete
op
;
return
0
;
}
}
// namespace paddle_mobile
int
main
(
int
argc
,
char
*
argv
[])
{
if
(
argc
<
4
)
{
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"Usage:
\n
"
<<
" ./test-pool-op in_channels in_height in_width
\n
"
<<
" params:
\n
"
<<
" -in_channels: int, input image's channels
\n
"
<<
" -in_height: int, input image's height
\n
"
<<
" -in_width: int, input image's width
\n
"
;
return
1
;
}
int
in_channels
=
atoi
(
argv
[
1
]);
int
in_height
=
atoi
(
argv
[
2
]);
int
in_width
=
atoi
(
argv
[
3
]);
// kernel = 3, pad = 1, stride = 1
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"float, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1"
;
paddle_mobile
::
TestPoolOp
<
float
,
0
,
0
,
3
,
1
,
1
>
(
in_channels
,
in_height
,
in_width
);
// kernel = 3, pad = 0, stride = 2
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"float, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2"
;
paddle_mobile
::
TestPoolOp
<
float
,
0
,
0
,
3
,
0
,
2
>
(
in_channels
,
in_height
,
in_width
);
// kernel = 3, pad = 0, stride = 1
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=1"
;
paddle_mobile
::
TestPoolOp
<
int8_t
,
0
,
0
,
3
,
0
,
1
>
(
in_channels
,
in_height
,
in_width
);
// kernel = 3, pad = 1, stride = 1
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1"
;
paddle_mobile
::
TestPoolOp
<
int8_t
,
0
,
0
,
3
,
1
,
1
>
(
in_channels
,
in_height
,
in_width
);
// kernel = 3, pad = 2, stride = 1
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=1"
;
paddle_mobile
::
TestPoolOp
<
int8_t
,
0
,
0
,
3
,
2
,
1
>
(
in_channels
,
in_height
,
in_width
);
// kernel = 3, pad = 0, stride = 2
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2"
;
paddle_mobile
::
TestPoolOp
<
int8_t
,
0
,
0
,
3
,
0
,
2
>
(
in_channels
,
in_height
,
in_width
);
// kernel = 3, pad = 1, stride = 2
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=2"
;
paddle_mobile
::
TestPoolOp
<
int8_t
,
0
,
0
,
3
,
1
,
2
>
(
in_channels
,
in_height
,
in_width
);
// kernel = 3, pad = 0, stride = 2
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=2"
;
paddle_mobile
::
TestPoolOp
<
int8_t
,
0
,
0
,
3
,
2
,
2
>
(
in_channels
,
in_height
,
in_width
);
// kernel = 3, pad = 3, stride = 3
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=3, stride=3"
;
paddle_mobile
::
TestPoolOp
<
int8_t
,
0
,
0
,
3
,
3
,
3
>
(
in_channels
,
in_height
,
in_width
);
// kernel = 7, pad = 0, stride = 1
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1"
;
paddle_mobile
::
TestPoolOp
<
int8_t
,
0
,
1
,
7
,
0
,
1
>
(
in_channels
,
in_height
,
in_width
);
// kernel = 7, pad = 0, stride = 2
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=2"
;
paddle_mobile
::
TestPoolOp
<
int8_t
,
0
,
1
,
7
,
0
,
2
>
(
in_channels
,
in_height
,
in_width
);
// kernel = 7, pad = 0, stride = 3
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=3"
;
paddle_mobile
::
TestPoolOp
<
int8_t
,
0
,
1
,
7
,
0
,
3
>
(
in_channels
,
in_height
,
in_width
);
// kernel = 3, pad = 0, stride = 1
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=1"
;
paddle_mobile
::
TestPoolOp
<
int8_t
,
0
,
1
,
3
,
0
,
1
>
(
in_channels
,
in_height
,
in_width
);
// kernel = 3, pad = 0, stride = 3
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=3"
;
paddle_mobile
::
TestPoolOp
<
int8_t
,
0
,
1
,
3
,
0
,
3
>
(
in_channels
,
in_height
,
in_width
);
// kernel = 7, pad = 0, stride = 1
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1"
;
paddle_mobile
::
TestPoolOp
<
float
,
0
,
1
,
7
,
0
,
1
>
(
in_channels
,
in_height
,
in_width
);
// kernel = 7, pad = 0, stride = 4
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=4"
;
paddle_mobile
::
TestPoolOp
<
float
,
0
,
1
,
7
,
0
,
4
>
(
in_channels
,
in_height
,
in_width
);
// kernel = 5, pad = 0, stride = 1
LOG
(
paddle_mobile
::
kLOG_INFO
)
<<
"float, ceil_mode=false, pooling_type=avg, kernel=5, pad=0, stride=1"
;
paddle_mobile
::
TestPoolOp
<
float
,
0
,
1
,
5
,
0
,
1
>
(
in_channels
,
in_height
,
in_width
);
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录