Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
3b1e90bc
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
332
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
3b1e90bc
编写于
9月 03, 2018
作者:
Z
zhangyang0701
提交者:
GitHub
9月 03, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into develop
上级
7bc2c9b5
0a38f733
变更
4
隐藏空白更改
内联
并排
Showing
4 changed files
with
199 additions
and
112 deletions
+199
-112
src/operators/kernel/central-arm-func/pool_arm_func.h
src/operators/kernel/central-arm-func/pool_arm_func.h
+5
-3
src/operators/math/pool_2x2.cpp
src/operators/math/pool_2x2.cpp
+186
-99
src/operators/math/pool_2x2.h
src/operators/math/pool_2x2.h
+4
-4
src/operators/math/pool_3x3.cpp
src/operators/math/pool_3x3.cpp
+4
-6
未找到文件。
src/operators/kernel/central-arm-func/pool_arm_func.h
浏览文件 @
3b1e90bc
...
...
@@ -76,15 +76,17 @@ void PoolCompute(const PoolParam<CPU> &param) {
}
}
}
else
if
(
ksize
[
0
]
==
2
&&
ksize
[
0
]
==
ksize
[
1
])
{
}
else
if
(
ksize
[
0
]
==
2
&&
ksize
[
0
]
==
ksize
[
1
]
&&
strides
[
0
]
==
2
&&
strides
[
0
]
==
strides
[
1
]
&&
paddings
[
0
]
==
paddings
[
1
]
&&
paddings
[
1
]
==
0
)
{
#if __ARM_NEON
#if __aarch64__
PoolBasic
(
pooling_type
,
ksize
,
strides
,
paddings
,
in_x
,
out
);
#else
if
(
pooling_type
==
"max"
)
{
math
::
Pool2x2Max
(
strides
,
paddings
,
in_x
,
out
);
math
::
Pool2x2Max
s2p0
(
strides
,
paddings
,
in_x
,
out
);
}
else
if
(
pooling_type
==
"avg"
)
{
math
::
Pool2x2Avg
(
strides
,
paddings
,
in_x
,
out
);
math
::
Pool2x2Avg
s2p0
(
strides
,
paddings
,
in_x
,
out
);
}
#endif
#else
...
...
src/operators/math/pool_2x2.cpp
浏览文件 @
3b1e90bc
...
...
@@ -20,21 +20,15 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
namespace
math
{
#define FLT_MAX __FLT_MAX__
void
Pool2x2Max
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
input
,
Tensor
*
output
)
{
#if __ARM_NEON
#if __aarch64__
#else
void
Pool2x2Maxs2p0
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
input
,
Tensor
*
output
)
{
const
int
batch_size
=
input
->
dims
()[
0
];
const
int
input_height
=
input
->
dims
()[
2
];
const
int
input_width
=
input
->
dims
()[
3
];
const
int
output_channels
=
output
->
dims
()[
1
];
int
output_height
=
output
->
dims
()[
2
];
const
int
output_width
=
output
->
dims
()[
3
];
const
int
ksize_height
=
2
;
...
...
@@ -47,72 +41,110 @@ void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
const
int
input_channel_stride
=
input_height
*
input_width
;
const
int
output_channel_stride
=
output_height
*
output_width
;
const
int
input_batch_stride
=
output_channels
*
input_channel_stride
;
const
int
output_batch_stride
=
output_channels
*
output_channel_stride
;
const
float
*
input_data
=
input
->
data
<
float
>
();
float
*
output_data
=
output
->
mutable_data
<
float
>
();
int
out_w_num
=
output_width
>>
2
;
const
int
in_h_num
=
output_height
>>
1
;
const
int
input_batch_stride
=
output_channels
*
input_channel_stride
;
const
int
output_batch_stride
=
output_channels
*
output_channel_stride
;
int
remain
=
output_width
-
out_w_num
<<
2
;
int
w1
=
input_width
/
16
;
int
_w1
=
input_width
%
16
;
int
w2
=
_w1
/
4
;
int
_w2
=
_w1
%
4
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
int
c
=
0
;
c
<
output_channels
;
++
c
)
{
const
float
*
input_data_chanel_row_next
=
input_data
+
input_width
;
for
(;
output_height
>
0
;
output_height
--
)
{
if
(
out_w_num
>
0
)
{
asm
volatile
(
"max_loop:
\n\t
"
"vld1.f32 {q0,q1}, [%[in_ptr1]]!
\n\t
"
"vld1.f32 {q2,q3}, [%[in_ptr2]]!
\n\t
"
"vmax.f32 q0, q0, q2
\n\t
"
"vmax.f32 q1, q1, q3
\n\t
"
"vpmax.f32 d4, d0, d1
\n\t
"
"vpmax.f32 d5, d2, d3
\n\t
"
"subs %[out_w_num], #1
\n\t
"
"vst1.32 {q2}, [%[out_ptr]]!
\n\t
"
"bne max_loop
\n\t
"
:
[
in_ptr1
]
"+r"
(
input_data
),
[
in_ptr2
]
"+r"
(
input_data_chanel_row_next
),
[
out_ptr
]
"+r"
(
output_data
),
[
out_w_num
]
"+r"
(
out_w_num
)
:
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
);
for
(
int
ph
=
0
;
ph
<
input_height
;
ph
+=
2
)
{
const
float
*
in_ptr1
=
input_data
+
i
*
input_batch_stride
+
c
*
input_channel_stride
+
ph
*
input_width
;
const
float
*
in_ptr2
=
in_ptr1
+
input_width
;
if
(
ph
+
1
>=
input_height
)
{
in_ptr2
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
input_width
));
memset
(
static_cast
<
void
*>
(
const_cast
<
float
*>
(
in_ptr2
)),
-
FLT_MAX
,
sizeof
(
float
)
*
input_width
);
}
float
*
out_ptr
=
output_data
+
i
*
output_batch_stride
+
c
*
output_channel_stride
+
ph
/
2
*
output_width
;
asm
volatile
(
"subs %[w1], %[w1], #1
\n\t
"
"blt end_w1_%=
\n\t
"
"loop_w1_%=:
\n\t
"
"pld [%[in_ptr1], #64]
\n\t
"
"pld [%[in_ptr2], #64]
\n\t
"
"vld1.f32 {q0, q1}, [%[in_ptr1]]!
\n\t
"
"vld1.f32 {q2, q3}, [%[in_ptr2]]!
\n\t
"
"vld1.f32 {q6, q7}, [%[in_ptr1]]!
\n\t
"
"vld1.f32 {q8, q9}, [%[in_ptr2]]!
\n\t
"
for
(;
remain
>
0
;
remain
--
)
{
float
max_row1
=
std
::
max
(
input_data
[
0
],
input_data
[
1
]);
float
max_row2
=
std
::
max
(
input_data_chanel_row_next
[
0
],
input_data_chanel_row_next
[
1
]);
*
output_data
=
std
::
max
(
max_row1
,
max_row2
);
input_data
+=
2
;
input_data_chanel_row_next
+=
2
;
output_data
++
;
"vmax.f32 q0, q0, q2
\n\t
"
"vmax.f32 q1, q1, q3
\n\t
"
"vmax.f32 q6, q6, q8
\n\t
"
"vmax.f32 q7, q7, q9
\n\t
"
"vpmax.f32 d8, d0, d1
\n\t
"
"vpmax.f32 d9, d2, d3
\n\t
"
"vpmax.f32 d10, d12, d13
\n\t
"
"vpmax.f32 d11, d14, d15
\n\t
"
"vst1.32 {q4, q5}, [%[out_ptr]]!
\n\t
"
"subs %[w1], %[w1], #1
\n\t
"
"bge loop_w1_%=
\n\t
"
"end_w1_%=:
\n\t
"
"subs %[w2], %[w2], #1
\n\t
"
"blt end_w2_%=
\n\t
"
"loop_w2_%=:
\n\t
"
"vld1.f32 {q0}, [%[in_ptr1]]!
\n\t
"
"vld1.f32 {q1}, [%[in_ptr2]]!
\n\t
"
"vmax.f32 q0, q0, q1
\n\t
"
"vpmax.f32 d4, d0, d1
\n\t
"
"vst1.32 {d4}, [%[out_ptr]]!
\n\t
"
"subs %[w2], %[w2], #1
\n\t
"
"bge loop_w2_%=
\n\t
"
"end_w2_%=:
\n\t
"
:
:
[
w1
]
"r"
(
w1
),
[
w2
]
"r"
(
w2
),
[
in_ptr1
]
"r"
(
in_ptr1
),
[
in_ptr2
]
"r"
(
in_ptr2
),
[
out_ptr
]
"r"
(
out_ptr
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
);
if
(
_w2
!=
0
)
{
in_ptr1
+=
16
*
w1
+
4
*
w2
;
in_ptr2
+=
16
*
w1
+
4
*
w2
;
out_ptr
+=
8
*
w1
+
2
*
w2
;
if
(
_w2
==
1
)
{
*
out_ptr
=
(
*
in_ptr1
>
*
in_ptr2
)
?
*
in_ptr1
:
*
in_ptr2
;
}
else
if
(
_w2
==
2
)
{
float
temp
=
(
*
in_ptr1
++
>
*
in_ptr2
++
)
?
*
in_ptr1
++
:
*
in_ptr2
++
;
float
temp1
=
(
*
in_ptr1
>
*
in_ptr2
)
?
*
in_ptr1
:
*
in_ptr2
;
*
out_ptr
=
(
temp
>
temp1
)
?
temp
:
temp1
;
}
else
if
(
_w2
==
3
)
{
float
temp
=
(
*
in_ptr1
++
>
*
in_ptr2
++
)
?
*
in_ptr1
++
:
*
in_ptr2
++
;
float
temp1
=
(
*
in_ptr1
++
>
*
in_ptr2
++
)
?
*
in_ptr1
++
:
*
in_ptr2
++
;
*
out_ptr
++
=
(
temp
>
temp1
)
?
temp
:
temp1
;
*
out_ptr
=
(
*
in_ptr1
>
*
in_ptr2
)
?
*
in_ptr1
:
*
in_ptr2
;
}
}
}
input_data
+=
input_channel_stride
;
output_data
+=
output_channel_stride
;
}
input_data
+=
input_batch_stride
;
output_data
+=
output_batch_stride
;
}
#endif
#else
#endif
}
void
Pool2x2Avg
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
input
,
Tensor
*
output
)
{
#if __ARM_NEON
#if __aarch64__
#else
void
Pool2x2Avgs2p0
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
input
,
Tensor
*
output
)
{
const
int
batch_size
=
input
->
dims
()[
0
];
const
int
input_height
=
input
->
dims
()[
2
];
const
int
input_width
=
input
->
dims
()[
3
];
const
int
output_channels
=
output
->
dims
()[
1
];
int
output_height
=
output
->
dims
()[
2
];
const
int
output_width
=
output
->
dims
()[
3
];
const
int
ksize_height
=
2
;
...
...
@@ -125,59 +157,114 @@ void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
const
int
input_channel_stride
=
input_height
*
input_width
;
const
int
output_channel_stride
=
output_height
*
output_width
;
const
int
input_batch_stride
=
output_channels
*
input_channel_stride
;
const
int
output_batch_stride
=
output_channels
*
output_channel_stride
;
const
float
*
input_data
=
input
->
data
<
float
>
();
float
*
output_data
=
output
->
mutable_data
<
float
>
();
int
out_w_num
=
output_width
>>
2
;
const
int
input_batch_stride
=
output_channels
*
input_channel_stride
;
const
int
output_batch_stride
=
output_channels
*
output_channel_stride
;
float
vqua
[]
=
{
0.25
f
,
0.25
f
,
0.25
f
,
0.25
f
};
int
remain
=
output_width
-
out_w_num
<<
2
;
int
w1
=
input_width
/
16
;
int
_w1
=
input_width
%
16
;
int
w2
=
_w1
/
4
;
int
_w2
=
_w1
%
4
;
float
quarter
=
1
/
4
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
int
c
=
0
;
c
<
output_channels
;
++
c
)
{
const
float
*
input_data_chanel_row_next
=
input_data
+
input_width
;
for
(;
output_height
>
0
;
output_height
--
)
{
if
(
out_w_num
>
0
)
{
asm
volatile
(
"avg_loop:
\n\t
"
"vld1.32 {q0,q1}, [%[in_ptr1]]!
\n\t
"
"vld1.32 {q2,q3}, [%[in_ptr2]]!
\n\t
"
"vadd.f32 q0, q0, q2
\n\t
"
"vadd.f32 q1, q1, q3
\n\t
"
"vpadd.f32 d4, d0, d1
\n\t
"
"vpadd.f32 d5, d2, d3
\n\t
"
"vld1.32 {q4}, [%[vqua]]!
\n\t
"
"vmul.f32 q2, q2, q4
\n\t
"
"subs %[out_w_num], #1
\n\t
"
"vst1.32 {q2}, [%[out_ptr]]!
\n\t
"
"bne avg_loop
\n\t
"
:
[
in_ptr1
]
"+r"
(
input_data
),
[
in_ptr2
]
"+r"
(
input_data_chanel_row_next
),
[
out_ptr
]
"+r"
(
output_data
),
[
out_w_num
]
"+r"
(
out_w_num
)
:
[
vqua
]
"r"
(
vqua
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
);
for
(
int
ph
=
0
;
ph
<
input_height
;
ph
+=
2
)
{
const
float
*
in_ptr1
=
input_data
+
i
*
input_batch_stride
+
c
*
input_channel_stride
+
ph
*
input_width
;
const
float
*
in_ptr2
=
in_ptr1
+
input_width
;
if
(
ph
+
1
>=
input_height
)
{
in_ptr2
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
input_width
));
memset
(
static_cast
<
void
*>
(
const_cast
<
float
*>
(
in_ptr2
)),
0
,
sizeof
(
float
)
*
input_width
);
}
float
*
out_ptr
=
output_data
+
i
*
output_batch_stride
+
c
*
output_channel_stride
+
ph
/
2
*
output_width
;
asm
volatile
(
"subs %[w1], %[w1], #1
\n\t
"
"blt end_w1_%=
\n\t
"
"loop_w1_%=:
\n\t
"
"pld [%[in_ptr1], #64]
\n\t
"
"pld [%[in_ptr2], #64]
\n\t
"
"vmov.f32 d0[0], %[quarter]
\n\t
"
"vld1.f32 {q1, q2}, [%[in_ptr1]]!
\n\t
"
"vld1.f32 {q3, q4}, [%[in_ptr2]]!
\n\t
"
"vld1.f32 {q7, q8}, [%[in_ptr1]]!
\n\t
"
"vld1.f32 {q9, q10}, [%[in_ptr2]]!
\n\t
"
"vadd.f32 q1, q1, q3
\n\t
"
"vadd.f32 q2, q2, q4
\n\t
"
for
(;
remain
>
0
;
remain
--
)
{
float
max_row1
=
std
::
max
(
input_data
[
0
],
input_data
[
1
]);
float
max_row2
=
std
::
max
(
input_data_chanel_row_next
[
0
],
input_data_chanel_row_next
[
1
]);
*
output_data
=
std
::
max
(
max_row1
,
max_row2
);
input_data
+=
2
;
input_data_chanel_row_next
+=
2
;
output_data
++
;
"vadd.f32 q7, q7, q9
\n\t
"
"vadd.f32 q8, q8, q10
\n\t
"
"vpadd.f32 d10, d2, d3
\n\t
"
"vpadd.f32 d11, d4, d5
\n\t
"
"vpadd.f32 d12, d14, d15
\n\t
"
"vpadd.f32 d13, d16, d17
\n\t
"
"vmul.f32 q5, q5, d0[0]
\n\t
"
"vmul.f32 q6, q6, d0[0]
\n\t
"
"vst1.32 {q5, q6}, [%[out_ptr]]!
\n\t
"
"subs %[w1], %[w1], #1
\n\t
"
"bge loop_w1_%=
\n\t
"
"end_w1_%=:
\n\t
"
"subs %[w2], %[w2], #1
\n\t
"
"blt end_w2_%=
\n\t
"
"loop_w2_%=:
\n\t
"
"vld1.f32 {q1}, [%[in_ptr1]]!
\n\t
"
"vld1.f32 {q2}, [%[in_ptr2]]!
\n\t
"
"vadd.f32 q1, q1, q2
\n\t
"
"vpadd.f32 d4, d2, d3
\n\t
"
"vmul.f32 d4, d4, d0[0]
\n\t
"
"vst1.32 {d4}, [%[out_ptr]]!
\n\t
"
"subs %[w2], %[w2], #1
\n\t
"
"bge loop_w2_%=
\n\t
"
"end_w2_%=:
\n\t
"
:
:
[
w1
]
"r"
(
w1
),
[
w2
]
"r"
(
w2
),
[
in_ptr1
]
"r"
(
in_ptr1
),
[
in_ptr2
]
"r"
(
in_ptr2
),
[
out_ptr
]
"r"
(
out_ptr
),
[
quarter
]
"r"
(
quarter
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
);
if
(
_w2
!=
0
)
{
in_ptr1
+=
16
*
w1
+
4
*
w2
;
in_ptr2
+=
16
*
w1
+
4
*
w2
;
out_ptr
+=
8
*
w1
+
2
*
w2
;
if
(
_w2
==
1
)
{
*
out_ptr
=
0.5
*
(
*
in_ptr1
+
*
in_ptr2
);
}
else
if
(
_w2
==
2
)
{
float
temp
=
0
;
temp
+=
*
in_ptr1
++
;
temp
+=
*
in_ptr2
++
;
temp
+=
*
in_ptr1
;
temp
+=
*
in_ptr2
;
*
out_ptr
=
0.5
*
temp
;
}
else
if
(
_w2
==
3
)
{
float
temp
=
0
;
temp
+=
*
in_ptr1
++
;
temp
+=
*
in_ptr2
++
;
temp
+=
*
in_ptr1
++
;
temp
+=
*
in_ptr2
++
;
*
out_ptr
++
=
0.5
*
temp
;
*
out_ptr
=
0.5
*
(
*
in_ptr1
+
*
in_ptr2
);
}
}
}
input_data
+=
input_channel_stride
;
output_data
+=
output_channel_stride
;
}
input_data
+=
input_batch_stride
;
output_data
+=
output_batch_stride
;
}
#endif
#else
#endif
}
//}
...
...
src/operators/math/pool_2x2.h
浏览文件 @
3b1e90bc
...
...
@@ -26,11 +26,11 @@ namespace math {
using
framework
::
Tensor
;
using
std
::
vector
;
void
Pool2x2Max
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
input
,
Tensor
*
output
);
void
Pool2x2Max
s2p0
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
input
,
Tensor
*
output
);
void
Pool2x2Avg
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
in_x
,
Tensor
*
out
);
void
Pool2x2Avg
s2p0
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
in_x
,
Tensor
*
out
);
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/math/pool_3x3.cpp
浏览文件 @
3b1e90bc
...
...
@@ -558,15 +558,13 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
const
float
*
input_seg
=
input_data
+
c
*
input_channel_stride
;
float
*
output_seg
=
output_data
+
c
*
output_channel_stride
;
for
(
int
ph
=
0
;
ph
<
output_height
;
ph
++
)
{
int
hstart
=
ph
*
stride
-
padding
;
int
hend
=
min
(
hstart
+
3
,
input_height
);
hstart
=
max
(
hstart
,
0
);
for
(
int
pw
=
0
;
pw
<
output_width
;
pw
++
)
{
int
hstart
=
ph
*
stride
-
padding
;
int
wstart
=
pw
*
stride
-
padding
;
int
hend
=
min
(
hstart
+
3
,
input_height
+
padding
);
int
wend
=
min
(
wstart
+
3
,
input_width
+
padding
);
hstart
=
max
(
hstart
,
0
);
int
wend
=
min
(
wstart
+
3
,
input_width
);
wstart
=
max
(
wstart
,
0
);
hend
=
min
(
hend
,
input_height
);
wend
=
min
(
wend
,
input_width
);
const
float
*
pos1
=
input_seg
+
hstart
*
input_width
+
wstart
;
const
float
*
pos2
=
input_seg
+
(
hstart
+
1
)
*
input_width
+
wstart
;
const
float
*
pos3
=
input_seg
+
(
hstart
+
2
)
*
input_width
+
wstart
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录