Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
e310c6c8
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
332
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e310c6c8
编写于
6月 15, 2018
作者:
W
WangLiu
提交者:
GitHub
6月 15, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #425 from cocodark/develop
optimize pool kernel
上级
98c24151
e6f65a70
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
456 addition
and
40 deletion
+456
-40
src/operators/kernel/arm/conv_add_kernel.cpp
src/operators/kernel/arm/conv_add_kernel.cpp
+0
-2
src/operators/kernel/arm/conv_kernel.cpp
src/operators/kernel/arm/conv_kernel.cpp
+0
-5
src/operators/kernel/arm/pool_kernel.cpp
src/operators/kernel/arm/pool_kernel.cpp
+16
-15
src/operators/math/pool_2x2.cpp
src/operators/math/pool_2x2.cpp
+176
-0
src/operators/math/pool_2x2.h
src/operators/math/pool_2x2.h
+15
-9
src/operators/math/pool_3x3.cpp
src/operators/math/pool_3x3.cpp
+232
-0
src/operators/math/pool_3x3.h
src/operators/math/pool_3x3.h
+14
-6
src/operators/math/pooling.cpp
src/operators/math/pooling.cpp
+1
-3
src/operators/math/pooling.h
src/operators/math/pooling.h
+2
-0
未找到文件。
src/operators/kernel/arm/conv_add_kernel.cpp
浏览文件 @
e310c6c8
...
...
@@ -42,8 +42,6 @@ void expand_bias(Tensor &bias, int axis, const DDim &dDim) {
template
<
>
void
ConvAddKernel
<
CPU
,
float
>::
Compute
(
const
FushionConvAddParam
&
param
)
const
{
DLOG
<<
param
;
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
bias
=
*
param
.
Bias
();
...
...
src/operators/kernel/arm/conv_kernel.cpp
浏览文件 @
e310c6c8
...
...
@@ -21,8 +21,6 @@ namespace operators {
template
<
>
void
ConvKernel
<
CPU
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{
LOG
(
kLOG_DEBUG
)
<<
param
;
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
*
output
=
param
.
Output
();
...
...
@@ -32,8 +30,6 @@ void ConvKernel<CPU, float>::Compute(const ConvParam ¶m) const {
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
// DLOG << " compute end get Attrs " << strides[0];
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
...
...
@@ -66,7 +62,6 @@ void ConvKernel<CPU, float>::Compute(const ConvParam ¶m) const {
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
DLOG
<<
" filter.dims() = "
<<
filter
.
dims
();
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
...
...
src/operators/kernel/arm/pool_kernel.cpp
浏览文件 @
e310c6c8
...
...
@@ -56,22 +56,23 @@ void PoolKernel<CPU, float>::Compute(const PoolParam ¶m) const {
paddings
[
i
]
=
0
;
ksize
[
i
]
=
static_cast
<
int
>
(
in_x
->
dims
()[
i
+
2
]);
}
}
}
else
if
(
ksize
[
0
]
==
3
&&
ksize
[
0
]
==
ksize
[
1
])
{
if
(
pooling_type
==
"max"
)
{
math
::
Pool3x3Max
(
strides
,
paddings
,
in_x
,
out
);
}
else
if
(
pooling_type
==
"avg"
)
{
math
::
Pool3x3Avg
(
strides
,
paddings
,
in_x
,
out
);
}
PoolBasic
(
pooling_type
,
ksize
,
strides
,
paddings
,
in_x
,
out
);
// if (param.isGlobalPooling() || ksize[0] != ksize[1] ||
// strides[0] != strides[1] || strides[1] != 2 ||
// paddings[0] != paddings[1] || paddings[1] > 1) {
// PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
//
// } else if (ksize[0] == 2) {
//
// } else if (ksize[0] == 3) {
//
// } else {
// PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
// }
}
else
if
(
ksize
[
0
]
==
2
&&
ksize
[
0
]
==
ksize
[
1
])
{
if
(
pooling_type
==
"max"
)
{
math
::
Pool2x2Max
(
strides
,
paddings
,
in_x
,
out
);
}
else
if
(
pooling_type
==
"avg"
)
{
math
::
Pool2x2Avg
(
strides
,
paddings
,
in_x
,
out
);
}
}
else
{
PoolBasic
(
pooling_type
,
ksize
,
strides
,
paddings
,
in_x
,
out
);
}
}
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/math/pool_2x2.cpp
0 → 100644
浏览文件 @
e310c6c8
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#include "pool_2x2.h"
namespace
paddle_mobile
{
namespace
operators
{
namespace
math
{
void
Pool2x2Max
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
input
,
Tensor
*
output
)
{
#if __ARM_NEON
const
int
batch_size
=
input
->
dims
()[
0
];
const
int
input_height
=
input
->
dims
()[
2
];
const
int
input_width
=
input
->
dims
()[
3
];
const
int
output_channels
=
output
->
dims
()[
1
];
int
output_height
=
output
->
dims
()[
2
];
const
int
output_width
=
output
->
dims
()[
3
];
const
int
ksize_height
=
2
;
const
int
ksize_width
=
2
;
const
int
stride_height
=
strides
[
0
];
const
int
stride_width
=
strides
[
1
];
const
int
padding_height
=
paddings
[
0
];
const
int
padding_width
=
paddings
[
1
];
const
int
input_channel_stride
=
input_height
*
input_width
;
const
int
output_channel_stride
=
output_height
*
output_width
;
const
float
*
input_data
=
input
->
data
<
float
>
();
float
*
output_data
=
output
->
mutable_data
<
float
>
();
int
out_w_num
=
output_width
>>
2
;
const
int
in_h_num
=
output_height
>>
1
;
const
int
input_batch_stride
=
output_channels
*
input_channel_stride
;
const
int
output_batch_stride
=
output_channels
*
output_channel_stride
;
int
remain
=
output_width
-
out_w_num
<<
2
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
int
c
=
0
;
c
<
output_channels
;
++
c
)
{
const
float
*
input_data_chanel_row_next
=
input_data
+
input_width
;
for
(;
output_height
>
0
;
output_height
--
)
{
if
(
out_w_num
>
0
)
{
asm
volatile
(
"max_loop:
\n\t
"
"vld1.f32 {q0,q1}, [%[in_ptr1]]!
\n\t
"
"vld1.f32 {q2,q3}, [%[in_ptr2]]!
\n\t
"
"vmax.f32 q0, q0, q2
\n\t
"
"vmax.f32 q1, q1, q3
\n\t
"
"vpmax.f32 d4, d0, d1
\n\t
"
"vpmax.f32 d5, d2, d3
\n\t
"
"subs %[out_w_num], #1
\n\t
"
"vst1.32 {q2}, [%[out_ptr]]!
\n\t
"
"bne max_loop
\n\t
"
:
[
in_ptr1
]
"+r"
(
input_data
),
[
in_ptr2
]
"+r"
(
input_data_chanel_row_next
),
[
out_ptr
]
"+r"
(
output_data
),
[
out_w_num
]
"+r"
(
out_w_num
)
:
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
);
}
for
(;
remain
>
0
;
remain
--
)
{
float
max_row1
=
std
::
max
(
input_data
[
0
],
input_data
[
1
]);
float
max_row2
=
std
::
max
(
input_data_chanel_row_next
[
0
],
input_data_chanel_row_next
[
1
]);
*
output_data
=
std
::
max
(
max_row1
,
max_row2
);
input_data
+=
2
;
input_data_chanel_row_next
+=
2
;
output_data
++
;
}
}
input_data
+=
input_channel_stride
;
output_data
+=
output_channel_stride
;
}
input_data
+=
input_batch_stride
;
output_data
+=
output_batch_stride
;
}
#endif
}
void
Pool2x2Avg
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
input
,
Tensor
*
output
)
{
#if __ARM_NEON
const
int
batch_size
=
input
->
dims
()[
0
];
const
int
input_height
=
input
->
dims
()[
2
];
const
int
input_width
=
input
->
dims
()[
3
];
const
int
output_channels
=
output
->
dims
()[
1
];
int
output_height
=
output
->
dims
()[
2
];
const
int
output_width
=
output
->
dims
()[
3
];
const
int
ksize_height
=
2
;
const
int
ksize_width
=
2
;
const
int
stride_height
=
strides
[
0
];
const
int
stride_width
=
strides
[
1
];
const
int
padding_height
=
paddings
[
0
];
const
int
padding_width
=
paddings
[
1
];
const
int
input_channel_stride
=
input_height
*
input_width
;
const
int
output_channel_stride
=
output_height
*
output_width
;
const
float
*
input_data
=
input
->
data
<
float
>
();
float
*
output_data
=
output
->
mutable_data
<
float
>
();
int
out_w_num
=
output_width
>>
2
;
const
int
input_batch_stride
=
output_channels
*
input_channel_stride
;
const
int
output_batch_stride
=
output_channels
*
output_channel_stride
;
float
vqua
[]
=
{
0.25
f
,
0.25
f
,
0.25
f
,
0.25
f
};
int
remain
=
output_width
-
out_w_num
<<
2
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
int
c
=
0
;
c
<
output_channels
;
++
c
)
{
const
float
*
input_data_chanel_row_next
=
input_data
+
input_width
;
for
(;
output_height
>
0
;
output_height
--
)
{
if
(
out_w_num
>
0
)
{
asm
volatile
(
"avg_loop:
\n\t
"
"vld1.32 {q0,q1}, [%[in_ptr1]]!
\n\t
"
"vld1.32 {q2,q3}, [%[in_ptr2]]!
\n\t
"
"vadd.f32 q0, q0, q2
\n\t
"
"vadd.f32 q1, q1, q3
\n\t
"
"vpadd.f32 d4, d0, d1
\n\t
"
"vpadd.f32 d5, d2, d3
\n\t
"
"vld1.32 {q4}, [%[vqua]]!
\n\t
"
"vmul.f32 q2, q2, q4
\n\t
"
"subs %[out_w_num], #1
\n\t
"
"vst1.32 {q2}, [%[out_ptr]]!
\n\t
"
"bne avg_loop
\n\t
"
:
[
in_ptr1
]
"+r"
(
input_data
),
[
in_ptr2
]
"+r"
(
input_data_chanel_row_next
),
[
out_ptr
]
"+r"
(
output_data
),
[
out_w_num
]
"+r"
(
out_w_num
)
:
[
vqua
]
"r"
(
vqua
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
);
}
for
(;
remain
>
0
;
remain
--
)
{
float
max_row1
=
std
::
max
(
input_data
[
0
],
input_data
[
1
]);
float
max_row2
=
std
::
max
(
input_data_chanel_row_next
[
0
],
input_data_chanel_row_next
[
1
]);
*
output_data
=
std
::
max
(
max_row1
,
max_row2
);
input_data
+=
2
;
input_data_chanel_row_next
+=
2
;
output_data
++
;
}
}
input_data
+=
input_channel_stride
;
output_data
+=
output_channel_stride
;
}
input_data
+=
input_batch_stride
;
output_data
+=
output_batch_stride
;
}
#endif
}
//}
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/math/pool_2x2.h
浏览文件 @
e310c6c8
...
...
@@ -16,16 +16,22 @@ limitations under the License. */
#pragma once
#include "framework/tensor.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
static
void
Pool2x2Max
()
{
// todo impl with neon
}
static
void
Pool2x2Avg
()
{
// todo impl with neon
}
namespace
paddle_mobile
{
namespace
operators
{
namespace
math
{
using
framework
::
Tensor
;
using
std
::
vector
;
void
Pool2x2Max
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
input
,
Tensor
*
output
);
void
Pool2x2Avg
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
in_x
,
Tensor
*
out
);
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/math/pool_3x3.cpp
0 → 100644
浏览文件 @
e310c6c8
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#define __ARM_NEON true
#include "pool_3x3.h"
#include "framework/tensor.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
namespace
paddle_mobile
{
namespace
operators
{
namespace
math
{
using
framework
::
Tensor
;
using
std
::
max
;
using
std
::
min
;
using
std
::
vector
;
void
Pool3x3Max
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
input
,
Tensor
*
output
)
{
#if __ARM_NEON
const
int
batch_size
=
input
->
dims
()[
0
];
const
int
input_height
=
input
->
dims
()[
2
];
const
int
input_width
=
input
->
dims
()[
3
];
const
int
output_channels
=
output
->
dims
()[
1
];
const
int
output_height
=
output
->
dims
()[
2
];
const
int
output_width
=
output
->
dims
()[
3
];
const
int
_kernel_size
=
3
;
const
int
stride_height
=
strides
[
0
];
const
int
stride_width
=
strides
[
1
];
const
int
padding_height
=
paddings
[
0
];
const
int
padding_width
=
paddings
[
1
];
const
float
negative_max
=
-
INT_MAX
;
const
int
input_channel_stride
=
input_height
*
input_width
;
const
int
output_channel_stride
=
output_height
*
output_width
;
const
float
*
input_data
=
input
->
data
<
float
>
();
float
*
output_data
=
output
->
mutable_data
<
float
>
();
const
int
input_batch_stride
=
output_channels
*
input_channel_stride
;
const
int
output_batch_stride
=
output_channels
*
output_channel_stride
;
const
float
*
pos1
,
*
pos2
,
*
pos3
,
*
output_ptr
;
int
hstart
,
wstart
,
hend
,
wend
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
int
c
=
0
;
c
<
output_channels
;
++
c
)
{
for
(
int
ph
=
0
;
ph
<
output_height
;
ph
++
)
{
for
(
int
pw
=
0
;
pw
<
output_width
;
pw
++
)
{
hstart
=
ph
*
stride_height
-
padding_height
;
wstart
=
pw
*
stride_width
-
padding_width
;
hend
=
min
(
hstart
+
_kernel_size
,
input_height
+
padding_height
);
wend
=
min
(
wstart
+
_kernel_size
,
input_width
+
padding_width
);
hstart
=
max
(
hstart
,
0
);
wstart
=
max
(
wstart
,
0
);
hend
=
min
(
hend
,
input_height
);
wend
=
min
(
wend
,
input_width
);
pos1
=
input_data
+
hstart
*
input_width
+
wstart
;
pos2
=
input_data
+
(
hstart
+
1
)
*
input_width
+
wstart
;
pos3
=
input_data
+
(
hstart
+
2
)
*
input_width
+
wstart
;
output_ptr
=
output_data
+
ph
*
output_width
+
pw
;
if
(
hend
-
hstart
!=
3
||
wend
-
wstart
!=
3
)
{
float
max_value
=
-
INT_MAX
;
for
(
int
h
=
hstart
;
h
<
hend
;
h
++
)
{
for
(
int
w
=
wstart
;
w
<
wend
;
w
++
)
{
float
value
=
input_data
[
h
*
input_width
+
w
];
if
(
value
>
max_value
)
{
max_value
=
value
;
}
}
}
output_data
[
ph
*
output_width
+
pw
]
=
max_value
;
}
else
{
#if defined(ARMV7)
asm
volatile
(
"vld1.32 {q1}, [%[pos1]]
\n\t
"
"vld1.32 {q2}, [%[pos2]]
\n\t
"
"vld1.32 {q3}, [%[pos3]]
\n\t
"
"vmax.f32 q1, q1, q2
\n\t
"
"vmax.f32 q2, q1, q3
\n\t
"
"vmov.f32 d5[1], %[negative_max]
\n\t
"
"vpmax.f32 d6, d4, d5
\n\t
"
"vpmax.f32 d7, d6, d6
\n\t
"
"vst1.32 {d7[0]},[%[output_ptr]]
\n\t
"
:
:
[
input_data
]
"r"
(
input_data
),
[
pos1
]
"r"
(
pos1
),
[
pos2
]
"r"
(
pos2
),
[
pos3
]
"r"
(
pos3
),
[
output_ptr
]
"r"
(
output_ptr
),
[
negative_max
]
"r"
(
negative_max
)
:
"memory"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
);
#else
const
float32x4_t
data1
=
vld1q_f32
(
pos1
);
const
float32x4_t
data2
=
vld1q_f32
(
pos2
);
const
float32x4_t
data3
=
vld1q_f32
(
pos3
);
const
float32x4_t
max_data
=
vmaxq_f32
(
vmaxq_f32
(
data1
,
data3
),
data2
);
float32x2_t
res
=
vpmax_f32
(
vget_high_f32
(
vsetq_lane_f32
(
-
INT_MAX
,
max_data
,
3
)),
vget_low_f32
(
max_data
));
res
=
vpmax_f32
(
res
,
res
);
output_data
[
ph
*
output_width
+
pw
]
=
vget_lane_f32
(
res
,
0
);
#endif
}
}
}
input_data
+=
input_channel_stride
;
output_data
+=
output_channel_stride
;
}
input_data
+=
input_batch_stride
;
output_data
+=
output_batch_stride
;
}
#endif
}
void
Pool3x3Avg
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
input
,
Tensor
*
output
)
{
#if __ARM_NEON
const
int
batch_size
=
input
->
dims
()[
0
];
const
int
input_height
=
input
->
dims
()[
2
];
const
int
input_width
=
input
->
dims
()[
3
];
const
int
output_channels
=
output
->
dims
()[
1
];
const
int
output_height
=
output
->
dims
()[
2
];
const
int
output_width
=
output
->
dims
()[
3
];
const
int
_kernel_size
=
3
;
const
int
stride_height
=
strides
[
0
];
const
int
stride_width
=
strides
[
1
];
const
int
padding_height
=
paddings
[
0
];
const
int
padding_width
=
paddings
[
1
];
const
int
input_channel_stride
=
input_height
*
input_width
;
const
int
output_channel_stride
=
output_height
*
output_width
;
const
float
*
input_data
=
input
->
data
<
float
>
();
float
*
output_data
=
output
->
mutable_data
<
float
>
();
const
float
zero
=
0
;
const
float
nine
=
1.0
/
9.0
;
const
float
nine_ptr
[]
=
{
nine
,
nine
};
const
int
input_batch_stride
=
output_channels
*
input_channel_stride
;
const
int
output_batch_stride
=
output_channels
*
output_channel_stride
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
int
c
=
0
;
c
<
output_channels
;
++
c
)
{
for
(
int
ph
=
0
;
ph
<
output_height
;
ph
++
)
{
for
(
int
pw
=
0
;
pw
<
output_width
;
pw
++
)
{
int
hstart
=
ph
*
stride_height
-
padding_height
;
int
wstart
=
pw
*
stride_width
-
padding_width
;
int
hend
=
min
(
hstart
+
_kernel_size
,
input_height
+
padding_height
);
int
wend
=
min
(
wstart
+
_kernel_size
,
input_width
+
padding_width
);
hstart
=
max
(
hstart
,
0
);
wstart
=
max
(
wstart
,
0
);
hend
=
min
(
hend
,
input_height
);
wend
=
min
(
wend
,
input_width
);
const
float
*
pos1
=
input_data
+
hstart
*
input_width
+
wstart
;
const
float
*
pos2
=
input_data
+
(
hstart
+
1
)
*
input_width
+
wstart
;
const
float
*
pos3
=
input_data
+
(
hstart
+
2
)
*
input_width
+
wstart
;
const
float
*
output_ptr
=
output_data
+
ph
*
output_width
+
pw
;
if
(
hend
-
hstart
!=
3
||
wend
-
wstart
!=
3
)
{
float
sum
=
0
;
for
(
int
h
=
hstart
;
h
<
hend
;
h
++
)
{
for
(
int
w
=
wstart
;
w
<
wend
;
w
++
)
{
sum
+=
input_data
[
h
*
input_width
+
w
];
}
}
output_data
[
ph
*
output_width
+
pw
]
=
sum
/
9.0
;
}
else
{
#if defined(ARMV7)
asm
volatile
(
"vld1.32 {q1}, [%[pos1]]
\n\t
"
"vld1.32 {q2}, [%[pos2]]
\n\t
"
"vld1.32 {q3}, [%[pos3]]
\n\t
"
"vadd.f32 q1, q1, q2
\n\t
"
"vadd.f32 q2, q1, q3
\n\t
"
"vmov.f32 d5[1], %[zero]
\n\t
"
"vpadd.f32 d6, d4, d5
\n\t
"
"vpadd.f32 d6, d6, d6
\n\t
"
"vld1.f32 d7, [%[nine_ptr]]!
\n\t
"
"vmul.f32 d6,d7
\n\t
"
"vst1.32 {d6[0]},[%[output_ptr]]
\n\t
"
:
:
[
input_data
]
"r"
(
input_data
),
[
pos1
]
"r"
(
pos1
),
[
pos2
]
"r"
(
pos2
),
[
pos3
]
"r"
(
pos3
),
[
output_ptr
]
"r"
(
output_ptr
),
[
zero
]
"r"
(
zero
),
[
nine_ptr
]
"r"
(
nine_ptr
)
:
"memory"
,
"r6"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
);
#else
const
float32x4_t
data1
=
vld1q_f32
(
pos1
);
const
float32x4_t
data2
=
vld1q_f32
(
pos2
);
const
float32x4_t
data3
=
vld1q_f32
(
pos3
);
const
float32x4_t
sum_data
=
vaddq_f32
(
vaddq_f32
(
data1
,
data3
),
data2
);
float32x2_t
res
=
vpadd_f32
(
vget_high_f32
(
vsetq_lane_f32
(
0
,
sum_data
,
3
)),
vget_low_f32
(
sum_data
));
res
=
vpadd_f32
(
res
,
res
);
output_data
[
ph
*
output_width
+
pw
]
=
vget_lane_f32
(
res
,
0
)
/
9.0
;
#endif
}
}
}
input_data
+=
input_channel_stride
;
output_data
+=
output_channel_stride
;
}
input_data
+=
input_batch_stride
;
output_data
+=
output_batch_stride
;
}
#endif
}
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/math/pool3x3.h
→
src/operators/math/pool
_
3x3.h
浏览文件 @
e310c6c8
...
...
@@ -16,16 +16,24 @@ limitations under the License. */
#pragma once
#include "framework/tensor.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
static
void
Pool3x3Max
()
{
// todo impl with neon
}
namespace
paddle_mobile
{
namespace
operators
{
namespace
math
{
using
framework
::
Tensor
;
using
std
::
vector
;
static
void
Pool3x3Avg
()
{
// todo impl with neon
}
void
Pool3x3Max
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
input
,
Tensor
*
output
);
void
Pool3x3Avg
(
vector
<
int
>
strides
,
vector
<
int
>
paddings
,
const
Tensor
*
in_x
,
Tensor
*
out
);
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/math/pooling.cpp
浏览文件 @
e310c6c8
...
...
@@ -38,9 +38,7 @@ class PoolFunctor<CPU, PoolProcess, T> {
const
int
input_height
=
input
.
dims
()[
2
];
const
int
input_width
=
input
.
dims
()[
3
];
if
(
output
==
nullptr
)
{
DLOG
<<
"output tensor is null"
;
}
const
int
output_channels
=
output
->
dims
()[
1
];
const
int
output_height
=
output
->
dims
()[
2
];
...
...
src/operators/math/pooling.h
浏览文件 @
e310c6c8
...
...
@@ -18,6 +18,8 @@ limitations under the License. */
#include "common/log.h"
#include "framework/tensor.h"
#include "pool_2x2.h"
#include "pool_3x3.h"
namespace
paddle_mobile
{
namespace
operators
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录