提交 1e151706 编写于 作者: H hjchen2

Optimize pooling which efficiency has increased by 30% for googlenet, Fix pooling3x3 for stride 2

上级 e7201cf6
...@@ -110,7 +110,7 @@ inline void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) { ...@@ -110,7 +110,7 @@ inline void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
break; break;
case 3: case 3:
vst1_f32(output, vget_low_f32(r0)); vst1_f32(output, vget_low_f32(r0));
vst1_lane_f32(output, vget_high_f32(r0), 0); vst1q_lane_f32(output, r0, 2);
break; break;
} }
} }
......
...@@ -53,7 +53,7 @@ struct PoolingVal<AVG> { ...@@ -53,7 +53,7 @@ struct PoolingVal<AVG> {
++count; ++count;
return *this; return *this;
} }
inline float Value() { return (count > 0) ? val / count : 0.f; } inline float Value() { return (count > 0) ? val * (1.f / count) : 0.f; }
}; };
#if defined(__ARM_NEON) || defined(__ARM_NEON__) #if defined(__ARM_NEON) || defined(__ARM_NEON__)
...@@ -67,6 +67,16 @@ inline float32x4_t vPoolInitq_f32<AVG>() { ...@@ -67,6 +67,16 @@ inline float32x4_t vPoolInitq_f32<AVG>() {
return vdupq_n_f32(0.f); return vdupq_n_f32(0.f);
} }
template <PoolingType P = MAX>
inline float32x2_t vPoolInit_f32() {
return vdup_n_f32(-std::numeric_limits<float>::max());
}
template <>
inline float32x2_t vPoolInit_f32<AVG>() {
return vdup_n_f32(0.f);
}
template <PoolingType P = MAX> template <PoolingType P = MAX>
inline float32x4_t vPoolPreq_f32(const float32x4_t &x1, const float32x4_t &x2) { inline float32x4_t vPoolPreq_f32(const float32x4_t &x1, const float32x4_t &x2) {
return vmaxq_f32(x1, x2); return vmaxq_f32(x1, x2);
...@@ -78,6 +88,28 @@ inline float32x4_t vPoolPreq_f32<AVG>(const float32x4_t &x1, ...@@ -78,6 +88,28 @@ inline float32x4_t vPoolPreq_f32<AVG>(const float32x4_t &x1,
return vaddq_f32(x1, x2); return vaddq_f32(x1, x2);
} }
template <PoolingType P = MAX>
inline float32x2_t vPoolPre_f32(const float32x2_t &x1, const float32x2_t &x2) {
return vmax_f32(x1, x2);
}
template <>
inline float32x2_t vPoolPre_f32<AVG>(const float32x2_t &x1,
const float32x2_t &x2) {
return vadd_f32(x1, x2);
}
template <PoolingType P = MAX>
inline float32x2_t vpPoolPre_f32(const float32x2_t &x1, const float32x2_t &x2) {
return vpmax_f32(x1, x2);
}
template <>
inline float32x2_t vpPoolPre_f32<AVG>(const float32x2_t &x1,
const float32x2_t &x2) {
return vpadd_f32(x1, x2);
}
template <PoolingType P = MAX> template <PoolingType P = MAX>
inline float32x4_t vPoolPostq_f32(const float32x4_t &x, inline float32x4_t vPoolPostq_f32(const float32x4_t &x,
const float32x4_t &post) { const float32x4_t &post) {
...@@ -89,6 +121,18 @@ inline float32x4_t vPoolPostq_f32<AVG>(const float32x4_t &x, ...@@ -89,6 +121,18 @@ inline float32x4_t vPoolPostq_f32<AVG>(const float32x4_t &x,
const float32x4_t &post) { const float32x4_t &post) {
return vmulq_f32(x, post); return vmulq_f32(x, post);
} }
template <PoolingType P = MAX>
inline float32x2_t vPoolPost_f32(const float32x2_t &x,
const float32x2_t &post) {
return x;
}
template <>
inline float32x2_t vPoolPost_f32<AVG>(const float32x2_t &x,
const float32x2_t &post) {
return vmul_f32(x, post);
}
#endif // __ARM_NEON__ #endif // __ARM_NEON__
template <PoolingType P = MAX> template <PoolingType P = MAX>
......
...@@ -40,7 +40,7 @@ namespace math { ...@@ -40,7 +40,7 @@ namespace math {
template <PoolingType P, int Stride = 1> template <PoolingType P, int Stride = 1>
struct Pooling2x2NormalRowLoadInput { struct Pooling2x2NormalRowLoadInput {
inline void operator()(const float *input, float32x4_t *x0, float32x4_t *x1) { void operator()(const float *input, float32x4_t *x0, float32x4_t *x1) {
x0[0] = vld1q_f32(input); x0[0] = vld1q_f32(input);
x0[1] = vld1q_f32(input + 4); x0[1] = vld1q_f32(input + 4);
x1[0] = vextq_f32(x0[0], x0[1], 1); x1[0] = vextq_f32(x0[0], x0[1], 1);
...@@ -50,7 +50,7 @@ struct Pooling2x2NormalRowLoadInput { ...@@ -50,7 +50,7 @@ struct Pooling2x2NormalRowLoadInput {
template <PoolingType P> template <PoolingType P>
struct Pooling2x2NormalRowLoadInput<P, 2> { struct Pooling2x2NormalRowLoadInput<P, 2> {
inline void operator()(const float *input, float32x4_t *x0, float32x4_t *x1) { void operator()(const float *input, float32x4_t *x0, float32x4_t *x1) {
float32x4x2_t t0 = vld2q_f32(input); float32x4x2_t t0 = vld2q_f32(input);
float32x4x2_t t1 = vld2q_f32(input + 8); float32x4x2_t t1 = vld2q_f32(input + 8);
x0[0] = t0.val[0]; x0[0] = t0.val[0];
......
此差异已折叠。
...@@ -169,55 +169,55 @@ int main(int argc, char *argv[]) { ...@@ -169,55 +169,55 @@ int main(int argc, char *argv[]) {
<< "float, pooling_type=avg, kernel=3, pad=5, stride=2"; << "float, pooling_type=avg, kernel=3, pad=5, stride=2";
paddle_mobile::TestPoolOp<1, 3, 5, 2>(in_channels, in_height, in_width); paddle_mobile::TestPoolOp<1, 3, 5, 2>(in_channels, in_height, in_width);
LOG(paddle_mobile::kLOG_INFO) // LOG(paddle_mobile::kLOG_INFO)
<< "float, pooling_type=max, kernel=2, pad=0, stride=1"; // << "float, pooling_type=max, kernel=2, pad=0, stride=1";
paddle_mobile::TestPoolOp<0, 2, 0, 1>(in_channels, in_height, in_width); // paddle_mobile::TestPoolOp<0, 2, 0, 1>(in_channels, in_height, in_width);
LOG(paddle_mobile::kLOG_INFO) // LOG(paddle_mobile::kLOG_INFO)
<< "float, pooling_type=max, kernel=2, pad=1, stride=1"; // << "float, pooling_type=max, kernel=2, pad=1, stride=1";
paddle_mobile::TestPoolOp<0, 2, 1, 1>(in_channels, in_height, in_width); // paddle_mobile::TestPoolOp<0, 2, 1, 1>(in_channels, in_height, in_width);
LOG(paddle_mobile::kLOG_INFO) // LOG(paddle_mobile::kLOG_INFO)
<< "float, pooling_type=max, kernel=2, pad=2, stride=1"; // << "float, pooling_type=max, kernel=2, pad=2, stride=1";
paddle_mobile::TestPoolOp<0, 2, 2, 1>(in_channels, in_height, in_width); // paddle_mobile::TestPoolOp<0, 2, 2, 1>(in_channels, in_height, in_width);
LOG(paddle_mobile::kLOG_INFO) // LOG(paddle_mobile::kLOG_INFO)
<< "float, pooling_type=max, kernel=2, pad=5, stride=1"; // << "float, pooling_type=max, kernel=2, pad=5, stride=1";
paddle_mobile::TestPoolOp<0, 2, 5, 1>(in_channels, in_height, in_width); // paddle_mobile::TestPoolOp<0, 2, 5, 1>(in_channels, in_height, in_width);
//
LOG(paddle_mobile::kLOG_INFO) // LOG(paddle_mobile::kLOG_INFO)
<< "float, pooling_type=avg, kernel=2, pad=0, stride=1"; // << "float, pooling_type=avg, kernel=2, pad=0, stride=1";
paddle_mobile::TestPoolOp<1, 2, 0, 1>(in_channels, in_height, in_width); // paddle_mobile::TestPoolOp<1, 2, 0, 1>(in_channels, in_height, in_width);
LOG(paddle_mobile::kLOG_INFO) // LOG(paddle_mobile::kLOG_INFO)
<< "float, pooling_type=avg, kernel=2, pad=1, stride=1"; // << "float, pooling_type=avg, kernel=2, pad=1, stride=1";
paddle_mobile::TestPoolOp<1, 2, 1, 1>(in_channels, in_height, in_width); // paddle_mobile::TestPoolOp<1, 2, 1, 1>(in_channels, in_height, in_width);
LOG(paddle_mobile::kLOG_INFO) // LOG(paddle_mobile::kLOG_INFO)
<< "float, pooling_type=avg, kernel=2, pad=2, stride=1"; // << "float, pooling_type=avg, kernel=2, pad=2, stride=1";
paddle_mobile::TestPoolOp<1, 2, 2, 1>(in_channels, in_height, in_width); // paddle_mobile::TestPoolOp<1, 2, 2, 1>(in_channels, in_height, in_width);
LOG(paddle_mobile::kLOG_INFO) // LOG(paddle_mobile::kLOG_INFO)
<< "float, pooling_type=avg, kernel=2, pad=5, stride=1"; // << "float, pooling_type=avg, kernel=2, pad=5, stride=1";
paddle_mobile::TestPoolOp<1, 2, 5, 1>(in_channels, in_height, in_width); // paddle_mobile::TestPoolOp<1, 2, 5, 1>(in_channels, in_height, in_width);
//
LOG(paddle_mobile::kLOG_INFO) // LOG(paddle_mobile::kLOG_INFO)
<< "float, pooling_type=max, kernel=2, pad=0, stride=2"; // << "float, pooling_type=max, kernel=2, pad=0, stride=2";
paddle_mobile::TestPoolOp<0, 2, 0, 2>(in_channels, in_height, in_width); // paddle_mobile::TestPoolOp<0, 2, 0, 2>(in_channels, in_height, in_width);
LOG(paddle_mobile::kLOG_INFO) // LOG(paddle_mobile::kLOG_INFO)
<< "float, pooling_type=max, kernel=2, pad=1, stride=2"; // << "float, pooling_type=max, kernel=2, pad=1, stride=2";
paddle_mobile::TestPoolOp<0, 2, 1, 2>(in_channels, in_height, in_width); // paddle_mobile::TestPoolOp<0, 2, 1, 2>(in_channels, in_height, in_width);
LOG(paddle_mobile::kLOG_INFO) // LOG(paddle_mobile::kLOG_INFO)
<< "float, pooling_type=max, kernel=2, pad=2, stride=2"; // << "float, pooling_type=max, kernel=2, pad=2, stride=2";
paddle_mobile::TestPoolOp<0, 2, 2, 2>(in_channels, in_height, in_width); // paddle_mobile::TestPoolOp<0, 2, 2, 2>(in_channels, in_height, in_width);
LOG(paddle_mobile::kLOG_INFO) // LOG(paddle_mobile::kLOG_INFO)
<< "float, pooling_type=max, kernel=2, pad=5, stride=2"; // << "float, pooling_type=max, kernel=2, pad=5, stride=2";
paddle_mobile::TestPoolOp<0, 2, 5, 2>(in_channels, in_height, in_width); // paddle_mobile::TestPoolOp<0, 2, 5, 2>(in_channels, in_height, in_width);
//
LOG(paddle_mobile::kLOG_INFO) // LOG(paddle_mobile::kLOG_INFO)
<< "float, pooling_type=avg, kernel=2, pad=0, stride=2"; // << "float, pooling_type=avg, kernel=2, pad=0, stride=2";
paddle_mobile::TestPoolOp<1, 2, 0, 2>(in_channels, in_height, in_width); // paddle_mobile::TestPoolOp<1, 2, 0, 2>(in_channels, in_height, in_width);
LOG(paddle_mobile::kLOG_INFO) // LOG(paddle_mobile::kLOG_INFO)
<< "float, pooling_type=avg, kernel=2, pad=1, stride=2"; // << "float, pooling_type=avg, kernel=2, pad=1, stride=2";
paddle_mobile::TestPoolOp<1, 2, 1, 2>(in_channels, in_height, in_width); // paddle_mobile::TestPoolOp<1, 2, 1, 2>(in_channels, in_height, in_width);
LOG(paddle_mobile::kLOG_INFO) // LOG(paddle_mobile::kLOG_INFO)
<< "float, pooling_type=avg, kernel=2, pad=2, stride=2"; // << "float, pooling_type=avg, kernel=2, pad=2, stride=2";
paddle_mobile::TestPoolOp<1, 2, 2, 2>(in_channels, in_height, in_width); // paddle_mobile::TestPoolOp<1, 2, 2, 2>(in_channels, in_height, in_width);
LOG(paddle_mobile::kLOG_INFO) // LOG(paddle_mobile::kLOG_INFO)
<< "float, pooling_type=avg, kernel=2, pad=5, stride=2"; // << "float, pooling_type=avg, kernel=2, pad=5, stride=2";
paddle_mobile::TestPoolOp<1, 2, 5, 2>(in_channels, in_height, in_width); // paddle_mobile::TestPoolOp<1, 2, 5, 2>(in_channels, in_height, in_width);
} }
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册