optimize pool kernel

e6f65a70 · wangliu · 98c24151 · e6f65a70 · e6f65a70 · e6f65a70
9 changed file
--- a/src/operators/kernel/arm/conv_add_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_kernel.cpp
@@ -42,8 +42,6 @@ void expand_bias(Tensor &bias, int axis, const DDim &dDim) {
 template <>
 void ConvAddKernel<CPU, float>::Compute(
    const FushionConvAddParam &param) const {
-  DLOG << param;
-
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor bias = *param.Bias();

--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -21,8 +21,6 @@ namespace operators {

 template <>
 void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
-  LOG(kLOG_DEBUG) << param;
-
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor *output = param.Output();
@@ -32,8 +30,6 @@ void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
  std::vector<int> paddings = param.Paddings();
  std::vector<int> dilations = param.Dilations();

-  //  DLOG << " compute end get Attrs " << strides[0];
-
  const int batch_size = static_cast<int>(input->dims()[0]);

  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
@@ -66,7 +62,6 @@ void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
  framework::DDim filter_matrix_shape = {filter.dims()[0],
                                         filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);
-  DLOG << " filter.dims() = " << filter.dims();
  framework::DDim output_matrix_shape = {
      output->dims()[1],
      output->numel() / (output->dims()[0] * output->dims()[1])};

--- a/src/operators/kernel/arm/pool_kernel.cpp
+++ b/src/operators/kernel/arm/pool_kernel.cpp
@@ -56,22 +56,23 @@ void PoolKernel<CPU, float>::Compute(const PoolParam &param) const {
      paddings[i] = 0;
      ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
    }
+  } else if (ksize[0] == 3 && ksize[0] == ksize[1]) {
+    if (pooling_type == "max") {
+      math::Pool3x3Max(strides, paddings, in_x, out);
+    } else if (pooling_type == "avg") {
+      math::Pool3x3Avg(strides, paddings, in_x, out);
    }

-  PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+  } else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
+    if (pooling_type == "max") {
+      math::Pool2x2Max(strides, paddings, in_x, out);
+    } else if (pooling_type == "avg") {
+      math::Pool2x2Avg(strides, paddings, in_x, out);
+    }

-  //    if (param.isGlobalPooling() || ksize[0] != ksize[1] ||
-  //        strides[0] != strides[1] || strides[1] != 2 ||
-  //        paddings[0] != paddings[1] || paddings[1] > 1) {
-  //        PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
-  //
-  //    } else if (ksize[0] == 2) {
-  //
-  //    } else if (ksize[0] == 3) {
-  //
-  //    } else {
-  //        PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
-  //    }
+  } else {
+    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+  }
 }
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/math/pool_2x2.cpp
+++ b/src/operators/math/pool_2x2.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef POOL_OP
+#include "pool_2x2.h"
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+
+void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
+                Tensor *output) {
+#if __ARM_NEON
+  const int batch_size = input->dims()[0];
+
+  const int input_height = input->dims()[2];
+
+  const int input_width = input->dims()[3];
+
+  const int output_channels = output->dims()[1];
+
+  int output_height = output->dims()[2];
+  const int output_width = output->dims()[3];
+  const int ksize_height = 2;
+  const int ksize_width = 2;
+  const int stride_height = strides[0];
+  const int stride_width = strides[1];
+  const int padding_height = paddings[0];
+  const int padding_width = paddings[1];
+
+  const int input_channel_stride = input_height * input_width;
+  const int output_channel_stride = output_height * output_width;
+
+  const float *input_data = input->data<float>();
+  float *output_data = output->mutable_data<float>();
+
+  int out_w_num = output_width >> 2;
+  const int in_h_num = output_height >> 1;
+  const int input_batch_stride = output_channels * input_channel_stride;
+  const int output_batch_stride = output_channels * output_channel_stride;
+  int remain = output_width - out_w_num << 2;
+  for (int i = 0; i < batch_size; ++i) {
+    for (int c = 0; c < output_channels; ++c) {
+      const float *input_data_chanel_row_next = input_data + input_width;
+      for (; output_height > 0; output_height--) {
+        if (out_w_num > 0) {
+          asm volatile(
+              "max_loop:                            \n\t"
+              "vld1.f32  {q0,q1},  [%[in_ptr1]]!         \n\t"
+              "vld1.f32  {q2,q3},  [%[in_ptr2]]!         \n\t"
+              "vmax.f32  q0,  q0,  q2                 \n\t"
+              "vmax.f32  q1,  q1,  q3                 \n\t"
+              "vpmax.f32  d4,  d0, d1                  \n\t"
+              "vpmax.f32  d5,  d2, d3                  \n\t"
+              "subs %[out_w_num],  #1                  \n\t"
+              "vst1.32  {q2},  [%[out_ptr]]!                 \n\t"
+              "bne  max_loop                            \n\t"
+              : [in_ptr1] "+r"(input_data),
+                [in_ptr2] "+r"(input_data_chanel_row_next),
+                [out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num)
+              :
+              : "memory", "q0", "q1", "q2", "q3");
+        }
+
+        for (; remain > 0; remain--) {
+          float max_row1 = std::max(input_data[0], input_data[1]);
+          float max_row2 = std::max(input_data_chanel_row_next[0],
+                                    input_data_chanel_row_next[1]);
+          *output_data = std::max(max_row1, max_row2);
+          input_data += 2;
+          input_data_chanel_row_next += 2;
+          output_data++;
+        }
+      }
+      input_data += input_channel_stride;
+      output_data += output_channel_stride;
+    }
+    input_data += input_batch_stride;
+    output_data += output_batch_stride;
+  }
+#endif
+}
+
+void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
+                Tensor *output) {
+#if __ARM_NEON
+  const int batch_size = input->dims()[0];
+
+  const int input_height = input->dims()[2];
+
+  const int input_width = input->dims()[3];
+
+  const int output_channels = output->dims()[1];
+
+  int output_height = output->dims()[2];
+  const int output_width = output->dims()[3];
+  const int ksize_height = 2;
+  const int ksize_width = 2;
+  const int stride_height = strides[0];
+  const int stride_width = strides[1];
+  const int padding_height = paddings[0];
+  const int padding_width = paddings[1];
+
+  const int input_channel_stride = input_height * input_width;
+  const int output_channel_stride = output_height * output_width;
+
+  const float *input_data = input->data<float>();
+  float *output_data = output->mutable_data<float>();
+
+  int out_w_num = output_width >> 2;
+  const int input_batch_stride = output_channels * input_channel_stride;
+  const int output_batch_stride = output_channels * output_channel_stride;
+  float vqua[] = {0.25f, 0.25f, 0.25f, 0.25f};
+  int remain = output_width - out_w_num << 2;
+  for (int i = 0; i < batch_size; ++i) {
+    for (int c = 0; c < output_channels; ++c) {
+      const float *input_data_chanel_row_next = input_data + input_width;
+      for (; output_height > 0; output_height--) {
+        if (out_w_num > 0) {
+          asm volatile(
+              "avg_loop:                            \n\t"
+              "vld1.32  {q0,q1},  [%[in_ptr1]]!         \n\t"
+              "vld1.32  {q2,q3},  [%[in_ptr2]]!         \n\t"
+              "vadd.f32  q0,  q0,  q2                 \n\t"
+              "vadd.f32  q1,  q1,  q3                 \n\t"
+              "vpadd.f32  d4,  d0, d1                  \n\t"
+              "vpadd.f32  d5,  d2, d3                  \n\t"
+              "vld1.32  {q4}, [%[vqua]]!                  \n\t"
+              "vmul.f32  q2,  q2,  q4                          \n\t"
+              "subs %[out_w_num],  #1                  \n\t"
+              "vst1.32  {q2},  [%[out_ptr]]!                 \n\t"
+              "bne  avg_loop                            \n\t"
+              : [in_ptr1] "+r"(input_data),
+                [in_ptr2] "+r"(input_data_chanel_row_next),
+                [out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num)
+              : [vqua] "r"(vqua)
+              : "memory", "q0", "q1", "q2", "q3", "q4");
+        }
+
+        for (; remain > 0; remain--) {
+          float max_row1 = std::max(input_data[0], input_data[1]);
+          float max_row2 = std::max(input_data_chanel_row_next[0],
+                                    input_data_chanel_row_next[1]);
+          *output_data = std::max(max_row1, max_row2);
+          input_data += 2;
+          input_data_chanel_row_next += 2;
+          output_data++;
+        }
+      }
+      input_data += input_channel_stride;
+      output_data += output_channel_stride;
+    }
+    input_data += input_batch_stride;
+    output_data += output_batch_stride;
+  }
+#endif
+}
+
+//}
+}  // namespace math
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/math/pool_2x2.h
+++ b/src/operators/math/pool_2x2.h
@@ -16,16 +16,22 @@ limitations under the License. */

 #pragma once

+#include "framework/tensor.h"
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif  // __ARM_NEON
-
-static void Pool2x2Max() {
-  // todo impl with neon
-}
-
-static void Pool2x2Avg() {
-  // todo impl with neon
-}
-
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+using framework::Tensor;
+using std::vector;
+
+void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
+                Tensor *output);
+
+void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *in_x,
+                Tensor *out);
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
 #endif
--- a/src/operators/math/pool_3x3.cpp
+++ b/src/operators/math/pool_3x3.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef POOL_OP
+#define __ARM_NEON true
+#include "pool_3x3.h"
+#include "framework/tensor.h"
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif  // __ARM_NEON
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+using framework::Tensor;
+using std::max;
+using std::min;
+using std::vector;
+
+void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
+                Tensor *output) {
+#if __ARM_NEON
+  const int batch_size = input->dims()[0];
+
+  const int input_height = input->dims()[2];
+
+  const int input_width = input->dims()[3];
+
+  const int output_channels = output->dims()[1];
+
+  const int output_height = output->dims()[2];
+  const int output_width = output->dims()[3];
+  const int _kernel_size = 3;
+  const int stride_height = strides[0];
+  const int stride_width = strides[1];
+  const int padding_height = paddings[0];
+  const int padding_width = paddings[1];
+  const float negative_max = -INT_MAX;
+  const int input_channel_stride = input_height * input_width;
+  const int output_channel_stride = output_height * output_width;
+
+  const float *input_data = input->data<float>();
+  float *output_data = output->mutable_data<float>();
+
+  const int input_batch_stride = output_channels * input_channel_stride;
+  const int output_batch_stride = output_channels * output_channel_stride;
+  const float *pos1, *pos2, *pos3, *output_ptr;
+  int hstart, wstart, hend, wend;
+  for (int i = 0; i < batch_size; ++i) {
+    for (int c = 0; c < output_channels; ++c) {
+      for (int ph = 0; ph < output_height; ph++) {
+        for (int pw = 0; pw < output_width; pw++) {
+          hstart = ph * stride_height - padding_height;
+          wstart = pw * stride_width - padding_width;
+          hend = min(hstart + _kernel_size, input_height + padding_height);
+          wend = min(wstart + _kernel_size, input_width + padding_width);
+          hstart = max(hstart, 0);
+          wstart = max(wstart, 0);
+          hend = min(hend, input_height);
+          wend = min(wend, input_width);
+          pos1 = input_data + hstart * input_width + wstart;
+          pos2 = input_data + (hstart + 1) * input_width + wstart;
+          pos3 = input_data + (hstart + 2) * input_width + wstart;
+          output_ptr = output_data + ph * output_width + pw;
+
+          if (hend - hstart != 3 || wend - wstart != 3) {
+            float max_value = -INT_MAX;
+            for (int h = hstart; h < hend; h++) {
+              for (int w = wstart; w < wend; w++) {
+                float value = input_data[h * input_width + w];
+                if (value > max_value) {
+                  max_value = value;
+                }
+              }
+            }
+            output_data[ph * output_width + pw] = max_value;
+          } else {
+#if defined(ARMV7)
+            asm volatile(
+                "vld1.32  {q1}, [%[pos1]]        \n\t"
+                "vld1.32  {q2}, [%[pos2]]        \n\t"
+                "vld1.32  {q3}, [%[pos3]]        \n\t"
+                "vmax.f32 q1, q1, q2            \n\t"
+                "vmax.f32 q2, q1, q3            \n\t"
+                "vmov.f32 d5[1],  %[negative_max]         \n\t"
+                "vpmax.f32  d6, d4, d5            \n\t"
+                "vpmax.f32  d7, d6, d6             \n\t"
+                "vst1.32 {d7[0]},[%[output_ptr]]    \n\t"
+                :
+                : [input_data] "r"(input_data), [pos1] "r"(pos1),
+                  [pos2] "r"(pos2), [pos3] "r"(pos3),
+                  [output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max)
+                : "memory", "q1", "q2", "q3", "q4");
+#else
+            const float32x4_t data1 = vld1q_f32(pos1);
+            const float32x4_t data2 = vld1q_f32(pos2);
+            const float32x4_t data3 = vld1q_f32(pos3);
+            const float32x4_t max_data =
+                vmaxq_f32(vmaxq_f32(data1, data3), data2);
+            float32x2_t res =
+                vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)),
+                          vget_low_f32(max_data));
+            res = vpmax_f32(res, res);
+            output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
+#endif
+          }
+        }
+      }
+      input_data += input_channel_stride;
+      output_data += output_channel_stride;
+    }
+    input_data += input_batch_stride;
+    output_data += output_batch_stride;
+  }
+#endif
+}
+
+void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
+                Tensor *output) {
+#if __ARM_NEON
+  const int batch_size = input->dims()[0];
+
+  const int input_height = input->dims()[2];
+
+  const int input_width = input->dims()[3];
+
+  const int output_channels = output->dims()[1];
+
+  const int output_height = output->dims()[2];
+  const int output_width = output->dims()[3];
+  const int _kernel_size = 3;
+  const int stride_height = strides[0];
+  const int stride_width = strides[1];
+  const int padding_height = paddings[0];
+  const int padding_width = paddings[1];
+
+  const int input_channel_stride = input_height * input_width;
+  const int output_channel_stride = output_height * output_width;
+
+  const float *input_data = input->data<float>();
+  float *output_data = output->mutable_data<float>();
+  const float zero = 0;
+  const float nine = 1.0 / 9.0;
+  const float nine_ptr[] = {nine, nine};
+
+  const int input_batch_stride = output_channels * input_channel_stride;
+  const int output_batch_stride = output_channels * output_channel_stride;
+  for (int i = 0; i < batch_size; ++i) {
+    for (int c = 0; c < output_channels; ++c) {
+      for (int ph = 0; ph < output_height; ph++) {
+        for (int pw = 0; pw < output_width; pw++) {
+          int hstart = ph * stride_height - padding_height;
+          int wstart = pw * stride_width - padding_width;
+          int hend = min(hstart + _kernel_size, input_height + padding_height);
+          int wend = min(wstart + _kernel_size, input_width + padding_width);
+          hstart = max(hstart, 0);
+          wstart = max(wstart, 0);
+          hend = min(hend, input_height);
+          wend = min(wend, input_width);
+          const float *pos1 = input_data + hstart * input_width + wstart;
+          const float *pos2 = input_data + (hstart + 1) * input_width + wstart;
+          const float *pos3 = input_data + (hstart + 2) * input_width + wstart;
+          const float *output_ptr = output_data + ph * output_width + pw;
+
+          if (hend - hstart != 3 || wend - wstart != 3) {
+            float sum = 0;
+            for (int h = hstart; h < hend; h++) {
+              for (int w = wstart; w < wend; w++) {
+                sum += input_data[h * input_width + w];
+              }
+            }
+            output_data[ph * output_width + pw] = sum / 9.0;
+          } else {
+#if defined(ARMV7)
+
+            asm volatile(
+                "vld1.32  {q1}, [%[pos1]]        \n\t"
+                "vld1.32  {q2}, [%[pos2]]        \n\t"
+                "vld1.32  {q3}, [%[pos3]]        \n\t"
+                "vadd.f32 q1, q1, q2            \n\t"
+                "vadd.f32 q2, q1, q3            \n\t"
+                "vmov.f32 d5[1],  %[zero]         \n\t"
+                "vpadd.f32  d6, d4, d5            \n\t"
+                "vpadd.f32  d6, d6, d6             \n\t"
+                "vld1.f32 d7, [%[nine_ptr]]!        \n\t"
+                "vmul.f32 d6,d7                     \n\t"
+                "vst1.32 {d6[0]},[%[output_ptr]]    \n\t"
+                :
+                : [input_data] "r"(input_data), [pos1] "r"(pos1),
+                  [pos2] "r"(pos2), [pos3] "r"(pos3),
+                  [output_ptr] "r"(output_ptr), [zero] "r"(zero),
+                  [nine_ptr] "r"(nine_ptr)
+                : "memory", "r6", "q1", "q2", "q3", "q4");
+#else
+            const float32x4_t data1 = vld1q_f32(pos1);
+            const float32x4_t data2 = vld1q_f32(pos2);
+            const float32x4_t data3 = vld1q_f32(pos3);
+            const float32x4_t sum_data =
+                vaddq_f32(vaddq_f32(data1, data3), data2);
+            float32x2_t res =
+                vpadd_f32(vget_high_f32(vsetq_lane_f32(0, sum_data, 3)),
+                          vget_low_f32(sum_data));
+            res = vpadd_f32(res, res);
+            output_data[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0;
+#endif
+          }
+        }
+      }
+      input_data += input_channel_stride;
+      output_data += output_channel_stride;
+    }
+    input_data += input_batch_stride;
+    output_data += output_batch_stride;
+  }
+#endif
+}
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/math/pool3x3.h
+++ b/src/operators/math/pool3x3.h
@@ -16,16 +16,24 @@ limitations under the License. */

 #pragma once

+#include "framework/tensor.h"
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif  // __ARM_NEON

-static void Pool3x3Max() {
-  // todo impl with neon
-}
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+using framework::Tensor;
+using std::vector;

-static void Pool3x3Avg() {
-  // todo impl with neon
-}
+void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
+                Tensor *output);
+
+void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *in_x,
+                Tensor *out);
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile

 #endif
--- a/src/operators/math/pooling.cpp
+++ b/src/operators/math/pooling.cpp
@@ -38,9 +38,7 @@ class PoolFunctor<CPU, PoolProcess, T> {
    const int input_height = input.dims()[2];

    const int input_width = input.dims()[3];
-    if (output == nullptr) {
-      DLOG << "output tensor is null";
-    }
+
    const int output_channels = output->dims()[1];

    const int output_height = output->dims()[2];

--- a/src/operators/math/pooling.h
+++ b/src/operators/math/pooling.h
@@ -18,6 +18,8 @@ limitations under the License. */

 #include "common/log.h"
 #include "framework/tensor.h"
+#include "pool_2x2.h"
+#include "pool_3x3.h"

 namespace paddle_mobile {
 namespace operators {