Merge branch 'feature_wuch' into 'master'

fix pooling op

See merge request !45

Merge branch 'feature_wuch' into 'master'
fix pooling op. See merge request !45
1e51497a · 吴承辉 · 4ed6e236 · ab17d3bd · 1e51497a · 1e51497a
4 changed files
--- a/mace/kernels/neon/max_pooling_neon_2x2.cc
+++ b/mace/kernels/neon/max_pooling_neon_2x2.cc
@@ -61,6 +61,8 @@ void PoolingMaxNeonK2x2S2x2(const float *input,
          }
        }

+        w += num_vectors << 2;
+
        for (; num_vectors > 0; --num_vectors) {
          float32x4_t r00 = vld1q_f32(r0);
          float32x4_t r10 = vld1q_f32(r1);
@@ -79,7 +81,6 @@ void PoolingMaxNeonK2x2S2x2(const float *input,
          outptr += 4;
        }

-        w += num_vectors << 2;
        for (; w < out_width; ++w) {
          float max = std::numeric_limits<float>::lowest();
          for (int kh = 0; kh < 2; ++kh) {

--- a/mace/kernels/neon/max_pooling_neon_3x3.cc
+++ b/mace/kernels/neon/max_pooling_neon_3x3.cc
@@ -71,6 +71,7 @@ void PoolingMaxNeonK3x3S2x2(const float *input,
          }
        }

+        w += num_vectors << 2;
        float32x4x2_t row0 = vld2q_f32(r0);
        float32x4x2_t row1 = vld2q_f32(r1);
        float32x4x2_t row2 = vld2q_f32(r2);
@@ -105,7 +106,6 @@ void PoolingMaxNeonK3x3S2x2(const float *input,
          outptr += 4;
        }

-        w += num_vectors << 2;
        for (; w < out_width; ++w) {
          float max = std::numeric_limits<float>::lowest();
          for (int kh = 0; kh < 3; ++kh) {

--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -39,10 +39,12 @@ class PoolingFunctor {
    index_t channels = output_shape[1];
    index_t height = output_shape[2];
    index_t width = output_shape[3];
+    index_t out_image_size = height * width;

    index_t input_channels = input_shape[1];
    index_t input_height = input_shape[2];
    index_t input_width = input_shape[3];
+    index_t in_image_size = input_height * input_width;

    int kernel_h = kernels_[0];
    int kernel_w = kernels_[1];
@@ -57,56 +59,55 @@ class PoolingFunctor {
    int padded_h_start = 0 - paddings_[0] / 2;
    int padded_w_start = 0 - paddings_[1] / 2;

+    if (pooling_type_ == MAX) {
 #pragma omp parallel for collapse(2)
-    for (int n = 0; n < batch; ++n) {
-      for (int c = 0; c < channels; ++c) {
-        index_t out_offset = n * channels * height * width + c * height * width;
-        index_t in_offset = n * input_channels * input_height * input_width +
-                            c * input_height * input_width;
-        for (int h = 0; h < height; ++h) {
-          for (int w = 0; w < width; ++w) {
-            T sum_or_max = 0;
-            switch (pooling_type_) {
-              case AVG:
-                break;
-              case MAX:
-                sum_or_max = std::numeric_limits<T>::lowest();
-                break;
-              default:
-                MACE_CHECK(false, "Unsupported pooling type: ", pooling_type_);
-            }
-            for (int kh = 0; kh < kernel_h; ++kh) {
-              for (int kw = 0; kw < kernel_w; ++kw) {
-                int inh = padded_h_start + h * stride_h + dilation_h * kh;
-                int inw = padded_w_start + w * stride_w + dilation_w * kw;
-                if (inh >= 0 && inh < input_height && inw >= 0 &&
-                    inw < input_width) {
-                  index_t input_offset = in_offset + inh * input_width + inw;
-                  switch (pooling_type_) {
-                    case AVG:
-                      sum_or_max += input[input_offset];
-                      break;
-                    case MAX:
-                      sum_or_max = std::max(sum_or_max, input[input_offset]);
-                      break;
-                    default:
-                      MACE_CHECK(false, "Unsupported pooling type: ",
-                                 pooling_type_);
+      for (int b = 0; b < batch; ++b) {
+        for (int c = 0; c < channels; ++c) {
+          index_t out_offset = (b * channels + c) * out_image_size;
+          index_t in_offset = (b * input_channels + c) * in_image_size;
+          for (int h = 0; h < height; ++h) {
+            for (int w = 0; w < width; ++w) {
+              T max = std::numeric_limits<T>::lowest();
+              for (int kh = 0; kh < kernel_h; ++kh) {
+                for (int kw = 0; kw < kernel_w; ++kw) {
+                  int inh = padded_h_start + h * stride_h + dilation_h * kh;
+                  int inw = padded_w_start + w * stride_w + dilation_w * kw;
+                  if (inh >= 0 && inh < input_height && inw >= 0 &&
+                      inw < input_width) {
+                    index_t input_offset = in_offset + inh * input_width + inw;
+                    max = std::max(max, input[input_offset]);
                  }
                }
              }
+              output[out_offset] = max;
+              out_offset += 1;
            }
-            switch (pooling_type_) {
-              case AVG:
-                output[out_offset] = sum_or_max / (kernel_h * kernel_w);
-                break;
-              case MAX:
-                output[out_offset] = sum_or_max;
-                break;
-              default:
-                MACE_CHECK(false, "Unsupported pooling type: ", pooling_type_);
+          }
+        }
+      }
+    } else if (pooling_type_ == AVG) {
+#pragma omp parallel for collapse(2)
+      for (int b = 0; b < batch; ++b) {
+        for (int c = 0; c < channels; ++c) {
+          index_t out_offset = (b * channels + c) * out_image_size;
+          index_t in_offset = (b * input_channels + c) * in_image_size;
+          for (int h = 0; h < height; ++h) {
+            for (int w = 0; w < width; ++w) {
+              T sum = 0;
+              for (int kh = 0; kh < kernel_h; ++kh) {
+                for (int kw = 0; kw < kernel_w; ++kw) {
+                  int inh = padded_h_start + h * stride_h + dilation_h * kh;
+                  int inw = padded_w_start + w * stride_w + dilation_w * kw;
+                  if (inh >= 0 && inh < input_height && inw >= 0 &&
+                      inw < input_width) {
+                    index_t input_offset = in_offset + inh * input_width + inw;
+                    sum += input[input_offset];
+                  }
+                }
+              }
+              output[out_offset] = sum / (kernel_h * kernel_w);
+              out_offset += 1;
            }
-            out_offset += 1;
          }
        }
      }

--- a/mace/ops/pooling.h
+++ b/mace/ops/pooling.h
@@ -24,16 +24,17 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
  bool Run() override {
    const Tensor* input = this->Input(INPUT);
    Tensor* output = this->Output(OUTPUT);
-    std::vector<index_t> in_shape = input->shape();

    std::vector<index_t> output_shape(4);
    std::vector<int> paddings(2);
-    std::vector<index_t> filter_shape = std::vector<index_t>(4);
-    filter_shape[0] = in_shape[1];
-    filter_shape[1] = in_shape[0];
+    std::vector<index_t> filter_shape(4);
+    filter_shape[0] = input->shape()[1];
+    filter_shape[1] = input->shape()[0];
    filter_shape[2] = kernels_[0];
    filter_shape[3] = kernels_[1];
-    kernels::CalcPaddingAndOutputSize(in_shape.data(), filter_shape.data(),
+
+    kernels::CalcPaddingAndOutputSize(input->shape().data(),
+                                      filter_shape.data(),
                                      this->dilations_.data(),
                                      this->strides_.data(), this->padding_,
                                      output_shape.data(), paddings.data());
@@ -42,7 +43,7 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
    auto pooling_func = kernels::PoolingFunctor<D, T>(
        pooling_type_, kernels_.data(), this->strides_.data(), paddings.data(),
        this->dilations_.data());
-    pooling_func(input->data<float>(), in_shape.data(),
+    pooling_func(input->data<float>(), input->shape().data(),
                 output->mutable_data<float>(), output->shape().data());
    return true;
  };