global avg pooling

7b6c3241 · wuchenghui · 5b21653b · 7b6c3241 · 7b6c3241 · 7b6c3241
6 changed files
--- a/mace/kernels/global_avg_pooling.h
+++ b/mace/kernels/global_avg_pooling.h
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_KERNELS_GLOBAL_AVG_POOLING_H_
+#define MACE_KERNELS_GLOBAL_AVG_POOLING_H_
+
+#include "mace/core/tensor.h"
+
+namespace mace {
+namespace kernels {
+
+// Reference global average pooling kernel.
+//
+// operator() reads four extents from input_shape, named here batch, channels,
+// height and width, and treats `input` as batch * channels consecutive planes
+// of height * width elements each. For every plane it writes the arithmetic
+// mean of that plane to the matching slot of `output`, so `output` must have
+// room for batch * channels elements.
+//
+// NOTE(review): a plane size of zero makes the final division a division by
+// zero; callers are expected to pass non-empty planes.
+template <DeviceType D, typename T>
+class GlobalAvgPoolingFunctor {
+ public:
+  GlobalAvgPoolingFunctor() {}
+
+  // input:       pointer to the first element of the first plane.
+  // input_shape: pointer to at least four extents (batch, channels, height,
+  //              width, in that order).
+  // output:      receives one mean per (batch, channel) plane.
+  void operator()(const T *input, const index_t *input_shape, T *output) {
+    const index_t batch = input_shape[0];
+    const index_t channels = input_shape[1];
+    const index_t height = input_shape[2];
+    const index_t width = input_shape[3];
+
+    const index_t image_size = height * width;
+    const index_t total_channels = batch * channels;
+
+    // Counters use index_t so they have the same type as the bounds they are
+    // compared against; an int counter could truncate or overflow if index_t
+    // is wider than int.
+    for (index_t c = 0; c < total_channels; ++c) {
+      const T *plane = input + c * image_size;
+      T sum = 0;
+      for (index_t i = 0; i < image_size; ++i) {
+        sum += plane[i];
+      }
+      output[c] = sum / image_size;
+    }
+  }
+};
+
+// Declares the explicit specialization of operator() for the NEON device
+// with float data. Only the declaration lives in this header, so translation
+// units that include it use the specialized body instead of instantiating
+// the generic one above.
+template <>
+void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
+    const float *input, const index_t *input_shape, float *output);
+
+}  //  namespace kernels
+}  //  namespace mace
+
+#endif  // MACE_KERNELS_GLOBAL_AVG_POOLING_H_
\ No newline at end of file
--- a/mace/kernels/neon/global_avg_pooling_neon.cc
+++ b/mace/kernels/neon/global_avg_pooling_neon.cc
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/kernels/global_avg_pooling.h"
+#include <arm_neon.h>
+
+namespace mace {
+namespace kernels {
+
+// NEON/float specialization of global average pooling.
+//
+// Reads four extents from input_shape (batch, channels, height, width) and
+// treats `input` as batch * channels consecutive planes of height * width
+// floats. Each plane is summed four lanes at a time with NEON adds, the
+// leftover (fewer than four) elements are added one by one, and the mean is
+// written to output[plane]. Planes are distributed over threads by the
+// OpenMP pragma; every iteration writes only its own output slot.
+//
+// NOTE(review): a plane size of zero makes the final division 0 / 0.
+template <>
+void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
+    const float *input,
+    const index_t *input_shape,
+    float *output) {
+  const index_t batch = input_shape[0];
+  const index_t channels = input_shape[1];
+  const index_t height = input_shape[2];
+  const index_t width = input_shape[3];
+
+  const index_t image_size = height * width;
+  const index_t total_channels = batch * channels;
+
+  // Loop-invariant split of a plane into full 4-lane vectors plus a scalar
+  // tail. Kept in index_t so a large plane size is not narrowed to int.
+  const index_t num_vectors = image_size >> 2;
+  const index_t remain = image_size - (num_vectors << 2);
+
+#pragma omp parallel for
+  for (index_t c = 0; c < total_channels; ++c) {
+    const float *inptr = input + c * image_size;
+    float sum = 0.0f;
+
+    if (num_vectors > 0) {
+      // Seed the accumulator with the first vector, then fold in the rest.
+      float32x4_t sum_vector = vld1q_f32(inptr);
+      inptr += 4;
+      for (index_t n = 1; n < num_vectors; ++n) {
+        sum_vector = vaddq_f32(sum_vector, vld1q_f32(inptr));
+        inptr += 4;
+      }
+
+      // Spill the four partial sums and reduce them left to right.
+      float sum_out[4];
+      vst1q_f32(sum_out, sum_vector);
+      sum = sum_out[0] + sum_out[1] + sum_out[2] + sum_out[3];
+    }
+
+    // Scalar tail: the elements that did not fill a whole vector.
+    for (index_t i = 0; i < remain; ++i) {
+      sum += inptr[i];
+    }
+    output[c] = sum / image_size;
+  }
+}
+
+}  // namespace kernels
+}  // namespace mace
\ No newline at end of file
--- a/mace/ops/global_avg_pooling.cc
+++ b/mace/ops/global_avg_pooling.cc
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/ops/global_avg_pooling.h"
+
+namespace mace {
+
+// Register the float GlobalAvgPooling operator for the CPU device.
+REGISTER_CPU_OPERATOR(
+    GlobalAvgPooling,
+    GlobalAvgPoolingOp<DeviceType::CPU, float>);
+
+// The NEON variant is registered only when __ARM_NEON evaluates to non-zero.
+#if __ARM_NEON
+REGISTER_NEON_OPERATOR(
+    GlobalAvgPooling,
+    GlobalAvgPoolingOp<DeviceType::NEON, float>);
+#endif  // __ARM_NEON
+
+}  //  namespace mace
--- a/mace/ops/global_avg_pooling.h
+++ b/mace/ops/global_avg_pooling.h
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_OPS_GLOBAL_AVG_POOLING_H_
+#define MACE_OPS_GLOBAL_AVG_POOLING_H_
+
+#include "mace/core/operator.h"
+#include "mace/kernels/global_avg_pooling.h"
+
+namespace mace {
+
+// Operator that applies kernels::GlobalAvgPoolingFunctor<D, T> to its single
+// input tensor and stores the result in its single output tensor.
+template <DeviceType D, class T>
+class GlobalAvgPoolingOp : public Operator<D, T> {
+ public:
+  GlobalAvgPoolingOp(const OperatorDef &operator_def, Workspace *ws)
+      : Operator<D, T>(operator_def, ws) {}
+
+  // Resizes the output to {in[0], in[1], 1, 1}, where in[] is the input
+  // shape, runs the pooling functor and returns true.
+  //
+  // NOTE(review): the first four input dimensions are read without checking
+  // the tensor's rank; the input is presumably always 4-D - confirm at the
+  // call sites.
+  bool Run() override {
+    const Tensor *input = this->Input(INPUT);
+    Tensor *output = this->Output(OUTPUT);
+
+    std::vector<index_t> output_shape(4);
+    output_shape[0] = input->shape()[0];
+    output_shape[1] = input->shape()[1];
+    output_shape[2] = output_shape[3] = 1;
+
+    output->Resize(output_shape);
+
+    // Use the operator's own element type T for the data accessors so they
+    // match the functor's pointer types instead of hard-coding float.
+    auto pooling_func = kernels::GlobalAvgPoolingFunctor<D, T>();
+    pooling_func(input->data<T>(), input->shape().data(),
+                 output->mutable_data<T>());
+    return true;
+  }
+
+ protected:
+  OP_INPUT_TAGS(INPUT);
+  OP_OUTPUT_TAGS(OUTPUT);
+};
+
+}  // namespace mace
+
+#endif  // MACE_OPS_GLOBAL_AVG_POOLING_H_
--- a/mace/ops/global_avg_pooling_benchmark.cc
+++ b/mace/ops/global_avg_pooling_benchmark.cc
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/kernels/global_avg_pooling.h"
+#include "mace/core/operator.h"
+#include "mace/core/testing/test_benchmark.h"
+#include "mace/ops/ops_test_util.h"
+
+using namespace mace;
+using namespace mace::kernels;
+
+// Benchmark body: builds a one-operator GlobalAvgPooling net with a random
+// {batch, channels, height, width} float input, runs it a few times untimed
+// and then runs it `iters` times with the benchmark timer running.
+template <DeviceType D>
+static void GlobalAvgPooling(int iters,
+                             int batch,
+                             int channels,
+                             int height,
+                             int width) {
+  // Keep graph construction and warm-up out of the measured time.
+  mace::testing::StopTiming();
+
+  OpsTestNet net;
+  OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest")
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.operator_def());
+
+  // Random input tensor of the requested shape.
+  net.AddRandomInput<float>("Input", {batch, channels, height, width});
+
+  // Untimed warm-up runs.
+  const int kWarmUpRuns = 5;
+  for (int run = 0; run < kWarmUpRuns; ++run) {
+    net.RunOp(D);
+  }
+
+  // Timed runs: count `iters` down to zero.
+  mace::testing::StartTiming();
+  for (; iters != 0; --iters) {
+    net.RunOp(D);
+  }
+}
+
+// Defines one benchmark function named after the shape and device, reports
+// iters * N * C * H * W items (and that many floats' worth of bytes) as
+// processed, runs GlobalAvgPooling<DEVICE>, and hands the function to
+// BENCHMARK.
+#define BM_GLOBAL_AVG_POOLING_MACRO(N, C, H, W, DEVICE)                \
+  static void BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE(  \
+      int iters) {                                                     \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;   \
+    mace::testing::ItemsProcessed(tot);                                \
+    mace::testing::BytesProcessed(tot * (sizeof(float)));              \
+    GlobalAvgPooling<DEVICE>(iters, N, C, H, W);                       \
+  }                                                                    \
+  BENCHMARK(BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE)
+
+// Instantiates the benchmark for both the CPU and the NEON device.
+#define BM_GLOBAL_AVG_POOLING(N, C, H, W)       \
+  BM_GLOBAL_AVG_POOLING_MACRO(N, C, H, W, CPU); \
+  BM_GLOBAL_AVG_POOLING_MACRO(N, C, H, W, NEON);
+
+// Benchmarked shapes: batch 1, 3 channels, increasing spatial size.
+BM_GLOBAL_AVG_POOLING(1, 3, 7, 7);
+BM_GLOBAL_AVG_POOLING(1, 3, 64, 64);
+BM_GLOBAL_AVG_POOLING(1, 3, 256, 256);
\ No newline at end of file
--- a/mace/ops/global_avg_pooling_test.cc
+++ b/mace/ops/global_avg_pooling_test.cc
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+#include "mace/core/operator.h"
+#include "mace/ops/ops_test_util.h"
+
+using namespace mace;
+
+// Fixture for the GlobalAvgPooling operator tests; adds nothing of its own
+// on top of OpsTestBase.
+class GlobalAvgPoolingOpTest : public OpsTestBase {
+};
+
+// Runs the operator through the default RunOp() path on a {1, 3, 7, 7} input
+// whose three planes are constant 1, 2 and 3, and expects those constants
+// back as the per-plane means.
+TEST_F(GlobalAvgPoolingOpTest, 3x7x7_CPU) {
+  // Single-operator graph: "Input" -> GlobalAvgPooling -> "Output".
+  auto &net = test_net();
+  OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest")
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.operator_def());
+
+  // Plane k (0-based) holds 49 copies of the value k + 1.
+  const int kPlanes = 3;
+  const int kPlaneSize = 7 * 7;
+  std::vector<float> input;
+  for (int plane = 0; plane < kPlanes; ++plane) {
+    input.insert(input.end(), kPlaneSize, static_cast<float>(plane + 1));
+  }
+  net.AddInputFromArray<float>("Input", {1, 3, 7, 7}, input);
+
+  // Execute the operator.
+  net.RunOp();
+
+  // The mean of a constant plane is that constant.
+  auto expected = CreateTensor<float>({1, 3, 1, 1}, {1, 2, 3});
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+}
+
+// Same scenario as the default-device test, but the operator is run with
+// DeviceType::NEON: a {1, 3, 7, 7} input whose planes are constant 1, 2 and 3
+// must pool to {1, 2, 3}.
+TEST_F(GlobalAvgPoolingOpTest, 3x7x7_NEON) {
+  // Single-operator graph: "Input" -> GlobalAvgPooling -> "Output".
+  auto &net = test_net();
+  OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest")
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.operator_def());
+
+  // Plane k (0-based) holds 49 copies of the value k + 1.
+  const int kPlanes = 3;
+  const int kPlaneSize = 7 * 7;
+  std::vector<float> input;
+  for (int plane = 0; plane < kPlanes; ++plane) {
+    input.insert(input.end(), kPlaneSize, static_cast<float>(plane + 1));
+  }
+  net.AddInputFromArray<float>("Input", {1, 3, 7, 7}, input);
+
+  // Execute the operator on the NEON device.
+  net.RunOp(DeviceType::NEON);
+
+  // The mean of a constant plane is that constant.
+  auto expected = CreateTensor<float>({1, 3, 1, 1}, {1, 2, 3});
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+}