提交 303311af 编写于 作者: L liuqi

Change index type to TIndex and add a member variable to the batch_norm kernel functor.

上级 9aabdccc
...@@ -44,8 +44,8 @@ class OperatorBase { ...@@ -44,8 +44,8 @@ class OperatorBase {
*operator_def_, name, default_value); *operator_def_, name, default_value);
} }
inline const Tensor *Input(int idx) { inline const Tensor *Input(TIndex idx) {
MACE_CHECK(static_cast<size_t>(idx) < inputs_.size()); MACE_CHECK(idx < inputs_.size());
return inputs_[idx]; return inputs_[idx];
} }
......
...@@ -68,15 +68,8 @@ class Tensor { ...@@ -68,15 +68,8 @@ class Tensor {
inline TIndex dim_size() const { return shape_.size(); } inline TIndex dim_size() const { return shape_.size(); }
inline int dim32(int index) const { inline TIndex dim(TIndex index) const {
MACE_CHECK(static_cast<size_t>(index) < shape_.size(), "Exceeding ndim limit"); MACE_CHECK(index < shape_.size(), "Exceeding ndim limit");
MACE_CHECK(index >= 0, "Cannot have negative dimension index");
MACE_CHECK(shape_[index], std::numeric_limits<int>::max());
return static_cast<int>(shape_[index]);
}
inline TIndex dim(int index) const {
MACE_CHECK(static_cast<size_t>(index) < shape_.size(), "Exceeding ndim limit");
MACE_CHECK(index >= 0, "Cannot have negative dimension index"); MACE_CHECK(index >= 0, "Cannot have negative dimension index");
return shape_[index]; return shape_[index];
} }
...@@ -133,8 +126,8 @@ class Tensor { ...@@ -133,8 +126,8 @@ class Tensor {
} }
template <typename T> template <typename T>
inline void Copy(const T* src, size_t size) { inline void Copy(const T* src, TIndex size) {
MACE_CHECK(static_cast<TIndex>(size) == size_, "copy src and dst with different size."); MACE_CHECK(size == size_, "copy src and dst with different size.");
CopyBytes(static_cast<const void*>(src), sizeof(T) * size); CopyBytes(static_cast<const void*>(src), sizeof(T) * size);
} }
......
...@@ -11,33 +11,29 @@ ...@@ -11,33 +11,29 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <DeviceType D, typename T>
struct BatchNormFunctorBase {
BatchNormFunctorBase(const float variance_epsilon)
:variance_epsilon_(variance_epsilon){}
template<DeviceType D> float variance_epsilon_;
struct BatchNormFunctor {
void operator()(const float* input,
const float* scale,
const float* offset,
const float* mean,
const float* var,
const int n,
const int channel,
const int sample_size,
const float variance_epsilon,
float* output) ;
}; };
template<>
struct BatchNormFunctor<DeviceType::CPU> { template<DeviceType D, typename T>
void operator()(const float* input, struct BatchNormFunctor : public BatchNormFunctorBase<D, T> {
const float* scale, BatchNormFunctor(const float variance_epsilon)
const float* offset, :BatchNormFunctorBase<D, T>(variance_epsilon){}
const float* mean,
const float* var, void operator()(const T* input,
const int n, const T* scale,
const int channel, const T* offset,
const int sample_size, const T* mean,
const float variance_epsilon, const T* var,
float* output) { const TIndex n,
const TIndex channel,
const TIndex sample_size,
T* output) {
// Batch normalization in the paper https://arxiv.org/abs/1502.03167 . // Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
// The calculation formula for inference is // The calculation formula for inference is
// Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X + // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X +
...@@ -45,21 +41,22 @@ struct BatchNormFunctor<DeviceType::CPU> { ...@@ -45,21 +41,22 @@ struct BatchNormFunctor<DeviceType::CPU> {
// new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} } // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} }
// new_offset = \offset - mean * common_val; // new_offset = \offset - mean * common_val;
// Y = new_scale * X + new_offset; // Y = new_scale * X + new_offset;
float new_scale, new_offset; T new_scale, new_offset;
for (int c = 0; c < channel; ++c) { for (TIndex c = 0; c < channel; ++c) {
new_scale = scale[c] / std::sqrt(var[c] + variance_epsilon); new_scale = scale[c] / std::sqrt(var[c] + this->variance_epsilon_);
new_offset = offset[c] - mean[c] * new_scale; new_offset = offset[c] - mean[c] * new_scale;
for (int i = 0; i < n; ++i) { for (TIndex i = 0; i < n; ++i) {
int pos = i * channel * sample_size + c * sample_size; TIndex pos = i * channel * sample_size + c * sample_size;
const float* input_sample_ptr = input + pos; const T* input_sample_ptr = input + pos;
float* output_sample_ptr = output + pos; T* output_sample_ptr = output + pos;
for (int j = 0; j < sample_size; ++j) { for (TIndex j = 0; j < sample_size; ++j) {
output_sample_ptr[j] = new_scale * input_sample_ptr[j] + new_offset; output_sample_ptr[j] = new_scale * input_sample_ptr[j] + new_offset;
} }
} }
} }
} }
}; };
} // namepsace kernels } // namepsace kernels
......
...@@ -2,25 +2,27 @@ ...@@ -2,25 +2,27 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#if __ARM_NEON //#if __ARM_NEON
#include <arm_neon.h> #include <arm_neon.h>
#include "mace/kernels/batch_norm.h" #include "mace/kernels/batch_norm.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<> template <typename T>
struct BatchNormFunctor<DeviceType::NEON> { struct BatchNormFunctor<DeviceType::NEON> : public BatchNormFunctorBase<DeviceType::NEON, T> {
void operator()(const float* input, BatchNormFunctor(const float variance_epsilon)
const float* scale, :BatchNormFunctorBase<DeviceType::NEON, T>(variance_epsilon){}
const float* offset,
const float* mean, void operator()(const T* input,
const float* var, const T* scale,
const T* offset,
const T* mean,
const T* var,
const int n, const int n,
const int channel, const int channel,
const int sample_size, const int sample_size,
const float variance_epsilon, T* output) {
float* output) {
// Batch normalization in the paper https://arxiv.org/abs/1502.03167 . // Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
// The calculation formula for inference is // The calculation formula for inference is
...@@ -29,21 +31,21 @@ struct BatchNormFunctor<DeviceType::NEON> { ...@@ -29,21 +31,21 @@ struct BatchNormFunctor<DeviceType::NEON> {
// new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} } // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} }
// new_offset = \offset - mean * common_val; // new_offset = \offset - mean * common_val;
// Y = new_scale * X + new_offset; // Y = new_scale * X + new_offset;
float new_scale, new_offset; T new_scale, new_offset;
int count = sample_size >> 2; int count = sample_size >> 2;
int remain_count = sample_size - count; int remain_count = sample_size - count;
for (int c = 0; c < channel; ++c) { for (TIndex c = 0; c < channel; ++c) {
new_scale = scale[c] / std::sqrt(var[c] + variance_epsilon); new_scale = scale[c] / std::sqrt(var[c] + variance_epsilon_);
new_offset = offset[c] - mean[c] * new_scale; new_offset = offset[c] - mean[c] * new_scale;
float32x4_t new_scale_f = vdupq_n_f32(new_scale); float32x4_t new_scale_f = vdupq_n_f32(new_scale);
float32x4_t new_offset_f = vdupq_n_f32(new_offset); float32x4_t new_offset_f = vdupq_n_f32(new_offset);
for (int i = 0; i < n; ++i) { for (TIndex i = 0; i < n; ++i) {
int pos = i * channel * sample_size + c * sample_size; TIndex pos = i * channel * sample_size + c * sample_size;
const float* input_sample_ptr = input + pos; const float* input_sample_ptr = input + pos;
float* output_sample_ptr = output + pos; float* output_sample_ptr = output + pos;
for(int j = 0; j < count; ++j) { for(TIndex j = 0; j < count; ++j) {
float32x4_t input_f = vld1q_f32(input_sample_ptr); float32x4_t input_f = vld1q_f32(input_sample_ptr);
float32x4_t output_f = new_offset_f; float32x4_t output_f = new_offset_f;
output_f = vfmaq_f32(output_f, input_f, new_scale_f); output_f = vfmaq_f32(output_f, input_f, new_scale_f);
...@@ -51,7 +53,7 @@ struct BatchNormFunctor<DeviceType::NEON> { ...@@ -51,7 +53,7 @@ struct BatchNormFunctor<DeviceType::NEON> {
input_sample_ptr += 4; input_sample_ptr += 4;
output_sample_ptr += 4; output_sample_ptr += 4;
} }
for(int j = 0; j < remain_count; ++j) { for(TIndex j = 0; j < remain_count; ++j) {
*output_sample_ptr = new_scale * *input_sample_ptr + new_offset; *output_sample_ptr = new_scale * *input_sample_ptr + new_offset;
++output_sample_ptr; ++output_sample_ptr;
++input_sample_ptr; ++input_sample_ptr;
...@@ -63,4 +65,4 @@ struct BatchNormFunctor<DeviceType::NEON> { ...@@ -63,4 +65,4 @@ struct BatchNormFunctor<DeviceType::NEON> {
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
#endif // __ARM_NEON //#endif // __ARM_NEON
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
// //
#include "mace/ops/batch_norm.h" #include "mace/ops/batch_norm.h"
#include "mace/proto/mace.pb.h"
namespace mace { namespace mace {
......
...@@ -14,7 +14,8 @@ template<DeviceType D, class T> ...@@ -14,7 +14,8 @@ template<DeviceType D, class T>
class BatchNormOp : public Operator<D, T> { class BatchNormOp : public Operator<D, T> {
public: public:
BatchNormOp(const OperatorDef &operator_def, Workspace *ws) BatchNormOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {} : Operator<D, T>(operator_def, ws),
functor_(OperatorBase::GetSingleArgument<float>("variance_epsilon", 1e-4)){}
bool Run() override { bool Run() override {
const Tensor* input = this->Input(0); const Tensor* input = this->Input(0);
...@@ -23,8 +24,6 @@ class BatchNormOp : public Operator<D, T> { ...@@ -23,8 +24,6 @@ class BatchNormOp : public Operator<D, T> {
const Tensor* mean = this->Input(3); const Tensor* mean = this->Input(3);
const Tensor* var = this->Input(4); const Tensor* var = this->Input(4);
const float variance_epsilon = this->template GetSingleArgument<float>("variance_epsilon", 1e-4);
MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ", input->dim_size()); MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ", input->dim_size());
MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ", scale->dim_size()); MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ", scale->dim_size());
MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. ", offset->dim_size()); MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. ", offset->dim_size());
...@@ -34,9 +33,9 @@ class BatchNormOp : public Operator<D, T> { ...@@ -34,9 +33,9 @@ class BatchNormOp : public Operator<D, T> {
Tensor* output = this->Output(0); Tensor* output = this->Output(0);
output->ResizeLike(input); output->ResizeLike(input);
const int n = input->dim32(0); const TIndex n = input->dim(0);
const int channel = input->dim32(1); const TIndex channel = input->dim(1);
const int sample_size = input->dim32(2) * input->dim32(3); const TIndex sample_size = input->dim(2) * input->dim(3);
const float* input_ptr = input->data<float>(); const float* input_ptr = input->data<float>();
const float* scale_ptr = scale->data<float>(); const float* scale_ptr = scale->data<float>();
...@@ -45,11 +44,13 @@ class BatchNormOp : public Operator<D, T> { ...@@ -45,11 +44,13 @@ class BatchNormOp : public Operator<D, T> {
const float* var_ptr = var->data<float>(); const float* var_ptr = var->data<float>();
float* output_ptr = output->mutable_data<float>(); float* output_ptr = output->mutable_data<float>();
kernels::BatchNormFunctor<D>()(input_ptr, scale_ptr, offset_ptr, mean_ptr, var_ptr, functor_(input_ptr, scale_ptr, offset_ptr, mean_ptr, var_ptr,
n, channel, sample_size, n, channel, sample_size,
variance_epsilon, output_ptr); output_ptr);
return true; return true;
} }
private:
kernels::BatchNormFunctor<D, T> functor_;
}; };
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册