From 303311afb03eceed6f528c8e87a0650a14dcca83 Mon Sep 17 00:00:00 2001 From: liuqi Date: Wed, 6 Sep 2017 18:08:09 +0800 Subject: [PATCH] Change index type to TIndex and Add member variable to batch_norm kernel functor. --- mace/core/operator.h | 4 +- mace/core/tensor.h | 15 ++----- mace/kernels/batch_norm.h | 61 +++++++++++++--------------- mace/kernels/neon/batch_norm_neon.cc | 38 +++++++++-------- mace/ops/batch_norm.cc | 1 - mace/ops/batch_norm.h | 17 ++++---- 6 files changed, 64 insertions(+), 72 deletions(-) diff --git a/mace/core/operator.h b/mace/core/operator.h index ddf8fb2e..970404f6 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -44,8 +44,8 @@ class OperatorBase { *operator_def_, name, default_value); } - inline const Tensor *Input(int idx) { - MACE_CHECK(static_cast<size_t>(idx) < inputs_.size()); + inline const Tensor *Input(TIndex idx) { + MACE_CHECK(idx >= 0 && static_cast<size_t>(idx) < inputs_.size()); return inputs_[idx]; } diff --git a/mace/core/tensor.h b/mace/core/tensor.h index 2c45255d..77a44615 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -68,15 +68,8 @@ class Tensor { inline TIndex dim_size() const { return shape_.size(); } - inline int dim32(int index) const { - MACE_CHECK(static_cast<size_t>(index) < shape_.size(), "Exceeding ndim limit"); - MACE_CHECK(index >= 0, "Cannot have negative dimension index"); - MACE_CHECK(shape_[index], std::numeric_limits::max()); - return static_cast(shape_[index]); - } - - inline TIndex dim(int index) const { - MACE_CHECK(static_cast<size_t>(index) < shape_.size(), "Exceeding ndim limit"); + inline TIndex dim(TIndex index) const { + MACE_CHECK(static_cast<size_t>(index) < shape_.size(), "Exceeding ndim limit"); MACE_CHECK(index >= 0, "Cannot have negative dimension index"); return shape_[index]; } @@ -133,8 +126,8 @@ class Tensor { } template - inline void Copy(const T* src, size_t size) { - MACE_CHECK(static_cast(size) == size_, "copy src and dst with different size."); + inline void Copy(const T* src, TIndex size) { + MACE_CHECK(size == size_, "copy 
src and dst with different size."); CopyBytes(static_cast(src), sizeof(T) * size); } diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index fd405fa5..d81fdae9 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -11,33 +11,29 @@ namespace mace { namespace kernels { +template +struct BatchNormFunctorBase { + BatchNormFunctorBase(const float variance_epsilon) + :variance_epsilon_(variance_epsilon){} -template -struct BatchNormFunctor { - void operator()(const float* input, - const float* scale, - const float* offset, - const float* mean, - const float* var, - const int n, - const int channel, - const int sample_size, - const float variance_epsilon, - float* output) ; + float variance_epsilon_; }; -template<> -struct BatchNormFunctor { - void operator()(const float* input, - const float* scale, - const float* offset, - const float* mean, - const float* var, - const int n, - const int channel, - const int sample_size, - const float variance_epsilon, - float* output) { + +template +struct BatchNormFunctor : public BatchNormFunctorBase { + BatchNormFunctor(const float variance_epsilon) + :BatchNormFunctorBase(variance_epsilon){} + + void operator()(const T* input, + const T* scale, + const T* offset, + const T* mean, + const T* var, + const TIndex n, + const TIndex channel, + const TIndex sample_size, + T* output) { // Batch normalization in the paper https://arxiv.org/abs/1502.03167 . 
// The calculation formula for inference is // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X + @@ -45,21 +41,22 @@ struct BatchNormFunctor { // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} } // new_offset = \offset - mean * common_val; // Y = new_scale * X + new_offset; - float new_scale, new_offset; - for (int c = 0; c < channel; ++c) { - new_scale = scale[c] / std::sqrt(var[c] + variance_epsilon); + T new_scale, new_offset; + for (TIndex c = 0; c < channel; ++c) { + new_scale = scale[c] / std::sqrt(var[c] + this->variance_epsilon_); new_offset = offset[c] - mean[c] * new_scale; - for (int i = 0; i < n; ++i) { - int pos = i * channel * sample_size + c * sample_size; - const float* input_sample_ptr = input + pos; - float* output_sample_ptr = output + pos; - for (int j = 0; j < sample_size; ++j) { + for (TIndex i = 0; i < n; ++i) { + TIndex pos = i * channel * sample_size + c * sample_size; + const T* input_sample_ptr = input + pos; + T* output_sample_ptr = output + pos; + for (TIndex j = 0; j < sample_size; ++j) { output_sample_ptr[j] = new_scale * input_sample_ptr[j] + new_offset; } } } } + }; } // namepsace kernels diff --git a/mace/kernels/neon/batch_norm_neon.cc b/mace/kernels/neon/batch_norm_neon.cc index 2fbf6ece..d307173f 100644 --- a/mace/kernels/neon/batch_norm_neon.cc +++ b/mace/kernels/neon/batch_norm_neon.cc @@ -2,25 +2,27 @@ // Copyright (c) 2017 XiaoMi All rights reserved. 
// -#if __ARM_NEON +#if __ARM_NEON #include <arm_neon.h> #include "mace/kernels/batch_norm.h" namespace mace { namespace kernels { -template<> -struct BatchNormFunctor { - void operator()(const float* input, - const float* scale, - const float* offset, - const float* mean, - const float* var, +template +struct BatchNormFunctor : public BatchNormFunctorBase { + BatchNormFunctor(const float variance_epsilon) + :BatchNormFunctorBase(variance_epsilon){} + + void operator()(const T* input, + const T* scale, + const T* offset, + const T* mean, + const T* var, const int n, const int channel, const int sample_size, - const float variance_epsilon, - float* output) { + T* output) { // Batch normalization in the paper https://arxiv.org/abs/1502.03167 . // The calculation formula for inference is @@ -29,21 +31,21 @@ struct BatchNormFunctor { // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} } // new_offset = \offset - mean * common_val; // Y = new_scale * X + new_offset; - float new_scale, new_offset; + T new_scale, new_offset; int count = sample_size >> 2; int remain_count = sample_size - count; - for (int c = 0; c < channel; ++c) { - new_scale = scale[c] / std::sqrt(var[c] + variance_epsilon); + for (TIndex c = 0; c < channel; ++c) { + new_scale = scale[c] / std::sqrt(var[c] + this->variance_epsilon_); new_offset = offset[c] - mean[c] * new_scale; float32x4_t new_scale_f = vdupq_n_f32(new_scale); float32x4_t new_offset_f = vdupq_n_f32(new_offset); - for (int i = 0; i < n; ++i) { - int pos = i * channel * sample_size + c * sample_size; + for (TIndex i = 0; i < n; ++i) { + TIndex pos = i * channel * sample_size + c * sample_size; const float* input_sample_ptr = input + pos; float* output_sample_ptr = output + pos; - for(int j = 0; j < count; ++j) { + for(TIndex j = 0; j < count; ++j) { float32x4_t input_f = vld1q_f32(input_sample_ptr); float32x4_t output_f = new_offset_f; output_f = vfmaq_f32(output_f, input_f, new_scale_f); @@ -51,7 +53,7 @@ struct BatchNormFunctor { 
input_sample_ptr += 4; output_sample_ptr += 4; } - for(int j = 0; j < remain_count; ++j) { + for(TIndex j = 0; j < remain_count; ++j) { *output_sample_ptr = new_scale * *input_sample_ptr + new_offset; ++output_sample_ptr; ++input_sample_ptr; @@ -63,4 +65,4 @@ struct BatchNormFunctor { } // namespace kernels } // namespace mace -#endif // __ARM_NEON +#endif // __ARM_NEON diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index 09d0e300..9a48b669 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -3,7 +3,6 @@ // #include "mace/ops/batch_norm.h" -#include "mace/proto/mace.pb.h" namespace mace { diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h index 2b4fad42..8a3c01b4 100644 --- a/mace/ops/batch_norm.h +++ b/mace/ops/batch_norm.h @@ -14,7 +14,8 @@ template class BatchNormOp : public Operator { public: BatchNormOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws) {} + : Operator(operator_def, ws), + functor_(OperatorBase::GetSingleArgument<float>("variance_epsilon", 1e-4)){} bool Run() override { const Tensor* input = this->Input(0); @@ -23,8 +24,6 @@ class BatchNormOp : public Operator { const Tensor* mean = this->Input(3); const Tensor* var = this->Input(4); - const float variance_epsilon = this->template GetSingleArgument<float>("variance_epsilon", 1e-4); - MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ", input->dim_size()); MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ", scale->dim_size()); MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. 
", offset->dim_size()); @@ -34,9 +33,9 @@ class BatchNormOp : public Operator { Tensor* output = this->Output(0); output->ResizeLike(input); - const int n = input->dim32(0); - const int channel = input->dim32(1); - const int sample_size = input->dim32(2) * input->dim32(3); + const TIndex n = input->dim(0); + const TIndex channel = input->dim(1); + const TIndex sample_size = input->dim(2) * input->dim(3); const float* input_ptr = input->data(); const float* scale_ptr = scale->data(); @@ -45,11 +44,13 @@ class BatchNormOp : public Operator { const float* var_ptr = var->data(); float* output_ptr = output->mutable_data(); - kernels::BatchNormFunctor()(input_ptr, scale_ptr, offset_ptr, mean_ptr, var_ptr, + functor_(input_ptr, scale_ptr, offset_ptr, mean_ptr, var_ptr, n, channel, sample_size, - variance_epsilon, output_ptr); + output_ptr); return true; } + private: + kernels::BatchNormFunctor functor_; }; -- GitLab