diff --git a/mace/core/BUILD b/mace/core/BUILD
index 218fd1bd2354a9936d243764c9cdb113389ba817..7f974ba4e89aa14085fde004eb8da1413d4de124 100644
--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -7,6 +7,8 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
+load("//mace:mace.bzl", "if_android")
+
 cc_library(
     name = "core",
     srcs = glob([
@@ -19,6 +21,10 @@ cc_library(
     deps = [
         "//mace/proto:cc_proto",
     ],
+    linkopts = if_android([
+        "-llog",
+        "-pie",
+    ]),
 )
 
 # Main program for tests
diff --git a/mace/core/common.h b/mace/core/common.h
index c2c2931660275634f894948927a3c1dd7e909204..e5e07225ab1b21165fe7b3b7f2ca809824a7e740 100644
--- a/mace/core/common.h
+++ b/mace/core/common.h
@@ -12,7 +12,6 @@
 #include <vector>
 #include <algorithm>
 
-#include "mace/core/integral_types.h"
 #include "mace/core/logging.h"
 
 using std::set;
@@ -21,7 +20,7 @@ using std::string;
 using std::unique_ptr;
 using std::vector;
 
-typedef int64 TIndex;
+typedef int64_t index_t;
 
 // Disable the copy and assignment operator for a class.
 #ifndef DISABLE_COPY_AND_ASSIGN
diff --git a/mace/core/integral_types.h b/mace/core/integral_types.h
deleted file mode 100644
index 72298201ef68403a93dbdb4d41087ad0f669e7a7..0000000000000000000000000000000000000000
--- a/mace/core/integral_types.h
+++ /dev/null
@@ -1,19 +0,0 @@
-//
-// Copyright (c) 2017 XiaoMi All rights reserved.
-//
-
-
-#ifndef MACE_CORE_INTEGRAL_TYPES_H_
-#define MACE_CORE_INTEGRAL_TYPES_H_
-
-typedef int8_t int8;
-typedef int16_t int16;
-typedef int32_t int32;
-typedef int64_t int64;
-
-typedef uint8_t uint8;
-typedef uint16_t uint16;
-typedef uint32_t uint32;
-typedef uint64_t uint64;
-
-#endif // MACE_CORE_INTEGRAL_TYPES_H_
diff --git a/mace/core/logging.cc b/mace/core/logging.cc
index 5e0982d58e5d38fa1117b9d35ba2bec8a55dc092..f01d0980241187b2fcc2acb829e3c4b79f30b8d4 100644
--- a/mace/core/logging.cc
+++ b/mace/core/logging.cc
@@ -69,18 +69,18 @@ void LogMessage::GenerateLogMessage() {
 
 namespace {
 
-// Parse log level (int64) from environment variable (char*)
-int64 LogLevelStrToInt(const char* tf_env_var_val) {
-  if (tf_env_var_val == nullptr) {
+// Parse log level (int64_t) from environment variable (char*)
+int64_t LogLevelStrToInt(const char* mace_env_var_val) {
+  if (mace_env_var_val == nullptr) {
     return 0;
   }
 
   // Ideally we would use env_var / safe_strto64, but it is
   // hard to use here without pulling in a lot of dependencies,
   // so we use std:istringstream instead
-  string min_log_level(tf_env_var_val);
+  string min_log_level(mace_env_var_val);
   std::istringstream ss(min_log_level);
-  int64 level;
+  int64_t level;
   if (!(ss >> level)) {
     // Invalid vlog level setting, set level to default (0)
     level = 0;
@@ -89,26 +89,26 @@ int64 LogLevelStrToInt(const char* tf_env_var_val) {
   return level;
 }
 
-int64 MinLogLevelFromEnv() {
-  const char* tf_env_var_val = getenv("MACE_CPP_MIN_LOG_LEVEL");
-  return LogLevelStrToInt(tf_env_var_val);
+int64_t MinLogLevelFromEnv() {
+  const char* mace_env_var_val = getenv("MACE_CPP_MIN_LOG_LEVEL");
+  return LogLevelStrToInt(mace_env_var_val);
 }
 
-int64 MinVLogLevelFromEnv() {
-  const char* tf_env_var_val = getenv("MACE_CPP_MIN_VLOG_LEVEL");
-  return LogLevelStrToInt(tf_env_var_val);
+int64_t MinVLogLevelFromEnv() {
+  const char* mace_env_var_val = getenv("MACE_CPP_MIN_VLOG_LEVEL");
+  return LogLevelStrToInt(mace_env_var_val);
 }
 
 }  // namespace
 
 LogMessage::~LogMessage() {
   // Read the min log level once during the first call to logging.
-  static int64 min_log_level = MinLogLevelFromEnv();
+  static int64_t min_log_level = MinLogLevelFromEnv();
   if (severity_ >= min_log_level) GenerateLogMessage();
 }
 
-int64 LogMessage::MinVLogLevel() {
-  static int64 min_vlog_level = MinVLogLevelFromEnv();
+int64_t LogMessage::MinVLogLevel() {
+  static int64_t min_vlog_level = MinVLogLevelFromEnv();
   return min_vlog_level;
 }
 
diff --git a/mace/core/logging.h b/mace/core/logging.h
index c613a87d640618d689d6aadf04782f40a8172011..0787af3383d91074ae60c214f096923b8fc891d9 100644
--- a/mace/core/logging.h
+++ b/mace/core/logging.h
@@ -9,8 +9,6 @@
 #include <limits>
 #include <string>
 
-#include "mace/core/integral_types.h"
-
 #undef ERROR
 
 namespace mace {
@@ -62,7 +60,7 @@ class LogMessage : public std::basic_ostringstream<char> {
   // Returns the minimum log level for VLOG statements.
   // E.g., if MinVLogLevel() is 2, then VLOG(2) statements will produce output,
   // but VLOG(3) will not. Defaults to 0.
-  static int64 MinVLogLevel();
+  static int64_t MinVLogLevel();
 
  protected:
   void GenerateLogMessage();
diff --git a/mace/core/net.cc b/mace/core/net.cc
index 22956876262c5b16a042e9e1289e6fd4fc73dca6..a8f1f80e17e2f150ff5b9dda32df3c03e97e8014 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -37,6 +37,7 @@ bool SimpleNet::Run() {
       return false;
     }
   }
+  return true;
 }
 
 unique_ptr<NetBase> CreateNet(const NetDef& net_def,
diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index 2e5086ac222a70503bf655ff9d92557369beccb4..a755577b65b7d3c5c80dd1da50b6dd4d256bccf8 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -6,8 +6,8 @@
 
 namespace mace {
 
-std::map<int32, OperatorRegistry*>* gDeviceTypeRegistry() {
-  static std::map<int32, OperatorRegistry*> g_device_type_registry;
+std::map<int32_t, OperatorRegistry*>* gDeviceTypeRegistry() {
+  static std::map<int32_t, OperatorRegistry*> g_device_type_registry;
   return &g_device_type_registry;
 }
 
diff --git a/mace/core/operator.h b/mace/core/operator.h
index fc883855f4e7eb6f72a6c9340fb54584bcad3afe..df488691bac3fd8de6aea9c98a7175f80b50f41e 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -44,7 +44,7 @@ class OperatorBase {
         *operator_def_, name, default_value);
   }
 
-  inline const Tensor *Input(int idx) {
+  inline const Tensor *Input(index_t idx) {
     MACE_CHECK(idx < inputs_.size());
     return inputs_[idx];
   }
diff --git a/mace/core/serializer.cc b/mace/core/serializer.cc
index 310e76299db02eb8dae9fa2032f65a5cccd1c6e2..3e80e545b2a0aa23eb26906f588c9713beba046e 100644
--- a/mace/core/serializer.cc
+++ b/mace/core/serializer.cc
@@ -17,8 +17,8 @@ unique_ptr<Tensor> Serializer::Deserialize(const TensorProto &proto,
                                            DeviceType type) {
   unique_ptr<Tensor> tensor(new Tensor(GetDeviceAllocator(type),
                                        proto.data_type()));
-  vector<TIndex> dims;
-  for (const TIndex d : proto.dims()) {
+  vector<index_t> dims;
+  for (const index_t d : proto.dims()) {
     dims.push_back(d);
   }
   tensor->Resize(dims);
@@ -33,31 +33,31 @@ unique_ptr<Tensor> Serializer::Deserialize(const TensorProto &proto,
                            proto.double_data().size());
       break;
     case DT_INT32:
-      tensor->template Copy<int32>(proto.int32_data().data(),
+      tensor->template Copy<int32_t>(proto.int32_data().data(),
                                    proto.int32_data().size());
       break;
     case DT_UINT8:
-      tensor->CopyWithCast<int32, uint8>(proto.int32_data().data(),
+      tensor->CopyWithCast<int32_t, uint8_t>(proto.int32_data().data(),
                                          proto.int32_data().size());
       break;
     case DT_INT16:
-      tensor->CopyWithCast<int32, int16>(proto.int32_data().data(),
+      tensor->CopyWithCast<int32_t, int16_t>(proto.int32_data().data(),
                                          proto.int32_data().size());
       break;
     case DT_INT8:
-      tensor->CopyWithCast<int32, int8>(proto.int32_data().data(),
+      tensor->CopyWithCast<int32_t, int8_t>(proto.int32_data().data(),
                                         proto.int32_data().size());
       break;
     case DT_INT64:
-      tensor->Copy<int64>(proto.int64_data().data(),
+      tensor->Copy<int64_t>(proto.int64_data().data(),
                           proto.int64_data().size());
       break;
     case DT_UINT16:
-      tensor->CopyWithCast<int32, uint16>(proto.int32_data().data(),
+      tensor->CopyWithCast<int32_t, uint16_t>(proto.int32_data().data(),
                                           proto.int32_data().size());
       break;
     case DT_BOOL:
-      tensor->CopyWithCast<int32, bool>(proto.int32_data().data(),
+      tensor->CopyWithCast<int32_t, bool>(proto.int32_data().data(),
                                         proto.int32_data().size());
       break;
     case DT_STRING: {
diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index 7aea8b1567d74d9d5c4e8f3f809de174e599e612..1af32d3f2338f344332375d9cb67cbe23a4f119d 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -25,13 +25,13 @@ namespace mace {
   switch (TYPE_ENUM) {                                         \
     CASE(float, SINGLE_ARG(STMTS))                             \
     CASE(double, SINGLE_ARG(STMTS))                            \
-    CASE(int32, SINGLE_ARG(STMTS))                             \
-    CASE(uint8, SINGLE_ARG(STMTS))                             \
-    CASE(uint16, SINGLE_ARG(STMTS))                            \
-    CASE(int16, SINGLE_ARG(STMTS))                             \
-    CASE(int8, SINGLE_ARG(STMTS))                              \
+    CASE(int32_t, SINGLE_ARG(STMTS))                             \
+    CASE(uint8_t, SINGLE_ARG(STMTS))                             \
+    CASE(uint16_t, SINGLE_ARG(STMTS))                            \
+    CASE(int16_t, SINGLE_ARG(STMTS))                             \
+    CASE(int8_t, SINGLE_ARG(STMTS))                              \
     CASE(string, SINGLE_ARG(STMTS))                            \
-    CASE(int64, SINGLE_ARG(STMTS))                             \
+    CASE(int64_t, SINGLE_ARG(STMTS))                             \
     CASE(bool, SINGLE_ARG(STMTS))                              \
     case DT_INVALID:                                           \
       INVALID;                                                 \
@@ -64,11 +64,17 @@ class Tensor {
 
   inline DataType dtype() const { return dtype_; }
 
-  inline const vector<TIndex>& shape() const { return shape_; }
+  inline const vector<index_t>& shape() const { return shape_; }
 
-  inline TIndex dim_size() { return shape_.size(); }
+  inline index_t dim_size() const { return shape_.size(); }
 
-  inline TIndex size() const { return size_; }
+  inline index_t dim(index_t index) const {
+    MACE_CHECK(index < shape_.size(), "Exceeding ndim limit");
+    MACE_CHECK(index >= 0, "Cannot have negative dimension index");
+    return shape_[index];
+  }
+
+  inline index_t size() const { return size_; }
 
   inline const void* raw_data() const {
     MACE_CHECK(data_.get() || size_ == 0);
@@ -102,9 +108,9 @@ class Tensor {
     return static_cast<T*>(raw_mutable_data());
   }
 
-  inline void Resize(const vector<TIndex>& shape) {
+  inline void Resize(const vector<index_t>& shape) {
     shape_ = shape;
-    TIndex size = NumElements();
+    index_t size = NumElements();
     if (size_ != size) {
       size_ = size;
       data_.reset();
@@ -120,16 +126,16 @@ class Tensor {
   }
 
   template <typename T>
-  inline void Copy(const T* src, size_t size) {
+  inline void Copy(const T* src, index_t size) {
     MACE_CHECK(size == size_, "copy src and dst with different size.");
     CopyBytes(static_cast<const void*>(src), sizeof(T) * size);
   }
 
   template <typename SrcType, typename DstType>
   inline void CopyWithCast(const SrcType* src, size_t size) {
-    MACE_CHECK(size == size_, "copy src and dst with different size.");
+    MACE_CHECK(static_cast<index_t>(size) == size_, "copy src and dst with different size.");
     unique_ptr<DstType[]> buffer(new DstType[size]);
-    for (int i = 0; i < size; ++i) {
+    for (size_t i = 0; i < size; ++i) {
       buffer[i] = static_cast<DstType>(src[i]);
     }
     CopyBytes(static_cast<const void*>(buffer.get()), sizeof(DstType) * size);
@@ -155,15 +161,15 @@ class Tensor {
   }
 
  private:
-  inline int64 NumElements() const {
-    return std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies<int64>());
+  inline int64_t NumElements() const {
+    return std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies<int64_t>());
   }
 
   Allocator* alloc_;
-  TIndex size_;
+  index_t size_;
   DataType dtype_;
   std::shared_ptr<void> data_;
-  vector<TIndex> shape_;
+  vector<index_t> shape_;
 };
 
 } // namespace tensor
diff --git a/mace/core/testing/env_time.h b/mace/core/testing/env_time.h
index 6be189a658ab489fdf59fcc4f666c71574ad468b..f07783c1f66e4551886276e30796001ae1fc1a52 100644
--- a/mace/core/testing/env_time.h
+++ b/mace/core/testing/env_time.h
@@ -16,10 +16,10 @@ namespace mace {
 
 namespace testing {
 
-inline int64 NowMicros() {
+inline int64_t NowMicros() {
   struct timeval tv;
   gettimeofday(&tv, nullptr);
-  return static_cast<int64>(tv.tv_sec) * 1000000 + tv.tv_usec;
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
 }
 
 }  // namespace testing
diff --git a/mace/core/testing/test_benchmark.cc b/mace/core/testing/test_benchmark.cc
index 1eb976ec998833ffcf436dcf4744566d460f168f..885a9a63f70956428008291f29dc293245c7d37a 100644
--- a/mace/core/testing/test_benchmark.cc
+++ b/mace/core/testing/test_benchmark.cc
@@ -16,10 +16,10 @@ namespace testing {
 
 static std::vector<Benchmark*>* all_benchmarks = nullptr;
 static std::string label;
-static int64 bytes_processed;
-static int64 items_processed;
-static int64 accum_time = 0;
-static int64 start_time = 0;
+static int64_t bytes_processed;
+static int64_t items_processed;
+static int64_t accum_time = 0;
+static int64_t start_time = 0;
 
 Benchmark::Benchmark(const char* name, void (*fn)(int))
     : name_(name), num_args_(0), fn0_(fn) {
@@ -112,10 +112,10 @@ void Benchmark::Register() {
 }
 
 void Benchmark::Run(int arg1, int arg2, int* run_count, double* run_seconds) {
-  static const int64 kMinIters = 100;
-  static const int64 kMaxIters = 1000000000;
+  static const int64_t kMinIters = 100;
+  static const int64_t kMaxIters = 1000000000;
   static const double kMinTime = 0.5;
-  int64 iters = kMinIters;
+  int64_t iters = kMinIters;
   while (true) {
     accum_time = 0;
     start_time = NowMicros();
@@ -142,13 +142,13 @@ void Benchmark::Run(int arg1, int arg2, int* run_count, double* run_seconds) {
     double multiplier = 1.4 * kMinTime / std::max(seconds, 1e-9);
     multiplier = std::min(10.0, multiplier);
     if (multiplier <= 1.0) multiplier *= 2.0;
-    iters = std::max<int64>(multiplier * iters, iters + 1);
+    iters = std::max<int64_t>(multiplier * iters, iters + 1);
     iters = std::min(iters, kMaxIters);
   }
 }
 
-void BytesProcessed(int64 n) { bytes_processed = n; }
-void ItemsProcessed(int64 n) { items_processed = n; }
+void BytesProcessed(int64_t n) { bytes_processed = n; }
+void ItemsProcessed(int64_t n) { items_processed = n; }
 void StartTiming() {
   if (start_time == 0) start_time = NowMicros();
 }
diff --git a/mace/core/testing/test_benchmark.h b/mace/core/testing/test_benchmark.h
index 44a352f54df4cae609b3955eb5343c9b78d34126..5800f5edb0912899b09fc95ebebb8a741e2a48e1 100644
--- a/mace/core/testing/test_benchmark.h
+++ b/mace/core/testing/test_benchmark.h
@@ -42,8 +42,8 @@ class Benchmark {
 };
 
 void RunBenchmarks();
-void BytesProcessed(int64);
-void ItemsProcessed(int64);
+void BytesProcessed(int64_t);
+void ItemsProcessed(int64_t);
 void StartTiming();
 void StopTiming();
 
diff --git a/mace/core/types.h b/mace/core/types.h
index 161be5a7103a9c8c69be3932b9997e7dbee51124..b174993d024587875d6b597cdcb7a19f9d79d154 100644
--- a/mace/core/types.h
+++ b/mace/core/types.h
@@ -42,16 +42,16 @@ struct EnumToDataType {};  // Specializations below
 
 MATCH_TYPE_AND_ENUM(float, DT_FLOAT);
 MATCH_TYPE_AND_ENUM(double, DT_DOUBLE);
-MATCH_TYPE_AND_ENUM(int32, DT_INT32);
-MATCH_TYPE_AND_ENUM(uint16, DT_UINT16);
-MATCH_TYPE_AND_ENUM(uint8, DT_UINT8);
-MATCH_TYPE_AND_ENUM(int16, DT_INT16);
-MATCH_TYPE_AND_ENUM(int8, DT_INT8);
+MATCH_TYPE_AND_ENUM(int32_t, DT_INT32);
+MATCH_TYPE_AND_ENUM(uint16_t, DT_UINT16);
+MATCH_TYPE_AND_ENUM(uint8_t, DT_UINT8);
+MATCH_TYPE_AND_ENUM(int16_t, DT_INT16);
+MATCH_TYPE_AND_ENUM(int8_t, DT_INT8);
 MATCH_TYPE_AND_ENUM(string, DT_STRING);
-MATCH_TYPE_AND_ENUM(int64, DT_INT64);
+MATCH_TYPE_AND_ENUM(int64_t, DT_INT64);
 MATCH_TYPE_AND_ENUM(bool, DT_BOOL);
 
-static const int32 kint32max = ((int32)0x7FFFFFFF);
+static const int32_t kint32_tmax = ((int32_t)0x7FFFFFFF);
 
 } // namespace mace
 
diff --git a/mace/examples/BUILD b/mace/examples/BUILD
index 4f4a7794e0cb00f7d8312299dd7572afd74e68d6..82915d74e05e02c0bbddc04163d7c4e53f12f22b 100644
--- a/mace/examples/BUILD
+++ b/mace/examples/BUILD
@@ -7,10 +7,6 @@ cc_binary(
         "helloworld.cc",
     ],
     copts = ["-std=c++11"],
-    linkopts = if_android([
-        "-pie",
-        "-llog",
-    ]),
     deps = [
         "//mace/core",
         "//mace/ops",
@@ -21,10 +17,6 @@ cc_test(
     name = "benchmark_example",
     srcs = ["benchmark_example.cc"],
     copts = ["-std=c++11"],
-    linkopts = if_android([
-        "-pie",
-        "-llog",
-    ]),
     linkstatic = 1,
     deps = [
         "//mace/core",
diff --git a/mace/examples/benchmark_example.cc b/mace/examples/benchmark_example.cc
index 106c6c3c4ccdf7dcc091a6c9f9bbc8c0c15d2611..50e5184b0bf384d81932466584cdbc688db40a21 100644
--- a/mace/examples/benchmark_example.cc
+++ b/mace/examples/benchmark_example.cc
@@ -6,7 +6,7 @@
 
 static void foo(int iters) {
   static const int N = 32;
-  const int64 tot = static_cast<int64>(iters) * N;
+  const int64_t tot = static_cast<int64_t>(iters) * N;
   mace::testing::ItemsProcessed(tot);
   mace::testing::BytesProcessed(tot * (sizeof(float)));
 
@@ -26,7 +26,7 @@ BENCHMARK(foo);
 
 
 static void bar(int iters, int n) {
-  const int64 tot = static_cast<int64>(iters) * n;
+  const int64_t tot = static_cast<int64_t>(iters) * n;
   mace::testing::ItemsProcessed(tot);
   mace::testing::BytesProcessed(tot * (sizeof(float)));
 
diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD
index de8293e35421ac29031db5281d162de3999efb78..098e80a949d457406730e0f0146b45c03b75faee 100644
--- a/mace/kernels/BUILD
+++ b/mace/kernels/BUILD
@@ -18,6 +18,9 @@ cc_library(
         "//mace/core:core",
     ],
     copts = ['-std=c++11'],
+    linkopts = ["-fopenmp"] + if_android([
+        "-lm",
+        ]),
 )
 
 cc_test(
@@ -29,11 +32,9 @@ cc_test(
         "//mace/core:core",
     ],
     copts = ['-std=c++11'],
-    linkopts = ["-fopenmp"] + if_android([
+    linkopts = if_android([
         "-pie",
-        "-llog",
-        "-lm",
-    ]),
+        ]),
     linkstatic = 1,
     testonly = 1,
 )
@@ -47,11 +48,6 @@ cc_test(
         "//mace/core:test_benchmark_main",
     ],
     copts = ['-std=c++11'],
-    linkopts = ["-fopenmp"] + if_android([
-        "-pie",
-        "-llog",
-        "-lm",
-    ]),
     linkstatic = 1,
     testonly = 1,
 )
diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h
index 3f79ac69b468c623acbe2cdb6d9179bbe3906bda..30648eb8a15186198ec8b2c9fb98c04695bf4366 100644
--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -15,7 +15,7 @@ void AddNFuntion(const vector<const Tensor*>& input_tensor, Tensor *output_tenso
   int n = input_tensor.size();
   MACE_CHECK(n > 1);
   MACE_CHECK_NOTNULL(input_tensor[0]);
-  int64 size = input_tensor[0]->size();
+  int64_t size = input_tensor[0]->size();
   vector<const T*> inputs(n);
   for (int i = 0; i < n; ++i) {
     inputs[i] = input_tensor[i]->data<T>();
@@ -24,7 +24,7 @@ void AddNFuntion(const vector<const Tensor*>& input_tensor, Tensor *output_tenso
   T* output = output_tensor->mutable_data<T>();
 
   for (int i = 0; i < n; ++i) {
-    for (int64 j = 0; j < size; ++j) {
+    for (int64_t j = 0; j < size; ++j) {
       output[j] += inputs[i][j];
     }
   }
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
new file mode 100644
index 0000000000000000000000000000000000000000..84ca48d4a76bc477258ce0d9ec152d5f313709a9
--- /dev/null
+++ b/mace/kernels/batch_norm.h
@@ -0,0 +1,66 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_KERNELS_BATCH_NORM_H_
+#define MACE_KERNELS_BATCH_NORM_H_
+
+#include "mace/core/tensor.h"
+#include "mace/proto/mace.pb.h"
+
+namespace mace {
+namespace kernels {
+
+template <DeviceType D, typename T>
+struct BatchNormFunctorBase {
+  BatchNormFunctorBase(const float variance_epsilon)
+          :variance_epsilon_(variance_epsilon){}
+
+  float variance_epsilon_;
+};
+
+
+template<DeviceType D, typename T>
+struct BatchNormFunctor : public BatchNormFunctorBase<D, T> {
+  BatchNormFunctor(const float variance_epsilon)
+          :BatchNormFunctorBase<D, T>(variance_epsilon){}
+
+  void operator()(const T* input,
+                  const T* scale,
+                  const T* offset,
+                  const T* mean,
+                  const T* var,
+                  const index_t n,
+                  const index_t channel,
+                  const index_t sample_size,
+                  T* output) {
+    // Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
+    // The calculation formula for inference is
+    // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X +
+    //          ( \offset - \frac { \scale * mean } { \sqrt{var+\variance_epsilon} }
+    // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} }
+    // new_offset = \offset - mean * common_val;
+    // Y = new_scale * X + new_offset;
+    T new_scale, new_offset;
+    for (index_t c = 0; c < channel; ++c) {
+      new_scale = scale[c] / std::sqrt(var[c] + this->variance_epsilon_);
+      new_offset = offset[c] - mean[c] * new_scale;
+      index_t pos = c * sample_size;
+
+      for (index_t i = 0; i < n; ++i) {
+        const T* input_sample_ptr = input + pos;
+        T* output_sample_ptr = output + pos;
+        for (index_t j = 0; j < sample_size; ++j) {
+          output_sample_ptr[j] = new_scale * input_sample_ptr[j] + new_offset;
+        }
+        pos += channel * sample_size;
+      }
+    }
+  }
+
+};
+
+} //  namepsace kernels
+} //  namespace mace
+
+#endif //  MACE_KERNELS_BATCH_NORM_H_
diff --git a/mace/kernels/benchmark/addn_benchmark.cc b/mace/kernels/benchmark/addn_benchmark.cc
index f63fed77b11847f3aacca8291c333699b0bd840a..4cec0270dbc6e1d9f55eb0db404965d9d1f1088e 100644
--- a/mace/kernels/benchmark/addn_benchmark.cc
+++ b/mace/kernels/benchmark/addn_benchmark.cc
@@ -11,7 +11,7 @@ using namespace mace;
 using namespace mace::kernels;
 
 static void AddNBenchmark(int iters, int n, int type) {
-  const int64 tot = static_cast<int64>(iters) * n * 3;
+  const int64_t tot = static_cast<int64_t>(iters) * n * 3;
   mace::testing::ItemsProcessed(tot);
   mace::testing::BytesProcessed(tot * (sizeof(float)));
 
@@ -35,7 +35,7 @@ static void AddNBenchmark(int iters, int n, int type) {
   float *input3 = input_tensor3.mutable_data<float>();
   float *output = output_tensor.mutable_data<float>();
 
-  for (int64 i = 0; i < n; ++i) {
+  for (int64_t i = 0; i < n; ++i) {
     input1[i] = nd(gen);
     input2[i] = nd(gen);
     input3[i] = nd(gen);
diff --git a/mace/kernels/benchmark/relu_benchmark.cc b/mace/kernels/benchmark/relu_benchmark.cc
index 9276cadc737bba60a0fac81893dd5aa797d3f6a9..86858681ca29518f6ed98e46f58794d82c984057 100644
--- a/mace/kernels/benchmark/relu_benchmark.cc
+++ b/mace/kernels/benchmark/relu_benchmark.cc
@@ -11,7 +11,7 @@ using namespace mace;
 using namespace mace::kernels;
 
 static void ReluBenchmark(int iters, int n, int type) {
-  const int64 tot = static_cast<int64>(iters) * n;
+  const int64_t tot = static_cast<int64_t>(iters) * n;
   mace::testing::ItemsProcessed(tot);
   mace::testing::BytesProcessed(tot * (sizeof(float)));
 
@@ -25,7 +25,7 @@ static void ReluBenchmark(int iters, int n, int type) {
   output_tensor.ResizeLike(input_tensor);
   float *input = input_tensor.mutable_data<float>();
   float *output = output_tensor.mutable_data<float>();
-  for (int64 i = 0; i < n; ++i) {
+  for (int64_t i = 0; i < n; ++i) {
     input[i] = nd(gen);
   }
 
diff --git a/mace/kernels/neon/addn_neon.cc b/mace/kernels/neon/addn_neon.cc
index 3baab3c3b3dadb8570e0f7b4830fd9c14c1799fa..ad6f06e8df7c17dc189316a20be3be5586a212e6 100644
--- a/mace/kernels/neon/addn_neon.cc
+++ b/mace/kernels/neon/addn_neon.cc
@@ -14,7 +14,7 @@ void NeonAddNFuntion_float(const vector<const Tensor *> &input_tensor,
   int n = input_tensor.size();
   MACE_CHECK(n > 1);
   MACE_CHECK_NOTNULL(input_tensor[0]);
-  int64 size = input_tensor[0]->size();
+  int64_t size = input_tensor[0]->size();
   output_tensor->ResizeLike(input_tensor[0]);
   float *output = output_tensor->mutable_data<float>();
   vector<const float *> inputs(n);
@@ -22,19 +22,19 @@ void NeonAddNFuntion_float(const vector<const Tensor *> &input_tensor,
     inputs[i] = input_tensor[i]->data<float>();
   }
 
-  int64 cost = size * n;
-  int64 groups = 1;
+  int64_t cost = size * n;
+  int64_t groups = 1;
   if (cost > kCostPerGroup) {
     groups = cost / kCostPerGroup;
   }
-  int64 element_per_group = size / groups;
+  int64_t element_per_group = size / groups;
 
 #pragma omp parallel for num_threads(1) // no significant performance improve
-  for (int64 i = 0; i < size; i += element_per_group) {
-    int64 count = std::min(element_per_group, size - i);
+  for (int64_t i = 0; i < size; i += element_per_group) {
+    int64_t count = std::min(element_per_group, size - i);
     int nn = count >> 2;
     int remain = count - (nn << 2);
-    for (int64 j = 0; j < n; ++j) {
+    for (int64_t j = 0; j < n; ++j) {
       const float *inptr = inputs[j] + i;
       float *outptr = output + i;
       for (int k = 0; k < nn; ++k) {
diff --git a/mace/kernels/neon/batch_norm_neon.cc b/mace/kernels/neon/batch_norm_neon.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a306fdbc804e0c5995846fa89dd5bb681d31e1ed
--- /dev/null
+++ b/mace/kernels/neon/batch_norm_neon.cc
@@ -0,0 +1,69 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#include "mace/kernels/batch_norm.h"
+
+namespace mace {
+namespace kernels {
+
+template <typename T>
+struct BatchNormFunctor<DeviceType::NEON, T> : public BatchNormFunctorBase<DeviceType::NEON, T> {
+  BatchNormFunctor(const float variance_epsilon)
+          :BatchNormFunctorBase<DeviceType::NEON, T>(variance_epsilon){}
+
+  void operator()(const T* input,
+                  const T* scale,
+                  const T* offset,
+                  const T* mean,
+                  const T* var,
+                  const int n,
+                  const int channel,
+                  const int sample_size,
+                  T* output) {
+
+    // Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
+    // The calculation formula for inference is
+    // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X +
+    //          ( \offset - \frac { \scale * mean } { \sqrt{var+\variance_epsilon} }
+    // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} }
+    // new_offset = \offset - mean * common_val;
+    // Y = new_scale * X + new_offset;
+    T new_scale, new_offset;
+    int count = sample_size >> 2;
+    int remain_count = sample_size - count;
+    for (index_t c = 0; c < channel; ++c) {
+      new_scale = scale[c] / std::sqrt(var[c] + this->variance_epsilon_);
+      new_offset = offset[c] - mean[c] * new_scale;
+      index_t pos = c * sample_size;
+
+      float32x4_t new_scale_f = vdupq_n_f32(new_scale);
+      float32x4_t new_offset_f = vdupq_n_f32(new_offset);
+      for (index_t i = 0; i < n; ++i) {
+        const float* input_sample_ptr = input + pos;
+        float* output_sample_ptr = output + pos;
+
+        for(index_t j = 0; j < count; ++j) {
+          float32x4_t input_f = vld1q_f32(input_sample_ptr);
+          float32x4_t output_f = new_offset_f;
+          output_f = vfmaq_f32(output_f, input_f, new_scale_f);
+          vst1q_f32(output_sample_ptr, output_f);
+          input_sample_ptr += 4;
+          output_sample_ptr += 4;
+        }
+        for(index_t j = 0; j < remain_count; ++j) {
+          *output_sample_ptr = new_scale * *input_sample_ptr + new_offset;
+          ++output_sample_ptr;
+          ++input_sample_ptr;
+        }
+        pos += channel * sample_size;
+      }
+    }
+  }
+};
+
+} // namespace kernels
+} //  namespace mace
+#endif // __ARM_NEON
diff --git a/mace/kernels/neon/relu_neon.cc b/mace/kernels/neon/relu_neon.cc
index 29c4e354a783f945f25e25ce9de75b879776f737..e487081891ab7d6bd7a039c5429442d0a7641d1a 100644
--- a/mace/kernels/neon/relu_neon.cc
+++ b/mace/kernels/neon/relu_neon.cc
@@ -10,14 +10,14 @@ namespace kernels {
 
 void NeonReluFuntion_float(const Tensor *input_tensor,
                            Tensor *output_tensor) {
-  int64 size = input_tensor->size();
+  int64_t size = input_tensor->size();
   output_tensor->ResizeLike(input_tensor);
   const float *input = input_tensor->data<float>();
   float *output = output_tensor->mutable_data<float>();
 
 #pragma omp parallel for num_threads(1) // no significant performance improve
-  for (int64 i = 0; i < size; i += kCostPerGroup) {
-    int64 count = std::min(static_cast<int64>(kCostPerGroup), size - i);
+  for (int64_t i = 0; i < size; i += kCostPerGroup) {
+    int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
     int nn = count >> 2;
     int remain = count - (nn << 2);
     const float *inptr = input + i;
diff --git a/mace/kernels/relu.h b/mace/kernels/relu.h
index 086f762b41e85c3ff7042086ba1b56d3607d30c2..d0de2f0b061524537479c9082ca250fba47e6c29 100644
--- a/mace/kernels/relu.h
+++ b/mace/kernels/relu.h
@@ -12,12 +12,12 @@ namespace kernels {
 
 template<typename T>
 void ReluFuntion(const Tensor *input_tensor, Tensor *output_tensor) {
-  int64 size = input_tensor->size();
+  int64_t size = input_tensor->size();
   output_tensor->ResizeLike(input_tensor);
   const T *input = input_tensor->data<T>();
   T *output = output_tensor->mutable_data<T>();
 
-  for (int64 i = 0; i < size; ++i) {
+  for (int64_t i = 0; i < size; ++i) {
     output[i] = std::max(input[i], static_cast<T>(0));
   }
 }
diff --git a/mace/kernels/test/addn_neon_test.cc b/mace/kernels/test/addn_neon_test.cc
index 8d1ca924b9b3ef8fecf96301007afa593cd54600..521fe9129b64a1e8f646c2124b3f56de32af677a 100644
--- a/mace/kernels/test/addn_neon_test.cc
+++ b/mace/kernels/test/addn_neon_test.cc
@@ -15,7 +15,7 @@ TEST(NeonTest, AddN) {
   std::mt19937 gen(rd());
   std::normal_distribution<float> nd(0, 1);
 
-  int64 count = 100000;
+  int64_t count = 100000;
   Tensor input_tensor1(cpu_allocator(), DataType::DT_FLOAT);
   input_tensor1.Resize({100, 1000});
   Tensor input_tensor2(cpu_allocator(), DataType::DT_FLOAT);
@@ -37,7 +37,7 @@ TEST(NeonTest, AddN) {
   float *output = output_tensor.mutable_data<float>();
   float *output_neon = output_tensor_neon.mutable_data<float>();
 
-  for (int64 i = 0; i < count; ++i) {
+  for (int64_t i = 0; i < count; ++i) {
     input1[i] = nd(gen);
     input2[i] = nd(gen);
     input3[i] = nd(gen);
@@ -48,7 +48,7 @@ TEST(NeonTest, AddN) {
 
   ASSERT_EQ(count, output_tensor.size());
   ASSERT_EQ(count, output_tensor_neon.size());
-  for (int64 i = 0; i < count; ++i) {
+  for (int64_t i = 0; i < count; ++i) {
     ASSERT_FLOAT_EQ(output[i], output_neon[i]);
   }
 }
diff --git a/mace/kernels/test/relu_neon_test.cc b/mace/kernels/test/relu_neon_test.cc
index 40c1bc62d68a94820ac99d1140203c24dd412235..a16dc2692501017a494d25d5af9dab73be8c44db 100644
--- a/mace/kernels/test/relu_neon_test.cc
+++ b/mace/kernels/test/relu_neon_test.cc
@@ -15,7 +15,7 @@ TEST(NeonTest, Relu) {
   std::mt19937 gen(rd());
   std::normal_distribution<float> nd(0, 1);
 
-  int64 count = 100000;
+  int64_t count = 100000;
   Tensor input_tensor(cpu_allocator(), DataType::DT_FLOAT);
   input_tensor.Resize({100, 1000});
   Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
@@ -27,7 +27,7 @@ TEST(NeonTest, Relu) {
   float *output = output_tensor.mutable_data<float>();
   float *output_neon = output_tensor_neon.mutable_data<float>();
 
-  for (int64 i = 0; i < count; ++i) {
+  for (int64_t i = 0; i < count; ++i) {
     input[i] = nd(gen);
   }
 
@@ -36,7 +36,7 @@ TEST(NeonTest, Relu) {
 
   ASSERT_EQ(count, output_tensor.size());
   ASSERT_EQ(count, output_tensor_neon.size());
-  for (int64 i = 0; i < count; ++i) {
+  for (int64_t i = 0; i < count; ++i) {
     ASSERT_FLOAT_EQ(output[i], output_neon[i]);
   }
 }
diff --git a/mace/ops/BUILD b/mace/ops/BUILD
index 09a00ab3445025528a82129704be9005161dc141..3c09daa38d9e9af7f2bc5bd8e139ed61ec8dab81 100644
--- a/mace/ops/BUILD
+++ b/mace/ops/BUILD
@@ -9,10 +9,28 @@ licenses(["notice"])  # Apache 2.0
 
 load("//mace:mace.bzl", "if_android")
 
+cc_library(
+    name = "test",
+    testonly = 1,
+    hdrs = [
+        "ops_test_util.h",
+    ],
+    deps = [
+        "//mace/core",
+        "@gtest//:gtest",
+    ],
+)
+
 cc_library(
     name = "ops",
-    srcs = glob(["*.cc"]),
-    hdrs = glob(["*.h"]),
+    srcs = glob(
+        ["*.cc"],
+        exclude = ["*_test.cc"],
+    ),
+    hdrs = glob(
+        ["*.h"],
+        exclude = ["ops_test_util.h"],
+    ),
     copts = ["-std=c++11"],
     deps = [
         "//mace/core",
@@ -53,3 +71,15 @@ cc_test(
         "@gtest//:gtest_main",
     ],
 )
+
+cc_test(
+    name = "batch_norm_test",
+    srcs = ["batch_norm_test.cc"],
+    copts = ["-std=c++11"],
+    linkstatic = 1,
+    deps = [
+        ":ops",
+        ":test",
+        "@gtest//:gtest_main",
+    ],
+)
diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9a48b669f5a1f97f9a15059cf76331740e7f943c
--- /dev/null
+++ b/mace/ops/batch_norm.cc
@@ -0,0 +1,15 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/ops/batch_norm.h"
+
+namespace mace {
+
+REGISTER_CPU_OPERATOR(BatchNorm, BatchNormOp<DeviceType::CPU, float>);
+
+#if __ARM_NEON
+REGISTER_NEON_OPERATOR(BatchNorm, BatchNormOp<DeviceType::NEON, float>);
+#endif // __ARM_NEON
+
+} //  namespace mace
\ No newline at end of file
diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h
new file mode 100644
index 0000000000000000000000000000000000000000..59c227c865b519b81c7e6d818a052336acd2e570
--- /dev/null
+++ b/mace/ops/batch_norm.h
@@ -0,0 +1,59 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_BATCH_NORM_H_
+#define MACE_BATCH_NORM_H_
+
+#include "mace/core/operator.h"
+#include "mace/kernels/batch_norm.h"
+
+namespace mace {
+
+template<DeviceType D, class T>
+class BatchNormOp : public Operator<D, T> {
+  public:
+    BatchNormOp(const OperatorDef &operator_def, Workspace *ws)
+            : Operator<D, T>(operator_def, ws),
+              functor_(OperatorBase::GetSingleArgument<float>("variance_epsilon", 1e-4)){}
+
+    bool Run() override {
+      const Tensor* input = this->Input(0);
+      const Tensor* scale = this->Input(1);
+      const Tensor* offset = this->Input(2);
+      const Tensor* mean = this->Input(3);
+      const Tensor* var = this->Input(4);
+
+      MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ", input->dim_size());
+      MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ", scale->dim_size());
+      MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. ", offset->dim_size());
+      MACE_CHECK(mean->dim_size() == 1, "mean must be 1-dimensional. ", mean->dim_size());
+      MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. ", var->dim_size());
+
+      Tensor* output = this->Output(0);
+      output->ResizeLike(input);
+
+      const index_t n = input->dim(0);
+      const index_t channel = input->dim(1);
+      const index_t sample_size = input->dim(2) * input->dim(3);
+
+      const float* input_ptr = input->data<float>();
+      const float* scale_ptr = scale->data<float>();
+      const float* offset_ptr = offset->data<float>();
+      const float* mean_ptr = mean->data<float>();
+      const float* var_ptr = var->data<float>();
+      float* output_ptr = output->mutable_data<float>();
+
+      functor_(input_ptr, scale_ptr, offset_ptr, mean_ptr, var_ptr,
+                                     n, channel, sample_size,
+                                     output_ptr);
+      return true;
+    }
+  private:
+    kernels::BatchNormFunctor<D, T> functor_;
+
+};
+
+} //  namespace mace
+
+#endif //  MACE_BATCH_NORM_H_
diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5b52d0590dbdb77b1ef8c5a35215b7c6a9582ef1
--- /dev/null
+++ b/mace/ops/batch_norm_test.cc
@@ -0,0 +1,46 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/core/operator.h"
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+
+class BatchNormOpTest : public OpsTestBase {};
+
+TEST_F(BatchNormOpTest, Simple) {
+  // Construct graph
+  OpDefBuilder("BatchNorm", "BatchNormTest")
+        .Input("Input")
+        .Input("Scale")
+        .Input("Offset")
+        .Input("Mean")
+        .Input("Var")
+        .Output("Output")
+        .Finalize(operator_def());
+
+  // Add input data
+  AddInputFromArray<float>("Input", {1, 1, 6, 2},
+                    {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
+  AddInputFromArray<float>("Scale", {2},
+                           {4.0f, 4.0f});
+  AddInputFromArray<float>("Offset", {2},
+                           {2.0, 2.0});
+  AddInputFromArray<float>("Mean", {2},
+                           {10, 10});
+  AddInputFromArray<float>("Var", {2},
+                           {11.67f, 11.67f});
+
+  // Run
+  RunOp();
+
+  // Check
+  Tensor expected = CreateTensor<float>({1, 1, 6, 2},
+                                        {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
+                                         3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
+
+  ExpectTensorNear<float>(expected, *GetOutput("Output"), 0.01);
+}
+
+}
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e96943c60085014bc01c65323882ebc0480249e
--- /dev/null
+++ b/mace/ops/ops_test_util.h
@@ -0,0 +1,170 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_OPS_TEST_UTIL_H_
+#define MACE_OPS_TEST_UTIL_H_
+
+#include "gtest/gtest.h"
+#include "mace/core/common.h"
+#include "mace/core/tensor.h"
+#include "mace/core/net.h"
+
+namespace mace {
+
+class OpDefBuilder {
+  public:
+    OpDefBuilder(const char* type, const char* name) {
+      op_def_.set_type(type);
+      op_def_.set_name(name);
+    }
+    OpDefBuilder& Input(const char* input_name) {
+      op_def_.add_input(input_name);
+      return *this;
+    }
+    OpDefBuilder& Output(const char* output_name) {
+      op_def_.add_output(output_name);
+      return *this;
+    }
+    void Finalize(OperatorDef* op_def) const {
+      MACE_CHECK(op_def != NULL, "input should not be null.");
+      *op_def = op_def_;
+    }
+    OperatorDef op_def_;
+};
+
+class OpsTestBase : public ::testing::Test {
+  protected:
+    virtual void TearDown() {
+      auto tensor_names = ws_.Tensors();
+      for (auto& name : tensor_names) {
+        ws_.RemoveTensor(name);
+      }
+    }
+  public:
+    template <typename T>
+    void AddInputFromArray(const char* name, const std::vector<index_t>& shape, const std::vector<T>& data) {
+      Tensor* input = ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
+      input->Resize(shape);
+      float* input_data = input->mutable_data<float>();
+      memcpy(input_data, data.data(), data.size() * sizeof(T));
+    }
+
+    OperatorDef* operator_def() { return &op_def_; }
+
+    bool RunOp() {
+      NetDef net_def;
+      net_def.add_op()->CopyFrom(op_def_);
+      VLOG(0) << net_def.DebugString();
+      auto net = CreateNet(net_def, &ws_, DeviceType::CPU);
+      return net->Run();
+    }
+
+    Tensor* GetOutput(const char* output_name) {
+      return ws_.GetTensor(output_name);
+    }
+
+  private:
+    Workspace ws_;
+    OperatorDef op_def_;
+};
+
+template <typename T>
+Tensor CreateTensor(const std::vector<index_t>& shape, const std::vector<T>& data) {
+  Tensor res(cpu_allocator(), DataTypeToEnum<T>::v());
+  res.Resize(shape);
+  float* input_data = res.mutable_data<float>();
+  memcpy(input_data, data.data(), data.size() * sizeof(T));
+  return res;
+}
+
+inline bool IsSameSize(const Tensor& x, const Tensor& y) {
+  if (x.dim_size() != y.dim_size()) return false;
+  for (int d = 0; d < x.dim_size(); ++d) {
+    if (x.dim(d) != y.dim(d)) return false;
+  }
+  return true;
+}
+
+inline std::string ShapeToString(const Tensor& x) {
+  std::stringstream stream;
+  for (int i = 0; i < x.dim_size(); i++) {
+    if (i > 0) stream<<",";
+    int64_t dim = x.dim(i);
+    if (dim < 0) {
+      stream<<"?";
+    } else {
+      stream<<dim;
+    }
+  }
+  stream<<"]";
+  return std::string(stream.str());
+}
+
+
+template <typename T>
+struct is_floating_point_type {
+  static const bool value = std::is_same<T, float>::value ||
+                            std::is_same<T, double>::value;
+};
+
+template <typename T>
+inline void ExpectEqual(const T& a, const T& b) {
+  EXPECT_EQ(a, b);
+}
+
+template <>
+inline void ExpectEqual<float>(const float& a, const float& b) {
+  EXPECT_FLOAT_EQ(a, b);
+}
+
+template <>
+inline void ExpectEqual<double>(const double& a, const double& b) {
+  EXPECT_DOUBLE_EQ(a, b);
+}
+
+inline void AssertSameTypeDims(const Tensor& x, const Tensor& y) {
+  ASSERT_EQ(x.dtype(), y.dtype());
+  ASSERT_TRUE(IsSameSize(x, y))
+        << "x.shape [" << ShapeToString(x) << "] vs "
+        << "y.shape [ " << ShapeToString(y) << "]";
+}
+
+template <typename T, bool is_fp = is_floating_point_type<T>::value>
+struct Expector;
+// Partial specialization for float and double.
+template <typename T>
+struct Expector<T, true> {
+  static void Equal(const T& a, const T& b) { ExpectEqual(a, b); }
+
+  static void Equal(const Tensor& x, const Tensor& y) {
+    ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v());
+    AssertSameTypeDims(x, y);
+    auto a = x.data<T>();
+    auto b = y.data<T>();
+    for (int i = 0; i < x.size(); ++i) {
+      ExpectEqual(a(i), b(i));
+    }
+  }
+
+  static void Near(const Tensor& x, const Tensor& y, const double abs_err) {
+    ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v());
+    AssertSameTypeDims(x, y);
+    auto a = x.data<T>();
+    auto b = y.data<T>();
+    for (int i = 0; i < x.size(); ++i) {
+      EXPECT_NEAR(a[i], b[i], abs_err)
+            << "a = " << a << " b = " << b << " index = " << i;
+    }
+  }
+};
+
+template <typename T>
+void ExpectTensorNear(const Tensor& x, const Tensor& y, const double abs_err) {
+  static_assert(is_floating_point_type<T>::value, "T is not a floating point type");
+  Expector<T>::Near(x, y ,abs_err);
+}
+
+} //  namespace mace
+
+#endif //  MACE_OPS_TEST_UTIL_H_