From 8ae8f575d65c39932ecce6e82de62224d8ee68b3 Mon Sep 17 00:00:00 2001
From: Liangliang He <lliang.he@gmail.com>
Date: Fri, 15 Sep 2017 16:46:02 +0800
Subject: [PATCH] Fix Google coding style

---
 mace/core/allocator.cc                    |  10 +-
 mace/core/allocator.h                     |  10 +-
 mace/core/common.h                        |  14 +-
 mace/core/logging.cc                      |   5 +-
 mace/core/logging.h                       |  39 +++--
 mace/core/macros.h                        |   3 +-
 mace/core/net.cc                          |  27 ++--
 mace/core/net.h                           |  29 ++--
 mace/core/operator.cc                     |  29 ++--
 mace/core/operator.h                      |  83 ++++------
 mace/core/proto_utils.cc                  | 161 +++++++++----------
 mace/core/proto_utils.h                   |  93 +++++------
 mace/core/registry.h                      |  43 +++---
 mace/core/serializer.cc                   |  29 ++--
 mace/core/serializer.h                    |   8 +-
 mace/core/tensor.h                        |  55 +++----
 mace/core/testing/test_benchmark.cc       |   9 +-
 mace/core/testing/test_benchmark.h        |   6 +-
 mace/core/testing/test_benchmark_main.cc  |   1 -
 mace/core/types.h                         |  31 ++--
 mace/core/workspace.cc                    |  17 +-
 mace/core/workspace.h                     |   8 +-
 mace/examples/benchmark_example.cc        |   5 +-
 mace/kernels/addn.h                       |  16 +-
 mace/kernels/batch_norm.h                 |  39 ++---
 mace/kernels/conv_2d.h                    | 179 ++++++++++------------
 mace/kernels/conv_pool_2d_util.cc         |  38 ++---
 mace/kernels/conv_pool_2d_util.h          |  26 ++--
 mace/kernels/neon/addn_neon.cc            |  13 +-
 mace/kernels/neon/batch_norm_neon.cc      |  38 ++---
 mace/kernels/neon/conv_2d_neon.cc         |  82 ++++------
 mace/kernels/neon/conv_2d_neon_1x1.cc     |  61 ++++----
 mace/kernels/neon/conv_2d_neon_3x3.cc     | 139 +++++++++--------
 mace/kernels/neon/conv_2d_neon_5x5.cc     |  32 ++--
 mace/kernels/neon/max_pooling_neon_2x2.cc |  19 +--
 mace/kernels/neon/max_pooling_neon_3x3.cc |  23 ++-
 mace/kernels/neon/pooling_neon.cc         |  54 +++----
 mace/kernels/neon/relu_neon.cc            |  13 +-
 mace/kernels/pooling.h                    |  64 ++++----
 mace/kernels/relu.h                       |   8 +-
 mace/kernels/resize_bilinear.h            |  59 ++++---
 mace/ops/addn.cc                          |   4 +-
 mace/ops/addn.h                           |   8 +-
 mace/ops/addn_benchmark.cc                |  27 ++--
 mace/ops/addn_test.cc                     |   2 +-
 mace/ops/batch_norm.cc                    |   4 +-
 mace/ops/batch_norm.h                     |  89 ++++++-----
 mace/ops/batch_norm_benchmark.cc          |  42 ++---
 mace/ops/batch_norm_test.cc               |  35 ++---
 mace/ops/conv_2d.cc                       |   4 +-
 mace/ops/conv_2d.h                        |  29 ++--
 mace/ops/conv_2d_benchmark.cc             |  48 +++---
 mace/ops/conv_2d_test.cc                  | 176 +++++++++------------
 mace/ops/conv_pool_2d_base.h              |  17 +-
 mace/ops/ops_test_util.h                  |  85 +++++-----
 mace/ops/pooling.cc                       |   5 +-
 mace/ops/pooling.h                        |  45 +++---
 mace/ops/pooling_benchmark.cc             |  36 +++--
 mace/ops/pooling_test.cc                  | 117 ++++++--------
 mace/ops/relu.cc                          |   4 +-
 mace/ops/relu.h                           |   8 +-
 mace/ops/relu_benchmark.cc                |  24 ++-
 mace/ops/relu_test.cc                     |   2 +-
 mace/ops/resize_bilinear.cc               |   7 +-
 mace/ops/resize_bilinear.h                |  19 +--
 mace/ops/resize_bilinear_test.cc          |   2 +-
 66 files changed, 1096 insertions(+), 1361 deletions(-)

diff --git a/mace/core/allocator.cc b/mace/core/allocator.cc
index 371bd593..61d5ee2a 100644
--- a/mace/core/allocator.cc
+++ b/mace/core/allocator.cc
@@ -7,13 +7,9 @@
 namespace mace {
 
 static std::unique_ptr<CPUAllocator> g_cpu_allocator(new CPUAllocator());
-CPUAllocator* cpu_allocator() {
-  return g_cpu_allocator.get();
-}
+CPUAllocator* cpu_allocator() { return g_cpu_allocator.get(); }
 
-void SetCPUAllocator(CPUAllocator* alloc) {
-  g_cpu_allocator.reset(alloc);
-}
+void SetCPUAllocator(CPUAllocator* alloc) { g_cpu_allocator.reset(alloc); }
 
 Allocator* GetDeviceAllocator(DeviceType type) {
   switch (type) {
@@ -26,4 +22,4 @@ Allocator* GetDeviceAllocator(DeviceType type) {
   return nullptr;
 }
 
-} // namespace mace
+}  // namespace mace
diff --git a/mace/core/allocator.h b/mace/core/allocator.h
index 0cde9c61..bfce30e5 100644
--- a/mace/core/allocator.h
+++ b/mace/core/allocator.h
@@ -39,7 +39,7 @@ class Allocator {
   }
 };
 
-class CPUAllocator: public Allocator {
+class CPUAllocator : public Allocator {
  public:
   ~CPUAllocator() override {}
   void* New(size_t nbytes) override {
@@ -55,9 +55,7 @@ class CPUAllocator: public Allocator {
     return data;
   }
 
-  void Delete(void* data) override {
-    free(data);
-  }
+  void Delete(void* data) override { free(data); }
 
   void CopyBytes(void* dst, const void* src, size_t size) override {
     memcpy(dst, src, size);
@@ -85,6 +83,6 @@ struct DeviceContext<DeviceType::NEON> {
 
 Allocator* GetDeviceAllocator(DeviceType type);
 
-} // namespace mace
+}  // namespace mace
 
-#endif // MACE_CORE_ALLOCATOR_H_
+#endif  // MACE_CORE_ALLOCATOR_H_
diff --git a/mace/core/common.h b/mace/core/common.h
index df22eacd..b52526f7 100644
--- a/mace/core/common.h
+++ b/mace/core/common.h
@@ -5,12 +5,12 @@
 #ifndef MACE_CORE_COMMON_H_
 #define MACE_CORE_COMMON_H_
 
-#include <set>
+#include <algorithm>
 #include <map>
-#include <string>
 #include <memory>
+#include <set>
+#include <string>
 #include <vector>
-#include <algorithm>
 
 #include "mace/core/logging.h"
 
@@ -24,9 +24,9 @@ typedef int64_t index_t;
 
 // Disable the copy and assignment operator for a class.
 #ifndef DISABLE_COPY_AND_ASSIGN
-#define DISABLE_COPY_AND_ASSIGN(classname)                              \
-private:                                                                       \
-  classname(const classname&) = delete;                                        \
+#define DISABLE_COPY_AND_ASSIGN(classname) \
+ private:                                  \
+  classname(const classname&) = delete;    \
   classname& operator=(const classname&) = delete
 #endif
 
@@ -35,4 +35,4 @@ private:                                                                       \
 // TODO: need to fine tune this
 #define kCostPerGroup 1024000000
 
-#endif // MACE_CORE_COMMON_H_
+#endif  // MACE_CORE_COMMON_H_
diff --git a/mace/core/logging.cc b/mace/core/logging.cc
index f01d0980..ca479176 100644
--- a/mace/core/logging.cc
+++ b/mace/core/logging.cc
@@ -2,7 +2,6 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
-
 #include "mace/core/logging.h"
 
 #include <stdlib.h>
@@ -62,11 +61,11 @@ void LogMessage::GenerateLogMessage() {
 #else
 
 void LogMessage::GenerateLogMessage() {
-  fprintf(stderr, "%c %s:%d] %s\n", "IWEF"[severity_], fname_, line_, str().c_str());
+  fprintf(stderr, "%c %s:%d] %s\n", "IWEF"[severity_], fname_, line_,
+          str().c_str());
 }
 #endif
 
-
 namespace {
 
 // Parse log level (int64_t) from environment variable (char*)
diff --git a/mace/core/logging.h b/mace/core/logging.h
index be31a70a..61a39251 100644
--- a/mace/core/logging.h
+++ b/mace/core/logging.h
@@ -5,8 +5,8 @@
 #ifndef MACE_CORE_LOGGING_H_
 #define MACE_CORE_LOGGING_H_
 
-#include <sstream>
 #include <limits>
+#include <sstream>
 #include <string>
 
 #undef ERROR
@@ -30,8 +30,8 @@ inline void MakeStringInternal(std::stringstream& ss, const T& t) {
 }
 
 template <typename T, typename... Args>
-inline void
-MakeStringInternal(std::stringstream& ss, const T& t, const Args&... args) {
+inline void MakeStringInternal(std::stringstream& ss, const T& t,
+                               const Args&... args) {
   MakeStringInternal(ss, t);
   MakeStringInternal(ss, args...);
 }
@@ -48,9 +48,7 @@ template <>
 inline string MakeString(const string& str) {
   return str;
 }
-inline string MakeString(const char* c_str) {
-  return string(c_str);
-}
+inline string MakeString(const char* c_str) { return string(c_str); }
 
 class LogMessage : public std::basic_ostringstream<char> {
  public:
@@ -85,8 +83,7 @@ class LogMessageFatal : public LogMessage {
   ::mace::internal::LogMessage(__FILE__, __LINE__, mace::WARNING)
 #define _MACE_LOG_ERROR \
   ::mace::internal::LogMessage(__FILE__, __LINE__, mace::ERROR)
-#define _MACE_LOG_FATAL \
-  ::mace::internal::LogMessageFatal(__FILE__, __LINE__)
+#define _MACE_LOG_FATAL ::mace::internal::LogMessageFatal(__FILE__, __LINE__)
 
 #define _MACE_LOG_QFATAL _MACE_LOG_FATAL
 
@@ -96,10 +93,10 @@ class LogMessageFatal : public LogMessage {
 // Turn VLOG off when under mobile devices for considerations of binary size.
 #define VLOG_IS_ON(lvl) ((lvl) <= 0)
 #else
-// Otherwise, Set MACE_CPP_MIN_VLOG_LEVEL environment to update minimum log level
+// Otherwise, set the MACE_CPP_MIN_VLOG_LEVEL environment variable to update
+// the minimum log level
 // of VLOG
-#define VLOG_IS_ON(lvl) \
-  ((lvl) <= ::mace::internal::LogMessage::MinVLogLevel())
+#define VLOG_IS_ON(lvl) ((lvl) <= ::mace::internal::LogMessage::MinVLogLevel())
 #endif
 
 #define VLOG(lvl)      \
@@ -113,16 +110,16 @@ class LogMessageFatal : public LogMessage {
 //    MACE_CHECK(fp->Write(x) == 4)
 //    MACE_CHECK(fp->Write(x) == 4, "Write failed")
 // which are not correct for MACE_ASSERT.
-#define MACE_CHECK(condition, ...)     \
-  if (!(condition)) \
-    LOG(FATAL) << "Check failed: " #condition " " \
-    << ::mace::internal::MakeString(__VA_ARGS__)
+#define MACE_CHECK(condition, ...)              \
+  if (!(condition))                             \
+  LOG(FATAL) << "Check failed: " #condition " " \
+             << ::mace::internal::MakeString(__VA_ARGS__)
 
 #ifndef NDEBUG
-#define MACE_ASSERT(condition, ...)     \
-  if (!(condition)) \
-    LOG(FATAL) << "Assert failed: " #condition " " \
-    << ::mace::internal::MakeString(__VA_ARGS__)
+#define MACE_ASSERT(condition, ...)              \
+  if (!(condition))                              \
+  LOG(FATAL) << "Assert failed: " #condition " " \
+             << ::mace::internal::MakeString(__VA_ARGS__)
 #else
 #define MACE_ASSERT(condition, ...) ((void)0)
 #endif
@@ -135,9 +132,9 @@ T&& CheckNotNull(const char* file, int line, const char* exprtext, T&& t) {
   return std::forward<T>(t);
 }
 
-#define MACE_CHECK_NOTNULL(val)                                 \
+#define MACE_CHECK_NOTNULL(val)                      \
   ::mace::internal::CheckNotNull(__FILE__, __LINE__, \
-                                       "'" #val "' Must be non NULL", (val))
+                                 "'" #val "' Must be non NULL", (val))
 
 }  // namespace internal
 }  // namespace mace
diff --git a/mace/core/macros.h b/mace/core/macros.h
index e23699ae..ced106e5 100644
--- a/mace/core/macros.h
+++ b/mace/core/macros.h
@@ -17,5 +17,4 @@
 #define MACE_PREDICT_TRUE(x) (x)
 #endif
 
-
-#endif //MACE_CORE_MACROS_H_
+#endif  // MACE_CORE_MACROS_H_
diff --git a/mace/core/net.cc b/mace/core/net.cc
index a8f1f80e..33be1650 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -6,22 +6,19 @@
 
 namespace mace {
 
-NetBase::NetBase(const std::shared_ptr<const NetDef> &net_def,
-                 Workspace *ws,
+NetBase::NetBase(const std::shared_ptr<const NetDef>& net_def, Workspace* ws,
                  DeviceType type)
-  : name_(net_def->name()) {
-}
-
+    : name_(net_def->name()) {}
 
-SimpleNet::SimpleNet(const std::shared_ptr<const NetDef> &net_def,
-                     Workspace *ws,
-                     DeviceType type) : NetBase(net_def, ws, type) {
+SimpleNet::SimpleNet(const std::shared_ptr<const NetDef>& net_def,
+                     Workspace* ws, DeviceType type)
+    : NetBase(net_def, ws, type) {
   VLOG(1) << "Constructing SimpleNet " << net_def->name();
   for (int idx = 0; idx < net_def->op_size(); ++idx) {
     const auto& operator_def = net_def->op(idx);
     VLOG(1) << "Creating operator " << operator_def.name() << ":"
             << operator_def.type();
-    std::unique_ptr<OperatorBase> op {nullptr};
+    std::unique_ptr<OperatorBase> op{nullptr};
     OperatorDef temp_def(operator_def);
     op = CreateOperator(temp_def, ws, type);
     operators_.emplace_back(std::move(op));
@@ -40,20 +37,16 @@ bool SimpleNet::Run() {
   return true;
 }
 
-unique_ptr<NetBase> CreateNet(const NetDef& net_def,
-                              Workspace* ws,
+unique_ptr<NetBase> CreateNet(const NetDef& net_def, Workspace* ws,
                               DeviceType type) {
   std::shared_ptr<NetDef> tmp_net_def(new NetDef(net_def));
   return CreateNet(tmp_net_def, ws, type);
 }
 
-unique_ptr<NetBase> CreateNet(
-    const std::shared_ptr<const NetDef>& net_def,
-    Workspace* ws,
-    DeviceType type) {
+unique_ptr<NetBase> CreateNet(const std::shared_ptr<const NetDef>& net_def,
+                              Workspace* ws, DeviceType type) {
   unique_ptr<NetBase> net(new SimpleNet(net_def, ws, type));
   return net;
 }
 
-
-} //  namespace mace
+}  // namespace mace
diff --git a/mace/core/net.h b/mace/core/net.h
index 93ce98ce..621b7ae3 100644
--- a/mace/core/net.h
+++ b/mace/core/net.h
@@ -6,35 +6,31 @@
 #define MACE_CORE_NET_H_
 
 #include "mace/core/common.h"
-#include "mace/proto/mace.pb.h"
 #include "mace/core/operator.h"
 #include "mace/core/workspace.h"
+#include "mace/proto/mace.pb.h"
 
 namespace mace {
 
 class NetBase {
  public:
-  NetBase(const std::shared_ptr<const NetDef> &net_def,
-          Workspace* ws,
+  NetBase(const std::shared_ptr<const NetDef>& net_def, Workspace* ws,
           DeviceType type);
   virtual ~NetBase() noexcept {}
 
   virtual bool Run() = 0;
 
-  const string &Name() const {
-    return name_;
-  }
+  const string& Name() const { return name_; }
 
  protected:
   string name_;
 
- DISABLE_COPY_AND_ASSIGN(NetBase);
+  DISABLE_COPY_AND_ASSIGN(NetBase);
 };
 
 class SimpleNet : public NetBase {
  public:
-  SimpleNet(const std::shared_ptr<const NetDef>& net_def,
-            Workspace* ws,
+  SimpleNet(const std::shared_ptr<const NetDef>& net_def, Workspace* ws,
             DeviceType type);
 
   bool Run() override;
@@ -42,17 +38,14 @@ class SimpleNet : public NetBase {
  protected:
   vector<unique_ptr<OperatorBase> > operators_;
 
- DISABLE_COPY_AND_ASSIGN(SimpleNet);
+  DISABLE_COPY_AND_ASSIGN(SimpleNet);
 };
 
-unique_ptr<NetBase> CreateNet(const NetDef& net_def,
-                              Workspace* ws,
+unique_ptr<NetBase> CreateNet(const NetDef& net_def, Workspace* ws,
                               DeviceType type);
-unique_ptr<NetBase> CreateNet(
-    const std::shared_ptr<const NetDef>& net_def,
-    Workspace* ws,
-    DeviceType type);
+unique_ptr<NetBase> CreateNet(const std::shared_ptr<const NetDef>& net_def,
+                              Workspace* ws, DeviceType type);
 
-} //  namespace mace
+}  // namespace mace
 
-#endif // MACE_CORE_NET_H_
+#endif  // MACE_CORE_NET_H_
diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index a755577b..2af4db46 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -11,33 +11,22 @@ std::map<int32_t, OperatorRegistry*>* gDeviceTypeRegistry() {
   return &g_device_type_registry;
 }
 
-MACE_DEFINE_REGISTRY(
-    CPUOperatorRegistry,
-    OperatorBase,
-    const OperatorDef&,
-    Workspace*);
+MACE_DEFINE_REGISTRY(CPUOperatorRegistry, OperatorBase, const OperatorDef&,
+                     Workspace*);
 MACE_REGISTER_DEVICE_TYPE(DeviceType::CPU, CPUOperatorRegistry);
 
-MACE_DEFINE_REGISTRY(
-    NEONOperatorRegistry,
-    OperatorBase,
-    const OperatorDef&,
-    Workspace*);
+MACE_DEFINE_REGISTRY(NEONOperatorRegistry, OperatorBase, const OperatorDef&,
+                     Workspace*);
 MACE_REGISTER_DEVICE_TYPE(DeviceType::NEON, NEONOperatorRegistry);
 
-unique_ptr<OperatorBase> CreateOperator(
-    const OperatorDef& operator_def,
-    Workspace* ws,
-    DeviceType type) {
+unique_ptr<OperatorBase> CreateOperator(const OperatorDef& operator_def,
+                                        Workspace* ws, DeviceType type) {
   OperatorRegistry* registry = gDeviceTypeRegistry()->at(type);
   return registry->Create(operator_def.type(), operator_def, ws);
 }
 
-
-OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws)
+OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws)
     : operator_ws_(ws),
-      operator_def_(std::make_shared<OperatorDef>(operator_def)) {
-}
-
+      operator_def_(std::make_shared<OperatorDef>(operator_def)) {}
 
-} // namespace mace
+}  // namespace mace
diff --git a/mace/core/operator.h b/mace/core/operator.h
index 4ec4e7b1..4c677073 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -5,12 +5,12 @@
 #ifndef MACE_CORE_OPERATOR_H
 #define MACE_CORE_OPERATOR_H
 
-#include "mace/core/proto_utils.h"
 #include "mace/core/common.h"
-#include "mace/proto/mace.pb.h"
-#include "mace/core/tensor.h"
+#include "mace/core/proto_utils.h"
 #include "mace/core/registry.h"
+#include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
+#include "mace/proto/mace.pb.h"
 
 namespace mace {
 
@@ -23,22 +23,21 @@ class OperatorBase {
     MACE_CHECK(operator_def_, "operator_def was null!");
     return ArgumentHelper::HasArgument(*operator_def_, name);
   }
-  template<typename T>
+  template <typename T>
   inline T GetSingleArgument(const string &name, const T &default_value) const {
     MACE_CHECK(operator_def_, "operator_def was null!");
     return ArgumentHelper::GetSingleArgument<OperatorDef, T>(
         *operator_def_, name, default_value);
   }
-  template<typename T>
+  template <typename T>
   inline bool HasSingleArgumentOfType(const string &name) const {
     MACE_CHECK(operator_def_, "operator_def was null!");
     return ArgumentHelper::HasSingleArgumentOfType<OperatorDef, T>(
         *operator_def_, name);
   }
-  template<typename T>
+  template <typename T>
   inline vector<T> GetRepeatedArgument(
-      const string &name,
-      const vector<T> &default_value = {}) const {
+      const string &name, const vector<T> &default_value = {}) const {
     MACE_CHECK(operator_def_, "operator_def was null!");
     return ArgumentHelper::GetRepeatedArgument<OperatorDef, T>(
         *operator_def_, name, default_value);
@@ -49,9 +48,7 @@ class OperatorBase {
     return inputs_[idx];
   }
 
-  inline Tensor *Output(int idx) {
-    return outputs_[idx];
-  }
+  inline Tensor *Output(int idx) { return outputs_[idx]; }
 
   inline int InputSize() { return inputs_.size(); }
   inline int OutputSize() { return outputs_.size(); }
@@ -70,9 +67,7 @@ class OperatorBase {
     operator_def_ = operator_def;
   }
 
-  inline bool has_debug_def() const {
-    return operator_def_ != nullptr;
-  }
+  inline bool has_debug_def() const { return operator_def_ != nullptr; }
 
  protected:
   Workspace *operator_ws_;
@@ -80,7 +75,7 @@ class OperatorBase {
   vector<const Tensor *> inputs_;
   vector<Tensor *> outputs_;
 
- DISABLE_COPY_AND_ASSIGN(OperatorBase);
+  DISABLE_COPY_AND_ASSIGN(OperatorBase);
 };
 
 template <DeviceType D, class T>
@@ -90,26 +85,22 @@ class Operator : public OperatorBase {
       : OperatorBase(operator_def, ws) {
     for (const string &input_str : operator_def.input()) {
       const Tensor *tensor = ws->GetTensor(input_str);
-      MACE_CHECK(
-          tensor != nullptr,
-          "op ",
-          operator_def.type(),
-          ": Encountered a non-existing input tensor: ",
-          input_str);
+      MACE_CHECK(tensor != nullptr, "op ", operator_def.type(),
+                 ": Encountered a non-existing input tensor: ", input_str);
       inputs_.push_back(tensor);
     }
 
     for (const string &output_str : operator_def.output()) {
-      outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(output_str,
-                         DeviceContext<D>::allocator(),
-                         DataTypeToEnum<T>::v())));
+      outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
+          output_str, DeviceContext<D>::allocator(), DataTypeToEnum<T>::v())));
     }
   }
   virtual bool Run() override = 0;
   ~Operator() noexcept override {}
 };
 
-// OP_INPUT_TAGS and OP_OUTPUT_TAGS are optional features to name the indices of the
+// OP_INPUT_TAGS and OP_OUTPUT_TAGS are optional features that give names to
+// the indices of the
 // operator's inputs and outputs, in order to avoid confusion. For example, for
 // a fully convolution layer that has input, weight and bias, you can define its
 // input tags as:
@@ -119,9 +110,9 @@ class Operator : public OperatorBase {
 // you can now do
 //     auto& weight = Input(WEIGHT);
 // to make it more clear.
-#define OP_INPUT_TAGS(first_input, ...)                                           \
+#define OP_INPUT_TAGS(first_input, ...) \
   enum _InputTags { first_input = 0, __VA_ARGS__ }
-#define OP_OUTPUT_TAGS(first_input, ...)                                          \
+#define OP_OUTPUT_TAGS(first_input, ...) \
   enum _OutputTags { first_input = 0, __VA_ARGS__ }
 
 typedef Registry<std::string, OperatorBase, const OperatorDef &, Workspace *>
@@ -135,7 +126,7 @@ struct DeviceTypeRegisterer {
     if (gDeviceTypeRegistry()->count(type)) {
       LOG(ERROR) << "Device type " << type
                  << "registered twice. This should not happen. Did you have "
-                     "duplicated numbers assigned to different devices?";
+                    "duplicated numbers assigned to different devices?";
       std::exit(1);
     }
     // Calling the registry function to get the actual registry pointer.
@@ -143,39 +134,31 @@ struct DeviceTypeRegisterer {
   }
 };
 
-#define MACE_REGISTER_DEVICE_TYPE(type, registry_function) \
-  namespace {                                               \
-  static DeviceTypeRegisterer MACE_ANONYMOUS_VARIABLE(     \
-      DeviceType)(type, &registry_function);                \
+#define MACE_REGISTER_DEVICE_TYPE(type, registry_function)         \
+  namespace {                                                      \
+  static DeviceTypeRegisterer MACE_ANONYMOUS_VARIABLE(DeviceType)( \
+      type, &registry_function);                                   \
   }
 
-MACE_DECLARE_REGISTRY(
-    CPUOperatorRegistry,
-    OperatorBase,
-    const OperatorDef&,
-    Workspace*);
+MACE_DECLARE_REGISTRY(CPUOperatorRegistry, OperatorBase, const OperatorDef &,
+                      Workspace *);
 
 #define REGISTER_CPU_OPERATOR_CREATOR(key, ...) \
   MACE_REGISTER_CREATOR(CPUOperatorRegistry, key, __VA_ARGS__)
-#define REGISTER_CPU_OPERATOR(name, ...)                           \
+#define REGISTER_CPU_OPERATOR(name, ...) \
   MACE_REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__)
 
-MACE_DECLARE_REGISTRY(
-    NEONOperatorRegistry,
-    OperatorBase,
-    const OperatorDef&,
-    Workspace*);
+MACE_DECLARE_REGISTRY(NEONOperatorRegistry, OperatorBase, const OperatorDef &,
+                      Workspace *);
 
 #define REGISTER_NEON_OPERATOR_CREATOR(key, ...) \
   MACE_REGISTER_CREATOR(NEONOperatorRegistry, key, __VA_ARGS__)
-#define REGISTER_NEON_OPERATOR(name, ...)                           \
+#define REGISTER_NEON_OPERATOR(name, ...) \
   MACE_REGISTER_CLASS(NEONOperatorRegistry, name, __VA_ARGS__)
 
-unique_ptr<OperatorBase> CreateOperator(
-    const OperatorDef &operator_def,
-    Workspace *ws,
-    DeviceType type);
+unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def,
+                                        Workspace *ws, DeviceType type);
 
-} //  namespace mace
+}  // namespace mace
 
-#endif //MACE_CORE_OPERATOR_H
+#endif  // MACE_CORE_OPERATOR_H
diff --git a/mace/core/proto_utils.cc b/mace/core/proto_utils.cc
index 0658913a..9906dd26 100644
--- a/mace/core/proto_utils.cc
+++ b/mace/core/proto_utils.cc
@@ -5,9 +5,9 @@
 #include "mace/core/proto_utils.h"
 
 #include <fcntl.h>
+#include <unistd.h>
 #include <cerrno>
 #include <fstream>
-#include <unistd.h>
 
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream_impl.h"
@@ -82,13 +82,12 @@ bool ReadProtoFromBinaryFile(const char* filename, MessageLite* proto) {
   return proto->ParseFromCodedStream(&coded_stream);
 }
 
-void WriteProtoToBinaryFile(
-    const MessageLite& /*proto*/,
-    const char* /*filename*/) {
+void WriteProtoToBinaryFile(const MessageLite& /*proto*/,
+                            const char* /*filename*/) {
   LOG(FATAL) << "Not implemented yet.";
 }
 
-#else  // MACE_USE_LITE_PROTO
+#else                  // MACE_USE_LITE_PROTO
 
 // Full protocol buffer.
 
@@ -118,7 +117,7 @@ void WriteProtoToTextFile(const Message& proto, const char* filename) {
 }
 
 bool ReadProtoFromBinaryFile(const char* filename, MessageLite* proto) {
-#if defined (_MSC_VER)  // for MSC compiler binary flag needs to be specified
+#if defined(_MSC_VER)  // for MSC compiler binary flag needs to be specified
   int fd = open(filename, O_RDONLY | O_BINARY);
 #else
   int fd = open(filename, O_RDONLY);
@@ -138,8 +137,8 @@ bool ReadProtoFromBinaryFile(const char* filename, MessageLite* proto) {
 
 void WriteProtoToBinaryFile(const MessageLite& proto, const char* filename) {
   int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
-  MACE_CHECK(
-      fd != -1, "File cannot be created: ", filename, " error number: ", errno);
+  MACE_CHECK(fd != -1, "File cannot be created: ", filename, " error number: ",
+             errno);
   std::unique_ptr<ZeroCopyOutputStream> raw_output(new FileOutputStream(fd));
   std::unique_ptr<CodedOutputStream> coded_output(
       new CodedOutputStream(raw_output.get()));
@@ -151,18 +150,17 @@ void WriteProtoToBinaryFile(const MessageLite& proto, const char* filename) {
 
 #endif  // MACE_USE_LITE_PROTO
 
-ArgumentHelper::ArgumentHelper(const OperatorDef &def) {
-  for (auto &arg : def.arg()) {
+ArgumentHelper::ArgumentHelper(const OperatorDef& def) {
+  for (auto& arg : def.arg()) {
     if (arg_map_.find(arg.name()) != arg_map_.end()) {
       MACE_CHECK(
           arg.SerializeAsString() == arg_map_[arg.name()].SerializeAsString(),
-          "Found argument of the same name '",
-          arg.name(),
-          "' but with different contents: ",
-          ProtoDebugString(def));
+          "Found argument of the same name '", arg.name(),
+          "' but with different contents: ", ProtoDebugString(def));
 
       LOG(WARNING) << "Duplicated argument name found in operator def: "
-        << ProtoDebugString(def) << ", arg: " << ProtoDebugString(arg);
+                   << ProtoDebugString(def)
+                   << ", arg: " << ProtoDebugString(arg);
     }
 
     arg_map_[arg.name()] = arg;
@@ -171,10 +169,9 @@ ArgumentHelper::ArgumentHelper(const OperatorDef &def) {
 
 ArgumentHelper::ArgumentHelper(const NetDef& netdef) {
   for (auto& arg : netdef.arg()) {
-    MACE_CHECK(
-        arg_map_.count(arg.name()) == 0,
-        "Duplicated argument name found in net def: ",
-        ProtoDebugString(netdef));
+    MACE_CHECK(arg_map_.count(arg.name()) == 0,
+               "Duplicated argument name found in net def: ",
+               ProtoDebugString(netdef));
     arg_map_[arg.name()] = arg;
   }
 }
@@ -192,32 +189,24 @@ bool SupportsLosslessConversion(const InputType& value) {
 }
 }
 
-#define INSTANTIATE_GET_SINGLE_ARGUMENT(                                      \
-    T, fieldname, enforce_lossless_conversion)                                \
+#define INSTANTIATE_GET_SINGLE_ARGUMENT(T, fieldname,                         \
+                                        enforce_lossless_conversion)          \
   template <>                                                                 \
-  T ArgumentHelper::GetSingleArgument<T>(                                     \
-      const string& name, const T& default_value) const {                     \
+  T ArgumentHelper::GetSingleArgument<T>(const string& name,                  \
+                                         const T& default_value) const {      \
     if (arg_map_.count(name) == 0) {                                          \
       VLOG(1) << "Using default parameter value " << default_value            \
               << " for parameter " << name;                                   \
       return default_value;                                                   \
     }                                                                         \
-    MACE_CHECK(                                                            \
-        arg_map_.at(name).has_##fieldname(),                                  \
-        "Argument ",                                                          \
-        name,                                                                 \
-        " does not have the right field: expected field " #fieldname);        \
+    MACE_CHECK(arg_map_.at(name).has_##fieldname(), "Argument ", name,        \
+               " does not have the right field: expected field " #fieldname); \
     auto value = arg_map_.at(name).fieldname();                               \
     if (enforce_lossless_conversion) {                                        \
       auto supportsConversion =                                               \
           SupportsLosslessConversion<decltype(value), T>(value);              \
-      MACE_CHECK(                                                          \
-          supportsConversion,                                                 \
-          "Value",                                                            \
-          value,                                                              \
-          " of argument ",                                                    \
-          name,                                                               \
-          "cannot be represented correctly in a target type");                \
+      MACE_CHECK(supportsConversion, "Value", value, " of argument ", name,   \
+                 "cannot be represented correctly in a target type");         \
     }                                                                         \
     return value;                                                             \
   }                                                                           \
@@ -242,30 +231,25 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(size_t, i, true)
 INSTANTIATE_GET_SINGLE_ARGUMENT(string, s, false)
 #undef INSTANTIATE_GET_SINGLE_ARGUMENT
 
-#define INSTANTIATE_GET_REPEATED_ARGUMENT(                             \
-    T, fieldname, enforce_lossless_conversion)                         \
-  template <>                                                          \
-  vector<T> ArgumentHelper::GetRepeatedArgument<T>(                    \
-      const string& name, const std::vector<T>& default_value) const { \
-    if (arg_map_.count(name) == 0) {                                   \
-      return default_value;                                            \
-    }                                                                  \
-    vector<T> values;                                                  \
-    for (const auto& v : arg_map_.at(name).fieldname()) {              \
-      if (enforce_lossless_conversion) {                               \
-        auto supportsConversion =                                      \
-            SupportsLosslessConversion<decltype(v), T>(v);             \
-        MACE_CHECK(                                                 \
-            supportsConversion,                                        \
-            "Value",                                                   \
-            v,                                                         \
-            " of argument ",                                           \
-            name,                                                      \
-            "cannot be represented correctly in a target type");       \
-      }                                                                \
-      values.push_back(v);                                             \
-    }                                                                  \
-    return values;                                                     \
+#define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname,                   \
+                                          enforce_lossless_conversion)    \
+  template <>                                                             \
+  vector<T> ArgumentHelper::GetRepeatedArgument<T>(                       \
+      const string& name, const std::vector<T>& default_value) const {    \
+    if (arg_map_.count(name) == 0) {                                      \
+      return default_value;                                               \
+    }                                                                     \
+    vector<T> values;                                                     \
+    for (const auto& v : arg_map_.at(name).fieldname()) {                 \
+      if (enforce_lossless_conversion) {                                  \
+        auto supportsConversion =                                         \
+            SupportsLosslessConversion<decltype(v), T>(v);                \
+        MACE_CHECK(supportsConversion, "Value", v, " of argument ", name, \
+                   "cannot be represented correctly in a target type");   \
+      }                                                                   \
+      values.push_back(v);                                                \
+    }                                                                     \
+    return values;                                                        \
   }
 
 INSTANTIATE_GET_REPEATED_ARGUMENT(float, floats, false)
@@ -281,14 +265,14 @@ INSTANTIATE_GET_REPEATED_ARGUMENT(size_t, ints, true)
 INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings, false)
 #undef INSTANTIATE_GET_REPEATED_ARGUMENT
 
-#define MACE_MAKE_SINGULAR_ARGUMENT(T, fieldname)                            \
-template <>                                                                    \
-Argument MakeArgument(const string& name, const T& value) {                    \
-  Argument arg;                                                                \
-  arg.set_name(name);                                                          \
-  arg.set_##fieldname(value);                                                  \
-  return arg;                                                                  \
-}
+#define MACE_MAKE_SINGULAR_ARGUMENT(T, fieldname)             \
+  template <>                                                 \
+  Argument MakeArgument(const string& name, const T& value) { \
+    Argument arg;                                             \
+    arg.set_name(name);                                       \
+    arg.set_##fieldname(value);                               \
+    return arg;                                               \
+  }
 
 MACE_MAKE_SINGULAR_ARGUMENT(bool, i)
 MACE_MAKE_SINGULAR_ARGUMENT(float, f)
@@ -305,16 +289,16 @@ Argument MakeArgument(const string& name, const MessageLite& value) {
   return arg;
 }
 
-#define MACE_MAKE_REPEATED_ARGUMENT(T, fieldname)                            \
-template <>                                                                    \
-Argument MakeArgument(const string& name, const vector<T>& value) {            \
-  Argument arg;                                                                \
-  arg.set_name(name);                                                          \
-  for (const auto& v : value) {                                                \
-    arg.add_##fieldname(v);                                                    \
-  }                                                                            \
-  return arg;                                                                  \
-}
+#define MACE_MAKE_REPEATED_ARGUMENT(T, fieldname)                     \
+  template <>                                                         \
+  Argument MakeArgument(const string& name, const vector<T>& value) { \
+    Argument arg;                                                     \
+    arg.set_name(name);                                               \
+    for (const auto& v : value) {                                     \
+      arg.add_##fieldname(v);                                         \
+    }                                                                 \
+    return arg;                                                       \
+  }
 
 MACE_MAKE_REPEATED_ARGUMENT(float, floats)
 MACE_MAKE_REPEATED_ARGUMENT(int, ints)
@@ -328,31 +312,24 @@ const Argument& GetArgument(const OperatorDef& def, const string& name) {
       return arg;
     }
   }
-  MACE_CHECK(false,
-      "Argument named ",
-      name,
-      "does not exist in operator ",
-      ProtoDebugString(def));
+  MACE_CHECK(false, "Argument named ", name, "does not exist in operator ",
+             ProtoDebugString(def));
 }
 
-bool GetFlagArgument(
-    const OperatorDef& def,
-    const string& name,
-    bool def_value) {
+bool GetFlagArgument(const OperatorDef& def, const string& name,
+                     bool def_value) {
   for (const Argument& arg : def.arg()) {
     if (arg.name() == name) {
-      MACE_CHECK(
-          arg.has_i(), "Can't parse argument as bool: ", ProtoDebugString(arg));
+      MACE_CHECK(arg.has_i(), "Can't parse argument as bool: ",
+                 ProtoDebugString(arg));
       return arg.i();
     }
   }
   return def_value;
 }
 
-Argument* GetMutableArgument(
-    const string& name,
-    const bool create_if_missing,
-    OperatorDef* def) {
+Argument* GetMutableArgument(const string& name, const bool create_if_missing,
+                             OperatorDef* def) {
   for (int i = 0; i < def->arg_size(); ++i) {
     if (def->arg(i).name() == name) {
       return def->mutable_arg(i);
diff --git a/mace/core/proto_utils.h b/mace/core/proto_utils.h
index e50294ad..5f8074ae 100644
--- a/mace/core/proto_utils.h
+++ b/mace/core/proto_utils.h
@@ -12,15 +12,14 @@
 #include "google/protobuf/message.h"
 #endif  // !MACE_USE_LITE_PROTO
 
-#include "mace/proto/mace.pb.h"
 #include "mace/core/common.h"
+#include "mace/proto/mace.pb.h"
 
 namespace mace {
 
 using std::string;
 using ::google::protobuf::MessageLite;
 
-
 // Common interfaces that reads file contents into a string.
 bool ReadStringFromFile(const char* filename, string* str);
 bool WriteStringToFile(const string& str, const char* filename);
@@ -46,22 +45,20 @@ inline string ProtoDebugString(const MessageLite& proto) {
 // Text format MessageLite wrappers: these functions do nothing but just
 // allowing things to compile. It will produce a runtime error if you are using
 // MessageLite but still want text support.
-inline bool ReadProtoFromTextFile(
-    const char* /*filename*/,
-    MessageLite* /*proto*/) {
+inline bool ReadProtoFromTextFile(const char* /*filename*/,
+                                  MessageLite* /*proto*/) {
   LOG(FATAL) << "If you are running lite version, you should not be "
-                  << "calling any text-format protobuffers.";
+             << "calling any text-format protobuffers.";
   return false;  // Just to suppress compiler warning.
 }
 inline bool ReadProtoFromTextFile(const string filename, MessageLite* proto) {
   return ReadProtoFromTextFile(filename.c_str(), proto);
 }
 
-inline void WriteProtoToTextFile(
-    const MessageLite& /*proto*/,
-    const char* /*filename*/) {
+inline void WriteProtoToTextFile(const MessageLite& /*proto*/,
+                                 const char* /*filename*/) {
   LOG(FATAL) << "If you are running lite version, you should not be "
-                  << "calling any text-format protobuffers.";
+             << "calling any text-format protobuffers.";
 }
 inline void WriteProtoToTextFile(const MessageLite& proto,
                                  const string& filename) {
@@ -107,16 +104,13 @@ inline bool ReadProtoFromFile(const string& filename, Message* proto) {
 
 #endif  // MACE_USE_LITE_PROTO
 
-template <
-    class IterableInputs = std::initializer_list<string>,
-    class IterableOutputs = std::initializer_list<string>,
-    class IterableArgs = std::initializer_list<Argument>>
-OperatorDef CreateOperatorDef(
-    const string& type,
-    const string& name,
-    const IterableInputs& inputs,
-    const IterableOutputs& outputs,
-    const IterableArgs& args) {
+template <class IterableInputs = std::initializer_list<string>,
+          class IterableOutputs = std::initializer_list<string>,
+          class IterableArgs = std::initializer_list<Argument>>
+OperatorDef CreateOperatorDef(const string& type, const string& name,
+                              const IterableInputs& inputs,
+                              const IterableOutputs& outputs,
+                              const IterableArgs& args) {
   OperatorDef def;
   def.set_type(type);
   def.set_name(name);
@@ -134,20 +128,13 @@ OperatorDef CreateOperatorDef(
 
 // A simplified version compared to the full CreateOperator, if you do not need
 // to specify args.
-template <
-    class IterableInputs = std::initializer_list<string>,
-    class IterableOutputs = std::initializer_list<string>>
-inline OperatorDef CreateOperatorDef(
-    const string& type,
-    const string& name,
-    const IterableInputs& inputs,
-    const IterableOutputs& outputs) {
-  return CreateOperatorDef(
-      type,
-      name,
-      inputs,
-      outputs,
-      std::vector<Argument>());
+template <class IterableInputs = std::initializer_list<string>,
+          class IterableOutputs = std::initializer_list<string>>
+inline OperatorDef CreateOperatorDef(const string& type, const string& name,
+                                     const IterableInputs& inputs,
+                                     const IterableOutputs& outputs) {
+  return CreateOperatorDef(type, name, inputs, outputs,
+                           std::vector<Argument>());
 }
 
 /**
@@ -166,10 +153,8 @@ class ArgumentHelper {
   }
 
   template <typename Def, typename T>
-  static T GetSingleArgument(
-      const Def& def,
-      const string& name,
-      const T& default_value) {
+  static T GetSingleArgument(const Def& def, const string& name,
+                             const T& default_value) {
     return ArgumentHelper(def).GetSingleArgument<T>(name, default_value);
   }
 
@@ -180,8 +165,7 @@ class ArgumentHelper {
 
   template <typename Def, typename T>
   static vector<T> GetRepeatedArgument(
-      const Def& def,
-      const string& name,
+      const Def& def, const string& name,
       const std::vector<T>& default_value = std::vector<T>()) {
     return ArgumentHelper(def).GetRepeatedArgument<T>(name, default_value);
   }
@@ -192,9 +176,8 @@ class ArgumentHelper {
   }
 
   template <typename Def, typename MessageType>
-  static vector<MessageType> GetRepeatedMessageArgument(
-      const Def& def,
-      const string& name) {
+  static vector<MessageType> GetRepeatedMessageArgument(const Def& def,
+                                                        const string& name) {
     return ArgumentHelper(def).GetRepeatedMessageArgument<MessageType>(name);
   }
 
@@ -216,9 +199,8 @@ class ArgumentHelper {
     MACE_CHECK(arg_map_.count(name), "Cannot find parameter named " + name);
     MessageType message;
     if (arg_map_.at(name).has_s()) {
-      MACE_CHECK(
-          message.ParseFromString(arg_map_.at(name).s()),
-          "Faild to parse content from the string");
+      MACE_CHECK(message.ParseFromString(arg_map_.at(name).s()),
+                 "Faild to parse content from the string");
     } else {
       VLOG(1) << "Return empty message for parameter " << name;
     }
@@ -230,9 +212,8 @@ class ArgumentHelper {
     MACE_CHECK(arg_map_.count(name), "Cannot find parameter named " + name);
     vector<MessageType> messages(arg_map_.at(name).strings_size());
     for (int i = 0; i < messages.size(); ++i) {
-      MACE_CHECK(
-          messages[i].ParseFromString(arg_map_.at(name).strings(i)),
-          "Faild to parse content from the string");
+      MACE_CHECK(messages[i].ParseFromString(arg_map_.at(name).strings(i)),
+                 "Faild to parse content from the string");
     }
     return messages;
   }
@@ -242,15 +223,11 @@ class ArgumentHelper {
 };
 
 const Argument& GetArgument(const OperatorDef& def, const string& name);
-bool GetFlagArgument(
-    const OperatorDef& def,
-    const string& name,
-    bool def_value = false);
-
-Argument* GetMutableArgument(
-    const string& name,
-    const bool create_if_missing,
-    OperatorDef* def);
+bool GetFlagArgument(const OperatorDef& def, const string& name,
+                     bool def_value = false);
+
+Argument* GetMutableArgument(const string& name, const bool create_if_missing,
+                             OperatorDef* def);
 
 template <typename T>
 Argument MakeArgument(const string& name, const T& value);
diff --git a/mace/core/registry.h b/mace/core/registry.h
index 1f5a86fb..a4747e1b 100644
--- a/mace/core/registry.h
+++ b/mace/core/registry.h
@@ -12,7 +12,7 @@ namespace mace {
 template <class SrcType, class ObjectType, class... Args>
 class Registry {
  public:
-  typedef std::function<std::unique_ptr<ObjectType> (Args ...)> Creator;
+  typedef std::function<std::unique_ptr<ObjectType>(Args...)> Creator;
 
   Registry() : registry_() {}
 
@@ -24,7 +24,7 @@ class Registry {
 
   inline bool Has(const SrcType& key) { return registry_.count(key) != 0; }
 
-  unique_ptr<ObjectType> Create(const SrcType& key, Args ... args) {
+  unique_ptr<ObjectType> Create(const SrcType& key, Args... args) {
     if (registry_.count(key) == 0) {
       VLOG(2) << "Key not registered: " << key;
       return nullptr;
@@ -60,7 +60,7 @@ class Registerer {
   }
 
   template <class DerivedType>
-  static unique_ptr<ObjectType> DefaultCreator(Args ... args) {
+  static unique_ptr<ObjectType> DefaultCreator(Args... args) {
     return std::unique_ptr<ObjectType>(new DerivedType(args...));
   }
 };
@@ -74,36 +74,35 @@ class Registerer {
 #endif
 
 #define MACE_DECLARE_TYPED_REGISTRY(RegistryName, SrcType, ObjectType, ...) \
-  Registry<SrcType, ObjectType, ##__VA_ARGS__>* RegistryName();              \
-  typedef Registerer<SrcType, ObjectType, ##__VA_ARGS__>                     \
+  Registry<SrcType, ObjectType, ##__VA_ARGS__>* RegistryName();             \
+  typedef Registerer<SrcType, ObjectType, ##__VA_ARGS__>                    \
       Registerer##RegistryName;
 
 #define MACE_DEFINE_TYPED_REGISTRY(RegistryName, SrcType, ObjectType, ...) \
-  Registry<SrcType, ObjectType, ##__VA_ARGS__>* RegistryName() {            \
-    static Registry<SrcType, ObjectType, ##__VA_ARGS__>* registry =         \
-        new Registry<SrcType, ObjectType, ##__VA_ARGS__>();                 \
-    return registry;                                                        \
+  Registry<SrcType, ObjectType, ##__VA_ARGS__>* RegistryName() {           \
+    static Registry<SrcType, ObjectType, ##__VA_ARGS__>* registry =        \
+        new Registry<SrcType, ObjectType, ##__VA_ARGS__>();                \
+    return registry;                                                       \
   }
 
-#define MACE_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \
-  MACE_DECLARE_TYPED_REGISTRY(                               \
-      RegistryName, std::string, ObjectType, ##__VA_ARGS__)
+#define MACE_DECLARE_REGISTRY(RegistryName, ObjectType, ...)         \
+  MACE_DECLARE_TYPED_REGISTRY(RegistryName, std::string, ObjectType, \
+                              ##__VA_ARGS__)
 
-#define MACE_DEFINE_REGISTRY(RegistryName, ObjectType, ...) \
-  MACE_DEFINE_TYPED_REGISTRY(                               \
-      RegistryName, std::string, ObjectType, ##__VA_ARGS__)
+#define MACE_DEFINE_REGISTRY(RegistryName, ObjectType, ...)         \
+  MACE_DEFINE_TYPED_REGISTRY(RegistryName, std::string, ObjectType, \
+                             ##__VA_ARGS__)
 
 #define MACE_REGISTER_TYPED_CREATOR(RegistryName, key, ...)                  \
-  namespace {                                                                 \
+  namespace {                                                                \
   static Registerer##RegistryName MACE_ANONYMOUS_VARIABLE(g_##RegistryName)( \
       key, RegistryName(), __VA_ARGS__);
 
 #define MACE_REGISTER_TYPED_CLASS(RegistryName, key, ...)                    \
-  namespace {                                                                 \
+  namespace {                                                                \
   static Registerer##RegistryName MACE_ANONYMOUS_VARIABLE(g_##RegistryName)( \
-      key,                                                                    \
-      RegistryName(),                                                         \
-      Registerer##RegistryName::DefaultCreator<__VA_ARGS__>);                 \
+      key, RegistryName(),                                                   \
+      Registerer##RegistryName::DefaultCreator<__VA_ARGS__>);                \
   }
 
 #define MACE_REGISTER_CREATOR(RegistryName, key, ...) \
@@ -112,6 +111,6 @@ class Registerer {
 #define MACE_REGISTER_CLASS(RegistryName, key, ...) \
   MACE_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__)
 
-} // namespace mace
+}  // namespace mace
 
-#endif // MACE_CORE_REGISTRY_H_
+#endif  // MACE_CORE_REGISTRY_H_
diff --git a/mace/core/serializer.cc b/mace/core/serializer.cc
index 3e80e545..cfe2d935 100644
--- a/mace/core/serializer.cc
+++ b/mace/core/serializer.cc
@@ -4,19 +4,18 @@
 
 #include "mace/core/serializer.h"
 
-
 namespace mace {
 
 unique_ptr<TensorProto> Serializer::Serialize(const Tensor &tensor,
-                           const string &name) {
+                                              const string &name) {
   MACE_NOT_IMPLEMENTED;
   return nullptr;
 }
 
 unique_ptr<Tensor> Serializer::Deserialize(const TensorProto &proto,
                                            DeviceType type) {
-  unique_ptr<Tensor> tensor(new Tensor(GetDeviceAllocator(type),
-                                       proto.data_type()));
+  unique_ptr<Tensor> tensor(
+      new Tensor(GetDeviceAllocator(type), proto.data_type()));
   vector<index_t> dims;
   for (const index_t d : proto.dims()) {
     dims.push_back(d);
@@ -25,8 +24,7 @@ unique_ptr<Tensor> Serializer::Deserialize(const TensorProto &proto,
 
   switch (proto.data_type()) {
     case DT_FLOAT:
-      tensor->Copy<float>(proto.float_data().data(),
-                          proto.float_data().size());
+      tensor->Copy<float>(proto.float_data().data(), proto.float_data().size());
       break;
     case DT_DOUBLE:
       tensor->Copy<double>(proto.double_data().data(),
@@ -34,39 +32,38 @@ unique_ptr<Tensor> Serializer::Deserialize(const TensorProto &proto,
       break;
     case DT_INT32:
       tensor->template Copy<int32_t>(proto.int32_data().data(),
-                                   proto.int32_data().size());
+                                     proto.int32_data().size());
       break;
     case DT_UINT8:
       tensor->CopyWithCast<int32_t, uint8_t>(proto.int32_data().data(),
-                                         proto.int32_data().size());
+                                             proto.int32_data().size());
       break;
     case DT_INT16:
       tensor->CopyWithCast<int32_t, int16_t>(proto.int32_data().data(),
-                                         proto.int32_data().size());
+                                             proto.int32_data().size());
       break;
     case DT_INT8:
       tensor->CopyWithCast<int32_t, int8_t>(proto.int32_data().data(),
-                                        proto.int32_data().size());
+                                            proto.int32_data().size());
       break;
     case DT_INT64:
       tensor->Copy<int64_t>(proto.int64_data().data(),
-                          proto.int64_data().size());
+                            proto.int64_data().size());
       break;
     case DT_UINT16:
       tensor->CopyWithCast<int32_t, uint16_t>(proto.int32_data().data(),
-                                          proto.int32_data().size());
+                                              proto.int32_data().size());
       break;
     case DT_BOOL:
       tensor->CopyWithCast<int32_t, bool>(proto.int32_data().data(),
-                                        proto.int32_data().size());
+                                          proto.int32_data().size());
       break;
     case DT_STRING: {
       string *content = tensor->mutable_data<string>();
       for (int i = 0; i < proto.string_data().size(); ++i) {
         content[i] = proto.string_data(i);
       }
-    }
-      break;
+    } break;
     default:
       MACE_NOT_IMPLEMENTED;
       break;
@@ -75,4 +72,4 @@ unique_ptr<Tensor> Serializer::Deserialize(const TensorProto &proto,
   return tensor;
 }
 
-} // namespace mace
\ No newline at end of file
+}  // namespace mace
\ No newline at end of file
diff --git a/mace/core/serializer.h b/mace/core/serializer.h
index 01f20748..f9966a5a 100644
--- a/mace/core/serializer.h
+++ b/mace/core/serializer.h
@@ -5,9 +5,9 @@
 #ifndef MACE_CORE_SERIALIZER_H_
 #define MACE_CORE_SERIALIZER_H_
 
-#include "mace/proto/mace.pb.h"
 #include "mace/core/common.h"
 #include "mace/core/tensor.h"
+#include "mace/proto/mace.pb.h"
 
 namespace mace {
 
@@ -20,9 +20,9 @@ class Serializer {
 
   unique_ptr<Tensor> Deserialize(const TensorProto& proto, DeviceType type);
 
- DISABLE_COPY_AND_ASSIGN(Serializer);
+  DISABLE_COPY_AND_ASSIGN(Serializer);
 };
 
-} // namespace mace
+}  // namespace mace
 
-#endif // MACE_CORE_SERIALIZER_H_
+#endif  // MACE_CORE_SERIALIZER_H_
diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index 3dc3f1ed..224c342e 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -5,11 +5,11 @@
 #ifndef MACE_CORE_TENSOR_H_
 #define MACE_CORE_TENSOR_H_
 
-#include "mace/core/common.h"
-#include "mace/proto/mace.pb.h"
 #include "mace/core/allocator.h"
-#include "mace/core/types.h"
+#include "mace/core/common.h"
 #include "mace/core/logging.h"
+#include "mace/core/types.h"
+#include "mace/proto/mace.pb.h"
 
 namespace mace {
 
@@ -25,13 +25,13 @@ namespace mace {
   switch (TYPE_ENUM) {                                         \
     CASE(float, SINGLE_ARG(STMTS))                             \
     CASE(double, SINGLE_ARG(STMTS))                            \
-    CASE(int32_t, SINGLE_ARG(STMTS))                             \
-    CASE(uint8_t, SINGLE_ARG(STMTS))                             \
-    CASE(uint16_t, SINGLE_ARG(STMTS))                            \
-    CASE(int16_t, SINGLE_ARG(STMTS))                             \
-    CASE(int8_t, SINGLE_ARG(STMTS))                              \
+    CASE(int32_t, SINGLE_ARG(STMTS))                           \
+    CASE(uint8_t, SINGLE_ARG(STMTS))                           \
+    CASE(uint16_t, SINGLE_ARG(STMTS))                          \
+    CASE(int16_t, SINGLE_ARG(STMTS))                           \
+    CASE(int8_t, SINGLE_ARG(STMTS))                            \
     CASE(string, SINGLE_ARG(STMTS))                            \
-    CASE(int64_t, SINGLE_ARG(STMTS))                             \
+    CASE(int64_t, SINGLE_ARG(STMTS))                           \
     CASE(bool, SINGLE_ARG(STMTS))                              \
     case DT_INVALID:                                           \
       INVALID;                                                 \
@@ -41,20 +41,17 @@ namespace mace {
       break;                                                   \
   }
 
-
 #define CASES(TYPE_ENUM, STMTS)                                      \
   CASES_WITH_DEFAULT(TYPE_ENUM, STMTS, LOG(FATAL) << "Type not set"; \
                      , LOG(FATAL) << "Unexpected type: " << TYPE_ENUM;)
 
-
 class Tensor {
  public:
   Tensor()
-      : alloc_(cpu_allocator()),
-        size_(0), dtype_(DT_FLOAT), data_(nullptr) {};
+      : alloc_(cpu_allocator()), size_(0), dtype_(DT_FLOAT), data_(nullptr){};
 
   Tensor(Allocator* a, DataType type)
-      : alloc_(a), size_(0), dtype_(type), data_(nullptr) {};
+      : alloc_(a), size_(0), dtype_(type), data_(nullptr){};
 
   ~Tensor() {
     if (alloc_ && data_.get()) {
@@ -92,9 +89,8 @@ class Tensor {
     if (data_.get() || size_ == 0) {
       return data_.get();
     } else {
-      CASES(dtype_, data_.reset(alloc_->New(size_ * sizeof(T)), [this](void* ptr) {
-        alloc_->Delete(ptr);
-      }));
+      CASES(dtype_, data_.reset(alloc_->New(size_ * sizeof(T)),
+                                [this](void* ptr) { alloc_->Delete(ptr); }));
       return data_.get();
     }
   }
@@ -116,13 +112,9 @@ class Tensor {
     }
   }
 
-  inline void ResizeLike(const Tensor& other) {
-    Resize(other.shape());
-  }
+  inline void ResizeLike(const Tensor& other) { Resize(other.shape()); }
 
-  inline void ResizeLike(const Tensor* other) {
-    Resize(other->shape());
-  }
+  inline void ResizeLike(const Tensor* other) { Resize(other->shape()); }
 
   template <typename T>
   inline void Copy(const T* src, index_t size) {
@@ -132,7 +124,8 @@ class Tensor {
 
   template <typename SrcType, typename DstType>
   inline void CopyWithCast(const SrcType* src, size_t size) {
-    MACE_CHECK(static_cast<index_t>(size) == size_, "copy src and dst with different size.");
+    MACE_CHECK(static_cast<index_t>(size) == size_,
+               "copy src and dst with different size.");
     unique_ptr<DstType[]> buffer(new DstType[size]);
     for (size_t i = 0; i < size; ++i) {
       buffer[i] = static_cast<DstType>(src[i]);
@@ -146,10 +139,11 @@ class Tensor {
 
   inline void DebugPrint() {
     std::stringstream os;
-    for (int i: shape_) {
+    for (int i : shape_) {
       os << i << ", ";
     }
-    LOG(INFO) << "Tensor shape: " << os.str() << " type: " << DataType_Name(dtype_);
+    LOG(INFO) << "Tensor shape: " << os.str()
+              << " type: " << DataType_Name(dtype_);
 
     os.str("");
     os.clear();
@@ -175,7 +169,8 @@ class Tensor {
 
  private:
   inline int64_t NumElements() const {
-    return std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies<int64_t>());
+    return std::accumulate(shape_.begin(), shape_.end(), 1,
+                           std::multiplies<int64_t>());
   }
 
   Allocator* alloc_;
@@ -184,9 +179,9 @@ class Tensor {
   std::shared_ptr<void> data_;
   vector<index_t> shape_;
 
- DISABLE_COPY_AND_ASSIGN(Tensor);
+  DISABLE_COPY_AND_ASSIGN(Tensor);
 };
 
-} // namespace tensor
+}  // namespace mace
 
-#endif //MACE_CORE_TENSOR_H_
+#endif  // MACE_CORE_TENSOR_H_
diff --git a/mace/core/testing/test_benchmark.cc b/mace/core/testing/test_benchmark.cc
index 7e09c28f..66078911 100644
--- a/mace/core/testing/test_benchmark.cc
+++ b/mace/core/testing/test_benchmark.cc
@@ -51,11 +51,8 @@ Benchmark* Benchmark::ArgPair(int x, int y) {
   return this;
 }
 
-
 // Run all benchmarks
-void Benchmark::Run() {
-  Run("all");
-}
+void Benchmark::Run() { Run("all"); }
 
 void Benchmark::Run(const char* pattern) {
   if (!all_benchmarks) return;
@@ -113,8 +110,8 @@ void Benchmark::Run(const char* pattern) {
                  (items_processed * 1e-6) / seconds);
         full_label += buf;
       }
-      printf("%-*s %10.0f %10d\t%s\n", width, name,
-             seconds * 1e9 / iters, iters, full_label.c_str());
+      printf("%-*s %10.0f %10d\t%s\n", width, name, seconds * 1e9 / iters,
+             iters, full_label.c_str());
     }
   }
 }
diff --git a/mace/core/testing/test_benchmark.h b/mace/core/testing/test_benchmark.h
index 6f96411b..25d12459 100644
--- a/mace/core/testing/test_benchmark.h
+++ b/mace/core/testing/test_benchmark.h
@@ -12,9 +12,9 @@
 #include "mace/core/types.h"
 
 #define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c
-#define BENCHMARK(n)                                            \
-  static ::mace::testing::Benchmark* MACE_BENCHMARK_CONCAT(__benchmark_, n, __LINE__) = \
-      (new ::mace::testing::Benchmark(#n, (n)))
+#define BENCHMARK(n)                                        \
+  static ::mace::testing::Benchmark* MACE_BENCHMARK_CONCAT( \
+      __benchmark_, n, __LINE__) = (new ::mace::testing::Benchmark(#n, (n)))
 
 namespace mace {
 namespace testing {
diff --git a/mace/core/testing/test_benchmark_main.cc b/mace/core/testing/test_benchmark_main.cc
index dfa87672..cc0c0172 100644
--- a/mace/core/testing/test_benchmark_main.cc
+++ b/mace/core/testing/test_benchmark_main.cc
@@ -17,4 +17,3 @@ int main(int argc, char** argv) {
   }
   return 0;
 }
-
diff --git a/mace/core/types.h b/mace/core/types.h
index b174993d..21c502cf 100644
--- a/mace/core/types.h
+++ b/mace/core/types.h
@@ -18,26 +18,25 @@ struct DataTypeToEnum {
   static_assert(IsValidDataType<T>::value, "Specified Data Type not supported");
 };
 
-
 // EnumToDataType<VALUE>::Type is the type for DataType constant VALUE, e.g.
 // EnumToDataType<DT_FLOAT>::Type is float.
 template <DataType VALUE>
 struct EnumToDataType {};  // Specializations below
 
 // Template specialization for both DataTypeToEnum and EnumToDataType.
-#define MATCH_TYPE_AND_ENUM(TYPE, ENUM)                 \
-  template <>                                           \
-  struct DataTypeToEnum<TYPE> {                         \
-    static DataType v() { return ENUM; }                \
-    static constexpr DataType value = ENUM;             \
-  };                                                    \
-  template <>                                           \
-  struct IsValidDataType<TYPE> {                        \
-    static constexpr bool value = true;                 \
-  };                                                    \
-  template <>                                           \
-  struct EnumToDataType<ENUM> {                         \
-    typedef TYPE Type;                                  \
+#define MATCH_TYPE_AND_ENUM(TYPE, ENUM)     \
+  template <>                               \
+  struct DataTypeToEnum<TYPE> {             \
+    static DataType v() { return ENUM; }    \
+    static constexpr DataType value = ENUM; \
+  };                                        \
+  template <>                               \
+  struct IsValidDataType<TYPE> {            \
+    static constexpr bool value = true;     \
+  };                                        \
+  template <>                               \
+  struct EnumToDataType<ENUM> {             \
+    typedef TYPE Type;                      \
   }
 
 MATCH_TYPE_AND_ENUM(float, DT_FLOAT);
@@ -53,6 +52,6 @@ MATCH_TYPE_AND_ENUM(bool, DT_BOOL);
 
 static const int32_t kint32_tmax = ((int32_t)0x7FFFFFFF);
 
-} // namespace mace
+}  // namespace mace
 
-#endif // MACE_CORE_TYPES_H_
+#endif  // MACE_CORE_TYPES_H_
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index ae28d2df..953a5ba3 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -2,8 +2,8 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
-#include "mace/core/common.h"
 #include "mace/core/workspace.h"
+#include "mace/core/common.h"
 #include "mace/core/serializer.h"
 
 namespace mace {
@@ -16,8 +16,7 @@ vector<string> Workspace::Tensors() const {
   return names;
 }
 
-Tensor* Workspace::CreateTensor(const string& name,
-                                Allocator* alloc,
+Tensor* Workspace::CreateTensor(const string& name, Allocator* alloc,
                                 DataType type) {
   if (HasTensor(name)) {
     VLOG(1) << "Tensor " << name << " already exists. Skipping.";
@@ -46,14 +45,16 @@ const Tensor* Workspace::GetTensor(const string& name) const {
 }
 
 Tensor* Workspace::GetTensor(const string& name) {
-  return const_cast<Tensor*>(static_cast<const Workspace*>(this)->GetTensor(name));
+  return const_cast<Tensor*>(
+      static_cast<const Workspace*>(this)->GetTensor(name));
 }
 
-void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
+void Workspace::LoadModelTensor(const NetDef& net_def, DeviceType type) {
   Serializer serializer;
-  for (auto& tensor_proto: net_def.tensors()) {
-    tensor_map_[tensor_proto.name()] = serializer.Deserialize(tensor_proto, type);
+  for (auto& tensor_proto : net_def.tensors()) {
+    tensor_map_[tensor_proto.name()] =
+        serializer.Deserialize(tensor_proto, type);
   }
 }
 
-} // namespace mace
\ No newline at end of file
+}  // namespace mace
\ No newline at end of file
diff --git a/mace/core/workspace.h b/mace/core/workspace.h
index 7de345bc..5d87abf7 100644
--- a/mace/core/workspace.h
+++ b/mace/core/workspace.h
@@ -5,7 +5,6 @@
 #ifndef MACE_CORE_WORKSPACE_H_
 #define MACE_CORE_WORKSPACE_H_
 
-
 #include "mace/core/common.h"
 #include "mace/core/tensor.h"
 #include "mace/proto/mace.pb.h"
@@ -37,10 +36,9 @@ class Workspace {
  private:
   TensorMap tensor_map_;
 
- DISABLE_COPY_AND_ASSIGN(Workspace);
+  DISABLE_COPY_AND_ASSIGN(Workspace);
 };
 
-} // namespace mace
-
+}  // namespace mace
 
-#endif // MACE_CORE_WORKSPACE_H_
+#endif  // MACE_CORE_WORKSPACE_H_
diff --git a/mace/examples/benchmark_example.cc b/mace/examples/benchmark_example.cc
index 50e5184b..4fa34bea 100644
--- a/mace/examples/benchmark_example.cc
+++ b/mace/examples/benchmark_example.cc
@@ -14,7 +14,7 @@ static void foo(int iters) {
   float* out = new float[N];
 
   while (iters--) {
-    for (int i=0; i < N; i++) {
+    for (int i = 0; i < N; i++) {
       out[i] = inp[i] * 2.0;
     }
   }
@@ -24,7 +24,6 @@ static void foo(int iters) {
 
 BENCHMARK(foo);
 
-
 static void bar(int iters, int n) {
   const int64_t tot = static_cast<int64_t>(iters) * n;
   mace::testing::ItemsProcessed(tot);
@@ -34,7 +33,7 @@ static void bar(int iters, int n) {
   float* out = new float[n];
 
   while (iters--) {
-    for (int i=0; i < n; i++) {
+    for (int i = 0; i < n; i++) {
       out[i] = inp[i] * 2.0;
     }
   }
diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h
index f1803ce3..3e5845b3 100644
--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -10,10 +10,9 @@
 namespace mace {
 namespace kernels {
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 struct AddNFunctor {
-  void operator()(const vector<const T*>& inputs,
-                  T *output, index_t size) {
+  void operator()(const vector<const T*>& inputs, T* output, index_t size) {
     memset(output, 0, size * sizeof(T));
     int n = inputs.size();
     for (int i = 0; i < n; ++i) {
@@ -25,11 +24,10 @@ struct AddNFunctor {
 };
 
 template <>
-void AddNFunctor<DeviceType::NEON, float>::operator()(const vector<const float*>& inputs,
-                                                      float *output,
-                                                      index_t size);
+void AddNFunctor<DeviceType::NEON, float>::operator()(
+    const vector<const float*>& inputs, float* output, index_t size);
 
-} //  namespace kernels
-} //  namespace mace
+}  //  namespace kernels
+}  //  namespace mace
 
-#endif // MACE_KERNELS_ADDN_H_
\ No newline at end of file
+#endif  // MACE_KERNELS_ADDN_H_
\ No newline at end of file
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index 0c1c2ef0..ad76be94 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -11,26 +11,21 @@
 namespace mace {
 namespace kernels {
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 struct BatchNormFunctor {
   float variance_epsilon_;
 
   BatchNormFunctor(const float variance_epsilon)
-          : variance_epsilon_(variance_epsilon){}
+      : variance_epsilon_(variance_epsilon) {}
 
-  void operator()(const T* input,
-                  const T* scale,
-                  const T* offset,
-                  const T* mean,
-                  const T* var,
-                  const index_t n,
-                  const index_t channel,
-                  const index_t sample_size,
-                  T* output) {
+  void operator()(const T* input, const T* scale, const T* offset,
+                  const T* mean, const T* var, const index_t n,
+                  const index_t channel, const index_t sample_size, T* output) {
     // Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
     // The calculation formula for inference is
     // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X +
-    //          ( \offset - \frac { \scale * mean } { \sqrt{var+\variance_epsilon} }
+    //          ( \offset - \frac { \scale * mean } {
+    //          \sqrt{var+\variance_epsilon} } )
     // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} }
     // new_offset = \offset - mean * common_val;
     // Y = new_scale * X + new_offset;
@@ -53,18 +48,12 @@ struct BatchNormFunctor {
 };
 
 template <>
-void BatchNormFunctor<DeviceType::NEON, float>::operator()(const float* input,
-                                                           const float* scale,
-                                                           const float* offset,
-                                                           const float* mean,
-                                                           const float* var,
-                                                           const index_t n,
-                                                           const index_t channel,
-                                                           const index_t sample_size,
-                                                           float* output);
+void BatchNormFunctor<DeviceType::NEON, float>::operator()(
+    const float* input, const float* scale, const float* offset,
+    const float* mean, const float* var, const index_t n, const index_t channel,
+    const index_t sample_size, float* output);
 
+}  //  namespace kernels
+}  //  namespace mace
 
-} //  namepsace kernels
-} //  namespace mace
-
-#endif //  MACE_KERNELS_BATCH_NORM_H_
+#endif  //  MACE_KERNELS_BATCH_NORM_H_
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index 34044b4e..28e9011e 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -10,114 +10,103 @@
 namespace mace {
 namespace kernels {
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 class Conv2dFunctor {
-  public:
-    Conv2dFunctor(const int* strides,
-                  const int* paddings,
-                  const int* dilations) :
-      strides_(strides),
-      paddings_(paddings),
-      dilations_(dilations) {}
-
-    void operator()(const T* input, // NCHW
-                    const index_t* input_shape,
-                    const T* filter, // c_out, c_in, kernel_h, kernel_w
-                    const index_t* filter_shape,
-                    const T* bias, // c_out
-                    T* output, // NCHW
-                    const index_t* output_shape) {
-      MACE_CHECK_NOTNULL(output);
-
-      index_t batch    = output_shape[0];
-      index_t channels = output_shape[1];
-      index_t height   = output_shape[2];
-      index_t width    = output_shape[3];
-
-      index_t input_batch    = input_shape[0];
-      index_t input_channels = input_shape[1];
-      index_t input_height   = input_shape[2];
-      index_t input_width    = input_shape[3];
-
-      index_t kernel_h = filter_shape[2];
-      index_t kernel_w  = filter_shape[3];
-
-      int stride_h = strides_[0];
-      int stride_w = strides_[1];
-
-      int dilation_h = dilations_[0];
-      int dilation_w = dilations_[1];
-
-      MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
-
-      // The left-upper most offset of the padded input
-      int padded_h_start = 0 - paddings_[0] / 2;
-      int padded_w_start = 0 - paddings_[1] / 2;
-      index_t padded_h_stop = input_height + paddings_[0] - paddings_[0] / 2;
-      index_t padded_w_stop = input_width + paddings_[1] - paddings_[1] / 2;
-
-      index_t kernel_size = input_channels * kernel_h * kernel_w;
+ public:
+  Conv2dFunctor(const int* strides, const int* paddings, const int* dilations)
+      : strides_(strides), paddings_(paddings), dilations_(dilations) {}
+
+  void operator()(const T* input,  // NCHW
+                  const index_t* input_shape,
+                  const T* filter,  // c_out, c_in, kernel_h, kernel_w
+                  const index_t* filter_shape,
+                  const T* bias,  // c_out
+                  T* output,      // NCHW
+                  const index_t* output_shape) {
+    MACE_CHECK_NOTNULL(output);
+
+    index_t batch = output_shape[0];
+    index_t channels = output_shape[1];
+    index_t height = output_shape[2];
+    index_t width = output_shape[3];
+
+    index_t input_batch = input_shape[0];
+    index_t input_channels = input_shape[1];
+    index_t input_height = input_shape[2];
+    index_t input_width = input_shape[3];
+
+    index_t kernel_h = filter_shape[2];
+    index_t kernel_w = filter_shape[3];
+
+    int stride_h = strides_[0];
+    int stride_w = strides_[1];
+
+    int dilation_h = dilations_[0];
+    int dilation_w = dilations_[1];
+
+    MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
+
+    // The left-upper most offset of the padded input
+    int padded_h_start = 0 - paddings_[0] / 2;
+    int padded_w_start = 0 - paddings_[1] / 2;
+    index_t padded_h_stop = input_height + paddings_[0] - paddings_[0] / 2;
+    index_t padded_w_stop = input_width + paddings_[1] - paddings_[1] / 2;
+
+    index_t kernel_size = input_channels * kernel_h * kernel_w;
 
 #pragma omp parallel for collapse(2)
-      for (int n = 0; n < batch; ++n) {
-        for (int c = 0; c < channels; ++c) {
-          for (int h = 0; h < height; ++h) {
-            for (int w = 0; w < width; ++w) {
-              index_t offset = n * channels * height * width +
-                               c * height * width +
-                               h * width + w;
-              T sum = 0;
-              const T* filter_ptr = filter + c * kernel_size;
-              for (int inc = 0; inc < input_channels; ++inc) {
-                for (int kh = 0; kh < kernel_h; ++kh) {
-                  for (int kw = 0; kw < kernel_w; ++kw) {
-
-                    int inh = padded_h_start + h * stride_h + dilation_h * kh;
-                    int inw = padded_w_start + w * stride_w + dilation_w * kw;
-                    if (inh < 0 || inh >= input_height ||
-                        inw < 0 || inw >= input_width) {
-                      MACE_CHECK(inh >= padded_h_start &&
-                                 inh < padded_h_stop &&
-                                 inw >= padded_w_start &&
-                                 inw < padded_w_stop,
-                                 "Out of range read from input: ",
-                                 inh, ", ", inw);
-                      // else padding with 0:
-                      // sum += 0;
-                    } else {
-                      index_t input_offset =
+    for (int n = 0; n < batch; ++n) {
+      for (int c = 0; c < channels; ++c) {
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            index_t offset = n * channels * height * width +
+                             c * height * width + h * width + w;
+            T sum = 0;
+            const T* filter_ptr = filter + c * kernel_size;
+            for (int inc = 0; inc < input_channels; ++inc) {
+              for (int kh = 0; kh < kernel_h; ++kh) {
+                for (int kw = 0; kw < kernel_w; ++kw) {
+                  int inh = padded_h_start + h * stride_h + dilation_h * kh;
+                  int inw = padded_w_start + w * stride_w + dilation_w * kw;
+                  if (inh < 0 || inh >= input_height || inw < 0 ||
+                      inw >= input_width) {
+                    MACE_CHECK(inh >= padded_h_start && inh < padded_h_stop &&
+                                   inw >= padded_w_start && inw < padded_w_stop,
+                               "Out of range read from input: ", inh, ", ",
+                               inw);
+                    // else padding with 0:
+                    // sum += 0;
+                  } else {
+                    index_t input_offset =
                         n * input_channels * input_height * input_width +
-                        inc * input_height * input_width +
-                        inh * input_width + inw;
-                      sum += input[input_offset] * *filter_ptr;
-                    }
-                    ++filter_ptr;
+                        inc * input_height * input_width + inh * input_width +
+                        inw;
+                    sum += input[input_offset] * *filter_ptr;
                   }
+                  ++filter_ptr;
                 }
-                output[offset] = sum + bias[c];
               }
+              output[offset] = sum + bias[c];
             }
           }
         }
       }
     }
+  }
 
-  private:
-    const int* strides_; // [stride_h, stride_w]
-    const int* paddings_; // [padding_h, padding_w]
-    const int* dilations_; // [dilation_h, dilation_w]
+ private:
+  const int* strides_;    // [stride_h, stride_w]
+  const int* paddings_;   // [padding_h, padding_w]
+  const int* dilations_;  // [dilation_h, dilation_w]
 };
 
 template <>
-void Conv2dFunctor<DeviceType::NEON, float>::operator()(const float* input,
-                                                        const index_t* input_shape,
-                                                        const float* filter,
-                                                        const index_t* filter_shape,
-                                                        const float* bias,
-                                                        float* output,
-                                                        const index_t* output_shape);
-
-} //  namespace kernels
-} //  namespace mace
-
-#endif // MACE_KERNELS_CONV_2D_H_
+void Conv2dFunctor<DeviceType::NEON, float>::operator()(
+    const float* input, const index_t* input_shape, const float* filter,
+    const index_t* filter_shape, const float* bias, float* output,
+    const index_t* output_shape);
+
+}  //  namespace kernels
+}  //  namespace mace
+
+#endif  // MACE_KERNELS_CONV_2D_H_
diff --git a/mace/kernels/conv_pool_2d_util.cc b/mace/kernels/conv_pool_2d_util.cc
index be5a742a..d2fcacc5 100644
--- a/mace/kernels/conv_pool_2d_util.cc
+++ b/mace/kernels/conv_pool_2d_util.cc
@@ -7,12 +7,10 @@
 namespace mace {
 namespace kernels {
 
-void CalcPaddingAndOutputSize(const index_t *input_shape,  // NCHW
+void CalcPaddingAndOutputSize(const index_t *input_shape,   // NCHW
                               const index_t *filter_shape,  // OIHW
-                              const int *dilations,
-                              const int *strides,
-                              Padding padding,
-                              index_t *output_shape,
+                              const int *dilations, const int *strides,
+                              Padding padding, index_t *output_shape,
                               int *padding_size) {
   MACE_CHECK(dilations[0] > 0 && dilations[1] > 0,
              "Invalid dilations, must >= 1");
@@ -43,14 +41,16 @@ void CalcPaddingAndOutputSize(const index_t *input_shape,  // NCHW
       output_height = (input_shape[2] - k_extent_height) / strides[0] + 1;
       output_width = (input_shape[3] - k_extent_width) / strides[1] + 1;
       break;
-    case SAME:output_height = (input_shape[2] - 1) / strides[0] + 1;
+    case SAME:
+      output_height = (input_shape[2] - 1) / strides[0] + 1;
       output_width = (input_shape[3] - 1) / strides[1] + 1;
       break;
     case FULL:
       output_height = (input_shape[2] + k_extent_height - 2) / strides[0] + 1;
       output_width = (input_shape[3] + k_extent_width - 2) / strides[1] + 1;
       break;
-    default:MACE_CHECK(false, "Unsupported padding type: ", padding);
+    default:
+      MACE_CHECK(false, "Unsupported padding type: ", padding);
   }
 
   // Note: TensorFlow may padded one more on the right/bottom side
@@ -58,10 +58,10 @@ void CalcPaddingAndOutputSize(const index_t *input_shape,  // NCHW
   // utilize the more centered features. We need to benchmark
   // based on the model accuracy.
 
-  padding_size[0] = (output_height - 1) * strides[0] +
-      k_extent_height - input_shape[2];
-  padding_size[1] = (output_width - 1) * strides[1] +
-      k_extent_width - input_shape[3];
+  padding_size[0] =
+      (output_height - 1) * strides[0] + k_extent_height - input_shape[2];
+  padding_size[1] =
+      (output_width - 1) * strides[1] + k_extent_width - input_shape[3];
 
   output_shape[0] = input_shape[0];
   output_shape[1] = output_channels;
@@ -69,19 +69,15 @@ void CalcPaddingAndOutputSize(const index_t *input_shape,  // NCHW
   output_shape[3] = output_width;
 }
 
-void ConstructInputWithPadding(const float *input,
-                               const index_t *input_shape,
-                               const int *paddings,
-                               Tensor *output_tensor) {
+void ConstructInputWithPadding(const float *input, const index_t *input_shape,
+                               const int *paddings, Tensor *output_tensor) {
   index_t batch = input_shape[0];
   index_t channels = input_shape[1];
   index_t height = input_shape[2];
   index_t width = input_shape[3];
 
-  std::vector<index_t> output_shape({batch,
-                                     channels,
-                                     paddings[0] + height,
-                                     paddings[1] + width});
+  std::vector<index_t> output_shape(
+      {batch, channels, paddings[0] + height, paddings[1] + width});
 
   const index_t output_width = output_shape[3];
   const int padded_top = paddings[0] / 2;
@@ -105,5 +101,5 @@ void ConstructInputWithPadding(const float *input,
     }
   }
 }
-} //  namespace kernels
-} //  namespace mace
+}  //  namespace kernels
+}  //  namespace mace
diff --git a/mace/kernels/conv_pool_2d_util.h b/mace/kernels/conv_pool_2d_util.h
index c6b9f090..ff46887e 100644
--- a/mace/kernels/conv_pool_2d_util.h
+++ b/mace/kernels/conv_pool_2d_util.h
@@ -10,26 +10,22 @@
 namespace mace {
 
 enum Padding {
-  VALID = 0, // No padding
-  SAME = 1, // Pads with half the filter size (rounded down) on both sides
-  FULL = 2, // Pads with one less than the filter size on both sides
+  VALID = 0,  // No padding
+  SAME = 1,   // Pads with half the filter size (rounded down) on both sides
+  FULL = 2,   // Pads with one less than the filter size on both sides
 };
 
 namespace kernels {
 
-void CalcPaddingAndOutputSize(const index_t *input_shape,  // NCHW
+void CalcPaddingAndOutputSize(const index_t *input_shape,   // NCHW
                               const index_t *filter_shape,  // OIHW
-                              const int *dilations,
-                              const int *strides,
-                              Padding padding,
-                              index_t *output_shape,
+                              const int *dilations, const int *strides,
+                              Padding padding, index_t *output_shape,
                               int *padding_size);
 
-void ConstructInputWithPadding(const float *input,
-                               const index_t *input_shape,
-                               const int *paddings,
-                               Tensor *output_tensor);
-} //  namespace kernels
-} //  namespace mace
+void ConstructInputWithPadding(const float *input, const index_t *input_shape,
+                               const int *paddings, Tensor *output_tensor);
+}  //  namespace kernels
+}  //  namespace mace
 
-#endif // MACE_KERNELS_CONV_POOL_2D_UTIL_H_
+#endif  // MACE_KERNELS_CONV_POOL_2D_UTIL_H_
diff --git a/mace/kernels/neon/addn_neon.cc b/mace/kernels/neon/addn_neon.cc
index 86e53bcb..19f621d4 100644
--- a/mace/kernels/neon/addn_neon.cc
+++ b/mace/kernels/neon/addn_neon.cc
@@ -2,16 +2,15 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
-#include <arm_neon.h>
 #include "mace/kernels/addn.h"
+#include <arm_neon.h>
 
 namespace mace {
 namespace kernels {
 
 template <>
-void AddNFunctor<DeviceType::NEON, float>::operator()(const vector<const float*>& inputs,
-                                                float *output,
-                                                index_t size) {
+void AddNFunctor<DeviceType::NEON, float>::operator()(
+    const vector<const float *> &inputs, float *output, index_t size) {
   // TODO: neon mem copy
   memset(output, 0, size * sizeof(float));
   int n = inputs.size();
@@ -22,7 +21,7 @@ void AddNFunctor<DeviceType::NEON, float>::operator()(const vector<const float*>
   }
   int64_t element_per_group = size / groups;
 
-#pragma omp parallel for num_threads(1) // no significant performance improve
+#pragma omp parallel for num_threads(1)  // no significant performance improve
   for (int64_t i = 0; i < size; i += element_per_group) {
     int64_t count = std::min(element_per_group, size - i);
     int nn = count >> 2;
@@ -48,5 +47,5 @@ void AddNFunctor<DeviceType::NEON, float>::operator()(const vector<const float*>
   }
 };
 
-} // namespace kernels
-} // namespace mace
\ No newline at end of file
+}  // namespace kernels
+}  // namespace mace
\ No newline at end of file
diff --git a/mace/kernels/neon/batch_norm_neon.cc b/mace/kernels/neon/batch_norm_neon.cc
index 244d564b..918a94c5 100644
--- a/mace/kernels/neon/batch_norm_neon.cc
+++ b/mace/kernels/neon/batch_norm_neon.cc
@@ -2,29 +2,25 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
-#include <arm_neon.h>
 #include "mace/kernels/batch_norm.h"
+#include <arm_neon.h>
 
 namespace mace {
 namespace kernels {
 
 template <>
-void BatchNormFunctor<DeviceType::NEON, float>::operator()(const float* input,
-                                                           const float* scale,
-                                                           const float* offset,
-                                                           const float* mean,
-                                                           const float* var,
-                                                           const index_t n,
-                                                           const index_t channel,
-                                                           const index_t sample_size,
-                                                           float* output) {
-    // Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
-    // The calculation formula for inference is
-    // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X +
-    //          ( \offset - \frac { \scale * mean } { \sqrt{var+\variance_epsilon} }
-    // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} }
-    // new_offset = \offset - mean * common_val;
-    // Y = new_scale * X + new_offset;
+void BatchNormFunctor<DeviceType::NEON, float>::operator()(
+    const float* input, const float* scale, const float* offset,
+    const float* mean, const float* var, const index_t n, const index_t channel,
+    const index_t sample_size, float* output) {
+  // Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
+  // The calculation formula for inference is
+  // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X +
+  //          ( \offset - \frac { \scale * mean } { \sqrt{var+\variance_epsilon}
+  //          } )
+  // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} }
+  // new_offset = \offset - mean * common_val;
+  // Y = new_scale * X + new_offset;
   float new_scale, new_offset;
   index_t count = sample_size >> 2;
   index_t remain_count = sample_size - (count << 2);
@@ -36,8 +32,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(const float* input,
     float32x4_t new_scale_f = vdupq_n_f32(new_scale);
     float32x4_t new_offset_f = vdupq_n_f32(new_offset);
     for (index_t i = 0; i < n; ++i) {
-      const float *input_sample_ptr = input + pos;
-      float *output_sample_ptr = output + pos;
+      const float* input_sample_ptr = input + pos;
+      float* output_sample_ptr = output + pos;
 
       for (index_t j = 0; j < count; ++j) {
         float32x4_t input_f = vld1q_f32(input_sample_ptr);
@@ -57,5 +53,5 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(const float* input,
   }
 };
 
-} // namespace kernels
-} //  namespace mace
\ No newline at end of file
+}  // namespace kernels
+}  //  namespace mace
\ No newline at end of file
diff --git a/mace/kernels/neon/conv_2d_neon.cc b/mace/kernels/neon/conv_2d_neon.cc
index 75b22e9a..5268eb1c 100644
--- a/mace/kernels/neon/conv_2d_neon.cc
+++ b/mace/kernels/neon/conv_2d_neon.cc
@@ -20,62 +20,39 @@ extern void Conv2dNeonK5x5S1(const float *input, const index_t *input_shape,
                              const float *filter, const float *bias,
                              float *output, const index_t *output_shape);
 
-template<>
+template <>
 void Conv2dFunctor<DeviceType::NEON,
-                   float>::operator()(const float *input, // NCHW
-                                      const index_t *input_shape,
-                                      const float *filter, // c_out, c_in, kernel_h, kernel_w
-                                      const index_t *filter_shape,
-                                      const float *bias, // c_out
-                                      float *output, // NCHW
-                                      const index_t *output_shape) {
-
-  typedef void (*Conv2dNeonFunction)(const float *input, // NCHW
-                                     const index_t *input_shape,
-                                     const float *filter, // c_out, c_in, kernel_h, kernel_w
-                                     const float *bias, // c_out
-                                     float *output, // NCHW
-                                     const index_t *output_shape);
+                   float>::
+operator()(const float *input,  // NCHW
+           const index_t *input_shape,
+           const float *filter,  // c_out, c_in, kernel_h, kernel_w
+           const index_t *filter_shape,
+           const float *bias,  // c_out
+           float *output,      // NCHW
+           const index_t *output_shape) {
+  typedef void (*Conv2dNeonFunction)(
+      const float *input,  // NCHW
+      const index_t *input_shape,
+      const float *filter,  // c_out, c_in, kernel_h, kernel_w
+      const float *bias,    // c_out
+      float *output,        // NCHW
+      const index_t *output_shape);
   // Selection matrix: kernel_size x stride_size
   static const Conv2dNeonFunction selector[5][2] = {
-      {
-          Conv2dNeonK1x1S1,
-          nullptr
-      },
-      {
-          nullptr,
-          nullptr
-      },
-      {
-          Conv2dNeonK3x3S1,
-          nullptr
-      },
-      {
-          nullptr,
-          nullptr
-      },
-      {
-          Conv2dNeonK5x5S1,
-          nullptr
-      }
-  };
+      {Conv2dNeonK1x1S1, nullptr},
+      {nullptr, nullptr},
+      {Conv2dNeonK3x3S1, nullptr},
+      {nullptr, nullptr},
+      {Conv2dNeonK5x5S1, nullptr}};
   // not implement yet
   index_t kernel_h = filter_shape[2];
   index_t kernel_w = filter_shape[3];
-  if (kernel_h != kernel_w || kernel_h > 5 ||
-      strides_[0] != strides_[1] || strides_[0] > 2 ||
-      dilations_[0] != 1 || dilations_[1] != 1 ||
+  if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
+      strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
       selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
     LOG(WARNING) << "NEON conv2d kernel not implementated, using slow vesion";
     Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
-        input,
-        input_shape,
-        filter,
-        filter_shape,
-        bias,
-        output,
-        output_shape
-    );
+        input, input_shape, filter, filter_shape, bias, output, output_shape);
     return;
   }
 
@@ -87,13 +64,8 @@ void Conv2dFunctor<DeviceType::NEON,
     input_shape = padded_input.shape().data();
   }
   auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
-  conv2d_neon_func(input,
-                   input_shape,
-                   filter,
-                   bias,
-                   output,
-                   output_shape);
+  conv2d_neon_func(input, input_shape, filter, bias, output, output_shape);
 }
 
-} //  namespace kernels
-} //  namespace mace
+}  //  namespace kernels
+}  //  namespace mace
diff --git a/mace/kernels/neon/conv_2d_neon_1x1.cc b/mace/kernels/neon/conv_2d_neon_1x1.cc
index 86b7b328..59cd101b 100644
--- a/mace/kernels/neon/conv_2d_neon_1x1.cc
+++ b/mace/kernels/neon/conv_2d_neon_1x1.cc
@@ -8,25 +8,24 @@
 namespace mace {
 namespace kernels {
 
-void Conv2dNeonK1x1S1(const float* input, // NCHW
+void Conv2dNeonK1x1S1(const float* input,  // NCHW
                       const index_t* input_shape,
-                      const float* filter, // c_out, c_in, kernel_h, kernel_w
-                      const float* bias, // c_out
-                      float* output, // NCHW
+                      const float* filter,  // c_out, c_in, kernel_h, kernel_w
+                      const float* bias,    // c_out
+                      float* output,        // NCHW
                       const index_t* output_shape) {
-  const index_t batch    = output_shape[0];
+  const index_t batch = output_shape[0];
   const index_t channels = output_shape[1];
-  const index_t height   = output_shape[2];
-  const index_t width    = output_shape[3];
+  const index_t height = output_shape[2];
+  const index_t width = output_shape[3];
 
-  const index_t input_batch    = input_shape[0];
+  const index_t input_batch = input_shape[0];
   const index_t input_channels = input_shape[1];
-  const index_t input_height   = input_shape[2];
-  const index_t input_width    = input_shape[3];
+  const index_t input_height = input_shape[2];
+  const index_t input_width = input_shape[3];
 
-  MACE_CHECK(input_batch  == batch &&
-             input_height == height &&
-             input_width  == width);
+  MACE_CHECK(input_batch == batch && input_height == height &&
+             input_width == width);
 
   const index_t total_pixels = height * width;
   // Process 4 * 2 = 8 pixels for each innermost loop
@@ -37,17 +36,18 @@ void Conv2dNeonK1x1S1(const float* input, // NCHW
   // benchmark omp collapsed(2)
   for (index_t n = 0; n < batch; ++n) {
     const float* filter_ptr = filter;
-    #pragma omp parallel for
+#pragma omp parallel for
     for (index_t c = 0; c < channels; ++c) {
       // TODO Will GCC opt these out?
       float* channel_output_start =
-        output + n * channels * height * width + c * height * width;
-      const float* input_ptr = input + n * input_channels * input_height * input_width;
+          output + n * channels * height * width + c * height * width;
+      const float* input_ptr =
+          input + n * input_channels * input_height * input_width;
 
       // Fill with bias
       float* output_ptr = channel_output_start;
       for (index_t ptr = 0; ptr < total_pixels; ++ptr) {
-        output_ptr[ptr] = bias[c]; // TODO can we avoid this?
+        output_ptr[ptr] = bias[c];  // TODO can we avoid this?
       }
 
       index_t inc = 0;
@@ -55,15 +55,14 @@ void Conv2dNeonK1x1S1(const float* input, // NCHW
       for (; inc + 3 < input_channels; inc += 4) {
         float* output_ptr = channel_output_start;
         // The begining of each input feature map channel
-        MACE_ASSERT(input_ptr == input + n * input_channels *
-                                         input_height * input_width +
-                                 inc * input_height * input_width);
+        MACE_ASSERT(input_ptr ==
+                    input + n * input_channels * input_height * input_width +
+                        inc * input_height * input_width);
 
-        const float* input_ptr1 = input_ptr  + total_pixels;
+        const float* input_ptr1 = input_ptr + total_pixels;
         const float* input_ptr2 = input_ptr1 + total_pixels;
         const float* input_ptr3 = input_ptr2 + total_pixels;
 
-
         // filter is in c_out, c_in, 1, 1 order
         MACE_ASSERT(filter_ptr == filter + c * input_channels + inc);
         const float k0 = filter_ptr[0];
@@ -113,7 +112,7 @@ void Conv2dNeonK1x1S1(const float* input, // NCHW
           vst1q_f32(output_ptr + 4, out4);
 
           output_ptr += 8;
-          input_ptr  += 8;
+          input_ptr += 8;
           input_ptr1 += 8;
           input_ptr2 += 8;
           input_ptr3 += 8;
@@ -121,7 +120,7 @@ void Conv2dNeonK1x1S1(const float* input, // NCHW
         // Process the remaining pixels
         index_t remaining_pixels = loop_remaining;
         for (; remaining_pixels > 0; --remaining_pixels) {
-          const float mul  = *input_ptr  * k0;
+          const float mul = *input_ptr * k0;
           const float mul1 = *input_ptr1 * k1;
           const float mul2 = *input_ptr2 * k2;
           const float mul3 = *input_ptr3 * k3;
@@ -141,9 +140,9 @@ void Conv2dNeonK1x1S1(const float* input, // NCHW
       // Process the remaining channels
       for (; inc < input_channels; ++inc) {
         float* output_ptr = channel_output_start;
-        MACE_ASSERT(input_ptr == input + n * input_channels *
-                                         input_height * input_width +
-                                 inc * input_height * input_width);
+        MACE_ASSERT(input_ptr ==
+                    input + n * input_channels * input_height * input_width +
+                        inc * input_height * input_width);
         MACE_ASSERT(filter_ptr == filter + c * input_channels + inc);
 
         const float k0 = filter_ptr[0];
@@ -166,13 +165,13 @@ void Conv2dNeonK1x1S1(const float* input, // NCHW
           vst1q_f32(output_ptr + 4, out4);
 
           output_ptr += 8;
-          input_ptr  += 8;
+          input_ptr += 8;
         }
         // Process the remaining pixels
         index_t remaining_pixels = loop_remaining;
         for (; remaining_pixels > 0; --remaining_pixels) {
           const float mul = *input_ptr * k0;
-          
+
           *output_ptr += mul;
 
           ++output_ptr;
@@ -183,5 +182,5 @@ void Conv2dNeonK1x1S1(const float* input, // NCHW
   }
 };
 
-} // namespace kernels
-} // namespace mace
+}  // namespace kernels
+}  // namespace mace
diff --git a/mace/kernels/neon/conv_2d_neon_3x3.cc b/mace/kernels/neon/conv_2d_neon_3x3.cc
index 8ba5e82d..3853a5b5 100644
--- a/mace/kernels/neon/conv_2d_neon_3x3.cc
+++ b/mace/kernels/neon/conv_2d_neon_3x3.cc
@@ -10,78 +10,81 @@ namespace kernels {
 
 static const int kRegisterSize = 4;
 
-void Conv2dNeonK3x3S1(const float* input, // NCHW
-                       const index_t* input_shape,
-                       const float* filter, // c_out, c_in, kernel_h, kernel_w
-                       const float* bias, // c_out
-                       float* output, // NCHW
-                       const index_t* output_shape) {
-
-  int batch    = output_shape[0];
+void Conv2dNeonK3x3S1(const float* input,  // NCHW
+                      const index_t* input_shape,
+                      const float* filter,  // c_out, c_in, kernel_h, kernel_w
+                      const float* bias,    // c_out
+                      float* output,        // NCHW
+                      const index_t* output_shape) {
+  int batch = output_shape[0];
   int channels = output_shape[1];
-  int height   = output_shape[2];
-  int width    = output_shape[3];
+  int height = output_shape[2];
+  int width = output_shape[3];
 
-  int input_batch    = input_shape[0];
+  int input_batch = input_shape[0];
   int input_channels = input_shape[1];
-  int input_height   = input_shape[2];
-  int input_width    = input_shape[3];
+  int input_height = input_shape[2];
+  int input_width = input_shape[3];
 
   int kernel_h = 3;
-  int kernel_w  = 3;
+  int kernel_w = 3;
 
   int height_count = (height >> 1) << 1;
   for (int b = 0; b < batch; ++b) {
     float* output_ptr_base = output + b * channels * height * width;
     for (int oc = 0; oc < channels; ++oc) {
-      const float* filter_ptr = filter + oc * input_channels * kernel_h * kernel_w;
-      const float* input_ptr = input + b * input_channels * input_height * input_width;
+      const float* filter_ptr =
+          filter + oc * input_channels * kernel_h * kernel_w;
+      const float* input_ptr =
+          input + b * input_channels * input_height * input_width;
       float* output_ptr = output_ptr_base + oc * height * width;
 
       std::fill(output_ptr, output_ptr + height * width, bias[oc]);
       for (int ic = 0; ic < input_channels; ++ic) {
         float32x4_t filter0 = vld1q_f32(filter_ptr);
-        float32x4_t filter3 = vld1q_f32(filter_ptr+3);
-        float32x4_t filter6 = vld1q_f32(filter_ptr+6);
+        float32x4_t filter3 = vld1q_f32(filter_ptr + 3);
+        float32x4_t filter6 = vld1q_f32(filter_ptr + 6);
 
-        const float* row[kRegisterSize] = {
-                input_ptr, input_ptr + input_width,
-                input_ptr + 2 * input_width, input_ptr + 3 * input_width
-        };
+        const float* row[kRegisterSize] = {input_ptr, input_ptr + input_width,
+                                           input_ptr + 2 * input_width,
+                                           input_ptr + 3 * input_width};
 
         float* output_ptr1 = output_ptr;
         float* output_ptr2 = output_ptr + width;
 
         for (int h = 0; h < height_count; h += 2) {
-
           int count = width >> 2;
           int remain_count = width & 3;
 
           for (; count > 0; --count) {
             float32x4_t sum0 = vdupq_n_f32(.0f);
             float32x4_t sum1 = vdupq_n_f32(.0f);
-            float32x4_t row0_ext_0 = vld1q_f32(row[0]); //0123
-            float32x4_t row0_latter = vld1q_f32(row[0] + kRegisterSize); //4567
-            float32x4_t row0_ext_1 = vextq_f32(row0_ext_0, row0_latter, 1); //1234
-            float32x4_t row0_ext_2 = vextq_f32(row0_ext_0, row0_latter, 2); //2345
+            float32x4_t row0_ext_0 = vld1q_f32(row[0]);  // 0123
+            float32x4_t row0_latter = vld1q_f32(row[0] + kRegisterSize);  // 4567
+            float32x4_t row0_ext_1 =
+                vextq_f32(row0_ext_0, row0_latter, 1);  // 1234
+            float32x4_t row0_ext_2 =
+                vextq_f32(row0_ext_0, row0_latter, 2);  // 2345
 
             sum0 = vfmaq_laneq_f32(sum0, row0_ext_0, filter0, 0);
             sum0 = vfmaq_laneq_f32(sum0, row0_ext_1, filter0, 1);
             sum0 = vfmaq_laneq_f32(sum0, row0_ext_2, filter0, 2);
 
-            float32x4_t row1_ext_0 = vld1q_f32(row[1]); //0123
-            float32x4_t row1_latter = vld1q_f32(row[1] + kRegisterSize); //4567
-            float32x4_t row1_ext_1 = vextq_f32(row1_ext_0, row1_latter, 1); //1234
-            float32x4_t row1_ext_2 = vextq_f32(row1_ext_0, row1_latter, 2); //2345
+            float32x4_t row1_ext_0 = vld1q_f32(row[1]);  // 0123
+            float32x4_t row1_latter = vld1q_f32(row[1] + kRegisterSize);  // 4567
+            float32x4_t row1_ext_1 =
+                vextq_f32(row1_ext_0, row1_latter, 1);  // 1234
+            float32x4_t row1_ext_2 =
+                vextq_f32(row1_ext_0, row1_latter, 2);  // 2345
 
             sum0 = vfmaq_laneq_f32(sum0, row1_ext_0, filter3, 0);
             sum0 = vfmaq_laneq_f32(sum0, row1_ext_1, filter3, 1);
             sum0 = vfmaq_laneq_f32(sum0, row1_ext_2, filter3, 2);
 
-            row0_ext_0 = vld1q_f32(row[2]); //0123
-            row0_latter = vld1q_f32(row[2] + kRegisterSize); //4567
-            row0_ext_1 = vextq_f32(row0_ext_0, row0_latter, 1); //1234
-            row0_ext_2 = vextq_f32(row0_ext_0, row0_latter, 2); //2345
+            row0_ext_0 = vld1q_f32(row[2]);                      // 0123
+            row0_latter = vld1q_f32(row[2] + kRegisterSize);     // 4567
+            row0_ext_1 = vextq_f32(row0_ext_0, row0_latter, 1);  // 1234
+            row0_ext_2 = vextq_f32(row0_ext_0, row0_latter, 2);  // 2345
 
             sum0 = vfmaq_laneq_f32(sum0, row0_ext_0, filter6, 0);
             sum0 = vfmaq_laneq_f32(sum0, row0_ext_1, filter6, 1);
@@ -96,10 +99,10 @@ void Conv2dNeonK3x3S1(const float* input, // NCHW
             sum1 = vfmaq_laneq_f32(sum1, row0_ext_1, filter3, 1);
             sum1 = vfmaq_laneq_f32(sum1, row0_ext_2, filter3, 2);
 
-            row1_ext_0 = vld1q_f32(row[3]); //0123
-            row1_latter = vld1q_f32(row[3] + kRegisterSize); //4567
-            row1_ext_1 = vextq_f32(row1_ext_0, row1_latter, 1); //1234
-            row1_ext_2 = vextq_f32(row1_ext_0, row1_latter, 2); //2345
+            row1_ext_0 = vld1q_f32(row[3]);                      // 0123
+            row1_latter = vld1q_f32(row[3] + kRegisterSize);     // 4567
+            row1_ext_1 = vextq_f32(row1_ext_0, row1_latter, 1);  // 1234
+            row1_ext_2 = vextq_f32(row1_ext_0, row1_latter, 2);  // 2345
 
             sum1 = vfmaq_laneq_f32(sum1, row1_ext_0, filter6, 0);
             sum1 = vfmaq_laneq_f32(sum1, row1_ext_1, filter6, 1);
@@ -114,15 +117,15 @@ void Conv2dNeonK3x3S1(const float* input, // NCHW
 
             output_ptr1 += kRegisterSize;
             output_ptr2 += kRegisterSize;
-            for(int i = 0; i < kRegisterSize; ++i) {
+            for (int i = 0; i < kRegisterSize; ++i) {
               row[i] += kRegisterSize;
             }
           }
           for (; remain_count > 0; --remain_count) {
-            float32x4_t row0 = vld1q_f32(row[0]); //0123
-            float32x4_t row1 = vld1q_f32(row[1]); //0123
-            float32x4_t row2 = vld1q_f32(row[2]); //0123
-            float32x4_t row3 = vld1q_f32(row[3]); //0123
+            float32x4_t row0 = vld1q_f32(row[0]);  // 0123
+            float32x4_t row1 = vld1q_f32(row[1]);  // 0123
+            float32x4_t row2 = vld1q_f32(row[2]);  // 0123
+            float32x4_t row3 = vld1q_f32(row[3]);  // 0123
 
             float32x4_t sum = vmulq_f32(row0, filter0);
             sum = vmlaq_f32(sum, row1, filter3);
@@ -138,13 +141,13 @@ void Conv2dNeonK3x3S1(const float* input, // NCHW
 
             ++output_ptr1;
             ++output_ptr2;
-            for(int i = 0; i < kRegisterSize; ++i) {
+            for (int i = 0; i < kRegisterSize; ++i) {
               row[i] += 1;
             }
           }
           output_ptr1 += width;
           output_ptr2 += width;
-          for(int i = 0; i < kRegisterSize; ++i) {
+          for (int i = 0; i < kRegisterSize; ++i) {
             row[i] += 2 + input_width;
           }
         }
@@ -152,30 +155,34 @@ void Conv2dNeonK3x3S1(const float* input, // NCHW
         if (height != height_count) {
           int count = width >> 2;
           int remain_count = width & 3;
-          for(; count > 0; --count) {
+          for (; count > 0; --count) {
             float32x4_t sum0 = vdupq_n_f32(.0f);
-            float32x4_t row0_ext_0 = vld1q_f32(row[0]); //0123
-            float32x4_t row0_latter = vld1q_f32(row[0] + kRegisterSize); //4567
-            float32x4_t row0_ext_1 = vextq_f32(row0_ext_0, row0_latter, 1); //1234
-            float32x4_t row0_ext_2 = vextq_f32(row0_ext_0, row0_latter, 2); //2345
+            float32x4_t row0_ext_0 = vld1q_f32(row[0]);  // 0123
+            float32x4_t row0_latter = vld1q_f32(row[0] + kRegisterSize);  // 4567
+            float32x4_t row0_ext_1 =
+                vextq_f32(row0_ext_0, row0_latter, 1);  // 1234
+            float32x4_t row0_ext_2 =
+                vextq_f32(row0_ext_0, row0_latter, 2);  // 2345
 
             sum0 = vfmaq_laneq_f32(sum0, row0_ext_0, filter0, 0);
             sum0 = vfmaq_laneq_f32(sum0, row0_ext_1, filter0, 1);
             sum0 = vfmaq_laneq_f32(sum0, row0_ext_2, filter0, 2);
 
-            float32x4_t row1_ext_0 = vld1q_f32(row[1]); //0123
-            float32x4_t row1_latter = vld1q_f32(row[1] + kRegisterSize); //4567
-            float32x4_t row1_ext_1 = vextq_f32(row1_ext_0, row1_latter, 1); //1234
-            float32x4_t row1_ext_2 = vextq_f32(row1_ext_0, row1_latter, 2); //2345
+            float32x4_t row1_ext_0 = vld1q_f32(row[1]);  // 0123
+            float32x4_t row1_latter = vld1q_f32(row[1] + kRegisterSize);  // 4567
+            float32x4_t row1_ext_1 =
+                vextq_f32(row1_ext_0, row1_latter, 1);  // 1234
+            float32x4_t row1_ext_2 =
+                vextq_f32(row1_ext_0, row1_latter, 2);  // 2345
 
             sum0 = vfmaq_laneq_f32(sum0, row1_ext_0, filter3, 0);
             sum0 = vfmaq_laneq_f32(sum0, row1_ext_1, filter3, 1);
             sum0 = vfmaq_laneq_f32(sum0, row1_ext_2, filter3, 2);
 
-            row0_ext_0 = vld1q_f32(row[2]); //0123
-            row0_latter = vld1q_f32(row[2] + kRegisterSize); //4567
-            row0_ext_1 = vextq_f32(row0_ext_0, row0_latter, 1); //1234
-            row0_ext_2 = vextq_f32(row0_ext_0, row0_latter, 2); //2345
+            row0_ext_0 = vld1q_f32(row[2]);                      // 0123
+            row0_latter = vld1q_f32(row[2] + kRegisterSize);     // 4567
+            row0_ext_1 = vextq_f32(row0_ext_0, row0_latter, 1);  // 1234
+            row0_ext_2 = vextq_f32(row0_ext_0, row0_latter, 2);  // 2345
 
             sum0 = vfmaq_laneq_f32(sum0, row0_ext_0, filter6, 0);
             sum0 = vfmaq_laneq_f32(sum0, row0_ext_1, filter6, 1);
@@ -185,14 +192,14 @@ void Conv2dNeonK3x3S1(const float* input, // NCHW
             output_row0 = vaddq_f32(output_row0, sum0);
             vst1q_f32(output_ptr1, output_row0);
             output_ptr1 += kRegisterSize;
-            for(int i = 0; i < 3; ++i) {
+            for (int i = 0; i < 3; ++i) {
               row[i] += kRegisterSize;
             }
           }
           for (; remain_count > 0; --remain_count) {
-            float32x4_t row0 = vld1q_f32(row[0]); //0123
-            float32x4_t row1 = vld1q_f32(row[1]); //0123
-            float32x4_t row2 = vld1q_f32(row[2]); //0123
+            float32x4_t row0 = vld1q_f32(row[0]);  // 0123
+            float32x4_t row1 = vld1q_f32(row[1]);  // 0123
+            float32x4_t row2 = vld1q_f32(row[2]);  // 0123
 
             float32x4_t sum = vmulq_f32(row0, filter0);
             sum = vmlaq_f32(sum, row1, filter3);
@@ -201,7 +208,7 @@ void Conv2dNeonK3x3S1(const float* input, // NCHW
             *output_ptr1 = vaddvq_f32(sum);
 
             ++output_ptr1;
-            for(int i = 0; i < 3; ++i) {
+            for (int i = 0; i < 3; ++i) {
               row[i] += 1;
             }
           }
@@ -213,5 +220,5 @@ void Conv2dNeonK3x3S1(const float* input, // NCHW
   }
 }
 
-} //  namespace kernels
-} //  namespace mace
+}  //  namespace kernels
+}  //  namespace mace
diff --git a/mace/kernels/neon/conv_2d_neon_5x5.cc b/mace/kernels/neon/conv_2d_neon_5x5.cc
index 693f1241..0e926eb2 100644
--- a/mace/kernels/neon/conv_2d_neon_5x5.cc
+++ b/mace/kernels/neon/conv_2d_neon_5x5.cc
@@ -10,11 +10,11 @@
 namespace mace {
 namespace kernels {
 
-void Conv2dNeonK5x5S1(const float* input, // NCHW
+void Conv2dNeonK5x5S1(const float* input,  // NCHW
                       const index_t* input_shape,
-                      const float* filter, // c_out, c_in, kernel_h, kernel_w
-                      const float* bias, // c_out
-                      float* output, // NCHW
+                      const float* filter,  // c_out, c_in, kernel_h, kernel_w
+                      const float* bias,    // c_out
+                      float* output,        // NCHW
                       const index_t* output_shape) {
   const index_t batch = output_shape[0];
   const index_t channels = output_shape[1];
@@ -30,17 +30,17 @@ void Conv2dNeonK5x5S1(const float* input, // NCHW
 
   const index_t input_total_pixels_per_channel = input_height * input_width;
   const index_t output_total_pixels_per_channel = height * width;
-  const index_t input_total_pixels_per_batch = input_total_pixels_per_channel
-      * input_channels;
-  const index_t output_total_pixels_per_batch = output_total_pixels_per_channel
-      * channels;
+  const index_t input_total_pixels_per_batch =
+      input_total_pixels_per_channel * input_channels;
+  const index_t output_total_pixels_per_batch =
+      output_total_pixels_per_channel * channels;
   const index_t patch_size = input_channels * 25;
 
 #pragma omp parallel for collapse(2)
   for (index_t n = 0; n < batch; ++n) {
     for (index_t c = 0; c < channels; ++c) {
-      float* output_ptr = output + n * output_total_pixels_per_batch
-          + c * output_total_pixels_per_channel;
+      float* output_ptr = output + n * output_total_pixels_per_batch +
+                          c * output_total_pixels_per_channel;
       const float* input_ptr = input + n * input_total_pixels_per_batch;
 
       // Fill with bias
@@ -53,7 +53,7 @@ void Conv2dNeonK5x5S1(const float* input, // NCHW
         float* outptr2 = outptr + width;
 
         const float* inptr = input_ptr + inc * input_total_pixels_per_channel;
-        const float* filter_ptr = filter + c * patch_size  + inc * 25;
+        const float* filter_ptr = filter + c * patch_size + inc * 25;
 
         const float* r0 = inptr;
         const float* r1 = inptr + input_width;
@@ -246,8 +246,8 @@ void Conv2dNeonK5x5S1(const float* input, // NCHW
             sum2 = r5[4] * k4[4];
 
             float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
-            float32x2_t
-                _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
+            float32x2_t _ss2 =
+                vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
             float32x2_t _ss_ss2 = vpadd_f32(_ss, _ss2);
 
             sum += vget_lane_f32(_ss_ss2, 0);
@@ -414,7 +414,7 @@ void Conv2dNeonK5x5S1(const float* input, // NCHW
   }
 }
 
-} //  namespace kernels
-} //  namespace mace
+}  //  namespace kernels
+}  //  namespace mace
 
-#endif //  MACE_KERNELS_NEON_CONV_2D_NEON_5X5_H_
+#endif  //  MACE_KERNELS_NEON_CONV_2D_NEON_5X5_H_
diff --git a/mace/kernels/neon/max_pooling_neon_2x2.cc b/mace/kernels/neon/max_pooling_neon_2x2.cc
index 088ea467..3be9fa28 100644
--- a/mace/kernels/neon/max_pooling_neon_2x2.cc
+++ b/mace/kernels/neon/max_pooling_neon_2x2.cc
@@ -2,19 +2,17 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
+#include <arm_neon.h>
 #include <float.h>
 #include <limits>
-#include <arm_neon.h>
 
 #include "mace/core/common.h"
 
 namespace mace {
 namespace kernels {
 
-void PoolingMaxNeonK2x2S2x2(const float *input,
-                            const index_t *in_shape,
-                            float *output,
-                            const index_t *out_shape,
+void PoolingMaxNeonK2x2S2x2(const float *input, const index_t *in_shape,
+                            float *output, const index_t *out_shape,
                             const int *paddings) {
   index_t batch = in_shape[0];
   index_t channels = in_shape[1];
@@ -44,7 +42,7 @@ void PoolingMaxNeonK2x2S2x2(const float *input,
         int w = 0;
         int num_vectors = 0;
         if (!((h == 0 && padding_top > 0) ||
-            (h == out_height - 1 && padding_bottom > 0))) {
+              (h == out_height - 1 && padding_bottom > 0))) {
           r0 = input + input_offset + (h * 2 - padding_top) * in_width;
           r1 = r0 + in_width;
           if (padding_left > 0) {
@@ -86,8 +84,7 @@ void PoolingMaxNeonK2x2S2x2(const float *input,
             for (int kw = 0; kw < 2; ++kw) {
               int inh = h * 2 - padding_top + kh;
               int inw = w * 2 - padding_left + kw;
-              if (inh >= 0 && inh < in_height &&
-                  inw >= 0 && inw < in_width) {
+              if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
                 max = std::max(max, input[input_offset + inh * in_width + inw]);
               }
             }
@@ -104,10 +101,8 @@ void PoolingMaxNeonK2x2S2x2(const float *input,
 }
 
 // assume the input has already been padded
-void PoolingMaxNeonK2x2S2x2Padded(const float *input,
-                                  const index_t *in_shape,
-                                  float *output,
-                                  const index_t *out_shape) {
+void PoolingMaxNeonK2x2S2x2Padded(const float *input, const index_t *in_shape,
+                                  float *output, const index_t *out_shape) {
   index_t batch = in_shape[0];
   index_t channels = in_shape[1];
   index_t in_height = in_shape[2];
diff --git a/mace/kernels/neon/max_pooling_neon_3x3.cc b/mace/kernels/neon/max_pooling_neon_3x3.cc
index 045ce7b0..129b4df2 100644
--- a/mace/kernels/neon/max_pooling_neon_3x3.cc
+++ b/mace/kernels/neon/max_pooling_neon_3x3.cc
@@ -2,19 +2,17 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
+#include <arm_neon.h>
 #include <float.h>
 #include <limits>
-#include <arm_neon.h>
 
 #include "mace/core/common.h"
 
 namespace mace {
 namespace kernels {
 
-void PoolingMaxNeonK3x3S2x2(const float *input,
-                            const index_t *in_shape,
-                            float *output,
-                            const index_t *out_shape,
+void PoolingMaxNeonK3x3S2x2(const float *input, const index_t *in_shape,
+                            float *output, const index_t *out_shape,
                             const int *paddings) {
   index_t batch = in_shape[0];
   index_t channels = in_shape[1];
@@ -44,7 +42,7 @@ void PoolingMaxNeonK3x3S2x2(const float *input,
         int num_vectors = 0;
         const float *r0, *r1, *r2;
         if (!((h == 0 && padding_top > 0) ||
-            (h == out_height - 1 && padding_bottom > 0))) {
+              (h == out_height - 1 && padding_bottom > 0))) {
           r0 = input + input_offset + (h * 2 - padding_top) * in_width;
           r1 = r0 + in_width;
           r2 = r1 + in_width;
@@ -112,8 +110,7 @@ void PoolingMaxNeonK3x3S2x2(const float *input,
             for (int kw = 0; kw < 3; ++kw) {
               int inh = h * 2 - padding_top + kh;
               int inw = w * 2 - padding_left + kw;
-              if (inh >= 0 && inh < in_height &&
-                  inw >= 0 && inw < in_width) {
+              if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
                 max = std::max(max, input[input_offset + inh * in_width + inw]);
               }
             }
@@ -130,10 +127,8 @@ void PoolingMaxNeonK3x3S2x2(const float *input,
 }
 
 // assume the input has already been padded
-void PoolingMaxNeonK3x3S2x2Padded(const float *input,
-                                  const index_t *in_shape,
-                                  float *output,
-                                  const index_t *out_shape) {
+void PoolingMaxNeonK3x3S2x2Padded(const float *input, const index_t *in_shape,
+                                  float *output, const index_t *out_shape) {
   index_t batch = in_shape[0];
   index_t channels = in_shape[1];
   index_t in_height = in_shape[2];
@@ -218,5 +213,5 @@ void PoolingMaxNeonK3x3S2x2Padded(const float *input,
   }
 }
 
-} // namespace kernels
-} // namespace mace
+}  // namespace kernels
+}  // namespace mace
diff --git a/mace/kernels/neon/pooling_neon.cc b/mace/kernels/neon/pooling_neon.cc
index 33d76341..bc6b1952 100644
--- a/mace/kernels/neon/pooling_neon.cc
+++ b/mace/kernels/neon/pooling_neon.cc
@@ -2,45 +2,36 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
-#include <arm_neon.h>
 #include "mace/kernels/pooling.h"
+#include <arm_neon.h>
 #include "mace/kernels/conv_pool_2d_util.h"
 
 namespace mace {
 namespace kernels {
 
-extern void PoolingMaxNeonK2x2S2x2(const float *input,
-                                   const index_t *in_shape,
-                                   float *output,
-                                   const index_t *out_shape,
+extern void PoolingMaxNeonK2x2S2x2(const float *input, const index_t *in_shape,
+                                   float *output, const index_t *out_shape,
                                    const int *paddings);
 
-extern void PoolingMaxNeonK3x3S2x2(const float *input,
-                                   const index_t *in_shape,
-                                   float *output,
-                                   const index_t *out_shape,
+extern void PoolingMaxNeonK3x3S2x2(const float *input, const index_t *in_shape,
+                                   float *output, const index_t *out_shape,
                                    const int *paddings);
 
 #ifdef __COPY_MAKE_PADDING
-extern void PoolingMaxNeonK2x2S2x2Padded(const float* input,
-                                  const index_t* in_shape,
-                                  float* output,
-                                  const index_t* out_shape);
-extern void PoolingMaxNeonK3x3S2x2Padded(const float* input,
-                                  const index_t* in_shape,
-                                  float* output,
-                                  const index_t* out_shape);
+extern void PoolingMaxNeonK2x2S2x2Padded(const float *input,
+                                         const index_t *in_shape, float *output,
+                                         const index_t *out_shape);
+extern void PoolingMaxNeonK3x3S2x2Padded(const float *input,
+                                         const index_t *in_shape, float *output,
+                                         const index_t *out_shape);
 #endif
 
-template<>
+template <>
 void PoolingFunctor<DeviceType::NEON, float>::operator()(
-    const float *input,
-    const index_t *input_shape,
-    float *output,
+    const float *input, const index_t *input_shape, float *output,
     const index_t *output_shape) {
-  if (kernels_[0] == 2 && kernels_[1] == 2 &&
-      strides_[0] == 2 && strides_[1] == 2 &&
-      pooling_type_ == MAX) {
+  if (kernels_[0] == 2 && kernels_[1] == 2 && strides_[0] == 2 &&
+      strides_[1] == 2 && pooling_type_ == MAX) {
 #ifdef __COPY_MAKE_PADDING
     Tensor padded_input;
     ConstructInputWithPadding(input, input_shape, paddings_, &padded_input);
@@ -50,9 +41,8 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
 #else
     PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape, paddings_);
 #endif
-  } else if (kernels_[0] == 3 && kernels_[1] == 3 &&
-      strides_[0] == 2 && strides_[1] == 2 &&
-      pooling_type_ == MAX) {
+  } else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 &&
+             strides_[1] == 2 && pooling_type_ == MAX) {
 #ifdef __COPY_MAKE_PADDING
     Tensor padded_input;
     ConstructInputWithPadding(input, input_shape, paddings_, &padded_input);
@@ -65,13 +55,9 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
   } else {  // not implement yet
     PoolingFunctor<DeviceType::CPU, float>(pooling_type_, kernels_, strides_,
                                            paddings_, dilations_)(
-        input,
-        input_shape,
-        output,
-        output_shape
-    );
+        input, input_shape, output, output_shape);
   }
 }
 
-} //  namespace kernels
-} //  namespace mace
\ No newline at end of file
+}  //  namespace kernels
+}  //  namespace mace
\ No newline at end of file
diff --git a/mace/kernels/neon/relu_neon.cc b/mace/kernels/neon/relu_neon.cc
index 19ae6332..b03b8960 100644
--- a/mace/kernels/neon/relu_neon.cc
+++ b/mace/kernels/neon/relu_neon.cc
@@ -2,17 +2,17 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
-#include <arm_neon.h>
 #include "mace/kernels/relu.h"
+#include <arm_neon.h>
 
 namespace mace {
 namespace kernels {
 
 template <>
 void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
-                                                float *output,
-                                                index_t size) {
-#pragma omp parallel for num_threads(1) // no significant performance improve
+                                                      float *output,
+                                                      index_t size) {
+#pragma omp parallel for num_threads(1)  // no significant performance improve
   for (int64_t i = 0; i < size; i += kCostPerGroup) {
     int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
     int nn = count >> 2;
@@ -36,6 +36,5 @@ void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
   }
 };
 
-
-} // namespace kernels
-} // namespace mace
\ No newline at end of file
+}  // namespace kernels
+}  // namespace mace
\ No newline at end of file
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index b8a1bdd7..b40c2c1f 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -11,29 +11,24 @@
 namespace mace {
 
 enum PoolingType {
-  AVG = 1, // avg_pool
-  MAX = 2, // max_pool
+  AVG = 1,  // avg_pool
+  MAX = 2,  // max_pool
 };
 
 namespace kernels {
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 class PoolingFunctor {
  public:
-  PoolingFunctor(const PoolingType pooling_type,
-                 const int *kernels,
-                 const int *strides,
-                 const int *paddings,
-                 const int *dilations)
+  PoolingFunctor(const PoolingType pooling_type, const int *kernels,
+                 const int *strides, const int *paddings, const int *dilations)
       : pooling_type_(pooling_type),
         kernels_(kernels),
         strides_(strides),
         paddings_(paddings),
         dilations_(dilations) {}
 
-  void operator()(const T *input,
-                  const index_t *input_shape,
-                  T *output,
+  void operator()(const T *input, const index_t *input_shape, T *output,
                   const index_t *output_shape) {
     index_t batch = output_shape[0];
     index_t channels = output_shape[1];
@@ -60,32 +55,31 @@ class PoolingFunctor {
 #pragma omp parallel for collapse(2)
     for (int n = 0; n < batch; ++n) {
       for (int c = 0; c < channels; ++c) {
-        index_t out_offset = n * channels * height * width +
-            c * height * width;
+        index_t out_offset = n * channels * height * width + c * height * width;
         index_t in_offset = n * input_channels * input_height * input_width +
-            c * input_height * input_width;
+                            c * input_height * input_width;
         for (int h = 0; h < height; ++h) {
           for (int w = 0; w < width; ++w) {
             T sum_or_max = 0;
             switch (pooling_type_) {
-              case AVG:break;
-              case MAX:sum_or_max = std::numeric_limits<T>::lowest();
+              case AVG:
+                break;
+              case MAX:
+                sum_or_max = std::numeric_limits<T>::lowest();
                 break;
               default:
-                MACE_CHECK(false,
-                           "Unsupported pooling type: ",
-                           pooling_type_);
+                MACE_CHECK(false, "Unsupported pooling type: ", pooling_type_);
             }
             for (int kh = 0; kh < kernel_h; ++kh) {
               for (int kw = 0; kw < kernel_w; ++kw) {
                 int inh = padded_h_start + h * stride_h + dilation_h * kh;
                 int inw = padded_w_start + w * stride_w + dilation_w * kw;
-                if (inh >= 0 && inh < input_height &&
-                    inw >= 0 && inw < input_width) {
-                  index_t input_offset = in_offset +
-                      inh * input_width + inw;
+                if (inh >= 0 && inh < input_height && inw >= 0 &&
+                    inw < input_width) {
+                  index_t input_offset = in_offset + inh * input_width + inw;
                   switch (pooling_type_) {
-                    case AVG:sum_or_max += input[input_offset];
+                    case AVG:
+                      sum_or_max += input[input_offset];
                       break;
                     case MAX:
                       sum_or_max = std::max(sum_or_max, input[input_offset]);
@@ -98,14 +92,14 @@ class PoolingFunctor {
               }
             }
             switch (pooling_type_) {
-              case AVG:output[out_offset] = sum_or_max / (kernel_h * kernel_w);
+              case AVG:
+                output[out_offset] = sum_or_max / (kernel_h * kernel_w);
                 break;
-              case MAX:output[out_offset] = sum_or_max;
+              case MAX:
+                output[out_offset] = sum_or_max;
                 break;
               default:
-                MACE_CHECK(false,
-                           "Unsupported pooling type: ",
-                           pooling_type_);
+                MACE_CHECK(false, "Unsupported pooling type: ", pooling_type_);
             }
             out_offset += 1;
           }
@@ -122,14 +116,12 @@ class PoolingFunctor {
   const int *dilations_;
 };
 
-template<>
+template <>
 void PoolingFunctor<DeviceType::NEON, float>::operator()(
-    const float *input,
-    const index_t *input_shape,
-    float *output,
+    const float *input, const index_t *input_shape, float *output,
     const index_t *output_shape);
 
-} //  namespace kernels
-} //  namespace mace
+}  //  namespace kernels
+}  //  namespace mace
 
-#endif //MACE_KERNELS_POOLING_H
+#endif  // MACE_KERNELS_POOLING_H
diff --git a/mace/kernels/relu.h b/mace/kernels/relu.h
index 8eed29a9..79788f03 100644
--- a/mace/kernels/relu.h
+++ b/mace/kernels/relu.h
@@ -10,7 +10,7 @@
 namespace mace {
 namespace kernels {
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 struct ReluFunctor {
   void operator()(const T *input, T *output, index_t size) {
     for (index_t i = 0; i < size; ++i) {
@@ -24,7 +24,7 @@ void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
                                                       float *output,
                                                       index_t size);
 
-} //  namespace kernels
-} //  namespace mace
+}  //  namespace kernels
+}  //  namespace mace
 
-#endif // MACE_KERNELS_RELU_H_
\ No newline at end of file
+#endif  // MACE_KERNELS_RELU_H_
\ No newline at end of file
diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h
index 4b0df869..1302f9f7 100644
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -22,8 +22,8 @@ struct CachedInterpolation {
 inline float CalculateResizeScale(index_t in_size, index_t out_size,
                                   bool align_corners) {
   return (align_corners && out_size > 1)
-         ? (in_size - 1) / static_cast<float>(out_size - 1)
-         : in_size / static_cast<float>(out_size);
+             ? (in_size - 1) / static_cast<float>(out_size - 1)
+             : in_size / static_cast<float>(out_size);
 }
 
 inline void ComputeInterpolationWeights(const index_t out_size,
@@ -41,21 +41,20 @@ inline void ComputeInterpolationWeights(const index_t out_size,
 }
 
 inline float ComputeLerp(const float top_left, const float top_right,
-                          const float bottom_left, const float bottom_right,
-                          const float x_lerp, const float y_lerp) {
+                         const float bottom_left, const float bottom_right,
+                         const float x_lerp, const float y_lerp) {
   const float top = top_left + (top_right - top_left) * x_lerp;
   const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
   return top + (bottom - top) * y_lerp;
 }
 
-template<typename T>
-void ResizeImage(const T *images,
-                 const index_t batch_size, const index_t in_height,
-                 const index_t in_width, const index_t out_height,
-                 const index_t out_width, const index_t channels,
+template <typename T>
+void ResizeImage(const T *images, const index_t batch_size,
+                 const index_t in_height, const index_t in_width,
+                 const index_t out_height, const index_t out_width,
+                 const index_t channels,
                  const std::vector<CachedInterpolation> &xs_vec,
-                 const std::vector<CachedInterpolation> &ys,
-                 float *output) {
+                 const std::vector<CachedInterpolation> &ys, float *output) {
   const index_t in_channel_size = in_height * in_width;
   const index_t in_batch_num_values = channels * in_channel_size;
   const index_t out_channel_size = out_height * out_width;
@@ -65,10 +64,10 @@ void ResizeImage(const T *images,
 #pragma omp parallel for collapse(2)
   for (index_t b = 0; b < batch_size; ++b) {
     for (index_t c = 0; c < channels; ++c) {
-      const T* input_ptr = images + in_batch_num_values * b
-          + in_channel_size * c;
-      float *output_ptr = output + out_batch_num_values * b
-          + out_channel_size * c;
+      const T *input_ptr =
+          images + in_batch_num_values * b + in_channel_size * c;
+      float *output_ptr =
+          output + out_batch_num_values * b + out_channel_size * c;
       for (index_t y = 0; y < out_height; ++y) {
         const T *ys_input_lower_ptr = input_ptr + ys[y].lower * in_width;
         const T *ys_input_upper_ptr = input_ptr + ys[y].upper * in_width;
@@ -83,9 +82,8 @@ void ResizeImage(const T *images,
           const float bottom_left = ys_input_upper_ptr[xs_lower];
           const float bottom_right = ys_input_upper_ptr[xs_upper];
 
-          output_ptr[x] =
-              ComputeLerp(top_left, top_right, bottom_left, bottom_right,
-                                      xs_lerp, ys_lerp);
+          output_ptr[x] = ComputeLerp(top_left, top_right, bottom_left,
+                                      bottom_right, xs_lerp, ys_lerp);
         }
         output_ptr += out_width;
       }
@@ -94,16 +92,15 @@ void ResizeImage(const T *images,
 }
 }
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 struct ResizeBilinearFunctor {
   bool align_corners_;
 
-  ResizeBilinearFunctor(bool align_corners)
-      : align_corners_(align_corners) {}
+  ResizeBilinearFunctor(bool align_corners) : align_corners_(align_corners) {}
 
-  void operator()(const T *input, T *output,
-                  index_t n, index_t channels, index_t in_height,
-                  index_t in_width, index_t out_height, index_t out_width) {
+  void operator()(const T *input, T *output, index_t n, index_t channels,
+                  index_t in_height, index_t in_width, index_t out_height,
+                  index_t out_width) {
     if (out_height == in_height && out_width == in_width) {
       std::copy(input, input + channels * in_height * in_width, output);
       return;
@@ -111,8 +108,8 @@ struct ResizeBilinearFunctor {
 
     float height_scale =
         CalculateResizeScale(in_height, out_height, align_corners_);
-    float
-        width_scale = CalculateResizeScale(in_width, out_width, align_corners_);
+    float width_scale =
+        CalculateResizeScale(in_width, out_width, align_corners_);
 
     std::vector<CachedInterpolation> ys(out_height + 1);
     std::vector<CachedInterpolation> xs(out_width + 1);
@@ -121,12 +118,12 @@ struct ResizeBilinearFunctor {
     ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data());
     ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data());
 
-    ResizeImage(input, n, in_height, in_width, out_height, out_width,
-                channels, xs, ys, output);
+    ResizeImage(input, n, in_height, in_width, out_height, out_width, channels,
+                xs, ys, output);
   }
 };
 
-} //  namespace kernels
-} //  namespace mace
+}  //  namespace kernels
+}  //  namespace mace
 
-#endif // MACE_KERNELS_RESIZE_BILINEAR_H_
+#endif  // MACE_KERNELS_RESIZE_BILINEAR_H_
diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc
index 766a223e..0598d1cd 100644
--- a/mace/ops/addn.cc
+++ b/mace/ops/addn.cc
@@ -10,6 +10,6 @@ REGISTER_CPU_OPERATOR(AddN, AddNOp<DeviceType::CPU, float>);
 
 #if __ARM_NEON
 REGISTER_NEON_OPERATOR(AddN, AddNOp<DeviceType::NEON, float>);
-#endif // __ARM_NEON
+#endif  // __ARM_NEON
 
-} //  namespace mace
+}  //  namespace mace
diff --git a/mace/ops/addn.h b/mace/ops/addn.h
index c25db759..064be034 100644
--- a/mace/ops/addn.h
+++ b/mace/ops/addn.h
@@ -10,10 +10,10 @@
 
 namespace mace {
 
-template<DeviceType D, class T>
+template <DeviceType D, class T>
 class AddNOp : public Operator<D, T> {
  public:
-  AddNOp(const OperatorDef &operator_def, Workspace *ws)
+  AddNOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<D, T>(operator_def, ws) {}
 
   bool Run() override {
@@ -36,6 +36,6 @@ class AddNOp : public Operator<D, T> {
   kernels::AddNFunctor<D, T> functor_;
 };
 
-} // namespace mace
+}  // namespace mace
 
-#endif // MACE_OPS_ADDN_H_
+#endif  // MACE_OPS_ADDN_H_
diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc
index 8e3f1b29..f7329d1b 100644
--- a/mace/ops/addn_benchmark.cc
+++ b/mace/ops/addn_benchmark.cc
@@ -10,7 +10,6 @@
 namespace mace {
 template <DeviceType D, typename T>
 static void AddNBenchmark(int iters, int n, int size) {
-
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -18,8 +17,7 @@ static void AddNBenchmark(int iters, int n, int size) {
   for (int i = 0; i < n; ++i) {
     op_def_builder.Input(internal::MakeString("Input", i).c_str());
   }
-  op_def_builder.Output("Output")
-      .Finalize(net.operator_def());
+  op_def_builder.Output("Output").Finalize(net.operator_def());
 
   // Add input data
   for (int i = 0; i < n; ++i) {
@@ -32,27 +30,26 @@ static void AddNBenchmark(int iters, int n, int size) {
   }
 
   mace::testing::StartTiming();
-  while(iters--) {
+  while (iters--) {
     net.RunOp(D);
   }
 }
 
-#define BM_ADDN_MACRO(N, SIZE, TYPE, DEVICE)                       \
-  static void BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE(            \
-        int iters) {                                               \
-    const int64_t tot = static_cast<int64_t>(iters) * N * SIZE;    \
-    mace::testing::ItemsProcessed(tot);                            \
-    mace::testing::BytesProcessed(tot * (sizeof(TYPE)));           \
-    AddNBenchmark<DEVICE, TYPE>(iters, N, SIZE);                   \
-  }                                                                \
+#define BM_ADDN_MACRO(N, SIZE, TYPE, DEVICE)                        \
+  static void BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE(int iters) { \
+    const int64_t tot = static_cast<int64_t>(iters) * N * SIZE;     \
+    mace::testing::ItemsProcessed(tot);                             \
+    mace::testing::BytesProcessed(tot*(sizeof(TYPE)));              \
+    AddNBenchmark<DEVICE, TYPE>(iters, N, SIZE);                    \
+  }                                                                 \
   BENCHMARK(BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE)
 
-#define BM_ADDN(N, SIZE, TYPE)        \
-  BM_ADDN_MACRO(N, SIZE, TYPE, CPU);  \
+#define BM_ADDN(N, SIZE, TYPE)       \
+  BM_ADDN_MACRO(N, SIZE, TYPE, CPU); \
   BM_ADDN_MACRO(N, SIZE, TYPE, NEON);
 
 BM_ADDN(10, 1000, float);
 BM_ADDN(10, 10000, float);
 BM_ADDN(100, 1000, float);
 BM_ADDN(100, 10000, float);
-} //  namespace mace
\ No newline at end of file
+}  //  namespace mace
\ No newline at end of file
diff --git a/mace/ops/addn_test.cc b/mace/ops/addn_test.cc
index 453458ff..dd5f906f 100644
--- a/mace/ops/addn_test.cc
+++ b/mace/ops/addn_test.cc
@@ -36,4 +36,4 @@ TEST_F(AddnOpTest, AddnOp) {
   ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.01);
 }
 
-} // namespace mace
+}  // namespace mace
diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc
index 9a48b669..f5b050f1 100644
--- a/mace/ops/batch_norm.cc
+++ b/mace/ops/batch_norm.cc
@@ -10,6 +10,6 @@ REGISTER_CPU_OPERATOR(BatchNorm, BatchNormOp<DeviceType::CPU, float>);
 
 #if __ARM_NEON
 REGISTER_NEON_OPERATOR(BatchNorm, BatchNormOp<DeviceType::NEON, float>);
-#endif // __ARM_NEON
+#endif  // __ARM_NEON
 
-} //  namespace mace
\ No newline at end of file
+}  //  namespace mace
\ No newline at end of file
diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h
index e58886b0..a9b1f9f5 100644
--- a/mace/ops/batch_norm.h
+++ b/mace/ops/batch_norm.h
@@ -10,50 +10,55 @@
 
 namespace mace {
 
-template<DeviceType D, class T>
+template <DeviceType D, class T>
 class BatchNormOp : public Operator<D, T> {
-  public:
-    BatchNormOp(const OperatorDef &operator_def, Workspace *ws)
-            : Operator<D, T>(operator_def, ws),
-              functor_(OperatorBase::GetSingleArgument<float>("variance_epsilon", 1e-4)){}
-
-    bool Run() override {
-      const Tensor* input = this->Input(0);
-      const Tensor* scale = this->Input(1);
-      const Tensor* offset = this->Input(2);
-      const Tensor* mean = this->Input(3);
-      const Tensor* var = this->Input(4);
-
-      MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ", input->dim_size());
-      MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ", scale->dim_size());
-      MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. ", offset->dim_size());
-      MACE_CHECK(mean->dim_size() == 1, "mean must be 1-dimensional. ", mean->dim_size());
-      MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. ", var->dim_size());
-
-      Tensor* output = this->Output(0);
-      output->ResizeLike(input);
-
-      const index_t n = input->dim(0);
-      const index_t channel = input->dim(1);
-      const index_t sample_size = input->dim(2) * input->dim(3);
-
-      const T* input_ptr = input->data<T>();
-      const T* scale_ptr = scale->data<T>();
-      const T* offset_ptr = offset->data<T>();
-      const T* mean_ptr = mean->data<T>();
-      const T* var_ptr = var->data<T>();
-      T* output_ptr = output->mutable_data<T>();
-
-      functor_(input_ptr, scale_ptr, offset_ptr, mean_ptr, var_ptr,
-                                     n, channel, sample_size,
-                                     output_ptr);
-      return true;
-    }
-  private:
-    kernels::BatchNormFunctor<D, T> functor_;
+ public:
+  BatchNormOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<D, T>(operator_def, ws),
+        functor_(
+            OperatorBase::GetSingleArgument<float>("variance_epsilon", 1e-4)) {}
 
+  bool Run() override {
+    const Tensor* input = this->Input(0);
+    const Tensor* scale = this->Input(1);
+    const Tensor* offset = this->Input(2);
+    const Tensor* mean = this->Input(3);
+    const Tensor* var = this->Input(4);
+
+    MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ",
+               input->dim_size());
+    MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ",
+               scale->dim_size());
+    MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. ",
+               offset->dim_size());
+    MACE_CHECK(mean->dim_size() == 1, "mean must be 1-dimensional. ",
+               mean->dim_size());
+    MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. ",
+               var->dim_size());
+
+    Tensor* output = this->Output(0);
+    output->ResizeLike(input);
+
+    const index_t n = input->dim(0);
+    const index_t channel = input->dim(1);
+    const index_t sample_size = input->dim(2) * input->dim(3);
+
+    const T* input_ptr = input->data<T>();
+    const T* scale_ptr = scale->data<T>();
+    const T* offset_ptr = offset->data<T>();
+    const T* mean_ptr = mean->data<T>();
+    const T* var_ptr = var->data<T>();
+    T* output_ptr = output->mutable_data<T>();
+
+    functor_(input_ptr, scale_ptr, offset_ptr, mean_ptr, var_ptr, n, channel,
+             sample_size, output_ptr);
+    return true;
+  }
+
+ private:
+  kernels::BatchNormFunctor<D, T> functor_;
 };
 
-} //  namespace mace
+}  //  namespace mace
 
-#endif //  MACE_BATCH_NORM_H_
+#endif  //  MACE_BATCH_NORM_H_
diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc
index 789934fb..f9de40c5 100644
--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -8,19 +8,19 @@
 
 namespace mace {
 template <DeviceType D, typename T>
-static void BatchNorm(int iters, int batch, int channels, int height, int width) {
-
+static void BatchNorm(int iters, int batch, int channels, int height,
+                      int width) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
   OpDefBuilder("BatchNorm", "BatchNormBM")
-          .Input("Input")
-          .Input("Scale")
-          .Input("Offset")
-          .Input("Mean")
-          .Input("Var")
-          .Output("Output")
-          .Finalize(net.operator_def());
+      .Input("Input")
+      .Input("Scale")
+      .Input("Offset")
+      .Input("Mean")
+      .Input("Var")
+      .Output("Output")
+      .Finalize(net.operator_def());
 
   // Add input data
   net.AddRandomInput<T>("Input", {batch, channels, height, width});
@@ -35,23 +35,23 @@ static void BatchNorm(int iters, int batch, int channels, int height, int width)
   }
 
   mace::testing::StartTiming();
-  while(iters--) {
+  while (iters--) {
     net.RunOp(D);
   }
 }
 
-#define BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, DEVICE)                   \
-  static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(  \
-        int iters) {                                                    \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;    \
-    mace::testing::ItemsProcessed(tot);                                 \
-    mace::testing::BytesProcessed(tot * (sizeof(TYPE)));                \
-    BatchNorm<DEVICE, TYPE>(iters, N, C, H, W);                         \
-  }                                                                     \
+#define BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, DEVICE)                  \
+  static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
+      int iters) {                                                     \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;   \
+    mace::testing::ItemsProcessed(tot);                                \
+    mace::testing::BytesProcessed(tot*(sizeof(TYPE)));                 \
+    BatchNorm<DEVICE, TYPE>(iters, N, C, H, W);                        \
+  }                                                                    \
   BENCHMARK(BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
 
-#define BM_BATCH_NORM(N, C, H, W, TYPE)        \
-  BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, CPU);  \
+#define BM_BATCH_NORM(N, C, H, W, TYPE)       \
+  BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, CPU); \
   BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, NEON);
 
 BM_BATCH_NORM(1, 1, 512, 512, float);
@@ -65,4 +65,4 @@ BM_BATCH_NORM(1, 128, 256, 256, float);
 BM_BATCH_NORM(1, 128, 512, 512, float);
 BM_BATCH_NORM(32, 1, 256, 256, float);
 BM_BATCH_NORM(32, 3, 256, 256, float);
-} //  namespace mace
\ No newline at end of file
+}  //  namespace mace
\ No newline at end of file
diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc
index f4e07416..f963de21 100644
--- a/mace/ops/batch_norm_test.cc
+++ b/mace/ops/batch_norm_test.cc
@@ -13,17 +13,17 @@ TEST_F(BatchNormOpTest, SimpleCPU) {
   // Construct graph
   auto& net = test_net();
   OpDefBuilder("BatchNorm", "BatchNormTest")
-        .Input("Input")
-        .Input("Scale")
-        .Input("Offset")
-        .Input("Mean")
-        .Input("Var")
-        .Output("Output")
-        .Finalize(net.operator_def());
+      .Input("Input")
+      .Input("Scale")
+      .Input("Offset")
+      .Input("Mean")
+      .Input("Var")
+      .Output("Output")
+      .Finalize(net.operator_def());
 
   // Add input data
   net.AddInputFromArray<float>("Input", {1, 1, 6, 2},
-                    {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
+                               {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
   net.AddInputFromArray<float>("Scale", {1}, {4.0f});
   net.AddInputFromArray<float>("Offset", {1}, {2.0});
   net.AddInputFromArray<float>("Mean", {1}, {10});
@@ -33,8 +33,8 @@ TEST_F(BatchNormOpTest, SimpleCPU) {
   net.RunOp();
 
   // Check
-  auto expected = CreateTensor<float>({1, 1, 6, 2},
-                                        {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
+  auto expected =
+      CreateTensor<float>({1, 1, 6, 2}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
                                          3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.01);
@@ -51,13 +51,13 @@ TEST_F(BatchNormOpTest, SimpleNeon) {
   // Construct graph
   auto& net = test_net();
   OpDefBuilder("BatchNorm", "BatchNormTest")
-          .Input("Input")
-          .Input("Scale")
-          .Input("Offset")
-          .Input("Mean")
-          .Input("Var")
-          .Output("Output")
-          .Finalize(net.operator_def());
+      .Input("Input")
+      .Input("Scale")
+      .Input("Offset")
+      .Input("Mean")
+      .Input("Var")
+      .Output("Output")
+      .Finalize(net.operator_def());
 
   // Add input data
   net.AddRandomInput<float>("Input", {batch, channels, height, width});
@@ -77,5 +77,4 @@ TEST_F(BatchNormOpTest, SimpleNeon) {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
-
 }
diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc
index a236856b..33c60956 100644
--- a/mace/ops/conv_2d.cc
+++ b/mace/ops/conv_2d.cc
@@ -11,6 +11,6 @@ REGISTER_CPU_OPERATOR(Conv2d, Conv2dOp<DeviceType::CPU, float>);
 
 #if __ARM_NEON
 REGISTER_NEON_OPERATOR(Conv2d, Conv2dOp<DeviceType::NEON, float>);
-#endif // __ARM_NEON
+#endif  // __ARM_NEON
 
-} // namespace mace
+}  // namespace mace
diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h
index 6ae1e06f..ad3206b0 100644
--- a/mace/ops/conv_2d.h
+++ b/mace/ops/conv_2d.h
@@ -13,11 +13,11 @@
 
 namespace mace {
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 class Conv2dOp : public ConvPool2dOpBase<D, T> {
  public:
   Conv2dOp(const OperatorDef& op_def, Workspace* ws)
-    : ConvPool2dOpBase<D, T>(op_def, ws) {};
+      : ConvPool2dOpBase<D, T>(op_def, ws){};
 
   bool Run() override {
     const Tensor* input = this->Input(INPUT);
@@ -27,21 +27,16 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
 
     std::vector<index_t> output_shape(4);
     std::vector<int> paddings(2);
-    kernels::CalcPaddingAndOutputSize(input->shape().data(),
-                                      filter->shape().data(),
-                                      this->dilations_.data(),
-                                      this->strides_.data(),
-                                      this->padding_,
-                                      output_shape.data(),
-                                      paddings.data());
+    kernels::CalcPaddingAndOutputSize(
+        input->shape().data(), filter->shape().data(), this->dilations_.data(),
+        this->strides_.data(), this->padding_, output_shape.data(),
+        paddings.data());
     output->Resize(output_shape);
 
-    auto conv2d = kernels::Conv2dFunctor<D, T>(this->strides_.data(),
-                                               paddings.data(),
-                                               this->dilations_.data());
-    conv2d(input->data<T>(), input->shape().data(),
-           filter->data<T>(), filter->shape().data(),
-           bias->data<T>(), output->mutable_data<T>(),
+    auto conv2d = kernels::Conv2dFunctor<D, T>(
+        this->strides_.data(), paddings.data(), this->dilations_.data());
+    conv2d(input->data<T>(), input->shape().data(), filter->data<T>(),
+           filter->shape().data(), bias->data<T>(), output->mutable_data<T>(),
            output->shape().data());
 
     return true;
@@ -52,6 +47,6 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
   OP_OUTPUT_TAGS(OUTPUT);
 };
 
-} // namespace mace
+}  // namespace mace
 
-#endif // MACE_OPS_CONV_2D_H_
+#endif  // MACE_OPS_CONV_2D_H_
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 96843971..e26f7ac8 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -13,17 +13,17 @@ namespace mace {
 
 template <DeviceType D, typename T>
 static void Conv2d(int iters, int batch, int channels, int height, int width,
-                   int kernel_h, int kernel_w, int stride,
-                   Padding padding, int output_channels) {
+                   int kernel_h, int kernel_w, int stride, Padding padding,
+                   int output_channels) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
   OpDefBuilder("Conv2d", "Conv2dTest")
-        .Input("Input")
-        .Input("Filter")
-        .Input("Bias")
-        .Output("Output")
-        .Finalize(net.operator_def());
+      .Input("Input")
+      .Input("Filter")
+      .Input("Bias")
+      .Output("Output")
+      .Finalize(net.operator_def());
 
   // Add args
   net.AddIntsArg("strides", {stride, stride});
@@ -32,7 +32,8 @@ static void Conv2d(int iters, int batch, int channels, int height, int width,
 
   // Add input data
   net.AddRandomInput<float>("Input", {batch, channels, height, width});
-  net.AddRandomInput<float>("Filter", {output_channels, channels, kernel_h, kernel_w});
+  net.AddRandomInput<float>("Filter",
+                            {output_channels, channels, kernel_h, kernel_w});
   net.AddRandomInput<float>("Bias", {output_channels});
 
   // Warm-up
@@ -41,27 +42,30 @@ static void Conv2d(int iters, int batch, int channels, int height, int width,
   }
 
   mace::testing::StartTiming();
-  while(iters--) {
+  while (iters--) {
     net.RunOp(D);
   }
 }
 
-#define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, DEVICE) \
-  static void BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_OC##_##TYPE##_##DEVICE(  \
-        int iters) {                                                               \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;               \
-    mace::testing::ItemsProcessed(tot);                                            \
-    mace::testing::BytesProcessed(tot * (sizeof(TYPE)));                           \
-    Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, OC); \
-  }                                                                                \
-  BENCHMARK(BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_OC##_##TYPE##_##DEVICE)
+#define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, DEVICE)                        \
+  static void                                                                                    \
+      BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_OC##_##TYPE##_##DEVICE( \
+          int iters) {                                                                           \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;                             \
+    mace::testing::ItemsProcessed(tot);                                                          \
+    mace::testing::BytesProcessed(tot*(sizeof(TYPE)));                                           \
+    Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P,                    \
+                         OC);                                                                    \
+  }                                                                                              \
+  BENCHMARK(                                                                                     \
+      BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_OC##_##TYPE##_##DEVICE)
 
-#define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE)        \
-  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU);  \
+#define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE)       \
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, NEON);
 
 BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
-BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments
+BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float);  // Test bad alignments
 BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
@@ -71,4 +75,4 @@ BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
 BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
 
-} //  namespace mace
+}  //  namespace mace
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index 4dbc5d34..db6f2b48 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -2,8 +2,8 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
-#include "mace/core/operator.h"
 #include "mace/ops/conv_2d.h"
+#include "mace/core/operator.h"
 #include "mace/ops/ops_test_util.h"
 
 using namespace mace;
@@ -14,11 +14,11 @@ TEST_F(Conv2dOpTest, Simple_VALID) {
   // Construct graph
   auto& net = test_net();
   OpDefBuilder("Conv2d", "Conv2dTest")
-        .Input("Input")
-        .Input("Filter")
-        .Input("Bias")
-        .Output("Output")
-        .Finalize(net.operator_def());
+      .Input("Input")
+      .Input("Filter")
+      .Input("Bias")
+      .Output("Output")
+      .Finalize(net.operator_def());
 
   // Add args
   net.AddIntsArg("strides", {1, 1});
@@ -26,17 +26,13 @@ TEST_F(Conv2dOpTest, Simple_VALID) {
   net.AddIntsArg("dilations", {1, 1});
 
   // Add input data
-  net.AddInputFromArray<float>("Input", {1, 2, 3, 3},
-                    {1, 1, 1,
-                     1, 1, 1,
-                     1, 1, 1,
-                     1, 1, 1,
-                     1, 1, 1,
-                     1, 1, 1});
-  net.AddInputFromArray<float>("Filter", {1, 2, 3, 3},
-                           {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-                            1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-                            1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
+  net.AddInputFromArray<float>(
+      "Input", {1, 2, 3, 3},
+      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+  net.AddInputFromArray<float>(
+      "Filter", {1, 2, 3, 3},
+      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
   net.AddInputFromArray<float>("Bias", {1}, {0.1f});
 
   // Run
@@ -52,11 +48,11 @@ TEST_F(Conv2dOpTest, Simple_SAME) {
   // Construct graph
   auto& net = test_net();
   OpDefBuilder("Conv2d", "Conv2dTest")
-        .Input("Input")
-        .Input("Filter")
-        .Input("Bias")
-        .Output("Output")
-        .Finalize(net.operator_def());
+      .Input("Input")
+      .Input("Filter")
+      .Input("Bias")
+      .Output("Output")
+      .Finalize(net.operator_def());
 
   // Add args
   net.AddIntsArg("strides", {1, 1});
@@ -64,27 +60,22 @@ TEST_F(Conv2dOpTest, Simple_SAME) {
   net.AddIntsArg("dilations", {1, 1});
 
   // Add input data
-  net.AddInputFromArray<float>("Input", {1, 2, 3, 3},
-                    {1, 1, 1,
-                     1, 1, 1,
-                     1, 1, 1,
-                     1, 1, 1,
-                     1, 1, 1,
-                     1, 1, 1});
-  net.AddInputFromArray<float>("Filter", {1, 2, 3, 3},
-                           {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-                            1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-                            1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
+  net.AddInputFromArray<float>(
+      "Input", {1, 2, 3, 3},
+      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+  net.AddInputFromArray<float>(
+      "Filter", {1, 2, 3, 3},
+      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
   net.AddInputFromArray<float>("Bias", {1}, {0.1f});
 
   // Run
   net.RunOp();
 
   // Check
-  auto expected = CreateTensor<float>({1, 1, 3, 3},
-                                        { 8.1f, 12.1f,  8.1f,
-                                         12.1f, 18.1f, 12.1f,
-                                          8.1f, 12.1f,  8.1f});
+  auto expected = CreateTensor<float>(
+      {1, 1, 3, 3},
+      {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -93,11 +84,11 @@ TEST_F(Conv2dOpTest, Combined) {
   // Construct graph
   auto& net = test_net();
   OpDefBuilder("Conv2d", "Conv2dTest")
-        .Input("Input")
-        .Input("Filter")
-        .Input("Bias")
-        .Output("Output")
-        .Finalize(net.operator_def());
+      .Input("Input")
+      .Input("Filter")
+      .Input("Bias")
+      .Output("Output")
+      .Finalize(net.operator_def());
 
   // Add args
   net.AddIntsArg("strides", {2, 2});
@@ -105,36 +96,24 @@ TEST_F(Conv2dOpTest, Combined) {
   net.AddIntsArg("dilations", {1, 1});
 
   // Add input data
-  net.AddInputFromArray<float>("Input", {1, 2, 5, 5},
-                    {1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1});
-  net.AddInputFromArray<float>("Filter", {2, 2, 3, 3},
-                           {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-                            1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-                            0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
-                            0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f});
+  net.AddInputFromArray<float>(
+      "Input", {1, 2, 5, 5}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+  net.AddInputFromArray<float>(
+      "Filter", {2, 2, 3, 3},
+      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+       0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f});
   net.AddInputFromArray<float>("Bias", {2}, {0.1f, 0.2f});
 
   // Run
   net.RunOp();
 
   // Check
-  auto expected = CreateTensor<float>({1, 2, 3, 3},
-                                        { 8.1f, 12.1f,  8.1f,
-                                         12.1f, 18.1f, 12.1f,
-                                          8.1f, 12.1f,  8.1f,
-                                          4.2f, 6.2f, 4.2f,
-                                          6.2f, 9.2f, 6.2f,
-                                          4.2f, 6.2f, 4.2f});
-
+  auto expected = CreateTensor<float>(
+      {1, 2, 3, 3}, {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f,
+                     4.2f, 6.2f, 4.2f, 6.2f, 9.2f, 6.2f, 4.2f, 6.2f, 4.2f});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -143,11 +122,11 @@ TEST_F(Conv2dOpTest, Conv1x1) {
   // Construct graph
   auto& net = test_net();
   OpDefBuilder("Conv2d", "Conv2dTest")
-        .Input("Input")
-        .Input("Filter")
-        .Input("Bias")
-        .Output("Output")
-        .Finalize(net.operator_def());
+      .Input("Input")
+      .Input("Filter")
+      .Input("Bias")
+      .Output("Output")
+      .Finalize(net.operator_def());
 
   // Add args
   net.AddIntsArg("strides", {1, 1});
@@ -155,38 +134,32 @@ TEST_F(Conv2dOpTest, Conv1x1) {
   net.AddIntsArg("dilations", {1, 1});
 
   // Add input data
-  net.AddInputFromArray<float>("Input", {1, 5, 3, 10},
-                    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<float>("Filter", {2, 5, 1, 1},
-                           {1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-                            2.0f, 2.0f, 2.0f, 2.0f, 2.0f});
+  net.AddInputFromArray<float>(
+      "Input", {1, 5, 3, 10},
+      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+  net.AddInputFromArray<float>(
+      "Filter", {2, 5, 1, 1},
+      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f});
   net.AddInputFromArray<float>("Bias", {2}, {0.1f, 0.2f});
 
   // Run
   net.RunOp();
 
   // Check
-  auto expected = CreateTensor<float>({1, 2, 3, 10},
-                                        {5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f,
-                                         5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f,
-                                         5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f,
-                                         10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f,
-                                         10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f,
-                                         10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f});
+  auto expected = CreateTensor<float>(
+      {1, 2, 3, 10},
+      {5.1f,  5.1f,  5.1f,  5.1f,  5.1f,  5.1f,  5.1f,  5.1f,  5.1f,  5.1f,
+       5.1f,  5.1f,  5.1f,  5.1f,  5.1f,  5.1f,  5.1f,  5.1f,  5.1f,  5.1f,
+       5.1f,  5.1f,  5.1f,  5.1f,  5.1f,  5.1f,  5.1f,  5.1f,  5.1f,  5.1f,
+       10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f,
+       10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f,
+       10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -194,8 +167,7 @@ TEST_F(Conv2dOpTest, Conv1x1) {
 // TODO we need more tests
 TEST_F(Conv2dOpTest, ConvNxNS12) {
   testing::internal::LogToStderr();
-  auto func = [&](int kernel_h, int kernel_w,
-                  int stride_h, int stride_w,
+  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                   Padding type) {
     srand(time(NULL));
 
@@ -206,7 +178,7 @@ TEST_F(Conv2dOpTest, ConvNxNS12) {
     index_t width = 7 + rand() % 100;
     index_t output_channels = 1 + rand() % 50;
     // Construct graph
-    auto &net = test_net();
+    auto& net = test_net();
     OpDefBuilder("Conv2d", "Conv2dTest")
         .Input("Input")
         .Input("Filter")
@@ -221,8 +193,8 @@ TEST_F(Conv2dOpTest, ConvNxNS12) {
 
     // Add input data
     net.AddRandomInput<float>("Input", {batch, input_channels, height, width});
-    net.AddRandomInput<float>("Filter", {output_channels, input_channels,
-                                         kernel_h, kernel_w});
+    net.AddRandomInput<float>(
+        "Filter", {output_channels, input_channels, kernel_h, kernel_w});
     net.AddRandomInput<float>("Bias", {output_channels});
     // run cpu
     net.RunOp();
diff --git a/mace/ops/conv_pool_2d_base.h b/mace/ops/conv_pool_2d_base.h
index a84e4152..a572b71e 100644
--- a/mace/ops/conv_pool_2d_base.h
+++ b/mace/ops/conv_pool_2d_base.h
@@ -10,16 +10,15 @@
 
 namespace mace {
 
-template<DeviceType D, class T>
+template <DeviceType D, class T>
 class ConvPool2dOpBase : public Operator<D, T> {
  public:
   ConvPool2dOpBase(const OperatorDef& op_def, Workspace* ws)
-    : Operator<D, T>(op_def, ws),
-    strides_(OperatorBase::GetRepeatedArgument<int>("strides")),
-    padding_(static_cast<Padding>(
-          OperatorBase::GetSingleArgument<int>("padding",
-                                               static_cast<int>(SAME)))),
-    dilations_(OperatorBase::GetRepeatedArgument<int>("dilations")) {}
+      : Operator<D, T>(op_def, ws),
+        strides_(OperatorBase::GetRepeatedArgument<int>("strides")),
+        padding_(static_cast<Padding>(OperatorBase::GetSingleArgument<int>(
+            "padding", static_cast<int>(SAME)))),
+        dilations_(OperatorBase::GetRepeatedArgument<int>("dilations")) {}
 
  protected:
   std::vector<int> strides_;
@@ -27,6 +26,6 @@ class ConvPool2dOpBase : public Operator<D, T> {
   std::vector<int> dilations_;
 };
 
-} // namespace mace
+}  // namespace mace
 
-#endif // MACE_OPS_CONV_POOL_2D_BASE_H_
+#endif  // MACE_OPS_CONV_POOL_2D_BASE_H_
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index d2b9a2c1..0315a71e 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -43,31 +43,33 @@ class OpsTestNet {
  public:
   OpsTestNet() {}
 
-  template<typename T>
-  void AddInputFromArray(const char *name,
-                         const std::vector<index_t> &shape,
+  template <typename T>
+  void AddInputFromArray(const char *name, const std::vector<index_t> &shape,
                          const std::vector<T> &data) {
-    Tensor *input = ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
+    Tensor *input =
+        ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
     input->Resize(shape);
     T *input_data = input->mutable_data<T>();
     MACE_CHECK(input->size() == data.size());
     memcpy(input_data, data.data(), data.size() * sizeof(T));
   }
 
-  template<typename T>
-  void AddRepeatedInput(const char *name,
-                         const std::vector<index_t> &shape,
-                         const T data) {
-    Tensor *input = ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
+  template <typename T>
+  void AddRepeatedInput(const char *name, const std::vector<index_t> &shape,
+                        const T data) {
+    Tensor *input =
+        ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
     input->Resize(shape);
     T *input_data = input->mutable_data<T>();
     MACE_CHECK(input->size() == data.size());
     std::fill(input_data, input_data + input->size(), data);
   }
 
-  template<typename T>
-  void AddRandomInput(const char *name, const std::vector<index_t> &shape, bool positive = false) {
-    Tensor *input = ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
+  template <typename T>
+  void AddRandomInput(const char *name, const std::vector<index_t> &shape,
+                      bool positive = false) {
+    Tensor *input =
+        ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
     input->Resize(shape);
     float *input_data = input->mutable_data<T>();
 
@@ -76,12 +78,16 @@ class OpsTestNet {
     std::normal_distribution<T> nd(0, 1);
 
     std::generate(input_data, input_data + input->size(),
-                  [&gen, &nd, positive] { return positive ? std::abs(nd(gen)) : nd(gen); });
+                  [&gen, &nd, positive] {
+                    return positive ? std::abs(nd(gen)) : nd(gen);
+                  });
   }
 
-  template<typename T>
-  void AddFixedInput(const char *name, const std::vector<index_t> &shape, T value) {
-    Tensor *input = ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
+  template <typename T>
+  void AddFixedInput(const char *name, const std::vector<index_t> &shape,
+                     T value) {
+    Tensor *input =
+        ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
     input->Resize(shape);
     float *input_data = input->mutable_data<T>();
 
@@ -122,7 +128,8 @@ class OpsTestNet {
     }
   }
 
-  void AddStringsArg(const char *name, const std::vector<const char *> &values) {
+  void AddStringsArg(const char *name,
+                     const std::vector<const char *> &values) {
     auto arg = op_def_.add_arg();
     arg->set_name(name);
     for (auto value : values) {
@@ -145,9 +152,7 @@ class OpsTestNet {
     return net_->Run();
   }
 
-  bool RunOp() {
-    return RunOp(DeviceType::CPU);
-  }
+  bool RunOp() { return RunOp(DeviceType::CPU); }
 
   Tensor *GetOutput(const char *output_name) {
     return ws_.GetTensor(output_name);
@@ -177,8 +182,9 @@ class OpsTestBase : public ::testing::Test {
   OpsTestNet test_net_;
 };
 
-template<typename T>
-unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape, const std::vector<T> &data) {
+template <typename T>
+unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
+                                const std::vector<T> &data) {
   unique_ptr<Tensor> res(new Tensor(cpu_allocator(), DataTypeToEnum<T>::v()));
   res->Resize(shape);
   T *input_data = res->mutable_data<T>();
@@ -209,40 +215,38 @@ inline std::string ShapeToString(const Tensor &x) {
   return std::string(stream.str());
 }
 
-
-template<typename T>
+template <typename T>
 struct is_floating_point_type {
-  static const bool value = std::is_same<T, float>::value ||
-                            std::is_same<T, double>::value;
+  static const bool value =
+      std::is_same<T, float>::value || std::is_same<T, double>::value;
 };
 
-template<typename T>
+template <typename T>
 inline void ExpectEqual(const T &a, const T &b) {
   EXPECT_EQ(a, b);
 }
 
-template<>
+template <>
 inline void ExpectEqual<float>(const float &a, const float &b) {
   EXPECT_FLOAT_EQ(a, b);
 }
 
-template<>
+template <>
 inline void ExpectEqual<double>(const double &a, const double &b) {
   EXPECT_DOUBLE_EQ(a, b);
 }
 
 inline void AssertSameTypeDims(const Tensor &x, const Tensor &y) {
   ASSERT_EQ(x.dtype(), y.dtype());
-  ASSERT_TRUE(IsSameSize(x, y))
-                        << "x.shape [" << ShapeToString(x) << "] vs "
-                        << "y.shape [ " << ShapeToString(y) << "]";
+  ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs "
+                                << "y.shape [ " << ShapeToString(y) << "]";
 }
 
-template<typename T, bool is_fp = is_floating_point_type<T>::value>
+template <typename T, bool is_fp = is_floating_point_type<T>::value>
 struct Expector;
 
 // Partial specialization for float and double.
-template<typename T>
+template <typename T>
 struct Expector<T, true> {
   static void Equal(const T &a, const T &b) { ExpectEqual(a, b); }
 
@@ -262,18 +266,19 @@ struct Expector<T, true> {
     auto a = x.data<T>();
     auto b = y.data<T>();
     for (int i = 0; i < x.size(); ++i) {
-      EXPECT_NEAR(a[i], b[i], abs_err)
-                    << "a = " << a << " b = " << b << " index = " << i;
+      EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b
+                                       << " index = " << i;
     }
   }
 };
 
-template<typename T>
+template <typename T>
 void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
-  static_assert(is_floating_point_type<T>::value, "T is not a floating point type");
+  static_assert(is_floating_point_type<T>::value,
+                "T is not a floating point type");
   Expector<T>::Near(x, y, abs_err);
 }
 
-} // namespace mace
+}  // namespace mace
 
-#endif //  MACE_OPS_TEST_UTIL_H_
+#endif  //  MACE_OPS_TEST_UTIL_H_
diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc
index cab59685..4b972647 100644
--- a/mace/ops/pooling.cc
+++ b/mace/ops/pooling.cc
@@ -2,7 +2,6 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
-
 #include "mace/ops/pooling.h"
 
 namespace mace {
@@ -11,6 +10,6 @@ REGISTER_CPU_OPERATOR(Pooling, PoolingOp<DeviceType::CPU, float>);
 
 #if __ARM_NEON
 REGISTER_NEON_OPERATOR(Pooling, PoolingOp<DeviceType::NEON, float>);
-#endif // __ARM_NEON
+#endif  // __ARM_NEON
 
-} //  namespace mace
+}  //  namespace mace
diff --git a/mace/ops/pooling.h b/mace/ops/pooling.h
index 4d0001df..597a4724 100644
--- a/mace/ops/pooling.h
+++ b/mace/ops/pooling.h
@@ -11,17 +11,17 @@
 
 namespace mace {
 
-template<DeviceType D, class T>
+template <DeviceType D, class T>
 class PoolingOp : public ConvPool2dOpBase<D, T> {
-public:
+ public:
   PoolingOp(const OperatorDef& op_def, Workspace* ws)
-  : ConvPool2dOpBase<D, T>(op_def, ws),
-    kernels_(OperatorBase::GetRepeatedArgument<int>("kernels")),
-    pooling_type_(static_cast<PoolingType>(
-                  OperatorBase::GetSingleArgument<int>(
-                  "pooling_type", static_cast<int>(AVG)))) {};
+      : ConvPool2dOpBase<D, T>(op_def, ws),
+        kernels_(OperatorBase::GetRepeatedArgument<int>("kernels")),
+        pooling_type_(
+            static_cast<PoolingType>(OperatorBase::GetSingleArgument<int>(
+                "pooling_type", static_cast<int>(AVG)))){};
 
-  bool Run() override{
+  bool Run() override {
     const Tensor* input = this->Input(INPUT);
     Tensor* output = this->Output(OUTPUT);
     std::vector<index_t> in_shape = input->shape();
@@ -33,28 +33,21 @@ public:
     filter_shape[1] = in_shape[0];
     filter_shape[2] = kernels_[0];
     filter_shape[3] = kernels_[1];
-    kernels::CalcPaddingAndOutputSize(in_shape.data(),
-                                      filter_shape.data(),
+    kernels::CalcPaddingAndOutputSize(in_shape.data(), filter_shape.data(),
                                       this->dilations_.data(),
-                                      this->strides_.data(),
-                                      this->padding_,
-                                      output_shape.data(),
-                                      paddings.data());
+                                      this->strides_.data(), this->padding_,
+                                      output_shape.data(), paddings.data());
     output->Resize(output_shape);
 
-    auto pooling_func = kernels::PoolingFunctor<D, T>(pooling_type_,
-                                                      kernels_.data(),
-                                                      this->strides_.data(),
-                                                      paddings.data(),
-                                                      this->dilations_.data());
-    pooling_func(input->data<float>(),
-                 in_shape.data(),
-                 output->mutable_data<float>(),
-                 output->shape().data());
+    auto pooling_func = kernels::PoolingFunctor<D, T>(
+        pooling_type_, kernels_.data(), this->strides_.data(), paddings.data(),
+        this->dilations_.data());
+    pooling_func(input->data<float>(), in_shape.data(),
+                 output->mutable_data<float>(), output->shape().data());
     return true;
   };
 
-protected:
+ protected:
   std::vector<int> kernels_;
   PoolingType pooling_type_;
 
@@ -62,6 +55,6 @@ protected:
   OP_OUTPUT_TAGS(OUTPUT);
 };
 
-} // namespace mace
+}  // namespace mace
 
-#endif //MACE_OPS_POOLING_H_
+#endif  // MACE_OPS_POOLING_H_
diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc
index ccdcb206..aa2ae140 100644
--- a/mace/ops/pooling_benchmark.cc
+++ b/mace/ops/pooling_benchmark.cc
@@ -2,20 +2,19 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
-#include "mace/core/testing/test_benchmark.h"
-#include "mace/core/operator.h"
 #include "mace/kernels/pooling.h"
+#include "mace/core/operator.h"
+#include "mace/core/testing/test_benchmark.h"
 #include "mace/kernels/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
 
 using namespace mace;
 using namespace mace::kernels;
 
-template<DeviceType D>
-static void Pooling(int iters, int batch, int channels, int height,
-                    int width, int kernel, int stride, Padding padding,
+template <DeviceType D>
+static void Pooling(int iters, int batch, int channels, int height, int width,
+                    int kernel, int stride, Padding padding,
                     PoolingType pooling_type) {
-
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -45,18 +44,21 @@ static void Pooling(int iters, int batch, int channels, int height,
   }
 }
 
-#define BM_POOLING_MACRO(N, C, H, W, KE, STRIDE, PA, PO, DEVICE)                  \
-  static void BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_##DEVICE(  \
-        int iters) {                                                    \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;                        \
-    mace::testing::ItemsProcessed(tot);                                 \
-    mace::testing::BytesProcessed(tot * (sizeof(float)));\
-    Pooling<DEVICE>(iters, N, C, H, W, KE, STRIDE, Padding::PA, PoolingType::PO);                         \
-  }                                                                     \
-  BENCHMARK(BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_##DEVICE)
+#define BM_POOLING_MACRO(N, C, H, W, KE, STRIDE, PA, PO, DEVICE)                    \
+  static void                                                                       \
+      BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_##DEVICE( \
+          int iters) {                                                              \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;                \
+    mace::testing::ItemsProcessed(tot);                                             \
+    mace::testing::BytesProcessed(tot*(sizeof(float)));                             \
+    Pooling<DEVICE>(iters, N, C, H, W, KE, STRIDE, Padding::PA,                     \
+                    PoolingType::PO);                                               \
+  }                                                                                 \
+  BENCHMARK(                                                                        \
+      BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_##DEVICE)
 
-#define BM_POOLING(N, C, H, W, K, S, PA, PO)                    \
-  BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU);  \
+#define BM_POOLING(N, C, H, W, K, S, PA, PO)       \
+  BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); \
   BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, NEON);
 
 BM_POOLING(1, 3, 129, 129, 2, 2, SAME, MAX);
diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc
index 7ff8e351..7ca43f19 100644
--- a/mace/ops/pooling_test.cc
+++ b/mace/ops/pooling_test.cc
@@ -5,9 +5,9 @@
 #include "gtest/gtest.h"
 
 #include "mace/core/operator.h"
-#include "mace/ops/ops_test_util.h"
-#include "mace/ops/conv_pool_2d_base.h"
 #include "mace/kernels/pooling.h"
+#include "mace/ops/conv_pool_2d_base.h"
+#include "mace/ops/ops_test_util.h"
 
 using namespace mace;
 
@@ -17,9 +17,9 @@ TEST_F(PoolingOpTest, MAX_VALID) {
   // Construct graph
   auto& net = test_net();
   OpDefBuilder("Pooling", "PoolingTest")
-        .Input("Input")
-        .Output("Output")
-        .Finalize(net.operator_def());
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.operator_def());
 
   // Add args
   net.AddIntsArg("kernels", {2, 2});
@@ -29,34 +29,28 @@ TEST_F(PoolingOpTest, MAX_VALID) {
   net.AddIntArg("pooling_type", PoolingType::MAX);
 
   // Add input data
-  net.AddInputFromArray<float>("Input", {1, 2, 4, 4},
-                          {0, 1, 2, 3,
-                           4, 5, 6, 7,
-                           8, 9, 10, 11,
-                           12, 13, 14, 15,
-                           16, 17, 18, 19,
-                           20, 21, 22, 23,
-                           24, 25, 26, 27,
-                           28, 29, 30, 31});
+  net.AddInputFromArray<float>(
+      "Input", {1, 2, 4, 4},
+      {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
 
   // Run
   net.RunOp();
 
   // Check
-  auto expected = CreateTensor<float>({1, 2, 2, 2},
-                                        {5, 7, 13, 15, 21, 23, 29, 31});
+  auto expected =
+      CreateTensor<float>({1, 2, 2, 2}, {5, 7, 13, 15, 21, 23, 29, 31});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
 
-
 TEST_F(PoolingOpTest, AVG_VALID) {
   // Construct graph
   auto& net = test_net();
   OpDefBuilder("Pooling", "PoolingTest")
-        .Input("Input")
-        .Output("Output")
-        .Finalize(net.operator_def());
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.operator_def());
 
   // Add args
   net.AddIntsArg("kernels", {2, 2});
@@ -66,22 +60,17 @@ TEST_F(PoolingOpTest, AVG_VALID) {
   net.AddIntArg("pooling_type", PoolingType::AVG);
 
   // Add input data
-  net.AddInputFromArray<float>("Input", {1, 2, 4, 4},
-                          {0, 1, 2, 3,
-                           4, 5, 6, 7,
-                           8, 9, 10, 11,
-                           12, 13, 14, 15,
-                           16, 17, 18, 19,
-                           20, 21, 22, 23,
-                           24, 25, 26, 27,
-                           28, 29, 30, 31});
+  net.AddInputFromArray<float>(
+      "Input", {1, 2, 4, 4},
+      {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
 
   // Run
   net.RunOp();
 
   // Check
-  auto expected = CreateTensor<float>({1, 2, 2, 2},
-                                        {2.5, 4.5, 10.5, 12.5, 18.5, 20.5, 26.5, 28.5});
+  auto expected = CreateTensor<float>(
+      {1, 2, 2, 2}, {2.5, 4.5, 10.5, 12.5, 18.5, 20.5, 26.5, 28.5});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -90,9 +79,9 @@ TEST_F(PoolingOpTest, MAX_SAME) {
   // Construct graph
   auto& net = test_net();
   OpDefBuilder("Pooling", "PoolingTest")
-        .Input("Input")
-        .Output("Output")
-        .Finalize(net.operator_def());
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.operator_def());
 
   // Add args
   net.AddIntsArg("kernels", {2, 2});
@@ -103,16 +92,13 @@ TEST_F(PoolingOpTest, MAX_SAME) {
 
   // Add input data
   net.AddInputFromArray<float>("Input", {1, 1, 3, 3},
-                          {0, 1, 2, 
-                           3, 4, 5, 
-                           6, 7, 8});
+                               {0, 1, 2, 3, 4, 5, 6, 7, 8});
 
   // Run
   net.RunOp();
 
   // Check
-  auto expected = CreateTensor<float>({1, 1, 2, 2},
-                                        {4, 5, 7, 8});
+  auto expected = CreateTensor<float>({1, 1, 2, 2}, {4, 5, 7, 8});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -121,9 +107,9 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
   // Construct graph
   auto& net = test_net();
   OpDefBuilder("Pooling", "PoolingTest")
-        .Input("Input")
-        .Output("Output")
-        .Finalize(net.operator_def());
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.operator_def());
 
   // Add args
   net.AddIntsArg("kernels", {2, 2});
@@ -133,18 +119,15 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
   net.AddIntArg("pooling_type", PoolingType::MAX);
 
   // Add input data
-  net.AddInputFromArray<float>("Input", {1, 1, 4, 4},
-                          {0, 1, 2, 3,
-                           4, 5, 6, 7,
-                           8, 9, 10, 11,
-                           12, 13, 14, 15});
+  net.AddInputFromArray<float>(
+      "Input", {1, 1, 4, 4},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
 
   // Run
   net.RunOp();
 
   // Check
-  auto expected = CreateTensor<float>({1, 1, 2, 2},
-                                        {10, 11, 14, 15});
+  auto expected = CreateTensor<float>({1, 1, 2, 2}, {10, 11, 14, 15});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -153,9 +136,9 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
   // Construct graph
   auto& net = test_net();
   OpDefBuilder("Pooling", "PoolingTest")
-        .Input("Input")
-        .Output("Output")
-        .Finalize(net.operator_def());
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.operator_def());
 
   // Add args
   net.AddIntArg("pooling_type", PoolingType::MAX);
@@ -165,18 +148,14 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
   net.AddIntsArg("dilations", {1, 1});
 
   // Add input data
-  net.AddInputFromArray<float>("Input", {1, 1, 4, 5},
-                                {0, 1, 2, 3, 4,
-                                 5, 6, 7, 8, 9,
-                                 10, 11, 12, 13, 14,
-                                 15, 16, 17, 18, 19});
+  net.AddInputFromArray<float>(
+      "Input", {1, 1, 4, 5},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19});
   // Run
   net.RunOp(DeviceType::NEON);
 
   // Check
-  Tensor expected = CreateTensor<float>({1, 1, 2, 3},
-                                        {6, 8, 9,
-                                         16, 18, 19});
+  Tensor expected = CreateTensor<float>({1, 1, 2, 3}, {6, 8, 9, 16, 18, 19});
 
   ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
 }
@@ -185,9 +164,9 @@ TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
   // Construct graph
   auto& net = test_net();
   OpDefBuilder("Pooling", "PoolingTest")
-        .Input("Input")
-        .Output("Output")
-        .Finalize(net.operator_def());
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.operator_def());
 
   // Add args
   net.AddIntArg("pooling_type", PoolingType::MAX);
@@ -197,18 +176,14 @@ TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
   net.AddIntsArg("dilations", {1, 1});
 
   // Add input data
-  net.AddInputFromArray<float>("Input", {1, 1, 4, 5},
-                                {0, 1, 2, 3, 4,
-                                 5, 6, 7, 8, 9,
-                                 10, 11, 12, 13, 14,
-                                 15, 16, 17, 18, 19});
+  net.AddInputFromArray<float>(
+      "Input", {1, 1, 4, 5},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19});
   // Run
   net.RunOp(DeviceType::NEON);
 
   // Check
-  Tensor expected = CreateTensor<float>({1, 1, 2, 3},
-                                        {11, 13, 14,
-                                         16, 18, 19});
+  Tensor expected = CreateTensor<float>({1, 1, 2, 3}, {11, 13, 14, 16, 18, 19});
 
   ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
 }
diff --git a/mace/ops/relu.cc b/mace/ops/relu.cc
index c2193080..8602f932 100644
--- a/mace/ops/relu.cc
+++ b/mace/ops/relu.cc
@@ -10,6 +10,6 @@ REGISTER_CPU_OPERATOR(Relu, ReluOp<DeviceType::CPU, float>);
 
 #if __ARM_NEON
 REGISTER_NEON_OPERATOR(Relu, ReluOp<DeviceType::NEON, float>);
-#endif // __ARM_NEON
+#endif  // __ARM_NEON
 
-} //  namespace mace
+}  //  namespace mace
diff --git a/mace/ops/relu.h b/mace/ops/relu.h
index 166c7733..c195c78f 100644
--- a/mace/ops/relu.h
+++ b/mace/ops/relu.h
@@ -10,10 +10,10 @@
 
 namespace mace {
 
-template<DeviceType D, class T>
+template <DeviceType D, class T>
 class ReluOp : public Operator<D, T> {
  public:
-  ReluOp(const OperatorDef &operator_def, Workspace *ws)
+  ReluOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<D, T>(operator_def, ws) {}
   bool Run() override {
     const Tensor* input_tensor = this->inputs_[0];
@@ -31,6 +31,6 @@ class ReluOp : public Operator<D, T> {
   kernels::ReluFunctor<D, T> functor_;
 };
 
-} // namespace mace
+}  // namespace mace
 
-#endif // MACE_OPS_RELU_H_
+#endif  // MACE_OPS_RELU_H_
diff --git a/mace/ops/relu_benchmark.cc b/mace/ops/relu_benchmark.cc
index 371c7eca..4605990e 100644
--- a/mace/ops/relu_benchmark.cc
+++ b/mace/ops/relu_benchmark.cc
@@ -10,7 +10,6 @@
 namespace mace {
 template <DeviceType D, typename T>
 static void ReluBenchmark(int iters, int size) {
-
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -28,26 +27,25 @@ static void ReluBenchmark(int iters, int size) {
   }
 
   mace::testing::StartTiming();
-  while(iters--) {
+  while (iters--) {
     net.RunOp(D);
   }
 }
 
-#define BM_RELU_MACRO(SIZE, TYPE, DEVICE)                        \
-  static void BM_RELU_##SIZE##_##TYPE##_##DEVICE(                \
-        int iters) {                                             \
-    const int64_t tot = static_cast<int64_t>(iters) * SIZE;      \
-    mace::testing::ItemsProcessed(tot);                          \
-    mace::testing::BytesProcessed(tot * (sizeof(TYPE)));         \
-    ReluBenchmark<DEVICE, TYPE>(iters, SIZE);                    \
-  }                                                              \
+#define BM_RELU_MACRO(SIZE, TYPE, DEVICE)                     \
+  static void BM_RELU_##SIZE##_##TYPE##_##DEVICE(int iters) { \
+    const int64_t tot = static_cast<int64_t>(iters) * SIZE;   \
+    mace::testing::ItemsProcessed(tot);                       \
+    mace::testing::BytesProcessed(tot * (sizeof(TYPE)));      \
+    ReluBenchmark<DEVICE, TYPE>(iters, SIZE);                 \
+  }                                                           \
   BENCHMARK(BM_RELU_##SIZE##_##TYPE##_##DEVICE)
 
-#define BM_RELU(SIZE, TYPE)        \
-  BM_RELU_MACRO(SIZE, TYPE, CPU);  \
+#define BM_RELU(SIZE, TYPE)       \
+  BM_RELU_MACRO(SIZE, TYPE, CPU); \
   BM_RELU_MACRO(SIZE, TYPE, NEON);
 
 BM_RELU(1000, float);
 BM_RELU(100000, float);
 BM_RELU(10000000, float);
-} //  namespace mace
\ No newline at end of file
+}  //  namespace mace
diff --git a/mace/ops/relu_test.cc b/mace/ops/relu_test.cc
index 6ca8f6e3..1277722c 100644
--- a/mace/ops/relu_test.cc
+++ b/mace/ops/relu_test.cc
@@ -32,4 +32,4 @@ TEST_F(ReluOpTest, ReluOp) {
   ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.01);
 }
 
-} // namespace mace
+}  // namespace mace
diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc
index 8e931cc9..a20c9f13 100644
--- a/mace/ops/resize_bilinear.cc
+++ b/mace/ops/resize_bilinear.cc
@@ -9,7 +9,8 @@ namespace mace {
 REGISTER_CPU_OPERATOR(ResizeBilinear, ResizeBilinearOp<DeviceType::CPU, float>);
 
 #if __ARM_NEON
-REGISTER_NEON_OPERATOR(ResizeBilinear, ResizeBilinearOp<DeviceType::NEON, float>);
-#endif // __ARM_NEON
+REGISTER_NEON_OPERATOR(ResizeBilinear,
+                       ResizeBilinearOp<DeviceType::NEON, float>);
+#endif  // __ARM_NEON
 
-} //  namespace mace
+}  //  namespace mace
diff --git a/mace/ops/resize_bilinear.h b/mace/ops/resize_bilinear.h
index 2d1b6f59..8daa3176 100644
--- a/mace/ops/resize_bilinear.h
+++ b/mace/ops/resize_bilinear.h
@@ -5,18 +5,18 @@
 #ifndef MACE_RESIZE_BILINEAR_H
 #define MACE_RESIZE_BILINEAR_H
 
-
 #include "mace/core/operator.h"
 #include "mace/kernels/resize_bilinear.h"
 
 namespace mace {
 
-template<DeviceType D, class T>
+template <DeviceType D, class T>
 class ResizeBilinearOp : public Operator<D, T> {
  public:
-  ResizeBilinearOp(const OperatorDef &operator_def, Workspace *ws)
+  ResizeBilinearOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<D, T>(operator_def, ws),
-        functor_(OperatorBase::GetSingleArgument<bool>("align_corners", false)) {}
+        functor_(
+            OperatorBase::GetSingleArgument<bool>("align_corners", false)) {}
 
   bool Run() override {
     const Tensor* input = this->Input(0);
@@ -24,8 +24,8 @@ class ResizeBilinearOp : public Operator<D, T> {
 
     MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.",
                input->dim_size());
-    MACE_CHECK(resize_dims->dim_size() == 1, "resize dim must be 2-dimensional.",
-               resize_dims->dim_size());
+    MACE_CHECK(resize_dims->dim_size() == 1,
+               "resize dim must be 1-dimensional.", resize_dims->dim_size());
 
     Tensor* output = this->Output(0);
 
@@ -35,7 +35,7 @@ class ResizeBilinearOp : public Operator<D, T> {
     index_t in_width = input->dim(3);
     index_t out_height = resize_dims->data<index_t>()[0];
     index_t out_width = resize_dims->data<index_t>()[1];
-    vector<index_t> out_shape {n, channels, out_height, out_width};
+    vector<index_t> out_shape{n, channels, out_height, out_width};
     output->Resize(out_shape);
 
     const T* input_ptr = input->data<T>();
@@ -45,10 +45,11 @@ class ResizeBilinearOp : public Operator<D, T> {
              out_height, out_width);
     return true;
   }
+
  private:
   kernels::ResizeBilinearFunctor<D, T> functor_;
 };
 
-} //  namespace mace
+}  //  namespace mace
 
-#endif // MACE_RESIZE_BILINEAR_H
+#endif  // MACE_RESIZE_BILINEAR_H
diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc
index 4887e136..333d32af 100644
--- a/mace/ops/resize_bilinear_test.cc
+++ b/mace/ops/resize_bilinear_test.cc
@@ -2,9 +2,9 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
+#include "mace/ops/resize_bilinear.h"
 #include "mace/core/operator.h"
 #include "mace/ops/ops_test_util.h"
-#include "mace/ops/resize_bilinear.h"
 
 using namespace mace;
 
-- 
GitLab